Files
master/analyze_batch.py
wea_ondara 532f3ca381 wip
2019-05-21 13:35:04 +02:00

202 lines
11 KiB
Python

from datetime import datetime
from datetime import timedelta
import sys
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from loader import load, dmt, cms
import math
def printnoln(text):
    """Print ``text`` without a trailing newline and flush stdout right away."""
    print(text, end='', flush=True)


def rprint(text):
    """Print ``text`` preceded by a carriage return (overwrites a progress line)."""
    print('\r' + text)


# Posts are only considered if made within this many days of the value
# stored in ``firstcontrib`` for their author (see ``main``).
DAYS_NEW_USER = 7
# NOTE(review): not referenced anywhere in the visible part of this file.
OLD_USER_YEAR = 3
# Shared VADER analyser used by ``computeToxLevel``.
analyser = SentimentIntensityAnalyzer()
# One colour per post-count cap in the combined histograms drawn by ``main``.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder):
    """Plot sentiment histograms of the answers given to new users' posts.

    Loads the dataset in ``folder`` and walks over the intervals returned by
    ``calc_intervals``.  For each interval the users whose ``CreationDate``
    lies in ``[from, to)`` are selected, together with their posts.  Then, for
    every cap of 1..5 posts per user:

    * posts created more than ``DAYS_NEW_USER`` days after the user's entry in
      ``firstcontrib`` are skipped, and at most ``cap`` posts per user are kept;
    * every answer body of the kept posts is scored with ``computeToxLevel``;
    * the scores are dumped with ``dumptoxlevels`` and a 2x2 histogram figure
      (neg / neu / pos / compound) is saved as PNG.

    Finally one combined figure per interval compares all caps.  All files are
    written below ``output/batch/<dataset name>/``.

    :param folder: path of the dataset folder; its last path component is used
        as the dataset name in output paths.
    """
    users, posts, firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts)
    postcounts = range(1, 5 + 1)
    cap_labels = [str(option_posts) + " posts" for option_posts in postcounts]
    bins = np.linspace(-1, 1, 2 * 100)

    # (subplot row, subplot column, panel title, key in the score dict)
    panels = (
        (0, 0, 'Neg', 'neg'),
        (1, 0, 'Neu', 'neu'),
        (0, 1, 'Pos', 'pos'),
        (1, 1, 'Compound', 'compound'),
    )

    # normpath strips a trailing slash, so "data/foo/" still yields "foo"
    # instead of an empty dataset name.
    dataset = os.path.basename(os.path.normpath(folder))
    outfolder = "output/batch/" + dataset + "/"

    for (option_date_from, option_date_to) in intervals:
        # filter users by option_date_from <= creation date < option_date_to
        newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults()
        newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults())

        # get posts of the filtered users
        newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newuserids, "filter posts by selected users").getresults()
        if len(newposts) == 0:
            continue

        date_from = option_date_from.strftime("%d-%m-%Y")
        date_to = option_date_to.strftime("%d-%m-%Y")
        print("computing toxic levels: " + date_from + " to " + date_to)

        gfig, gaxs = plt.subplots(2, 2, figsize=(16, 12))
        for row, col, title, _key in panels:
            gaxs[row, col].set_title(title)
        # per score key: one flat list of values per post cap, in postcounts order
        gseries = {key: [] for _row, _col, _title, key in panels}

        # Create the output folder without going through a shell: works with
        # spaces/metacharacters in the path and on systems without `mkdir -p`.
        os.makedirs(outfolder, exist_ok=True)
        goutfilename = outfolder + "batch_" + dataset + "_" + date_from + "_" + date_to

        for option_posts in postcounts:
            # compute toxic levels
            start = cms()
            printnoln("computing toxic levels: filtering")

            searchedposts = defaultdict(int)
            filteredposts = []
            # NOTE(review): "first N posts" relies on newposts being iterated in
            # chronological order - confirm against the loader.
            for post in newposts:
                userid = post['OwnerUserId']

                # only posts close enough to the user's first contribution
                if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']:
                    continue

                # no more than option_posts posts from one user
                searchedposts[userid] += 1
                if searchedposts[userid] > option_posts:
                    continue

                filteredposts.append(post)

            toxlevels = defaultdict(list)
            total = len(filteredposts)
            for (i, post) in enumerate(filteredposts):
                # progress every 100 posts and on the last one
                if (i + 1) % 100 == 0 or (i + 1) == total:
                    printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(total))
                userid = post['OwnerUserId']
                for a in post['Answers']:
                    toxlevels[userid].append(computeToxLevel(a['Body']))
            rprint("computing toxic levels: post #" + str(total) + "/" + str(total) + " ... took " + str(cms() - start) + "ms")

            outfilename = goutfilename + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            # flatten once, then pick each score out of the same list
            allscores = flatmap(toxlevels.values())

            fig, axs = plt.subplots(2, 2, figsize=(16, 12))
            for row, col, title, key in panels:
                values = [item[key] for item in allscores]
                gseries[key].append(values)
                axs[row, col].set_title(title)
                axs[row, col].hist(values, bins)
            fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts\nUsers registered between "
                         + date_from + " to " + date_to)
            fig.savefig(outfilename + ".png", bbox_inches='tight')
            plt.close(fig)

        # combined figure: one histogram series per post cap in every panel
        for row, col, _title, key in panels:
            gaxs[row, col].hist(gseries[key], bins, color=colors[:len(postcounts)], label=cap_labels)
            gaxs[row, col].legend(loc="upper right")
        gfig.suptitle("Sentiment of answers to the first X (max) posts\nUsers registered between " + date_from + " to " + date_to)
        gfig.savefig(goutfilename + ".png", bbox_inches='tight')
        plt.close(gfig)
def computeToxLevel(text):
    """Score ``text`` with the module-level sentiment analyser.

    :param text: the text to score (``main`` passes answer bodies).
    :return: the result of ``analyser.polarity_scores``; callers in this file
        read its ``neg``, ``neu``, ``pos`` and ``compound`` entries.
    """
    scores = analyser.polarity_scores(text)
    return scores
def flatmap(arr):
    """Concatenate the items of every iterable in ``arr`` into one new list.

    :param arr: an iterable of iterables.
    :return: a flat list, preserving the order of ``arr`` and of each member.
    """
    flattened = []
    for chunk in arr:
        flattened.extend(chunk)
    return flattened
def dumptoxlevels(lvls, filename):
    """Write ``lvls`` to ``filename`` as a small Python module.

    The file contains an import of ``defaultdict`` followed by an assignment
    ``toxlevels = <str(lvls)>``.  The first ``<class 'list'>`` in that text
    (the default factory shown when printing a ``defaultdict(list)``) is
    replaced by ``list``.

    :param lvls: the mapping to dump (``main`` passes a ``defaultdict(list)``).
    :param filename: path of the file to create or overwrite.
    """
    with open(filename, "w") as out:
        rendered = str(lvls).replace("<class 'list'>", "list", 1)
        out.writelines([
            "from collections import defaultdict\n\n",
            "toxlevels = " + rendered + "\n",
        ])
def _quarter_start(moment):
    """Return midnight on the first day of the calendar quarter containing ``moment``."""
    # months 1-3 -> 1, 4-6 -> 4, 7-9 -> 7, 10-12 -> 10
    first_month = moment.month - (moment.month - 1) % 3
    return moment.replace(month=first_month, day=1, hour=0, minute=0, second=0, microsecond=0)


def _next_quarter(quarter_start):
    """Return the date three months after ``quarter_start``, rolling into the next year.

    Expects a value produced by ``_quarter_start`` (day 1), so the new month
    is always valid for the kept day.
    """
    month = quarter_start.month + 3
    year = quarter_start.year
    if month > 12:
        month -= 12
        year += 1
    return quarter_start.replace(year=year, month=month)


def calc_intervals(posts):
    """Split the time span covered by ``posts`` into calendar quarters.

    The span runs from the start of the quarter containing the earliest
    ``CreationDate`` to the end of the quarter containing the latest one.
    Each interval added is also printed.

    :param posts: indexable collection of posts, each with a ``CreationDate``.
    :return: list of ``(quarter_start, next_quarter_start)`` tuples in
        chronological order; empty if ``posts`` is empty.
    """
    if len(posts) == 0:
        # nothing to span; also avoids indexing posts[0] below
        return []

    # NOTE(review): reduce presumably takes (fold element into accumulator,
    # merge two accumulators, initial value factory, label) - see loader.
    firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults()
    lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()

    cdate = _quarter_start(firstpost)
    # exclusive end: start of the quarter after the one holding the last post
    end = _next_quarter(_quarter_start(lastpost))

    intervals = []
    while cdate < end:
        nextquarter = _next_quarter(cdate)
        print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
        intervals.append((cdate, nextquarter))
        cdate = nextquarter
    return intervals
if __name__ == "__main__":
    # Script entry point: the dataset folder is the first command-line
    # argument; any further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print(sys.argv[0] + " <folder>")
        sys.exit(1)

    folder = cli_args[0]
    if os.path.isdir(folder):
        main(folder)
    else:
        print(folder + " is not a folder")
        sys.exit(1)