wip
This commit is contained in:
201
analyze_batch.py
Normal file
201
analyze_batch.py
Normal file
@@ -0,0 +1,201 @@
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
import sys
|
||||
import os
|
||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from collections import defaultdict
|
||||
from loader import load, dmt, cms
|
||||
import math
|
||||
|
||||
def printnoln(text):
    """Print ``text`` without a trailing newline and flush stdout at once.

    Used for in-place progress output: the caller is expected to follow up
    with more text on the same line (typically starting with ``\\r``).
    """
    print(text, end='', flush=True)


def rprint(text):
    """Print ``text`` preceded by a carriage return, then a newline.

    The leading ``\\r`` is meant to overwrite a progress line previously
    written with ``printnoln``.
    """
    print('\r' + text)
|
||||
|
||||
# A post only counts as an "early" post when it was created at most this many
# days after the author's first contribution (see the filter in main()).
DAYS_NEW_USER = 7
# Not referenced in the visible part of this file.
OLD_USER_YEAR = 3

# Shared VADER analyser instance; computeToxLevel() scores text through it.
analyser = SentimentIntensityAnalyzer()
# One histogram colour per "max posts" setting plotted in the summary figure.
colors = [
    'red',
    'green',
    'blue',
    'orange',
    'deeppink',
]
|
||||
|
||||
|
||||
def _select_first_posts(candidates, firstcontrib, max_posts):
    """Return the early posts of each user, capped at ``max_posts`` per user.

    A post is kept when it was created no later than DAYS_NEW_USER days after
    its owner's first contribution (``firstcontrib[userid]``) and fewer than
    ``max_posts`` posts of that owner were kept before it. Posts are examined
    in the order given, so "first" means first in ``candidates``.
    """
    window = timedelta(days=DAYS_NEW_USER)
    kept_per_user = defaultdict(int)
    selected = []
    for post in candidates:
        userid = post['OwnerUserId']

        # check first contribution
        if firstcontrib[userid] + window < post['CreationDate']:
            continue

        # no more than max_posts posts from one user
        kept_per_user[userid] += 1
        if kept_per_user[userid] <= max_posts:
            selected.append(post)
    return selected


def main(folder):
    """Run the batch sentiment analysis for the dataset stored in ``folder``.

    For every interval returned by ``calc_intervals`` the users created in
    that interval are selected, and for each "max posts per user" setting
    (1 to 5) the answers to those users' early posts are scored with
    ``computeToxLevel``. Per setting, the raw scores are dumped to a ``.py``
    file and a 2x2 histogram figure (neg / neu / pos / compound) is saved;
    per interval, one summary figure compares all settings.

    Output goes to ``output/batch/<dataset>/`` where ``<dataset>`` is the
    last path component of ``folder``.
    """
    users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts)

    postcounts = range(1, 5 + 1)
    labels = [str(count) + " posts" for count in postcounts]
    bins = np.linspace(-1, 1, 2 * 100)
    # (subplot position, subplot title, key in the score dict)
    panels = (
        ((0, 0), 'Neg', 'neg'),
        ((1, 0), 'Neu', 'neu'),
        ((0, 1), 'Pos', 'pos'),
        ((1, 1), 'Compound', 'compound'),
    )

    # normpath drops a trailing slash, which would otherwise leave the
    # dataset name empty ("data/foo/" -> "foo").
    dataset = os.path.basename(os.path.normpath(folder))
    outfolder = "output/batch/" + dataset + "/"

    for (option_date_from, option_date_to) in intervals:
        # filter users by option_date_from <= creation date < option_date_to
        newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults()
        newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults())

        # get questions for filtered users
        newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newuserids, "filter posts by selected users").getresults()
        if len(newposts) == 0:
            continue

        date_from = option_date_from.strftime("%d-%m-%Y")
        date_to = option_date_to.strftime("%d-%m-%Y")
        print("computing toxic levels: " + date_from + " to " + date_to)

        # Create the output directory without going through a shell: the
        # folder name comes from the command line.
        os.makedirs(outfolder, exist_ok=True)
        goutfilename = outfolder + "batch_" + dataset + "_" + date_from + "_" + date_to

        # score key -> one list of values per postcounts entry (summary plot)
        gscores = {key: [] for (_position, _title, key) in panels}

        for option_posts in postcounts:
            # compute toxic levels
            start = cms()
            printnoln("computing toxic levels: filtering")
            filteredposts = _select_first_posts(newposts, firstcontrib, option_posts)
            total = len(filteredposts)

            toxlevels = defaultdict(list)
            for (i, post) in enumerate(filteredposts):
                done = i + 1
                # progress every 100 posts and once more on the last post
                if done % 100 == 0 or done == total:
                    printnoln("\rcomputing toxic levels: post #" + str(done) + "/" + str(total))
                userid = post['OwnerUserId']
                for a in post['Answers']:
                    toxlevels[userid].append(computeToxLevel(a['Body']))
            rprint("computing toxic levels: post #" + str(total) + "/" + str(total) + " ... took " + str(cms() - start) + "ms")

            outfilename = goutfilename + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            # flatten once and reuse for all four score kinds
            allscores = flatmap(toxlevels.values())

            fig, axs = plt.subplots(2, 2, figsize=(16, 12))
            for (position, title, key) in panels:
                values = [item[key] for item in allscores]
                gscores[key].append(values)
                axs[position].set_title(title)
                axs[position].hist(values, bins)

            fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts\nUsers registered between "
                         + date_from + " to " + date_to)
            fig.savefig(outfilename + ".png", bbox_inches='tight')
            plt.close(fig)

        # summary figure: all postcounts settings side by side per bin
        # (hist() also takes alpha=... or stacked=True for overlaid/stacked variants)
        gfig, gaxs = plt.subplots(2, 2, figsize=(16, 12))
        for (position, title, key) in panels:
            gaxs[position].set_title(title)
            gaxs[position].hist(gscores[key], bins, color=colors[:len(postcounts)], label=labels)
            gaxs[position].legend(loc="upper right")
        gfig.suptitle("Sentiment of answers to the first X (max) posts\nUsers registered between " + date_from + " to " + date_to)
        gfig.savefig(goutfilename + ".png", bbox_inches='tight')
        plt.close(gfig)
|
||||
|
||||
|
||||
def computeToxLevel(text):
    """Score ``text`` with the module-level ``analyser``.

    Returns the result of ``analyser.polarity_scores(text)`` unchanged;
    callers in this file read its 'neg', 'neu', 'pos' and 'compound' entries.
    """
    scores = analyser.polarity_scores(text)
    return scores
|
||||
|
||||
|
||||
def flatmap(arr):
    """Flatten one nesting level: concatenate every iterable in ``arr`` into a new list."""
    flattened = []
    for sublist in arr:
        flattened.extend(sublist)
    return flattened
|
||||
|
||||
|
||||
def dumptoxlevels(lvls, filename):
    """Write ``lvls`` to ``filename`` as an importable Python module.

    The file consists of a ``defaultdict`` import followed by a single
    assignment ``toxlevels = <str(lvls)>``. The first ``<class 'list'>`` in
    that text (the default factory as shown by str() of a defaultdict(list))
    is replaced by ``list`` so the line is valid Python.
    """
    with open(filename, "w") as out:
        out.write("from collections import defaultdict\n\n")
        rendered = str(lvls).replace("<class 'list'>", "list", 1)
        out.write("toxlevels = {}\n".format(rendered))
|
||||
|
||||
|
||||
def _quarter_start(moment):
    """Return midnight on the first day of the calendar quarter containing ``moment``."""
    first_month = ((moment.month - 1) // 3) * 3 + 1
    return moment.replace(month=first_month, day=1, hour=0, minute=0, second=0, microsecond=0)


def _next_quarter(quarter_begin):
    """Return ``quarter_begin`` moved three calendar months ahead.

    Only the month (and, on wrap-around, the year) is changed, so this is
    intended for values whose day is 1, as produced by ``_quarter_start``.
    """
    month = quarter_begin.month + 3
    if month > 12:
        return quarter_begin.replace(year=quarter_begin.year + 1, month=month - 12)
    return quarter_begin.replace(month=month)


def calc_intervals(posts):
    """Return consecutive quarter intervals covering all post creation dates.

    ``posts`` is an indexable collection of records with a 'CreationDate'
    datetime. The result is a list of ``(start, end)`` datetime pairs, each
    spanning one calendar quarter (start on the 1st of Jan/Apr/Jul/Oct at
    00:00), from the quarter of the earliest post up to and including the
    quarter of the latest post. Every added interval is also printed.

    Returns an empty list when ``posts`` is empty.
    """
    if len(posts) == 0:
        # Nothing to cover; also avoids indexing posts[0] below.
        return []

    # Earliest / latest creation date. Arguments to reduce: fold an element
    # into the accumulator, merge two accumulators, initial value, label.
    # NOTE(review): exact reduce semantics live in loader.dmt - confirm there.
    firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults()
    lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()

    # Start at the beginning of the first post's quarter and stop at the end
    # of the last post's quarter (= start of the quarter after it).
    cdate = _quarter_start(firstpost)
    end = _next_quarter(_quarter_start(lastpost))

    intervals = []
    while cdate < end:
        nextquarter = _next_quarter(cdate)
        print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
        intervals.append((cdate, nextquarter))
        cdate = nextquarter
    return intervals
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: expects the dataset folder as the first argument.
    usage = sys.argv[0] + " <folder>"
    if not sys.argv[1:]:
        # No folder given: show the expected invocation and fail.
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if os.path.isdir(folder):
        main(folder)
    else:
        print(folder + " is not a folder")
        sys.exit(1)
|
||||
Reference in New Issue
Block a user