From 58f6d03820c56d4f4506abc9319af23254a770aa Mon Sep 17 00:00:00 2001
From: wea_ondara
Date: Sat, 17 Aug 2019 18:04:15 +0200
Subject: [PATCH] meh

Add box_sentiment.py, which draws sentiment box plots and average
sentiment trends for answers to posts from new and old users, and
reword the x axis label of the KS plots in calctoxdiff.py.
---
 box_sentiment.py | 205 +++++++++++++++++++++++++++++++++++++++++++++++
 calctoxdiff.py   |   4 +-
 2 files changed, 207 insertions(+), 2 deletions(-)
 create mode 100644 box_sentiment.py

diff --git a/box_sentiment.py b/box_sentiment.py
new file mode 100644
index 0000000..ae191e7
--- /dev/null
+++ b/box_sentiment.py
@@ -0,0 +1,205 @@
+"""Plot the sentiment of answers to posts from new and old users per interval."""
+
+import operator
+import os
+import sys
+from collections import defaultdict
+from datetime import timedelta
+from math import ceil
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK
+from loader import load, dmt, cms
+
+# Users are ranked by their number of posts within an interval; the users from
+# this fraction of the ranking upwards are treated as "old" users.
+OLD_USER_PERCENTILE = 0.95
+
+colors = ['red', 'green', 'blue', 'orange', 'deeppink']
+
+
+def _collect_sentiments(filteredposts, cachedsentiments):
+    """Return the cached sentiment entry of every answer to the given posts.
+
+    Answers without a cached sentiment are reported and skipped, so a missing
+    entry can neither raise a NameError nor count the previous answer twice.
+    """
+    toxlevels = []
+    for (i, post) in enumerate(filteredposts):
+        printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
+        for a in post['Answers']:
+            if a['Id'] in cachedsentiments:
+                toxlevels.append(cachedsentiments[a['Id']])
+            else:
+                print("Sentiment not found for " + str(a['Id']))
+    return toxlevels
+
+
+def _average(values):
+    """Return the mean of values, or NaN if values is empty.
+
+    NaN keeps the per-interval series aligned and shows up as a gap in the
+    line plots, without numpy's warning about the mean of an empty slice.
+    """
+    return np.average(values) if values else float('nan')
+
+
+def _plot_sentiments(filteredposts, cachedsentiments, title, outfilename, start):
+    """Draw the sentiment box plot of one user group for one interval.
+
+    :param filteredposts: posts whose answers are evaluated
+    :param cachedsentiments: sentiment scores by answer id
+    :param title: title of the plot; the sample sizes are appended to it
+    :param outfilename: path of the PNG file to write
+    :param start: cms() timestamp the reported duration is measured from
+    :return: average negative, neutral, positive and overall compound score
+    """
+    toxlevels = _collect_sentiments(filteredposts, cachedsentiments)
+    progress = "computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts))
+    printnoln("\r" + progress + " ... plotting ...")
+
+    compound = [item['compound'] for item in toxlevels]
+    neg = [c for c in compound if c < -0.05]
+    pos = [c for c in compound if c > 0.05]
+    neu = [c for c in compound if -0.05 <= c <= 0.05]
+
+    fig, axs = plt.subplots(figsize=(16, 12))
+    axs.boxplot([neg, neu, pos])
+    axs.set_xticklabels(['negative', 'neutral', 'positive'])
+    axs.set_title(title + ", n=" + str(len(filteredposts)) + "\nn = " + str(len(toxlevels)))
+    printnoln("\r" + progress + " ... plotting ... saving ...")
+    fig.savefig(outfilename, bbox_inches='tight')
+    plt.close(fig)
+    rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
+    return _average(neg), _average(neu), _average(pos), _average(compound)
+
+
+def _plot_averages(labels, averages, title, outfilename):
+    """Plot the per-interval average sentiment scores of one user group.
+
+    :param labels: x axis labels, one per plotted interval
+    :param averages: one (negative, neutral, positive, all) tuple per label
+    :param title: title of the plot
+    :param outfilename: path of the PNG file to write
+    """
+    fig = plt.figure(figsize=(16, 12))
+    for (idx, name) in enumerate(['negative', 'neutral', 'positive', 'all']):
+        plt.plot(labels, [avg[idx] for avg in averages], label=name)
+    plt.legend(loc="upper right")
+    plt.xticks(rotation=90)
+    plt.title(title)
+    fig.savefig(outfilename, bbox_inches='tight')
+    plt.close(fig)
+
+
+def main(folder, intervl):
+    """Create the sentiment box plots and average plots for one dataset.
+
+    For every interval that contains posts, the answers to posts from new
+    users (posted within DAYS_NEW_USER days of the owner's first contribution)
+    and from old users (the users with the most posts in the interval, see
+    OLD_USER_PERCENTILE) are split into negative, neutral and positive by
+    their compound score and drawn as box plots. The averages per interval
+    are plotted afterwards.
+
+    :param folder: dataset folder; plots go to its output/boxsentiment/ subfolder
+    :param intervl: interval length, passed on to calc_intervals
+    """
+    users, posts, firstcontrib, sumcontrib = load(folder)
+
+    intervals = calc_intervals(posts, intervl)
+
+    start = cms()
+    printnoln("reading sentiments ...")
+    cachedsentiments = imprt(folder + "/output/sentiments.py").answers
+    rprint("reading sentiments ... took " + str(cms() - start) + "ms")
+
+    outputdir = folder + "/output/boxsentiment/"
+    # created without a shell so folder names with spaces or quotes work
+    os.makedirs(outputdir, exist_ok=True)
+
+    magicknew = IMAGE_MAGICK
+    magickold = IMAGE_MAGICK
+
+    # only intervals that contain posts get a label and averages, so the x and
+    # y values of the average plots always have the same length
+    labels = []
+    avgnew = []
+    avgold = []
+    for (option_date_from, option_date_to) in intervals:
+        # get questions for option_date_from <= creation date < option_date_to
+        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
+        if len(newposts) == 0:
+            continue
+        datefrom = option_date_from.strftime("%d-%m-%Y")
+        dateto = option_date_to.strftime("%d-%m-%Y")
+        print("computing toxic levels: " + datefrom + " to " + dateto)
+        labels.append(datefrom + " - " + dateto)
+
+        filesuffix = folder.split("/")[-1] + "_" + datefrom + "_" + dateto
+        goutfilenamenewusers = outputdir + "boxsent_newusers_" + filesuffix
+        goutfilenameoldusers = outputdir + "boxsent_oldusers_" + filesuffix
+        daterange = "\nPosts created between " + datefrom + " to " + dateto
+
+        # new users: posts created within DAYS_NEW_USER days of the owner's first contribution
+        # NOTE(review): the title says "1 week", which assumes DAYS_NEW_USER is 7 - confirm
+        start = cms()
+        printnoln("computing toxic levels: filtering")
+        filteredposts = [post for post in newposts
+                         if post['CreationDate'] <= firstcontrib[post['OwnerUserId']] + timedelta(days=DAYS_NEW_USER)]
+        avgnew.append(_plot_sentiments(
+            filteredposts, cachedsentiments,
+            "Sentiment categorization of answers to posts within 1 week of 1st contribution" + daterange,
+            goutfilenamenewusers + ".png", start))
+        magicknew += " " + goutfilenamenewusers + ".png"
+
+        # old users: the users with the most posts in this interval
+        start = cms()
+        postcounts = defaultdict(int)
+        for p in newposts:
+            postcounts[p['OwnerUserId']] += 1
+        ranked = [userid for userid, count in sorted(postcounts.items(), key=operator.itemgetter(1))]
+        oldusers = set(ranked[ceil(len(ranked) * OLD_USER_PERCENTILE):])
+        filteredposts = [p for p in newposts if p['OwnerUserId'] in oldusers]
+        avgold.append(_plot_sentiments(
+            filteredposts, cachedsentiments,
+            "Sentiment categorization of answers to posts from old users" + daterange,
+            goutfilenameoldusers + ".png", start))
+        magickold += " " + goutfilenameoldusers + ".png"
+
+    os.system(magicknew + " " + outputdir + "boxsent_newusers.pdf")
+    os.system(magickold + " " + outputdir + "boxsent_oldusers.pdf")
+
+    _plot_averages(labels, avgnew, "Sentiment categorization for posts from new users", outputdir + "avgsentnewusers.png")
+    _plot_averages(labels, avgold, "Sentiment categorization for posts from old users", outputdir + "avgsentoldusers.png")
+
+
+if __name__ == "__main__":
+    # execute only if run as a script
+    usage = sys.argv[0] + " FOLDER [-iINTERVAL]  (INTERVAL: 1 - 12, default 3)"
+    if len(sys.argv) < 2:
+        print(usage)
+        sys.exit(1)
+    folder = sys.argv[1]
+    if not os.path.isdir(folder):
+        print(folder + " is not a folder")
+        sys.exit(1)
+    interval = 3
+    if len(sys.argv) >= 3:
+        if sys.argv[2].startswith("-i"):
+            interval = sys.argv[2][2:]
+            try:
+                interval = int(interval)
+            except ValueError:
+                print("-i: int required")
+                sys.exit(1)
+            if interval < 1 or interval > 12:
+                print("-i: only 1 - 12")
+                sys.exit(1)
+        else:
+            print("unknown parameter: " + sys.argv[2])
+            sys.exit(1)
+
+    main(folder, interval)
diff --git a/calctoxdiff.py b/calctoxdiff.py
index 3fa78fd..d684cf4 100644
--- a/calctoxdiff.py
+++ b/calctoxdiff.py
@@ -303,7 +303,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
             plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o')
         plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1])
         plt.xticks(rotation=90)
-        plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
+        plt.xlabel("Comparision: new users X (max) posts - old users posts")
         plt.ylabel("p-value")
         plt.legend(loc="upper right")
         outfile = outputdir + "/ks_olddate_pval_" + d[0] + "_" + d[1] + ".png"
@@ -331,7 +331,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
             plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o')
         plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1])
         plt.xticks(rotation=90)
-        plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
+        plt.xlabel("Comparision: new users X (max) posts - old users posts")
         plt.ylabel("stat value")
         plt.legend(loc="upper right")
         outfile = outputdir + "/ks_olddate_stat_" + d[0] + "_" + d[1] + ".png"