"""Box plots and trend lines of answer sentiment for new versus old users.

For every time interval returned by ``calc_intervals`` the script collects the
cached sentiment of all answers to the posts created in that interval, once
for posts written shortly after their author's first contribution ("new
users") and once for posts written by the most active authors of the interval
("old users").  Each group gets one box plot per interval; the per-interval
means are plotted as trend lines at the end.

NOTE(review): the ``compound`` score and the +/-0.05 thresholds look like the
VADER sentiment convention -- confirm against the code that writes
``output/sentiments.py``.
"""

import operator
import os
import sys
from collections import defaultdict
from datetime import timedelta
from math import ceil

import matplotlib.pyplot as plt
import numpy as np

from common import (
    calc_intervals,
    imprt,
    printnoln,
    rprint,
    DAYS_NEW_USER,
    IMAGE_MAGICK,
    FIG_SIZE,
)
from loader import load, dmt, cms

# Users are ranked by number of posts within an interval; those at or above
# this fraction of the ranking are treated as "old" users.
OLD_USER_PERCENTILE = 0.95

colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# Compound scores strictly below / strictly above these bounds are counted as
# negative / positive; everything in between (bounds included) is neutral.
_NEGATIVE_BOUND = -0.05
_POSITIVE_BOUND = 0.05

# Names of the per-interval average series, in plotting order.
_SERIES = ('negative', 'neutral', 'positive', 'all')


def _mean_or_nan(values):
    """Return the mean of *values*, or NaN (without a NumPy warning) if empty."""
    return np.average(values) if values else np.nan


def _collect_sentiments(filteredposts, cachedsentiments):
    """Return the cached sentiment entries of all answers to *filteredposts*.

    Answers without a cached sentiment are reported and skipped.  (Previously
    such an answer either raised ``NameError`` or silently re-appended the
    sentiment of the preceding answer.)
    """
    toxlevels = []
    total = str(len(filteredposts))
    for i, post in enumerate(filteredposts):
        printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + total)
        for answer in post['Answers']:
            answer_id = answer['Id']
            if answer_id not in cachedsentiments:
                # str() so that non-string ids do not break the message itself
                print("Sentiment not found for " + str(answer_id))
                continue
            toxlevels.append(cachedsentiments[answer_id])
    return toxlevels


def _split_by_sentiment(toxlevels):
    """Split sentiment entries into (negative, neutral, positive, all) compound scores."""
    compounds = [item['compound'] for item in toxlevels]
    neg = [c for c in compounds if c < _NEGATIVE_BOUND]
    neu = [c for c in compounds if _NEGATIVE_BOUND <= c <= _POSITIVE_BOUND]
    pos = [c for c in compounds if c > _POSITIVE_BOUND]
    return neg, neu, pos, compounds


def _plot_sentiment_boxes(filteredposts, cachedsentiments, title, outfilename, start):
    """Draw and save the sentiment box plot for one group of posts.

    :param filteredposts: posts whose answers are analysed
    :param cachedsentiments: mapping from answer id to its sentiment entry
    :param title: first part of the plot title; the post and answer counts are appended
    :param outfilename: output path without the ``.png`` extension
    :param start: ``cms()`` timestamp at which work on this group began, used
        for the "took ...ms" progress message
    :return: dict mapping each name in ``_SERIES`` to the mean compound score
        of that category (NaN when the category is empty)
    """
    total = str(len(filteredposts))
    progress = "computing toxic levels: post " + total + "/" + total

    toxlevels = _collect_sentiments(filteredposts, cachedsentiments)
    printnoln("\r" + progress + " ... plotting ...")
    neg, neu, pos, compounds = _split_by_sentiment(toxlevels)

    fig, axs = plt.subplots(figsize=FIG_SIZE)
    axs.boxplot([neg, neu, pos])
    axs.set_xticklabels(['negative', 'neutral', 'positive'])
    axs.set_title(title + ", n=" + total + "\nn = " + str(len(toxlevels)))
    printnoln("\r" + progress + " ... plotting ... saving ...")
    fig.savefig(outfilename + ".png", bbox_inches='tight')
    plt.close(fig)
    rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")

    return {
        'negative': _mean_or_nan(neg),
        'neutral': _mean_or_nan(neu),
        'positive': _mean_or_nan(pos),
        'all': _mean_or_nan(compounds),
    }


def _plot_averages(labels, averages, title, outfilename):
    """Plot the per-interval average series in *averages* against *labels* and save it."""
    fig = plt.figure(figsize=FIG_SIZE)
    for name in _SERIES:
        plt.plot(labels, averages[name], label=name)
    plt.legend(loc="upper right")
    plt.xticks(rotation=90)
    plt.title(title)
    fig.savefig(outfilename, bbox_inches='tight')
    plt.close(fig)


def main(folder, intervl):
    """Generate the sentiment box plots and average-sentiment trend plots.

    Output is written to ``<folder>/output/boxsentiment/``: one PNG per
    interval and user group, a PDF per user group assembled with the
    ``IMAGE_MAGICK`` command, and one trend plot per user group.

    :param folder: data folder; must contain ``output/sentiments.py``
    :param intervl: interval length, passed through to ``calc_intervals``
    """
    # load() returns four values; only the posts and the per-user first
    # contribution dates are needed here.
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    cachedsentiments = imprt(folder + "/output/sentiments.py").answers
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/boxsentiment/"
    # os.makedirs instead of shelling out to `mkdir -p`: portable and safe for
    # paths containing spaces or shell metacharacters.
    os.makedirs(outputdir, exist_ok=True)

    newuserfiles = []
    olduserfiles = []
    avgnew = {name: [] for name in _SERIES}
    avgold = {name: [] for name in _SERIES}
    # Labels of the intervals that actually produced data.  Intervals without
    # posts are skipped below, so labelling the trend plots with *all*
    # intervals would hand matplotlib x and y of different lengths.
    labels = []

    newuserwindow = timedelta(days=DAYS_NEW_USER)
    foldername = folder.split("/")[-1]

    for option_date_from, option_date_to in intervals:
        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(
            lambda p: option_date_from <= p['CreationDate'] < option_date_to,
            "filter posts by dates").getresults()
        if len(newposts) == 0:
            continue

        date_from = option_date_from.strftime("%d-%m-%Y")
        date_to = option_date_to.strftime("%d-%m-%Y")
        print("computing toxic levels: " + date_from + " to " + date_to)
        labels.append(date_from + " - " + date_to)

        filesuffix = foldername + "_" + date_from + "_" + date_to
        goutfilenamenewusers = outputdir + "boxsent_newusers_" + filesuffix
        goutfilenameoldusers = outputdir + "boxsent_oldusers_" + filesuffix
        created = "\nPosts created between " + date_from + " to " + date_to

        # for new users -------------------------------------------------------
        start = cms()
        printnoln("computing toxic levels: filtering")
        # keep posts created no later than DAYS_NEW_USER days after the
        # author's first contribution
        filteredposts = [
            post for post in newposts
            if post['CreationDate'] <= firstcontrib[post['OwnerUserId']] + newuserwindow
        ]
        means = _plot_sentiment_boxes(
            filteredposts, cachedsentiments,
            "Sentiment categorization of answers to posts within 1 week of 1st contribution" + created,
            goutfilenamenewusers, start)
        for name in _SERIES:
            avgnew[name].append(means[name])
        newuserfiles.append(goutfilenamenewusers + ".png")

        # for old users -------------------------------------------------------
        start = cms()
        newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
        userposts = {u: 0 for u in newuserids}
        for p in newposts:
            userposts[p['OwnerUserId']] += 1
        # rank users by number of posts (ascending) and keep the top slice
        ranked = [u for u, _ in sorted(userposts.items(), key=operator.itemgetter(1))]
        oldusers = set(ranked[ceil(len(ranked) * OLD_USER_PERCENTILE):])

        filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
        # The title used to be a copy of the new-user title ("within 1 week of
        # 1st contribution"), which mislabelled this plot.
        means = _plot_sentiment_boxes(
            filteredposts, cachedsentiments,
            "Sentiment categorization of answers to posts from old users" + created,
            goutfilenameoldusers, start)
        for name in _SERIES:
            avgold[name].append(means[name])
        olduserfiles.append(goutfilenameoldusers + ".png")

    # Combine the per-interval images into one PDF per user group; skipped
    # when no interval produced an image.
    # NOTE(review): the paths are interpolated unquoted into a shell command,
    # so folders containing spaces or shell metacharacters will break this.
    if newuserfiles:
        os.system(IMAGE_MAGICK + " " + " ".join(newuserfiles) + " " + outputdir + "boxsent_newusers.pdf")
    if olduserfiles:
        os.system(IMAGE_MAGICK + " " + " ".join(olduserfiles) + " " + outputdir + "boxsent_oldusers.pdf")

    # plot new users
    _plot_averages(labels, avgnew, "Sentiment categorization for posts from new users",
                   outputdir + "avgsentnewusers.png")

    # plot old users
    _plot_averages(labels, avgold, "Sentiment categorization for posts from old users",
                   outputdir + "avgsentoldusers.png")


if __name__ == "__main__":
    # execute only if run as a script
    # Name the accepted arguments so the usage message is actually useful.
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 3
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            # the value is attached directly to the flag, e.g. "-i6"
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)

    main(folder, interval)