"""Batch-plot sentiment histograms for answers to posts of new and very active users.

For every date interval the script writes, below ``<folder>/output/batch/``:

* one histogram figure per "number of posts" bucket of new users,
* one combined figure over all buckets,
* one figure for the most actively posting users,

plus the combined PDFs, an average-sentiment line chart and a text dump of the
per-interval compound scores.
"""
import operator
import os
import sys
from collections import defaultdict
from datetime import timedelta
from math import ceil

import matplotlib.pyplot as plt
import matplotlib
import numpy as np

from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK
from loader import load, dmt, cms
from sentiments import readtoxleveltxt

OLD_USER_PERCENTILE = 0.95

colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# Position of each sentiment component in the 2x2 subplot grid, in the same
# order as _PANEL_KEYS.
_PANELS = ((0, 0), (1, 0), (0, 1), (1, 1))
_PANEL_KEYS = ('neg', 'neu', 'pos', 'compound')


def _date_range(date_from, date_to, sep):
    """Format two dates as ``dd-mm-YYYY<sep>dd-mm-YYYY``."""
    return date_from.strftime("%d-%m-%Y") + sep + date_to.strftime("%d-%m-%Y")


def _collect_toxlevels(posts, cachedsentiments):
    """Return the cached sentiment entry of every answer to ``posts``.

    Prints a progress line per post. An answer id that is missing from
    ``cachedsentiments`` raises ``KeyError``.
    """
    total = str(len(posts))
    toxlevels = []
    for index, post in enumerate(posts):
        printnoln("\rcomputing toxic levels: post " + str(index + 1) + "/" + total)
        for answer in post['Answers']:
            toxlevels.append(cachedsentiments[answer['Id']])
    return toxlevels


def _split_levels(toxlevels):
    """Split sentiment entries into flat (neg, neu, pos, compound) lists."""
    return tuple([item[key] for item in toxlevels] for key in _PANEL_KEYS)


def _new_figure(titles):
    """Create the 2x2 figure and title its panels (neg, neu, pos, compound order)."""
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    for panel, title in zip(_PANELS, titles):
        axs[panel].set_title(title)
    return fig, axs


def _draw_histograms(axs, levels, **hist_kwargs):
    """Draw one histogram per panel; ``levels`` is in (neg, neu, pos, compound) order."""
    unit_bins = np.linspace(0, 1, 1 * 100)
    # compound is binned over [-1, 1], hence twice as many bin edges
    compound_bins = np.linspace(-1, 1, 2 * 100)
    all_bins = (unit_bins, unit_bins, unit_bins, compound_bins)
    for panel, bins, data in zip(_PANELS, all_bins, levels):
        axs[panel].hist(data, bins, **hist_kwargs)


def _log_scale(axs):
    """Switch the y axis of all four panels to log scale."""
    for panel in _PANELS:
        axs[panel].set_yscale('log')


def main(folder, intervl):
    """Generate all batch sentiment plots for the data set stored in ``folder``.

    :param folder: data set directory; sentiments are read from
        ``<folder>/output/sentiments.txt`` and results are written to
        ``<folder>/output/batch/``.
    :param intervl: interval size handed to ``calc_intervals``; it is also part
        of the output file names.
    """
    matplotlib.use('Agg')  # speed up saving of images
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/batch/"
    # Create the directory without going through a shell, so unusual characters
    # in the folder name cannot be interpreted as shell syntax.
    os.makedirs(outputdir, exist_ok=True)

    postcounts = range(1, 5 + 1)
    # Command strings that accumulate the images to be combined into one PDF.
    # NOTE(review): IMAGE_MAGICK is presumably an ImageMagick command prefix
    # that takes input files followed by the output file - confirm in common.
    magickpost = {i: IMAGE_MAGICK for i in postcounts}
    magickold = IMAGE_MAGICK
    magickglobal = IMAGE_MAGICK
    # Index 0 holds the old users, index n the new users with exactly n posts.
    avgsent = [[] for _ in range(0, 5 + 1)]
    avgsentsingle = [[] for _ in range(0, 5 + 1)]
    bucketlabels = [str(count) + " posts" for count in postcounts]

    for (option_date_from, option_date_to) in intervals:
        magickdate = IMAGE_MAGICK
        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to,
                                     "filter posts by dates").getresults()
        daterange = _date_range(option_date_from, option_date_to, " to ")

        print("computing toxic levels: " + daterange)

        gfig, gaxs = _new_figure(('Neg', 'Neu', 'Pos', 'Compound'))
        # per-bucket data of the combined plot, in (neg, neu, pos, compound) order
        glevels = ([], [], [], [])

        filesuffix = (folder.split("/")[-1] + "_" + _date_range(option_date_from, option_date_to, "_")
                      + "_i" + str(intervl))
        goutfilenamenewusers = outputdir + "batch_newusers_" + filesuffix
        goutfilenameoldusers = outputdir + "batch_oldusers_" + filesuffix

        # new users ----------------------------------------------------------------------------------
        start = cms()
        printnoln("sorting posts ...")
        sortedposts = defaultdict(list)
        for post in newposts:
            userid = post['OwnerUserId']
            # only keep posts created less than DAYS_NEW_USER days after the
            # author's first contribution
            if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) <= post['CreationDate']:
                continue
            sortedposts[userid].append(post)
        rprint("sorting posts ... took " + str(cms() - start) + "ms")

        for option_posts in postcounts:
            start = cms()
            printnoln("computing toxic levels: filtering")
            # posts of the users that have exactly option_posts posts in this interval
            filteredposts = [post
                             for userposts in sortedposts.values() if len(userposts) == option_posts
                             for post in userposts]
            toxlevels = _collect_toxlevels(filteredposts, cachedsentiments)
            progress = "computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts))
            printnoln("\r" + progress + " ... plotting ...")

            levels = _split_levels(toxlevels)
            compound = levels[3]
            # An interval without any answers is recorded as 0 rather than the
            # NaN np.mean would return for an empty list.
            avgsent[option_posts].append(np.mean(compound) if compound else 0)
            avgsentsingle[option_posts].append(compound)

            outfilename = goutfilenamenewusers + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            for collected, flat in zip(glevels, levels):
                collected.append(flat)

            fig, axs = _new_figure(('Negativity', 'Neutrality', 'Positivity', 'Compound'))
            _draw_histograms(axs, levels)
            _log_scale(axs)
            fig.suptitle("Sentiment of answers to the first " + str(option_posts)
                         + " posts within 1 week of 1st contribution\nPosts created between " + daterange
                         + ", n(q)=" + str(len(filteredposts)) + ", n(a)=" + str(len(toxlevels)))
            printnoln("\r" + progress + " ... plotting ... saving ...")
            fig.savefig(outfilename + ".png", bbox_inches='tight')
            plt.close(fig)
            rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
            magickpost[option_posts] += " " + outfilename + ".png"
            magickdate += " " + outfilename + ".png"

        os.system(magickdate + " " + goutfilenamenewusers + ".pdf")

        # global
        start = cms()
        printnoln("\rglobal plot post ... plotting ...")
        _draw_histograms(gaxs, glevels, color=colors[:len(postcounts)], label=bucketlabels)
        for panel in _PANELS:
            gaxs[panel].legend(loc="upper right")
        _log_scale(gaxs)
        gfig.suptitle("Sentiment of answers to the first X posts within 1 week of 1st contribution\n"
                      "Posts created between " + daterange)
        printnoln("\rglobal plot post ... plotting ... saving ...")
        gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
        plt.close(gfig)
        rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
        magickglobal += " " + goutfilenamenewusers + ".png"

        # for old users ---------------------------------------------------------------------------------
        start = cms()
        newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
        userposts = {u: 0 for u in newuserids}
        for p in newposts:
            userposts[p['OwnerUserId']] += 1
        # "old" users are those above OLD_USER_PERCENTILE by number of posts in this interval
        ranked = [userid for userid, _ in sorted(userposts.items(), key=operator.itemgetter(1))]
        oldusers = set(ranked[ceil(len(ranked) * OLD_USER_PERCENTILE):])

        filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
        toxlevels = _collect_toxlevels(filteredposts, cachedsentiments)
        progress = "computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts))
        printnoln("\r" + progress + " ... plotting ...")

        levels = _split_levels(toxlevels)
        compound = levels[3]
        avgsent[0].append(np.mean(compound) if compound else 0)
        avgsentsingle[0].append(compound)

        dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")

        fig, axs = _new_figure(('Neg', 'Neu', 'Pos', 'Compound'))
        _draw_histograms(axs, levels)
        _log_scale(axs)
        fig.suptitle("Sentiment of answers to posts by most posting users (" + str(OLD_USER_PERCENTILE * 100)
                     + "%tile)\nPosts created between " + daterange
                     + ", n(q)=" + str(len(filteredposts)) + ", n(a)=" + str(len(toxlevels)))
        printnoln("\r" + progress + " ... plotting ... saving ...")
        fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
        plt.close(fig)
        rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
        magickold += " " + goutfilenameoldusers + ".png"

    os.system(magickglobal + " " + outputdir + "batch_newusers_i" + str(intervl) + ".pdf")
    os.system(magickold + " " + outputdir + "batch_oldusers_i" + str(intervl) + ".pdf")
    for (i, cmd) in magickpost.items():
        os.system(cmd + " " + outputdir + "batch_newusers_i" + str(intervl) + "_" + str(i) + ".pdf")

    # avg sentiment graph
    print("Plotting average sentiments ...")
    fig = plt.figure(figsize=(16, 12))
    intervalstarts = [iv[0] for iv in intervals]
    for i in postcounts:
        plt.plot(intervalstarts, avgsent[i], label="new users (" + str(i) + " posts)")
    plt.plot(intervalstarts, avgsent[0], label="old users (all posts)")
    plt.title("Average sentiments")
    plt.xticks(rotation=90)
    plt.xlabel("time")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
    plt.savefig(outputdir + "/averagesentiment-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)

    # dump avgsentsingle and verify that the dump can be read back unchanged
    dumpavgsentsingle(avgsentsingle, outputdir + "/averagesentiment.txt")
    avgss2 = readavgsentsingle(outputdir + "/averagesentiment.txt")
    if avgsentsingle != avgss2:
        print("wuaaaaaa")
        with open(outputdir + "/log", "w") as file:
            file.write(str(avgsentsingle))
            file.write(str(avgss2))


def dumptoxlevels(lvls, filename):
    """Write ``lvls`` to ``filename`` as an importable Python module.

    The module defines a single name, ``toxlevels``. If ``lvls`` is a
    ``defaultdict(list)``, its repr starts with ``defaultdict(<class 'list'>``,
    which is not valid Python; that first occurrence is rewritten to ``list``
    so the emitted file can be imported.
    """
    with open(filename, "w") as file:
        file.write("from collections import defaultdict\n\n")
        file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")


def dumpavgsentsingle(avg, filename):
    """Write nested score lists to ``filename``, one ``<index>:<data>`` line per entry.

    ``avg[i]`` is a list of score lists. Within a line the score lists are
    separated by ``;;`` and the scores inside a list by ``;``.
    """
    rows = []
    for index, groups in enumerate(avg):
        encoded = ';;'.join(';'.join(str(x) for x in group) for group in groups)
        rows.append(str(index) + ':' + encoded)
    with open(filename, "w") as file:
        file.write('\n'.join(rows))


def readavgsentsingle(filename):
    """Read a file written by ``dumpavgsentsingle`` back into nested float lists.

    Blank lines are skipped, so an empty file yields ``[]``. Because of the
    file format, an entry without any score list is read back as ``[[]]``.
    """
    with open(filename, "r") as file:
        lines = [line for line in file.read().split('\n') if line]
    result = []
    for line in lines:
        payload = line.split(':', 1)[1]
        result.append([[float(x) for x in group.split(';')] if group != '' else []
                       for group in payload.split(';;')])
    return result


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 3
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)

    main(folder, interval)