"""Batch histograms of cached answer sentiment, per creation-date interval.

For every interval returned by ``calc_intervals`` the script plots the
sentiment scores (neg / neu / pos / compound) of the answers given to

* the first 1..5 posts of users who are within ``DAYS_NEW_USER`` days of
  their first contribution, and
* the posts of the most frequently posting users of that interval.

Scores are read from ``<folder>/output/sentiments.py``; results are written to
``<folder>/output/batch/``.
"""
import operator
import os
import sys
from collections import defaultdict
from datetime import timedelta
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from common import calc_intervals, imprt, FigSaver
from loader import load, dmt, cms


def printnoln(text):
    """Print *text* without a trailing newline and flush immediately."""
    print(text, end='', flush=True)


def rprint(text):
    """Print *text* preceded by a carriage return (overwrites a progress line)."""
    print('\r' + text)


# A post counts as "new user" when it was created at most this many days
# after its owner's first contribution.
DAYS_NEW_USER = 7
# Not referenced anywhere in this file.
OLD_USER_YEAR = 3
# Users ranked at or above this fraction (by post count in the interval)
# are treated as "old" (most posting) users.
OLD_USER_PERCENTILE = 0.95

analyser = SentimentIntensityAnalyzer()
figsaver = FigSaver()

# One colour per entry of `postcounts` in the combined new-user figure.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# (sentiment key, panel title, axes position, (bin start, bin stop, bin edges))
_PANELS = (
    ('neg', 'Neg', (0, 0), (0, 1, 1 * 100)),
    ('neu', 'Neu', (1, 0), (0, 1, 1 * 100)),
    ('pos', 'Pos', (0, 1), (0, 1, 1 * 100)),
    ('compound', 'Compound', (1, 1), (-1, 1, 2 * 100)),
)

_DATE_FORMAT = "%d-%m-%Y"


def _new_sentiment_figure():
    """Create a 2x2 figure whose panels are titled Neg / Neu / Pos / Compound."""
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    for _key, title, position, _bins in _PANELS:
        axs[position].set_title(title)
    return fig, axs


def _draw_histograms(axs, levels, **hist_kwargs):
    """Draw one log-scaled histogram per sentiment key of *levels* onto *axs*.

    *levels* maps each sentiment key to the data passed to ``Axes.hist``;
    extra keyword arguments are forwarded to ``Axes.hist`` unchanged.
    """
    for key, _title, position, (low, high, edges) in _PANELS:
        ax = axs[position]
        ax.hist(levels[key], np.linspace(low, high, edges), **hist_kwargs)
        ax.set_yscale('log')


def _split_levels(toxlevels):
    """Turn a list of score dicts into one flat list per sentiment key."""
    return {key: [item[key] for item in toxlevels]
            for key, _title, _position, _bins in _PANELS}


def _filter_new_user_posts(newposts, firstcontrib, max_posts):
    """Keep at most *max_posts* posts per user, all from the new-user window."""
    seen = defaultdict(int)
    kept = []
    for post in newposts:
        userid = post['OwnerUserId']
        # Skip posts created after the new-user window of their owner.
        if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']:
            continue
        # No more than max_posts posts from one user.
        seen[userid] += 1
        if seen[userid] > max_posts:
            continue
        kept.append(post)
    return kept


def _most_active_users(newposts):
    """Return the ids of the users ranked above OLD_USER_PERCENTILE by post count."""
    newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
    userposts = {u: 0 for u in newuserids}
    for p in newposts:
        userposts[p['OwnerUserId']] += 1
    ranked = sorted(userposts.items(), key=operator.itemgetter(1))
    userids = [k for k, v in ranked]
    return set(userids[ceil(len(userids) * OLD_USER_PERCENTILE):])


def _collect_toxlevels(filteredposts, cachedsentiments):
    """Look up the cached sentiment of every answer to *filteredposts*.

    Prints a progress line per post.  Answers without a cached sentiment are
    reported and skipped, so no undefined or stale value is ever appended.
    """
    toxlevels = []
    total = str(len(filteredposts))
    for i, post in enumerate(filteredposts):
        printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + total)
        for a in post['Answers']:
            if a['Id'] not in cachedsentiments:
                print("Sentiment not found for " + str(a['Id']))
                continue
            toxlevels.append(cachedsentiments[a['Id']])
    return toxlevels


def _save_histogram_figure(levels, title, pngname, progress):
    """Plot *levels* into a fresh 2x2 figure, save it as *pngname* and close it."""
    fig, axs = _new_sentiment_figure()
    _draw_histograms(axs, levels)
    fig.suptitle(title)
    # figsaver.save(fig, pngname, bbox_inches='tight') is disabled; the figure
    # is saved synchronously instead.
    printnoln("\r" + progress + " ... plotting ... saving ...")
    fig.savefig(pngname, bbox_inches='tight')
    plt.close(fig)


def main(folder):
    """Produce all batch sentiment plots and dumps for the data set in *folder*.

    :param folder: data set directory; it is passed to ``load`` and must
        contain ``output/sentiments.py`` exposing an ``answers`` mapping from
        answer id to a score dict with 'neg', 'neu', 'pos' and 'compound'.
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts)

    cachedsentiments = imprt(folder + "/output/sentiments.py").answers

    outfolder = folder + "/output/batch/"
    # os.makedirs instead of shelling out to `mkdir -p`: works with spaces or
    # shell metacharacters in the path and raises if creation fails.
    os.makedirs(outfolder, exist_ok=True)

    postcounts = range(1, 5 + 1)
    labels = [str(option_posts) + " posts" for option_posts in postcounts]
    dataset = folder.split("/")[-1]

    for (option_date_from, option_date_to) in intervals:
        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(
            lambda p: option_date_from <= p['CreationDate'] < option_date_to,
            "filter posts by dates").getresults()
        if len(newposts) == 0:
            continue

        date_from = option_date_from.strftime(_DATE_FORMAT)
        date_to = option_date_to.strftime(_DATE_FORMAT)
        print("computing toxic levels: " + date_from + " to " + date_to)

        goutfilenamenewusers = (outfolder + "batch_newusers_" + dataset
                                + "_" + date_from + "_" + date_to)
        goutfilenameoldusers = (outfolder + "batch_oldusers_" + dataset
                                + "_" + date_from + "_" + date_to)

        # for new users ---------------------------------------------------
        gfig, gaxs = _new_sentiment_figure()
        # Per sentiment key: one list of scores for every entry of postcounts.
        glevels = {key: [] for key, _title, _position, _bins in _PANELS}

        for option_posts in postcounts:
            # compute toxic levels
            start = cms()
            printnoln("computing toxic levels: filtering")
            filteredposts = _filter_new_user_posts(newposts, firstcontrib, option_posts)
            toxlevels = _collect_toxlevels(filteredposts, cachedsentiments)

            progress = ("computing toxic levels: post " + str(len(filteredposts))
                        + "/" + str(len(filteredposts)))
            printnoln("\r" + progress + " ... plotting ...")

            outfilename = goutfilenamenewusers + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            levels = _split_levels(toxlevels)
            for key in glevels:
                glevels[key].append(levels[key])

            _save_histogram_figure(
                levels,
                "Sentiment of answers to the first " + str(option_posts)
                + " (max) posts within 1 week of 1st contribution\nPosts created between "
                + date_from + " to " + date_to,
                outfilename + ".png",
                progress)
            rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")

        # global: all post counts combined into one figure
        start = cms()
        printnoln("\rglobal plot post ... plotting ...")
        _draw_histograms(gaxs, glevels, color=colors[:len(postcounts)], label=labels)
        for _key, _title, position, _bins in _PANELS:
            gaxs[position].legend(loc="upper right")
        gfig.suptitle(
            "Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between "
            + date_from + " to " + date_to)
        # figsaver.save(gfig, goutfilenamenewusers + ".png", bbox_inches='tight')
        # is disabled; the figure is saved synchronously instead.
        printnoln("\rglobal plot post ... plotting ... saving ...")
        gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
        plt.close(gfig)
        rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")

        # for old users ---------------------------------------------------
        start = cms()
        oldusers = _most_active_users(newposts)
        filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
        toxlevels = _collect_toxlevels(filteredposts, cachedsentiments)

        progress = ("computing toxic levels: post " + str(len(filteredposts))
                    + "/" + str(len(filteredposts)))
        printnoln("\r" + progress + " ... plotting ...")

        dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")

        _save_histogram_figure(
            _split_levels(toxlevels),
            "Sentiment of answers to posts by most posting users (95%tile)\nPosts created between "
            + date_from + " to " + date_to,
            goutfilenameoldusers + ".png",
            progress)
        rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")

        # NOTE(review): presumably waits for saves queued on figsaver; nothing
        # is queued while figsaver.save is disabled - confirm against FigSaver.
        figsaver.join()

    figsaver.join()


def computeToxLevel(text):
    """Return the VADER polarity scores of *text*."""
    return analyser.polarity_scores(text)


def dumptoxlevels(lvls, filename):
    """Write *lvls* to *filename* as a Python module defining ``toxlevels``.

    :param lvls: the levels to dump; their ``str()`` form is written verbatim.
    :param filename: path of the module file to (over)write.
    """
    with open(filename, "w") as file:
        file.write("from collections import defaultdict\n\n")
        # str() of a defaultdict(list) starts with "defaultdict(<class 'list'>, ";
        # rewriting the factory to "list" keeps the dumped module importable.
        # Plain lists and dicts contain no such text and are written unchanged.
        file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    main(folder)