"""Batch sentiment analysis of answers given to posts of new users.

For every date interval returned by ``calc_intervals`` the script scores the
answers to the first N posts (N = 1..5) that users wrote within
``DAYS_NEW_USER`` days of their first contribution, dumps the scores to a
``.py`` file and renders histograms of the neg/neu/pos/compound scores.
"""
import os
import sys
from collections import defaultdict
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from common import calc_intervals
from loader import load, dmt, cms


def printnoln(text):
    """Print ``text`` without a trailing newline and flush stdout."""
    print(text, end='', flush=True)


def rprint(text):
    """Print ``text`` preceded by a carriage return (overwrites the line)."""
    print('\r' + text)


DAYS_NEW_USER = 7
OLD_USER_YEAR = 3

analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# One entry per histogram panel: (axes index, title, score key, bin edges).
_PANELS = (
    ((0, 0), 'Neg', 'neg', np.linspace(0, 1, 1 * 100)),
    ((1, 0), 'Neu', 'neu', np.linspace(0, 1, 1 * 100)),
    ((0, 1), 'Pos', 'pos', np.linspace(0, 1, 1 * 100)),
    ((1, 1), 'Compound', 'compound', np.linspace(-1, 1, 2 * 100)),
)


def _select_new_user_posts(posts, firstcontrib, max_posts):
    """Return the posts written by "new" users, capped per user.

    A post is kept when it was created no later than ``DAYS_NEW_USER`` days
    after its owner's first contribution, and only the first ``max_posts``
    such posts of each user (in iteration order) are kept.
    """
    seen_per_user = defaultdict(int)
    selected = []
    for post in posts:
        userid = post['OwnerUserId']
        # check first contribution
        if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']:
            continue
        # no more than max_posts posts from one user
        seen_per_user[userid] += 1
        if seen_per_user[userid] > max_posts:
            continue
        selected.append(post)
    return selected


def _score_answers(filteredposts, cachedsentiments):
    """Score every answer of ``filteredposts`` and group the scores by post id.

    ``cachedsentiments`` maps answer id -> score dict and is updated in place
    so an answer is only analysed once across all calls.  Posts without
    answers do not get an entry in the returned ``defaultdict(list)``.
    """
    toxlevels = defaultdict(list)
    total = len(filteredposts)
    for index, post in enumerate(filteredposts, start=1):
        # Progress every 100 posts and on the last one.  The original compared
        # against the unfiltered post count, so the final update was skipped
        # whenever any post had been filtered out.
        if index % 100 == 0 or index == total:
            printnoln("\rcomputing toxic levels: post #" + str(index) + "/" + str(total))
        for answer in post['Answers']:
            answer_id = answer['Id']
            if answer_id in cachedsentiments:
                toxlevel = cachedsentiments[answer_id]
            else:
                toxlevel = computeToxLevel(answer['Body'])
                cachedsentiments[answer_id] = toxlevel
            toxlevels[post['Id']].append(toxlevel)
    return toxlevels


def _draw_panels(axs, series, with_legend=False, **hist_kwargs):
    """Draw the four log-scaled score histograms onto a 2x2 axes grid.

    ``series`` maps each score key ('neg', 'neu', 'pos', 'compound') to the
    data handed to ``Axes.hist``; extra keyword arguments are forwarded to
    ``hist`` unchanged.
    """
    for position, title, key, bins in _PANELS:
        ax = axs[position]
        ax.set_title(title)
        ax.hist(series[key], bins, **hist_kwargs)
        if with_legend:
            ax.legend(loc="upper right")
        ax.set_yscale('log')


def main(folder):
    """Compute and plot answer sentiment for new-user posts found in ``folder``.

    Output files (score dumps and PNG histograms) are written below
    ``<folder>/output/batch/``.
    """
    # load() yields four values; only the posts and the per-user first
    # contribution dates are needed here.
    _, posts, firstcontrib, _ = load(folder)

    intervals = calc_intervals(posts)

    cachedsentiments = {}
    postcounts = range(1, 5 + 1)
    labels = [str(count) + " posts" for count in postcounts]

    outfolder = folder + "/output/batch/"
    dataset = folder.split("/")[-1]

    for (option_date_from, option_date_to) in intervals:
        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
        if len(newposts) == 0:
            continue

        date_from = option_date_from.strftime("%d-%m-%Y")
        date_to = option_date_to.strftime("%d-%m-%Y")
        print("computing toxic levels: " + date_from + " to " + date_to)

        # Create the directory without going through a shell, so folder names
        # with spaces or shell metacharacters work and cannot inject commands.
        os.makedirs(outfolder, exist_ok=True)

        goutfilenamenewusers = outfolder + "batch_newusers_" + dataset + "_" + date_from + "_" + date_to

        # Per score key: one list of values for every entry of postcounts.
        gseries = {key: [] for (_, _, key, _) in _PANELS}

        for option_posts in postcounts:
            # compute toxic levels
            start = cms()
            printnoln("computing toxic levels: filtering")
            filteredposts = _select_new_user_posts(newposts, firstcontrib, option_posts)
            toxlevels = _score_answers(filteredposts, cachedsentiments)
            # presumably cms() is a millisecond clock (the message says "ms")
            rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")

            outfilename = goutfilenamenewusers + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            allscores = flatmap(toxlevels.values())
            levels = {key: [item[key] for item in allscores] for key in gseries}
            for key, values in levels.items():
                gseries[key].append(values)

            fig, axs = plt.subplots(2, 2, figsize=(16, 12))
            _draw_panels(axs, levels)
            fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between " + date_from + " to " + date_to)
            fig.savefig(outfilename + ".png", bbox_inches='tight')
            plt.close(fig)

        # global: all post counts side by side in one figure
        gfig, gaxs = plt.subplots(2, 2, figsize=(16, 12))
        _draw_panels(gaxs, gseries, with_legend=True, color=colors[:len(postcounts)], label=labels)
        gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + date_from + " to " + date_to)
        gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
        plt.close(gfig)


def computeToxLevel(text):
    """Return the VADER polarity scores of ``text``.

    The result is the dict produced by ``polarity_scores``; callers in this
    file read its 'neg', 'neu', 'pos' and 'compound' entries.
    """
    return analyser.polarity_scores(text)


def flatmap(arr):
    """Flatten an iterable of iterables by one level into a list."""
    return [item for sublist in arr for item in sublist]


def dumptoxlevels(lvls, filename):
    """Write ``lvls`` to ``filename`` as an importable Python module.

    The file defines a single name, ``toxlevels``.  ``lvls`` is expected to be
    a ``defaultdict(list)``: its repr starts with
    ``defaultdict(<class 'list'>, ...`` which is not valid Python, so the
    factory is rewritten to the plain name ``list``.
    """
    with open(filename, "w") as out:
        out.write("from collections import defaultdict\n\n")
        # The original replaced the empty string, which merely prefixed the
        # repr with "list" and left the invalid "<class 'list'>" in place.
        out.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")


if __name__ == "__main__":
    # execute only if run as a script
    # NOTE(review): the argument placeholder was missing from the usage text;
    # "<folder>" is restored here as the evident intent.
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    main(folder)