"""Batch sentiment analysis of the answers given to posts of new users.

For every date interval returned by ``calc_intervals`` the script scores the
answers to the selected posts with VADER, dumps the raw scores as an
importable Python file and saves histogram figures (one per maximum post
count, plus one combined figure per interval).
"""
from datetime import datetime
from datetime import timedelta
import sys
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from loader import load, dmt, cms
import math
from common import calc_intervals


def printnoln(text):
    """Print `text` without a trailing newline and flush immediately."""
    print(text, end='', flush=True)


def rprint(text):
    """Print `text` preceded by a carriage return (overwrites the progress line)."""
    print('\r' + text)


# A post only counts if it was created at most this many days after the
# owner's first contribution.
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3

analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# (subplot position, subplot title, key in the sentiment score dict)
_PANELS = (
    ((0, 0), 'Neg', 'neg'),
    ((1, 0), 'Neu', 'neu'),
    ((0, 1), 'Pos', 'pos'),
    ((1, 1), 'Compound', 'compound'),
)


def _bins(key):
    """Return the histogram bin edges used for the score named `key`."""
    if key == 'compound':
        # The compound panel is binned over [-1, 1].
        return np.linspace(-1, 1, 2 * 100)
    return np.linspace(0, 1, 1 * 100)


def _save_histograms(levels, title, filename, **hist_kwargs):
    """Draw the four log-scaled score histograms and save them to `filename`.

    `levels` maps each score key of `_PANELS` to the data passed to
    ``Axes.hist`` (a flat list, or a list of lists for grouped histograms).
    Extra keyword arguments are forwarded to ``Axes.hist``; a legend is drawn
    only when a ``label`` is supplied.
    """
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    for position, panel_title, key in _PANELS:
        ax = axs[position]
        ax.set_title(panel_title)
        ax.hist(levels[key], _bins(key), **hist_kwargs)
        if 'label' in hist_kwargs:
            ax.legend(loc="upper right")
        ax.set_yscale('log')
    fig.suptitle(title)
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)


def _select_early_posts(newposts, firstcontrib, max_posts):
    """Return the posts of `newposts` that count as "new user" posts.

    A post is kept when it was created no later than DAYS_NEW_USER days after
    ``firstcontrib[owner]`` and the owner has not already had `max_posts`
    posts pass that time check (in the iteration order of `newposts`).
    """
    window = timedelta(days=DAYS_NEW_USER)
    seen = defaultdict(int)
    selected = []
    for post in newposts:
        userid = post['OwnerUserId']

        # check first contribution
        if firstcontrib[userid] + window < post['CreationDate']:
            continue

        # no more than max_posts posts from one user
        seen[userid] += 1
        if seen[userid] > max_posts:
            continue

        selected.append(post)
    return selected


def _score_answers(filteredposts, cachedsentiments):
    """Score every answer of `filteredposts`, grouped by the post owner's id.

    `cachedsentiments` maps answer ids to already computed scores; it is read
    and extended in place so that an answer is only analysed once.
    """
    toxlevels = defaultdict(list)
    total = len(filteredposts)
    for (i, post) in enumerate(filteredposts):
        # Progress output every 100 posts and on the very last post.
        if (i + 1) % 100 == 0 or (i + 1) == total:
            printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(total))
        userid = post['OwnerUserId']
        for a in post['Answers']:
            answerid = a['Id']
            if answerid in cachedsentiments:
                toxlevel = cachedsentiments[answerid]
            else:
                toxlevel = computeToxLevel(a['Body'])
                cachedsentiments[answerid] = toxlevel
            toxlevels[userid].append(toxlevel)
    return toxlevels


def main(folder):
    """Run the batch analysis for the data set stored in `folder`.

    Output is written to ``<folder>/output/batch/``: per interval and per
    maximum post count (1..5) a ``.py`` dump of the scores and a ``.png``
    histogram figure, plus one combined ``.png`` per interval.
    """
    # NOTE(review): firstcontrib is indexed by a post's 'OwnerUserId' and
    # compared against 'CreationDate' -- presumably user id -> datetime of the
    # first contribution; confirm against loader.load.
    users, posts, firstcontrib, sumcontrib = load(folder)

    intervals = calc_intervals(posts)

    cachedsentiments = {}
    postcounts = range(1, 5 + 1)
    labels = [str(option_posts) + " posts" for option_posts in postcounts]
    outfolder = folder + "/output/batch/"
    # normpath makes a trailing slash in `folder` harmless.
    datasetname = os.path.basename(os.path.normpath(folder))

    for (option_date_from, option_date_to) in intervals:
        # posts created in [option_date_from, option_date_to)
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
        if len(newposts) == 0:
            continue

        datefrom = option_date_from.strftime("%d-%m-%Y")
        dateto = option_date_to.strftime("%d-%m-%Y")
        print("computing toxic levels: " + datefrom + " to " + dateto)

        # Create the folder without going through a shell.
        os.makedirs(outfolder, exist_ok=True)
        goutfilename = outfolder + "batch_" + datasetname + "_" + datefrom + "_" + dateto

        # score key -> one flat list of scores per entry of postcounts
        grouped = {key: [] for (_, _, key) in _PANELS}

        for option_posts in postcounts:
            # compute toxic levels
            # NOTE(review): cms() is presumably a millisecond timestamp (the
            # difference is printed with an "ms" suffix) -- confirm in loader.
            start = cms()
            printnoln("computing toxic levels: filtering")
            filteredposts = _select_early_posts(newposts, firstcontrib, option_posts)
            toxlevels = _score_answers(filteredposts, cachedsentiments)
            rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")

            outfilename = goutfilename + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            # Flatten once and split into one list per score key.
            scores = flatmap(toxlevels.values())
            levels = {key: [item[key] for item in scores] for (_, _, key) in _PANELS}
            for key, values in levels.items():
                grouped[key].append(values)

            _save_histograms(
                levels,
                "Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between " + datefrom + " to " + dateto,
                outfilename + ".png",
            )

        # global: all post counts side by side in one figure
        _save_histograms(
            grouped,
            "Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + datefrom + " to " + dateto,
            goutfilename + ".png",
            color=colors[:len(postcounts)],
            label=labels,
        )


def computeToxLevel(text):
    """Return the VADER polarity scores of `text` (``analyser.polarity_scores``)."""
    return analyser.polarity_scores(text)


def flatmap(arr):
    """Flatten an iterable of iterables into a single list (one level deep)."""
    return [item for sublist in arr for item in sublist]


def dumptoxlevels(lvls, filename):
    """Write `lvls` to `filename` as a Python module defining ``toxlevels``.

    `lvls` is expected to be a ``defaultdict(list)``; its ``str()`` starts
    with ``defaultdict(<class 'list'>, ...``, which is not valid Python, so
    the type repr is replaced by the name ``list`` to keep the file importable.
    """
    with open(filename, "w") as file:
        file.write("from collections import defaultdict\n\n")
        file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    main(folder)