diff --git a/analyze_batch.py b/analyze_batch.py index 201a5bc..6b029e7 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -1,13 +1,15 @@ +import operator import os import sys from collections import defaultdict from datetime import timedelta +from math import ceil import matplotlib.pyplot as plt import numpy as np from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer -from common import calc_intervals +from common import calc_intervals, imprt, FigSaver from loader import load, dmt, cms printnoln = lambda text: print(text, end='', flush=True) @@ -15,8 +17,10 @@ rprint = lambda text: print('\r' + text) DAYS_NEW_USER = 7 OLD_USER_YEAR = 3 +OLD_USER_PERCENTILE = 0.95 analyser = SentimentIntensityAnalyzer() +figsaver = FigSaver() colors = ['red', 'green', 'blue', 'orange', 'deeppink'] @@ -24,7 +28,10 @@ def main(folder): users, posts, firstcontrib, sumcontrib = load(folder) intervals = calc_intervals(posts) - cachedsentiments = {} + cachedsentiments = imprt(folder + "/output/sentiments.py").answers + + outfolder = folder + "/output/batch/" + os.system("mkdir -p " + outfolder) postcounts = range(1, 5 + 1) for (option_date_from, option_date_to) in intervals: @@ -44,8 +51,6 @@ def main(folder): gpos = [] gcom = [] - outfolder = folder + "/output/batch/" - os.system("mkdir -p " + outfolder) goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") @@ -55,7 +60,7 @@ def main(folder): # computer toxic levels start = cms() printnoln("computing toxic levels: filtering") - toxlevels = defaultdict(list) + toxlevels = [] searchedposts = defaultdict(int) filteredposts = [] for (i, post) in enumerate(newposts): @@ -73,26 +78,22 @@ def main(folder): filteredposts.append(post) for (i, post) in enumerate(filteredposts): - if (i + 1) % 100 == 0: - printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts))) - if (i + 1) == len(newposts): - printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts))) + printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts))) for a in post['Answers']: if a['Id'] in cachedsentiments.keys(): toxlevel = cachedsentiments[a['Id']] else: - toxlevel = computeToxLevel(a['Body']) - cachedsentiments[a['Id']] = toxlevel - toxlevels[post['Id']].append(toxlevel) - rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms") + print("Sentiment not found for " + a['Id']) + toxlevels.append(toxlevel) + printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...") outfilename = goutfilenamenewusers + "_" + str(option_posts) dumptoxlevels(toxlevels, outfilename + ".py") - neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())] - neulevelsflat = [item['neu'] for item in flatmap(toxlevels.values())] - poslevelsflat = [item['pos'] for item in flatmap(toxlevels.values())] - comlevelsflat = [item['compound'] for item in flatmap(toxlevels.values())] + neglevelsflat = [item['neg'] for item in toxlevels] + neulevelsflat = [item['neu'] for item in toxlevels] + poslevelsflat = [item['pos'] for item in toxlevels] + comlevelsflat = [item['compound'] for item in toxlevels] gneg.append(neglevelsflat) gneu.append(neulevelsflat) @@ -116,10 +117,15 @@ def main(folder): # plt.show() fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + # figsaver.save(fig, outfilename + ".png", bbox_inches='tight') + printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...") fig.savefig(outfilename + ".png", bbox_inches='tight') plt.close(fig) + rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms") # global + start = cms() + printnoln("\rglobal plot post ... plotting ...") gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) @@ -132,19 +138,74 @@ def main(folder): gaxs[1, 0].set_yscale('log') gaxs[0, 1].set_yscale('log') gaxs[1, 1].set_yscale('log') - gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + gfig.suptitle( + "Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime( + "%d-%m-%Y")) + # figsaver.save(gfig, goutfilenamenewusers + ".png", bbox_inches='tight') + printnoln("\rglobal plot post ... plotting ... saving ...") gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight') plt.close(gfig) + rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms") + + # for old users --------------------------------------------------------------------------------- + start = cms() + newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults()) + userposts = {u: 0 for u in newuserids} + for p in newposts: + userposts[p['OwnerUserId']] += 1 + userposts = sorted(userposts.items(), key=operator.itemgetter(1)) + oldusers = [k for k, v in userposts] + oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):]) + filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults() + + toxlevels = [] + for (i, post) in enumerate(filteredposts): + printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts))) + for a in post['Answers']: + if a['Id'] in cachedsentiments.keys(): + toxlevel = cachedsentiments[a['Id']] + else: + print("Sentiment not found for " + a['Id']) + toxlevels.append(toxlevel) + printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...") + + dumptoxlevels(toxlevels, goutfilenameoldusers + ".py") + + neglevelsflat = [item['neg'] for item in toxlevels] + neulevelsflat = [item['neu'] for item in toxlevels] + poslevelsflat = [item['pos'] for item in toxlevels] + comlevelsflat = [item['compound'] for item in toxlevels] + + fig, axs = plt.subplots(2, 2, figsize=(16, 12)) + axs[0, 0].set_title('Neg') + axs[1, 0].set_title('Neu') + axs[0, 1].set_title('Pos') + axs[1, 1].set_title('Compound') + axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100)) + axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100)) + axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100)) + axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100)) + axs[0, 0].set_yscale('log') + axs[1, 0].set_yscale('log') + axs[0, 1].set_yscale('log') + axs[1, 1].set_yscale('log') + + # plt.show() + fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + # figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight') + printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...") + fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight') + plt.close(fig) + rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms") + + figsaver.join() + figsaver.join() def computeToxLevel(text): return analyser.polarity_scores(text) -def flatmap(arr): - return [item for sublist in arr for item in sublist] - - def dumptoxlevels(lvls, filename): with open(filename, "w") as file: file.write("from collections import defaultdict\n\n") diff --git a/calctoxdiff.py b/calctoxdiff.py index 5a17628..82e606e 100644 --- a/calctoxdiff.py +++ b/calctoxdiff.py @@ -1,4 +1,3 @@ -import importlib import os import sys from collections import defaultdict @@ -10,27 +9,36 @@ import matplotlib.pyplot as plt import numpy as np from scipy.stats import ks_2samp +from common import imprt + colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'} +IMAGE_MAGICK = "magick" def main(folder): outputdir = folder + "/output/ksbatch/" os.system("mkdir -p " + outputdir) - folder = folder + "/output/batch/" + srcfolder = folder + "/output/batch/" - onlyfiles = [folder + f for f in listdir(folder) if isfile(join(folder, f)) and f.endswith(".py")] + onlyfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "newusers" in f] onlyfiles = sorted(onlyfiles) - plotbypost(onlyfiles, outputdir) - plotbydate(onlyfiles, outputdir) + # plotbypost(onlyfiles, outputdir) + # plotbydate(onlyfiles, outputdir) + + oldfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "oldusers" in f] + oldfiles = sorted(oldfiles) + + plotbydateold(onlyfiles, oldfiles, outputdir) def plotbypost(onlyfiles, outputdir): + print("plotbypost") files = defaultdict(list) for f in onlyfiles: s = f[:-3].split("_") - files[int(s[4])].append(f) - files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[2], "%d-%m-%Y")) for (p, l) in files.items()} + files[int(s[5])].append(f) + files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()} changes_neg = defaultdict(list) changes_neu = defaultdict(list) @@ -45,15 +53,15 @@ def plotbypost(onlyfiles, outputdir): tox1 = imprt(l[i]).toxlevels tox2 = imprt(l[i + 1]).toxlevels - neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] - neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] - poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())] - comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())] + neglevelsflat1 = [item['neg'] for item in tox1] + neulevelsflat1 = [item['neu'] for item in tox1] + poslevelsflat1 = [item['pos'] for item in tox1] + comlevelsflat1 = [item['compound'] for item in tox1] - neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())] - neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())] - poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())] - comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())] + neglevelsflat2 = [item['neg'] for item in tox2] + neulevelsflat2 = [item['neu'] for item in tox2] + poslevelsflat2 = [item['pos'] for item in tox2] + comlevelsflat2 = [item['compound'] for item in tox2] ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) @@ -74,7 +82,7 @@ def plotbypost(onlyfiles, outputdir): + "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n") # pval for (p, l) in files.items(): - x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)] + x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items(): pval = [x.pvalue for x in changes] @@ -92,7 +100,7 @@ def plotbypost(onlyfiles, outputdir): # stat for (p, l) in files.items(): - x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)] + x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items(): stat = [x.statistic for x in changes] @@ -110,11 +118,12 @@ def plotbypost(onlyfiles, outputdir): def plotbydate(onlyfiles, outputdir): + print("plotbydate") files = defaultdict(list) for f in onlyfiles: s = f[:-3].split("_") - files[(s[2], s[3])].append(f) - files = {d: sorted(l, key=lambda e: e.split("_")[4]) for (d, l) in files.items()} + files[(s[3], s[4])].append(f) + files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()} changes_neg = defaultdict(list) changes_neu = defaultdict(list) @@ -129,15 +138,15 @@ def plotbydate(onlyfiles, outputdir): tox1 = imprt(l[i]).toxlevels tox2 = imprt(l[i + 1]).toxlevels - neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] - neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] - poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())] - comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())] + neglevelsflat1 = [item['neg'] for item in tox1] + neulevelsflat1 = [item['neu'] for item in tox1] + poslevelsflat1 = [item['pos'] for item in tox1] + comlevelsflat1 = [item['compound'] for item in tox1] - neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())] - neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())] - poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())] - comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())] + neglevelsflat2 = [item['neg'] for item in tox2] + neulevelsflat2 = [item['neu'] for item in tox2] + poslevelsflat2 = [item['pos'] for item in tox2] + comlevelsflat2 = [item['compound'] for item in tox2] ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) @@ -159,7 +168,7 @@ def plotbydate(onlyfiles, outputdir): # pval for (d, l) in files.items(): - x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)] + x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): pval = [x.pvalue for x in changes] @@ -177,7 +186,7 @@ def plotbydate(onlyfiles, outputdir): # stat for (d, l) in files.items(): - x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)] + x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): stat = [x.statistic for x in changes] @@ -194,15 +203,122 @@ def plotbydate(onlyfiles, outputdir): plt.close(fig) -def imprt(file): - spec = importlib.util.spec_from_file_location("module.name", file) - foo = importlib.util.module_from_spec(spec) - spec.loader.exec_module(foo) - return foo +def plotbydateold(onlyfiles, oldfiles, outputdir): + print("plotbydateold") + files = defaultdict(list) + for f in onlyfiles: + s = f[:-3].split("_") + files[(s[3], s[4])].append(f) + dates = sorted(files.keys(), key=lambda e: "-".join(reversed(e[0].split("-")))) + files = {d: files[d] for d in dates} + files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()} + oldfiles = {(f[:-3].split("_")[3], f[:-3].split("_")[4]): f for f in oldfiles} + changes_neg = defaultdict(list) + changes_neu = defaultdict(list) + changes_pos = defaultdict(list) + changes_com = defaultdict(list) -def flatmap(arr): - return [item for sublist in arr for item in sublist] + for (d, l) in files.items(): + print(d) + toxold = imprt(oldfiles[d]).toxlevels + + neglevelsold = [item['neg'] for item in toxold] + neulevelsold = [item['neu'] for item in toxold] + poslevelsold = [item['pos'] for item in toxold] + comlevelsold = [item['compound'] for item in toxold] + + for i in range(len(l)): + tox1 = imprt(l[i]).toxlevels + if len(tox1) == 0 or len(toxold) == 0: + changes_neg[d].append(None) + changes_neu[d].append(None) + changes_pos[d].append(None) + changes_com[d].append(None) + continue + + neglevelsflat1 = [item['neg'] for item in tox1] + neulevelsflat1 = [item['neu'] for item in tox1] + poslevelsflat1 = [item['pos'] for item in tox1] + comlevelsflat1 = [item['compound'] for item in tox1] + + ksneg = ks_2samp(neglevelsflat1, neglevelsold) + ksneu = ks_2samp(neulevelsflat1, neulevelsold) + kspos = ks_2samp(poslevelsflat1, poslevelsold) + kscom = ks_2samp(comlevelsflat1, comlevelsold) + + changes_neg[d].append(ksneg) + changes_neu[d].append(ksneu) + changes_pos[d].append(kspos) + changes_com[d].append(kscom) + + print("logs") + for (d, l) in files.items(): + # print(d) + # print("neg is: " + str(len(changes_neg[d])) + " should: " + str(len(l))) + # print("neu is: " + str(len(changes_neu[d])) + " should: " + str(len(l))) + # print("pos is: " + str(len(changes_pos[d])) + " should: " + str(len(l))) + # print("com is: " + str(len(changes_com[d])) + " should: " + str(len(l))) + f1 = oldfiles[d] + with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f: + for i in range(len(l)): + if changes_neg[d][i] is None: + continue + f2 = l[i] + f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[d][i]) + "; ks neu = " + str(changes_neu[d][i]) + + "; ks pos = " + str(changes_pos[d][i]) + "; ks com = " + str(changes_com[d][i]) + "\n") + + # pval + print("pval") + imgmagickcmd = IMAGE_MAGICK + for (d, l) in files.items(): + print(d) + x = [l[i][:-3].split("_")[5] for i in range(len(l))] + fig = plt.figure(figsize=(16, 12)) + for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): + pval = [(xx, c.pvalue) for xx, c in zip(x, changes) if c is not None] + plt.plot([p[0] for p in pval], [p[1] for p in pval], label=type + ".pval", color=colors[type]) + if len(pval) == 0: + continue + mean = np.mean([p[1] for p in pval]) + std = np.std([p[1] for p in pval]) + dev = [(xx, s) for (xx, s) in pval if s <= mean - std or s >= mean + std] + plt.plot([p[0] for p in pval], [mean] * len(pval), color=colors[type], ls='dashed') + plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o') + plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1]) + plt.xticks(rotation=90) + plt.legend(loc="upper right") + outfile = outputdir + "/ks_olddate_pval_" + d[0] + "_" + d[1] + ".png" + plt.savefig(outfile, bbox_inches='tight') + plt.close(fig) + imgmagickcmd += " " + outfile + os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval.pdf") + + # stat + print("stat") + imgmagickcmd = IMAGE_MAGICK + for (d, l) in files.items(): + print(d) + x = [l[i][:-3].split("_")[5] for i in range(len(l))] + fig = plt.figure(figsize=(16, 12)) + for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): + stat = [(xx, c.statistic) for xx, c in zip(x, changes) if c is not None] + plt.plot([p[0] for p in stat], [p[1] for p in stat], label=type + ".stat", color=colors[type]) + if len(stat) == 0: + continue + mean = np.mean([p[1] for p in stat]) + std = np.std([p[1] for p in stat]) + dev = [(xx, s) for (xx, s) in stat if s <= mean - std or s >= mean + std] + plt.plot([p[0] for p in stat], [mean] * len(stat), color=colors[type], ls='dashed') + plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o') + plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1]) + plt.xticks(rotation=90) + plt.legend(loc="upper right") + outfile = outputdir + "/ks_olddate_stat_" + d[0] + "_" + d[1] + ".png" + plt.savefig(outfile, bbox_inches='tight') + plt.close(fig) + imgmagickcmd += " " + outfile + os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat.pdf") def filecmp(file1, file2): diff --git a/common.py b/common.py index 7853a3e..a219873 100644 --- a/common.py +++ b/common.py @@ -1,8 +1,14 @@ +import importlib +from threading import Thread, Lock + +import matplotlib.pyplot as plt + from loader import dmt def calc_intervals(posts): - firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults() + firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], + "firstpost").getresults() lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults() # calc quarter beginning @@ -25,5 +31,32 @@ def calc_intervals(posts): print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) intervals.append((cdate, nextquarter)) cdate = nextquarter - # sys.exit(0) return intervals + + +def imprt(file): + spec = importlib.util.spec_from_file_location("module.name", file) + foo = importlib.util.module_from_spec(spec) + spec.loader.exec_module(foo) + return foo + + +class FigSaver(): + def __init__(self): + self.__lock = Lock() + self.__threads = [] + + def save(self, fig, path, **kwargs): + thread = Thread(target=self.__dosave, args=(fig, path, kwargs)) + with self.__lock: + self.__threads.append(thread) + thread.start() + + def __dosave(self, fig, path, kwargs): + fig.savefig(path, **kwargs) + plt.close(fig) + + def join(self): + with self.__lock: + for thread in self.__threads: + thread.join() diff --git a/loader.py b/loader.py index 5477209..74f2b4b 100644 --- a/loader.py +++ b/loader.py @@ -1,5 +1,7 @@ +import html import multiprocessing import operator +import re import time import xml.etree.cElementTree as et from collections import defaultdict @@ -7,11 +9,13 @@ from datetime import datetime from mt import mt +TAG_RE = re.compile(r'<[^>]+>') + printnoln = lambda text: print(text, end='', flush=True) rprint = lambda text: print('\r' + text) -def dmt(data): return mt(multiprocessing.cpu_count(), data, False) +def dmt(data, progressinterval=1000): return mt(multiprocessing.cpu_count(), data, False, progressinterval) def cms(): return int(round(time.time() * 1000)) @@ -75,6 +79,7 @@ def mapQuestion(item): question = {tag: getTag(item, tag) for tag in tags} for tag in datetags: question[tag] = datetime.fromisoformat(question[tag]) + question['Body'] = removetags(html.unescape(question['Body'])) return question @@ -84,6 +89,7 @@ def mapAnswer(item): answer = {tag: getTag(item, tag) for tag in tags} for tag in datetags: answer[tag] = datetime.fromisoformat(answer[tag]) + answer['Body'] = removetags(html.unescape(answer['Body'])) return answer @@ -93,6 +99,7 @@ def mapComment(item): comment = {tag: getTag(item, tag) for tag in tags} for tag in datetags: comment[tag] = datetime.fromisoformat(comment[tag]) + comment['Body'] = removetags(html.unescape(comment['Body'])) return comment @@ -201,3 +208,7 @@ def tagExists(item, tag): def setprop(dic, key, value): dic[key] = value return dic + + +def removetags(text): + return TAG_RE.sub('', text) diff --git a/mt.py b/mt.py index 58ec26e..053c6a6 100644 --- a/mt.py +++ b/mt.py @@ -1,10 +1,10 @@ -from threading import Thread, Lock import time from math import ceil +from threading import Thread, Lock class mt(): - def __init__(self, threads, data, verbose=False): + def __init__(self, threads, data, verbose=False, progressinterval=1000): self.__running = False self.__closed = False self.__data = data @@ -21,6 +21,7 @@ class mt(): self.__lock = Lock() self.__results = [] self.__progress = 0 + self.__progressinterval = progressinterval for i in range(self.__threadcount): self.__results.append([]) self.__threads.append(None) @@ -35,9 +36,9 @@ class mt(): self.__running = True self.__final = self.__getresultsmapfilter self.__type = "filter" - self.__comment = comment if comment is not None else "" + self.__comment = comment if comment is not None: - print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) + print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) self.__starttime = self.__cms() self.__endtime = None for i in range(self.__threadcount): @@ -50,20 +51,22 @@ class mt(): now = self.__cms() results = [] - for j in range(ceil(len(list) / 1000)): - part = list[j * 1000: min((j + 1) * 1000, len(list))] + for j in range(ceil(len(list) / self.__progressinterval)): + part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))] results += [l for l in part if cond(l)] with self.__lock: self.__progress += len(part) if self.__comment is not None: print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) - # results = [l for l in list if cond(l)] with self.__lock: self.__results[i] = results dur = self.__cms() - now if self.__verbose: - print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms") + if self.__comment is not None: + print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms") + else: + print("Thread " + str(i) + ": filter took " + str(dur) + "ms") def map(self, func, comment=None): if self.__closed: @@ -75,7 +78,7 @@ class mt(): self.__running = True self.__final = self.__getresultsmapfilter self.__type = "map" - self.__comment = comment if comment is not None else "" + self.__comment = comment if comment is not None: print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) self.__starttime = self.__cms() @@ -89,20 +92,22 @@ class mt(): def __domap(self, i, list, func): now = self.__cms() results = [] - for j in range(ceil(len(list) / 1000)): - part = list[j * 1000: min((j + 1) * 1000, len(list))] + for j in range(ceil(len(list) / self.__progressinterval)): + part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))] results += [func(l) for l in part] with self.__lock: self.__progress += len(part) if self.__comment is not None: print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) - # results = [func(l) for l in list] with self.__lock: self.__results[i] = results dur = self.__cms() - now if self.__verbose: - print(self.__comment + ": Thread " + str(i) + ": map took " + str(dur) + "ms") + if self.__comment is not None: + print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms") + else: + print("Thread " + str(i) + ": filter took " + str(dur) + "ms") def reduce(self, reducer, aggregator, initval, comment=None): if self.__closed: @@ -114,9 +119,9 @@ class mt(): self.__running = True self.__final = lambda: self.__getresultsreduce(aggregator, initval) self.__type = "reduce" - self.__comment = comment if comment is not None else "" + self.__comment = comment if comment is not None: - print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) + print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) self.__starttime = self.__cms() self.__endtime = None for i in range(self.__threadcount): @@ -129,8 +134,8 @@ class mt(): now = self.__cms() val = initval() - for j in range(ceil(len(list) / 1000)): - part = list[j * 1000: min((j + 1) * 1000, len(list))] + for j in range(ceil(len(list) / self.__progressinterval)): + part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))] for k in range(len(part)): val = reducer(val, part[k]) with self.__lock: @@ -138,13 +143,14 @@ class mt(): if self.__comment is not None: print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) - # for j in range(len(list)): - # val = reducer(val, list[j]) with self.__lock: self.__results[i] = val dur = self.__cms() - now if self.__verbose: - print(self.__comment + ": Thread " + str(i) + ": reduce took " + str(dur) + "ms") + if self.__comment is not None: + print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms") + else: + print("Thread " + str(i) + ": filter took " + str(dur) + "ms") def getresults(self): self.join() diff --git a/sentiments.py b/sentiments.py index 61159e8..a112127 100644 --- a/sentiments.py +++ b/sentiments.py @@ -36,17 +36,12 @@ def main(folder): # toxlevel = computeToxLevel(a['Body']) # toxlevels[post['Id']].append(toxlevel) # rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms") - toxlevels = dmt(posts).map(lambda p: (p['Id'], computeSentimentForPost(p)), "calculating sentiments").getresults() + toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults() toxlevels = {id: p for (id, p) in toxlevels} dumptoxlevels(toxlevels, outfilename + ".py") -def computeSentimentForPost(post): - anwsers = {a['Id']: computeToxLevel(a['Body']) for a in post['Answers']} - return anwsers - - def computeToxLevel(text): return analyser.polarity_scores(text)