diff --git a/analyze_batch.py b/analyze_batch.py index a1ba451..02b4d87 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -20,7 +20,11 @@ def main(folder, intervl): users, posts, firstcontrib, sumcontrib = load(folder) intervals = calc_intervals(posts, intervl) + + start = cms() + printnoln("reading sentiments ...") cachedsentiments = imprt(folder + "/output/sentiments.py").answers + rprint("reading sentiments ... took " + str(cms() - start) + "ms") outputdir = folder + "/output/batch/" os.system("mkdir -p " + outputdir) @@ -100,9 +104,9 @@ def main(folder, intervl): gcom.append(comlevelsflat) fig, axs = plt.subplots(2, 2, figsize=(16, 12)) - axs[0, 0].set_title('Neg') - axs[1, 0].set_title('Neu') - axs[0, 1].set_title('Pos') + axs[0, 0].set_title('Negativity') + axs[1, 0].set_title('Neutrality') + axs[0, 1].set_title('Positivity') axs[1, 1].set_title('Compound') axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100)) axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100)) @@ -115,7 +119,7 @@ def main(folder, intervl): # plt.show() fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between " - + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + ", n=" + str(len(filteredposts))) # figsaver.save(fig, outfilename + ".png", bbox_inches='tight') printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...") fig.savefig(outfilename + ".png", bbox_inches='tight') @@ -194,7 +198,8 @@ def main(folder, intervl): axs[1, 1].set_yscale('log') # plt.show() - fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + fig.suptitle("Sentiment of answers to posts by most posting users (" + str(OLD_USER_PERCENTILE * 100) + "%tile)\nPosts created between " + + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + ", n=" + str(len(filteredposts))) printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...") fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight') plt.close(fig) diff --git a/calctoxdiff.py b/calctoxdiff.py index d1c615d..3fa78fd 100644 --- a/calctoxdiff.py +++ b/calctoxdiff.py @@ -80,6 +80,7 @@ def plotbypost(onlyfiles, outputdir): f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[p][i]) + "; ks neu = " + str(changes_neu[p][i]) + "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n") # pval + magick = IMAGE_MAGICK for (p, l) in files.items(): x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) @@ -93,11 +94,16 @@ def plotbypost(onlyfiles, outputdir): plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with max " + str(p) + " posts") plt.xticks(rotation=90) + plt.xlabel("Comparision: time frame X - time frame X+1") + plt.ylabel("p-value") plt.legend(loc="upper right") plt.savefig(outputdir + "/ks_post_pval_" + str(p) + ".png", bbox_inches='tight') plt.close(fig) + magick += " " + outputdir + "/ks_post_pval_" + str(p) + ".png" + os.system(magick + " " + outputdir + "/ks_post_pval.pdf") # stat + magick = IMAGE_MAGICK for (p, l) in files.items(): x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) @@ -111,9 +117,13 @@ def plotbypost(onlyfiles, outputdir): plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with max " + str(p) + " posts") plt.xticks(rotation=90) + plt.xlabel("Comparision: time frame X - time frame X+1") + plt.ylabel("stat value") plt.legend(loc="upper right") plt.savefig(outputdir + "/ks_post_stat_" + str(p) + ".png", bbox_inches='tight') plt.close(fig) + magick += " " + outputdir + "/ks_post_stat_" + str(p) + ".png" + os.system(magick + " " + outputdir + "/ks_post_stat.pdf") def plotbydate(onlyfiles, outputdir): @@ -122,6 +132,8 @@ def plotbydate(onlyfiles, outputdir): for f in onlyfiles: s = f[:-3].split("_") files[(s[3], s[4])].append(f) + dates = sorted(files.keys(), key=lambda e: "-".join(reversed(e[0].split("-")))) + files = {d: files[d] for d in dates} files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()} changes_neg = defaultdict(list) @@ -166,6 +178,7 @@ def plotbydate(onlyfiles, outputdir): + "; ks pos = " + str(changes_pos[d][i]) + "; ks com = " + str(changes_com[d][i]) + "\n") # pval + magick = IMAGE_MAGICK for (d, l) in files.items(): x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) @@ -179,11 +192,16 @@ def plotbydate(onlyfiles, outputdir): plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) plt.xticks(rotation=90) + plt.xlabel("Comparision: X (max) posts - X+1 (max) posts") + plt.ylabel("p-value") plt.legend(loc="upper right") plt.savefig(outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') plt.close(fig) + magick += " " + outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png" + os.system(magick + " " + outputdir + "/ks_date_pval.pdf") # stat + magick = IMAGE_MAGICK for (d, l) in files.items(): x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) @@ -197,9 +215,13 @@ def plotbydate(onlyfiles, outputdir): plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) plt.xticks(rotation=90) + plt.xlabel("Comparision: X (max) posts - X+1 (max) posts") + plt.ylabel("stat value") plt.legend(loc="upper right") plt.savefig(outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') plt.close(fig) + magick += " " + outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png" + os.system(magick + " " + outputdir + "/ks_date_stat.pdf") def plotbydateold(onlyfiles, oldfiles, outputdir): @@ -253,11 +275,6 @@ def plotbydateold(onlyfiles, oldfiles, outputdir): print("logs") for (d, l) in files.items(): - # print(d) - # print("neg is: " + str(len(changes_neg[d])) + " should: " + str(len(l))) - # print("neu is: " + str(len(changes_neu[d])) + " should: " + str(len(l))) - # print("pos is: " + str(len(changes_pos[d])) + " should: " + str(len(l))) - # print("com is: " + str(len(changes_com[d])) + " should: " + str(len(l))) f1 = oldfiles[d] with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f: for i in range(len(l)): @@ -286,6 +303,8 @@ def plotbydateold(onlyfiles, oldfiles, outputdir): plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1]) plt.xticks(rotation=90) + plt.xlabel("Comparision: X (max) posts - X+1 (max) posts") + plt.ylabel("p-value") plt.legend(loc="upper right") outfile = outputdir + "/ks_olddate_pval_" + d[0] + "_" + d[1] + ".png" plt.savefig(outfile, bbox_inches='tight') @@ -312,6 +331,8 @@ def plotbydateold(onlyfiles, oldfiles, outputdir): plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1]) plt.xticks(rotation=90) + plt.xlabel("Comparision: X (max) posts - X+1 (max) posts") + plt.ylabel("stat value") plt.legend(loc="upper right") outfile = outputdir + "/ks_olddate_stat_" + d[0] + "_" + d[1] + ".png" plt.savefig(outfile, bbox_inches='tight') diff --git a/loader.py b/loader.py index 74f2b4b..371f225 100644 --- a/loader.py +++ b/loader.py @@ -8,6 +8,7 @@ from collections import defaultdict from datetime import datetime from mt import mt +import gc TAG_RE = re.compile(r'<[^>]+>') @@ -23,7 +24,9 @@ def cms(): return int(round(time.time() * 1000)) def load(folder): users = readUsers(folder + "/Users.xml") + gc.collect() posts = readPosts(folder + "/Posts.xml") + gc.collect() # get first contribution to page: firstcontrib = computefirstcontrib(posts)