From bca211551cd6385f38dfe82702bf9796ca2307d1 Mon Sep 17 00:00:00 2001 From: wea_ondara Date: Tue, 16 Jul 2019 11:51:40 +0200 Subject: [PATCH] wip --- analyze_batch.py | 74 ++++++++++++---------------------------- calctoxdiff.py | 62 +++++++++++++++++++++++++++++----- common.py | 29 ++++++++++++++++ posthist.py | 88 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+), 61 deletions(-) create mode 100644 common.py create mode 100644 posthist.py diff --git a/analyze_batch.py b/analyze_batch.py index bffd6ea..9f8cdf6 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -8,6 +8,7 @@ import matplotlib.pyplot as plt from collections import defaultdict from loader import load, dmt, cms import math +from common import calc_intervals printnoln = lambda text: print(text, end='', flush=True) rprint = lambda text: print('\r' + text) @@ -23,6 +24,7 @@ def main(folder): users, posts, firstcontrib, sumcontrib = load(folder) intervals = calc_intervals(posts) + cachedsentiments = {} postcounts = range(1, 5 + 1) for (option_date_from, option_date_to) in intervals: @@ -79,7 +81,11 @@ def main(folder): printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts))) userid = post['OwnerUserId'] for a in post['Answers']: - toxlevel = computeToxLevel(a['Body']) + if a['Id'] in cachedsentiments.keys(): + toxlevel = cachedsentiments[a['Id']] + else: + toxlevel = computeToxLevel(a['Body']) + cachedsentiments[a['Id']] = toxlevel toxlevels[userid].append(toxlevel) rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms") @@ -99,23 +105,17 @@ def main(folder): fig, axs = plt.subplots(2, 2, figsize=(16, 12)) axs[0, 0].set_title('Neg') - axs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100)) axs[1, 0].set_title('Neu') - axs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100)) axs[0, 1].set_title('Pos') - axs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100)) axs[1, 1].set_title('Compound') + axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100)) + axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100)) + axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100)) axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100)) - - # global - # gaxs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts") - # gaxs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts") - # gaxs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts") - # gaxs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts") - # gaxs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts") - # gaxs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts") - # gaxs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts") - # gaxs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts") + axs[0, 0].set_yscale('log') + axs[1, 0].set_yscale('log') + axs[0, 1].set_yscale('log') + axs[1, 1].set_yscale('log') # plt.show() fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts\nUsers registered between " @@ -124,22 +124,18 @@ def main(folder): plt.close(fig) # global - gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) + gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) + gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) + gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) - # gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) gaxs[0, 0].legend(loc="upper right") gaxs[1, 0].legend(loc="upper right") gaxs[0, 1].legend(loc="upper right") gaxs[1, 1].legend(loc="upper right") + gaxs[0, 0].set_yscale('log') + gaxs[1, 0].set_yscale('log') + gaxs[0, 1].set_yscale('log') + gaxs[1, 1].set_yscale('log') gfig.suptitle("Sentiment of answers to the first X (max) posts\nUsers registered between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) gfig.savefig(goutfilename + ".png", bbox_inches='tight') plt.close(gfig) @@ -159,34 +155,6 @@ def dumptoxlevels(lvls, filename): file.write("toxlevels = " + str(lvls).replace("", "list", 1) + "\n") -def calc_intervals(posts): - firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults() - lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults() - - # calc quarter beginning - firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) - if firstpost.month not in (1, 4, 7, 10): - firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month]) - lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) - if lastpost.month not in (1, 4, 7, 10): - lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month]) - # add 3 months to last post - if lastpost.month == 10: - lastpost = lastpost.replace(month=1, year=lastpost.year + 1) - else: - lastpost = lastpost.replace(month=lastpost.month + 3) - - cdate = firstpost - intervals = [] - while cdate < lastpost: - nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1)) - print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) - intervals.append((cdate, nextquarter)) - cdate = nextquarter - # sys.exit(0) - return intervals - - if __name__ == "__main__": # execute only if run as a script usage = sys.argv[0] + " " diff --git a/calctoxdiff.py b/calctoxdiff.py index 7daca81..ee75ead 100644 --- a/calctoxdiff.py +++ b/calctoxdiff.py @@ -7,6 +7,9 @@ from scipy.stats import ks_2samp from collections import defaultdict from datetime import datetime import matplotlib.pyplot as plt +import numpy as np + +colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'} def main(folder): @@ -70,19 +73,40 @@ def plotbypost(onlyfiles): f2 = l[i + 1] f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[p][i]) + "; ks neu = " + str(changes_neu[p][i]) + "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n") + # pval + for (p, l) in files.items(): + x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)] + fig = plt.figure(figsize=(16, 12)) + for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items(): + pval = [x.pvalue for x in changes] + plt.plot(x, pval, label=type + ".pval", color=colors[type]) + mean = np.mean(pval) + std = np.std(pval) + dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std] + plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed') + plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') + plt.title("KS 2-sided test with max " + str(p) + " posts") + plt.xticks(rotation=90) + plt.legend(loc="upper right") + plt.savefig(folder + "/ks_pval_" + str(p) + ".png", bbox_inches='tight') + plt.close(fig) + # stat for (p, l) in files.items(): x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items(): stat = [x.statistic for x in changes] - pval = [x.pvalue for x in changes] - plt.plot(x, stat, label=type + ".stat") - plt.plot(x, pval, label=type + ".pval") + plt.plot(x, stat, label=type + ".stat", color=colors[type]) + mean = np.mean(stat) + std = np.std(stat) + dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std] + plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed') + plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with max " + str(p) + " posts") plt.xticks(rotation=90) plt.legend(loc="upper right") - plt.savefig(folder + "/ks_" + str(p) + ".png", bbox_inches='tight') + plt.savefig(folder + "/ks_stat_" + str(p) + ".png", bbox_inches='tight') plt.close(fig) @@ -134,18 +158,40 @@ def plotbydate(onlyfiles): f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[d][i]) + "; ks neu = " + str(changes_neu[d][i]) + "; ks pos = " + str(changes_pos[d][i]) + "; ks com = " + str(changes_com[d][i]) + "\n") + # pval + for (d, l) in files.items(): + x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)] + fig = plt.figure(figsize=(16, 12)) + for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): + pval = [x.pvalue for x in changes] + plt.plot(x, pval, label=type + ".pval", color=colors[type]) + mean = np.mean(pval) + std = np.std(pval) + dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std] + plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed') + plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') + plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) + plt.xticks(rotation=90) + plt.legend(loc="upper right") + plt.savefig(folder + "/ks_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') + plt.close(fig) + + # stat for (d, l) in files.items(): x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)] fig = plt.figure(figsize=(16, 12)) for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): stat = [x.statistic for x in changes] - pval = [x.pvalue for x in changes] - plt.plot(x, stat, label=type + ".stat") - plt.plot(x, pval, label=type + ".pval") + plt.plot(x, stat, label=type + ".stat", color=colors[type]) + mean = np.mean(stat) + std = np.std(stat) + dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std] + plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed') + plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o') plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) plt.xticks(rotation=90) plt.legend(loc="upper right") - plt.savefig(folder + "/ks_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') + plt.savefig(folder + "/ks_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') plt.close(fig) diff --git a/common.py b/common.py new file mode 100644 index 0000000..ad05687 --- /dev/null +++ b/common.py @@ -0,0 +1,29 @@ +from loader import load, dmt, cms + + +def calc_intervals(posts): + firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults() + lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults() + + # calc quarter beginning + firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + if firstpost.month not in (1, 4, 7, 10): + firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month]) + lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + if lastpost.month not in (1, 4, 7, 10): + lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month]) + # add 3 months to last post + if lastpost.month == 10: + lastpost = lastpost.replace(month=1, year=lastpost.year + 1) + else: + lastpost = lastpost.replace(month=lastpost.month + 3) + + cdate = firstpost + intervals = [] + while cdate < lastpost: + nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1)) + print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) + intervals.append((cdate, nextquarter)) + cdate = nextquarter + # sys.exit(0) + return intervals diff --git a/posthist.py b/posthist.py new file mode 100644 index 0000000..03091be --- /dev/null +++ b/posthist.py @@ -0,0 +1,88 @@ +from datetime import datetime +from datetime import timedelta +import sys +import os +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer +import numpy as np +import matplotlib.pyplot as plt +from collections import defaultdict +from loader import load, dmt, cms +import math +from common import calc_intervals + +printnoln = lambda text: print(text, end='', flush=True) +rprint = lambda text: print('\r' + text) + +DAYS_NEW_USER = 7 +OLD_USER_YEAR = 3 + +analyser = SentimentIntensityAnalyzer() +colors = ['red', 'green', 'blue', 'orange', 'deeppink'] + + +def main(folder): + users, posts, firstcontrib, sumcontrib = load(folder) + intervals = calc_intervals(posts) + + for (option_date_from, option_date_to) in intervals: + print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))) + + # filter posts by option_date_from <= creation date <= option_date_to + newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults()) + newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newusers, "filtering posts by users").getresults() + + postcounts = defaultdict(list) + i = 0 + for p in newposts: + postcounts[p['OwnerUserId']].append(p) + i = i + 1 + postcounts = {id: len(pc) for (id, pc) in postcounts.items()} + # print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()]))) + + os.system("mkdir -p " + folder + "/output") + histfilename = folder + "/output/posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + countfilename = folder + "/output/postcount_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + + # fig = plt.figure(figsize=(16, 12)) + # plt.plot(userids, [len(pc) for pc in postcounts]) + # plt.title("Post count for users between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y")) + # plt.xticks(rotation=90) + # fig.savefig(countfilename + ".png", bbox_inches='tight') + # plt.close(fig) + + histdata = [pc for pc in postcounts.values()] + fig = plt.figure(figsize=(16, 12)) + plt.hist(histdata, range(max(histdata, default=0) + 1)) + plt.yscale('log') + plt.ylim(bottom=0) + plt.title("Histogram for user post count registered between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y")) + fig.savefig(histfilename + ".png", bbox_inches='tight') + plt.close(fig) + + +def computeToxLevel(text): + return analyser.polarity_scores(text) + + +def flatmap(arr): + return [item for sublist in arr for item in sublist] + + +def dumptoxlevels(lvls, filename): + with open(filename, "w") as file: + file.write("from collections import defaultdict\n\n") + file.write("toxlevels = " + str(lvls).replace("", "list", 1) + "\n") + + +if __name__ == "__main__": + # execute only if run as a script + usage = sys.argv[0] + " " + if len(sys.argv) < 2: + print(usage) + sys.exit(1) + folder = sys.argv[1] + if not os.path.isdir(folder): + print(folder + " is not a folder") + sys.exit(1) + + main(folder)