From bca211551cd6385f38dfe82702bf9796ca2307d1 Mon Sep 17 00:00:00 2001
From: wea_ondara <wea_ondara@alpenblock.net>
Date: Tue, 16 Jul 2019 11:51:40 +0200
Subject: [PATCH] wip: cache sentiment scores, split KS plots into pval/stat, add post-count histogram

---
 analyze_batch.py | 74 ++++++++++++----------------------------
 calctoxdiff.py   | 62 +++++++++++++++++++++++++++++-----
 common.py        | 29 ++++++++++++++++
 posthist.py      | 88 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 192 insertions(+), 61 deletions(-)
 create mode 100644 common.py
 create mode 100644 posthist.py

diff --git a/analyze_batch.py b/analyze_batch.py
index bffd6ea..9f8cdf6 100644
--- a/analyze_batch.py
+++ b/analyze_batch.py
@@ -8,6 +8,7 @@ import matplotlib.pyplot as plt
 from collections import defaultdict
 from loader import load, dmt, cms
 import math
+from common import calc_intervals
 
 printnoln = lambda text: print(text, end='', flush=True)
 rprint = lambda text: print('\r' + text)
@@ -23,6 +24,7 @@ def main(folder):
     users, posts, firstcontrib, sumcontrib = load(folder)
 
     intervals = calc_intervals(posts)
+    cachedsentiments = {}
 
     postcounts = range(1, 5 + 1)
     for (option_date_from, option_date_to) in intervals:
@@ -79,7 +81,11 @@ def main(folder):
                     printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
                 userid = post['OwnerUserId']
                 for a in post['Answers']:
-                    toxlevel = computeToxLevel(a['Body'])
+                    if a['Id'] in cachedsentiments.keys():
+                        toxlevel = cachedsentiments[a['Id']]
+                    else:
+                        toxlevel = computeToxLevel(a['Body'])
+                        cachedsentiments[a['Id']] = toxlevel
                     toxlevels[userid].append(toxlevel)
             rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
 
@@ -99,23 +105,17 @@ def main(folder):
 
             fig, axs = plt.subplots(2, 2, figsize=(16, 12))
             axs[0, 0].set_title('Neg')
-            axs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100))
             axs[1, 0].set_title('Neu')
-            axs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100))
             axs[0, 1].set_title('Pos')
-            axs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100))
             axs[1, 1].set_title('Compound')
+            axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100))
+            axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100))
+            axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100))
             axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100))
-
-            # global
-            # gaxs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
-            # gaxs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
-            # gaxs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
-            # gaxs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
-            # gaxs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
-            # gaxs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
-            # gaxs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
-            # gaxs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
+            axs[0, 0].set_yscale('log')
+            axs[1, 0].set_yscale('log')
+            axs[0, 1].set_yscale('log')
+            axs[1, 1].set_yscale('log')
 
             # plt.show()
             fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts\nUsers registered between "
@@ -124,22 +124,18 @@ def main(folder):
             plt.close(fig)
 
         # global
-        gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
+        gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
+        gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
+        gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
         gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
-        # gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
         gaxs[0, 0].legend(loc="upper right")
         gaxs[1, 0].legend(loc="upper right")
         gaxs[0, 1].legend(loc="upper right")
         gaxs[1, 1].legend(loc="upper right")
+        gaxs[0, 0].set_yscale('log')
+        gaxs[1, 0].set_yscale('log')
+        gaxs[0, 1].set_yscale('log')
+        gaxs[1, 1].set_yscale('log')
         gfig.suptitle("Sentiment of answers to the first X (max) posts\nUsers registered between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
         gfig.savefig(goutfilename + ".png", bbox_inches='tight')
         plt.close(gfig)
@@ -159,34 +155,6 @@ def dumptoxlevels(lvls, filename):
         file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")
 
 
-def calc_intervals(posts):
-    firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults()
-    lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
-
-    # calc quarter beginning
-    firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
-    if firstpost.month not in (1, 4, 7, 10):
-        firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month])
-    lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
-    if lastpost.month not in (1, 4, 7, 10):
-        lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month])
-    # add 3 months to last post
-    if lastpost.month == 10:
-        lastpost = lastpost.replace(month=1, year=lastpost.year + 1)
-    else:
-        lastpost = lastpost.replace(month=lastpost.month + 3)
-
-    cdate = firstpost
-    intervals = []
-    while cdate < lastpost:
-        nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1))
-        print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
-        intervals.append((cdate, nextquarter))
-        cdate = nextquarter
-        # sys.exit(0)
-    return intervals
-
-
 if __name__ == "__main__":
     # execute only if run as a script
     usage = sys.argv[0] + " <folder>"
diff --git a/calctoxdiff.py b/calctoxdiff.py
index 7daca81..ee75ead 100644
--- a/calctoxdiff.py
+++ b/calctoxdiff.py
@@ -7,6 +7,9 @@ from scipy.stats import ks_2samp
 from collections import defaultdict
 from datetime import datetime
 import matplotlib.pyplot as plt
+import numpy as np
+
+colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
 
 
 def main(folder):
@@ -70,19 +73,40 @@ def plotbypost(onlyfiles):
                 f2 = l[i + 1]
                 f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[p][i]) + "; ks neu = " + str(changes_neu[p][i])
                         + "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n")
+    # pval
+    for (p, l) in files.items():
+        x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
+        fig = plt.figure(figsize=(16, 12))
+        for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
+            pval = [x.pvalue for x in changes]
+            plt.plot(x, pval, label=type + ".pval", color=colors[type])
+            mean = np.mean(pval)
+            std = np.std(pval)
+            dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
+            plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
+            plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
+        plt.title("KS 2-sided test with max " + str(p) + " posts")
+        plt.xticks(rotation=90)
+        plt.legend(loc="upper right")
+        plt.savefig(folder + "/ks_pval_" + str(p) + ".png", bbox_inches='tight')
+        plt.close(fig)
 
+    # stat
     for (p, l) in files.items():
         x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
         fig = plt.figure(figsize=(16, 12))
         for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
             stat = [x.statistic for x in changes]
-            pval = [x.pvalue for x in changes]
-            plt.plot(x, stat, label=type + ".stat")
-            plt.plot(x, pval, label=type + ".pval")
+            plt.plot(x, stat, label=type + ".stat", color=colors[type])
+            mean = np.mean(stat)
+            std = np.std(stat)
+            dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
+            plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
+            plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
         plt.title("KS 2-sided test with max " + str(p) + " posts")
         plt.xticks(rotation=90)
         plt.legend(loc="upper right")
-        plt.savefig(folder + "/ks_" + str(p) + ".png", bbox_inches='tight')
+        plt.savefig(folder + "/ks_stat_" + str(p) + ".png", bbox_inches='tight')
         plt.close(fig)
 
 
@@ -134,18 +158,40 @@ def plotbydate(onlyfiles):
                 f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[d][i]) + "; ks neu = " + str(changes_neu[d][i])
                         + "; ks pos = " + str(changes_pos[d][i]) + "; ks com = " + str(changes_com[d][i]) + "\n")
 
+    # pval
+    for (d, l) in files.items():
+        x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
+        fig = plt.figure(figsize=(16, 12))
+        for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
+            pval = [x.pvalue for x in changes]
+            plt.plot(x, pval, label=type + ".pval", color=colors[type])
+            mean = np.mean(pval)
+            std = np.std(pval)
+            dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
+            plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
+            plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
+        plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
+        plt.xticks(rotation=90)
+        plt.legend(loc="upper right")
+        plt.savefig(folder + "/ks_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
+        plt.close(fig)
+
+    # stat
     for (d, l) in files.items():
         x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
         fig = plt.figure(figsize=(16, 12))
         for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
             stat = [x.statistic for x in changes]
-            pval = [x.pvalue for x in changes]
-            plt.plot(x, stat, label=type + ".stat")
-            plt.plot(x, pval, label=type + ".pval")
+            plt.plot(x, stat, label=type + ".stat", color=colors[type])
+            mean = np.mean(stat)
+            std = np.std(stat)
+            dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
+            plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
+            plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
         plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
         plt.xticks(rotation=90)
         plt.legend(loc="upper right")
-        plt.savefig(folder + "/ks_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
+        plt.savefig(folder + "/ks_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
         plt.close(fig)
 
 
diff --git a/common.py b/common.py
new file mode 100644
index 0000000..ad05687
--- /dev/null
+++ b/common.py
@@ -0,0 +1,29 @@
+from loader import load, dmt, cms
+
+
+def calc_intervals(posts):
+    """Split the span of post creation dates into consecutive quarter-long (start, end) datetime pairs."""
+    seed = lambda: posts[0]['CreationDate']
+    earliest = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, seed, "firstpost").getresults()
+    latest = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, seed, "lastpost").getresults()
+
+    def quarter_start(moment):
+        # Snap to midnight on the first day of the quarter (month 1, 4, 7 or 10).
+        return moment.replace(month=(moment.month - 1) // 3 * 3 + 1, day=1, hour=0, minute=0, second=0, microsecond=0)
+
+    def next_quarter(start):
+        # start is always a quarter start here, so only October rolls over into the next year.
+        if start.month == 10:
+            return start.replace(month=1, year=start.year + 1)
+        return start.replace(month=start.month + 3)
+
+    cursor = quarter_start(earliest)
+    # The end bound is the start of the quarter following the one that holds the last post.
+    stop = next_quarter(quarter_start(latest))
+    intervals = []
+    while cursor < stop:
+        upcoming = next_quarter(cursor)
+        print("adding interval: " + cursor.strftime("%d-%m-%Y") + " - " + upcoming.strftime("%d-%m-%Y"))
+        intervals.append((cursor, upcoming))
+        cursor = upcoming
+    return intervals
diff --git a/posthist.py b/posthist.py
new file mode 100644
index 0000000..03091be
--- /dev/null
+++ b/posthist.py
@@ -0,0 +1,88 @@
+from datetime import datetime
+from datetime import timedelta
+import sys
+import os
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+import numpy as np
+import matplotlib.pyplot as plt
+from collections import defaultdict
+from loader import load, dmt, cms
+import math
+from common import calc_intervals
+
+printnoln = lambda text: print(text, end='', flush=True)  # print without a trailing newline and flush immediately
+rprint = lambda text: print('\r' + text)  # print text prefixed with a carriage return
+
+DAYS_NEW_USER = 7  # not referenced anywhere in posthist.py
+OLD_USER_YEAR = 3  # not referenced anywhere in posthist.py
+
+analyser = SentimentIntensityAnalyzer()  # module-level VADER analyser used by computeToxLevel
+colors = ['red', 'green', 'blue', 'orange', 'deeppink']  # not referenced anywhere in posthist.py
+
+
+def main(folder):
+    """Plot, per registration quarter, a histogram of how many posts each new user wrote.
+
+    Loads the data set from ``folder`` via ``load``, derives the quarter intervals with
+    ``calc_intervals`` and, for each interval, saves the figure as
+    ``<folder>/output/posthist_<name>_<from>_<to>.png``.
+
+    :param folder: path of the data set folder; its ``output`` subfolder is created if missing
+    """
+    users, posts, firstcontrib, sumcontrib = load(folder)
+    intervals = calc_intervals(posts)
+
+    # os.makedirs instead of a shell "mkdir -p": no shell command is built from the folder name
+    outdir = folder + "/output"
+    os.makedirs(outdir, exist_ok=True)
+
+    for (option_date_from, option_date_to) in intervals:
+        datefrom = option_date_from.strftime("%d-%m-%Y")
+        dateto = option_date_to.strftime("%d-%m-%Y")
+        print((datefrom, dateto))
+
+        # users registered in [option_date_from, option_date_to) and every post they own
+        newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults())
+        newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newusers, "filtering posts by users").getresults()
+
+        # number of posts per new user
+        postcounts = defaultdict(int)
+        for p in newposts:
+            postcounts[p['OwnerUserId']] += 1
+
+        histdata = list(postcounts.values())
+        fig = plt.figure(figsize=(16, 12))
+        # edges 0..max+1 give every post count its own bin (the last bin is closed on both sides)
+        plt.hist(histdata, range(max(histdata, default=0) + 2))
+        plt.yscale('log')  # no ylim(bottom=0): a log-scaled axis cannot start at 0
+        plt.title("Histogram for user post count registered between " + datefrom + " and " + dateto)
+        fig.savefig(outdir + "/posthist_" + folder.split("/")[-1] + "_" + datefrom + "_" + dateto + ".png", bbox_inches='tight')
+        plt.close(fig)
+
+
+def computeToxLevel(text):  # returns whatever the module-level analyser's polarity_scores gives for text
+    return analyser.polarity_scores(text)
+
+
+def flatmap(arr):
+    return [element for inner in arr for element in inner]  # concatenate the sub-iterables of arr, one level deep
+
+
+def dumptoxlevels(lvls, filename):
+    rendered = str(lvls).replace("<class 'list'>", "list", 1)  # turn the first "<class 'list'>" in the repr into "list"
+    with open(filename, "w") as out:
+        out.write("from collections import defaultdict\n\n" + "toxlevels = " + rendered + "\n")
+
+
+if __name__ == "__main__":
+    # Script entry point: the data folder is the first command line argument.
+    usage = sys.argv[0] + " <folder>"
+    folder = sys.argv[1] if len(sys.argv) >= 2 else None
+    if folder is None:
+        # No folder argument given: show the usage line and fail.
+        print(usage)
+        sys.exit(1)
+    if not os.path.isdir(folder):
+        print(folder + " is not a folder")
+        sys.exit(1)
+    main(folder)