From fdc1743d5d40629ff2814d590028f839c40527d2 Mon Sep 17 00:00:00 2001
From: wea_ondara <wea_ondara@alpenblock.net>
Date: Sat, 25 Jan 2020 13:16:05 +0100
Subject: [PATCH] wip: add votes.py score/sentiment plot and new-user activity plot

---
 its.py      |   5 ---
 loader.py   |   4 +-
 notes       |   4 +-
 posthist.py |  37 ++++++++++++++--
 votes.py    | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 157 insertions(+), 13 deletions(-)
 create mode 100644 votes.py

diff --git a/its.py b/its.py
index d1edbb6..787a20c 100644
--- a/its.py
+++ b/its.py
@@ -55,11 +55,6 @@ def main(folder, intervl):
     avgcount = np.mean([x for x in count if str(x) != "nan"])
     stdcount = np.std([x for x in count if str(x) != "nan"])
     for i in range(len(count)):
-        print(count[i])
-        if count[i] == 45:
-            print("m " + str(avgcount))
-            print("s " + str(stdcount))
-            print("N " + str((count[i] - avgcount) / stdcount))
         if str(count[i]) == "nan" or np.abs((count[i] - avgcount) / stdcount) > 3:
             datasingle[i] = float("nan")
             data[i] = float("nan")
diff --git a/loader.py b/loader.py
index 2f080e5..2e2e026 100644
--- a/loader.py
+++ b/loader.py
@@ -100,7 +100,7 @@ def mapuser(item):
 
 
 def mapQuestion(item):
-    tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName']
+    tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score']
     datetags = ['CreationDate']
     question = {tag: getTag(item, tag) for tag in tags}
     for tag in datetags:
@@ -110,7 +110,7 @@ def mapQuestion(item):
 
 
 def mapAnswer(item):
-    tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId']
+    tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score']
     datetags = ['CreationDate']
     answer = {tag: getTag(item, tag) for tag in tags}
     for tag in datetags:
diff --git a/notes b/notes
index c070af4..b603f5e 100644
--- a/notes
+++ b/notes
@@ -38,8 +38,8 @@ http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf
 
 outliner filtern 57 /2000 senitment values in its > done
 threshold 2,3,4,5,6 monate vor und zurück in its neu kurven andere farben>done
-auswertung up downvotes und correlation mit sentiment
-activität neuer user vorher und nachher
+auswertung up downvotes und correlation mit sentiment >done
+aktivität neuer user vorher und nachher >done
 
 
 
diff --git a/posthist.py b/posthist.py
index 7d317da..036720a 100644
--- a/posthist.py
+++ b/posthist.py
@@ -24,6 +24,7 @@ def main(folder, intervl):
     activeusercounts = []
     answerstonewusers = []
     sentimentstonewusers = []
+    activitynewusers = []
     imgmagickcmd = IMAGE_MAGICK
     for (option_date_from, option_date_to) in intervals:
         print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
@@ -37,9 +38,21 @@ def main(folder, intervl):
         for p in newposts:
             postcounts[p['OwnerUserId']].append(p)
             i = i + 1
+            # for a in p['Answers']:
+            #     postcounts[p['OwnerUserId']].append(a)
         postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
         activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
 
+        # Posts/answers written within DAYS_NEW_USER of their author's first contribution, counted per author.
+        activitynewusersinmonth = defaultdict(int)
+        for p in newposts:
+            for a in [p] + list(p['Answers']):  # the question itself, then each of its answers
+                # credit the contribution's own author (answers used to be credited to the asker)
+                if firstcontrib[a['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']:
+                    activitynewusersinmonth[a['OwnerUserId']] += 1
+        activitysum = sum(activitynewusersinmonth.values())  # NaN below avoids ZeroDivisionError when no new user was active
+        activitynewusers.append(((option_date_from, option_date_to), activitysum / len(activitynewusersinmonth) if activitynewusersinmonth else float("nan")))
+
         histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
 
         histdata = [pc for pc in postcounts.values()]
@@ -73,8 +86,10 @@ def main(folder, intervl):
     # plot posts diagram
     fig = plt.figure(figsize=(16, 12))
     plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
+    plt.xlabel('time')
+    plt.ylabel('#active users')
     plt.yscale('log')
-    plt.ylim(bottom=0.001)
+    plt.ylim(bottom=1)
     plt.title("Active users")
     fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
     plt.close(fig)
@@ -82,9 +97,11 @@ def main(folder, intervl):
     # plot answers to new users diagram
     fig = plt.figure(figsize=(16, 12))
     plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
+    plt.xlabel('time')
+    plt.ylabel('#answers per question of a new user')
     plt.yscale('log')
-    plt.ylim(bottom=0.001)
-    plt.title("#Answers to new users")
+    plt.ylim(bottom=1)
+    plt.title("Answers to new users")
     fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
     plt.close(fig)
 
@@ -93,13 +110,25 @@ def main(folder, intervl):
     plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
     plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
     plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
+    plt.xlabel('time')
+    plt.ylabel('sentiment')
     plt.yscale('log')
-    plt.ylim(bottom=0.001)
+    plt.ylim(bottom=1)
     plt.legend(loc="upper right")
     plt.title("Sentiments of answers to new users")
     fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
     plt.close(fig)
 
+    # plot activity for new users
+    fig = plt.figure(figsize=(16, 12))
+    plt.plot([x[0] for (x, y) in activitynewusers], [y for x, y in activitynewusers], label="activity")
+    plt.xlabel('time')
+    plt.ylabel('#questions or answers created by a new user')
+    plt.legend(loc="upper right")
+    plt.title("Average activity per new user")
+    fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight')
+    plt.close(fig)
+
 
 if __name__ == "__main__":
     # execute only if run as a script
diff --git a/votes.py b/votes.py
new file mode 100644
index 0000000..7983fb9
--- /dev/null
+++ b/votes.py
@@ -0,0 +1,120 @@
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import statsmodels.api as sm
+from datetime import datetime
+from datetime import timedelta
+from dateutil.relativedelta import relativedelta
+
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
+from loader import load, dmt, cms
+from sentiments import readtoxleveltxt
+
+colors = ['red', 'green', 'blue', 'orange', 'deeppink']
+thresholds = [3, 4, 5, 6]
+changedate = datetime.fromisoformat("2018-09-01T00:00:00")
+
+
+def main(folder, intervl):
+    """Plot the average answer sentiment and the average post score per interval.
+
+    Loads the dataset in ``folder``, reads the cached sentiments from
+    ``<folder>/output/sentiments.txt`` and saves a two-axis line chart under
+    ``<folder>/output/votes/``.
+
+    :param folder: dataset directory, passed to ``load``
+    :param intervl: interval length, passed to ``calc_intervals``; also part of the output file name
+    """
+    users, posts, firstcontrib, sumcontrib = load(folder)
+
+    intervals = calc_intervals(posts, intervl)
+
+    start = cms()
+    printnoln("reading sentiments ...")
+    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
+    rprint("reading sentiments ... took " + str(cms() - start) + "ms")
+
+    outputdir = folder + "/output/votes/"
+    # os.makedirs instead of shelling out to "mkdir -p": also works for paths with spaces or shell metacharacters
+    os.makedirs(outputdir, exist_ok=True)
+
+    # NOTE(review): both filters below keep contributions made at least DAYS_NEW_USER after the post
+    # owner's first contribution, while the plot title says "new users" - confirm which one is intended.
+    datasingle = []
+    scoresingle = []
+    for (option_date_from, option_date_to) in intervals:
+        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
+        # scores of the posts created in this interval
+        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
+                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate'])
+                  .map(lambda p: int(p['Score']))
+                  .getresults())
+        # compound sentiments of the answers created in this interval, flattened over all posts
+        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
+                                              for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
+                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
+                    .filter(lambda p: p != [])
+                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
+                    .getresults())
+        scoresingle.append(scores)
+        datasingle.append(filtered)
+
+    # Empty intervals become NaN gaps in the curves; the raw lists stay intact for the n= labels.
+    data = [np.mean(x) if np.size(x) > 0 else float("nan") for x in datasingle]
+    avgscores = [np.mean(x) if np.size(x) > 0 else float("nan") for x in scoresingle]
+
+    print("Plotting ...")
+    fig, ax = plt.subplots(figsize=(16, 12))
+    l1 = ax.plot([i[0] for i in intervals], data, label="average sentiment")
+    ax2 = ax.twinx()
+    l2 = ax2.plot([i[0] for i in intervals], avgscores, label="average score (votes)", color="red")
+    plt.grid(True)
+    for i in range(len(data)):
+        # Anchor the label's bottom edge at local maxima and its top edge at local minima; end points have one neighbour.
+        neighbours = [data[j] for j in (i - 1, i + 1) if 0 <= j < len(data)]
+        va = "center"
+        if neighbours and all(n < data[i] for n in neighbours):
+            va = "bottom"
+        elif neighbours and (len(neighbours) == 1 or all(n > data[i] for n in neighbours)):
+            va = "top"
+        ax.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
+    plt.title("Average sentiments for new users")
+    plt.xticks(rotation=90)
+    ax.set_xlabel("months")
+    ax.set_ylabel("sentiment")
+    ax2.set_ylabel("score (votes)")  # the twin axis carries the scores; ax keeps its "sentiment" label
+    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
+    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+
+if __name__ == "__main__":
+    # Command line: <folder> [-i<n>], where n is the interval length between 1 and 12 (default 1).
+    usage = sys.argv[0] + " <folder>"
+    if len(sys.argv) < 2:
+        print(usage)
+        sys.exit(1)
+    folder = sys.argv[1]
+    if not os.path.isdir(folder):
+        print(folder + " is not a folder")
+        sys.exit(1)
+    interval = 1
+    if len(sys.argv) >= 3:
+        # only the third argument is inspected; anything after it is ignored
+        if not sys.argv[2].startswith("-i"):
+            print("unknown parameter: " + sys.argv[2])
+            sys.exit(1)
+        try:
+            interval = int(sys.argv[2][2:])
+        except ValueError:
+            print("-i: int required")
+            sys.exit(1)
+        # reject values outside 1..12
+        if not 1 <= interval <= 12:
+            print("-i: only 1 - 12")
+            sys.exit(1)
+
+    main(folder, interval)