wip

2021-03-22 20:30:32 +01:00
parent 316fed8283
commit 52d7ddb7fc
9 changed files with 270 additions and 36 deletions
--- a/itsnew.py
+++ b/itsnew.py
@@ -0,0 +1,235 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import statsmodels.api as sm
+import sys
+from datetime import datetime
+from datetime import timedelta
+from dateutil.relativedelta import relativedelta
+
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
+from loader import load, dmt, cms
+from sentiments import readtoxleveltxt
+
+# Colour names; not referenced by the code visible in this file.
+colors = ['red', 'green', 'blue', 'orange', 'deeppink']
+# Half-widths, in months, of the windows around CHANGE_DATE for which main()
+# fits a separate ITS regression.
+thresholds = [6, 9, 12, 15]
+
+
+def _fit_its(intervals, samples):
+    """Fit the segmented regression y = b0 + b1*t + b2*x + b3*t*x by OLS.
+
+    Every single sentiment value is one observation. ``t`` is the index of the
+    interval the value belongs to and ``x`` is 0 when that interval starts on
+    or before CHANGE_DATE and 1 otherwise.
+
+    :param intervals: list of (from, to) datetime pairs, parallel to ``samples``
+    :param samples: list holding the list of sentiment values of each interval
+    :return: tuple (res, fitted): the statsmodels OLS results object and the
+             model prediction for every observation as an (n, 1) array
+    """
+    t = np.reshape(np.array([i for i, s in enumerate(samples) for _ in s]), (-1, 1))
+    x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1)
+                             for i, s in enumerate(samples) for _ in s]), (-1, 1))
+    X = np.concatenate((t, x, np.multiply(t, x)), 1)  # b1, b2, b3
+    X = sm.add_constant(X)  # b0
+    y = np.reshape(np.array([v for s in samples for v in s]), (-1, 1))
+    res = sm.OLS(y, X).fit()
+    fitted = X.dot(np.reshape(np.array(res.params), (-1, 1)))
+    return res, fitted
+
+
+def _label_valign(data, i):
+    """Pick the vertical alignment of the text label drawn at ``data[i]``.
+
+    Interior points get "bottom" when both neighbours are lower, "top" when
+    both are higher and "center" otherwise. The first and the last point are
+    compared with their only neighbour. A single point has no neighbour and
+    gets "center".
+
+    :param data: list of plotted values
+    :param i: index of the value the label belongs to
+    :return: "bottom", "top" or "center"
+    """
+    last = len(data) - 1
+    if last == 0:
+        return "center"
+    if i == 0:
+        return "bottom" if data[1] < data[0] else "top"
+    if i == last:
+        return "bottom" if data[i - 1] < data[i] else "top"
+    if data[i - 1] < data[i] and data[i + 1] < data[i]:
+        return "bottom"
+    if data[i - 1] > data[i] and data[i + 1] > data[i]:
+        return "top"
+    return "center"
+
+
+def main(folder, intervl):
+    """Run the interrupted time series (ITS) analysis of sentiments towards new users.
+
+    For every interval from calc_intervals() that ends after 2015-01-01 the
+    'compound' sentiment of each answer is collected when the post was created
+    inside the interval, less than DAYS_NEW_USER days after the first
+    contribution of its owner, and the answer was created less than
+    DAYS_NEW_USER days after the post. Intervals without such answers are
+    dropped.
+
+    A segmented OLS regression with a break at CHANGE_DATE is fitted once on
+    all remaining (deseasoned) values and once per entry of ``thresholds`` on
+    the raw values of a window around CHANGE_DATE. The regression summaries
+    and three plots are written to <folder>/output/itsnew/.
+
+    :param folder: dataset folder; sentiments are read from
+                   <folder>/output/sentiments.txt
+    :param intervl: interval length handed to calc_intervals(); the command
+                    line restricts it to 1 - 12 (presumably months - confirm
+                    against calc_intervals)
+    """
+    users, posts, firstcontrib, sumcontrib = load(folder)
+
+    all_intervals = calc_intervals(posts, intervl)
+
+    start = cms()
+    printnoln("reading sentiments ...")
+    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
+    rprint("reading sentiments ... took " + str(cms() - start) + "ms")
+
+    outputdir = folder + "/output/itsnew/"
+    # no shell involved: also works for folder names with spaces or shell metacharacters
+    os.makedirs(outputdir, exist_ok=True)
+
+    first_date = datetime.fromisoformat("2015-01-01T00:00:00")
+    intervals = []   # intervals that contributed at least one sentiment value
+    data = []        # average sentiment per kept interval
+    datasingle = []  # all sentiment values per kept interval
+    for (option_date_from, option_date_to) in all_intervals:
+        if option_date_to <= first_date:
+            continue
+        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
+        # sentiments of all answers to posts of new users
+        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
+                                              for a in p['Answers']
+                                              if option_date_from <= p['CreationDate'] < option_date_to  # post in interval
+                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']  # post created within DAYS_NEW_USER days of 1st contrib
+                                              and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']])  # answer within DAYS_NEW_USER days of post creation
+                    .filter(lambda p: p != [])
+                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
+                    .getresults())
+        if len(filtered) == 0:
+            continue
+        avg = np.average(filtered)
+        if np.isnan(avg):
+            # an average that is not a number cannot be plotted or deseasoned
+            continue
+        intervals.append((option_date_from, option_date_to))
+        data.append(avg)
+        datasingle.append(filtered)
+
+    if not data:
+        print("no sentiments for new users found")
+        return
+
+    # deseason: the seasonal offset of a slot is the average of every 12th
+    # interval, shifted so that the smallest offset is 0
+    # NOTE(review): the list index modulo 12 only equals the calendar month for
+    # monthly intervals without gaps - confirm for intervl > 1
+    slots = [[v for j, v in enumerate(data) if j % 12 == i] for i in range(0, 12)]
+    mins = [np.average(vals) if vals else float("nan") for vals in slots]
+    lowest = min(m for m in mins if not np.isnan(m))
+    mins = [m - lowest for m in mins]
+    print("mins", mins)
+    dsdatasingle = [[d - mins[i % 12] for d in vals] for i, vals in enumerate(datasingle)]
+
+    print("Computing full ITS")
+    res, its2ols = _fit_its(intervals, dsdatasingle)
+    print("coef ols: " + str(res.params))
+    print("sum ols: " + str(res.summary()))
+    # shift the deseasoned fit back by the average seasonal offset; slots
+    # without data (fewer than 12 intervals) are ignored
+    its2ols = its2ols + np.nanmean(mins)
+    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
+        file.write(str(res.summary()))
+
+    # one fit per threshold, on the raw (not deseasoned) values of the
+    # intervals lying completely within +- ti months of CHANGE_DATE
+    thresfits = []  # (months, intervals, values per interval, fitted values)
+    print("Computing threshold ITS")
+    for ti in thresholds:
+        lower = CHANGE_DATE - relativedelta(months=ti)
+        upper = CHANGE_DATE + relativedelta(months=ti)
+        window = [(iv, vals) for (iv, vals) in zip(intervals, datasingle) if iv[0] >= lower and iv[1] <= upper]
+        if not window:
+            print("no data within " + str(ti) + " months of the change date")
+            continue
+        wintervals = [w[0] for w in window]
+        wdata = [w[1] for w in window]
+        res, fitted = _fit_its(wintervals, wdata)
+        thresfits.append((ti, wintervals, wdata, fitted))
+        with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file:
+            file.write(str(res.summary()))
+
+    # x position of every interval and of every single observation
+    starts = [difftime(iv[0]) for iv in intervals]
+    sample_x = [starts[i] for i, vals in enumerate(datasingle) for _ in vals]
+
+    # plot average sentiments and the fitted ITS lines
+    fig = plt.figure(figsize=FIG_SIZE)
+    plt.plot(starts, data, label="average sentiment")
+    plt.grid(True)
+    for i, vals in enumerate(datasingle):
+        # number of values behind each average
+        plt.text(starts[i], data[i], ("n=" if i == 0 else "") + str(len(vals)), ha="center", va=_label_valign(data, i))
+    plt.plot(sample_x, its2ols, label="sm single ITS")
+    for (months, wintervals, wdata, fitted) in thresfits:
+        wstarts = [difftime(iv[0]) for iv in wintervals]
+        plt.plot([wstarts[i] for i, vals in enumerate(wdata) for _ in vals], fitted, label="thres ITS " + str(months) + " months")
+    plt.title("Average sentiments for new users")
+    plt.xticks(rotation=90)
+    plt.xlabel("months")
+    plt.ylabel("sentiment")
+    plt.legend(loc="upper right")
+    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+    # plot seasonality
+    fig = plt.figure(figsize=FIG_SIZE)
+    plt.plot(sample_x, [mins[i % 12] for i, vals in enumerate(datasingle) for _ in vals], label="seasonality")
+    plt.title("Average sentiments for new users - seasonality")
+    plt.xticks(rotation=90)
+    plt.xlabel("months")
+    plt.ylabel("sentiment - seasonality")
+    plt.legend(loc="upper right")
+    outfile = outputdir + "/season-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+    # plot seasonality post count: number of values of the first (up to) 12
+    # intervals, shifted so that the smallest count is 0
+    pcmins = [len(vals) for vals in datasingle[:12]]
+    fewest = min(pcmins)
+    pcmins = [m - fewest for m in pcmins]
+
+    fig = plt.figure(figsize=FIG_SIZE)
+    plt.plot(starts, [pcmins[i % 12] for i in range(len(datasingle))], label="seasonality")
+    plt.title("post count for new users - seasonality")
+    plt.xticks(rotation=90)
+    plt.xlabel("months")
+    plt.ylabel("post count - seasonality")
+    plt.legend(loc="upper right")
+    outfile = outputdir + "/season_postcount-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+
+if __name__ == "__main__":
+    # execute only if run as a script
+    # arguments: <folder> is the dataset folder handed to main(); the optional
+    # -i<interval> (integer, 1 - 12, default 1) is handed to main() as intervl
+    usage = sys.argv[0] + " <folder> [-i<interval>]"
+    if len(sys.argv) < 2:
+        print(usage)
+        sys.exit(1)
+    folder = sys.argv[1]
+    if not os.path.isdir(folder):
+        print(folder + " is not a folder")
+        sys.exit(1)
+    interval = 1
+    if len(sys.argv) >= 3:
+        option = sys.argv[2]
+        if not option.startswith("-i"):
+            print("unknown parameter: " + option)
+            sys.exit(1)
+        try:
+            # everything after the "-i" prefix must be an integer
+            interval = int(option[2:])
+        except ValueError:
+            print("-i: int required")
+            sys.exit(1)
+        if not 1 <= interval <= 12:
+            print("-i: only 1 - 12")
+            sys.exit(1)
+
+    main(folder, interval)