wip

2019-12-18 13:02:16 +01:00
parent 356eefaf53
commit 2c1524a335
4 changed files with 245 additions and 38 deletions
--- a/its.py
+++ b/its.py
@@ -1,17 +1,17 @@
 import os
 import os
 import sys
 from datetime import datetime
 from datetime import timedelta
 import matplotlib.pyplot as plt
 import numpy as np
 import os
 from datetime import datetime
 from datetime import timedelta
 from sklearn.linear_model import LinearRegression
-from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
 from loader import load, dmt, cms
-
+from sentiments import readtoxleveltxt
-OLD_USER_PERCENTILE = 0.95
+import statsmodels.api as sm
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -23,33 +23,31 @@ def main(folder, intervl):
    start = cms()
    printnoln("reading sentiments ...")
-    cachedsentiments = imprt(folder + "/output/sentiments.py").answers
+    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")
    outputdir = folder + "/output/its/"
    os.system("mkdir -p " + outputdir)
    data = []
    datasingle = []
    count = []
    for (option_date_from, option_date_to) in intervals:
        if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"):
            data.append(float("nan"))
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # avg sentiments
        # print(dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
        #                                 for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
        #                                 and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
        #       .filter(lambda p: p != [])
        #       .getresults())
        # break
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        datasingle.append(filtered)
        avg = np.average(filtered) if len(filtered) > 0 else float("nan")
        data.append(avg)
        count.append(len(filtered))
    # filter nan entries
    for i in range(len(data)):
@@ -57,37 +55,110 @@ def main(folder, intervl):
            del data[i]
            del intervals[i]
-    print("Computing ITS ...")
+    # print("Computing ITS ...")
-    t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
+    # t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
-    # print("t", t)
+    # x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
-    x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
+    # X = np.array(t)
-    # print("x", x)
+    # X = np.concatenate((X, x), 1)
-    X = np.reshape(np.array([data[0] for i in range(len(data))]), (-1, 1))
+    # X = np.concatenate((X, np.multiply(t, x)), 1)
-    # print("X", X)
+    # y = np.reshape(np.array(data), (-1, 1))
-    X = np.concatenate((X, t), 1)
+    # # print("Xfin", X)
    # # print("y", y)
    # reg = LinearRegression()
    # reg.fit(X, y)
    # score = reg.score(X, y)
    # coef = np.reshape(np.array(reg.coef_), (-1, 1))
    # its = X.dot(coef) + reg.intercept_
    # print("score: " + str(score))
    # print("coef: " + str(coef))
    # print("its: " + str(its))
    print("Computing full ITS")
    t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
    x = np.reshape(np.array([(0 if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
    X = np.array(t)
    X = np.concatenate((X, x), 1)
    X = np.concatenate((X, np.multiply(t, x)), 1)
-    y = np.reshape(np.array(data), (-1, 1))
+    y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1))
    # print("Xfin", X)
    # print("y", y)
-    reg = LinearRegression()
+    # reg = LinearRegression()
-    reg.fit(X, y)
+    # reg.fit(X, y)
-    score = reg.score(X, y);
+    # score2 = reg.score(X, y)
-    coef = np.reshape(np.array(reg.coef_), (-1, 1))
+    # coef2 = np.reshape(np.array(reg.coef_), (-1, 1))
-    its = X.dot(coef) + data[0]
+    # its2 = X.dot(coef2) + reg.intercept_
-    print("score: " + str(score))
+    # print("intercept: " + str(reg.intercept_))
-    print("coef: " + str(coef))
+    # print("score: " + str(score2))
-    print("its: " + str(its))
+    # print("coef: " + str(coef2))
    # print("its: " + str(its2))
    X = sm.add_constant(X)
    res = sm.OLS(y, X).fit()
    p2 = res.pvalues
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    coef2ols = np.reshape(np.array(res.params), (-1, 1))
    its2ols = X.dot(coef2ols)
    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
        file.write(str(res.summary()))
    # print("Computing segmented ITS before")
    # X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
    # y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
    # reg = LinearRegression()
    # reg.fit(X, y)
    # scoreb = reg.score(X, y)
    # coefb = np.reshape(np.array(reg.coef_), (-1, 1))
    # itsb = X.dot(coefb) + reg.intercept_
    # print("scoreb: " + str(scoreb))
    # print("coefb: " + str(coefb))
    # print("itsb: " + str(itsb))
    # print("Computing segmented ITS after")
    # X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
    # y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
    # reg = LinearRegression()
    # reg.fit(X, y)
    # scorea = reg.score(X, y)
    # coefa = np.reshape(np.array(reg.coef_), (-1, 1))
    # itsa = X.dot(coefa) + reg.intercept_
    # print("scorea: " + str(scorea))
    # print("coefa: " + str(coefa))
    # print("itsa: " + str(itsa))
    fig = plt.figure(figsize=(16, 12))
    plt.plot([i[0] for i in intervals], data, label="average sentiment")
-    plt.plot([i[0] for i in intervals], its, label="ITS (score " + str(score) + ")")
+    plt.grid(True)
    for i in range(len(data)):
        va = "center"
        if 0 < i < len(data) - 1:
            if data[i - 1] < data[i] and data[i + 1] < data[i]:
                va = "bottom"
            elif data[i - 1] > data[i] and data[i + 1] > data[i]:
                va = "top"
        elif i == 0:
            if data[i + 1] < data[i]:
                va = "bottom"
            else:
                va = "top"
        elif i == len(data) - 1:
            if data[i - 1] < data[i]:
                va = "bottom"
            else:
                va = "top"
        plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
    # plt.plot([i[0] for i in intervals], its, label="aggregated ITS (score " + str(score) + ")")
    # plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2, label="single ITS (score " + str(score2) + ", p " + str(p2) + ")")
    plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")")
    # plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")], itsb,
    #          label="segmented ITS b (score " + str(scoreb) + ")")
    # plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")], itsa,
    #          label="segmented ITS a (score " + str(scorea) + ")")
    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
-    outfile = outputdir + "/average_sentiments.png"
+    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
--- a/22
+++ b/22
@@ -13,3 +13,25 @@ interrupted time series
 3 monate vor änderung und nach der änderung
 da gibts 2 varianten: sprung, oder 2 linien
 ----------
 lila lines in its fixen -> done
 coef -> p values significance, statsmodels sm logit, sm.summary() -> done
 bisschen zusammenfassen was ich gemacht habe
 plot über zeit <-0.05, -, >0.05 für sentiments, 3 kurven -> done
 paper, strg+f arousal, methode wichtig, referenzen anschauen https://link.springer.com/content/pdf/10.1007%2Fs42001-017-0001-x.pdf
 #einfluss von code sections
 Papers:
 its:
 http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf
--- a/posthist.py
+++ b/posthist.py
@@ -1,12 +1,14 @@
 import os
 import sys
 from collections import defaultdict
 from datetime import timedelta
 import matplotlib.pyplot as plt
 from matplotlib.ticker import MaxNLocator
-from common import calc_intervals, IMAGE_MAGICK
+from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER
 from loader import load, dmt
 from sentiments import readtoxleveltxt
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -14,15 +16,19 @@ colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 def main(folder, intervl):
    users, posts, firstcontrib, sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    outputdir = folder + "/output/posthist/"
    os.system("mkdir -p " + outputdir)
    activeusercounts = []
    answerstonewusers = []
    sentimentstonewusers = []
    imgmagickcmd = IMAGE_MAGICK
    for (option_date_from, option_date_to) in intervals:
-        print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
+        print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
        # post histograms
        # filter posts by option_date_from <= creation date <= option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
@@ -34,7 +40,7 @@ def main(folder, intervl):
        postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
        activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
-        histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
+        histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
        histdata = [pc for pc in postcounts.values()]
        fig = plt.figure(figsize=(16, 12))
@@ -48,14 +54,50 @@ def main(folder, intervl):
        fig.savefig(histfilename + ".png", bbox_inches='tight')
        plt.close(fig)
        imgmagickcmd += " " + histfilename + ".png"
    os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
        # answers to new users
        answers = (dmt(posts).map(lambda q: [a for a in q['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
                                             and firstcontrib[q['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
                   .getresults())
        count = sum([len(a) for a in answers])
        answerstonewusers.append(((option_date_from, option_date_to), count))
        sent = ([cachedsentiments[a['Id']] for al in answers for a in al])
        sentbad = len([1 for a in sent if a['compound'] < -0.05])
        sentneu = len([1 for a in sent if -0.05 <= a['compound'] <= 0.05])
        sentgood = len([1 for a in sent if a['compound'] > 0.05])
        sentimentstonewusers.append(((option_date_from, option_date_to), (sent, sentbad, sentneu, sentgood)))
    # gen pdf for post histograms
    os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf")
    # plot posts diagram
    fig = plt.figure(figsize=(16, 12))
    plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
    plt.yscale('log')
-    plt.ylim(bottom=0)
+    plt.ylim(bottom=0.001)
    plt.title("Active users")
-    fig.savefig(outputdir + "activeusers.png", bbox_inches='tight')
+    fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)
    # plot answers to new users diagram
    fig = plt.figure(figsize=(16, 12))
    plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
    plt.yscale('log')
    plt.ylim(bottom=0.001)
    plt.title("#Answers to new users")
    fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)
    # plot sentiments of answers to new users diagram
    fig = plt.figure(figsize=(16, 12))
    plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
    plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
    plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
    plt.yscale('log')
    plt.ylim(bottom=0.001)
    plt.legend(loc="upper right")
    plt.title("Sentiments of answers to new users")
    fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)
--- a/sentiments.py
+++ b/sentiments.py
@@ -4,6 +4,7 @@ import sys
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from loader import load, dmt
 from common import imprt
 analyser = SentimentIntensityAnalyzer()
@@ -20,6 +21,30 @@ def main(folder):
    toxlevels = {id: p for (id, p) in toxlevels}
    dumptoxlevels(toxlevels, outfilename + ".py")
    dumptoxlevelstxt(toxlevels, outfilename + ".txt")
    # (lvl2q, lvl2a) = readtoxleveltxt(outfilename + ".txt")
    #
    # s1 = str(toxlevels)
    # s2 = str(lvl2q)
    # # print("s1: " + s1)
    # # print("s2: " + s2)
    # if s1 != s2:
    #     print("not equal")
    # else:
    #     print("equal")
    #
    # # print("s1: " + str(imprt(folder + "/output/sentiments.py").answers))
    # # print("s2: " + str(lvl2a))
    # if str(imprt(folder + "/output/sentiments.py").answers) != str(lvl2a):
    #     print("a not equal")
    # else:
    #     print("a equal")
    #
    # if str(imprt(folder + "/output/sentiments.py").posts) != str(lvl2q):
    #     print("q not equal")
    # else:
    #     print("q equal")
 def computeToxLevel(text):
@@ -36,6 +61,53 @@ def dumptoxlevels(lvls, filename):
        file.write("answers = " + str(answers) + "\n")
 def dumptoxlevelstxt(lvls, filename):
    """Write sentiment scores to ``filename`` as a compact two-line text file.

    ``lvls`` maps a post id to a mapping of answer id -> score dict holding
    the keys ``neg``, ``neu``, ``pos`` and ``compound``.

    Line 1 is ``posts=`` followed by one ``<postid>:<answers>`` entry per
    post, joined by ``;;``; ``<answers>`` is a ``;``-joined list of
    ``<answerid>:<neg>:<neu>:<pos>:<compound>`` records.
    Line 2 is ``answers=`` followed by the same records for all answers of
    all posts, joined by ``;``.
    """
    def record(key, scores):
        # one "<id>:<neg>:<neu>:<pos>:<compound>" record
        fields = [key] + [scores[name] for name in ("neg", "neu", "pos", "compound")]
        return ":".join(str(field) for field in fields)
    # flatten answer id -> scores across all posts
    flat = {}
    for perpost in lvls.values():
        flat.update(perpost)
    postentries = []
    for qid, perpost in lvls.items():
        joined = ";".join(record(aid, scores) for (aid, scores) in perpost.items())
        postentries.append(str(qid) + ":" + joined)
    answerentries = [record(aid, scores) for (aid, scores) in flat.items()]
    with open(filename, "w") as file:
        file.write("posts=" + ";;".join(postentries) + "\n")
        file.write("answers=" + ";".join(answerentries) + "\n")
 def readtoxleveltxt(filename):
    """Read a sentiment dump in the format written by ``dumptoxlevelstxt``.

    The file is scanned for a line starting with ``posts=`` and one starting
    with ``answers=``; other lines are ignored. If a prefix occurs more than
    once, the last occurrence wins.

    Returns a tuple ``(rq, ra)``:
      * ``rq`` maps post id -> {answer id -> score dict}
      * ``ra`` maps answer id -> score dict
    Ids are kept as the strings found in the file; each score dict has the
    float values ``neg``, ``neu``, ``pos`` and ``compound``. A missing or
    empty section yields an empty dict.

    Raises ``ValueError`` if a record does not have the expected number of
    ``:``-separated fields or a score is not a valid float.
    """
    def parserecords(payload):
        # An empty payload means "no records". Without this guard
        # "".split(";") returns [""] and the unpacking below raises, so a
        # dump of an empty mapping could not be read back.
        records = {}
        if not payload:
            return records
        for rec in payload.split(";"):
            aid, neg, neu, pos, compound = rec.split(":")
            records[aid] = {"neg": float(neg), "neu": float(neu), "pos": float(pos), "compound": float(compound)}
        return records
    rq = {}
    ra = {}
    with open(filename, 'r') as f:
        lines = f.read().split("\n")
    for line in lines:
        if line.startswith("posts="):
            payload = line[len("posts="):]
            rq = {}
            if payload:
                for entry in payload.split(";;"):  # one entry per post
                    # split off the post id only; the rest is its answer list
                    qid, answers = entry.split(":", 1)
                    rq[qid] = parserecords(answers)
        elif line.startswith("answers="):
            ra = parserecords(line[len("answers="):])
    return rq, ra
 if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder>"