wip

2020-01-25 13:16:05 +01:00
parent cd0239f39c
commit fdc1743d5d
5 changed files with 157 additions and 13 deletions
--- a/its.py
+++ b/its.py
@@ -55,11 +55,6 @@ def main(folder, intervl):
    avgcount = np.mean([x for x in count if str(x) != "nan"])
    stdcount = np.std([x for x in count if str(x) != "nan"])
    for i in range(len(count)):
        print(count[i])
        if count[i] == 45:
            print("m " + str(avgcount))
            print("s " + str(stdcount))
            print("N " + str((count[i] - avgcount) / stdcount))
        if str(count[i]) == "nan" or np.abs((count[i] - avgcount) / stdcount) > 3:
            datasingle[i] = float("nan")
            data[i] = float("nan")
--- a/loader.py
+++ b/loader.py
@@ -100,7 +100,7 @@ def mapuser(item):
 def mapQuestion(item):
-    tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName']
+    tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score']
    datetags = ['CreationDate']
    question = {tag: getTag(item, tag) for tag in tags}
    for tag in datetags:
@@ -110,7 +110,7 @@ def mapQuestion(item):
 def mapAnswer(item):
-    tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId']
+    tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score']
    datetags = ['CreationDate']
    answer = {tag: getTag(item, tag) for tag in tags}
    for tag in datetags:
--- a/4
+++ b/4
@@ -38,8 +38,8 @@ http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf
 outlier filtern 57/2000 sentiment values in its > done
 threshold 2,3,4,5,6 monate vor und zurück in its neu kurven andere farben>done
-auswertung up downvotes und correlation mit sentiment
+auswertung up downvotes und correlation mit sentiment >done
-activität neuer user vorher und nachher
+activität neuer user vorher und nachher>done
--- a/posthist.py
+++ b/posthist.py
@@ -24,6 +24,7 @@ def main(folder, intervl):
    activeusercounts = []
    answerstonewusers = []
    sentimentstonewusers = []
    activitynewusers = []
    imgmagickcmd = IMAGE_MAGICK
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
@@ -37,9 +38,21 @@ def main(folder, intervl):
        for p in newposts:
            postcounts[p['OwnerUserId']].append(p)
            i = i + 1
            # for a in p['Answers']:
            #     postcounts[p['OwnerUserId']].append(a)
        postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
        activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
        activitynewusersinmonth = defaultdict(int)
        for p in newposts:
            if firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']:
                activitynewusersinmonth[p['OwnerUserId']] += 1
            for a in p['Answers']:
                if firstcontrib[a['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']:
                    activitynewusersinmonth[p['OwnerUserId']] += 1
        activitysum = sum(activitynewusersinmonth.values())
        activitynewusers.append(((option_date_from, option_date_to), activitysum / len(activitynewusersinmonth)))
        histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
        histdata = [pc for pc in postcounts.values()]
@@ -73,8 +86,10 @@ def main(folder, intervl):
    # plot posts diagram
    fig = plt.figure(figsize=(16, 12))
    plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
    plt.xlabel('time')
    plt.ylabel('#active users')
    plt.yscale('log')
-    plt.ylim(bottom=0.001)
+    plt.ylim(bottom=1)
    plt.title("Active users")
    fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)
@@ -82,9 +97,11 @@ def main(folder, intervl):
    # plot answers to new users diagram
    fig = plt.figure(figsize=(16, 12))
    plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
    plt.xlabel('time')
    plt.ylabel('#answers per question of a new user')
    plt.yscale('log')
-    plt.ylim(bottom=0.001)
+    plt.ylim(bottom=1)
-    plt.title("#Answers to new users")
+    plt.title("Answers to new users")
    fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)
@@ -93,13 +110,25 @@ def main(folder, intervl):
    plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
    plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
    plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
    plt.xlabel('time')
    plt.ylabel('sentiment')
    plt.yscale('log')
-    plt.ylim(bottom=0.001)
+    plt.ylim(bottom=1)
    plt.legend(loc="upper right")
    plt.title("Sentiments of answers to new users")
    fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)
    # plot activity for new users
    fig = plt.figure(figsize=(16, 12))
    plt.plot([x[0] for (x, y) in activitynewusers], [y for x, y in activitynewusers], label="activity")
    plt.xlabel('time')
    plt.ylabel('#questions or answers created by a new user')
    plt.legend(loc="upper right")
    plt.title("Average activity per new user")
    fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)
 if __name__ == "__main__":
    # execute only if run as a script
--- a/votes.py
+++ b/votes.py
@@ -0,0 +1,120 @@
 import sys
 import matplotlib.pyplot as plt
 import numpy as np
 import os
 import statsmodels.api as sm
 from datetime import datetime
 from datetime import timedelta
 from dateutil.relativedelta import relativedelta
 from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
 from loader import load, dmt, cms
 from sentiments import readtoxleveltxt
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [3, 4, 5, 6]
 changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN when ``values`` is empty."""
    if len(values) == 0:
        return float("nan")
    return np.mean(values)


 def _label_valign(data, i):
    """Pick the vertical alignment for the count label at point ``i``.

    Local maxima get "bottom" (label sits above the point), local minima
    get "top"; the two end points are compared with their single
    neighbour. Everything else, including a series with only one point,
    is "center".
    """
    last = len(data) - 1
    if last <= 0:
        # a single point has no neighbour to compare against
        return "center"
    if i == 0:
        return "bottom" if data[1] < data[0] else "top"
    if i == last:
        return "bottom" if data[last - 1] < data[last] else "top"
    if data[i - 1] < data[i] and data[i + 1] < data[i]:
        return "bottom"
    if data[i - 1] > data[i] and data[i + 1] > data[i]:
        return "top"
    return "center"


 def main(folder, intervl):
    """Plot average answer sentiment and average post score per interval.

    For every interval returned by ``calc_intervals(posts, intervl)`` the
    function collects the ``'Score'`` of the selected posts and the cached
    ``'compound'`` sentiment of the selected answers, then draws both
    averages over time on a shared x axis (sentiment left, score right).
    The figure is written to
    ``<folder>/output/votes/average_sentiments-i<intervl>.png``.

    Args:
        folder: dataset directory; must contain ``output/sentiments.txt``.
        intervl: interval length passed on to ``calc_intervals``.
    """
    _, posts, firstcontrib, _ = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # create the directory without going through a shell, so folder names
    # with spaces or shell metacharacters are handled correctly
    os.makedirs(outputdir, exist_ok=True)

    # NOTE(review): both filters below keep entries created at least
    # DAYS_NEW_USER days AFTER the owner's first contribution, while the plot
    # title says "for new users" -- confirm the comparison direction.
    new_user_window = timedelta(days=DAYS_NEW_USER)

    datasingle = []
    scoresingle = []
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # scores of the posts in this interval
        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + new_user_window <= p['CreationDate'])
                  .map(lambda p: int(p['Score']))
                  .getresults())

        # compound sentiments of the answers in this interval; the owner
        # looked up is the owner of the post the answer belongs to
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + new_user_window <= a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        scoresingle.append(scores)
        datasingle.append(filtered)

    # Empty intervals become NaN in the averaged series. The raw lists are
    # kept intact so the sample counts used for the labels stay available.
    counts = [len(x) for x in datasingle]
    data = [_mean_or_nan(x) for x in datasingle]
    scoredata = [_mean_or_nan(x) for x in scoresingle]
    xvalues = [i[0] for i in intervals]

    print("Plotting ...")
    fig, ax = plt.subplots(figsize=(16, 12))
    l1 = ax.plot(xvalues, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(xvalues, scoredata, label="average score (votes)", color="red")
    plt.grid(True)
    for i in range(len(data)):
        if np.isnan(data[i]):
            # no sentiment point exists here, so there is nothing to annotate
            continue
        ax.text(xvalues[i], data[i], ("n=" if i == 0 else "") + str(counts[i]),
                ha="center", va=_label_valign(data, i))
    plt.title("Average sentiments for new users")
    # twinx() hides the x axis of ax2, so rotate the labels of the visible one
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
    outfile = os.path.join(outputdir, "average_sentiments-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
 if __name__ == "__main__":
    # Script entry point. Expected call: <script> <folder> [-i<N>], where N
    # is an integer from 1 to 12; without -i the interval defaults to 1.
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 1
    if len(sys.argv) >= 3:
        option = sys.argv[2]
        # only the -i option is understood; reject anything else
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)