From fdc1743d5d40629ff2814d590028f839c40527d2 Mon Sep 17 00:00:00 2001 From: wea_ondara Date: Sat, 25 Jan 2020 13:16:05 +0100 Subject: [PATCH] wip --- its.py | 5 --- loader.py | 4 +- notes | 4 +- posthist.py | 37 ++++++++++++++-- votes.py | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 157 insertions(+), 13 deletions(-) create mode 100644 votes.py diff --git a/its.py b/its.py index d1edbb6..787a20c 100644 --- a/its.py +++ b/its.py @@ -55,11 +55,6 @@ def main(folder, intervl): avgcount = np.mean([x for x in count if str(x) != "nan"]) stdcount = np.std([x for x in count if str(x) != "nan"]) for i in range(len(count)): - print(count[i]) - if count[i] == 45: - print("m " + str(avgcount)) - print("s " + str(stdcount)) - print("N " + str((count[i] - avgcount) / stdcount)) if str(count[i]) == "nan" or np.abs((count[i] - avgcount) / stdcount) > 3: datasingle[i] = float("nan") data[i] = float("nan") diff --git a/loader.py b/loader.py index 2f080e5..2e2e026 100644 --- a/loader.py +++ b/loader.py @@ -100,7 +100,7 @@ def mapuser(item): def mapQuestion(item): - tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName'] + tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score'] datetags = ['CreationDate'] question = {tag: getTag(item, tag) for tag in tags} for tag in datetags: @@ -110,7 +110,7 @@ def mapQuestion(item): def mapAnswer(item): - tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId'] + tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score'] datetags = ['CreationDate'] answer = {tag: getTag(item, tag) for tag in tags} for tag in datetags: diff --git a/notes b/notes index c070af4..b603f5e 100644 --- a/notes +++ b/notes @@ -38,8 +38,8 @@ http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf outliner filtern 57 /2000 senitment values in its > done threshold 2,3,4,5,6 monate vor und zurück in its neu kurven andere farben>done -auswertung up downvotes 
und correlation mit sentiment -activität neuer user vorher und nachher +auswertung up downvotes und correlation mit sentiment >done +activität neuer user vorher und nachher>done diff --git a/posthist.py b/posthist.py index 7d317da..036720a 100644 --- a/posthist.py +++ b/posthist.py @@ -24,6 +24,7 @@ def main(folder, intervl): activeusercounts = [] answerstonewusers = [] sentimentstonewusers = [] + activitynewusers = [] imgmagickcmd = IMAGE_MAGICK for (option_date_from, option_date_to) in intervals: print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")) @@ -37,9 +38,21 @@ def main(folder, intervl): for p in newposts: postcounts[p['OwnerUserId']].append(p) i = i + 1 + # for a in p['Answers']: + # postcounts[p['OwnerUserId']].append(a) postcounts = {id: len(pc) for (id, pc) in postcounts.items()} activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys()))) + activitynewusersinmonth = defaultdict(int) + for p in newposts: + if firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']: + activitynewusersinmonth[p['OwnerUserId']] += 1 + for a in p['Answers']: + if firstcontrib[a['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']: + activitynewusersinmonth[p['OwnerUserId']] += 1 + activitysum = sum(activitynewusersinmonth.values()) + activitynewusers.append(((option_date_from, option_date_to), activitysum / len(activitynewusersinmonth))) + histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl) histdata = [pc for pc in postcounts.values()] @@ -73,8 +86,10 @@ def main(folder, intervl): # plot posts diagram fig = plt.figure(figsize=(16, 12)) plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts]) + plt.xlabel('time') + plt.ylabel('#active users') plt.yscale('log') - plt.ylim(bottom=0.001) + plt.ylim(bottom=1) plt.title("Active users") 
fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight') plt.close(fig) @@ -82,9 +97,11 @@ def main(folder, intervl): # plot answers to new users diagram fig = plt.figure(figsize=(16, 12)) plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers]) + plt.xlabel('time') + plt.ylabel('#answers per question of a new user') plt.yscale('log') - plt.ylim(bottom=0.001) - plt.title("#Answers to new users") + plt.ylim(bottom=1) + plt.title("Answers to new users") fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight') plt.close(fig) @@ -93,13 +110,25 @@ def main(folder, intervl): plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer") plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer") plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. 
answer")
+    plt.xlabel('time')
+    plt.ylabel('sentiment')
     plt.yscale('log')
-    plt.ylim(bottom=0.001)
+    plt.ylim(bottom=1)
     plt.legend(loc="upper right")
     plt.title("Sentiments of answers to new users")
     fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
     plt.close(fig)
 
+    # plot activity for new users
+    fig = plt.figure(figsize=(16, 12))
+    plt.plot([x[0] for (x, y) in activitynewusers], [y for x, y in activitynewusers], label="activity")
+    plt.xlabel('time')
+    plt.ylabel('#questions or answers created by a new user')
+    plt.legend(loc="upper right")
+    plt.title("Average activity per new user")
+    fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight')
+    plt.close(fig)
+
 
 if __name__ == "__main__":
     # execute only if run as a script
diff --git a/votes.py b/votes.py
new file mode 100644
index 0000000..7983fb9
--- /dev/null
+++ b/votes.py
@@ -0,0 +1,131 @@
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import statsmodels.api as sm
+from datetime import datetime
+from datetime import timedelta
+from dateutil.relativedelta import relativedelta
+
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
+from loader import load, dmt, cms
+from sentiments import readtoxleveltxt
+
+colors = ['red', 'green', 'blue', 'orange', 'deeppink']
+thresholds = [3, 4, 5, 6]
+changedate = datetime.fromisoformat("2018-09-01T00:00:00")
+
+
+def main(folder, intervl):
+    """Plot the average answer sentiment and the average post score per interval.
+
+    Loads the data in ``folder``, splits it with ``calc_intervals(posts, intervl)``
+    and writes ``average_sentiments-i<intervl>.png`` to ``<folder>/output/votes/``.
+    """
+    users, posts, firstcontrib, sumcontrib = load(folder)
+
+    intervals = calc_intervals(posts, intervl)
+
+    start = cms()
+    printnoln("reading sentiments ...")
+    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
+    rprint("reading sentiments ... took " + str(cms() - start) + "ms")
+
+    outputdir = folder + "/output/votes/"
+    # makedirs instead of a shell "mkdir -p": no shell quoting issues with the folder name
+    os.makedirs(outputdir, exist_ok=True)
+
+    datasingle = []
+    scoresingle = []
+    for (option_date_from, option_date_to) in intervals:
+        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
+        # NOTE(review): "<=" keeps contributions made at least DAYS_NEW_USER days
+        # after the owner's first contribution, although the plot title says
+        # "new users" - confirm which group is intended.
+        # scores of the posts created in this interval
+        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
+                                    and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate'])
+                  .map(lambda p: int(p['Score']))
+                  .getresults())
+        # compound sentiments of the answers created in this interval
+        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
+                                              for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
+                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
+                    .filter(lambda p: p != [])
+                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
+                    .getresults())
+        scoresingle.append(scores)
+        datasingle.append(filtered)
+
+    # Average per interval; an interval without entries becomes nan. The raw
+    # lists stay untouched so the "n=" labels below can still use len().
+    data = [np.mean(x) if len(x) > 0 else float("nan") for x in datasingle]
+    scoredata = [np.mean(x) if len(x) > 0 else float("nan") for x in scoresingle]
+
+    print("Plotting ...")
+    fig, ax = plt.subplots(figsize=(16, 12))
+    l1 = ax.plot([i[0] for i in intervals], data, label="average sentiment")
+    ax2 = ax.twinx()
+    l2 = ax2.plot([i[0] for i in intervals], scoredata, label="average score (votes)", color="red")
+    plt.grid(True)
+    for i in range(len(data)):
+        if np.isnan(data[i]):
+            # no point was plotted for this interval, so there is nothing to label
+            continue
+        va = "center"
+        if 0 < i < len(data) - 1:
+            if data[i - 1] < data[i] and data[i + 1] < data[i]:
+                va = "bottom"
+            elif data[i - 1] > data[i] and data[i + 1] > data[i]:
+                va = "top"
+        elif i == 0 and len(data) > 1:
+            if data[i + 1] < data[i]:
+                va = "bottom"
+            else:
+                va = "top"
+        elif i == len(data) - 1 and i > 0:
+            if data[i - 1] < data[i]:
+                va = "bottom"
+            else:
+                va = "top"
+        ax.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
+    plt.title("Average sentiments for new users")
+    plt.xticks(rotation=90)
+    ax.set_xlabel("months")
+    ax.set_ylabel("sentiment")
+    # the score curve is drawn on the twin axis, so the label belongs to ax2
+    ax2.set_ylabel("score (votes)")
+    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
+    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+
+if __name__ == "__main__":
+    # execute only if run as a script
+    usage = sys.argv[0] + " <folder> [-i<1-12>]"
+    if len(sys.argv) < 2:
+        print(usage)
+        sys.exit(1)
+    folder = sys.argv[1]
+    if not os.path.isdir(folder):
+        print(folder + " is not a folder")
+        sys.exit(1)
+    interval = 1
+    if len(sys.argv) >= 3:
+        if sys.argv[2].startswith("-i"):
+            interval = sys.argv[2][2:]
+            try:
+                interval = int(interval)
+            except ValueError:
+                print("-i: int required")
+                sys.exit(1)
+            if interval < 1 or interval > 12:
+                print("-i: only 1 - 12")
+                sys.exit(1)
+        else:
+            print("unknown parameter: " + sys.argv[2])
+            sys.exit(1)
+
+    main(folder, interval)