# Reconstructed from the patch "wip" by wea_ondara (2020-06-20, commit
# 37152813aa9cd325377281556063e3c7cd9b0f9f), which creates votesits.py.
"""Interrupted time series (ITS) fits of vote scores on posts by new users.

For every interval produced by ``calc_intervals`` the script collects the
vote score of each post written within ``DAYS_NEW_USER`` days of its
owner's entry in ``firstcontrib``, fits OLS models around ``changedate``
(one over all intervals, one per symmetric window in ``thresholds``), and
writes the model summaries and a plot to ``<folder>/output/votesits/``.

Usage: ``votesits.py <folder> [-i<interval>]`` with interval in 1..12.
"""
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
import sys
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta

from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt

colors = ['red', 'green', 'blue', 'orange', 'deeppink']
# Half-widths (in months) of the windows around changedate used for the
# additional "threshold" fits.
thresholds = [6, 9, 12, 15]
# Intervals starting after this date get the indicator value 1 in the model.
changedate = datetime.fromisoformat("2018-09-01T00:00:00")
# Intervals ending on or before this date are ignored entirely.
startdate = datetime.fromisoformat("2015-01-01T00:00:00")


def _fit_its(intervals, samples):
    """Fit the OLS model used for the ITS lines.

    ``samples[i]`` holds the individual vote scores observed in
    ``intervals[i]``; every score becomes one observation.  The regressors
    are a constant, the interval index ``t``, an indicator ``x`` (0 if the
    interval starts on or before ``changedate``, else 1) and ``t * x``.

    Returns ``(res, fitted)``: the statsmodels results object and the
    fitted value for each observation, in the same order as the
    observations (shape ``(n, 1)``).
    """
    t = np.reshape(np.array([i for i, scores in enumerate(samples) for _ in scores]), (-1, 1))
    x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1)
                             for i, scores in enumerate(samples) for _ in scores]), (-1, 1))
    y = np.reshape(np.array([s for scores in samples for s in scores]), (-1, 1))
    X = sm.add_constant(np.concatenate((t, x, np.multiply(t, x)), 1))
    res = sm.OLS(y, X).fit()
    fitted = X.dot(np.reshape(np.array(res.params), (-1, 1)))
    return res, fitted


def _sample_times(intervals, samples):
    """Return the plot x coordinate of every observation (one per score)."""
    return [difftime(intervals[i][0]) for i, scores in enumerate(samples) for _ in scores]


def _label_valign(values, i):
    """Choose the vertical alignment of the count label at point ``i``.

    Local maxima get the label above ("bottom"), local minima below
    ("top"), everything else centered.  The end points only have one
    neighbour to compare with; a single point has none.
    """
    last = len(values) - 1
    if last <= 0:
        # Only one data point: there is no neighbour to compare against.
        return "bottom"
    if i == 0:
        return "bottom" if values[1] < values[0] else "top"
    if i == last:
        return "bottom" if values[i - 1] < values[i] else "top"
    if values[i - 1] < values[i] and values[i + 1] < values[i]:
        return "bottom"
    if values[i - 1] > values[i] and values[i + 1] > values[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Run the vote-score ITS analysis for the data set in ``folder``.

    :param folder: data set directory; results go to
        ``<folder>/output/votesits/`` (created if missing).
    :param intervl: interval size passed to ``calc_intervals`` and used in
        the output file names.

    Writes ``summary-i<intervl>.txt``, one
    ``summary_threshold<months>-i<intervl>.txt`` per fitted threshold
    window, and ``average_votes-i<intervl>.png``.
    """
    votes = readVotes(folder)
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("sorting votes by post ...")
    votesbypost = defaultdict(list)
    for v in votes:
        votesbypost[v['PostId']].append(v)
    rprint("sorting votes by post ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votesits/"
    # makedirs instead of a shell "mkdir -p": no shell interpretation of the
    # user-supplied folder name, and it works without a POSIX shell.
    os.makedirs(outputdir, exist_ok=True)

    newuserspan = timedelta(days=DAYS_NEW_USER)
    kept = []  # (interval, vote scores of the posts in it); non-empty only
    for interval in intervals:
        (option_date_from, option_date_to) = interval
        if option_date_to <= startdate:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # vote scores of posts created in this interval by new users
        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + newuserspan > p['CreationDate'])
                  # .get: do not insert keys into the defaultdict from inside the dmt callbacks
                  .map(lambda p: votescore(votesbypost.get(p['Id'], []), p))
                  .getresults())
        if len(scores) > 0:
            kept.append((interval, scores))

    if not kept:
        print("no posts by new users found after " + startdate.strftime("%d-%m-%Y") + ", nothing to do")
        return

    intervals = [iv for (iv, _) in kept]
    datasingle = [scores for (_, scores) in kept]
    data = [np.average(scores) for scores in datasingle]

    print("Computing full ITS")
    res, its2ols = _fit_its(intervals, datasingle)
    summary = str(res.summary())
    print("coef ols: " + str(res.params))
    print("sum ols: " + summary)
    with open(os.path.join(outputdir, "summary-i" + str(intervl) + ".txt"), "w") as file:
        file.write(summary)

    print("Computing threshold ITS")
    thresfits = []  # (months, intervals in window, their scores, fitted values)
    for ti in thresholds:
        windowstart = changedate - relativedelta(months=ti)
        windowend = changedate + relativedelta(months=ti)
        window = [(iv, scores) for (iv, scores) in zip(intervals, datasingle)
                  if iv[0] >= windowstart and iv[1] <= windowend]
        if not window:
            # Nothing to regress on; skip instead of handing OLS empty arrays.
            print("no data within " + str(ti) + " months of the change date, skipping")
            continue
        iv = [i for (i, _) in window]
        d = [scores for (_, scores) in window]
        res, fitted = _fit_its(iv, d)
        thresfits.append((ti, iv, d, fitted))
        with open(os.path.join(outputdir, "summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt"), "w") as file:
            file.write(str(res.summary()))

    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(i[0]) for i in intervals], data, label="average vote score")
    plt.grid(True)
    for i, avg in enumerate(data):
        # Annotate every point with its sample size; only the first label carries the "n=" prefix.
        plt.text(difftime(intervals[i][0]), avg, ("n=" if i == 0 else "") + str(len(datasingle[i])),
                 ha="center", va=_label_valign(data, i))
    plt.plot(_sample_times(intervals, datasingle), its2ols, label="sm single ITS")
    for (ti, iv, d, fitted) in thresfits:
        plt.plot(_sample_times(iv, d), fitted, label="thres ITS " + str(ti) + " months")
    plt.title("Average vote score for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("vote score")
    plt.legend(loc="upper right")
    outfile = os.path.join(outputdir, "average_votes-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def votescore(votes, post):
    """Return the vote score of ``post`` computed from ``votes``.

    Only votes whose ``PostId`` matches the post and that were cast less
    than ``DAYS_NEW_USER`` days after the post's ``CreationDate`` count.
    ``VoteTypeId`` 2 adds 1, ``VoteTypeId`` 3 subtracts 1, every other type
    is ignored (presumably up-/down-votes as in the Stack Exchange dump;
    verify against the loader).
    """
    deadline = post['CreationDate'] + timedelta(days=DAYS_NEW_USER)
    filtered = dmt(votes).filter(lambda v: v['PostId'] == post['Id'] and deadline > v['CreationDate']).getresults()
    return sum(1 if v['VoteTypeId'] == 2 else (-1 if v['VoteTypeId'] == 3 else 0) for v in filtered)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)

    main(folder, interval)