"""Interrupted time series (ITS) analysis of vote scores on posts by new users.

For every time interval the script collects the vote score of each post that
was written by a "new" user, fits segmented OLS regressions around
``changedate`` and writes the regression summaries plus a plot to
``<folder>/output/votesits/``.
"""
import os
import sys
from collections import defaultdict
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from dateutil.relativedelta import relativedelta

from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt

colors = ['red', 'green', 'blue', 'orange', 'deeppink']
# Half-widths (in months) of the windows around ``changedate`` used for the
# additional, locally restricted ITS fits.
thresholds = [6, 9, 12, 15]
# Intervention date: intervals starting after it get the "post change" dummy.
changedate = datetime.fromisoformat("2018-09-01T00:00:00")
# Intervals ending on or before this date are excluded from the analysis.
startdate = datetime.fromisoformat("2015-01-01T00:00:00")


def _fit_its(ivs, samples):
    """Fit the segmented regression ``y ~ const + t + x + t*x`` with OLS.

    ``ivs`` are the intervals and ``samples`` the per-interval lists of
    observations (same length, same order).  Every single observation becomes
    one row: ``t`` is the index of its interval and ``x`` is 0 when the
    interval starts on or before ``changedate`` and 1 otherwise.

    Returns ``(results, fitted)`` where ``fitted`` is the column vector of
    model predictions, one entry per observation.
    """
    t = np.reshape(np.array([i for i, s in enumerate(samples) for _ in s]), (-1, 1))
    x = np.reshape(
        np.array([0 if ivs[i][0] <= changedate else 1
                  for i, s in enumerate(samples) for _ in s]),
        (-1, 1))
    y = np.reshape(np.array([v for s in samples for v in s]), (-1, 1))
    design = sm.add_constant(np.concatenate((t, x, np.multiply(t, x)), 1))
    res = sm.OLS(y, design).fit()
    # Multiply with the same design matrix that was fitted so the shapes agree
    # even if add_constant decided not to add a column.
    fitted = design.dot(np.reshape(np.array(res.params), (-1, 1)))
    return res, fitted


def _sample_times(ivs, samples):
    """Return one x-axis value per observation (the start of its interval)."""
    return [difftime(iv[0]) for iv, s in zip(ivs, samples) for _ in s]


def _label_valign(data, i):
    """Pick the vertical alignment of the label at ``data[i]``.

    Labels at local maxima go above the point ("bottom"), labels at local
    minima below it ("top"); end points only look at their single neighbour.
    """
    prev = data[i - 1] if i > 0 else None
    nxt = data[i + 1] if i < len(data) - 1 else None
    if prev is None and nxt is None:
        # Only one data point: nothing to compare against.
        return "center"
    if prev is None:
        return "bottom" if nxt < data[i] else "top"
    if nxt is None:
        return "bottom" if prev < data[i] else "top"
    if prev < data[i] and nxt < data[i]:
        return "bottom"
    if prev > data[i] and nxt > data[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Run the vote-score ITS analysis for the dataset in ``folder``.

    Args:
        folder: dataset directory; handed to ``readVotes`` and ``load`` and
            used as the root for the output directory.
        intervl: interval size handed to ``calc_intervals`` (the command line
            restricts it to 1-12; presumably months -- confirm in ``common``).

    Writes ``summary-i<intervl>.txt``, one
    ``summary_threshold<T>-i<intervl>.txt`` per threshold window that contains
    data, and ``average_votes-i<intervl>.png`` to ``<folder>/output/votesits``.
    """
    votes = readVotes(folder)
    users, posts, firstcontrib, sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("sorting votes by post ...")
    votesbypost = defaultdict(list)
    for v in votes:
        votesbypost[v['PostId']].append(v)
    rprint("sorting votes by post ... took " + str(cms() - start) + "ms")

    outputdir = os.path.join(folder, "output", "votesits")
    # makedirs instead of shelling out to "mkdir -p": works with any path and
    # on any platform.
    os.makedirs(outputdir, exist_ok=True)

    # Only intervals that are inside the analysed period AND contain at least
    # one post survive; the three lists stay index-aligned.
    # NOTE: an outlier filter on the per-interval post count (> 3 standard
    # deviations) was considered in an earlier revision but is not applied.
    kept_intervals = []
    datasingle = []  # per interval: list of vote scores, one per post
    data = []        # per interval: average vote score
    for interval in intervals:
        option_date_from, option_date_to = interval[0], interval[1]
        if option_date_to <= startdate:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        # Posts created in this interval by users whose first contribution is
        # less than DAYS_NEW_USER days older than the post.
        scores = (dmt(posts)
                  .filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                          and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate'])
                  # .get() avoids inserting an empty list for every unvoted post
                  .map(lambda p: votescore(votesbypost.get(p['Id'], []), p))
                  .getresults())
        if len(scores) == 0:
            continue
        kept_intervals.append(interval)
        datasingle.append(scores)
        data.append(np.average(scores))

    if not data:
        print("no posts by new users found, nothing to analyse")
        return

    print("Computing full ITS")
    res, its2ols = _fit_its(kept_intervals, datasingle)
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    with open(os.path.join(outputdir, "summary-i" + str(intervl) + ".txt"), "w") as f:
        f.write(str(res.summary()))

    print("Computing threshold ITS")
    # (months, intervals in window, samples in window, fitted values)
    thresfits = []
    for months in thresholds:
        window_start = changedate - relativedelta(months=months)
        window_end = changedate + relativedelta(months=months)
        window = [(iv, s) for (iv, s) in zip(kept_intervals, datasingle)
                  if iv[0] >= window_start and iv[1] <= window_end]
        if not window:
            # An empty design matrix cannot be fitted.
            print("no data within " + str(months) + " months of the change date, skipping")
            continue
        iv = [w[0] for w in window]
        d = [w[1] for w in window]
        res, fitted = _fit_its(iv, d)
        thresfits.append((months, iv, d, fitted))
        summary_name = "summary_threshold" + str(months) + "-i" + str(intervl) + ".txt"
        with open(os.path.join(outputdir, summary_name), "w") as f:
            f.write(str(res.summary()))

    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(iv[0]) for iv in kept_intervals], data, label="average vote score")
    plt.grid(True)
    # Annotate every point with the number of posts it averages over.
    for i, (iv, avg, scores) in enumerate(zip(kept_intervals, data, datasingle)):
        plt.text(difftime(iv[0]), avg, ("n=" if i == 0 else "") + str(len(scores)),
                 ha="center", va=_label_valign(data, i))
    plt.plot(_sample_times(kept_intervals, datasingle), its2ols, label="sm single ITS")
    for (months, iv, d, fitted) in thresfits:
        plt.plot(_sample_times(iv, d), fitted, label="thres ITS " + str(months) + " months")
    plt.title("Average vote score for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("vote score")
    plt.legend(loc="upper right")
    outfile = os.path.join(outputdir, "average_votes-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def votescore(votes, post):
    """Return the early vote score of ``post``.

    Only votes that belong to the post and were cast less than DAYS_NEW_USER
    days after the post was created are counted.  VoteTypeId 2 counts +1,
    VoteTypeId 3 counts -1 (presumably up- and down-votes -- confirm against
    the data schema); every other vote type counts 0.
    """
    deadline = post['CreationDate'] + timedelta(days=DAYS_NEW_USER)
    relevant = (dmt(votes)
                .filter(lambda v: v['PostId'] == post['Id'] and deadline > v['CreationDate'])
                .getresults())
    return sum(1 if v['VoteTypeId'] == 2 else (-1 if v['VoteTypeId'] == 3 else 0)
               for v in relevant)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " FOLDER [-iN]   (N = interval size, 1 - 12, default 1)"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        try:
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)