"""Count first vs. follow-up posts of new users per interval and fit ITS models.

For every interval produced by ``calc_intervals`` the script counts the posts
written by "new" users (posts created less than ``DAYS_NEW_USER`` days after
the owner's first contribution), split into the owner's first post and all
later ones.  An OLS model with a time term, a before/after-``changedate``
indicator and their interaction is fitted to each series; the summaries are
written to ``<folder>/output/questionits/`` together with a plot.

NOTE(review): "ITS" presumably stands for interrupted time series, and the
labels in the plot call the posts "questions" -- confirm against the loader.
"""
import os
import sys
from collections import defaultdict
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from dateutil.relativedelta import relativedelta

from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt

# Not referenced anywhere in this file as shown; kept in case other code imports it.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
# Half-widths (in months) of the windows around `changedate` used for the
# additional "threshold" fits.
thresholds = [6, 9, 12, 15]
# Date of the intervention: intervals starting after it get indicator value 1.
changedate = datetime.fromisoformat("2018-09-01T00:00:00")


def _fit_its(intervals, datasingle):
    """Fit ``y ~ const + t + x + t*x`` by OLS and return ``(result, fitted)``.

    ``intervals`` is a sequence of ``(start, end)`` pairs and ``datasingle`` a
    parallel sequence holding the list of observations of each interval.
    Every observation of interval ``i`` becomes one row with ``t = i`` and
    ``x = 0`` if the interval starts on or before ``changedate``, else ``1``.
    ``fitted`` is the column vector of model predictions for those rows.
    """
    # One entry per observation, holding the index of the interval it belongs to.
    row_index = [i for i, obs in enumerate(datasingle) for _ in obs]
    t = np.reshape(np.array(row_index), (-1, 1))
    x = np.reshape(
        np.array([0 if intervals[i][0] <= changedate else 1 for i in row_index]),
        (-1, 1))
    y = np.reshape(np.array([v for obs in datasingle for v in obs]), (-1, 1))
    X = sm.add_constant(np.concatenate((t, x, np.multiply(t, x)), 1))
    res = sm.OLS(y, X).fit()
    fitted = X.dot(np.reshape(np.array(res.params), (-1, 1)))
    return res, fitted


def _write_summary(path, res):
    """Write the textual statsmodels summary of ``res`` to ``path``."""
    with open(path, "w") as fh:
        fh.write(str(res.summary()))


def main(folder, intervl):
    """Run the analysis for the data set in ``folder``.

    ``intervl`` is passed to ``calc_intervals`` as the interval length and is
    also embedded in every output file name.  Results (OLS summaries and one
    PNG) are written to ``<folder>/output/questionits/``.
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("sorting posts by user ...")
    postbyuser = defaultdict(list)
    for p in posts:
        postbyuser[p['OwnerUserId']].append(p)
    rprint("sorting posts by user ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/questionits/"
    # os.makedirs instead of shelling out to `mkdir -p`: works for paths with
    # spaces/shell metacharacters and on platforms without `mkdir -p`.
    os.makedirs(outputdir, exist_ok=True)

    # Intervals ending on or before this date are left out of the analysis.
    earliest = datetime.fromisoformat("2015-01-01T00:00:00")
    new_user_window = timedelta(days=DAYS_NEW_USER)

    kept_intervals = []
    data1 = []        # per interval: count of first posts (as numpy float)
    data2 = []        # per interval: count of later posts (as numpy float)
    datasingle1 = []  # per interval: list of observations (here a single count)
    datasingle2 = []
    for (option_date_from, option_date_to) in intervals:
        if option_date_to <= earliest:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        # Posts of new users in this interval that are the first entry of
        # the owner's post list (i.e. first in the order of `posts`) ...
        first_count = len(dmt(posts).filter(
            lambda p: option_date_from <= p['CreationDate'] < option_date_to
            and firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate']
            and postbyuser[p['OwnerUserId']][0] == p)
            .getresults())
        # ... and those that are not.
        later_count = len(dmt(posts).filter(
            lambda p: option_date_from <= p['CreationDate'] < option_date_to
            and firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate']
            and postbyuser[p['OwnerUserId']][0] != p)
            .getresults())

        kept_intervals.append((option_date_from, option_date_to))
        datasingle1.append([first_count])
        datasingle2.append([later_count])
        data1.append(np.average([first_count]))
        data2.append(np.average([later_count]))
    intervals = kept_intervals

    print("Computing full ITS1")
    res, its2ols1 = _fit_its(intervals, datasingle1)
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    _write_summary(os.path.join(outputdir, "summary1-i" + str(intervl) + ".txt"), res)

    print("Computing full ITS2")
    res, its2ols2 = _fit_its(intervals, datasingle2)
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    _write_summary(os.path.join(outputdir, "summary2-i" + str(intervl) + ".txt"), res)

    # Threshold fits use only the first-post series and only the intervals
    # lying completely within +-ti months of the change date.  Their results
    # are written to disk but not plotted.
    print("Computing threshold ITS")
    for ti in thresholds:
        lower = changedate - relativedelta(months=ti)
        upper = changedate + relativedelta(months=ti)
        window = [(iv, obs) for (iv, obs) in zip(intervals, datasingle1)
                  if iv[0] >= lower and iv[1] <= upper]
        if not window:
            # An empty design matrix cannot be fitted; skip instead of
            # aborting before the plot is produced.
            print("no intervals within " + str(ti) + " months of the change date, skipping")
            continue
        window_intervals = [iv for (iv, _) in window]
        window_data = [obs for (_, obs) in window]
        res, _ = _fit_its(window_intervals, window_data)
        _write_summary(
            os.path.join(outputdir, "summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt"),
            res)

    fig = plt.figure(figsize=FIG_SIZE)
    interval_starts = [difftime(iv[0]) for iv in intervals]
    plt.plot(interval_starts, data1, label="average #1st-questions")
    plt.plot(interval_starts, data2, label="average #2nd+-questions")
    plt.grid(True)
    # The fitted values have one point per observation, so repeat the interval
    # start once for every observation of that interval.
    plt.plot([difftime(intervals[i][0]) for i, obs in enumerate(datasingle1) for _ in obs],
             its2ols1, label="sm single ITS 1")
    plt.plot([difftime(intervals[i][0]) for i, obs in enumerate(datasingle2) for _ in obs],
             its2ols2, label="sm single ITS 2")
    plt.title("Average #1st-questions of new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("#1st questions")
    plt.legend(loc="upper right")
    outfile = os.path.join(outputdir, "average_questions-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        try:
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)