"""Interrupted time series (ITS) analysis of answer sentiment for new users.

For every interval returned by ``calc_intervals`` the script collects the
``compound`` sentiment of the answers given to posts of new users, fits a
segmented OLS regression around ``CHANGE_DATE`` (once over the whole series and
once per window size in ``thresholds``) and writes the regression summaries and
plots to ``<folder>/output/itsnew/``.
"""
import os
import sys
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from dateutil.relativedelta import relativedelta

from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
from loader import load, dmt, cms
from sentiments import readtoxleveltxt

colors = ['red', 'green', 'blue', 'orange', 'deeppink']
# Half-widths (in months) of the windows around CHANGE_DATE used for the
# additional "threshold" ITS fits.
thresholds = [6, 9, 12, 15]


def _fit_its(intervals, samples):
    """Fit the segmented regression ``y = b0 + b1*t + b2*x + b3*t*x``.

    ``t`` is the index of the interval a sample belongs to and ``x`` is 0 for
    intervals starting on or before ``CHANGE_DATE`` and 1 afterwards. Every
    single sample is one observation.

    :param intervals: list of ``(start, end)`` tuples, one per entry of ``samples``
    :param samples: list of non-empty lists with the sentiment values per interval
    :return: tuple ``(res, fitted)`` of the statsmodels OLS result and the
        fitted values as an ``(n, 1)`` array, one row per sample
    """
    counts = np.array([len(vals) for vals in samples], dtype=int)
    step = np.array([(0 if iv[0] <= CHANGE_DATE else 1) for iv in intervals], dtype=int)
    # Repeat the per-interval regressors once per sample of that interval.
    t = np.repeat(np.arange(len(samples)), counts).reshape(-1, 1)  # b1
    x = np.repeat(step, counts).reshape(-1, 1)  # b2
    X = np.concatenate((t, x, np.multiply(t, x)), 1)  # b1, b2, b3
    X = sm.add_constant(X)  # b0
    y = np.reshape(np.array([v for vals in samples for v in vals]), (-1, 1))
    res = sm.OLS(y, X).fit()
    fitted = X.dot(np.reshape(np.array(res.params), (-1, 1)))
    return res, fitted


def _label_valign(series, i):
    """Return the vertical alignment for the text label of point ``i``.

    Labels of local maxima are placed above the point ("bottom"), labels of
    local minima below it ("top"); the first and last point only look at their
    single neighbour. A series with a single point has no neighbour at all and
    gets "center".
    """
    last = len(series) - 1
    if last == 0:
        return "center"
    if i == 0:
        return "bottom" if series[1] < series[0] else "top"
    if i == last:
        return "bottom" if series[i - 1] < series[i] else "top"
    if series[i - 1] < series[i] and series[i + 1] < series[i]:
        return "bottom"
    if series[i - 1] > series[i] and series[i + 1] > series[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Run the ITS analysis for the data set in ``folder``.

    :param folder: data set directory; sentiments are read from
        ``<folder>/output/sentiments.txt`` and results are written to
        ``<folder>/output/itsnew/``
    :param intervl: interval size handed to ``calc_intervals``; it is also part
        of the output file names
    """
    # load() also returns the users and the summed contributions; neither is needed here.
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/itsnew/"
    # Create the directory without going through a shell: the path comes from
    # the command line and must not be interpreted as shell syntax.
    os.makedirs(outputdir, exist_ok=True)

    # Intervals ending on or before this date are ignored.
    cutoff = datetime.fromisoformat("2015-01-01T00:00:00")

    data = []  # average sentiment per kept interval
    datasingle = []  # all single sentiment values per kept interval
    kept_intervals = []
    for (option_date_from, option_date_to) in intervals:
        if option_date_to <= cutoff:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # avg sentiments
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              if option_date_from <= p['CreationDate'] < option_date_to  # post in interval
                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']  # post created within DAYS_NEW_USER days of 1st contrib
                                              and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']])  # answer within DAYS_NEW_USER days of post creation
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        avg = np.average(filtered) if len(filtered) > 0 else float("nan")
        if np.isnan(avg):
            # Intervals without a usable average are left out of all series.
            continue
        kept_intervals.append((option_date_from, option_date_to))
        datasingle.append(filtered)
        data.append(avg)
    intervals = kept_intervals

    if not data:
        print("no interval with sentiment data found, nothing to do")
        return

    # deseason
    # NOTE(review): the seasonal slot is the position in the filtered series
    # modulo 12, so it only lines up with calendar months for monthly intervals
    # without gaps - confirm that this is intended.
    season = []
    for slot in range(12):
        members = data[slot::12]
        # Series shorter than 12 entries leave some slots without data.
        season.append(np.average(members) if members else float("nan"))
    lowest = np.nanmin(season)
    mins = [m - lowest for m in season]
    print("mins", mins)
    dsdatasingle = [[d - mins[i % 12] for d in vals] for i, vals in enumerate(datasingle)]

    print("Computing full ITS")
    res, fitted = _fit_its(intervals, dsdatasingle)
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    # Shift the fit of the deseasoned values back by the average seasonal
    # offset; slots without data are ignored.
    minavg = np.nanmean(mins)
    its2ols = fitted + minavg

    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
        file.write(str(res.summary()))

    thresfits = []  # (months, intervals in window, samples in window, fitted values)
    print("Computing threshold ITS")
    for months in thresholds:
        window_start = CHANGE_DATE - relativedelta(months=months)
        window_end = CHANGE_DATE + relativedelta(months=months)
        window = [(iv, vals) for (iv, vals) in zip(intervals, datasingle)
                  if iv[0] >= window_start and iv[1] <= window_end]
        if not window:
            print("no data within " + str(months) + " months of the change date, skipping")
            continue
        iv = [entry[0] for entry in window]
        d = [entry[1] for entry in window]
        res, thresfitted = _fit_its(iv, d)
        thresfits.append((months, iv, d, thresfitted))
        with open(outputdir + "/summary_threshold" + str(months) + "-i" + str(intervl) + ".txt", "w") as file:
            file.write(str(res.summary()))

    # x position of every single sample: the start of its interval
    sample_times = [difftime(iv[0]) for iv, vals in zip(intervals, datasingle) for _ in vals]

    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(iv[0]) for iv in intervals], data, label="average sentiment")
    plt.grid(True)
    for i, value in enumerate(data):
        # annotate every point with the number of samples behind it
        plt.text(difftime(intervals[i][0]), value, ("n=" if i == 0 else "") + str(len(datasingle[i])),
                 ha="center", va=_label_valign(data, i))
    plt.plot(sample_times, its2ols, label="sm single ITS")
    for (months, iv, d, thresfitted) in thresfits:
        plt.plot([difftime(span[0]) for span, vals in zip(iv, d) for _ in vals], thresfitted,
                 label="thres ITS " + str(months) + " months")
    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # plot seasonality
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(sample_times,
             [mins[i % 12] for i, vals in enumerate(datasingle) for _ in vals],
             label="seasonality")
    plt.title("Average sentiments for new users - seasonality")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment - seasonality")
    plt.legend(loc="upper right")
    outfile = outputdir + "/season-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # plot seasonality post count
    # The sample counts of the first (up to) 12 intervals serve as the seasonal
    # pattern; shorter series simply provide fewer slots.
    pcmins = [len(vals) for vals in datasingle[:12]]
    lowest_count = min(pcmins)
    pcmins = [m - lowest_count for m in pcmins]
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(iv[0]) for iv in intervals],
             [pcmins[i % 12] for i in range(len(datasingle))],
             label="seasonality")
    plt.title("post count for new users - seasonality")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("post count - seasonality")
    plt.legend(loc="upper right")
    outfile = outputdir + "/season_postcount-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


if __name__ == "__main__":
    # execute only if run as a script
    # NOTE(review): the usage text names no arguments although a folder and an
    # optional "-i<n>" are parsed below - confirm the intended wording.
    usage = sys.argv[0] + " "
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)

    main(folder, interval)