"""Plot average answer sentiment per interval and fit an ITS regression.

Run as a script: the first argument is the data folder, the optional
second argument ``-i<N>`` (1 - 12) is forwarded to ``calc_intervals``.
"""

import os
import sys
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms
from sentiments import readtoxleveltxt

colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# Intervals ending on or before this date are left out of the analysis.
ANALYSIS_START = datetime.fromisoformat("2015-01-01T00:00:00")
# Intervals ending after this date get step indicator 1 in the regression.
ITS_BREAK_DATE = datetime.fromisoformat("2018-09-01T00:00:00")


def _interval_sentiments(posts, firstcontrib, cachedsentiments, date_from, date_to):
    """Return the compound sentiment of every selected answer in one interval.

    An answer is selected when its CreationDate lies in [date_from, date_to)
    and is at least DAYS_NEW_USER days after the first contribution of the
    owner of the post it belongs to.
    """
    # NOTE(review): this keeps answers created *after* the post owner's
    # DAYS_NEW_USER window, while the plot title says "new users" - confirm
    # which population is intended.
    window = timedelta(days=DAYS_NEW_USER)
    return (dmt(posts)
            .map(lambda p: [cachedsentiments[a['Id']]['compound']
                            for a in p['Answers']
                            if date_from <= a['CreationDate'] < date_to
                            and firstcontrib[p['OwnerUserId']] + window <= a['CreationDate']])
            .filter(lambda p: p != [])
            .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
            .getresults())


def _label_valign(data, i):
    """Pick the vertical alignment of the count label next to point ``i``.

    Local maxima get their label above ("bottom"), local minima below
    ("top"); end points are compared with their only neighbour.
    """
    last = len(data) - 1
    if last <= 0:
        # A single point has no neighbour to compare with.
        return "center"
    if i == 0:
        return "bottom" if data[1] < data[0] else "top"
    if i == last:
        return "bottom" if data[last - 1] < data[last] else "top"
    if data[i - 1] < data[i] and data[i + 1] < data[i]:
        return "bottom"
    if data[i - 1] > data[i] and data[i + 1] > data[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Compute per-interval average sentiments, fit an OLS model and plot both.

    Args:
        folder: data folder; sentiments are read from
            ``<folder>/output/sentiments.txt`` and results are written to
            ``<folder>/output/its/``.
        intervl: interval size passed to ``calc_intervals``; also used in the
            output file names.

    Writes ``summary-i<intervl>.txt`` (statsmodels OLS summary) and
    ``average_sentiments-i<intervl>.png``.
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/its/"
    # No shell involved: works for folder names with spaces or shell
    # metacharacters and on platforms without ``mkdir -p``.
    os.makedirs(outputdir, exist_ok=True)

    # The three lists below are index-aligned: entry k of each describes the
    # same interval. Intervals without usable data are dropped from all of
    # them at once so later index lookups cannot drift apart.
    kept_intervals = []
    data = []
    datasingle = []
    for (option_date_from, option_date_to) in intervals:
        if option_date_to <= ANALYSIS_START:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        filtered = _interval_sentiments(posts, firstcontrib, cachedsentiments,
                                        option_date_from, option_date_to)
        avg = np.average(filtered) if len(filtered) > 0 else float("nan")
        if np.isnan(avg):
            continue
        kept_intervals.append((option_date_from, option_date_to))
        data.append(avg)
        datasingle.append(filtered)
    intervals = kept_intervals

    if not data:
        print("no sentiment data in any interval; nothing to compute")
        return

    print("Computing full ITS")
    # One regression row per single sentiment value: time index of its
    # interval, step indicator, and their interaction.
    sizes = [len(values) for values in datasingle]
    step_flags = [0 if date_to <= ITS_BREAK_DATE else 1 for (_, date_to) in intervals]
    t = np.repeat(np.arange(len(datasingle)), sizes).reshape(-1, 1)
    x = np.repeat(np.array(step_flags), sizes).reshape(-1, 1)
    X = np.concatenate((t, x, np.multiply(t, x)), 1)
    y = np.reshape(np.array([d for values in datasingle for d in values]), (-1, 1))

    X = sm.add_constant(X)
    res = sm.OLS(y, X).fit()
    p2 = res.pvalues
    summary_text = str(res.summary())
    print("coef ols: " + str(res.params))
    print("sum ols: " + summary_text)
    coef2ols = np.reshape(np.array(res.params), (-1, 1))
    its2ols = X.dot(coef2ols)
    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
        file.write(summary_text)

    fig = plt.figure(figsize=(16, 12))
    plt.plot([date_from for (date_from, _) in intervals], data, label="average sentiment")
    plt.grid(True)
    for i, (avg, values) in enumerate(zip(data, datasingle)):
        plt.text(intervals[i][0], avg, ("n=" if i == 0 else "") + str(len(values)),
                 ha="center", va=_label_valign(data, i))
    # The fitted values have one entry per single sentiment, so repeat each
    # interval start once per value in that interval.
    fitted_dates = [date_from
                    for (date_from, _), values in zip(intervals, datasingle)
                    for _value in values]
    plt.plot(fitted_dates, its2ols, label="sm single ITS (pvalues " + str(p2) + ")")
    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 3
    if len(sys.argv) >= 3:
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        try:
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)