"""Interrupted time series (ITS) analysis of average answer sentiment.

For every interval produced by ``calc_intervals`` the script averages the
cached ``compound`` sentiment of the answers created in that interval, fits a
segmented linear regression around a fixed intervention date and writes a
plot to ``<folder>/output/its/average_sentiments.png``.

Analysis notes recorded alongside this script (from the project's ``notes``
file):

* When was the deviation greater than one standard deviation?
* Plot the number of posts per user in period x.
* Interrupted time series: 3 months before the change and after the change.
  There are two variants: a jump (level change), or two lines (slope change).
"""
import os
import sys
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms

OLD_USER_PERCENTILE = 0.95

colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# Intervals that end on or before this date are left out of the analysis.
ANALYSIS_START = datetime.fromisoformat("2015-01-01T00:00:00")
# Intervals that end after this date count as post-intervention.
INTERVENTION_DATE = datetime.fromisoformat("2018-09-01T00:00:00")


def _average_sentiment(posts, firstcontrib, sentiments, date_from, date_to):
    """Return the mean compound sentiment of answers in [date_from, date_to).

    Returns NaN when no answer matches the filter.
    """
    window = timedelta(days=DAYS_NEW_USER)
    # NOTE(review): the plot title says "new users", but this condition keeps
    # answers posted at least DAYS_NEW_USER days AFTER the question owner's
    # first contribution. Confirm which population is intended.
    scores = (dmt(posts)
              .map(lambda p: [sentiments[a['Id']]['compound']
                              for a in p['Answers']
                              if date_from <= a['CreationDate'] < date_to
                              and firstcontrib[p['OwnerUserId']] + window <= a['CreationDate']])
              .filter(lambda p: p != [])
              .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
              .getresults())
    return np.average(scores) if len(scores) > 0 else float("nan")


def _fit_its(data, post_flags):
    """Fit y = b0 + b1*t + b2*x + b3*t*x and return (model, R^2, fitted values).

    ``t`` is the running index of the observation and ``x`` is the 0/1
    post-intervention flag. The intercept b0 is estimated by the model itself,
    so no constant column is added to the design matrix.
    """
    t = np.arange(len(data)).reshape(-1, 1)
    x = np.asarray(post_flags).reshape(-1, 1)
    design = np.concatenate((t, x, t * x), axis=1)
    y = np.asarray(data, dtype=float)
    reg = LinearRegression()
    reg.fit(design, y)
    # predict() applies the fitted intercept, so the plotted line is exactly
    # the model that the reported score belongs to.
    return reg, reg.score(design, y), reg.predict(design)


def main(folder, intervl):
    """Compute per-interval average sentiment, fit the ITS model and plot it.

    :param folder: dataset folder; sentiments are read from
        ``<folder>/output/sentiments.py`` and the plot is written below
        ``<folder>/output/its/``.
    :param intervl: interval length handed to ``calc_intervals``.
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    cachedsentiments = imprt(folder + "/output/sentiments.py").answers
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = os.path.join(folder, "output", "its")
    # makedirs avoids passing a user-supplied path through a shell.
    os.makedirs(outputdir, exist_ok=True)

    # Only intervals with a usable average are kept; the three lists stay
    # aligned index by index.
    interval_starts = []
    post_flags = []
    data = []
    for (option_date_from, option_date_to) in intervals:
        if option_date_to <= ANALYSIS_START:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        avg = _average_sentiment(posts, firstcontrib, cachedsentiments,
                                 option_date_from, option_date_to)
        if np.isnan(avg):
            continue
        interval_starts.append(option_date_from)
        post_flags.append(0 if option_date_to <= INTERVENTION_DATE else 1)
        data.append(avg)

    if not data:
        print("no sentiment data available, nothing to plot")
        return

    print("Computing ITS ...")
    reg, score, its = _fit_its(data, post_flags)
    print("score: " + str(score))
    print("intercept: " + str(reg.intercept_))
    print("coef: " + str(reg.coef_))
    print("its: " + str(its))

    fig = plt.figure(figsize=(16, 12))
    plt.plot(interval_starts, data, label="average sentiment")
    plt.plot(interval_starts, its, label="ITS (score " + str(score) + ")")
    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
    outfile = os.path.join(outputdir, "average_sentiments.png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def _parse_cli(argv):
    """Validate the command line and return ``(folder, interval)``.

    Prints a message and exits with status 1 on any invalid input. Only the
    first optional argument (``-i<n>``, 1 <= n <= 12) is examined.
    """
    usage = argv[0] + " <folder> [-i<interval>]"
    if len(argv) < 2:
        print(usage)
        sys.exit(1)
    folder = argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 3
    if len(argv) >= 3:
        if not argv[2].startswith("-i"):
            print("unknown parameter: " + argv[2])
            sys.exit(1)
        try:
            interval = int(argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)
    return folder, interval


if __name__ == "__main__":
    # execute only if run as a script
    cli_folder, cli_interval = _parse_cli(sys.argv)
    main(cli_folder, cli_interval)