From 2c1524a33511ae9572c2a02e756ae3f064100ace Mon Sep 17 00:00:00 2001
From: wea_ondara
Date: Wed, 18 Dec 2019 13:02:16 +0100
Subject: [PATCH] wip

---
 its.py        | 135 ++++++++++++++++++++++++++++++++++++++------------
 notes         |  22 ++++++++
 posthist.py   |  54 +++++++++++++++++---
 sentiments.py |  72 +++++++++++++++++++++++++++
 4 files changed, 245 insertions(+), 38 deletions(-)

diff --git a/its.py b/its.py
index 8f70bb2..a43c3f2 100644
--- a/its.py
+++ b/its.py
@@ -1,17 +1,17 @@
 import os
-import os
 import sys
-from datetime import datetime
-from datetime import timedelta
 
 import matplotlib.pyplot as plt
 import numpy as np
+import os
+from datetime import datetime
+from datetime import timedelta
 from sklearn.linear_model import LinearRegression
 
-from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
 from loader import load, dmt, cms
-
-OLD_USER_PERCENTILE = 0.95
+from sentiments import readtoxleveltxt
+import statsmodels.api as sm
 
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -23,33 +23,31 @@ def main(folder, intervl):
 
     start = cms()
     printnoln("reading sentiments ...")
-    cachedsentiments = imprt(folder + "/output/sentiments.py").answers
+    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
     rprint("reading sentiments ... took " + str(cms() - start) + "ms")
 
     outputdir = folder + "/output/its/"
     os.system("mkdir -p " + outputdir)
 
     data = []
+    datasingle = []
+    count = []
     for (option_date_from, option_date_to) in intervals:
         if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"):
             data.append(float("nan"))
             continue
         print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
 
         # avg sentiments
-        # print(dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
-        #                                 for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
-        #                                 and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
-        #       .filter(lambda p: p != [])
-        #       .getresults())
-        # break
         filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                               for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
                                               and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
                     .filter(lambda p: p != [])
                     .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                     .getresults())
+        datasingle.append(filtered)
         avg = np.average(filtered) if len(filtered) > 0 else float("nan")
         data.append(avg)
+        count.append(len(filtered))
 
     # filter nan entries
     for i in range(len(data)):
@@ -57,37 +55,110 @@ def main(folder, intervl):
         del data[i]
         del intervals[i]
 
-    print("Computing ITS ...")
-    t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
-    # print("t", t)
-    x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
-    # print("x", x)
-    X = np.reshape(np.array([data[0] for i in range(len(data))]), (-1, 1))
-    # print("X", X)
-    X = np.concatenate((X, t), 1)
+    # print("Computing ITS ...")
+    # t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
+    # x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
+    # X = np.array(t)
+    # X = np.concatenate((X, x), 1)
+    # X = np.concatenate((X, np.multiply(t, x)), 1)
+    # y = np.reshape(np.array(data), (-1, 1))
print("Xfin", X) + # # print("y", y) + # reg = LinearRegression() + # reg.fit(X, y) + # score = reg.score(X, y) + # coef = np.reshape(np.array(reg.coef_), (-1, 1)) + # its = X.dot(coef) + reg.intercept_ + # print("score: " + str(score)) + # print("coef: " + str(coef)) + # print("its: " + str(its)) + + print("Computing full ITS") + t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) + x = np.reshape(np.array([(0 if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) + X = np.array(t) X = np.concatenate((X, x), 1) X = np.concatenate((X, np.multiply(t, x)), 1) - y = np.reshape(np.array(data), (-1, 1)) + y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1)) # print("Xfin", X) # print("y", y) - reg = LinearRegression() - reg.fit(X, y) - score = reg.score(X, y); - coef = np.reshape(np.array(reg.coef_), (-1, 1)) - its = X.dot(coef) + data[0] - print("score: " + str(score)) - print("coef: " + str(coef)) - print("its: " + str(its)) + # reg = LinearRegression() + # reg.fit(X, y) + # score2 = reg.score(X, y) + # coef2 = np.reshape(np.array(reg.coef_), (-1, 1)) + # its2 = X.dot(coef2) + reg.intercept_ + # print("intercept: " + str(reg.intercept_)) + # print("score: " + str(score2)) + # print("coef: " + str(coef2)) + # print("its: " + str(its2)) + X = sm.add_constant(X) + res = sm.OLS(y, X).fit() + p2 = res.pvalues + print("coef ols: " + str(res.params)) + print("sum ols: " + str(res.summary())) + coef2ols = np.reshape(np.array(res.params), (-1, 1)) + its2ols = X.dot(coef2ols) + with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file: + file.write(str(res.summary())) + + # print("Computing segmented ITS before") + # X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) + # y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) + # reg = LinearRegression() + # reg.fit(X, y) + # scoreb = reg.score(X, y) + # coefb = np.reshape(np.array(reg.coef_), (-1, 1)) + # itsb = X.dot(coefb) + reg.intercept_ + # print("scoreb: " + str(scoreb)) + # print("coefb: " + str(coefb)) + # print("itsb: " + str(itsb)) + + # print("Computing segmented ITS after") + # X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) + # y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) + # reg = LinearRegression() + # reg.fit(X, y) + # scorea = reg.score(X, y) + # coefa = np.reshape(np.array(reg.coef_), (-1, 1)) + # itsa = X.dot(coefa) + reg.intercept_ + # print("scorea: " + str(scorea)) + # print("coefa: " + str(coefa)) + # print("itsa: " + str(itsa)) fig = plt.figure(figsize=(16, 12)) plt.plot([i[0] for i in intervals], data, label="average sentiment") - plt.plot([i[0] for i in intervals], its, label="ITS (score " + str(score) + ")") + plt.grid(True) + for i in range(len(data)): + va = "center" + if 0 < i < len(data) - 1: + if data[i - 1] < data[i] and data[i + 1] < data[i]: + va = "bottom" + elif data[i - 1] > data[i] and data[i + 1] > data[i]: + va = "top" + elif i == 0: + if data[i + 1] < data[i]: + va = "bottom" + else: + va = "top" + elif i == 
diff --git a/notes b/notes
index 2cd6114..b77080f 100644
--- a/notes
+++ b/notes
@@ -13,3 +13,25 @@ interrupted time series
 3 months before the change and after the change
 there are 2 variants: a jump, or 2 lines
 
+
+
+
+----------
+fix purple lines in its -> done
+coef -> p values significance, statsmodels sm logit, sm.summary() -> done
+briefly summarize what I have done
+plot over time <-0.05, -, >0.05 for sentiments, 3 curves -> done
+
+paper, Ctrl+F arousal, method is important, check the references https://link.springer.com/content/pdf/10.1007%2Fs42001-017-0001-x.pdf
+
+#influence of code sections
+
+
+Papers:
+
+its:
+http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf
+
+
+
+
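On the "jump, or 2 lines" variants in the notes: the two models differ only in the design matrix. The jump variant allows just a level change at the intervention (columns 1, t, x), while the two-lines variant also lets the slope change (columns 1, t, x, t*x); the latter is what its.py currently fits. A short sketch of the two matrices (hypothetical names, not part of the patch):

import numpy as np
import statsmodels.api as sm

t = np.arange(24, dtype=float)       # time index
x = (t >= 12).astype(float)          # intervention indicator

X_jump = sm.add_constant(np.column_stack((t, x)))           # level change only ("jump")
X_lines = sm.add_constant(np.column_stack((t, x, t * x)))   # level + slope change ("2 lines")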
diff --git a/posthist.py b/posthist.py
index c59c639..7d317da 100644
--- a/posthist.py
+++ b/posthist.py
@@ -1,12 +1,14 @@
 import os
 import sys
 from collections import defaultdict
+from datetime import timedelta
 
 import matplotlib.pyplot as plt
 from matplotlib.ticker import MaxNLocator
 
-from common import calc_intervals, IMAGE_MAGICK
+from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER
 from loader import load, dmt
+from sentiments import readtoxleveltxt
 
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -14,15 +16,19 @@ colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 
 def main(folder, intervl):
     users, posts, firstcontrib, sumcontrib = load(folder)
     intervals = calc_intervals(posts, intervl)
+    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
 
     outputdir = folder + "/output/posthist/"
     os.system("mkdir -p " + outputdir)
 
     activeusercounts = []
+    answerstonewusers = []
+    sentimentstonewusers = []
     imgmagickcmd = IMAGE_MAGICK
     for (option_date_from, option_date_to) in intervals:
-        print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
+        print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
+
+        # post histograms
         # filter posts by option_date_from <= creation date <= option_date_to
         newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
@@ -34,7 +40,7 @@ def main(folder, intervl):
         postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
         activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
 
-        histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
+        histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
         histdata = [pc for pc in postcounts.values()]
 
         fig = plt.figure(figsize=(16, 12))
@@ -48,14 +54,50 @@ def main(folder, intervl):
         fig.savefig(histfilename + ".png", bbox_inches='tight')
         plt.close(fig)
         imgmagickcmd += " " + histfilename + ".png"
-    os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
 
+        # answers to new users
+        answers = (dmt(posts).map(lambda q: [a for a in q['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
+                                             and firstcontrib[q['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
+                   .getresults())
+        count = sum([len(a) for a in answers])
+        answerstonewusers.append(((option_date_from, option_date_to), count))
+        sent = ([cachedsentiments[a['Id']] for al in answers for a in al])
+        sentbad = len([1 for a in sent if a['compound'] < -0.05])
+        sentneu = len([1 for a in sent if -0.05 <= a['compound'] <= 0.05])
+        sentgood = len([1 for a in sent if a['compound'] > 0.05])
+        sentimentstonewusers.append(((option_date_from, option_date_to), (sent, sentbad, sentneu, sentgood)))
+
+    # gen pdf for post histograms
+    os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf")
+
+    # plot posts diagram
     fig = plt.figure(figsize=(16, 12))
     plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
     plt.yscale('log')
-    plt.ylim(bottom=0)
+    plt.ylim(bottom=0.001)
     plt.title("Active users")
-    fig.savefig(outputdir + "activeusers.png", bbox_inches='tight')
+    fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
+    plt.close(fig)
+
+    # plot answers to new users diagram
+    fig = plt.figure(figsize=(16, 12))
+    plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
+    plt.yscale('log')
+    plt.ylim(bottom=0.001)
+    plt.title("#Answers to new users")
+    fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
+    plt.close(fig)
+
+    # plot sentiments of answers to new users diagram
+    fig = plt.figure(figsize=(16, 12))
+    plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
+    plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
+    plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
+    plt.yscale('log')
+    plt.ylim(bottom=0.001)
+    plt.legend(loc="upper right")
+    plt.title("Sentiments of answers to new users")
+    fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
     plt.close(fig)
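The sentiment buckets added to posthist.py follow the usual VADER convention on the compound score: negative below -0.05, positive above 0.05, neutral in between (the same thresholds as in the notes). A standalone sketch of that classification (hypothetical helper name, not part of the patch):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def classify(text):
    # VADER's compound score lies in [-1, 1]; +/-0.05 are the common cut-offs
    compound = analyser.polarity_scores(text)['compound']
    if compound < -0.05:
        return "negative"
    if compound > 0.05:
        return "positive"
    return "neutral"

print(classify("This answer is really helpful, thank you!"))  # expected: positive
print(classify("This is wrong and a waste of time."))         # expected: negative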
answer") + plt.yscale('log') + plt.ylim(bottom=0.001) + plt.legend(loc="upper right") + plt.title("Sentiments of answers to new users") + fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight') plt.close(fig) diff --git a/sentiments.py b/sentiments.py index 34bc264..f2dc97b 100644 --- a/sentiments.py +++ b/sentiments.py @@ -4,6 +4,7 @@ import sys from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from loader import load, dmt +from common import imprt analyser = SentimentIntensityAnalyzer() @@ -20,6 +21,30 @@ def main(folder): toxlevels = {id: p for (id, p) in toxlevels} dumptoxlevels(toxlevels, outfilename + ".py") + dumptoxlevelstxt(toxlevels, outfilename + ".txt") + + # (lvl2q, lvl2a) = readtoxleveltxt(outfilename + ".txt") + # + # s1 = str(toxlevels) + # s2 = str(lvl2q) + # # print("s1: " + s1) + # # print("s2: " + s2) + # if s1 != s2: + # print("not equal") + # else: + # print("equal") + # + # # print("s1: " + str(imprt(folder + "/output/sentiments.py").answers)) + # # print("s2: " + str(lvl2a)) + # if str(imprt(folder + "/output/sentiments.py").answers) != str(lvl2a): + # print("a not equal") + # else: + # print("a equal") + # + # if str(imprt(folder + "/output/sentiments.py").posts) != str(lvl2q): + # print("q not equal") + # else: + # print("q equal") def computeToxLevel(text): @@ -36,6 +61,53 @@ def dumptoxlevels(lvls, filename): file.write("answers = " + str(answers) + "\n") +def dumptoxlevelstxt(lvls, filename): + answers = dict() + for p in lvls.values(): + for id, a in p.items(): + answers[id] = a + pstr = [str(id) + ":" + ";".join([str(aid) + ":" + str(a['neg']) + ":" + str(a['neu']) + ":" + str(a['pos']) + ":" + str(a['compound']) for (aid, a) in p.items()]) for (id, p) in lvls.items()] + astr = [str(id) + ":" + str(p['neg']) + ":" + str(p['neu']) + ":" + str(p['pos']) + ":" + str(p['compound']) for (id, p) in answers.items()] + pstr = ";;".join(pstr) + astr = ";".join(astr) + with open(filename, "w") as file: + file.write("posts=" + pstr + "\n") + file.write("answers=" + astr + "\n") + + +def readtoxleveltxt(filename): + lines = "" + with open(filename, 'r') as f: + lines = f.read() + lines = lines.split("\n") + + rq = {} + ra = {} + for line in lines: + if line.startswith("posts="): + line = line[len("posts="):] + rq = line.split(";;") # split by q + # print("i1: " + str(rq[0:5])) + rq = [l.split(":", 1) for l in rq] # get q id + # print("i2: " + str(rq[0:5])) + rq = [(qid, [x.split(":") for x in a.split(";")]) if len(a) > 0 else (qid, []) for [qid, a] in rq] + # print("i3:" + str(rq)) + # rq = {int(id): {int(1): "a" for x in a} for (id, a) in rq} + # rq = {int(id): {str(aid[0]): str(aid) for aid in a} for (id, a) in rq} + rq = {id: {aid: {"neg": float(neg), "neu": float(neu), "pos": float(pos), "compound": float(compound)} for [aid, neg, neu, pos, compound] in a} for (id, a) in rq} + # print("i4:" + str(rq)[0:500]) + # sys.exit() + elif line.startswith("answers="): + line = line[len("answers="):] + ra = line.split(";") + ra = [l.split(":") for l in ra] + # print("i1: " + str(ra[0:5])) + ra = {id: {"neg": float(neg), "neu": float(neu), "pos": float(pos), "compound": float(compound)} for [id, neg, neu, pos, compound] in ra} + # print("i1: " + str(ra)[0:500]) + + return rq, ra + + if __name__ == "__main__": # execute only if run as a script usage = sys.argv[0] + " "