From 69221ac8b37032213a8426e4df6db083641cc1d9 Mon Sep 17 00:00:00 2001 From: wea_ondara Date: Thu, 23 Jan 2020 22:40:28 +0100 Subject: [PATCH] wip --- common.py | 2 +- its.py | 100 +++++++++++++++++---------------------------- text/0_doc.tex | 24 +++++++++++ text/1_intro.tex | 1 + text/2_relwork.tex | 4 ++ text/bib.bib | 10 +++++ 6 files changed, 78 insertions(+), 63 deletions(-) create mode 100644 text/0_doc.tex create mode 100644 text/1_intro.tex create mode 100644 text/2_relwork.tex create mode 100644 text/bib.bib diff --git a/common.py b/common.py index 1ae9cb7..0933923 100644 --- a/common.py +++ b/common.py @@ -34,7 +34,7 @@ def calc_intervals(posts, months=3): intervals = [] while cdate < lastpost: nextmon = cdate.month + months - nextquarter = cdate.replace(month=nextmon if nextmon <=12 else nextmon-12, year=cdate.year + (0 if nextmon <= 12 else 1)) + nextquarter = cdate.replace(month=nextmon if nextmon <= 12 else nextmon - 12, year=cdate.year + (0 if nextmon <= 12 else 1)) print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) intervals.append((cdate, nextquarter)) cdate = nextquarter diff --git a/its.py b/its.py index a43c3f2..1b11b3c 100644 --- a/its.py +++ b/its.py @@ -1,19 +1,20 @@ -import os import sys import matplotlib.pyplot as plt import numpy as np import os +import statsmodels.api as sm from datetime import datetime from datetime import timedelta -from sklearn.linear_model import LinearRegression +from dateutil.relativedelta import relativedelta from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER from loader import load, dmt, cms from sentiments import readtoxleveltxt -import statsmodels.api as sm colors = ['red', 'green', 'blue', 'orange', 'deeppink'] +thresholds = [2, 3, 4, 5, 6] +changedate = datetime.fromisoformat("2018-09-01T00:00:00") def main(folder, intervl): @@ -55,42 +56,13 @@ def main(folder, intervl): del data[i] del intervals[i] - # print("Computing ITS ...") - # t = 
np.reshape(np.array([i for i in range(len(data))]), (-1, 1)) - # x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1)) - # X = np.array(t) - # X = np.concatenate((X, x), 1) - # X = np.concatenate((X, np.multiply(t, x)), 1) - # y = np.reshape(np.array(data), (-1, 1)) - # # print("Xfin", X) - # # print("y", y) - # reg = LinearRegression() - # reg.fit(X, y) - # score = reg.score(X, y) - # coef = np.reshape(np.array(reg.coef_), (-1, 1)) - # its = X.dot(coef) + reg.intercept_ - # print("score: " + str(score)) - # print("coef: " + str(coef)) - # print("its: " + str(its)) - print("Computing full ITS") t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) - x = np.reshape(np.array([(0 if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) + x = np.reshape(np.array([(0 if intervals[i][1] <= changedate else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) X = np.array(t) X = np.concatenate((X, x), 1) X = np.concatenate((X, np.multiply(t, x)), 1) y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1)) - # print("Xfin", X) - # print("y", y) - # reg = LinearRegression() - # reg.fit(X, y) - # score2 = reg.score(X, y) - # coef2 = np.reshape(np.array(reg.coef_), (-1, 1)) - # its2 = X.dot(coef2) + reg.intercept_ - # print("intercept: " + str(reg.intercept_)) - # print("score: " + str(score2)) - # print("coef: " + str(coef2)) - # print("its: " + str(its2)) X = sm.add_constant(X) res = sm.OLS(y, X).fit() p2 = res.pvalues @@ -101,29 +73,35 @@ def main(folder, intervl): with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file: file.write(str(res.summary())) - # print("Computing segmented ITS before") - # X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if 
intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) - # y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) - # reg = LinearRegression() - # reg.fit(X, y) - # scoreb = reg.score(X, y) - # coefb = np.reshape(np.array(reg.coef_), (-1, 1)) - # itsb = X.dot(coefb) + reg.intercept_ - # print("scoreb: " + str(scoreb)) - # print("coefb: " + str(coefb)) - # print("itsb: " + str(itsb)) - - # print("Computing segmented ITS after") - # X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) - # y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) - # reg = LinearRegression() - # reg.fit(X, y) - # scorea = reg.score(X, y) - # coefa = np.reshape(np.array(reg.coef_), (-1, 1)) - # itsa = X.dot(coefa) + reg.intercept_ - # print("scorea: " + str(scorea)) - # print("coefa: " + str(coefa)) - # print("itsa: " + str(itsa)) + thresdata = [] + thresols = [] + thresiv = [] + thresp = [] + print("Computing threshold ITS") + for ti in thresholds: + print(1, changedate - relativedelta(months=ti)) + print(2, changedate + relativedelta(months=ti)) + z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= changedate - relativedelta(months=ti) and i[1] <= changedate + relativedelta(months=ti)] + iv = [i for (i, x) in z] + d = [x for (i, x) in z] + t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1)) + x = np.reshape(np.array([(0 if iv[i][1] <= changedate else 1) for i in range(len(d)) for j in d[i]]), (-1, 1)) + X = np.array(t) + X = np.concatenate((X, x), 1) + X = np.concatenate((X, np.multiply(t, x)), 1) + y = np.reshape(np.array([v for a in d for v in a]), (-1, 1)) + X = sm.add_constant(X) + res = sm.OLS(y, X).fit() + tp 
= res.pvalues + thresp.append(tp) + # print("coef ols: " + str(res.params)) + # print("sum ols: " + str(res.summary())) + coefthresols = np.reshape(np.array(res.params), (-1, 1)) + thresols.append(X.dot(coefthresols)) + thresiv.append(iv) + thresdata.append(d) + with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file: + file.write(str(res.summary())) fig = plt.figure(figsize=(16, 12)) plt.plot([i[0] for i in intervals], data, label="average sentiment") @@ -146,13 +124,11 @@ def main(folder, intervl): else: va = "top" plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va) - # plt.plot([i[0] for i in intervals], its, label="aggregated ITS (score " + str(score) + ")") - # plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2, label="single ITS (score " + str(score2) + ", p " + str(p2) + ")") plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")") - # plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")], itsb, - # label="segmented ITS b (score " + str(scoreb) + ")") - # plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")], itsa, - # label="segmented ITS a (score " + str(scorea) + ")") + print("shape: " + str(np.shape(thresdata))) + for (ti, t) in enumerate(thresholds): + print("shape1: " + str(np.shape(thresdata[ti]))) + plt.plot([thresiv[ti][i][0] for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months (pvalues " + str(thresp[ti]) + ")") plt.title("Average sentiments for new users") plt.xticks(rotation=90) plt.xlabel("months") diff --git a/text/0_doc.tex b/text/0_doc.tex new file mode 100644 index 
0000000..b507a7b --- /dev/null +++ b/text/0_doc.tex @@ -0,0 +1,24 @@ +\documentclass[a4paper,10pt]{article} +\usepackage[utf8]{inputenc} + +%opening +\title{} +\author{} + +\begin{document} + +\maketitle + +\begin{abstract} + +\end{abstract} + +\include{1_intro} +\include{2_relwork} +\include{3_method} +\include{4_datasets} +\include{5_results} +\include{6_discussion} +\include{7_conclusion} + +\end{document} diff --git a/text/1_intro.tex b/text/1_intro.tex new file mode 100644 index 0000000..6d8b74b --- /dev/null +++ b/text/1_intro.tex @@ -0,0 +1 @@ +\section{Introduction} diff --git a/text/2_relwork.tex b/text/2_relwork.tex new file mode 100644 index 0000000..0fbd005 --- /dev/null +++ b/text/2_relwork.tex @@ -0,0 +1,4 @@ +\section{Related Work} + +%tutorial: Bernal et al. \cite{bernal2017interrupted} +%TODO: literature analysis diff --git a/text/bib.bib b/text/bib.bib new file mode 100644 index 0000000..ce0b86e --- /dev/null +++ b/text/bib.bib @@ -0,0 +1,10 @@ +@article{bernal2017interrupted, + title={Interrupted time series regression for the evaluation of public health interventions: a tutorial}, + author={Bernal, James Lopez and Cummins, Steven and Gasparrini, Antonio}, + journal={International journal of epidemiology}, + volume={46}, + number={1}, + pages={348--355}, + year={2017}, + publisher={Oxford University Press} +}