This commit is contained in:
wea_ondara
2020-01-23 22:40:28 +01:00
parent 8877747692
commit 69221ac8b3
6 changed files with 78 additions and 63 deletions

View File

@@ -34,7 +34,7 @@ def calc_intervals(posts, months=3):
intervals = [] intervals = []
while cdate < lastpost: while cdate < lastpost:
nextmon = cdate.month + months nextmon = cdate.month + months
nextquarter = cdate.replace(month=nextmon if nextmon <=12 else nextmon-12, year=cdate.year + (0 if nextmon <= 12 else 1)) nextquarter = cdate.replace(month=nextmon if nextmon <= 12 else nextmon - 12, year=cdate.year + (0 if nextmon <= 12 else 1))
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
intervals.append((cdate, nextquarter)) intervals.append((cdate, nextquarter))
cdate = nextquarter cdate = nextquarter

100
its.py
View File

@@ -1,19 +1,20 @@
import os
import sys import sys
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import os import os
import statsmodels.api as sm
from datetime import datetime from datetime import datetime
from datetime import timedelta from datetime import timedelta
from sklearn.linear_model import LinearRegression from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms from loader import load, dmt, cms
from sentiments import readtoxleveltxt from sentiments import readtoxleveltxt
import statsmodels.api as sm
colors = ['red', 'green', 'blue', 'orange', 'deeppink'] colors = ['red', 'green', 'blue', 'orange', 'deeppink']
thresholds = [2, 3, 4, 5, 6]
changedate = datetime.fromisoformat("2018-09-01T00:00:00")
def main(folder, intervl): def main(folder, intervl):
@@ -55,42 +56,13 @@ def main(folder, intervl):
del data[i] del data[i]
del intervals[i] del intervals[i]
# print("Computing ITS ...")
# t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
# x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
# X = np.array(t)
# X = np.concatenate((X, x), 1)
# X = np.concatenate((X, np.multiply(t, x)), 1)
# y = np.reshape(np.array(data), (-1, 1))
# # print("Xfin", X)
# # print("y", y)
# reg = LinearRegression()
# reg.fit(X, y)
# score = reg.score(X, y)
# coef = np.reshape(np.array(reg.coef_), (-1, 1))
# its = X.dot(coef) + reg.intercept_
# print("score: " + str(score))
# print("coef: " + str(coef))
# print("its: " + str(its))
print("Computing full ITS") print("Computing full ITS")
t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
x = np.reshape(np.array([(0 if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) x = np.reshape(np.array([(0 if intervals[i][1] <= changedate else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
X = np.array(t) X = np.array(t)
X = np.concatenate((X, x), 1) X = np.concatenate((X, x), 1)
X = np.concatenate((X, np.multiply(t, x)), 1) X = np.concatenate((X, np.multiply(t, x)), 1)
y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1)) y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1))
# print("Xfin", X)
# print("y", y)
# reg = LinearRegression()
# reg.fit(X, y)
# score2 = reg.score(X, y)
# coef2 = np.reshape(np.array(reg.coef_), (-1, 1))
# its2 = X.dot(coef2) + reg.intercept_
# print("intercept: " + str(reg.intercept_))
# print("score: " + str(score2))
# print("coef: " + str(coef2))
# print("its: " + str(its2))
X = sm.add_constant(X) X = sm.add_constant(X)
res = sm.OLS(y, X).fit() res = sm.OLS(y, X).fit()
p2 = res.pvalues p2 = res.pvalues
@@ -101,29 +73,35 @@ def main(folder, intervl):
with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file: with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
file.write(str(res.summary())) file.write(str(res.summary()))
# print("Computing segmented ITS before") thresdata = []
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) thresols = []
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) thresiv = []
# reg = LinearRegression() thresp = []
# reg.fit(X, y) print("Computing threshold ITS")
# scoreb = reg.score(X, y) for ti in thresholds:
# coefb = np.reshape(np.array(reg.coef_), (-1, 1)) print(1, changedate - relativedelta(months=ti))
# itsb = X.dot(coefb) + reg.intercept_ print(2, changedate + relativedelta(months=ti))
# print("scoreb: " + str(scoreb)) z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= changedate - relativedelta(months=ti) and i[1] <= changedate + relativedelta(months=ti)]
# print("coefb: " + str(coefb)) iv = [i for (i, x) in z]
# print("itsb: " + str(itsb)) d = [x for (i, x) in z]
t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1))
# print("Computing segmented ITS after") x = np.reshape(np.array([(0 if iv[i][1] <= changedate else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) X = np.array(t)
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1)) X = np.concatenate((X, x), 1)
# reg = LinearRegression() X = np.concatenate((X, np.multiply(t, x)), 1)
# reg.fit(X, y) y = np.reshape(np.array([v for a in d for v in a]), (-1, 1))
# scorea = reg.score(X, y) X = sm.add_constant(X)
# coefa = np.reshape(np.array(reg.coef_), (-1, 1)) res = sm.OLS(y, X).fit()
# itsa = X.dot(coefa) + reg.intercept_ tp = res.pvalues
# print("scorea: " + str(scorea)) thresp.append(tp)
# print("coefa: " + str(coefa)) # print("coef ols: " + str(res.params))
# print("itsa: " + str(itsa)) # print("sum ols: " + str(res.summary()))
coefthresols = np.reshape(np.array(res.params), (-1, 1))
thresols.append(X.dot(coefthresols))
thresiv.append(iv)
thresdata.append(d)
with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file:
file.write(str(res.summary()))
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
plt.plot([i[0] for i in intervals], data, label="average sentiment") plt.plot([i[0] for i in intervals], data, label="average sentiment")
@@ -146,13 +124,11 @@ def main(folder, intervl):
else: else:
va = "top" va = "top"
plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va) plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
# plt.plot([i[0] for i in intervals], its, label="aggregated ITS (score " + str(score) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2, label="single ITS (score " + str(score2) + ", p " + str(p2) + ")")
plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")") plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")], itsb, print("shape: " + str(np.shape(thresdata)))
# label="segmented ITS b (score " + str(scoreb) + ")") for (ti, t) in enumerate(thresholds):
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")], itsa, print("shape1: " + str(np.shape(thresdata[ti])))
# label="segmented ITS a (score " + str(scorea) + ")") plt.plot([thresiv[ti][i][0] for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months (pvalues " + str(thresp[ti]) + ")")
plt.title("Average sentiments for new users") plt.title("Average sentiments for new users")
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.xlabel("months") plt.xlabel("months")

24
text/0_doc.tex Normal file
View File

@@ -0,0 +1,24 @@
% Main document: sets up the article class and pulls in one file per section.
\documentclass[a4paper,10pt]{article}
\usepackage[utf8]{inputenc}
%opening
% Title and author are still empty placeholders; \maketitle below renders them as-is.
\title{}
\author{}
\begin{document}
\maketitle
% Abstract is an empty placeholder for now.
\begin{abstract}
\end{abstract}
% Section files, included in reading order (file names carry the numeric prefix).
\include{1_intro}
\include{2_relwork}
\include{3_method}
\include{4_datasets}
\include{5_results}
\include{6_discussion}
\include{7_conclusion}
% NOTE(review): no \bibliographystyle / \bibliography command is visible in this file,
% so \cite keys would not resolve from here -- confirm whether that is added elsewhere.
\end{document}

1
text/1_intro.tex Normal file
View File

@@ -0,0 +1 @@
\section{Introduction}

4
text/2_relwork.tex Normal file
View File

@@ -0,0 +1,4 @@
\section{Related Work}
%tutorial: Bernal et al. \cite{bernal2017interrupted}
%TODO: literature analysis

10
text/bib.bib Normal file
View File

@@ -0,0 +1,10 @@
@article{bernal2017interrupted,
title={Interrupted time series regression for the evaluation of public health interventions: a tutorial},
author={Bernal, James Lopez and Cummins, Steven and Gasparrini, Antonio},
journal={International Journal of Epidemiology},
volume={46},
number={1},
pages={348--355},
year={2017},
publisher={Oxford University Press}
}