From 52d7ddb7fc3a872c5d12e587afe346248b046170 Mon Sep 17 00:00:00 2001
From: wea_ondara
Date: Mon, 22 Mar 2021 20:30:32 +0100
Subject: [PATCH] wip

---
 its.py             |  11 +--
 itsnew.py          | 235 +++++++++++++++++++++++++++++++++++++++++++++
 questionits.py     |  14 ++-
 text/2_relwork.tex |   8 +-
 text/3_method.tex  |   8 +-
 text/main.tex      |   1 +
 todo2              |  15 +--
 votes.py           |   1 -
 votesits.py        |  13 ++-
 9 files changed, 270 insertions(+), 36 deletions(-)
 create mode 100644 itsnew.py

diff --git a/its.py b/its.py
index 2f3807c..f00a359 100644
--- a/its.py
+++ b/its.py
@@ -7,13 +7,12 @@
 from datetime import datetime
 from datetime import timedelta
 from dateutil.relativedelta import relativedelta
-from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
 from loader import load, dmt, cms
 from sentiments import readtoxleveltxt
 
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [6, 9, 12, 15]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 
 
 def main(folder, intervl):
@@ -42,9 +41,9 @@ def main(folder, intervl):
         # avg sentiments
         filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                               for a in p['Answers']
-                                              if option_date_from <= p['CreationDate'] < option_date_to #post in interval
-                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate'] # post created withon 1 week of 1st contrib
-                                              and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']]) # answer within 1 week of post creation
+                                              if option_date_from <= p['CreationDate'] < option_date_to  # post in interval
+                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']  # post created within 1 week of 1st contrib
+                                              and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']])  # answer within 1 week of post creation
                     .filter(lambda p: p != [])
                     .reduce(lambda a, b: a + b,
                             lambda a, b: a + b,
                             lambda: [])
@@ -57,7 +56,7 @@ def main(folder, intervl):
     avgcount = np.mean([x for x in count if str(x) != "nan"])
     stdcount = np.std([x for x in count if str(x) != "nan"])
     for i in range(len(count)):
-        if str(count[i]) == "nan": # or np.abs((count[i] - avgcount) / stdcount) > 3:
+        if str(count[i]) == "nan":  # or np.abs((count[i] - avgcount) / stdcount) > 3:
             datasingle[i] = float("nan")
             data[i] = float("nan")
             count[i] = float("nan")
diff --git a/itsnew.py b/itsnew.py
new file mode 100644
index 0000000..9ac1cb5
--- /dev/null
+++ b/itsnew.py
@@ -0,0 +1,235 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import statsmodels.api as sm
+import sys
+from datetime import datetime
+from datetime import timedelta
+from dateutil.relativedelta import relativedelta
+
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
+from loader import load, dmt, cms
+from sentiments import readtoxleveltxt
+
+colors = ['red', 'green', 'blue', 'orange', 'deeppink']
+thresholds = [6, 9, 12, 15]
+
+
+def main(folder, intervl):
+    users, posts, firstcontrib, sumcontrib = load(folder)
+
+    intervals = calc_intervals(posts, intervl)
+
+    start = cms()
+    printnoln("reading sentiments ...")
+    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
+    rprint("reading sentiments ... took " + str(cms() - start) + "ms")
+
+    outputdir = folder + "/output/itsnew/"
+    os.system("mkdir -p " + outputdir)
+
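+    # per-interval accumulators: every answer sentiment (datasingle),
+    # their per-interval average (data) and the sample count (count)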
took " + str(cms() - start) + "ms") + + outputdir = folder + "/output/itsnew/" + os.system("mkdir -p " + outputdir) + + data = [] + datasingle = [] + count = [] + for (option_date_from, option_date_to) in intervals: + if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"): + datasingle.append(float("nan")) + data.append(float("nan")) + count.append(float("nan")) + continue + print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + # avg sentiments + filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound'] + for a in p['Answers'] + if option_date_from <= p['CreationDate'] < option_date_to # post in interval + and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate'] # post created withon 1 week of 1st contrib + and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']]) # answer within 1 week of post creation + + .filter(lambda p: p != []) + .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []) + .getresults()) + datasingle.append(filtered) + avg = np.average(filtered) if len(filtered) > 0 else float("nan") + data.append(avg) + count.append(len(filtered)) + + avgcount = np.mean([x for x in count if str(x) != "nan"]) + stdcount = np.std([x for x in count if str(x) != "nan"]) + for i in range(len(count)): + if str(count[i]) == "nan": # or np.abs((count[i] - avgcount) / stdcount) > 3: + datasingle[i] = float("nan") + data[i] = float("nan") + count[i] = float("nan") + + # filter nan entries + for i in range(len(data)): + while i < len(data) and str(data[i]) == "nan": + del datasingle[i] + del data[i] + del intervals[i] + del count[i] + + # deseason + # mins = [min([data[j] for j in range(len(data)) if j % 12 == i]) for i in range(0, 12)] + mins = [np.average([data[j] for j in range(len(data)) if j % 12 == i]) for i in range(0, 12)] + # mins = [min(d) / count(d) for d in [[data[j] for j in range(len(data)) if j % 12 == i] for i in range(0, 12)]] + # mins = [data[i] for i in range(0, 12)] + mins = [m - min(mins) for m in mins] + print("mins", mins) + dsdata = [data[i] - mins[i % 12] for i in range(len(data))] + dsdatasingle = [[d - mins[i % 12] for d in datasingle[i]] for i in range(len(datasingle))] + + # data = dsdata + # datasingle = dsdatasingle + + print("Computing full ITS") + # t_s = np.reshape(np.array([data[i] for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) + t = np.reshape(np.array([i for i in range(len(dsdatasingle)) for j in dsdatasingle[i]]), (-1, 1)) + x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(dsdatasingle)) for j in dsdatasingle[i]]), (-1, 1)) + X = np.array(t) # b1 + X = np.concatenate((X, x), 1) # b2 + X = np.concatenate((X, np.multiply(t, x)), 1) # 3 + # X = np.concatenate((X, t_s), 1) # 4 + X = sm.add_constant(X) # b0 + y = np.reshape(np.array([d for a in dsdatasingle for d in a]), (-1, 1)) + res = sm.OLS(y, X).fit() + p2 = res.pvalues + print("coef ols: " + str(res.params)) + print("sum ols: " + str(res.summary())) + coef2ols = np.reshape(np.array(res.params), (-1, 1)) + # coef2ols[4] = 0 + its2ols = X.dot(coef2ols) + dsits2ols = np.copy(its2ols) + # its2ols = np.add(its2ols, np.reshape(np.array([mins[i % 12] for i in range(len(data)) for j in dsdatasingle[i]]), (-1, 1))) + minavg = np.average(mins) + its2ols = np.add(its2ols, np.reshape(np.array([minavg for i in range(len(data)) for j in dsdatasingle[i]]), (-1, 1))) + with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file: + 
+    # t_s = np.reshape(np.array([data[i] for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
+    t = np.reshape(np.array([i for i in range(len(dsdatasingle)) for j in dsdatasingle[i]]), (-1, 1))
+    x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(dsdatasingle)) for j in dsdatasingle[i]]), (-1, 1))
+    X = np.array(t)  # b1
+    X = np.concatenate((X, x), 1)  # b2
+    X = np.concatenate((X, np.multiply(t, x)), 1)  # b3
+    # X = np.concatenate((X, t_s), 1)  # b4
+    X = sm.add_constant(X)  # b0
+    y = np.reshape(np.array([d for a in dsdatasingle for d in a]), (-1, 1))
+    res = sm.OLS(y, X).fit()
+    p2 = res.pvalues
+    print("coef ols: " + str(res.params))
+    print("sum ols: " + str(res.summary()))
+    coef2ols = np.reshape(np.array(res.params), (-1, 1))
+    # coef2ols[4] = 0
+    its2ols = X.dot(coef2ols)
+    dsits2ols = np.copy(its2ols)
+    # its2ols = np.add(its2ols, np.reshape(np.array([mins[i % 12] for i in range(len(data)) for j in dsdatasingle[i]]), (-1, 1)))
+    minavg = np.average(mins)
+    its2ols = np.add(its2ols, np.reshape(np.array([minavg for i in range(len(data)) for j in dsdatasingle[i]]), (-1, 1)))
+    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
+        file.write(str(res.summary()))
+
+    thresdata = []
+    thresols = []
+    thresiv = []
+    thresp = []
+    print("Computing threshold ITS")
+    for ti in thresholds:
+        # print(1, CHANGE_DATE - relativedelta(months=ti))
+        # print(2, CHANGE_DATE + relativedelta(months=ti))
+        z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= CHANGE_DATE - relativedelta(months=ti) and i[1] <= CHANGE_DATE + relativedelta(months=ti)]
+        iv = [i for (i, x) in z]
+        # print("iv " + str(iv))
+        d = [x for (i, x) in z]
+        # t_s = np.reshape(np.array([data[i] for i in range(len(d)) for j in d[i]]), (-1, 1))
+        t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1))
+        x = np.reshape(np.array([(0 if iv[i][0] <= CHANGE_DATE else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
+        X = np.array(t)  # b1
+        X = np.concatenate((X, x), 1)  # b2
+        X = np.concatenate((X, np.multiply(t, x)), 1)  # b3
+        # X = np.concatenate((X, t_s), 1)  # b4
+        X = sm.add_constant(X)  # b0
+        y = np.reshape(np.array([v for a in d for v in a]), (-1, 1))
+        res = sm.OLS(y, X).fit()
+        tp = res.pvalues
+        thresp.append(tp)
+        # print("coef ols: " + str(res.params))
+        # print("sum ols: " + str(res.summary()))
+        coefthresols = np.reshape(np.array(res.params), (-1, 1))
+        # coefthresols[4] = 0
+        thresols.append(X.dot(coefthresols))
+        thresiv.append(iv)
+        thresdata.append(d)
+        with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file:
+            file.write(str(res.summary()))
+
+    fig = plt.figure(figsize=FIG_SIZE)
+    plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment")
+    # plt.plot([difftime(i[0]) for i in intervals], dsdata, label="average sentiment - deseason")
+    plt.grid(True)
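+    # place each "n=" sample-size label above local maxima and below local minima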
+    for i in range(len(data)):
+        va = "center"
+        if 0 < i < len(data) - 1:
+            if data[i - 1] < data[i] and data[i + 1] < data[i]:
+                va = "bottom"
+            elif data[i - 1] > data[i] and data[i + 1] > data[i]:
+                va = "top"
+        elif i == 0:
+            if data[i + 1] < data[i]:
+                va = "bottom"
+            else:
+                va = "top"
+        elif i == len(data) - 1:
+            if data[i - 1] < data[i]:
+                va = "bottom"
+            else:
+                va = "top"
+        plt.text(difftime(intervals[i][0]), data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
+    plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS")
+    # plt.plot([difftime(intervals[i][0]) for i in range(len(dsdatasingle)) for j in dsdatasingle[i]], dsits2ols, label="sm single ITS - deseason")
+    # print("shape: " + str(np.shape(thresdata)))
+    for (ti, t) in enumerate(thresholds):
+        # print("shape1: " + str(np.shape(thresdata[ti])))
+        plt.plot([difftime(thresiv[ti][i][0]) for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months")
+    plt.title("Average sentiments for new users")
+    plt.xticks(rotation=90)
+    plt.xlabel("months")
+    plt.ylabel("sentiment")
+    plt.legend(loc="upper right")
+    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+    # plot seasonality
+    fig = plt.figure(figsize=FIG_SIZE)
+    plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle)) for j in datasingle[i]], [mins[i % 12] for i in range(len(datasingle)) for j in datasingle[i]], label="seasonality")
+    # print("shape: " + str(np.shape(thresdata)))
+    plt.title("Average sentiments for new users - seasonality")
+    plt.xticks(rotation=90)
+    plt.xlabel("months")
+    plt.ylabel("sentiment - seasonality")
+    plt.legend(loc="upper right")
+    outfile = outputdir + "/season-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+    # plot seasonality post count
+    pcmins = [len(datasingle[i]) for i in range(0, 12)]
+    pcmins = [m - min(pcmins) for m in pcmins]
+
+    fig = plt.figure(figsize=FIG_SIZE)
+    plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle))], [pcmins[i % 12] for i in range(len(datasingle))], label="seasonality")
+    plt.title("post count for new users - seasonality")
+    plt.xticks(rotation=90)
+    plt.xlabel("months")
+    plt.ylabel("post count - seasonality")
+    plt.legend(loc="upper right")
+    outfile = outputdir + "/season_postcount-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+
+if __name__ == "__main__":
+    # execute only if run as a script
+    usage = sys.argv[0] + " <folder> [-i<interval>]"
+    if len(sys.argv) < 2:
+        print(usage)
+        sys.exit(1)
+    folder = sys.argv[1]
+    if not os.path.isdir(folder):
+        print(folder + " is not a folder")
+        sys.exit(1)
+    interval = 1
+    if len(sys.argv) >= 3:
+        if sys.argv[2].startswith("-i"):
+            interval = sys.argv[2][2:]
+            try:
+                interval = int(interval)
+            except ValueError:
+                print("-i: int required")
+                sys.exit(1)
+            if interval < 1 or interval > 12:
+                print("-i: only 1 - 12")
+                sys.exit(1)
+        else:
+            print("unknown parameter: " + sys.argv[2])
+            sys.exit(1)
+
+    main(folder, interval)
diff --git a/questionits.py b/questionits.py
index 1fb9bca..4d6786e 100644
--- a/questionits.py
+++ b/questionits.py
@@ -14,8 +14,6 @@
 from sentiments import readtoxleveltxt
 
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [6, 9, 12, 15]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
-
 
 def main(folder, intervl):
     users, posts, firstcontrib, sumcontrib = load(folder)
@@ -94,7 +92,7 @@
     print("Computing full ITS1")
     t = np.reshape(np.array([i for i in range(len(datasingle1)) for j in datasingle1[i]]), (-1, 1))
-    x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1) for i in range(len(datasingle1)) for j in datasingle1[i]]), (-1, 1))
+    x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(datasingle1)) for j in datasingle1[i]]), (-1, 1))
     X = np.array(t)
     X = np.concatenate((X, x), 1)
     X = np.concatenate((X, np.multiply(t, x)), 1)
@@ -111,7 +109,7 @@
     print("Computing full ITS2")
     t = np.reshape(np.array([i for i in range(len(datasingle2)) for j in datasingle2[i]]), (-1, 1))
-    x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1) for i in range(len(datasingle2)) for j in datasingle2[i]]), (-1, 1))
+    x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(datasingle2)) for j in datasingle2[i]]), (-1, 1))
     X = np.array(t)
     X = np.concatenate((X, x), 1)
     X = np.concatenate((X, np.multiply(t, x)), 1)
@@ -132,14 +130,14 @@
     thresp = []
     print("Computing threshold ITS")
     for ti in thresholds:
-        # print(1, changedate - relativedelta(months=ti))
-        # print(2, changedate + relativedelta(months=ti))
-        z = [(i, x) for (i, x) in zip(intervals, datasingle1) if i[0] >= changedate - relativedelta(months=ti) and i[1] <= changedate + relativedelta(months=ti)]
+        # print(1, CHANGE_DATE - relativedelta(months=ti))
+        # print(2, CHANGE_DATE + relativedelta(months=ti))
+        z = [(i, x) for (i, x) in zip(intervals, datasingle1) if i[0] >= CHANGE_DATE - relativedelta(months=ti) and i[1] <= CHANGE_DATE + relativedelta(months=ti)]
         iv = [i for (i, x) in z]
         # print("iv " + str(iv))
         d = [x for (i, x) in z]
         t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1))
-        x = np.reshape(np.array([(0 if iv[i][0] <= changedate else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
+        x = np.reshape(np.array([(0 if iv[i][0] <= CHANGE_DATE else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
         X = np.array(t)
         X = np.concatenate((X, x), 1)
         X = np.concatenate((X, np.multiply(t, x)), 1)
diff --git a/text/2_relwork.tex b/text/2_relwork.tex
index 66277f4..19e3481 100644
--- a/text/2_relwork.tex
+++ b/text/2_relwork.tex
@@ -376,9 +376,13 @@ This shortcoming was addressed by \citeauthor{hutto2014vader} who introducted a
 % original ITS paper, how was this done before (earlier)
 
 \subsection{Trend analysis}
-When introducing a change to a system (experiment), one often wants to know whether the intervention achieves its intended purpose. This leads to 3 possible outcomes: a) the intervention shows effect and the system changes in the desired way, b) the intervention shows effect and the system changes in an undesired way, or c) the system did not react at all to the change. There are multiple ways to determine which of these outcomes occur. To analyze the behavior of the system data from before and after the intervention as well as the nature of the intervation has be aquired. The are multiple ways to run such an experiment and one has to choose which type of experiment fits best. There are 2 categories of approaches: actively creating an experiment where one design the experiment before it is executed (for example randomized control trials in medical fields), or using existing data of an experiment which was not designed beforehand or where setting up a designed experiment is not possible (quasi-experiment).
+When introducing a change to a system (experiment), one often wants to know whether the intervention achieves its intended purpose. This leads to 3 possible outcomes: a) the intervention shows an effect and the system changes in the desired way, b) the intervention shows an effect and the system changes in an undesired way, or c) the system did not react to the change at all. There are multiple ways to determine which of these outcomes occurs. To analyze the behavior of the system, data from before and after the intervention as well as the nature of the intervention has to be acquired. There are multiple ways to run such an experiment and one has to choose which type of experiment fits best. There are 2 categories of approaches: actively creating an experiment where one designs the experiment before it is executed (for example randomized controlled trials in medical fields), or using existing data of an experiment which was not designed beforehand or where setting up a designed experiment is not possible (quasi-experiment).
 
-As this thesis investigates a change which has already been implemented by another party, this thesis covers quasi-experiments. A tool that is often used for this purpose is an \emph{Interrupted Time Series} (ITS) analysis. The ITS analysis is a form of segmented regression analysis, where data from before, after and during the intervention is regressed with seperate line segements\cite{mcdowall2019interrupted, bernal2017interrupted}. ITS requires data at (regular) intervals from before and after the intervention (time series). The interrupt signifies the intervention and the time of when it occured must be known. The intervention can be at a single point in time of it can be streched out over a certain time span. This property must also be known to take it into account when designing the regression. Also, as the data is aquired from an quasi-experiment, it may be baised, for example seasonality, ....%TODO
+As this thesis investigates a change which has already been implemented by another party, this thesis covers quasi-experiments. A tool that is often used for this purpose is an \emph{Interrupted Time Series} (ITS) analysis. The ITS analysis is a form of segmented regression analysis, where data from before, after and during the intervention is regressed with separate line segments\cite{mcdowall2019interrupted}. ITS requires data at (regular) intervals from before and after the intervention (time series). The interrupt signifies the intervention, and the time at which it occurred must be known. The intervention can be at a single point in time or it can be stretched out over a certain time span. This property must also be known so it can be taken into account when designing the regression. Also, as the data is acquired from a quasi-experiment, it may be biased\cite{bernal2017interrupted}, for example by seasonality, time-varying confounders (for example a change in how the data is measured), or variance in the number of single observations grouped together in an interval measurement. These biases need to be addressed if present. Seasonality can be accounted for by subtracting the average value of each of the months across successive years (i.e. subtract the average value of all Januaries in the data set from the values in Januaries).
+%\begin{lstlisting}
+% deseasonalized = datasample - average(dataSamplesInMonth(month(datasample)))
+%\end{lstlisting}
+This removes the differences between different months of the same year, thereby filtering out the effect of seasonality. The variance in data density per interval (data samples in an interval) can be addressed by using each single data point in the regression instead of an average.
 
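A minimal sketch of this month-of-year correction in plain Python (the function and variable names are illustrative, not repository code; itsnew.py derives the same offsets with numpy averages):

    from statistics import mean

    def deseasonalize(samples):
        # samples: list of (month, value) pairs, month in 1..12, every month present
        monthly = {m: mean(v for mm, v in samples if mm == m) for m in range(1, 13)}
        lowest = min(monthly.values())
        # subtract each month's offset above the flattest month
        return [(m, v - (monthly[m] - lowest)) for m, v in samples]

Shifting the offsets so the lowest monthly average becomes zero mirrors the `mins = [m - min(mins) for m in mins]` step in itsnew.py.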
diff --git a/text/3_method.tex b/text/3_method.tex
index c5557b9..074fdad 100644
--- a/text/3_method.tex
+++ b/text/3_method.tex
@@ -13,6 +13,10 @@ StackExchange introduced a \emph{new contributor} indicator to all communities o
 % https://meta.stackexchange.com/questions/314472/what-are-the-exact-criteria-for-the-new-contributor-indicator-to-be-shown \cite{sonic2018what} ; change date = 2018-08-21T21:04:49.177
 % new user indicator visible for 1 week ...
 
+%TODO state plots of sec 5 here and why these were chosen
+% -> also limitations, other factors
+
+
 %TODO more vader explanation
 To measure the effectiveness of the change this thesis utilizes Vader, a sentiment analysis tool with exceptional performance in analysing and categorizing microblog-like texts as well as good generalization in other domains \cite{hutto2014vader}. The choice is based on the speed and simplicity of Vader. Vader uses a lexicon of words with attached sentiment values and rules related to grammar and syntax to determine a sentiment value between -1 and 1 for a given piece of text. The sentiment range is divided into 3 classes: negative (-1 to -0.05), neutral (-0.05 to 0.05), and positive (0.05 to 1). The outer edges of the value space are rarely reached as the text would have to be extremely negative or positive which is very unlikely. This design allows fast and verifiable analysis.
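For reference, this scoring is available through the vaderSentiment package; a minimal usage sketch (the example sentence and the class thresholds restate the paragraph above, they are not repository code):

    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores("Thanks, this answer helped a lot!")
    # scores is a dict: {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...};
    # the 'compound' value is what the repository scripts cache per answer Id
    c = scores['compound']
    label = "positive" if c >= 0.05 else "negative" if c <= -0.05 else "neutral"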
@@ -26,7 +30,7 @@ StackExchange provides anonymized data dumps of all their communities for resear
 % broken entries, missing user id
 % answers in html -> strip html and remove code sections, no contribution to sentiment
 
-After preprocessing the raw data, relevant data is filtered and computed. Questions and answers in the data are mixed together and have to be separated and answers have to be linked to their questions. Also, questions in these datasets do not have the \emph{new contributor} indicator attached to them and neither do users. So, the first contribution date and time of users have to be calculated via the creation dates of the questions and answers the user has posted. Then, questions are filtered per user and by whether they are created within the 7-day window after the first contribution of the user. These questions were created during the period where the \emph{new contributor} indicator would have been displayed, in case the questions had been posted before the change, or has been displayed after the change. From these questions, all answers which arrived within the 7-day window are considered for the analysis. Answers which arrived at a later point are excluded as the answerer most likely has not seen the disclaimer shown in figure \ref{newcontributor}. Included answers are then analyzed with Vader and the resulting sentiments are stored. Furhtermore, votes to questions of new contributors are counted if they arrived within the 7-day window and count 1 if it is an upvote and -1 if it is a downvote. Moreover, number of questions new contributors ask are counted and divided into two classes: 1st-question of a user and follow-up questions of a new contributor.
+After preprocessing the raw data, relevant data is filtered and computed. Questions and answers in the data are mixed together and have to be separated, and answers have to be linked to their questions. Also, questions in these datasets do not have the \emph{new contributor} indicator attached to them and neither do users. So, the first contribution date and time of users have to be calculated via the creation dates of the questions and answers the user has posted. Then, questions are filtered per user and by whether they were created within the 7-day window after the first contribution of the user. These questions were created during the period where the \emph{new contributor} indicator would have been displayed, in case the questions had been posted before the change, or was displayed, in case they had been posted after the change. From these questions, all answers which arrived within the 7-day window are considered for the analysis. Answers which arrived at a later point are excluded as the answerer most likely has not seen the disclaimer shown in figure \ref{newcontributor}. Included answers are then analyzed with Vader and the resulting sentiments are stored. Furthermore, votes to questions of new contributors are counted if they arrived within the 7-day window and count as 1 if it is an upvote and as -1 if it is a downvote. Moreover, the number of questions new contributors ask is counted and divided into two classes: the 1st question of a user and follow-up questions of a new contributor.
 
 % calc sentiment for answers
 % questions do not have a tag if from a new contributor -> calc first contributor
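A condensed sketch of this selection and scoring logic (the dictionary fields follow the post records used in its.py; the helper functions and the vote record layout are assumptions for illustration):

    from datetime import timedelta

    WINDOW = timedelta(days=7)  # DAYS_NEW_USER in the repository code

    def answers_in_window(post, firstcontrib):
        # drop posts whose author was no longer a new contributor when posting
        if post['CreationDate'] >= firstcontrib[post['OwnerUserId']] + WINDOW:
            return []
        # keep answers that arrived while the indicator was still visible
        return [a for a in post['Answers']
                if a['CreationDate'] < post['CreationDate'] + WINDOW]

    def vote_score(votes):
        # an upvote counts as +1, a downvote as -1
        # (VoteTypeId 2/3 in the StackExchange data dumps)
        return sum(1 if v['VoteTypeId'] == '2' else -1
                   for v in votes
                   if v['VoteTypeId'] in ('2', '3'))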
@@ -43,7 +47,7 @@ After preprocessing the raw data, relevant data is filtered and computed. Questi
 \section{Analysis}
 An interrupted time series (ITS) analysis captures trends before and after a change in a system and fits very well with the question this thesis investigates. ITS can be applied to a large variety of data if the data contains the same kind of data points before and after the change and when the change date and time are known. \citeauthor{bernal2017interrupted} published a paper on how ITS works \cite{bernal2017interrupted}. ITS performs well on medical data, for instance, when a new treatment is introduced ITS can visualize if the treatment improves a condition. For ITS no control group is required and often control groups are not feasible. ITS only works with the before and after data and a point in time where a change was introduced.
 
-ITS relies on linear regression and tries to fit a three-segment linear function to the data. The authors also described cases where more than three segments are used but these models quickly raise the complexity of the analysis and for this thesis a three-segment linear regression is sufficient. The three segments are lines to fit the data before and after the change as well as one line to connect the other two lines at the change date. Figure \ref{itsexample} shows an example of an ITS. Each segment is captured by a tensor of the following formula $Y_t = \beta_0 + \beta_1T + \beta_2X_t + \beta_3TX_t$, where $T$ represents time as a number, for instance, number of months since the start of data recording, $X_t$ represents 0 or 1 depending on whether the change is in effect, $\beta_0$ represents the value at $T = 0$, $\beta_1$ represents the slope before the change, $\beta_2$ represents the value when the change is introduced, and $\beta_3$ represents the slope after the change. Contrary to the method in \cite{bernal2017interrupted} where the ITS is performed on aggregated values per month, this thesis performs the ITS on single data points, as the premise that the aggregated values all have the same weight within a certain margin is not fulfilled for sentiment and vote score values. Performing the ITS with aggregated values would skew the linear regression more towards data points with less weight. Single data point fitting prevents this, as weight is taken into account with more data points.
+ITS relies on linear regression and tries to fit a three-segment linear function to the data. The authors also described cases where more than three segments are used, but these models quickly raise the complexity of the analysis, and for this thesis a three-segment linear regression is sufficient. The three segments are lines to fit the data before and after the change as well as one line to connect the other two lines at the change date. Figure \ref{itsexample} shows an example of an ITS. Each segment is captured by a term of the formula $Y_t = \beta_0 + \beta_1T + \beta_2X_t + \beta_3TX_t$, where $T$ represents time as a number, for instance, the number of months since the start of data recording, $X_t$ is 0 or 1 depending on whether the change is in effect, $\beta_0$ represents the value at $T = 0$, $\beta_1$ represents the slope before the change, $\beta_2$ represents the level change when the intervention is introduced, and $\beta_3$ represents the change in slope after the intervention. Contrary to the basic method explained in \cite{bernal2017interrupted} where the ITS is performed on aggregated values per month, this thesis performs the ITS on single data points, as the premise that the aggregated values all have the same weight within a certain margin is not fulfilled for sentiment and vote score values. Performing the ITS with aggregated values would skew the linear regression more towards data points with less weight. Single data point fitting prevents this, as weight is taken into account with more data points. To filter out seasonal effects, the average value of all data points with the same month of all years is subtracted from the data points (i.e. subtract the average value of all Januaries from each data point in a January).
 
 \begin{figure}
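The regression can be set up with statsmodels in the same way the repository scripts do; a self-contained sketch on synthetic data (all numbers are made up for illustration):

    import numpy as np
    import statsmodels.api as sm

    # synthetic monthly series: 24 points with a level drop of 0.1 after month 12
    rng = np.random.default_rng(0)
    T = np.arange(24, dtype=float)
    X = (T >= 12).astype(float)              # 0 before the change, 1 after
    y = 0.2 + 0.001 * T - 0.1 * X + rng.normal(0.0, 0.01, 24)

    design = sm.add_constant(np.column_stack((T, X, T * X)))  # b0, b1, b2, b3
    res = sm.OLS(y, design).fit()
    print(res.params)   # estimates of beta_0..beta_3
    print(res.pvalues)  # significance of the level and slope changes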
diff --git a/text/main.tex b/text/main.tex
index bc233df..32d6a4a 100644
--- a/text/main.tex
+++ b/text/main.tex
@@ -203,6 +203,7 @@
 \usepackage{float}
 \usepackage{subcaption}
 \let\mfs\multiplefootnoteseparator
+\usepackage{listings}
 
 \addbibresource{\mybiblatexfile}
diff --git a/todo2 b/todo2
index 8b49b0f..ceae34d 100644
--- a/todo2
+++ b/todo2
@@ -6,13 +6,13 @@
 - original ITS paper, how was this done before (earlier)
 - onboarding is fine, do community growth and sustainability
 
-- sentiment analysis: there are 10-15 methods,
-- all sentiment methods + vader
+- DONE sentiment analysis: there are 10-15 methods,
+- DONE all sentiment methods + vader
 
 3.
-- arguments why exactly these variables (sentiment, votes, #questions)
-- limitations, other factors
-- describe vader in detail
+- DONEXT arguments why exactly these variables (sentiment, votes, #questions)
+- DONEXT limitations, other factors
+- DONEXT describe vader in detail
 
 5.
 - DONE group by categories
@@ -28,11 +28,6 @@ extra
 5. stackoverflow vote score last datapoint: probably questions did not have enough time to gain votes
 
-ranking
-stackoverflow good
-
-
-
diff --git a/votes.py b/votes.py
index 8720b86..3025ae2 100644
--- a/votes.py
+++ b/votes.py
@@ -14,7 +14,6 @@
 from sentiments import readtoxleveltxt
 
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [3, 4, 5, 6]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 
 
 def main(folder, intervl):
diff --git a/votesits.py b/votesits.py
index dd79744..4320a88 100644
--- a/votesits.py
+++ b/votesits.py
@@ -8,13 +8,12 @@
 from datetime import datetime
 from datetime import timedelta
 from dateutil.relativedelta import relativedelta
-from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
 from loader import load, dmt, cms, readVotes
 from sentiments import readtoxleveltxt
 
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [6, 9, 12, 15]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 
 
 def main(folder, intervl):
@@ -76,7 +75,7 @@
     print("Computing full ITS")
     t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
-    x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
+    x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
     X = np.array(t)
     X = np.concatenate((X, x), 1)
     X = np.concatenate((X, np.multiply(t, x)), 1)
@@ -97,14 +96,14 @@
     thresp = []
     print("Computing threshold ITS")
     for ti in thresholds:
-        # print(1, changedate - relativedelta(months=ti))
-        # print(2, changedate + relativedelta(months=ti))
-        z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= changedate - relativedelta(months=ti) and i[1] <= changedate + relativedelta(months=ti)]
+        # print(1, CHANGE_DATE - relativedelta(months=ti))
+        # print(2, CHANGE_DATE + relativedelta(months=ti))
+        z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= CHANGE_DATE - relativedelta(months=ti) and i[1] <= CHANGE_DATE + relativedelta(months=ti)]
         iv = [i for (i, x) in z]
         # print("iv " + str(iv))
         d = [x for (i, x) in z]
         t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1))
-        x = np.reshape(np.array([(0 if iv[i][0] <= changedate else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
+        x = np.reshape(np.array([(0 if iv[i][0] <= CHANGE_DATE else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
         X = np.array(t)
         X = np.concatenate((X, x), 1)
         X = np.concatenate((X, np.multiply(t, x)), 1)