commit 52d7ddb7fc (parent 316fed8283)
wea_ondara, 2021-03-22 20:30:32 +01:00
9 changed files with 270 additions and 36 deletions

its.py

@@ -7,13 +7,12 @@ from datetime import datetime
 from datetime import timedelta
 from dateutil.relativedelta import relativedelta
-from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
 from loader import load, dmt, cms
 from sentiments import readtoxleveltxt
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [6, 9, 12, 15]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 def main(folder, intervl):
@@ -42,7 +41,7 @@ def main(folder, intervl):
         # avg sentiments
         filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                               for a in p['Answers']
-                                              if option_date_from <= p['CreationDate'] < option_date_to #post in interval
+                                              if option_date_from <= p['CreationDate'] < option_date_to  # post in interval
                                               and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']  # post created within 1 week of 1st contrib
                                               and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']])  # answer within 1 week of post creation

itsnew.py (new file)

@@ -0,0 +1,235 @@
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
import sys
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
from loader import load, dmt, cms
from sentiments import readtoxleveltxt
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
thresholds = [6, 9, 12, 15]
def main(folder, intervl):
    users, posts, firstcontrib, sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)
    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")
    outputdir = folder + "/output/itsnew/"
    os.system("mkdir -p " + outputdir)
    data = []
    datasingle = []
    count = []
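    # per interval: raw per-answer sentiment samples (datasingle), their
    # average (data), and the number of samples (count)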
    for (option_date_from, option_date_to) in intervals:
        if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"):
            datasingle.append(float("nan"))
            data.append(float("nan"))
            count.append(float("nan"))
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # avg sentiments
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              if option_date_from <= p['CreationDate'] < option_date_to  # post in interval
                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']  # post created within 1 week of 1st contrib
                                              and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']])  # answer within 1 week of post creation
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        datasingle.append(filtered)
        avg = np.average(filtered) if len(filtered) > 0 else float("nan")
        data.append(avg)
        count.append(len(filtered))
    avgcount = np.mean([x for x in count if str(x) != "nan"])
    stdcount = np.std([x for x in count if str(x) != "nan"])
    for i in range(len(count)):
        if str(count[i]) == "nan":  # or np.abs((count[i] - avgcount) / stdcount) > 3:
            datasingle[i] = float("nan")
            data[i] = float("nan")
            count[i] = float("nan")
    # filter nan entries
    for i in range(len(data)):
        while i < len(data) and str(data[i]) == "nan":
            del datasingle[i]
            del data[i]
            del intervals[i]
            del count[i]
    # deseason
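    # seasonal component: average of each calendar month across all years,
    # shifted so that the smallest month contributes 0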
    # mins = [min([data[j] for j in range(len(data)) if j % 12 == i]) for i in range(0, 12)]
    mins = [np.average([data[j] for j in range(len(data)) if j % 12 == i]) for i in range(0, 12)]
    # mins = [min(d) / count(d) for d in [[data[j] for j in range(len(data)) if j % 12 == i] for i in range(0, 12)]]
    # mins = [data[i] for i in range(0, 12)]
    mins = [m - min(mins) for m in mins]
    print("mins", mins)
    dsdata = [data[i] - mins[i % 12] for i in range(len(data))]
    dsdatasingle = [[d - mins[i % 12] for d in datasingle[i]] for i in range(len(datasingle))]
    # data = dsdata
    # datasingle = dsdatasingle
    print("Computing full ITS")
    # t_s = np.reshape(np.array([data[i] for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
    t = np.reshape(np.array([i for i in range(len(dsdatasingle)) for j in dsdatasingle[i]]), (-1, 1))
    x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(dsdatasingle)) for j in dsdatasingle[i]]), (-1, 1))
    X = np.array(t)  # b1
    X = np.concatenate((X, x), 1)  # b2
    X = np.concatenate((X, np.multiply(t, x)), 1)  # b3
    # X = np.concatenate((X, t_s), 1)  # b4
    X = sm.add_constant(X)  # b0
    y = np.reshape(np.array([d for a in dsdatasingle for d in a]), (-1, 1))
    res = sm.OLS(y, X).fit()
    p2 = res.pvalues
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    coef2ols = np.reshape(np.array(res.params), (-1, 1))
    # coef2ols[4] = 0
    its2ols = X.dot(coef2ols)
    dsits2ols = np.copy(its2ols)
    # its2ols = np.add(its2ols, np.reshape(np.array([mins[i % 12] for i in range(len(data)) for j in dsdatasingle[i]]), (-1, 1)))
    minavg = np.average(mins)
    its2ols = np.add(its2ols, np.reshape(np.array([minavg for i in range(len(data)) for j in dsdatasingle[i]]), (-1, 1)))
    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
        file.write(str(res.summary()))
    thresdata = []
    thresols = []
    thresiv = []
    thresp = []
    print("Computing threshold ITS")
    for ti in thresholds:
        # print(1, CHANGE_DATE - relativedelta(months=ti))
        # print(2, CHANGE_DATE + relativedelta(months=ti))
        z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= CHANGE_DATE - relativedelta(months=ti) and i[1] <= CHANGE_DATE + relativedelta(months=ti)]
        iv = [i for (i, x) in z]
        # print("iv " + str(iv))
        d = [x for (i, x) in z]
        # t_s = np.reshape(np.array([data[i] for i in range(len(d)) for j in d[i]]), (-1, 1))
        t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1))
        x = np.reshape(np.array([(0 if iv[i][0] <= CHANGE_DATE else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
        X = np.array(t)  # b1
        X = np.concatenate((X, x), 1)  # b2
        X = np.concatenate((X, np.multiply(t, x)), 1)  # b3
        # X = np.concatenate((X, t_s), 1)  # b4
        X = sm.add_constant(X)  # b0
        y = np.reshape(np.array([v for a in d for v in a]), (-1, 1))
        res = sm.OLS(y, X).fit()
        tp = res.pvalues
        thresp.append(tp)
        # print("coef ols: " + str(res.params))
        # print("sum ols: " + str(res.summary()))
        coefthresols = np.reshape(np.array(res.params), (-1, 1))
        # coefthresols[4] = 0
        thresols.append(X.dot(coefthresols))
        thresiv.append(iv)
        thresdata.append(d)
        with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file:
            file.write(str(res.summary()))
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment")
    # plt.plot([difftime(i[0]) for i in intervals], dsdata, label="average sentiment - deseason")
    plt.grid(True)
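    # place the n= sample-count labels above local maxima and below local
    # minima so they do not overlap the plotted curve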
    for i in range(len(data)):
        va = "center"
        if 0 < i < len(data) - 1:
            if data[i - 1] < data[i] and data[i + 1] < data[i]:
                va = "bottom"
            elif data[i - 1] > data[i] and data[i + 1] > data[i]:
                va = "top"
        elif i == 0:
            if data[i + 1] < data[i]:
                va = "bottom"
            else:
                va = "top"
        elif i == len(data) - 1:
            if data[i - 1] < data[i]:
                va = "bottom"
            else:
                va = "top"
        plt.text(difftime(intervals[i][0]), data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
    plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS")
    # plt.plot([difftime(intervals[i][0]) for i in range(len(dsdatasingle)) for j in dsdatasingle[i]], dsits2ols, label="sm single ITS - deseason")
    # print("shape: " + str(np.shape(thresdata)))
    for (ti, t) in enumerate(thresholds):
        # print("shape1: " + str(np.shape(thresdata[ti])))
        plt.plot([difftime(thresiv[ti][i][0]) for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months")
    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
    # plot seasonality
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle)) for j in datasingle[i]], [mins[i % 12] for i in range(len(datasingle)) for j in datasingle[i]], label="seasonality")
    # print("shape: " + str(np.shape(thresdata)))
    plt.title("Average sentiments for new users - seasonality")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment - seasonality")
    plt.legend(loc="upper right")
    outfile = outputdir + "/season-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
    # plot seasonality post count
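    # seasonal baseline for post counts, taken from the first 12 intervals only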
    pcmins = [len(datasingle[i]) for i in range(0, 12)]
    pcmins = [m - min(pcmins) for m in pcmins]
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle))], [pcmins[i % 12] for i in range(len(datasingle))], label="seasonality")
    plt.title("post count for new users - seasonality")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("post count - seasonality")
    plt.legend(loc="upper right")
    outfile = outputdir + "/season_postcount-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


if __name__ == "__main__":
    # execute only if run as a script
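    # usage: itsnew.py <folder> [-i<months>], e.g. itsnew.py data -i3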
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
    main(folder, interval)


@@ -14,8 +14,6 @@ from sentiments import readtoxleveltxt
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [6, 9, 12, 15]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 def main(folder, intervl):
     users, posts, firstcontrib, sumcontrib = load(folder)
@@ -94,7 +92,7 @@ def main(folder, intervl):
print("Computing full ITS1") print("Computing full ITS1")
t = np.reshape(np.array([i for i in range(len(datasingle1)) for j in datasingle1[i]]), (-1, 1)) t = np.reshape(np.array([i for i in range(len(datasingle1)) for j in datasingle1[i]]), (-1, 1))
x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1) for i in range(len(datasingle1)) for j in datasingle1[i]]), (-1, 1)) x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(datasingle1)) for j in datasingle1[i]]), (-1, 1))
X = np.array(t) X = np.array(t)
X = np.concatenate((X, x), 1) X = np.concatenate((X, x), 1)
X = np.concatenate((X, np.multiply(t, x)), 1) X = np.concatenate((X, np.multiply(t, x)), 1)
@@ -111,7 +109,7 @@ def main(folder, intervl):
print("Computing full ITS2") print("Computing full ITS2")
t = np.reshape(np.array([i for i in range(len(datasingle2)) for j in datasingle2[i]]), (-1, 1)) t = np.reshape(np.array([i for i in range(len(datasingle2)) for j in datasingle2[i]]), (-1, 1))
x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1) for i in range(len(datasingle2)) for j in datasingle2[i]]), (-1, 1)) x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(datasingle2)) for j in datasingle2[i]]), (-1, 1))
X = np.array(t) X = np.array(t)
X = np.concatenate((X, x), 1) X = np.concatenate((X, x), 1)
X = np.concatenate((X, np.multiply(t, x)), 1) X = np.concatenate((X, np.multiply(t, x)), 1)
@@ -132,14 +130,14 @@ def main(folder, intervl):
     thresp = []
     print("Computing threshold ITS")
     for ti in thresholds:
-        # print(1, changedate - relativedelta(months=ti))
-        # print(2, changedate + relativedelta(months=ti))
-        z = [(i, x) for (i, x) in zip(intervals, datasingle1) if i[0] >= changedate - relativedelta(months=ti) and i[1] <= changedate + relativedelta(months=ti)]
+        # print(1, CHANGE_DATE - relativedelta(months=ti))
+        # print(2, CHANGE_DATE + relativedelta(months=ti))
+        z = [(i, x) for (i, x) in zip(intervals, datasingle1) if i[0] >= CHANGE_DATE - relativedelta(months=ti) and i[1] <= CHANGE_DATE + relativedelta(months=ti)]
         iv = [i for (i, x) in z]
         # print("iv " + str(iv))
         d = [x for (i, x) in z]
         t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1))
-        x = np.reshape(np.array([(0 if iv[i][0] <= changedate else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
+        x = np.reshape(np.array([(0 if iv[i][0] <= CHANGE_DATE else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
         X = np.array(t)
         X = np.concatenate((X, x), 1)
         X = np.concatenate((X, np.multiply(t, x)), 1)


@@ -376,9 +376,13 @@ This shortcoming was addressed by \citeauthor{hutto2014vader} who introducted a
 % original ITS paper, how was it done earlier (before)
 \subsection{Trend analysis}
-When introducing a change to a system (experiment), one often wants to know whether the intervention achieves its intended purpose. This leads to 3 possible outcomes: a) the intervention shows effect and the system changes in the desired way, b) the intervention shows effect and the system changes in an undesired way, or c) the system did not react at all to the change. There are multiple ways to determine which of these outcomes occur. To analyze the behavior of the system data from before and after the intervention as well as the nature of the intervation has be aquired. The are multiple ways to run such an experiment and one has to choose which type of experiment fits best. There are 2 categories of approaches: actively creating an experiment where one design the experiment before it is executed (for example randomized control trials in medical fields), or using existing data of an experiment which was not designed beforehand or where setting up a designed experiment is not possible (quasi-experiment).
-As this thesis investigates a change which has already been implemented by another party, this thesis covers quasi-experiments. A tool that is often used for this purpose is an \emph{Interrupted Time Series} (ITS) analysis. The ITS analysis is a form of segmented regression analysis, where data from before, after and during the intervention is regressed with seperate line segements\cite{mcdowall2019interrupted, bernal2017interrupted}. ITS requires data at (regular) intervals from before and after the intervention (time series). The interrupt signifies the intervention and the time of when it occured must be known. The intervention can be at a single point in time of it can be streched out over a certain time span. This property must also be known to take it into account when designing the regression. Also, as the data is aquired from an quasi-experiment, it may be baised, for example seasonality, ....%TODO
+When introducing a change to a system (experiment), one often wants to know whether the intervention achieves its intended purpose. This leads to 3 possible outcomes: a) the intervention shows an effect and the system changes in the desired way, b) the intervention shows an effect and the system changes in an undesired way, or c) the system did not react to the change at all. There are multiple ways to determine which of these outcomes occurs. To analyze the behavior of the system, data from before and after the intervention as well as the nature of the intervention has to be acquired. There are multiple ways to run such an experiment and one has to choose which type of experiment fits best. There are 2 categories of approaches: actively creating an experiment where one designs the experiment before it is executed (for example randomized controlled trials in medical fields), or using existing data of an experiment which was not designed beforehand or where setting up a designed experiment is not possible (quasi-experiment).
+As this thesis investigates a change which has already been implemented by another party, this thesis covers quasi-experiments. A tool that is often used for this purpose is an \emph{Interrupted Time Series} (ITS) analysis. The ITS analysis is a form of segmented regression analysis, where data from before, after and during the intervention is regressed with separate line segments\cite{mcdowall2019interrupted}. ITS requires data at (regular) intervals from before and after the intervention (time series). The interrupt signifies the intervention, and the time when it occurred must be known. The intervention can be at a single point in time or it can be stretched out over a certain time span. This property must also be known to take it into account when designing the regression. Also, as the data is acquired from a quasi-experiment, it may be biased\cite{bernal2017interrupted}, for example by seasonality, time-varying confounders (for example a change in how the data is measured), or variance in the number of single observations grouped together in an interval measurement. These biases need to be addressed if present. Seasonality can be accounted for by subtracting the average value of each of the months in successive years (i.e. subtract the average value of all Januaries in the data set from the values in Januaries).
+%\begin{lstlisting}
+% deseasonalized = datasample - average(dataSamplesInMonth(month(datasample)))
+%\end{lstlisting}
+This removes the differences between the months of the same year, thereby filtering out the effect of seasonality. The variance in data density per interval (data samples per interval) can be addressed by using each single data point in the regression instead of an average.
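A minimal Python sketch of this deseasonalization, mirroring the approach in the itsnew.py script added in this commit (the function and variable names are illustrative, not part of the commit):

    import numpy as np

    def deseasonalize(values, months):
        # values[i] was measured in calendar month months[i] (0 = January);
        # per-month averages, shifted so the smallest month contributes 0
        season = [np.mean([v for v, m in zip(values, months) if m == mi]) for mi in range(12)]
        season = [s - min(season) for s in season]
        return [v - season[m] for v, m in zip(values, months)]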


@@ -13,6 +13,10 @@ StackExchange introduced a \emph{new contributor} indicator to all communities o
 % https://meta.stackexchange.com/questions/314472/what-are-the-exact-criteria-for-the-new-contributor-indicator-to-be-shown \cite{sonic2018what} ; change date = 2018-08-21T21:04:49.177
 % new user indicator visible for 1 week ...
+%TODO state plots of sec 5 here and why these were chosen
+% -> also limitations, other factors
 %TODO more vader explanation
 To measure the effectiveness of the change this thesis utilizes Vader, a sentiment analysis tool with exceptional performance in analysing and categorizing microblog-like texts as well as good generalization in other domains \cite{hutto2014vader}. The choice is based on the speed and simplicity of Vader. Vader uses a lexicon of words with attached sentiment values and rules related to grammar and syntax to determine a sentiment value between -1 and 1 for a given piece of text. The sentiment range is divided into 3 classes: negative (-1 to -0.05), neutral (-0.05 to 0.05), and positive (0.05 to 1). The outer edges of the value space are rarely reached as the text would have to be extremely negative or positive, which is very unlikely. This design allows fast and verifiable analysis.
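As an illustration of the scoring step, a single text could be scored as follows, assuming the vaderSentiment Python package (the scripts in this repository read precomputed 'compound' scores from sentiments.txt):

    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores("Thanks, this answer solved my problem!")['compound']
    # map the compound score to the three classes described above
    label = "negative" if score < -0.05 else "positive" if score > 0.05 else "neutral"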
@@ -26,7 +30,7 @@ StackExchange provides anonymized data dumps of all their communities for resear
 % broken entries, missing user id
 % answers in html -> strip html and remove code sections, no contribution to sentiment
-After preprocessing the raw data, relevant data is filtered and computed. Questions and answers in the data are mixed together and have to be separated and answers have to be linked to their questions. Also, questions in these datasets do not have the \emph{new contributor} indicator attached to them and neither do users. So, the first contribution date and time of users have to be calculated via the creation dates of the questions and answers the user has posted. Then, questions are filtered per user and by whether they are created within the 7-day window after the first contribution of the user. These questions were created during the period where the \emph{new contributor} indicator would have been displayed, in case the questions had been posted before the change, or has been displayed after the change. From these questions, all answers which arrived within the 7-day window are considered for the analysis. Answers which arrived at a later point are excluded as the answerer most likely has not seen the disclaimer shown in figure \ref{newcontributor}. Included answers are then analyzed with Vader and the resulting sentiments are stored. Furhtermore, votes to questions of new contributors are counted if they arrived within the 7-day window and count 1 if it is an upvote and -1 if it is a downvote. Moreover, number of questions new contributors ask are counted and divided into two classes: 1st-question of a user and follow-up questions of a new contributor.
+After preprocessing the raw data, relevant data is filtered and computed. Questions and answers in the data are mixed together and have to be separated, and answers have to be linked to their questions. Also, questions in these datasets do not have the \emph{new contributor} indicator attached to them and neither do users. So, the first contribution date and time of a user have to be calculated via the creation dates of the questions and answers the user has posted. Then, questions are filtered per user and by whether they were created within the 7-day window after the first contribution of the user. These questions were created during the period where the \emph{new contributor} indicator would have been displayed, in case the question had been posted before the change, or has been displayed, for questions posted after the change. From these questions, all answers which arrived within the 7-day window are considered for the analysis. Answers which arrived at a later point are excluded as the answerer most likely has not seen the disclaimer shown in figure \ref{newcontributor}. Included answers are then analyzed with Vader and the resulting sentiments are stored. Furthermore, votes on questions of new contributors are counted if they arrived within the 7-day window, counting 1 for an upvote and -1 for a downvote. Moreover, the number of questions new contributors ask is counted and divided into two classes: the first question of a user and follow-up questions of a new contributor.
 % calc sentiment for answers
 % questions do not have a tag if from a new contributor -> calc first contributor
@@ -43,7 +47,7 @@ After preprocessing the raw data, relevant data is filtered and computed. Questi
 \section{Analysis}
 An interrupted time series (ITS) analysis captures trends before and after a change in a system and fits very well with the question this thesis investigates. ITS can be applied to a large variety of data if the data contains the same kind of data points before and after the change and when the change date and time are known. \citeauthor{bernal2017interrupted} published a paper on how ITS works \cite{bernal2017interrupted}. ITS performs well on medical data; for instance, when a new treatment is introduced, ITS can visualize whether the treatment improves a condition. For ITS no control group is required, and often control groups are not feasible. ITS only works with the before and after data and a point in time where a change was introduced.
-ITS relies on linear regression and tries to fit a three-segment linear function to the data. The authors also described cases where more than three segments are used but these models quickly raise the complexity of the analysis and for this thesis a three-segment linear regression is sufficient. The three segments are lines to fit the data before and after the change as well as one line to connect the other two lines at the change date. Figure \ref{itsexample} shows an example of an ITS. Each segment is captured by a tensor of the following formula $Y_t = \beta_0 + \beta_1T + \beta_2X_t + \beta_3TX_t$, where $T$ represents time as a number, for instance, number of months since the start of data recording, $X_t$ represents 0 or 1 depending on whether the change is in effect, $\beta_0$ represents the value at $T = 0$, $\beta_1$ represents the slope before the change, $\beta_2$ represents the value when the change is introduced, and $\beta_3$ represents the slope after the change. Contrary to the method in \cite{bernal2017interrupted} where the ITS is performed on aggregated values per month, this thesis performs the ITS on single data points, as the premise that the aggregated values all have the same weight within a certain margin is not fulfilled for sentiment and vote score values. Performing the ITS with aggregated values would skew the linear regression more towards data points with less weight. Single data point fitting prevents this, as weight is taken into account with more data points.
+ITS relies on linear regression and tries to fit a three-segment linear function to the data. The authors also describe cases where more than three segments are used, but these models quickly raise the complexity of the analysis, and for this thesis a three-segment linear regression is sufficient. The three segments are lines fitting the data before and after the change as well as one line connecting the other two at the change date. Figure \ref{itsexample} shows an example of an ITS. Each segment is captured by a term of the following formula $Y_t = \beta_0 + \beta_1 T + \beta_2 X_t + \beta_3 T X_t$, where $T$ represents time as a number, for instance the number of months since the start of data recording, $X_t$ is 0 or 1 depending on whether the change is in effect, $\beta_0$ represents the value at $T = 0$, $\beta_1$ the slope before the change, $\beta_2$ the immediate change in level when the change is introduced, and $\beta_3$ the change in slope after the change. Contrary to the basic method explained in \cite{bernal2017interrupted}, where the ITS is performed on aggregated values per month, this thesis performs the ITS on single data points, as the premise that the aggregated values all have the same weight within a certain margin is not fulfilled for sentiment and vote score values. Performing the ITS with aggregated values would skew the linear regression towards data points with less weight. Fitting single data points prevents this, as intervals with more data points implicitly carry more weight. To filter out seasonal effects, the average value of all data points sharing a calendar month across all years is subtracted from the data points of that month (i.e. the average value of all Januaries is subtracted from each data point in a January).
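A condensed sketch of this segmented regression with statsmodels, following the design-matrix construction in itsnew.py (fit_its, y and change_index are hypothetical names):

    import numpy as np
    import statsmodels.api as sm

    def fit_its(y, change_index):
        # Y_t = b0 + b1*T + b2*X_t + b3*T*X_t, one row per single data point
        t = np.arange(len(y), dtype=float).reshape(-1, 1)  # T: time index
        x = (t >= change_index).astype(float)              # X_t: change in effect
        X = sm.add_constant(np.hstack([t, x, t * x]))      # columns b0, b1, b2, b3
        return sm.OLS(np.asarray(y), X).fit()              # .params, .pvalues

    # e.g. print(fit_its(monthly_sentiments, 42).summary())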
 \begin{figure}


@@ -203,6 +203,7 @@
 \usepackage{float}
 \usepackage{subcaption}
 \let\mfs\multiplefootnoteseparator
+\usepackage{listings}
 \addbibresource{\mybiblatexfile}

todo2

@@ -6,13 +6,13 @@
 - original ITS paper, how was it done earlier (before)
 - onboarding is fine, do community growth and sustainability
-- sentiment analysis: there are 10-15 methods,
-- all sentiment methods + vader
+- DONE sentiment analysis: there are 10-15 methods,
+- DONE all sentiment methods + vader
 3.
-- arguments for why exactly these variables (sentiment, votes, #questions)
-- limitations, other factors
-- describe vader in detail
+- DONEXT arguments for why exactly these variables (sentiment, votes, #questions)
+- DONEXT limitations, other factors
+- DONEXT describe vader in detail
 5.
 - DONE group by categories
@@ -28,11 +28,6 @@ extra
 5. stackoverflow vote score last datapoint: probably questions did not have enough time to gain votes
-ranking
-stackoverflow good


@@ -14,7 +14,6 @@ from sentiments import readtoxleveltxt
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [3, 4, 5, 6]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 def main(folder, intervl):


@@ -8,13 +8,12 @@ from datetime import datetime
 from datetime import timedelta
 from dateutil.relativedelta import relativedelta
-from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
+from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
 from loader import load, dmt, cms, readVotes
 from sentiments import readtoxleveltxt
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
 thresholds = [6, 9, 12, 15]
-changedate = datetime.fromisoformat("2018-09-01T00:00:00")
 def main(folder, intervl):
@@ -76,7 +75,7 @@ def main(folder, intervl):
print("Computing full ITS") print("Computing full ITS")
t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1)) x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
X = np.array(t) X = np.array(t)
X = np.concatenate((X, x), 1) X = np.concatenate((X, x), 1)
X = np.concatenate((X, np.multiply(t, x)), 1) X = np.concatenate((X, np.multiply(t, x)), 1)
@@ -97,14 +96,14 @@ def main(folder, intervl):
     thresp = []
     print("Computing threshold ITS")
     for ti in thresholds:
-        # print(1, changedate - relativedelta(months=ti))
-        # print(2, changedate + relativedelta(months=ti))
-        z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= changedate - relativedelta(months=ti) and i[1] <= changedate + relativedelta(months=ti)]
+        # print(1, CHANGE_DATE - relativedelta(months=ti))
+        # print(2, CHANGE_DATE + relativedelta(months=ti))
+        z = [(i, x) for (i, x) in zip(intervals, datasingle) if i[0] >= CHANGE_DATE - relativedelta(months=ti) and i[1] <= CHANGE_DATE + relativedelta(months=ti)]
         iv = [i for (i, x) in z]
         # print("iv " + str(iv))
         d = [x for (i, x) in z]
         t = np.reshape(np.array([i for i in range(len(d)) for j in d[i]]), (-1, 1))
-        x = np.reshape(np.array([(0 if iv[i][0] <= changedate else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
+        x = np.reshape(np.array([(0 if iv[i][0] <= CHANGE_DATE else 1) for i in range(len(d)) for j in d[i]]), (-1, 1))
         X = np.array(t)
         X = np.concatenate((X, x), 1)
         X = np.concatenate((X, np.multiply(t, x)), 1)