This commit is contained in:
wea_ondara
2019-12-18 13:02:16 +01:00
parent 356eefaf53
commit 2c1524a335
4 changed files with 245 additions and 38 deletions

135
its.py
View File

@@ -1,17 +1,17 @@
import os
import os
import sys
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
from datetime import datetime
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms
OLD_USER_PERCENTILE = 0.95
from sentiments import readtoxleveltxt
import statsmodels.api as sm
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -23,33 +23,31 @@ def main(folder, intervl):
start = cms()
printnoln("reading sentiments ...")
cachedsentiments = imprt(folder + "/output/sentiments.py").answers
(_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
rprint("reading sentiments ... took " + str(cms() - start) + "ms")
outputdir = folder + "/output/its/"
os.system("mkdir -p " + outputdir)
data = []
datasingle = []
count = []
for (option_date_from, option_date_to) in intervals:
if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"):
data.append(float("nan"))
continue
print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# avg sentiments
# print(dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
# for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
# and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
# .filter(lambda p: p != [])
# .getresults())
# break
filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
.filter(lambda p: p != [])
.reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
.getresults())
datasingle.append(filtered)
avg = np.average(filtered) if len(filtered) > 0 else float("nan")
data.append(avg)
count.append(len(filtered))
# filter nan entries
for i in range(len(data)):
@@ -57,37 +55,110 @@ def main(folder, intervl):
del data[i]
del intervals[i]
print("Computing ITS ...")
t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
# print("t", t)
x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
# print("x", x)
X = np.reshape(np.array([data[0] for i in range(len(data))]), (-1, 1))
# print("X", X)
X = np.concatenate((X, t), 1)
# print("Computing ITS ...")
# t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
# x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
# X = np.array(t)
# X = np.concatenate((X, x), 1)
# X = np.concatenate((X, np.multiply(t, x)), 1)
# y = np.reshape(np.array(data), (-1, 1))
# # print("Xfin", X)
# # print("y", y)
# reg = LinearRegression()
# reg.fit(X, y)
# score = reg.score(X, y)
# coef = np.reshape(np.array(reg.coef_), (-1, 1))
# its = X.dot(coef) + reg.intercept_
# print("score: " + str(score))
# print("coef: " + str(coef))
# print("its: " + str(its))
print("Computing full ITS")
t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
x = np.reshape(np.array([(0 if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
X = np.array(t)
X = np.concatenate((X, x), 1)
X = np.concatenate((X, np.multiply(t, x)), 1)
y = np.reshape(np.array(data), (-1, 1))
y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1))
# print("Xfin", X)
# print("y", y)
reg = LinearRegression()
reg.fit(X, y)
score = reg.score(X, y);
coef = np.reshape(np.array(reg.coef_), (-1, 1))
its = X.dot(coef) + data[0]
print("score: " + str(score))
print("coef: " + str(coef))
print("its: " + str(its))
# reg = LinearRegression()
# reg.fit(X, y)
# score2 = reg.score(X, y)
# coef2 = np.reshape(np.array(reg.coef_), (-1, 1))
# its2 = X.dot(coef2) + reg.intercept_
# print("intercept: " + str(reg.intercept_))
# print("score: " + str(score2))
# print("coef: " + str(coef2))
# print("its: " + str(its2))
X = sm.add_constant(X)
res = sm.OLS(y, X).fit()
p2 = res.pvalues
print("coef ols: " + str(res.params))
print("sum ols: " + str(res.summary()))
coef2ols = np.reshape(np.array(res.params), (-1, 1))
its2ols = X.dot(coef2ols)
with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
file.write(str(res.summary()))
# print("Computing segmented ITS before")
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# reg = LinearRegression()
# reg.fit(X, y)
# scoreb = reg.score(X, y)
# coefb = np.reshape(np.array(reg.coef_), (-1, 1))
# itsb = X.dot(coefb) + reg.intercept_
# print("scoreb: " + str(scoreb))
# print("coefb: " + str(coefb))
# print("itsb: " + str(itsb))
# print("Computing segmented ITS after")
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# reg = LinearRegression()
# reg.fit(X, y)
# scorea = reg.score(X, y)
# coefa = np.reshape(np.array(reg.coef_), (-1, 1))
# itsa = X.dot(coefa) + reg.intercept_
# print("scorea: " + str(scorea))
# print("coefa: " + str(coefa))
# print("itsa: " + str(itsa))
fig = plt.figure(figsize=(16, 12))
plt.plot([i[0] for i in intervals], data, label="average sentiment")
plt.plot([i[0] for i in intervals], its, label="ITS (score " + str(score) + ")")
plt.grid(True)
for i in range(len(data)):
va = "center"
if 0 < i < len(data) - 1:
if data[i - 1] < data[i] and data[i + 1] < data[i]:
va = "bottom"
elif data[i - 1] > data[i] and data[i + 1] > data[i]:
va = "top"
elif i == 0:
if data[i + 1] < data[i]:
va = "bottom"
else:
va = "top"
elif i == len(data) - 1:
if data[i - 1] < data[i]:
va = "bottom"
else:
va = "top"
plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
# plt.plot([i[0] for i in intervals], its, label="aggregated ITS (score " + str(score) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2, label="single ITS (score " + str(score2) + ", p " + str(p2) + ")")
plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")], itsb,
# label="segmented ITS b (score " + str(scoreb) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")], itsa,
# label="segmented ITS a (score " + str(scorea) + ")")
plt.title("Average sentiments for new users")
plt.xticks(rotation=90)
plt.xlabel("months")
plt.ylabel("sentiment")
plt.legend(loc="upper right")
outfile = outputdir + "/average_sentiments.png"
outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
plt.savefig(outfile, bbox_inches='tight')
plt.close(fig)

22
notes
View File

@@ -13,3 +13,25 @@ interrupted time series
3 monate vor der änderung und nach der änderung
da gibts 2 varianten: sprung, oder 2 linien
----------
lila lines in its fixen -> done
coef -> p values significance, statsmodels sm logit, sm.summary() -> done
bisschen zusammenfassen, was ich gemacht habe
plot über zeit <-0.05, -, >0.05 für sentiments, 3 kurven -> done
paper, strg+f arousal, methode wichtig, referenzen anschauen https://link.springer.com/content/pdf/10.1007%2Fs42001-017-0001-x.pdf
#einfluss von code sections
Papers:
its:
http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf

View File

@@ -1,12 +1,14 @@
import os
import sys
from collections import defaultdict
from datetime import timedelta
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from common import calc_intervals, IMAGE_MAGICK
from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER
from loader import load, dmt
from sentiments import readtoxleveltxt
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -14,15 +16,19 @@ colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder, intervl):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts, intervl)
(_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir)
activeusercounts = []
answerstonewusers = []
sentimentstonewusers = []
imgmagickcmd = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals:
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
# post histograms
# filter posts by option_date_from <= creation date <= option_date_to
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
@@ -34,7 +40,7 @@ def main(folder, intervl):
postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
histdata = [pc for pc in postcounts.values()]
fig = plt.figure(figsize=(16, 12))
@@ -48,14 +54,50 @@ def main(folder, intervl):
fig.savefig(histfilename + ".png", bbox_inches='tight')
plt.close(fig)
imgmagickcmd += " " + histfilename + ".png"
os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
# answers to new users
answers = (dmt(posts).map(lambda q: [a for a in q['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
and firstcontrib[q['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
.getresults())
count = sum([len(a) for a in answers])
answerstonewusers.append(((option_date_from, option_date_to), count))
sent = ([cachedsentiments[a['Id']] for al in answers for a in al])
sentbad = len([1 for a in sent if a['compound'] < -0.05])
sentneu = len([1 for a in sent if -0.05 <= a['compound'] <= 0.05])
sentgood = len([1 for a in sent if a['compound'] > 0.05])
sentimentstonewusers.append(((option_date_from, option_date_to), (sent, sentbad, sentneu, sentgood)))
# gen pdf for post histograms
os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf")
# plot posts diagram
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
plt.yscale('log')
plt.ylim(bottom=0)
plt.ylim(bottom=0.001)
plt.title("Active users")
fig.savefig(outputdir + "activeusers.png", bbox_inches='tight')
fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig)
# plot answers to new users diagram
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
plt.yscale('log')
plt.ylim(bottom=0.001)
plt.title("#Answers to new users")
fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig)
# plot sentiments of answers to new users diagram
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
plt.yscale('log')
plt.ylim(bottom=0.001)
plt.legend(loc="upper right")
plt.title("Sentiments of answers to new users")
fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig)

View File

@@ -4,6 +4,7 @@ import sys
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from loader import load, dmt
from common import imprt
analyser = SentimentIntensityAnalyzer()
@@ -20,6 +21,30 @@ def main(folder):
toxlevels = {id: p for (id, p) in toxlevels}
dumptoxlevels(toxlevels, outfilename + ".py")
dumptoxlevelstxt(toxlevels, outfilename + ".txt")
# (lvl2q, lvl2a) = readtoxleveltxt(outfilename + ".txt")
#
# s1 = str(toxlevels)
# s2 = str(lvl2q)
# # print("s1: " + s1)
# # print("s2: " + s2)
# if s1 != s2:
# print("not equal")
# else:
# print("equal")
#
# # print("s1: " + str(imprt(folder + "/output/sentiments.py").answers))
# # print("s2: " + str(lvl2a))
# if str(imprt(folder + "/output/sentiments.py").answers) != str(lvl2a):
# print("a not equal")
# else:
# print("a equal")
#
# if str(imprt(folder + "/output/sentiments.py").posts) != str(lvl2q):
# print("q not equal")
# else:
# print("q equal")
def computeToxLevel(text):
@@ -36,6 +61,53 @@ def dumptoxlevels(lvls, filename):
file.write("answers = " + str(answers) + "\n")
def _format_tox_entry(entry_id, scores):
    """Render one sentiment record as ``id:neg:neu:pos:compound``."""
    fields = [entry_id] + [scores[key] for key in ("neg", "neu", "pos", "compound")]
    return ":".join(str(field) for field in fields)


def dumptoxlevelstxt(lvls, filename):
    """Write sentiment scores to *filename* in a compact two-line text format.

    Args:
        lvls: mapping of post id -> mapping of answer id -> score dict. Each
            score dict must provide the keys ``neg``, ``neu``, ``pos`` and
            ``compound``.
        filename: path of the file to (over)write.

    The file consists of two lines:

    * ``posts=<post>;;<post>;;...`` where each ``<post>`` is the post id, a
      colon, and the ``;``-separated answer records of that post (nothing
      after the colon when the post has no answers).
    * ``answers=<record>;<record>;...`` listing every answer record once,
      independent of the post it belongs to.

    Each record has the form ``id:neg:neu:pos:compound``.

    Raises:
        KeyError: if a score dict lacks one of the four score keys.
    """
    # Flatten all answers into a single id -> scores mapping. If the same
    # answer id occurs under several posts, the last occurrence wins.
    answers = {}
    for post_answers in lvls.values():
        answers.update(post_answers)

    post_records = [
        str(post_id) + ":" + ";".join(
            _format_tox_entry(answer_id, scores)
            for answer_id, scores in post_answers.items()
        )
        for post_id, post_answers in lvls.items()
    ]
    answer_records = [
        _format_tox_entry(answer_id, scores)
        for answer_id, scores in answers.items()
    ]

    with open(filename, "w") as file:
        file.write("posts=" + ";;".join(post_records) + "\n")
        file.write("answers=" + ";".join(answer_records) + "\n")
def _parse_tox_entry(entry):
    """Parse one ``id:neg:neu:pos:compound`` record into ``(id, scores)``.

    The id is returned as a string; the four scores are converted to float.

    Raises:
        ValueError: if the record does not have exactly five ``:``-separated
            fields or a score is not a valid float.
    """
    entry_id, *raw_scores = entry.split(":")
    score_keys = ("neg", "neu", "pos", "compound")
    if len(raw_scores) != len(score_keys):
        raise ValueError("malformed sentiment entry: " + repr(entry))
    return entry_id, {key: float(raw) for key, raw in zip(score_keys, raw_scores)}


def readtoxleveltxt(filename):
    """Read a sentiment file made of a ``posts=`` and an ``answers=`` line.

    Expected line formats:

    * ``posts=<post>;;<post>;;...`` where each ``<post>`` is a post id, a
      colon, and zero or more ``;``-separated answer records.
    * ``answers=<record>;<record>;...``

    Each record has the form ``id:neg:neu:pos:compound``.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(posts, answers)``. ``posts`` maps post id -> answer id ->
        score dict, ``answers`` maps answer id -> score dict. All ids are kept
        as the strings found in the file; score dicts have the float-valued
        keys ``neg``, ``neu``, ``pos`` and ``compound``. A missing or empty
        ``posts=`` / ``answers=`` line yields an empty dict for that part.

    Raises:
        ValueError: if a record is malformed.
    """
    posts = {}
    answers = {}
    with open(filename, "r") as f:
        lines = f.read().split("\n")

    for line in lines:
        if line.startswith("posts="):
            payload = line[len("posts="):]
            posts = {}
            # An empty payload means "no posts"; splitting it would yield a
            # single empty record that cannot be parsed.
            for post_record in (payload.split(";;") if payload else []):
                # Only the first colon separates the post id from its answers;
                # the remaining colons belong to the answer records.
                post_id, _, answer_part = post_record.partition(":")
                if answer_part:
                    posts[post_id] = dict(
                        _parse_tox_entry(entry) for entry in answer_part.split(";")
                    )
                else:
                    posts[post_id] = {}
        elif line.startswith("answers="):
            payload = line[len("answers="):]
            if payload:
                answers = dict(_parse_tox_entry(entry) for entry in payload.split(";"))
            else:
                answers = {}
    return posts, answers
if __name__ == "__main__":
# execute only if run as a script
usage = sys.argv[0] + " <folder>"