This commit is contained in:
wea_ondara
2019-12-18 13:02:16 +01:00
parent 356eefaf53
commit 2c1524a335
4 changed files with 245 additions and 38 deletions

135
its.py
View File

@@ -1,17 +1,17 @@
import os import os
import os
import sys import sys
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import os
from datetime import datetime
from datetime import timedelta
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms from loader import load, dmt, cms
from sentiments import readtoxleveltxt
OLD_USER_PERCENTILE = 0.95 import statsmodels.api as sm
colors = ['red', 'green', 'blue', 'orange', 'deeppink'] colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -23,33 +23,31 @@ def main(folder, intervl):
start = cms() start = cms()
printnoln("reading sentiments ...") printnoln("reading sentiments ...")
cachedsentiments = imprt(folder + "/output/sentiments.py").answers (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
rprint("reading sentiments ... took " + str(cms() - start) + "ms") rprint("reading sentiments ... took " + str(cms() - start) + "ms")
outputdir = folder + "/output/its/" outputdir = folder + "/output/its/"
os.system("mkdir -p " + outputdir) os.system("mkdir -p " + outputdir)
data = [] data = []
datasingle = []
count = []
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"): if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"):
data.append(float("nan")) data.append(float("nan"))
continue continue
print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# avg sentiments # avg sentiments
# print(dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
# for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
# and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
# .filter(lambda p: p != [])
# .getresults())
# break
filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound'] filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']]) and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
.filter(lambda p: p != []) .filter(lambda p: p != [])
.reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []) .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
.getresults()) .getresults())
datasingle.append(filtered)
avg = np.average(filtered) if len(filtered) > 0 else float("nan") avg = np.average(filtered) if len(filtered) > 0 else float("nan")
data.append(avg) data.append(avg)
count.append(len(filtered))
# filter nan entries # filter nan entries
for i in range(len(data)): for i in range(len(data)):
@@ -57,37 +55,110 @@ def main(folder, intervl):
del data[i] del data[i]
del intervals[i] del intervals[i]
print("Computing ITS ...") # print("Computing ITS ...")
t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1)) # t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
# print("t", t) # x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1)) # X = np.array(t)
# print("x", x) # X = np.concatenate((X, x), 1)
X = np.reshape(np.array([data[0] for i in range(len(data))]), (-1, 1)) # X = np.concatenate((X, np.multiply(t, x)), 1)
# print("X", X) # y = np.reshape(np.array(data), (-1, 1))
X = np.concatenate((X, t), 1) # # print("Xfin", X)
# # print("y", y)
# reg = LinearRegression()
# reg.fit(X, y)
# score = reg.score(X, y)
# coef = np.reshape(np.array(reg.coef_), (-1, 1))
# its = X.dot(coef) + reg.intercept_
# print("score: " + str(score))
# print("coef: " + str(coef))
# print("its: " + str(its))
print("Computing full ITS")
t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
x = np.reshape(np.array([(0 if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
X = np.array(t)
X = np.concatenate((X, x), 1) X = np.concatenate((X, x), 1)
X = np.concatenate((X, np.multiply(t, x)), 1) X = np.concatenate((X, np.multiply(t, x)), 1)
y = np.reshape(np.array(data), (-1, 1)) y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1))
# print("Xfin", X) # print("Xfin", X)
# print("y", y) # print("y", y)
reg = LinearRegression() # reg = LinearRegression()
reg.fit(X, y) # reg.fit(X, y)
score = reg.score(X, y); # score2 = reg.score(X, y)
coef = np.reshape(np.array(reg.coef_), (-1, 1)) # coef2 = np.reshape(np.array(reg.coef_), (-1, 1))
its = X.dot(coef) + data[0] # its2 = X.dot(coef2) + reg.intercept_
print("score: " + str(score)) # print("intercept: " + str(reg.intercept_))
print("coef: " + str(coef)) # print("score: " + str(score2))
print("its: " + str(its)) # print("coef: " + str(coef2))
# print("its: " + str(its2))
X = sm.add_constant(X)
res = sm.OLS(y, X).fit()
p2 = res.pvalues
print("coef ols: " + str(res.params))
print("sum ols: " + str(res.summary()))
coef2ols = np.reshape(np.array(res.params), (-1, 1))
its2ols = X.dot(coef2ols)
with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
file.write(str(res.summary()))
# print("Computing segmented ITS before")
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# reg = LinearRegression()
# reg.fit(X, y)
# scoreb = reg.score(X, y)
# coefb = np.reshape(np.array(reg.coef_), (-1, 1))
# itsb = X.dot(coefb) + reg.intercept_
# print("scoreb: " + str(scoreb))
# print("coefb: " + str(coefb))
# print("itsb: " + str(itsb))
# print("Computing segmented ITS after")
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
# reg = LinearRegression()
# reg.fit(X, y)
# scorea = reg.score(X, y)
# coefa = np.reshape(np.array(reg.coef_), (-1, 1))
# itsa = X.dot(coefa) + reg.intercept_
# print("scorea: " + str(scorea))
# print("coefa: " + str(coefa))
# print("itsa: " + str(itsa))
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
plt.plot([i[0] for i in intervals], data, label="average sentiment") plt.plot([i[0] for i in intervals], data, label="average sentiment")
plt.plot([i[0] for i in intervals], its, label="ITS (score " + str(score) + ")") plt.grid(True)
for i in range(len(data)):
va = "center"
if 0 < i < len(data) - 1:
if data[i - 1] < data[i] and data[i + 1] < data[i]:
va = "bottom"
elif data[i - 1] > data[i] and data[i + 1] > data[i]:
va = "top"
elif i == 0:
if data[i + 1] < data[i]:
va = "bottom"
else:
va = "top"
elif i == len(data) - 1:
if data[i - 1] < data[i]:
va = "bottom"
else:
va = "top"
plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
# plt.plot([i[0] for i in intervals], its, label="aggregated ITS (score " + str(score) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2, label="single ITS (score " + str(score2) + ", p " + str(p2) + ")")
plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")], itsb,
# label="segmented ITS b (score " + str(scoreb) + ")")
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")], itsa,
# label="segmented ITS a (score " + str(scorea) + ")")
plt.title("Average sentiments for new users") plt.title("Average sentiments for new users")
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.xlabel("months") plt.xlabel("months")
plt.ylabel("sentiment") plt.ylabel("sentiment")
plt.legend(loc="upper right") plt.legend(loc="upper right")
outfile = outputdir + "/average_sentiments.png" outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
plt.savefig(outfile, bbox_inches='tight') plt.savefig(outfile, bbox_inches='tight')
plt.close(fig) plt.close(fig)

22
notes
View File

@@ -13,3 +13,25 @@ interrupted time series
3 Monate vor der Änderung und nach der Änderung 3 Monate vor der Änderung und nach der Änderung
da gibts 2 varianten: sprung, oder 2 linien da gibts 2 varianten: sprung, oder 2 linien
----------
lila lines in its fixen -> done
coef -> p values significance, statsmodels sm logit, sm.summary() -> done
bisschen zusammenfassen, was ich gemacht habe
plot über zeit <-0.05, -, >0.05 für sentiments, 3 kurven -> done
Paper, Strg+F "arousal", Methode wichtig, Referenzen anschauen https://link.springer.com/content/pdf/10.1007%2Fs42001-017-0001-x.pdf
#einfluss von code sections
Papers:
its:
http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf

View File

@@ -1,12 +1,14 @@
import os import os
import sys import sys
from collections import defaultdict from collections import defaultdict
from datetime import timedelta
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator from matplotlib.ticker import MaxNLocator
from common import calc_intervals, IMAGE_MAGICK from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER
from loader import load, dmt from loader import load, dmt
from sentiments import readtoxleveltxt
colors = ['red', 'green', 'blue', 'orange', 'deeppink'] colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -14,15 +16,19 @@ colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder, intervl): def main(folder, intervl):
users, posts, firstcontrib, sumcontrib = load(folder) users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts, intervl) intervals = calc_intervals(posts, intervl)
(_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
outputdir = folder + "/output/posthist/" outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir) os.system("mkdir -p " + outputdir)
activeusercounts = [] activeusercounts = []
answerstonewusers = []
sentimentstonewusers = []
imgmagickcmd = IMAGE_MAGICK imgmagickcmd = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))) print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
# post histograms
# filter posts by option_date_from <= creation date <= option_date_to # filter posts by option_date_from <= creation date <= option_date_to
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults() newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
@@ -34,7 +40,7 @@ def main(folder, intervl):
postcounts = {id: len(pc) for (id, pc) in postcounts.items()} postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys()))) activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
histdata = [pc for pc in postcounts.values()] histdata = [pc for pc in postcounts.values()]
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
@@ -48,14 +54,50 @@ def main(folder, intervl):
fig.savefig(histfilename + ".png", bbox_inches='tight') fig.savefig(histfilename + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
imgmagickcmd += " " + histfilename + ".png" imgmagickcmd += " " + histfilename + ".png"
os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
# answers to new users
answers = (dmt(posts).map(lambda q: [a for a in q['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
and firstcontrib[q['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
.getresults())
count = sum([len(a) for a in answers])
answerstonewusers.append(((option_date_from, option_date_to), count))
sent = ([cachedsentiments[a['Id']] for al in answers for a in al])
sentbad = len([1 for a in sent if a['compound'] < -0.05])
sentneu = len([1 for a in sent if -0.05 <= a['compound'] <= 0.05])
sentgood = len([1 for a in sent if a['compound'] > 0.05])
sentimentstonewusers.append(((option_date_from, option_date_to), (sent, sentbad, sentneu, sentgood)))
# gen pdf for post histograms
os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf")
# plot posts diagram
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts]) plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=0) plt.ylim(bottom=0.001)
plt.title("Active users") plt.title("Active users")
fig.savefig(outputdir + "activeusers.png", bbox_inches='tight') fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig)
# plot answers to new users diagram
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
plt.yscale('log')
plt.ylim(bottom=0.001)
plt.title("#Answers to new users")
fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig)
# plot sentiments of answers to new users diagram
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
plt.yscale('log')
plt.ylim(bottom=0.001)
plt.legend(loc="upper right")
plt.title("Sentiments of answers to new users")
fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)

View File

@@ -4,6 +4,7 @@ import sys
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from loader import load, dmt from loader import load, dmt
from common import imprt
analyser = SentimentIntensityAnalyzer() analyser = SentimentIntensityAnalyzer()
@@ -20,6 +21,30 @@ def main(folder):
toxlevels = {id: p for (id, p) in toxlevels} toxlevels = {id: p for (id, p) in toxlevels}
dumptoxlevels(toxlevels, outfilename + ".py") dumptoxlevels(toxlevels, outfilename + ".py")
dumptoxlevelstxt(toxlevels, outfilename + ".txt")
# (lvl2q, lvl2a) = readtoxleveltxt(outfilename + ".txt")
#
# s1 = str(toxlevels)
# s2 = str(lvl2q)
# # print("s1: " + s1)
# # print("s2: " + s2)
# if s1 != s2:
# print("not equal")
# else:
# print("equal")
#
# # print("s1: " + str(imprt(folder + "/output/sentiments.py").answers))
# # print("s2: " + str(lvl2a))
# if str(imprt(folder + "/output/sentiments.py").answers) != str(lvl2a):
# print("a not equal")
# else:
# print("a equal")
#
# if str(imprt(folder + "/output/sentiments.py").posts) != str(lvl2q):
# print("q not equal")
# else:
# print("q equal")
def computeToxLevel(text): def computeToxLevel(text):
@@ -36,6 +61,53 @@ def dumptoxlevels(lvls, filename):
file.write("answers = " + str(answers) + "\n") file.write("answers = " + str(answers) + "\n")
def dumptoxlevelstxt(lvls, filename):
    """Write the sentiment scores in ``lvls`` to ``filename`` as two text lines.

    ``lvls`` maps a post id to a dict of ``answer id -> scores``, where each
    scores dict is read through the keys ``'neg'``, ``'neu'``, ``'pos'`` and
    ``'compound'``.

    Output layout:

    * ``posts=`` line: one ``postid:record;record;...`` entry per post, entries
      joined by ``;;``.
    * ``answers=`` line: every answer of every post flattened into one
      ``;``-joined list of records.

    A record is ``answerid:neg:neu:pos:compound``.

    :param lvls: dict ``post id -> {answer id -> scores dict}``
    :param filename: path of the file to (over)write
    """

    def record(answer_id, scores):
        # One answer serialised as "id:neg:neu:pos:compound".
        fields = (answer_id, scores['neg'], scores['neu'], scores['pos'], scores['compound'])
        return ":".join(str(field) for field in fields)

    # Flatten all answers into a single mapping; a repeated answer id keeps
    # its first position and takes the value seen last.
    flat_answers = {}
    for per_post in lvls.values():
        flat_answers.update(per_post)

    post_entries = []
    for post_id, per_post in lvls.items():
        joined_records = ";".join(record(aid, sc) for aid, sc in per_post.items())
        post_entries.append(str(post_id) + ":" + joined_records)
    posts_line = "posts=" + ";;".join(post_entries) + "\n"
    answers_line = "answers=" + ";".join(record(aid, sc) for aid, sc in flat_answers.items()) + "\n"

    # Both lines are fully built before the file is opened, so a malformed
    # scores dict fails without truncating an existing file.
    with open(filename, "w") as out:
        out.write(posts_line)
        out.write(answers_line)
def readtoxleveltxt(filename):
    """Read a sentiment text cache in the format written by ``dumptoxlevelstxt``.

    The file is scanned line by line. A line starting with ``posts=`` holds
    ``postid:record;record;...`` entries joined by ``;;``; a line starting
    with ``answers=`` holds records joined by ``;``. A record is
    ``answerid:neg:neu:pos:compound``. All other lines are ignored.

    Ids are returned exactly as the strings found in the file (no ``int``
    conversion); the four scores are converted with ``float``.

    An empty ``posts=`` or ``answers=`` payload (what ``dumptoxlevelstxt``
    writes for an empty input) yields an empty dict instead of raising.

    :param filename: path of the text file to read
    :return: tuple ``(posts, answers)`` where ``posts`` maps
        ``post id -> {answer id -> scores dict}`` and ``answers`` maps
        ``answer id -> scores dict``; either is ``{}`` if its line is missing
    :raises ValueError: if a non-empty record does not have the expected
        fields or a score cannot be parsed as a float
    """

    def scores(neg, neu, pos, compound):
        return {"neg": float(neg), "neu": float(neu), "pos": float(pos), "compound": float(compound)}

    def parse_records(packed):
        # "".split(";") is [""], which cannot be unpacked into five fields,
        # so an empty payload has to be short-circuited explicitly.
        if not packed:
            return {}
        return {aid: scores(neg, neu, pos, compound)
                for [aid, neg, neu, pos, compound] in (rec.split(":") for rec in packed.split(";"))}

    rq = {}
    ra = {}
    with open(filename, 'r') as f:
        lines = f.read().split("\n")

    for line in lines:
        if line.startswith("posts="):
            payload = line[len("posts="):]
            rq = {}
            # Same empty-string pitfall as above for the ";;" separated posts.
            for entry in (payload.split(";;") if payload else []):
                # Only the first ":" separates the post id from its answers.
                qid, packed = entry.split(":", 1)
                rq[qid] = parse_records(packed)
        elif line.startswith("answers="):
            ra = parse_records(line[len("answers="):])
    return rq, ra
if __name__ == "__main__": if __name__ == "__main__":
# execute only if run as a script # execute only if run as a script
usage = sys.argv[0] + " <folder>" usage = sys.argv[0] + " <folder>"