wip
This commit is contained in:
135
its.py
135
its.py
@@ -1,17 +1,17 @@
|
||||
import os
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import os
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from sklearn.linear_model import LinearRegression
|
||||
|
||||
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER
|
||||
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
|
||||
from loader import load, dmt, cms
|
||||
|
||||
OLD_USER_PERCENTILE = 0.95
|
||||
from sentiments import readtoxleveltxt
|
||||
import statsmodels.api as sm
|
||||
|
||||
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||
|
||||
@@ -23,33 +23,31 @@ def main(folder, intervl):
|
||||
|
||||
start = cms()
|
||||
printnoln("reading sentiments ...")
|
||||
cachedsentiments = imprt(folder + "/output/sentiments.py").answers
|
||||
(_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
|
||||
rprint("reading sentiments ... took " + str(cms() - start) + "ms")
|
||||
|
||||
outputdir = folder + "/output/its/"
|
||||
os.system("mkdir -p " + outputdir)
|
||||
|
||||
data = []
|
||||
datasingle = []
|
||||
count = []
|
||||
for (option_date_from, option_date_to) in intervals:
|
||||
if option_date_to <= datetime.fromisoformat("2015-01-01T00:00:00"):
|
||||
data.append(float("nan"))
|
||||
continue
|
||||
print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
||||
# avg sentiments
|
||||
# print(dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
|
||||
# for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
|
||||
# and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
|
||||
# .filter(lambda p: p != [])
|
||||
# .getresults())
|
||||
# break
|
||||
filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
|
||||
for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
|
||||
and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
|
||||
.filter(lambda p: p != [])
|
||||
.reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
|
||||
.getresults())
|
||||
datasingle.append(filtered)
|
||||
avg = np.average(filtered) if len(filtered) > 0 else float("nan")
|
||||
data.append(avg)
|
||||
count.append(len(filtered))
|
||||
|
||||
# filter nan entries
|
||||
for i in range(len(data)):
|
||||
@@ -57,37 +55,110 @@ def main(folder, intervl):
|
||||
del data[i]
|
||||
del intervals[i]
|
||||
|
||||
print("Computing ITS ...")
|
||||
t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
|
||||
# print("t", t)
|
||||
x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
|
||||
# print("x", x)
|
||||
X = np.reshape(np.array([data[0] for i in range(len(data))]), (-1, 1))
|
||||
# print("X", X)
|
||||
X = np.concatenate((X, t), 1)
|
||||
# print("Computing ITS ...")
|
||||
# t = np.reshape(np.array([i for i in range(len(data))]), (-1, 1))
|
||||
# x = np.reshape(np.array([(0 if option_date_to <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for (option_date_from, option_date_to) in intervals]), (-1, 1))
|
||||
# X = np.array(t)
|
||||
# X = np.concatenate((X, x), 1)
|
||||
# X = np.concatenate((X, np.multiply(t, x)), 1)
|
||||
# y = np.reshape(np.array(data), (-1, 1))
|
||||
# # print("Xfin", X)
|
||||
# # print("y", y)
|
||||
# reg = LinearRegression()
|
||||
# reg.fit(X, y)
|
||||
# score = reg.score(X, y)
|
||||
# coef = np.reshape(np.array(reg.coef_), (-1, 1))
|
||||
# its = X.dot(coef) + reg.intercept_
|
||||
# print("score: " + str(score))
|
||||
# print("coef: " + str(coef))
|
||||
# print("its: " + str(its))
|
||||
|
||||
print("Computing full ITS")
|
||||
t = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
|
||||
x = np.reshape(np.array([(0 if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00") else 1) for i in range(len(datasingle)) for j in datasingle[i]]), (-1, 1))
|
||||
X = np.array(t)
|
||||
X = np.concatenate((X, x), 1)
|
||||
X = np.concatenate((X, np.multiply(t, x)), 1)
|
||||
y = np.reshape(np.array(data), (-1, 1))
|
||||
y = np.reshape(np.array([d for a in datasingle for d in a]), (-1, 1))
|
||||
# print("Xfin", X)
|
||||
# print("y", y)
|
||||
reg = LinearRegression()
|
||||
reg.fit(X, y)
|
||||
score = reg.score(X, y);
|
||||
coef = np.reshape(np.array(reg.coef_), (-1, 1))
|
||||
its = X.dot(coef) + data[0]
|
||||
print("score: " + str(score))
|
||||
print("coef: " + str(coef))
|
||||
print("its: " + str(its))
|
||||
# reg = LinearRegression()
|
||||
# reg.fit(X, y)
|
||||
# score2 = reg.score(X, y)
|
||||
# coef2 = np.reshape(np.array(reg.coef_), (-1, 1))
|
||||
# its2 = X.dot(coef2) + reg.intercept_
|
||||
# print("intercept: " + str(reg.intercept_))
|
||||
# print("score: " + str(score2))
|
||||
# print("coef: " + str(coef2))
|
||||
# print("its: " + str(its2))
|
||||
X = sm.add_constant(X)
|
||||
res = sm.OLS(y, X).fit()
|
||||
p2 = res.pvalues
|
||||
print("coef ols: " + str(res.params))
|
||||
print("sum ols: " + str(res.summary()))
|
||||
coef2ols = np.reshape(np.array(res.params), (-1, 1))
|
||||
its2ols = X.dot(coef2ols)
|
||||
with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
|
||||
file.write(str(res.summary()))
|
||||
|
||||
# print("Computing segmented ITS before")
|
||||
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
|
||||
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
|
||||
# reg = LinearRegression()
|
||||
# reg.fit(X, y)
|
||||
# scoreb = reg.score(X, y)
|
||||
# coefb = np.reshape(np.array(reg.coef_), (-1, 1))
|
||||
# itsb = X.dot(coefb) + reg.intercept_
|
||||
# print("scoreb: " + str(scoreb))
|
||||
# print("coefb: " + str(coefb))
|
||||
# print("itsb: " + str(itsb))
|
||||
|
||||
# print("Computing segmented ITS after")
|
||||
# X = np.reshape(np.array([i for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
|
||||
# y = np.reshape(np.array([j for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")]), (-1, 1))
|
||||
# reg = LinearRegression()
|
||||
# reg.fit(X, y)
|
||||
# scorea = reg.score(X, y)
|
||||
# coefa = np.reshape(np.array(reg.coef_), (-1, 1))
|
||||
# itsa = X.dot(coefa) + reg.intercept_
|
||||
# print("scorea: " + str(scorea))
|
||||
# print("coefa: " + str(coefa))
|
||||
# print("itsa: " + str(itsa))
|
||||
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
plt.plot([i[0] for i in intervals], data, label="average sentiment")
|
||||
plt.plot([i[0] for i in intervals], its, label="ITS (score " + str(score) + ")")
|
||||
plt.grid(True)
|
||||
for i in range(len(data)):
|
||||
va = "center"
|
||||
if 0 < i < len(data) - 1:
|
||||
if data[i - 1] < data[i] and data[i + 1] < data[i]:
|
||||
va = "bottom"
|
||||
elif data[i - 1] > data[i] and data[i + 1] > data[i]:
|
||||
va = "top"
|
||||
elif i == 0:
|
||||
if data[i + 1] < data[i]:
|
||||
va = "bottom"
|
||||
else:
|
||||
va = "top"
|
||||
elif i == len(data) - 1:
|
||||
if data[i - 1] < data[i]:
|
||||
va = "bottom"
|
||||
else:
|
||||
va = "top"
|
||||
plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
|
||||
# plt.plot([i[0] for i in intervals], its, label="aggregated ITS (score " + str(score) + ")")
|
||||
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2, label="single ITS (score " + str(score2) + ", p " + str(p2) + ")")
|
||||
plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")")
|
||||
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] <= datetime.fromisoformat("2018-09-01T00:00:00")], itsb,
|
||||
# label="segmented ITS b (score " + str(scoreb) + ")")
|
||||
# plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i] if intervals[i][1] > datetime.fromisoformat("2018-09-01T00:00:00")], itsa,
|
||||
# label="segmented ITS a (score " + str(scorea) + ")")
|
||||
plt.title("Average sentiments for new users")
|
||||
plt.xticks(rotation=90)
|
||||
plt.xlabel("months")
|
||||
plt.ylabel("sentiment")
|
||||
plt.legend(loc="upper right")
|
||||
outfile = outputdir + "/average_sentiments.png"
|
||||
outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
|
||||
plt.savefig(outfile, bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
22
notes
22
notes
@@ -13,3 +13,25 @@ interrupted time series
|
||||
3 Monate vor der Änderung und nach der Änderung
|
||||
|
||||
da gibts 2 varianten: sprung, oder 2 linien
|
||||
|
||||
|
||||
|
||||
----------
|
||||
lila lines in its fixen -> done
|
||||
coef -> p values significance, statsmodels sm logit, sm.summary() -> done
|
||||
bisschen zusammen fassen was ich gemacht habe
|
||||
plot über zeit <-0.05, -, >0.05 für sentiments, 3 kurven -> done
|
||||
|
||||
Paper, Strg+F "arousal", Methode wichtig, Referenzen anschauen: https://link.springer.com/content/pdf/10.1007%2Fs42001-017-0001-x.pdf
|
||||
|
||||
#einfluss von code sections
|
||||
|
||||
|
||||
Papers:
|
||||
|
||||
its:
|
||||
http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
54
posthist.py
54
posthist.py
@@ -1,12 +1,14 @@
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import timedelta
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import MaxNLocator
|
||||
|
||||
from common import calc_intervals, IMAGE_MAGICK
|
||||
from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER
|
||||
from loader import load, dmt
|
||||
from sentiments import readtoxleveltxt
|
||||
|
||||
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||
|
||||
@@ -14,15 +16,19 @@ colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||
def main(folder, intervl):
|
||||
users, posts, firstcontrib, sumcontrib = load(folder)
|
||||
intervals = calc_intervals(posts, intervl)
|
||||
(_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
|
||||
|
||||
outputdir = folder + "/output/posthist/"
|
||||
os.system("mkdir -p " + outputdir)
|
||||
|
||||
activeusercounts = []
|
||||
answerstonewusers = []
|
||||
sentimentstonewusers = []
|
||||
imgmagickcmd = IMAGE_MAGICK
|
||||
for (option_date_from, option_date_to) in intervals:
|
||||
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
|
||||
print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
|
||||
|
||||
# post histograms
|
||||
# filter posts by option_date_from <= creation date <= option_date_to
|
||||
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
|
||||
|
||||
@@ -34,7 +40,7 @@ def main(folder, intervl):
|
||||
postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
|
||||
activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
|
||||
|
||||
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
||||
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
|
||||
|
||||
histdata = [pc for pc in postcounts.values()]
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
@@ -48,14 +54,50 @@ def main(folder, intervl):
|
||||
fig.savefig(histfilename + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
imgmagickcmd += " " + histfilename + ".png"
|
||||
os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
|
||||
|
||||
# answers to new users
|
||||
answers = (dmt(posts).map(lambda q: [a for a in q['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
|
||||
and firstcontrib[q['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
|
||||
.getresults())
|
||||
count = sum([len(a) for a in answers])
|
||||
answerstonewusers.append(((option_date_from, option_date_to), count))
|
||||
sent = ([cachedsentiments[a['Id']] for al in answers for a in al])
|
||||
sentbad = len([1 for a in sent if a['compound'] < -0.05])
|
||||
sentneu = len([1 for a in sent if -0.05 <= a['compound'] <= 0.05])
|
||||
sentgood = len([1 for a in sent if a['compound'] > 0.05])
|
||||
sentimentstonewusers.append(((option_date_from, option_date_to), (sent, sentbad, sentneu, sentgood)))
|
||||
|
||||
# gen pdf for post histograms
|
||||
os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf")
|
||||
|
||||
# plot posts diagram
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
|
||||
plt.yscale('log')
|
||||
plt.ylim(bottom=0)
|
||||
plt.ylim(bottom=0.001)
|
||||
plt.title("Active users")
|
||||
fig.savefig(outputdir + "activeusers.png", bbox_inches='tight')
|
||||
fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
|
||||
# plot answers to new users diagram
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
|
||||
plt.yscale('log')
|
||||
plt.ylim(bottom=0.001)
|
||||
plt.title("#Answers to new users")
|
||||
fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
|
||||
# plot sentiments of answers to new users diagram
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
|
||||
plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
|
||||
plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
|
||||
plt.yscale('log')
|
||||
plt.ylim(bottom=0.001)
|
||||
plt.legend(loc="upper right")
|
||||
plt.title("Sentiments of answers to new users")
|
||||
fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import sys
|
||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||
|
||||
from loader import load, dmt
|
||||
from common import imprt
|
||||
|
||||
analyser = SentimentIntensityAnalyzer()
|
||||
|
||||
@@ -20,6 +21,30 @@ def main(folder):
|
||||
toxlevels = {id: p for (id, p) in toxlevels}
|
||||
|
||||
dumptoxlevels(toxlevels, outfilename + ".py")
|
||||
dumptoxlevelstxt(toxlevels, outfilename + ".txt")
|
||||
|
||||
# (lvl2q, lvl2a) = readtoxleveltxt(outfilename + ".txt")
|
||||
#
|
||||
# s1 = str(toxlevels)
|
||||
# s2 = str(lvl2q)
|
||||
# # print("s1: " + s1)
|
||||
# # print("s2: " + s2)
|
||||
# if s1 != s2:
|
||||
# print("not equal")
|
||||
# else:
|
||||
# print("equal")
|
||||
#
|
||||
# # print("s1: " + str(imprt(folder + "/output/sentiments.py").answers))
|
||||
# # print("s2: " + str(lvl2a))
|
||||
# if str(imprt(folder + "/output/sentiments.py").answers) != str(lvl2a):
|
||||
# print("a not equal")
|
||||
# else:
|
||||
# print("a equal")
|
||||
#
|
||||
# if str(imprt(folder + "/output/sentiments.py").posts) != str(lvl2q):
|
||||
# print("q not equal")
|
||||
# else:
|
||||
# print("q equal")
|
||||
|
||||
|
||||
def computeToxLevel(text):
|
||||
@@ -36,6 +61,53 @@ def dumptoxlevels(lvls, filename):
|
||||
file.write("answers = " + str(answers) + "\n")
|
||||
|
||||
|
||||
def dumptoxlevelstxt(lvls, filename):
    """Write sentiment scores to ``filename`` in a compact two-line text format.

    ``lvls`` maps a post id to a dict that maps each answer id to a score
    dict with the keys ``neg``, ``neu``, ``pos`` and ``compound``.

    The file contains exactly two lines:

    * ``posts=<qid>:<aid>:<neg>:<neu>:<pos>:<compound>;<aid>:...;;<qid>:...``
      -- posts are separated by ``;;``, the answers of one post by ``;``.
      A post without answers is written as ``<qid>:``.
    * ``answers=<aid>:<neg>:<neu>:<pos>:<compound>;...`` -- every answer of
      every post, flattened into a single list.

    An existing file at ``filename`` is overwritten.
    """
    def format_scores(record_id, scores):
        # One record: the id followed by the four scores, colon-separated.
        return ":".join(str(value) for value in (
            record_id, scores['neg'], scores['neu'], scores['pos'], scores['compound']))

    # Flatten the per-post answers into one mapping. If the same answer id
    # shows up under several posts, the last occurrence wins.
    answers = {}
    for post_answers in lvls.values():
        answers.update(post_answers)

    pstr = ";;".join(
        str(qid) + ":" + ";".join(format_scores(aid, scores) for (aid, scores) in post_answers.items())
        for (qid, post_answers) in lvls.items())
    astr = ";".join(format_scores(aid, scores) for (aid, scores) in answers.items())

    with open(filename, "w") as file:
        file.write("posts=" + pstr + "\n")
        file.write("answers=" + astr + "\n")
|
||||
|
||||
|
||||
def readtoxleveltxt(filename):
    """Read sentiment scores from the two-line text format.

    Recognised lines:

    * ``posts=<qid>:<aid>:<neg>:<neu>:<pos>:<compound>;<aid>:...;;<qid>:...``
    * ``answers=<aid>:<neg>:<neu>:<pos>:<compound>;...``

    Any other line is ignored. If a section appears more than once, the last
    occurrence wins. Ids are returned as the strings found in the file; the
    four scores are converted to ``float``.

    Returns a tuple ``(rq, ra)``:

    * ``rq`` maps post id -> {answer id -> score dict}
    * ``ra`` maps answer id -> score dict

    where a score dict has the keys ``neg``, ``neu``, ``pos`` and
    ``compound``. A missing or empty section yields an empty dict.

    Raises ``ValueError`` if a non-empty record does not have the expected
    number of colon-separated fields or a score is not a valid float.
    """
    def parse_scores(fields):
        # fields: [id, neg, neu, pos, compound]; unpacking enforces the arity.
        record_id, neg, neu, pos, compound = fields
        return record_id, {"neg": float(neg), "neu": float(neu),
                           "pos": float(pos), "compound": float(compound)}

    with open(filename, 'r') as f:
        lines = f.read().split("\n")

    rq = {}
    ra = {}
    for line in lines:
        if line.startswith("posts="):
            payload = line[len("posts="):]
            # "".split(";;") would yield [""], which cannot be unpacked into
            # (post id, answers); an empty payload simply means "no posts".
            entries = payload.split(";;") if payload else []
            rq = {}
            for entry in entries:
                # Split off the post id only; the remainder holds the answers.
                qid, answer_part = entry.split(":", 1)
                records = answer_part.split(";") if answer_part else []
                rq[qid] = dict(parse_scores(record.split(":")) for record in records)
        elif line.startswith("answers="):
            payload = line[len("answers="):]
            # Same guard as above: an empty payload means "no answers".
            records = payload.split(";") if payload else []
            ra = dict(parse_scores(record.split(":")) for record in records)

    return rq, ra
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# execute only if run as a script
|
||||
usage = sys.argv[0] + " <folder>"
|
||||
|
||||
Reference in New Issue
Block a user