This commit is contained in:
wea_ondara
2020-01-25 13:16:05 +01:00
parent cd0239f39c
commit fdc1743d5d
5 changed files with 157 additions and 13 deletions

5
its.py
View File

@@ -55,11 +55,6 @@ def main(folder, intervl):
avgcount = np.mean([x for x in count if str(x) != "nan"]) avgcount = np.mean([x for x in count if str(x) != "nan"])
stdcount = np.std([x for x in count if str(x) != "nan"]) stdcount = np.std([x for x in count if str(x) != "nan"])
for i in range(len(count)): for i in range(len(count)):
print(count[i])
if count[i] == 45:
print("m " + str(avgcount))
print("s " + str(stdcount))
print("N " + str((count[i] - avgcount) / stdcount))
if str(count[i]) == "nan" or np.abs((count[i] - avgcount) / stdcount) > 3: if str(count[i]) == "nan" or np.abs((count[i] - avgcount) / stdcount) > 3:
datasingle[i] = float("nan") datasingle[i] = float("nan")
data[i] = float("nan") data[i] = float("nan")

View File

@@ -100,7 +100,7 @@ def mapuser(item):
def mapQuestion(item): def mapQuestion(item):
tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName'] tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score']
datetags = ['CreationDate'] datetags = ['CreationDate']
question = {tag: getTag(item, tag) for tag in tags} question = {tag: getTag(item, tag) for tag in tags}
for tag in datetags: for tag in datetags:
@@ -110,7 +110,7 @@ def mapQuestion(item):
def mapAnswer(item): def mapAnswer(item):
tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId'] tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score']
datetags = ['CreationDate'] datetags = ['CreationDate']
answer = {tag: getTag(item, tag) for tag in tags} answer = {tag: getTag(item, tag) for tag in tags}
for tag in datetags: for tag in datetags:

4
notes
View File

@@ -38,8 +38,8 @@ http://lindenconsulting.org/documents/Weighted_TSA_Article.pdf
outliner filtern 57 /2000 senitment values in its > done outliner filtern 57 /2000 senitment values in its > done
threshold 2,3,4,5,6 monate vor und zurück in its neu kurven andere farben>done threshold 2,3,4,5,6 monate vor und zurück in its neu kurven andere farben>done
auswertung up downvotes und correlation mit sentiment auswertung up downvotes und correlation mit sentiment >done
activität neuer user vorher und nachher activität neuer user vorher und nachher>done

View File

@@ -24,6 +24,7 @@ def main(folder, intervl):
activeusercounts = [] activeusercounts = []
answerstonewusers = [] answerstonewusers = []
sentimentstonewusers = [] sentimentstonewusers = []
activitynewusers = []
imgmagickcmd = IMAGE_MAGICK imgmagickcmd = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")) print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
@@ -37,9 +38,21 @@ def main(folder, intervl):
for p in newposts: for p in newposts:
postcounts[p['OwnerUserId']].append(p) postcounts[p['OwnerUserId']].append(p)
i = i + 1 i = i + 1
# for a in p['Answers']:
# postcounts[p['OwnerUserId']].append(a)
postcounts = {id: len(pc) for (id, pc) in postcounts.items()} postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys()))) activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
activitynewusersinmonth = defaultdict(int)
for p in newposts:
if firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']:
activitynewusersinmonth[p['OwnerUserId']] += 1
for a in p['Answers']:
if firstcontrib[a['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']:
activitynewusersinmonth[p['OwnerUserId']] += 1
activitysum = sum(activitynewusersinmonth.values())
activitynewusers.append(((option_date_from, option_date_to), activitysum / len(activitynewusersinmonth)))
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl) histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
histdata = [pc for pc in postcounts.values()] histdata = [pc for pc in postcounts.values()]
@@ -73,8 +86,10 @@ def main(folder, intervl):
# plot posts diagram # plot posts diagram
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts]) plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
plt.xlabel('time')
plt.ylabel('#active users')
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=0.001) plt.ylim(bottom=1)
plt.title("Active users") plt.title("Active users")
fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight') fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
@@ -82,9 +97,11 @@ def main(folder, intervl):
# plot answers to new users diagram # plot answers to new users diagram
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers]) plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers])
plt.xlabel('time')
plt.ylabel('#answers per question of a new user')
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=0.001) plt.ylim(bottom=1)
plt.title("#Answers to new users") plt.title("Answers to new users")
fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight') fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
@@ -93,13 +110,25 @@ def main(folder, intervl):
plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer") plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer") plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer") plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
plt.xlabel('time')
plt.ylabel('sentiment')
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=0.001) plt.ylim(bottom=1)
plt.legend(loc="upper right") plt.legend(loc="upper right")
plt.title("Sentiments of answers to new users") plt.title("Sentiments of answers to new users")
fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight') fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
# plot activity for new users
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in activitynewusers], [y for x, y in activitynewusers], label="activity")
plt.xlabel('time')
plt.ylabel('#questions or answers created by a new user')
plt.legend(loc="upper right")
plt.title("Average activity per new user")
fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig)
if __name__ == "__main__": if __name__ == "__main__":
# execute only if run as a script # execute only if run as a script

120
votes.py Normal file
View File

@@ -0,0 +1,120 @@
import sys
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms
from sentiments import readtoxleveltxt
# Module-level plotting/analysis constants.
# NOTE(review): none of these three names is referenced in the visible part
# of this file; presumably kept for parity with sibling scripts -- confirm.

# Line colours, one entry per plotted series.
colors = ["red", "green", "blue", "orange", "deeppink"]

# Threshold values (unit not stated in this file -- TODO confirm).
thresholds = [3, 4, 5, 6]

# Reference date 2018-09-01 00:00:00 (naive datetime, no timezone attached).
changedate = datetime(2018, 9, 1, 0, 0, 0)
def _label_valign(values, i):
    """Return the vertical alignment ("top"/"bottom"/"center") for the label at values[i].

    A local maximum gets its label above the point ("bottom"), a local
    minimum below it ("top"); interior points that are neither are centred.
    End points are compared against their single neighbour. A series with
    only one point has no neighbour and is centred.
    """
    last = len(values) - 1
    here = values[i]
    if 0 < i < last:
        if values[i - 1] < here and values[i + 1] < here:
            return "bottom"
        if values[i - 1] > here and values[i + 1] > here:
            return "top"
        return "center"
    if last == 0:
        # Single data point: there is no neighbour to compare against.
        return "center"
    neighbour = values[i + 1] if i == 0 else values[i - 1]
    return "bottom" if neighbour < here else "top"


def main(folder, intervl):
    """Plot average answer sentiment against average post score per interval.

    Loads the data set from ``folder``, splits it into intervals with
    ``calc_intervals(posts, intervl)``, reads the cached sentiments from
    ``<folder>/output/sentiments.txt`` and writes the figure to
    ``<folder>/output/votes//average_sentiments-i<intervl>.png``.

    Args:
        folder: Path of the data set directory.
        intervl: Interval size handed to ``calc_intervals`` and embedded in
            the output file name (presumably a number of months -- the x axis
            is labelled "months"; confirm against ``calc_intervals``).
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # os.makedirs instead of shelling out to "mkdir -p": no shell is involved,
    # so folder names containing spaces or shell metacharacters are safe.
    os.makedirs(outputdir, exist_ok=True)

    datasingle = []   # per interval: list of compound sentiment values
    scoresingle = []  # per interval: list of post scores
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        # NOTE(review): both filters below keep items created at least
        # DAYS_NEW_USER days AFTER the question owner's first contribution
        # (`<=`), while the plot title says "for new users" -- confirm which
        # population is intended.
        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                    and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate'])
                  .map(lambda p: int(p['Score']))
                  .getresults())

        # Compound sentiment of every answer created inside the interval,
        # flattened over all posts.
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        scoresingle.append(scores)
        datasingle.append(filtered)

    # Sample counts are taken before averaging so the "n=" labels stay valid
    # for every interval; empty intervals become NaN (a gap in the curve)
    # instead of replacing list entries -- or the whole list -- with a float.
    counts = [len(values) for values in datasingle]
    data = [np.mean(values) if len(values) > 0 else float("nan") for values in datasingle]
    scoremeans = [np.mean(values) if len(values) > 0 else float("nan") for values in scoresingle]

    print("Plotting ...")
    fig, ax = plt.subplots(figsize=(16, 12))
    xs = [interval[0] for interval in intervals]
    l1 = ax.plot(xs, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(xs, scoremeans, label="average score (votes)", color="red")
    plt.grid(True)

    # Annotate every sentiment point with its sample count; only the first
    # annotation carries the "n=" prefix. NaN points have no position to
    # attach a label to and are skipped.
    labelled_any = False
    for i, value in enumerate(data):
        if np.isnan(value):
            continue
        prefix = "" if labelled_any else "n="
        labelled_any = True
        ax.text(intervals[i][0], value, prefix + str(counts[i]), ha="center", va=_label_valign(data, i))

    plt.title("Average sentiments for new users")
    # twinx() hides the x axis of the twin axes, so the tick rotation has to
    # be applied to the primary axes to be visible.
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    # The score label belongs to the right-hand (twin) axis.
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
if __name__ == "__main__":
    # Script entry point: <folder> is mandatory, an optional second argument
    # of the form -i<N> (1 <= N <= 12) selects the interval passed to main().
    argv = sys.argv
    usage = argv[0] + " <folder>"
    if len(argv) < 2:
        print(usage)
        sys.exit(1)

    folder = argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 1
    if len(argv) >= 3:
        option = argv[2]
        # Anything other than -i<N> is rejected outright.
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)