This commit is contained in:
wea_ondara
2020-01-25 13:16:05 +01:00
parent cd0239f39c
commit fdc1743d5d
5 changed files with 157 additions and 13 deletions

120
votes.py Normal file
View File

@@ -0,0 +1,120 @@
import sys
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms
from sentiments import readtoxleveltxt
# Module-level plot settings. None of the three names is referenced in the
# part of the file visible here; they are kept for importers / later code.
colors = [
    "red",
    "green",
    "blue",
    "orange",
    "deeppink",
]
thresholds = [3, 4, 5, 6]
# Naive (timezone-less) timestamp for 2018-09-01 00:00:00.
changedate = datetime(2018, 9, 1)
def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN when ``values`` is empty."""
    if len(values) == 0:
        return float("nan")
    return np.mean(values)


def _label_valign(values, idx):
    """Choose the vertical alignment of the label at ``values[idx]``.

    Local maxima get their label above the point ("bottom"), local minima
    below it ("top"); interior points that are neither stay "center".  The
    first and last point are compared with their single neighbour only.
    """
    last = len(values) - 1
    current = values[idx]
    if 0 < idx < last:
        before, after = values[idx - 1], values[idx + 1]
        if before < current and after < current:
            return "bottom"
        if before > current and after > current:
            return "top"
        return "center"
    if last <= 0:
        # A single data point has no neighbour to compare against.
        return "center"
    neighbour = values[1] if idx == 0 else values[last - 1]
    return "bottom" if neighbour < current else "top"


def main(folder, intervl):
    """Plot average answer sentiment and average post score per interval.

    For every interval returned by ``calc_intervals(posts, intervl)`` this
    collects

    * the integer ``Score`` of each post created inside the interval, and
    * the cached ``compound`` sentiment of each answer created inside the
      interval,

    in both cases only when the creation date is at least ``DAYS_NEW_USER``
    days after ``firstcontrib`` of the *post's* owner.  The per-interval
    means are drawn on a shared x axis (sentiment on the left y axis, score
    on the right one) and written to
    ``<folder>/output/votes/average_sentiments-i<intervl>.png``.

    NOTE(review): the plot title says "new users" while the filter keeps
    contributions made *after* the ``DAYS_NEW_USER`` window, and the answer
    filter looks up the owner of the post rather than of the answer.  Both
    are kept as they were -- confirm that this is intended.

    :param folder: dataset directory; ``output/sentiments.txt`` is read from
        it and the plot is written below ``output/votes/``.
    :param intervl: interval size handed to ``calc_intervals`` and embedded
        in the output file name.
    :raises OSError: if the output directory cannot be created.
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # os.makedirs instead of shelling out to "mkdir -p": no shell parsing of
    # the user-supplied folder name, and it works without a POSIX shell.
    os.makedirs(outputdir, exist_ok=True)

    min_age = timedelta(days=DAYS_NEW_USER)
    datasingle = []
    scoresingle = []
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        # scores of the posts that fall into this interval
        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                    and firstcontrib[p['OwnerUserId']] + min_age <= p['CreationDate'])
                  .map(lambda p: int(p['Score']))
                  .getresults())

        # compound sentiments of the answers that fall into this interval,
        # flattened into one list
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              if option_date_from <= a['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + min_age <= a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())

        scoresingle.append(scores)
        datasingle.append(filtered)

    # Intervals without data become NaN in the plotted series (a gap in the
    # line).  The raw lists are kept untouched because their lengths are
    # still needed for the "n=" labels below.
    data = [_mean_or_nan(x) for x in datasingle]
    scoremeans = [_mean_or_nan(x) for x in scoresingle]

    print("Plotting ...")
    xvalues = [i[0] for i in intervals]
    fig, ax = plt.subplots(figsize=(16, 12))
    l1 = ax.plot(xvalues, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(xvalues, scoremeans, label="average score (votes)", color="red")
    plt.grid(True)

    # annotate every sentiment point with the number of samples behind it
    for i, value in enumerate(data):
        if np.isnan(value):
            # nothing was plotted for this interval, so there is nothing to label
            continue
        ax.text(xvalues[i], value, ("n=" if i == 0 else "") + str(len(datasingle[i])),
                ha="center", va=_label_valign(data, i))

    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    # the score series lives on the twin axis, so that axis gets the label
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")

    outfile = os.path.join(outputdir, "average_sentiments-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
if __name__ == "__main__":
    # Script entry point.
    # Command line: <script> <folder> [-i<N>] with N an integer from 1 to 12.
    # Every invalid invocation prints a message and exits with status 1.
    argv = sys.argv
    if len(argv) < 2:
        print(argv[0] + " <folder>")
        sys.exit(1)

    folder = argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    # interval defaults to 1 when no third argument is given
    interval = 1
    if len(argv) >= 3:
        option = argv[2]
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            # everything after the "-i" prefix must parse as an integer
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)