Files
master/votes.py
wea_ondara fdc1743d5d wip
2020-01-25 13:16:05 +01:00

121 lines
4.4 KiB
Python

import sys
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms
from sentiments import readtoxleveltxt
# Named matplotlib colours; not referenced by the code visible in this file.
colors = ["red", "green", "blue", "orange", "deeppink"]
# Integer thresholds 3..6; not referenced by the code visible in this file.
thresholds = list(range(3, 7))
# Naive timestamp for 2018-09-01 00:00:00; not referenced by the visible code.
changedate = datetime(year=2018, month=9, day=1)
def _mean_or_nan(values):
    """Return the mean of *values*, or NaN when *values* is empty."""
    # Guarding here avoids numpy's "mean of empty slice" warning.
    return np.mean(values) if len(values) > 0 else float("nan")


def _label_valign(data, i):
    """Choose the vertical alignment of the text label placed at data[i].

    Local maxima get their label above the point ("bottom"), local minima
    below it ("top"); end points are compared with their single neighbour.
    Anything else, including a series with only one point, is centred.
    """
    last = len(data) - 1
    if last <= 0:
        # A single point has no neighbour to compare with.
        return "center"
    if i == 0:
        return "bottom" if data[1] < data[0] else "top"
    if i == last:
        return "bottom" if data[i - 1] < data[i] else "top"
    if data[i - 1] < data[i] and data[i + 1] < data[i]:
        return "bottom"
    if data[i - 1] > data[i] and data[i + 1] > data[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Plot average answer sentiment and average post score per interval.

    For every interval returned by ``calc_intervals(posts, intervl)`` this
    collects the ``Score`` of matching posts and the cached ``compound``
    sentiment of matching answers, then draws both averages over time on a
    twin-axis chart saved below ``<folder>/output/votes/``.

    Args:
        folder: Dataset directory passed to ``load``; the sentiment cache is
            read from ``<folder>/output/sentiments.txt``.
        intervl: Interval size forwarded to ``calc_intervals``; it is also
            embedded in the output file name.
    """
    _, posts, firstcontrib, _ = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # Create the directory without a shell so that paths containing spaces
    # or shell metacharacters are handled correctly.
    os.makedirs(outputdir, exist_ok=True)

    new_user_span = timedelta(days=DAYS_NEW_USER)

    datasingle = []
    scoresingle = []
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        # Scores of posts created inside the interval, at least DAYS_NEW_USER
        # days after their owner's first contribution.
        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                    and firstcontrib[p['OwnerUserId']] + new_user_span <= p['CreationDate'])
                  .map(lambda p: int(p['Score']))
                  .getresults())

        # Compound sentiment of answers created inside the interval.
        # NOTE(review): the cut-off uses the first contribution of the owner
        # of the post the answer belongs to (p), not of the answer's author;
        # confirm this is intended.
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              if option_date_from <= a['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + new_user_span <= a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())

        scoresingle.append(scores)
        datasingle.append(filtered)

    # Record the sample counts first, then average; empty intervals become
    # NaN so that they show up as gaps in the plotted lines.
    counts = [len(x) for x in datasingle]
    data = [_mean_or_nan(x) for x in datasingle]
    scoremeans = [_mean_or_nan(x) for x in scoresingle]

    print("Plotting ...")
    xs = [interval[0] for interval in intervals]
    fig, ax = plt.subplots(figsize=(16, 12))
    l1 = ax.plot(xs, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(xs, scoremeans, label="average score (votes)", color="red")
    plt.grid(True)

    # Annotate every sentiment point with its sample size.
    for i, (value, count) in enumerate(zip(data, counts)):
        if np.isnan(value):
            # An empty interval has no point to anchor a label to.
            continue
        ax.text(xs[i], value, ("n=" if i == 0 else "") + str(count),
                ha="center", va=_label_valign(data, i))

    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    # The score series is drawn on the secondary axis, so label that one.
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
    outfile = os.path.join(outputdir, "average_sentiments-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
if __name__ == "__main__":
    # Execute only if run as a script.
    # Command line: <folder> is required; an optional second argument
    # "-i<n>" (1 <= n <= 12) sets the interval size passed to main().
    usage = sys.argv[0] + " <folder> [-i<1-12>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 1
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            # The number follows the flag directly, e.g. "-i3".
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)

    main(folder, interval)