Files
master/votes.py
wea_ondara da8896eadd wip
2020-01-27 11:58:19 +01:00

151 lines
6.3 KiB
Python

import sys
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt
# Module-level settings. None of these names is referenced by the code visible
# in this file -- presumably kept for parity with sibling analysis scripts
# (TODO confirm before removing).

# Named matplotlib colours.
colors = [
    'red',
    'green',
    'blue',
    'orange',
    'deeppink',
]
# Integer thresholds 3..6 (meaning not visible here -- TODO confirm).
thresholds = list(range(3, 7))
# Naive datetime for 2018-09-01 00:00:00.
changedate = datetime(2018, 9, 1)
def _posts_in_interval(posts, firstcontrib, date_from, date_to):
    """Return the (not yet evaluated) dmt pipeline selecting posts of one interval.

    A post is selected when it was created in ``[date_from, date_to)`` and its
    owner's first contribution lies at least ``DAYS_NEW_USER`` days before the
    post's creation date. Callers chain ``.map(...)``/``.getresults()`` on the
    returned object.

    NOTE(review): the plot titles say "new users", but this condition keeps
    posts written *after* the DAYS_NEW_USER window has elapsed -- confirm that
    the comparison direction is intended.
    """
    delay = timedelta(days=DAYS_NEW_USER)
    return dmt(posts).filter(lambda p: date_from <= p['CreationDate'] < date_to
                             and firstcontrib[p['OwnerUserId']] + delay <= p['CreationDate'])


def _label_valign(data, i):
    """Choose the vertical alignment of the text label for point ``i`` of ``data``.

    Local maxima get their label above the point ("bottom"), local minima below
    ("top"), everything else "center". End points are compared with their only
    neighbour; a series with a single point has no neighbour and gets "center".
    """
    has_prev = i > 0
    has_next = i < len(data) - 1
    if has_prev and has_next:
        if data[i - 1] < data[i] and data[i + 1] < data[i]:
            return "bottom"
        if data[i - 1] > data[i] and data[i + 1] > data[i]:
            return "top"
        return "center"
    if has_next:  # first point of a longer series
        return "bottom" if data[i + 1] < data[i] else "top"
    if has_prev:  # last point of a longer series
        return "bottom" if data[i - 1] < data[i] else "top"
    return "center"  # only one point: nothing to compare against


def main(folder, intervl):
    """Plot average answer sentiment and average score per interval.

    Reads the data set via ``load(folder)``, the cached sentiments from
    ``<folder>/output/sentiments.txt`` and the votes via ``readVotes(folder)``,
    then writes two figures into ``<folder>/output/votes/``:

    * ``average_votes-i<intervl>.png`` -- mean compound sentiment of answers
      and mean post score for every interval.
    * ``average_votes_over_time-i<intervl>.png`` -- mean vote score per post
      for every interval, one line per vote cut-off year (2011..2019).

    :param folder: data set directory.
    :param intervl: interval size handed to ``calc_intervals``.
    """
    _, posts, firstcontrib, _ = load(folder)
    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # Create the directory in-process: no shell involved, so paths containing
    # spaces or shell metacharacters work, and failures raise instead of being
    # silently ignored.
    os.makedirs(outputdir, exist_ok=True)

    newuser_delay = timedelta(days=DAYS_NEW_USER)
    datasingle = []
    scoresingle = []
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # scores of the posts selected for this interval
        scores = (_posts_in_interval(posts, firstcontrib, option_date_from, option_date_to)
                  .map(lambda p: int(p['Score']))
                  .getresults())
        # compound sentiment of every answer written in this interval; the
        # user check is done against the owner of the parent post
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              if option_date_from <= a['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + newuser_delay <= a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        scoresingle.append(scores)
        datasingle.append(filtered)

    # Intervals without samples are plotted as NaN (a gap) and get no count.
    counts = [len(x) for x in datasingle]
    data = [np.mean(x) if len(x) != 0 else float("nan") for x in datasingle]
    scoremeans = [np.mean(x) if len(x) != 0 else float("nan") for x in scoresingle]

    print("Plotting ...")
    xs = [iv[0] for iv in intervals]
    fig, ax = plt.subplots(figsize=(16, 12))
    l1 = ax.plot(xs, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(xs, scoremeans, label="average score (votes)", color="red")
    plt.grid(True)
    for i, value in enumerate(data):
        # sample count next to each point; only the first label carries "n="
        label = ("n=" if i == 0 else "") + (str(counts[i]) if counts[i] != 0 else "")
        ax.text(intervals[i][0], value, label, ha="center", va=_label_valign(data, i))
    plt.title("Average sentiments and score for new users")
    plt.xticks(rotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
    outfile = os.path.join(outputdir, "average_votes-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # votes over time
    votes = readVotes(folder)
    fig = plt.figure(figsize=(16, 12))
    # growing vote windows: 2010-01-01 up to the start of each year 2011..2019
    ivs = [(datetime(2010, 1, 1), datetime(y, 1, 1)) for y in range(2011, 2020)]
    # The set of selected post ids depends only on the data interval, not on
    # the vote window, so compute it once per interval and reuse it.
    postids_cache = {}
    for (votes_from, votes_to) in ivs:
        print(votes_from.strftime("%d-%m-%Y") + " to " + votes_to.strftime("%d-%m-%Y"))
        ivvotes = dmt(votes).filter(lambda v: votes_from <= v['CreationDate'] < votes_to).getresults()
        scores = []
        for idx, (option_date_from, option_date_to) in enumerate(intervals):
            if option_date_to > votes_to:
                continue  # interval ends after this vote window
            if idx not in postids_cache:
                postids_cache[idx] = set(
                    _posts_in_interval(posts, firstcontrib, option_date_from, option_date_to)
                    .map(lambda p: p['Id'])
                    .getresults())
            intervalpostsids = postids_cache[idx]
            intervalvotes = dmt(ivvotes).filter(lambda v: v['PostId'] in intervalpostsids).getresults()
            # vote type "2" counts +1 and "3" counts -1 (presumably the Stack
            # Exchange up/down vote ids -- TODO confirm); all others count 0
            intervalscore = sum(dmt(intervalvotes)
                                .map(lambda v: 1 if v['VoteTypeId'] == "2" else (-1 if v['VoteTypeId'] == "3" else 0))
                                .getresults())
            intervalscore = intervalscore / len(intervalpostsids) if len(intervalpostsids) != 0 else float("nan")
            scores.append(((option_date_from, option_date_to), intervalscore))
        plt.plot([iv[0] for iv, score in scores], [score for iv, score in scores],
                 label=str(votes_from.year) + " - " + str(votes_to.year))
    plt.title("Average score for new users over time")
    plt.xlabel("months")
    plt.ylabel("score")
    plt.legend(loc="upper right")
    plt.grid(True)
    outfile = os.path.join(outputdir, "average_votes_over_time-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
if __name__ == "__main__":
    # Script entry point: validate the command line, then hand over to main().
    # Accepted form: <script> <folder> [-i<N>] with N an integer from 1 to 12.
    usage = sys.argv[0] + " <folder>"
    argc = len(sys.argv)
    if argc < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    # Without the optional third argument the interval defaults to 1.
    interval = 1
    if argc >= 3:
        option = sys.argv[2]
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        # everything after the "-i" prefix must parse as an integer
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)