Files
master/votes.py
wea_ondara 06085870a1 wip
2020-04-11 14:07:57 +02:00

152 lines
6.5 KiB
Python

import sys
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE
from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt
# Named colours and integer thresholds; neither is referenced by the code
# visible in this file.
colors = ["red", "green", "blue", "orange", "deeppink"]
thresholds = [3, 4, 5, 6]
# Naive (timezone-less) reference date: midnight on 2018-09-01.
changedate = datetime(2018, 9, 1)
def _label_valign(values, idx):
    """Return the matplotlib vertical alignment for the label of point ``idx``.

    A point higher than its neighbour(s) gets "bottom" (text drawn above the
    point), a point lower than its neighbour(s) gets "top".  Interior points
    that are neither a local maximum nor a local minimum get "center".  End
    points are compared with their single neighbour.  A series with fewer
    than two points has no neighbour to compare with and gets "center".
    """
    last = len(values) - 1
    if last <= 0:
        # Single point: nothing to compare against (avoids an IndexError).
        return "center"
    if idx == 0:
        return "bottom" if values[1] < values[0] else "top"
    if idx == last:
        return "bottom" if values[last - 1] < values[last] else "top"
    if values[idx - 1] < values[idx] and values[idx + 1] < values[idx]:
        return "bottom"
    if values[idx - 1] > values[idx] and values[idx + 1] > values[idx]:
        return "top"
    return "center"


def main(folder: str, intervl: int) -> None:
    """Plot sentiment and vote statistics for posts written by new users.

    Two PNG files are written to ``<folder>/output/votes/``:

    * ``average_votes-i<intervl>.png``: for every interval, the mean cached
      "compound" sentiment of early answers and the mean post score.
    * ``average_votes_over_time-i<intervl>.png``: for every interval, the net
      vote count per post, evaluated over vote windows that all start on
      2010-01-01 and end on 1 January of each year from 2011 to 2019.

    Args:
        folder: dataset directory; ``<folder>/output/sentiments.txt`` is read
            here, in addition to whatever ``load`` and ``readVotes`` read.
        intervl: interval size handed to ``calc_intervals``.
    """
    _, posts, firstcontrib, _ = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # os.makedirs instead of a shell "mkdir -p": works for paths containing
    # spaces or shell metacharacters and does not spawn a shell.
    os.makedirs(outputdir, exist_ok=True)

    new_user_window = timedelta(days=DAYS_NEW_USER)

    datasingle = []
    scoresingle = []
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # Scores of posts created in the interval while their author was
        # still within DAYS_NEW_USER days of the first contribution.
        scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                    and firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate'])
                  .map(lambda p: p['Score'])
                  .getresults())
        # Sentiments of the answers to those posts, restricted to answers
        # given within DAYS_NEW_USER days of the post itself.
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              if option_date_from <= p['CreationDate'] < option_date_to
                                              and firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate']
                                              and p['CreationDate'] + new_user_window > a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        scoresingle.append(scores)
        datasingle.append(filtered)

    # Intervals without data are plotted as NaN (a gap in the line); the
    # sample counts are kept separately for the point labels.
    counts = [len(x) for x in datasingle]
    data = [np.mean(x) if len(x) != 0 else float("nan") for x in datasingle]
    meanscores = [np.mean(x) if len(x) != 0 else float("nan") for x in scoresingle]

    print("Plotting ...")
    xs = [iv[0] for iv in intervals]
    fig, ax = plt.subplots(figsize=FIG_SIZE)
    l1 = ax.plot(xs, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(xs, meanscores, label="average score (votes)", color="red")
    plt.grid(True)
    for i, (x, y, n) in enumerate(zip(xs, data, counts)):
        # Only the first label carries the "n=" prefix; empty intervals get
        # no number at all.
        label = ("n=" if i == 0 else "") + (str(n) if n else "")
        ax.text(x, y, label, ha="center", va=_label_valign(data, i))
    plt.title("Average sentiments and score for new users")
    plt.xticks(rotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
    outfile = outputdir + "/average_votes-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # votes over time
    votes = readVotes(folder)
    fig = plt.figure(figsize=FIG_SIZE)
    ivs = [(datetime.fromisoformat("2010-01-01T00:00:00"), datetime.fromisoformat(str(y) + "-01-01T00:00:00"))
           for y in range(2011, 2020)]
    # The set of new-user post ids of an interval does not depend on the vote
    # window, so it is computed once on first use and then reused.
    post_ids_by_interval = {}
    for interval in ivs:
        print(interval[0].strftime("%d-%m-%Y") + " to " + interval[1].strftime("%d-%m-%Y"))
        ivvotes = dmt(votes).filter(lambda v: interval[0] <= v['CreationDate'] < interval[1]).getresults()
        scores = []
        for (option_date_from, option_date_to) in intervals:
            # Skip intervals that end after the current vote window.
            if option_date_to > interval[1]:
                continue
            key = (option_date_from, option_date_to)
            if key not in post_ids_by_interval:
                # NOTE(review): this filter uses ">=" whereas the filters of
                # the first plot use ">" for the same new-user check; left
                # as found - confirm which one is intended.
                intervalposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                                  and firstcontrib[p['OwnerUserId']] + new_user_window >= p['CreationDate']).getresults()
                post_ids_by_interval[key] = set(dmt(intervalposts).map(lambda p: p['Id']).getresults())
            intervalpostsids = post_ids_by_interval[key]
            intervalvotes = dmt(ivvotes).filter(lambda v: v['PostId'] in intervalpostsids).getresults()
            # VoteTypeId 2 counts +1, 3 counts -1 (presumably up-/down-votes),
            # every other vote type counts 0.
            intervalscore = sum(dmt(intervalvotes).map(lambda v: 1 if v['VoteTypeId'] == 2 else (-1 if v['VoteTypeId'] == 3 else 0)).getresults())
            intervalscore = intervalscore / len(intervalpostsids) if len(intervalpostsids) != 0 else float("nan")
            scores.append((key, intervalscore))
        # A series is plotted even when every value in it is NaN.
        plt.plot([iv[0] for iv, score in scores], [score for iv, score in scores],
                 label=str(interval[0].year) + " - " + str(interval[1].year))
    plt.title("Average score for new users over time")
    plt.xlabel("months")
    plt.ylabel("score")
    plt.legend(loc="upper right")
    plt.grid(True)
    outfile = outputdir + "/average_votes_over_time-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
if __name__ == "__main__":
    # Script entry point.  Accepted arguments: <folder> and, optionally as
    # the second argument, -i<n> with n between 1 and 12 (default 1).
    argv = sys.argv
    usage = argv[0] + " <folder>"
    if len(argv) < 2:
        print(usage)
        sys.exit(1)

    folder = argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 1
    if len(argv) >= 3:
        option = argv[2]
        # Anything other than -i<n> is rejected outright.
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)