# File-viewer header (not part of the program): 152 lines, 6.5 KiB, Python
import sys
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import os
|
|
import statsmodels.api as sm
|
|
from datetime import datetime
|
|
from datetime import timedelta
|
|
from dateutil.relativedelta import relativedelta
|
|
|
|
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE
|
|
from loader import load, dmt, cms, readVotes
|
|
from sentiments import readtoxleveltxt
|
|
|
|
# Module-level settings. None of the three names is referenced by the code
# visible in this file; they are kept for any external users of the module.

# Named matplotlib colours (presumably a plotting palette -- TODO confirm use).
colors = ["red", "green", "blue", "orange", "deeppink"]

# Integer cut-offs 3..6 inclusive (meaning not visible from this file).
thresholds = list(range(3, 7))

# Reference timestamp: midnight on 1 September 2018 (naive datetime).
changedate = datetime(2018, 9, 1)
|
|
|
|
|
|
def _mean_or_nan(values):
    """Return the mean of *values*, or NaN when there are none."""
    # np.mean([]) emits a RuntimeWarning; an explicit NaN simply leaves a gap
    # in the plotted line instead.
    return np.mean(values) if len(values) > 0 else float("nan")


def _label_valign(values, idx):
    """Pick the vertical alignment for the text label of point *idx*.

    Local maxima get "bottom" and local minima get "top". The first and last
    points are compared against their single neighbour. A series with one
    point has no neighbour and gets "center" (the original code raised
    IndexError there). Comparisons involving NaN are all false, so NaN
    neighbours fall through to "top" at the edges and "center" inside.
    """
    last = len(values) - 1
    if last <= 0:
        return "center"
    here = values[idx]
    if idx == 0:
        return "bottom" if values[1] < here else "top"
    if idx == last:
        return "bottom" if values[last - 1] < here else "top"
    before, after = values[idx - 1], values[idx + 1]
    if before < here and after < here:
        return "bottom"
    if before > here and after > here:
        return "top"
    return "center"


def main(folder, intervl):
    """Plot sentiment and vote statistics for posts written by new users.

    Reads the data set via ``load(folder)``, the cached sentiments from
    ``<folder>/output/sentiments.txt`` and the votes via ``readVotes(folder)``.
    Writes two figures into ``<folder>/output/votes/``:

    * ``average_votes-i<intervl>.png``: per interval, the mean ``compound``
      sentiment of answers to new users' posts and the mean ``Score`` of
      those posts.
    * ``average_votes_over_time-i<intervl>.png``: per interval, the average
      vote balance of new users' posts, counting only votes cast inside
      growing windows that all start on 2010-01-01 and end on 1 January of
      2011..2020.

    A post counts as a new user's post when it was created within
    ``DAYS_NEW_USER`` days of its owner's first contribution.

    Args:
        folder: Path of the data set folder.
        intervl: Interval length handed to ``calc_intervals`` and embedded in
            the output file names (the command line restricts it to 1-12;
            presumably months, matching the x-axis label).

    Raises:
        OSError: If the output directory cannot be created.
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)
    interval_starts = [iv[0] for iv in intervals]
    new_user_window = timedelta(days=DAYS_NEW_USER)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # Create the directory directly rather than shelling out to `mkdir -p`:
    # no quoting problems for paths with spaces or shell metacharacters, works
    # on every platform, and a failure raises instead of being ignored.
    os.makedirs(outputdir, exist_ok=True)

    # ---- Figure 1: average sentiment and average score per interval -------
    sentiments_per_interval = []
    scores_per_interval = []
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # Scores of posts created in the interval by new users.
        scores = (dmt(posts)
                  .filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                          and firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate'])
                  .map(lambda p: p['Score'])
                  .getresults())
        # Compound sentiment of every qualifying answer, flattened to one list.
        sentiments = (dmt(posts)
                      .map(lambda p: [cachedsentiments[a['Id']]['compound']
                                      for a in p['Answers']
                                      # post lies in the interval
                                      if option_date_from <= p['CreationDate'] < option_date_to
                                      # post within DAYS_NEW_USER days of the owner's first contribution
                                      and firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate']
                                      # answer within DAYS_NEW_USER days of the post
                                      and p['CreationDate'] + new_user_window > a['CreationDate']])
                      .filter(lambda p: p != [])
                      .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                      .getresults())
        scores_per_interval.append(scores)
        sentiments_per_interval.append(sentiments)

    # Keep the sample sizes separately so that empty intervals can be plotted
    # as NaN without losing the count needed for the labels.
    sample_counts = [len(s) for s in sentiments_per_interval]
    data = [_mean_or_nan(s) for s in sentiments_per_interval]
    mean_scores = [_mean_or_nan(s) for s in scores_per_interval]

    print("Plotting ...")
    fig, ax = plt.subplots(figsize=FIG_SIZE)
    l1 = ax.plot(interval_starts, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(interval_starts, mean_scores, label="average score (votes)", color="red")
    plt.grid(True)
    for i, count in enumerate(sample_counts):
        # Annotate each point with its sample size; only the first label
        # carries the "n=" prefix, and empty intervals get no number.
        label = ("n=" if i == 0 else "") + (str(count) if count else "")
        ax.text(intervals[i][0], data[i], label, ha="center", va=_label_valign(data, i))
    plt.title("Average sentiments and score for new users")
    plt.xticks(rotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
    outfile = os.path.join(outputdir, "average_votes-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # ---- Figure 2: vote balance over time ----------------------------------
    votes = readVotes(folder)
    fig = plt.figure(figsize=FIG_SIZE)
    first_day = datetime(2010, 1, 1)
    ivs = [(first_day, datetime(year, 1, 1)) for year in range(2011, 2021)]

    # The ids of new users' posts in an interval do not depend on the vote
    # window, so compute them once per interval (on first use) instead of
    # rescanning all posts for every window.
    post_ids_by_interval = {}

    for (window_start, window_end) in ivs:
        print(window_start.strftime("%d-%m-%Y") + " to " + window_end.strftime("%d-%m-%Y"))
        ivvotes = dmt(votes).filter(lambda v: window_start <= v['CreationDate'] < window_end).getresults()
        scores = []
        for idx, (option_date_from, option_date_to) in enumerate(intervals):
            # Skip intervals that end after the vote window closes.
            if option_date_to > window_end:
                continue
            if idx not in post_ids_by_interval:
                # NOTE(review): this test uses `>=` while the sentiment/score
                # filters for figure 1 use `>`; kept unchanged -- confirm
                # which boundary is intended.
                intervalposts = (dmt(posts)
                                 .filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                         and firstcontrib[p['OwnerUserId']] + new_user_window >= p['CreationDate'])
                                 .getresults())
                post_ids_by_interval[idx] = set(dmt(intervalposts).map(lambda p: p['Id']).getresults())
            intervalpostsids = post_ids_by_interval[idx]

            intervalvotes = dmt(ivvotes).filter(lambda v: v['PostId'] in intervalpostsids).getresults()
            # VoteTypeId 2 counts +1 and 3 counts -1 (presumably up/down votes
            # in the data dump's schema); every other vote type is ignored.
            balance = sum(dmt(intervalvotes)
                          .map(lambda v: 1 if v['VoteTypeId'] == 2 else (-1 if v['VoteTypeId'] == 3 else 0))
                          .getresults())
            # Average per post; NaN when the interval has no new-user posts.
            intervalscore = balance / len(intervalpostsids) if len(intervalpostsids) != 0 else float("nan")
            scores.append(((option_date_from, option_date_to), intervalscore))
        plt.plot([iv[0] for iv, score in scores], [score for iv, score in scores],
                 label=str(window_start.year) + " - " + str(window_end.year))
    plt.title("Average score for new users over time")
    plt.xlabel("months")
    plt.ylabel("score")
    plt.legend(loc="upper right")
    plt.grid(True)
    outfile = os.path.join(outputdir, "average_votes_over_time-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
|
|
|
|
|
|
if __name__ == "__main__":
    # execute only if run as a script
    #
    # Command line: <folder> [-i<N>], where N is an integer from 1 to 12 that
    # defaults to 1. Any invalid input prints a message and exits with status 1.
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 1
    if len(sys.argv) >= 3:
        option = sys.argv[2]
        # Only the "-i<N>" option is understood; reject anything else first.
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)
|