"""Interrupted time series (ITS) analysis of vote scores on posts by new users."""
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import os
|
|
import statsmodels.api as sm
|
|
import sys
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
from datetime import timedelta
|
|
from dateutil.relativedelta import relativedelta
|
|
|
|
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
|
|
from loader import load, dmt, cms, readVotes
|
|
from sentiments import readtoxleveltxt
|
|
|
|
# Colour names; not referenced anywhere in the visible part of this file.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']

# Half-widths (in months) of the windows around ``changedate`` for which a
# separate "threshold" ITS model is fitted in main().
thresholds = [6, 9, 12, 15]

# Intervals starting on or before this date are coded 0 in the ITS
# intervention indicator; later intervals are coded 1.
changedate = datetime.fromisoformat("2018-09-01T00:00:00")
|
|
|
|
|
|
# Intervals ending on or before this date are excluded from the analysis.
_MIN_INTERVAL_END = datetime.fromisoformat("2015-01-01T00:00:00")


def _fit_its(ivs, samples):
    """Fit the segmented OLS model used for the interrupted time series.

    Every single score becomes one observation with the regressors
    ``t`` (index of its interval within ``ivs``), ``x`` (0 if the interval
    starts on or before ``changedate``, else 1) and ``t * x``; a constant is
    added via ``sm.add_constant``.

    Args:
        ivs: sequence of ``(start, end)`` intervals.
        samples: sequence parallel to ``ivs``; each entry holds the scores
            of that interval.

    Returns:
        ``(res, fitted)`` where ``res`` is the statsmodels results object and
        ``fitted`` is the ``(n, 1)`` array of model predictions, one per
        observation, or ``None`` when there is no observation at all.
    """
    rows = [(idx, 0 if iv[0] <= changedate else 1, score)
            for idx, (iv, scores) in enumerate(zip(ivs, samples))
            for score in scores]
    if not rows:
        # Nothing to regress on; numpy/statsmodels would fail on empty input.
        return None
    t = np.array([r[0] for r in rows]).reshape(-1, 1)
    x = np.array([r[1] for r in rows]).reshape(-1, 1)
    y = np.array([r[2] for r in rows]).reshape(-1, 1)
    X = sm.add_constant(np.concatenate((t, x, np.multiply(t, x)), 1))
    res = sm.OLS(y, X).fit()
    fitted = X.dot(np.reshape(np.array(res.params), (-1, 1)))
    return res, fitted


def _sample_xs(ivs, samples):
    """Return one plot x-coordinate (interval start) per single score."""
    return [difftime(iv[0]) for iv, scores in zip(ivs, samples) for _ in scores]


def _label_valign(values, i):
    """Choose the vertical alignment of the label at point ``i`` of a line.

    Local maxima get "bottom", local minima "top"; interior points that are
    neither get "center". End points are compared with their only neighbour.
    A single point has no neighbour and is centred.
    """
    last = len(values) - 1
    if last == 0:
        return "center"
    if i == 0:
        return "bottom" if values[1] < values[0] else "top"
    if i == last:
        return "bottom" if values[i - 1] < values[i] else "top"
    if values[i - 1] < values[i] and values[i + 1] < values[i]:
        return "bottom"
    if values[i - 1] > values[i] and values[i + 1] > values[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Compute and plot the ITS of vote scores for posts by new users.

    Loads votes and posts from ``folder``, computes per interval the vote
    scores (see ``votescore``) of all posts created less than
    ``DAYS_NEW_USER`` days after their owner's first contribution, fits one
    ITS model over all intervals and one per entry of ``thresholds``, and
    writes the regression summaries and a plot to
    ``<folder>/output/votesits/``.

    Args:
        folder: data directory handed to ``readVotes`` and ``load``; output
            is written below it.
        intervl: interval size handed to ``calc_intervals`` (presumably in
            months — confirm against ``common.calc_intervals``); also part of
            the output file names.
    """
    votes = readVotes(folder)
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    all_intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("sorting votes by post ...")
    votesbypost = defaultdict(list)
    for v in votes:
        votesbypost[v['PostId']].append(v)
    rprint("sorting votes by post ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votesits/"
    # Equivalent of "mkdir -p" without going through a shell, so folder names
    # containing spaces or shell metacharacters are handled correctly.
    os.makedirs(outputdir, exist_ok=True)

    # Parallel lists: interval, its individual scores, and their average.
    # Intervals that are too early or contain no matching post are skipped.
    # NOTE: the original had a commented-out 3-sigma outlier filter on the
    # per-interval post count; it is not applied.
    intervals = []
    datasingle = []
    data = []
    for interval in all_intervals:
        (option_date_from, option_date_to) = interval
        if option_date_to <= _MIN_INTERVAL_END:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        filtered = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                      and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate'])
                    .map(lambda p: votescore(votesbypost[p['Id']], p))
                    .getresults())
        if len(filtered) == 0:
            continue
        intervals.append(interval)
        datasingle.append(filtered)
        data.append(np.average(filtered))

    print("Computing full ITS")
    full_fit = _fit_its(intervals, datasingle)
    if full_fit is None:
        print("no posts by new users in any interval; nothing to fit or plot")
        return
    res, its2ols = full_fit
    summary = str(res.summary())
    print("coef ols: " + str(res.params))
    print("sum ols: " + summary)
    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
        file.write(summary)

    print("Computing threshold ITS")
    # One entry per successfully fitted window: (months, intervals, scores, fitted).
    thresfits = []
    for months in thresholds:
        window = relativedelta(months=months)
        selected = [(iv, scores) for (iv, scores) in zip(intervals, datasingle)
                    if iv[0] >= changedate - window and iv[1] <= changedate + window]
        iv_sel = [iv for (iv, _) in selected]
        d_sel = [scores for (_, scores) in selected]
        fit = _fit_its(iv_sel, d_sel)
        if fit is None:
            # E.g. intervals longer than the window never fit inside it.
            print("no data within " + str(months) + " months of the change date; skipping")
            continue
        thres_res, thres_fitted = fit
        thresfits.append((months, iv_sel, d_sel, thres_fitted))
        with open(outputdir + "/summary_threshold" + str(months) + "-i" + str(intervl) + ".txt", "w") as file:
            file.write(str(thres_res.summary()))

    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(iv[0]) for iv in intervals], data, label="average vote score")
    plt.grid(True)
    # Annotate every point with the number of posts behind the average.
    for i, (iv, scores) in enumerate(zip(intervals, datasingle)):
        plt.text(difftime(iv[0]), data[i], ("n=" if i == 0 else "") + str(len(scores)),
                 ha="center", va=_label_valign(data, i))
    plt.plot(_sample_xs(intervals, datasingle), its2ols, label="sm single ITS")
    for (months, iv_sel, d_sel, thres_fitted) in thresfits:
        plt.plot(_sample_xs(iv_sel, d_sel), thres_fitted, label="thres ITS " + str(months) + " months")
    plt.title("Average vote score for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("vote score")
    plt.legend(loc="upper right")
    outfile = outputdir + "/average_votes-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
|
|
|
|
|
|
def votescore(votes, post):
    """Return the net vote score a post received shortly after its creation.

    A vote is counted when its ``PostId`` equals the post's ``Id`` and its
    ``CreationDate`` is earlier than the post's ``CreationDate`` plus
    ``DAYS_NEW_USER`` days. A counted vote with ``VoteTypeId`` 2 adds one,
    ``VoteTypeId`` 3 subtracts one, any other type contributes nothing.

    NOTE(review): 2 and 3 are presumably the up-/down-vote type ids of the
    data set — confirm against the data schema.

    Args:
        votes: iterable of vote records (mappings with ``PostId``,
            ``CreationDate`` and ``VoteTypeId``).
        post: post record (mapping with ``Id`` and ``CreationDate``).

    Returns:
        The summed score as an int; 0 when no vote is counted.
    """
    post_id = post['Id']
    # Loop-invariant: the end of the counting window is the same for every vote.
    cutoff = post['CreationDate'] + timedelta(days=DAYS_NEW_USER)
    counted = dmt(votes).filter(lambda v: v['PostId'] == post_id and cutoff > v['CreationDate']).getresults()
    return sum(1 if v['VoteTypeId'] == 2 else (-1 if v['VoteTypeId'] == 3 else 0) for v in counted)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point.
    # Command line: <folder> [-i<N>], where N is an integer from 1 to 12
    # that is passed to main() as the interval size (default 1).
    usage = sys.argv[0] + " <folder> [-i<interval 1-12>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        # The value follows the flag directly, e.g. "-i3".
        try:
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)
|