This commit is contained in:
wea_ondara
2020-07-19 11:22:08 +02:00
parent 4fba514e10
commit d018ead5b5

224
questionits.py Normal file
View File

@@ -0,0 +1,224 @@
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
import sys
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt
# Named plot colors (not referenced by the code visible in this file).
colors = [
    'red',
    'green',
    'blue',
    'orange',
    'deeppink',
]
# Window half-widths in months; main() fits one regression per value on the
# intervals lying within +/- that many months of `changedate`.
thresholds = [6, 9, 12, 15]
# Intervention date of the interrupted time series: intervals starting after
# this instant get the "post-change" indicator set to 1.
changedate = datetime(2018, 9, 1)
def _count_new_user_posts(posts, firstcontrib, postbyuser, date_from, date_to, first_post):
    """Count posts created in [date_from, date_to) by users who are still "new".

    A post qualifies when it was created less than DAYS_NEW_USER days after
    its owner's entry in `firstcontrib`. With `first_post` true only posts
    equal to the owner's first entry in `postbyuser` are counted; with
    `first_post` false only the remaining ones are.
    """
    def matches(p):
        owner = p['OwnerUserId']
        created = p['CreationDate']
        if not (date_from <= created < date_to):
            return False
        if not (firstcontrib[owner] + timedelta(days=DAYS_NEW_USER) > created):
            return False
        first = postbyuser[owner][0]
        return first == p if first_post else first != p

    return len(dmt(posts).filter(matches).getresults())


def _fit_its(intervals, datasingle):
    """Fit the interrupted-time-series OLS model for one series.

    `datasingle[i]` holds the observations belonging to `intervals[i]`.
    The regressors are the interval index t, an indicator x that is 1 for
    intervals starting after `changedate`, and the interaction t * x; a
    constant column is added via `sm.add_constant`.

    Returns a tuple (fit results, fitted values as an (n, 1) array).
    """
    t = np.reshape(np.array([i for i, obs in enumerate(datasingle) for _ in obs]), (-1, 1))
    x = np.reshape(np.array([(0 if intervals[i][0] <= changedate else 1)
                             for i, obs in enumerate(datasingle) for _ in obs]), (-1, 1))
    y = np.reshape(np.array([v for obs in datasingle for v in obs]), (-1, 1))
    X = sm.add_constant(np.concatenate((t, x, np.multiply(t, x)), 1))
    res = sm.OLS(y, X).fit()
    fitted = X.dot(np.reshape(np.array(res.params), (-1, 1)))
    return res, fitted


def _write_summary(path, res):
    """Write the textual summary of the fit `res` to `path`."""
    with open(path, "w") as file:
        file.write(str(res.summary()))


def main(folder, intervl):
    """Run the interrupted-time-series analysis on per-interval post counts.

    For every interval returned by `calc_intervals(posts, intervl)` that ends
    after 2015-01-01, the posts of new users are counted separately for each
    user's first post and for their later posts. Both series are fitted with
    an OLS model around `changedate`; the first-post series is additionally
    fitted on windows of +/- `thresholds` months around that date.

    Output goes to `<folder>/output/questionits/`: one summary text file per
    fit and a PNG plotting both series together with their full fits.

    :param folder: data folder passed to `load`; also the base of the output
                   directory
    :param intervl: interval size passed to `calc_intervals`; also embedded
                    in the output file names
    """
    users, posts, firstcontrib, sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("sorting posts by user ...")
    postbyuser = defaultdict(list)
    for p in posts:
        postbyuser[p['OwnerUserId']].append(p)
    rprint("sorting posts by user ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/questionits/"
    # Create the directory without going through a shell: the path comes from
    # the command line and may contain spaces or shell metacharacters.
    os.makedirs(outputdir, exist_ok=True)

    earliest_end = datetime.fromisoformat("2015-01-01T00:00:00")
    kept_intervals = []
    data1 = []
    data2 = []
    datasingle1 = []
    datasingle2 = []
    for interval in intervals:
        (option_date_from, option_date_to) = interval
        # Intervals ending on or before the cut-off take no part in the analysis.
        if option_date_to <= earliest_end:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        n_first = _count_new_user_posts(posts, firstcontrib, postbyuser,
                                        option_date_from, option_date_to, True)
        n_later = _count_new_user_posts(posts, firstcontrib, postbyuser,
                                        option_date_from, option_date_to, False)

        kept_intervals.append(interval)
        # Each interval contributes exactly one observation (the count); the
        # list-of-lists shape is what the regression helper iterates over.
        datasingle1.append([n_first])
        datasingle2.append([n_later])
        data1.append(np.average([n_first]))
        data2.append(np.average([n_later]))
    intervals = kept_intervals

    if not intervals:
        print("no intervals after 2015-01-01, nothing to analyse")
        return

    print("Computing full ITS1")
    res, its2ols1 = _fit_its(intervals, datasingle1)
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    _write_summary(outputdir + "/summary1-i" + str(intervl) + ".txt", res)

    print("Computing full ITS2")
    res, its2ols2 = _fit_its(intervals, datasingle2)
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    _write_summary(outputdir + "/summary2-i" + str(intervl) + ".txt", res)

    print("Computing threshold ITS")
    for ti in thresholds:
        window_start = changedate - relativedelta(months=ti)
        window_end = changedate + relativedelta(months=ti)
        # Only the first-post series is analysed per threshold window.
        window = [(iv, obs) for (iv, obs) in zip(intervals, datasingle1)
                  if iv[0] >= window_start and iv[1] <= window_end]
        if not window:
            # Nothing to regress on (e.g. intervals wider than the window).
            print("no intervals within " + str(ti) + " months of the change date, skipping")
            continue
        iv = [i for (i, obs) in window]
        d = [obs for (i, obs) in window]
        res, _ = _fit_its(iv, d)
        _write_summary(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", res)

    fig = plt.figure(figsize=FIG_SIZE)
    xs = [difftime(i[0]) for i in intervals]
    plt.plot(xs, data1, label="average #1st-questions")
    plt.plot(xs, data2, label="average #2nd+-questions")
    plt.grid(True)
    # One x value per observation so the x data lines up with the fitted values.
    plt.plot([difftime(i[0]) for (i, obs) in zip(intervals, datasingle1) for _ in obs],
             its2ols1, label="sm single ITS 1")
    plt.plot([difftime(i[0]) for (i, obs) in zip(intervals, datasingle2) for _ in obs],
             its2ols2, label="sm single ITS 2")
    plt.title("Average #1st-questions of new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("#1st questions")
    plt.legend(loc="upper right")
    outfile = outputdir + "/average_questions-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
if __name__ == "__main__":
    # Script entry point. Arguments: <folder> and an optional -i<N> with
    # N an integer in 1..12 (defaults to 1); N is handed to main() unchanged.
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 1
    if len(sys.argv) >= 3:
        # Anything other than an -i option in second position is rejected.
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        try:
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)