This commit is contained in:
wea_ondara
2021-03-22 20:30:32 +01:00
parent 316fed8283
commit 52d7ddb7fc
9 changed files with 270 additions and 36 deletions

235
itsnew.py Normal file
View File

@@ -0,0 +1,235 @@
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
import sys
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
from loader import load, dmt, cms
from sentiments import readtoxleveltxt
# Colour names; not referenced by any code visible in this file.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
# Window half-widths in months: for each value, main() fits an additional ITS
# model restricted to intervals lying within +/- that many months of CHANGE_DATE.
thresholds = [6, 9, 12, 15]
def _fit_its(intervals, samples):
    """Fit the interrupted-time-series OLS model on per-interval samples.

    Every individual sample becomes one observation. The regressors are the
    index of the sample's interval (b1), a 0/1 dummy that is 0 while the
    interval start is <= CHANGE_DATE (b2), their product (b3), and a
    constant added by ``sm.add_constant`` (b0).

    :param intervals: sequence of (start, end) pairs, parallel to ``samples``
    :param samples: sequence of lists of values, one list per interval
    :return: tuple ``(res, X)`` of the fitted statsmodels result and the
        design matrix it was fitted on
    """
    t = np.reshape(np.array([i for i, values in enumerate(samples) for _ in values]), (-1, 1))
    x = np.reshape(np.array([(0 if intervals[i][0] <= CHANGE_DATE else 1)
                             for i, values in enumerate(samples) for _ in values]), (-1, 1))
    X = np.concatenate((t, x, np.multiply(t, x)), 1)  # b1, b2, b3
    X = sm.add_constant(X)  # b0
    y = np.reshape(np.array([v for values in samples for v in values]), (-1, 1))
    res = sm.OLS(y, X).fit()
    return res, X


def _label_valign(values, i):
    """Return the vertical alignment for the text label of point ``i``.

    A point above its neighbour(s) gets "bottom", a point below them gets
    "top", anything else "center". A lone point has no neighbours to compare
    with and gets "bottom".
    """
    last = len(values) - 1
    if last == 0:
        return "bottom"
    if i == 0:
        return "bottom" if values[1] < values[0] else "top"
    if i == last:
        return "bottom" if values[i - 1] < values[i] else "top"
    if values[i - 1] < values[i] and values[i + 1] < values[i]:
        return "bottom"
    if values[i - 1] > values[i] and values[i + 1] > values[i]:
        return "top"
    return "center"


def _finish_plot(fig, title, ylabel, outfile):
    """Apply the shared title/axis/legend styling, save the figure and close it."""
    plt.title(title)
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel(ylabel)
    plt.legend(loc="upper right")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def main(folder, intervl):
    """Run the interrupted-time-series analysis of answer sentiment for new users.

    For every interval produced by ``calc_intervals(posts, intervl)`` that ends
    after 2015-01-01, the compound sentiment of answers to qualifying posts is
    collected. A post qualifies when it was created inside the interval and
    less than DAYS_NEW_USER days after ``firstcontrib`` of its owner; an answer
    qualifies when it was created less than DAYS_NEW_USER days after the post.
    Intervals without any such answer are dropped.

    An OLS model is then fitted on the seasonally adjusted samples of all
    remaining intervals, plus one model per entry of ``thresholds`` on the raw
    samples of the intervals around CHANGE_DATE.

    Written below ``<folder>/output/itsnew/``: ``summary-i<intervl>.txt``,
    ``summary_threshold<t>-i<intervl>.txt`` for each threshold, and the plots
    ``average_sentiments-i<intervl>.png``, ``season-i<intervl>.png`` and
    ``season_postcount-i<intervl>.png``.

    :param folder: data set directory; must contain ``output/sentiments.txt``
    :param intervl: interval size handed to ``calc_intervals`` (the command
        line restricts it to 1-12; presumably months -- confirm in ``common``)
    """
    # Only posts and firstcontrib are needed from the loaded data set.
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/itsnew/"
    # os.makedirs instead of shelling out to "mkdir -p": safe for paths with
    # spaces or shell metacharacters, and portable.
    os.makedirs(outputdir, exist_ok=True)

    cutoff = datetime.fromisoformat("2015-01-01T00:00:00")
    data = []        # average sentiment per interval (nan = no data)
    datasingle = []  # individual sentiment values per interval (nan = skipped)
    for (option_date_from, option_date_to) in intervals:
        if option_date_to <= cutoff:
            datasingle.append(float("nan"))
            data.append(float("nan"))
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        # sentiments of all qualifying answers in this interval
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              if option_date_from <= p['CreationDate'] < option_date_to  # post in interval
                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']  # post created within DAYS_NEW_USER days of 1st contrib
                                              and p['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']])  # answer within DAYS_NEW_USER days of post creation
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        datasingle.append(filtered)
        data.append(np.average(filtered) if len(filtered) > 0 else float("nan"))

    # Drop every interval without data (skipped by the cutoff or no matching answer).
    kept = [i for i, avg in enumerate(data) if not np.isnan(avg)]
    intervals = [intervals[i] for i in kept]
    datasingle = [datasingle[i] for i in kept]
    data = [data[i] for i in kept]
    if not data:
        print("no sentiment data for new users in any interval")
        return

    # deseason: average of every 12th interval, shifted so the smallest offset is 0
    # NOTE(review): the period of 12 is applied to list positions after empty
    # intervals were removed; it matches calendar months only for gap-free
    # one-month intervals -- confirm this is intended for intervl != 1.
    mins = [np.average(data[i::12]) for i in range(12)]
    lowest = min(mins)
    mins = [m - lowest for m in mins]
    print("mins", mins)
    dsdatasingle = [[d - mins[i % 12] for d in values] for i, values in enumerate(datasingle)]

    print("Computing full ITS")
    res, X = _fit_its(intervals, dsdatasingle)
    summary_text = str(res.summary())
    print("coef ols: " + str(res.params))
    print("sum ols: " + summary_text)
    coef2ols = np.reshape(np.array(res.params), (-1, 1))
    # Fitted values, shifted back up by the average seasonal offset.
    its2ols = X.dot(coef2ols) + np.average(mins)
    with open(outputdir + "/summary-i" + str(intervl) + ".txt", "w") as file:
        file.write(summary_text)

    thresdata = []
    thresols = []
    thresiv = []
    print("Computing threshold ITS")
    for ti in thresholds:
        # intervals lying completely within +/- ti months of the change date
        z = [(i, x) for (i, x) in zip(intervals, datasingle)
             if i[0] >= CHANGE_DATE - relativedelta(months=ti) and i[1] <= CHANGE_DATE + relativedelta(months=ti)]
        iv = [i for (i, x) in z]
        d = [x for (i, x) in z]
        res, X = _fit_its(iv, d)
        coefthresols = np.reshape(np.array(res.params), (-1, 1))
        thresols.append(X.dot(coefthresols))
        thresiv.append(iv)
        thresdata.append(d)
        with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file:
            file.write(str(res.summary()))

    # x position of every individual sample (one entry per sample, not per interval)
    sample_x = [difftime(intervals[i][0]) for i, values in enumerate(datasingle) for _ in values]

    # plot average sentiment with the fitted ITS lines
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment")
    plt.grid(True)
    for i in range(len(data)):
        # label each point with its sample count
        plt.text(difftime(intervals[i][0]), data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])),
                 ha="center", va=_label_valign(data, i))
    plt.plot(sample_x, its2ols, label="sm single ITS")
    for (ti, t) in enumerate(thresholds):
        plt.plot([difftime(thresiv[ti][i][0]) for i in range(len(thresdata[ti])) for j in thresdata[ti][i]],
                 thresols[ti], label="thres ITS " + str(t) + " months")
    _finish_plot(fig, "Average sentiments for new users", "sentiment",
                 outputdir + "/average_sentiments-i" + str(intervl) + ".png")

    # plot seasonality
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(sample_x, [mins[i % 12] for i, values in enumerate(datasingle) for _ in values], label="seasonality")
    _finish_plot(fig, "Average sentiments for new users - seasonality", "sentiment - seasonality",
                 outputdir + "/season-i" + str(intervl) + ".png")

    # plot seasonality post count (sample counts of the first up-to-12 intervals, repeated)
    # Slicing instead of indexing 0..11 keeps this working with fewer than 12 intervals.
    pcmins = [len(values) for values in datasingle[:12]]
    lowest_count = min(pcmins)
    pcmins = [m - lowest_count for m in pcmins]
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(i[0]) for i in intervals], [pcmins[i % 12] for i in range(len(datasingle))], label="seasonality")
    _finish_plot(fig, "post count for new users - seasonality", "post count - seasonality",
                 outputdir + "/season_postcount-i" + str(intervl) + ".png")
if __name__ == "__main__":
    # execute only if run as a script
    # Command line: <folder> [-i<interval>]
    #   <folder>       existing directory, passed to main() as the data set folder
    #   -i<interval>   optional integer from 1 to 12 (default 1), passed to main() as intervl
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            # the value is glued to the flag, e.g. "-i3"
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)

    main(folder, interval)