wip
This commit is contained in:
235
itsnew.py
Normal file
235
itsnew.py
Normal file
@@ -0,0 +1,235 @@
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import os
|
||||
import statsmodels.api as sm
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, CHANGE_DATE, difftime
|
||||
from loader import load, dmt, cms
|
||||
from sentiments import readtoxleveltxt
|
||||
|
||||
# Colour names for plotting; not referenced anywhere else in the visible part of this file.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||
# Window half-widths in months: main() fits one ITS model per entry, using only the
# intervals that lie within CHANGE_DATE -/+ that many months.
thresholds = [6, 9, 12, 15]
|
||||
|
||||
|
||||
def _fit_its(ivs, groups):
    """Fit the interrupted-time-series OLS model to grouped observations.

    ``groups[k]`` holds the individual observations belonging to interval
    ``ivs[k]``. Every observation becomes one regression row with the
    regressors: interval index ``k`` (b1), a dummy that is 0 while the
    interval start is at or before ``CHANGE_DATE`` and 1 afterwards (b2),
    and the product of both (b3). ``sm.add_constant`` supplies b0.

    Returns a tuple ``(fit_result, design_matrix)``.
    """
    t = np.reshape(np.array([k for k, group in enumerate(groups) for _ in group]), (-1, 1))
    x = np.reshape(
        np.array([(0 if ivs[k][0] <= CHANGE_DATE else 1) for k, group in enumerate(groups) for _ in group]),
        (-1, 1))
    X = np.concatenate((t, x, np.multiply(t, x)), 1)  # b1, b2, b3
    X = sm.add_constant(X)  # b0
    y = np.reshape(np.array([v for group in groups for v in group]), (-1, 1))
    return sm.OLS(y, X).fit(), X


def _predict(res, X):
    """Return the fitted values of ``res`` for design matrix ``X`` as a column vector."""
    return X.dot(np.reshape(np.array(res.params), (-1, 1)))


def _label_valign(values, i):
    """Pick the vertical alignment of the text label drawn at ``values[i]``.

    Local maxima get "bottom", local minima get "top", everything else
    "center". The end points are compared with their single neighbour; a
    series of length one has no neighbour and is centred.
    """
    last = len(values) - 1
    if last == 0:
        return "center"
    if i == 0:
        return "bottom" if values[1] < values[0] else "top"
    if i == last:
        return "bottom" if values[i - 1] < values[i] else "top"
    if values[i - 1] < values[i] and values[i + 1] < values[i]:
        return "bottom"
    if values[i - 1] > values[i] and values[i + 1] > values[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Run the interrupted time series (ITS) analysis of answer sentiment for new users.

    For every interval returned by ``calc_intervals(posts, intervl)`` that ends
    after 2015-01-01, the compound sentiment of the selected answers is
    collected, one OLS ITS model is fitted over all intervals and one per entry
    of ``thresholds``. Results are written to ``<folder>/output/itsnew/``:

    * ``summary-i<intervl>.txt`` and ``summary_threshold<T>-i<intervl>.txt``
    * ``average_sentiments-i<intervl>.png``
    * ``season-i<intervl>.png`` and ``season_postcount-i<intervl>.png``

    :param folder: data set folder; must contain ``output/sentiments.txt``
    :param intervl: interval size forwarded to ``calc_intervals``
    """
    _, posts, firstcontrib, _ = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(os.path.join(folder, "output", "sentiments.txt"))
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    # Create the directory without spawning a shell: `folder` comes straight
    # from the command line and may contain spaces or shell metacharacters.
    outputdir = os.path.join(folder, "output", "itsnew")
    os.makedirs(outputdir, exist_ok=True)

    cutoff = datetime.fromisoformat("2015-01-01T00:00:00")
    new_user_span = timedelta(days=DAYS_NEW_USER)

    # Only intervals that end after the cutoff and actually yield sentiment
    # values are kept; the three lists below stay index-aligned.
    kept_intervals = []
    datasingle = []  # per interval: list of individual sentiment values
    data = []        # per interval: average sentiment
    for interval in intervals:
        (option_date_from, option_date_to) = interval
        if option_date_to <= cutoff:
            continue
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
        # avg sentiments
        filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound']
                                              for a in p['Answers']
                                              # post in interval
                                              if option_date_from <= p['CreationDate'] < option_date_to
                                              # post created within DAYS_NEW_USER days of the owner's 1st contribution
                                              and firstcontrib[p['OwnerUserId']] + new_user_span > p['CreationDate']
                                              # answer within DAYS_NEW_USER days of post creation
                                              and p['CreationDate'] + new_user_span > a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())
        avg = np.average(filtered) if len(filtered) > 0 else float("nan")
        if np.isnan(avg):
            # nothing usable in this interval - leave it out of the series
            continue
        kept_intervals.append(interval)
        datasingle.append(filtered)
        data.append(avg)
    intervals = kept_intervals

    if not data:
        print("no interval with sentiment data after 2015-01-01 - nothing to analyse")
        return

    # deseason: average per position-modulo-12, shifted so the smallest offset is 0.
    # Buckets without any interval (series shorter than 12) stay NaN and are ignored.
    buckets = [[v for j, v in enumerate(data) if j % 12 == m] for m in range(12)]
    mins = [np.average(bucket) if bucket else float("nan") for bucket in buckets]
    lowest = np.nanmin(mins)
    mins = [m - lowest for m in mins]
    print("mins", mins)
    dsdatasingle = [[d - mins[i % 12] for d in samples] for i, samples in enumerate(datasingle)]

    print("Computing full ITS")
    res, X = _fit_its(intervals, dsdatasingle)
    print("coef ols: " + str(res.params))
    print("sum ols: " + str(res.summary()))
    # Shift the deseasoned fit back by the mean seasonal offset so that it can be
    # drawn on top of the raw averages.
    its2ols = _predict(res, X) + np.nanmean(mins)
    with open(os.path.join(outputdir, "summary-i" + str(intervl) + ".txt"), "w") as file:
        file.write(str(res.summary()))

    print("Computing threshold ITS")
    thresfits = []  # (months, intervals in window, their samples, fitted values)
    for months in thresholds:
        window_from = CHANGE_DATE - relativedelta(months=months)
        window_to = CHANGE_DATE + relativedelta(months=months)
        window = [(iv, samples) for (iv, samples) in zip(intervals, datasingle)
                  if iv[0] >= window_from and iv[1] <= window_to]
        iv = [w[0] for w in window]
        d = [w[1] for w in window]
        res, X = _fit_its(iv, d)
        thresfits.append((months, iv, d, _predict(res, X)))
        with open(os.path.join(outputdir, "summary_threshold" + str(months) + "-i" + str(intervl) + ".txt"), "w") as file:
            file.write(str(res.summary()))

    # x position of every single observation (one entry per sample, not per interval)
    sample_x = [difftime(iv[0]) for iv, samples in zip(intervals, datasingle) for _ in samples]

    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(iv[0]) for iv in intervals], data, label="average sentiment")
    plt.grid(True)
    for i, (iv, avg, samples) in enumerate(zip(intervals, data, datasingle)):
        # annotate every point with its sample size
        plt.text(difftime(iv[0]), avg, ("n=" if i == 0 else "") + str(len(samples)),
                 ha="center", va=_label_valign(data, i))
    plt.plot(sample_x, its2ols, label="sm single ITS")
    for (months, iv, d, fitted) in thresfits:
        plt.plot([difftime(iv[i][0]) for i, samples in enumerate(d) for _ in samples], fitted,
                 label="thres ITS " + str(months) + " months")
    plt.title("Average sentiments for new users")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
    outfile = os.path.join(outputdir, "average_sentiments-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # plot seasonality
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(sample_x, [mins[i % 12] for i, samples in enumerate(datasingle) for _ in samples], label="seasonality")
    plt.title("Average sentiments for new users - seasonality")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("sentiment - seasonality")
    plt.legend(loc="upper right")
    outfile = os.path.join(outputdir, "season-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # plot seasonality post count
    # Slice instead of indexing 0..11 so that a series shorter than 12 intervals
    # does not raise IndexError; i % 12 == i for every index of such a series.
    pcmins = [len(samples) for samples in datasingle[:12]]
    fewest = min(pcmins, default=0)
    pcmins = [m - fewest for m in pcmins]

    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot([difftime(iv[0]) for iv in intervals], [pcmins[i % 12] for i in range(len(datasingle))], label="seasonality")
    plt.title("post count for new users - seasonality")
    plt.xticks(rotation=90)
    plt.xlabel("months")
    plt.ylabel("post count - seasonality")
    plt.legend(loc="upper right")
    outfile = os.path.join(outputdir, "season_postcount-i" + str(intervl) + ".png")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # execute only if run as a script
    # Command line: <folder> is required; the optional second argument -i<n>
    # (n an integer from 1 to 12, default 1) is passed to main() as the interval.
    usage = sys.argv[0] + " <folder> [-i<1-12>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        try:
            # everything after the "-i" prefix must be an integer
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)
|
||||
Reference in New Issue
Block a user