This commit is contained in:
wea_ondara
2020-04-08 12:28:52 +02:00
parent e926ea5b22
commit 813d5edf24
2 changed files with 73 additions and 45 deletions

103
common.py
View File

@@ -1,7 +1,7 @@
import importlib import importlib
from threading import Thread, Lock
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from datetime import datetime
from threading import Thread, Lock
from loader import dmt from loader import dmt
@@ -10,37 +10,66 @@ rprint = lambda text: print('\r' + text)
# NOTE(review): presumably the number of days after sign-up during which a
# user still counts as "new" -- confirm against the callers.
DAYS_NEW_USER = 7
# NOTE(review): presumably the name of the ImageMagick executable -- confirm.
IMAGE_MAGICK = "magick"
# Anchor date: calc_intervals() aligns its interval boundaries to it and
# difftime() reports month offsets relative to it.
CHANGE_DATE = datetime.fromisoformat("2018-08-21T21:00:00")
def _shift_months(date, delta):
    """Return ``date`` moved by ``delta`` calendar months (``delta`` may be negative).

    Day and time-of-day are kept. ``datetime.replace`` raises ``ValueError``
    if the day does not exist in the target month.
    """
    year, month_index = divmod(date.year * 12 + (date.month - 1) + delta, 12)
    return date.replace(year=year, month=month_index + 1)


def calc_intervals(posts, months=3):
    """Build ``months``-long intervals aligned to ``CHANGE_DATE`` that span ``posts``.

    Boundaries sit at ``CHANGE_DATE`` plus or minus whole multiples of
    ``months``. The first and the last aligned interval are dropped because
    they are only partially covered by data.

    Args:
        posts: sequence of mappings with a ``'CreationDate'`` datetime entry.
        months: interval length in calendar months; must be at least 1.

    Returns:
        A list of ``(start, end)`` datetime tuples in chronological order.

    Raises:
        ValueError: if ``months`` is smaller than 1 (the stepping loops would
            never advance).
    """
    if months < 1:
        raise ValueError("months must be >= 1")

    # NOTE(review): dmt(...).reduce(...) is a project helper; presumably a
    # chunked fold where the second lambda merges partial results -- confirm.
    firstpost = dmt(posts).reduce(
        lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'],
        lambda acc, e: acc if acc < e else e,
        lambda: posts[0]['CreationDate'],
        "firstpost",
    ).getresults()
    lastpost = dmt(posts).reduce(
        lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'],
        lambda acc, e: acc if acc > e else e,
        lambda: posts[0]['CreationDate'],
        "lastpost",
    ).getresults()

    # Walk backwards from the change date until the boundary is not after the
    # earliest post.
    lower = CHANGE_DATE
    while firstpost < lower:
        lower = _shift_months(lower, -months)
    firstpost = lower

    # Walk forwards from the change date until the boundary is not before the
    # latest post.
    upper = CHANGE_DATE
    while lastpost > upper:
        upper = _shift_months(upper, months)
    lastpost = upper

    cdate = firstpost
    intervals = []
    while cdate < lastpost:
        nextquarter = _shift_months(cdate, months)
        # ignore first and last intervals as there is only partial data
        if cdate > firstpost and nextquarter < lastpost:
            print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
            intervals.append((cdate, nextquarter))
        cdate = nextquarter
    return intervals
def difftime(date):
    """Return the signed offset of ``date`` from ``CHANGE_DATE`` in calendar months.

    Only the year and month fields are compared; day and time-of-day are
    ignored. The result is negative when ``date`` falls in an earlier month
    than ``CHANGE_DATE`` and positive when it falls in a later one.
    """
    years_apart = date.year - CHANGE_DATE.year
    months_apart = date.month - CHANGE_DATE.month
    return years_apart * 12 + months_apart
# print(str(difftime(datetime.fromisoformat("2018-11-21T21:00:00"))) + ", 3")
# print(str(difftime(datetime.fromisoformat("2018-05-21T21:00:00"))) + ", -3")
# print(str(difftime(datetime.fromisoformat("2019-11-21T21:00:00"))) + ", 15")
# print(str(difftime(datetime.fromisoformat("2017-05-21T21:00:00"))) + ", -15")
# print(str(difftime(datetime.fromisoformat("2020-05-21T21:00:00"))) + ", 21")
# print(str(difftime(datetime.fromisoformat("2016-11-21T21:00:00"))) + ", -21")
def imprt(file): def imprt(file):
spec = importlib.util.spec_from_file_location("module.name", file) spec = importlib.util.spec_from_file_location("module.name", file)
foo = importlib.util.module_from_spec(spec) foo = importlib.util.module_from_spec(spec)
@@ -48,22 +77,22 @@ def imprt(file):
return foo return foo
class FigSaver(): # class FigSaver():
def __init__(self): # def __init__(self):
self.__lock = Lock() # self.__lock = Lock()
self.__threads = [] # self.__threads = []
#
def save(self, fig, path, **kwargs): # def save(self, fig, path, **kwargs):
thread = Thread(target=self.__dosave, args=(fig, path, kwargs)) # thread = Thread(target=self.__dosave, args=(fig, path, kwargs))
with self.__lock: # with self.__lock:
self.__threads.append(thread) # self.__threads.append(thread)
thread.start() # thread.start()
#
def __dosave(self, fig, path, kwargs): # def __dosave(self, fig, path, kwargs):
fig.savefig(path, **kwargs) # fig.savefig(path, **kwargs)
plt.close(fig) # plt.close(fig)
#
def join(self): # def join(self):
with self.__lock: # with self.__lock:
for thread in self.__threads: # for thread in self.__threads:
thread.join() # thread.join()

15
its.py
View File

@@ -1,14 +1,13 @@
import sys
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import os import os
import statsmodels.api as sm import statsmodels.api as sm
import sys
from datetime import datetime from datetime import datetime
from datetime import timedelta from datetime import timedelta
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, difftime
from loader import load, dmt, cms from loader import load, dmt, cms
from sentiments import readtoxleveltxt from sentiments import readtoxleveltxt
@@ -58,7 +57,7 @@ def main(folder, intervl):
avgcount = np.mean([x for x in count if str(x) != "nan"]) avgcount = np.mean([x for x in count if str(x) != "nan"])
stdcount = np.std([x for x in count if str(x) != "nan"]) stdcount = np.std([x for x in count if str(x) != "nan"])
for i in range(len(count)): for i in range(len(count)):
if str(count[i]) == "nan" or np.abs((count[i] - avgcount) / stdcount) > 3: if str(count[i]) == "nan": # or np.abs((count[i] - avgcount) / stdcount) > 3:
datasingle[i] = float("nan") datasingle[i] = float("nan")
data[i] = float("nan") data[i] = float("nan")
count[i] = float("nan") count[i] = float("nan")
@@ -120,7 +119,7 @@ def main(folder, intervl):
file.write(str(res.summary())) file.write(str(res.summary()))
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
plt.plot([i[0] for i in intervals], data, label="average sentiment") plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment")
plt.grid(True) plt.grid(True)
for i in range(len(data)): for i in range(len(data)):
va = "center" va = "center"
@@ -139,12 +138,12 @@ def main(folder, intervl):
va = "bottom" va = "bottom"
else: else:
va = "top" va = "top"
plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va) plt.text(difftime(intervals[i][0]), data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")") plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")")
# print("shape: " + str(np.shape(thresdata))) # print("shape: " + str(np.shape(thresdata)))
for (ti, t) in enumerate(thresholds): for (ti, t) in enumerate(thresholds):
# print("shape1: " + str(np.shape(thresdata[ti]))) # print("shape1: " + str(np.shape(thresdata[ti])))
plt.plot([thresiv[ti][i][0] for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months (pvalues " + str(thresp[ti]) + ")") plt.plot([difftime(thresiv[ti][i][0]) for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months (pvalues " + str(thresp[ti]) + ")")
plt.title("Average sentiments for new users") plt.title("Average sentiments for new users")
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.xlabel("months") plt.xlabel("months")