From 813d5edf241582402db8baa019c1c66bb05fbc4a Mon Sep 17 00:00:00 2001 From: wea_ondara Date: Wed, 8 Apr 2020 12:28:52 +0200 Subject: [PATCH] wip --- common.py | 103 ++++++++++++++++++++++++++++++++++-------------------- its.py | 15 ++++---- 2 files changed, 73 insertions(+), 45 deletions(-) diff --git a/common.py b/common.py index 0933923..4c5b66b 100644 --- a/common.py +++ b/common.py @@ -1,7 +1,7 @@ import importlib -from threading import Thread, Lock - import matplotlib.pyplot as plt +from datetime import datetime +from threading import Thread, Lock from loader import dmt @@ -10,37 +10,66 @@ rprint = lambda text: print('\r' + text) DAYS_NEW_USER = 7 IMAGE_MAGICK = "magick" +CHANGE_DATE = datetime.fromisoformat("2018-08-21T21:00:00") def calc_intervals(posts, months=3): - firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], - "firstpost").getresults() + firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults() lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults() - # calc quarter beginning - firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) - if (firstpost.month - 1) % months != 0: - firstpost = firstpost.replace(month=firstpost.month - ((firstpost.month - 1) % months)) - lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) - if (lastpost.month - 1) % months != 0: - lastpost = lastpost.replace(month=lastpost.month - ((lastpost.month - 1) % months)) - # add 3 months to last post - if lastpost.month + months > 12: - lastpost = lastpost.replace(month=lastpost.month + months - 12, year=lastpost.year + 1) - else: - lastpost = lastpost.replace(month=lastpost.month + months) + # calc in months intervals from change date + f = CHANGE_DATE.replace(month=CHANGE_DATE.month) + while firstpost < f: + f = f.replace(year=f.year - (1 if f.month - months < 1 else 0)) + f = f.replace(month=(f.month - months + 12 - 1) % 12 + 1) + firstpost = f + + # calc in months intervals from change date + l = CHANGE_DATE.replace(month=CHANGE_DATE.month) + while lastpost > l: + l = l.replace(year=l.year + (1 if l.month + months > 12 else 0)) + l = l.replace(month=(l.month + months - 1) % 12 + 1) + lastpost = l + + # firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + # if (firstpost.month - 1) % months != 0: + # firstpost = firstpost.replace(month=firstpost.month - ((firstpost.month - 1) % months)) + # lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + # if (lastpost.month - 1) % months != 0: + # lastpost = lastpost.replace(month=lastpost.month - ((lastpost.month - 1) % months)) + # # add 3 months to last post + # if lastpost.month + months > 12: + # lastpost = lastpost.replace(month=lastpost.month + months - 12, year=lastpost.year + 1) + # else: + # lastpost = lastpost.replace(month=lastpost.month + months) cdate = firstpost intervals = [] while cdate < lastpost: nextmon = cdate.month + months nextquarter = cdate.replace(month=nextmon if nextmon <= 12 else nextmon - 12, year=cdate.year + (0 if nextmon <= 12 else 1)) - print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) - intervals.append((cdate, nextquarter)) + if cdate > firstpost and nextquarter < lastpost: # ignore first and last intervals as there is only partial data + print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) + intervals.append((cdate, nextquarter)) cdate = nextquarter return intervals +def difftime(date): + diff = (date.year - CHANGE_DATE.year) * 12 + diff += ((date.month - CHANGE_DATE.month) % 12) + if date.month - CHANGE_DATE.month < 0: + diff -= 12 + return diff + +# print(str(difftime(datetime.fromisoformat("2018-11-21T21:00:00"))) + ", 3") +# print(str(difftime(datetime.fromisoformat("2018-05-21T21:00:00"))) + ", -3") +# print(str(difftime(datetime.fromisoformat("2019-11-21T21:00:00"))) + ", 15") +# print(str(difftime(datetime.fromisoformat("2017-05-21T21:00:00"))) + ", -15") +# print(str(difftime(datetime.fromisoformat("2020-05-21T21:00:00"))) + ", 21") +# print(str(difftime(datetime.fromisoformat("2016-11-21T21:00:00"))) + ", -21") + + def imprt(file): spec = importlib.util.spec_from_file_location("module.name", file) foo = importlib.util.module_from_spec(spec) @@ -48,22 +77,22 @@ def imprt(file): return foo -class FigSaver(): - def __init__(self): - self.__lock = Lock() - self.__threads = [] - - def save(self, fig, path, **kwargs): - thread = Thread(target=self.__dosave, args=(fig, path, kwargs)) - with self.__lock: - self.__threads.append(thread) - thread.start() - - def __dosave(self, fig, path, kwargs): - fig.savefig(path, **kwargs) - plt.close(fig) - - def join(self): - with self.__lock: - for thread in self.__threads: - thread.join() +# class FigSaver(): +# def __init__(self): +# self.__lock = Lock() +# self.__threads = [] +# +# def save(self, fig, path, **kwargs): +# thread = Thread(target=self.__dosave, args=(fig, path, kwargs)) +# with self.__lock: +# self.__threads.append(thread) +# thread.start() +# +# def __dosave(self, fig, path, kwargs): +# fig.savefig(path, **kwargs) +# plt.close(fig) +# +# def join(self): +# with self.__lock: +# for thread in self.__threads: +# thread.join() diff --git a/its.py b/its.py index 378fe92..ccf5140 100644 --- a/its.py +++ b/its.py @@ -1,14 +1,13 @@ -import sys - import matplotlib.pyplot as plt import numpy as np import os import statsmodels.api as sm +import sys from datetime import datetime from datetime import timedelta from dateutil.relativedelta import relativedelta -from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER +from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, difftime from loader import load, dmt, cms from sentiments import readtoxleveltxt @@ -58,7 +57,7 @@ def main(folder, intervl): avgcount = np.mean([x for x in count if str(x) != "nan"]) stdcount = np.std([x for x in count if str(x) != "nan"]) for i in range(len(count)): - if str(count[i]) == "nan" or np.abs((count[i] - avgcount) / stdcount) > 3: + if str(count[i]) == "nan": # or np.abs((count[i] - avgcount) / stdcount) > 3: datasingle[i] = float("nan") data[i] = float("nan") count[i] = float("nan") @@ -120,7 +119,7 @@ def main(folder, intervl): file.write(str(res.summary())) fig = plt.figure(figsize=(16, 12)) - plt.plot([i[0] for i in intervals], data, label="average sentiment") + plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment") plt.grid(True) for i in range(len(data)): va = "center" @@ -139,12 +138,12 @@ def main(folder, intervl): va = "bottom" else: va = "top" - plt.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va) - plt.plot([intervals[i][0] for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")") + plt.text(difftime(intervals[i][0]), data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va) + plt.plot([difftime(intervals[i][0]) for i in range(len(datasingle)) for j in datasingle[i]], its2ols, label="sm single ITS (pvalues " + str(p2) + ")") # print("shape: " + str(np.shape(thresdata))) for (ti, t) in enumerate(thresholds): # print("shape1: " + str(np.shape(thresdata[ti]))) - plt.plot([thresiv[ti][i][0] for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months (pvalues " + str(thresp[ti]) + ")") + plt.plot([difftime(thresiv[ti][i][0]) for i in range(len(thresdata[ti])) for j in thresdata[ti][i]], thresols[ti], label="thres ITS " + str(t) + " months (pvalues " + str(thresp[ti]) + ")") plt.title("Average sentiments for new users") plt.xticks(rotation=90) plt.xlabel("months")