"""Plot histograms of post counts for users registered in each time interval.

Run as a script with the data folder as the only argument. For every interval
returned by ``calc_intervals`` a PNG histogram is written to
``<folder>/output``.
"""
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
import math
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from common import calc_intervals
from loader import load, dmt, cms


def printnoln(text):
    """Print *text* without a trailing newline and flush stdout immediately."""
    print(text, end='', flush=True)


def rprint(text):
    """Print *text* preceded by a carriage return."""
    print('\r' + text)


DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']


def main(folder):
    """Write one post-count histogram per interval for the data in *folder*.

    For each ``(from, to)`` interval produced by ``calc_intervals(posts)``,
    the users whose ``CreationDate`` lies in ``[from, to)`` are selected, the
    number of posts owned by each of them is counted, and a log-scaled
    histogram of those counts is saved as
    ``<folder>/output/posthist_<dataset>_<from>_<to>.png``.

    Args:
        folder: Path of the dataset directory passed to ``load``. Its last
            path component is used as the dataset name in output file names.
    """
    users, posts, _firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts)

    # Create the output directory without going through a shell, so folder
    # names containing spaces or shell metacharacters are handled correctly.
    outdir = os.path.join(folder, "output")
    os.makedirs(outdir, exist_ok=True)
    # normpath drops a trailing slash so the dataset name is never empty.
    dataset = os.path.basename(os.path.normpath(folder))

    for (option_date_from, option_date_to) in intervals:
        date_from = option_date_from.strftime("%d-%m-%Y")
        date_to = option_date_to.strftime("%d-%m-%Y")
        print((date_from, date_to))

        # Select users with option_date_from <= creation date < option_date_to
        # (the upper bound is exclusive), then the posts owned by those users.
        newusers = set(
            dmt(users)
            .filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to,
                    "filtering users by creation")
            .map(lambda u: u['Id'], "getting user ids")
            .getresults()
        )
        newposts = (
            dmt(posts)
            .filter(lambda p: p['OwnerUserId'] in newusers, "filtering posts by users")
            .getresults()
        )

        # Number of posts per owning user; users without posts do not appear.
        postcounts = defaultdict(int)
        for post in newposts:
            postcounts[post['OwnerUserId']] += 1

        histfilename = os.path.join(
            outdir, "posthist_" + dataset + "_" + date_from + "_" + date_to)

        histdata = list(postcounts.values())
        fig = plt.figure(figsize=(16, 12))
        # NOTE(review): the bin edges are 0..max, so the last bin is closed on
        # both sides and counts max-1 and max share it -- confirm intended.
        plt.hist(histdata, range(max(histdata, default=0) + 1))
        plt.yscale('log')
        # NOTE(review): a zero lower limit is not representable on a log
        # axis -- confirm this call has the desired effect.
        plt.ylim(bottom=0)
        plt.title("Histogram for user post count registered between "
                  + date_from + " and " + date_to)
        fig.savefig(histfilename + ".png", bbox_inches='tight')
        plt.close(fig)


def computeToxLevel(text):
    """Return the result of ``analyser.polarity_scores`` for *text*."""
    return analyser.polarity_scores(text)


def flatmap(arr):
    """Flatten one nesting level: return a list of the items of each sublist."""
    return [item for sublist in arr for item in sublist]


def dumptoxlevels(lvls, filename):
    """Write *lvls* to *filename* as an importable Python module.

    The generated file imports ``defaultdict`` and assigns the textual
    representation of *lvls* to ``toxlevels``.

    Args:
        lvls: Mapping to dump; presumably a ``defaultdict(list)``, given the
            import written to the file -- verify against callers.
        filename: Path of the file to (over)write.
    """
    # str() of a defaultdict(list) names its factory as "<class 'list'>",
    # which is not valid Python source; rewrite it to the bare name ``list``.
    text = str(lvls).replace("<class 'list'>", "list", 1)
    with open(filename, "w") as file:
        file.write("from collections import defaultdict\n\n")
        file.write("toxlevels = " + text + "\n")


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    main(folder)