from datetime import datetime
from datetime import timedelta
import sys
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from loader import load, dmt


def printnoln(text):
    """Print *text* without a trailing newline and flush stdout immediately."""
    print(text, end='', flush=True)


def rprint(text):
    """Print *text* preceded by a carriage return, overwriting the current line."""
    print('\r' + text)


# Posts created more than this many days after the owner's first contribution
# are skipped in main().
DAYS_NEW_USER = 7
# Not referenced anywhere in this file; kept for any external users.
OLD_USER_YEAR = 3

# Shared VADER analyser instance used by computeToxLevel().
analyser = SentimentIntensityAnalyzer()


def main(folder, option_date_from, option_date_to, option_posts):
    """Plot histograms of sentiment scores for answers on new users' posts.

    Users created in ``[option_date_from, option_date_to)`` are selected, then
    for each of their posts (limited per user by ``option_posts`` and by the
    ``DAYS_NEW_USER`` window) every entry of ``post['Answers']`` is scored with
    VADER. The 'neg', 'neu', 'pos' and 'compound' scores are drawn as four
    histograms and saved as a PNG under ``output/analyze/``.

    Args:
        folder: Path of the dataset folder handed to ``loader.load``; its last
            path component is used in the output file name.
        option_date_from: Inclusive lower bound (datetime) on user creation.
        option_date_to: Exclusive upper bound (datetime) on user creation.
        option_posts: Maximum number of posts considered per user.
    """
    # NOTE(review): the exact shape of load()'s results is defined in loader;
    # firstcontrib is presumably a mapping user id -> datetime of the first
    # contribution - confirm. The fourth result is not needed here.
    users, posts, firstcontrib, _sumcontrib = load(folder)

    # filter users by option_date_from <= creation date < option_date_to
    newusers = dmt(users).filter(
        lambda u: option_date_from <= u['CreationDate'] < option_date_to,
        "filtering users by creation").getresults()
    newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults())

    # get posts owned by the filtered users
    newposts = dmt(posts).filter(
        lambda p: p['OwnerUserId'] in newuserids,
        "filter posts by first contrib").getresults()

    # compute toxic levels
    print("computing toxic levels")
    toxlevels = defaultdict(list)      # user id -> list of VADER score dicts
    searchedposts = defaultdict(int)   # user id -> number of posts counted so far
    total = len(newposts)
    new_user_window = timedelta(days=DAYS_NEW_USER)
    for count, post in enumerate(newposts, start=1):
        # progress: refresh in place every 100 posts, finish with a full line
        progress = "post #" + str(count) + "/" + str(total)
        if count % 100 == 0:
            printnoln("\r" + progress)
        if count == total:
            rprint(progress)

        userid = post['OwnerUserId']

        # skip posts written after the owner's "new user" window has elapsed
        if firstcontrib[userid] + new_user_window < post['CreationDate']:
            continue

        # no more than option_posts posts from one user
        searchedposts[userid] += 1
        if searchedposts[userid] > option_posts:
            continue

        for answer in post['Answers']:
            toxlevels[userid].append(computeToxLevel(answer['Body']))

    # Flatten once and reuse for all four histograms.
    scores = flatmap(toxlevels.values())
    bins = np.linspace(-1, 1, 2 * 100)

    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    panels = (
        (axs[0, 0], 'Neg', 'neg'),
        (axs[1, 0], 'Neu', 'neu'),
        (axs[0, 1], 'Pos', 'pos'),
        (axs[1, 1], 'Compound', 'compound'),
    )
    for ax, title, key in panels:
        ax.set_title(title)
        ax.hist([score[key] for score in scores], bins)

    # Create the output directory without shelling out (portable, and errors
    # surface as exceptions instead of being ignored).
    outdir = "output/analyze/"
    os.makedirs(outdir, exist_ok=True)

    # normpath drops a trailing separator so "data/foo/" still yields "foo".
    dataset = os.path.basename(os.path.normpath(folder))
    pltfile = (outdir + dataset
               + "_" + option_date_from.strftime("%d-%m-%Y")
               + "_" + option_date_to.strftime("%d-%m-%Y")
               + "_" + str(option_posts) + ".png")
    plt.savefig(pltfile)
    plt.close(fig)


def computeToxLevel(text):
    """Return the VADER polarity scores computed by ``analyser`` for *text*."""
    return analyser.polarity_scores(text)


def flatmap(arr):
    """Flatten one nesting level: return a list of the items of each sub-iterable."""
    return [item for sublist in arr for item in sublist]


def _parse_date_arg(value, usage):
    """Parse a ``%d-%m-%Y`` command-line date; print usage and exit(1) if invalid."""
    try:
        return datetime.strptime(value, "%d-%m-%Y")
    except ValueError:
        print(value + " is not a valid date")
        print(usage)
        sys.exit(1)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [--from <%d-%m-%Y>] [--to <%d-%m-%Y>] [--posts <#posts e.g. 2>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    # defaults: users created in the last ~3 months, 2 posts per user
    today = datetime.today()
    consider_date_from = today - timedelta(days=3 * 30)
    consider_date_to = today
    consider_posts = 2

    # The loop stops one short of the end so that an option always has a
    # value after it; unknown arguments are skipped.
    i = 2
    while i < len(sys.argv) - 1:
        if sys.argv[i] == "--from":
            i += 1
            consider_date_from = _parse_date_arg(sys.argv[i], usage)
        elif sys.argv[i] == "--to":
            i += 1
            consider_date_to = _parse_date_arg(sys.argv[i], usage)
        elif sys.argv[i] == "--posts":
            i += 1
            if not sys.argv[i].isdigit():
                print(sys.argv[i] + " is not a number")
                print(usage)
                sys.exit(1)
            consider_posts = int(sys.argv[i])
        i += 1

    # An option left as the very last argument has no value; report it
    # instead of silently ignoring it.
    if i == len(sys.argv) - 1 and sys.argv[i] in ("--from", "--to", "--posts"):
        print(sys.argv[i] + " requires a value")
        print(usage)
        sys.exit(1)

    main(folder, consider_date_from, consider_date_to, consider_posts)