From 0536f5db5f6bfe61d038c794ed87655d2e31b6ae Mon Sep 17 00:00:00 2001 From: wea_ondara Date: Sun, 11 Aug 2019 16:47:52 +0200 Subject: [PATCH] wip --- analyze_batch.py | 62 ++++++++++++++++++++++++--------------- calctoxdiff.py | 3 +- common.py | 25 ++++++++++------ posthist.py | 75 ++++++++++++++++++++++++++---------------------- sentiments.py | 22 +------------- 5 files changed, 98 insertions(+), 89 deletions(-) diff --git a/analyze_batch.py b/analyze_batch.py index 6b029e7..a1ba451 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -7,34 +7,33 @@ from math import ceil import matplotlib.pyplot as plt import numpy as np -from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer -from common import calc_intervals, imprt, FigSaver +from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK from loader import load, dmt, cms -printnoln = lambda text: print(text, end='', flush=True) -rprint = lambda text: print('\r' + text) - -DAYS_NEW_USER = 7 -OLD_USER_YEAR = 3 OLD_USER_PERCENTILE = 0.95 -analyser = SentimentIntensityAnalyzer() -figsaver = FigSaver() colors = ['red', 'green', 'blue', 'orange', 'deeppink'] -def main(folder): +def main(folder, intervl): users, posts, firstcontrib, sumcontrib = load(folder) - intervals = calc_intervals(posts) + intervals = calc_intervals(posts, intervl) cachedsentiments = imprt(folder + "/output/sentiments.py").answers - outfolder = folder + "/output/batch/" - os.system("mkdir -p " + outfolder) + outputdir = folder + "/output/batch/" + os.system("mkdir -p " + outputdir) postcounts = range(1, 5 + 1) + + magickpost = {i: IMAGE_MAGICK for i in postcounts} + magickold = IMAGE_MAGICK + magickglobal = IMAGE_MAGICK + for (option_date_from, option_date_to) in intervals: + magickdate = IMAGE_MAGICK + # get questions for option_date_from <= creation date < option_date_to newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults() if len(newposts) == 0: @@ -51,8 +50,8 @@ def main(folder): gpos = [] gcom = [] - goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") - goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + goutfilenamenewusers = outputdir + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + goutfilenameoldusers = outputdir + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") for option_posts in postcounts: # print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts)) @@ -122,6 +121,9 @@ def main(folder): fig.savefig(outfilename + ".png", bbox_inches='tight') plt.close(fig) rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms") + magickpost[option_posts] += " " + outfilename + ".png" + magickdate += " " + outfilename + ".png" + os.system(magickdate + " " + goutfilenamenewusers + ".pdf") # global start = cms() @@ -146,6 +148,7 @@ def main(folder): gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight') plt.close(gfig) rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms") + magickglobal += " " + goutfilenamenewusers + ".png" # for old users --------------------------------------------------------------------------------- start = cms() @@ -192,18 +195,16 @@ def main(folder): # plt.show() fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) - # figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight') printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...") fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight') plt.close(fig) rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms") + magickold += " " + goutfilenameoldusers + ".png" - figsaver.join() - figsaver.join() - - -def computeToxLevel(text): - return analyser.polarity_scores(text) + os.system(magickglobal + " batch_newusers.pdf") + os.system(magickold + " batch_oldusers.pdf") + for (i, cmd) in magickpost.items(): + os.system(cmd + " " + "batch_newusers_" + i + ".pdf") def dumptoxlevels(lvls, filename): @@ -222,5 +223,20 @@ if __name__ == "__main__": if not os.path.isdir(folder): print(folder + " is not a folder") sys.exit(1) + interval = 3 + if len(sys.argv) >= 3: + if sys.argv[2].startswith("-i"): + interval = sys.argv[2][2:] + try: + interval = int(interval) + except ValueError: + print("-i: int required") + sys.exit(1) + if interval < 1 or interval > 12: + print("-i: only 1 - 12") + sys.exit(1) + else: + print("unknown parameter: " + sys.argv[2]) + sys.exit(1) - main(folder) + main(folder, interval) diff --git a/calctoxdiff.py b/calctoxdiff.py index 18efd14..d1c615d 100644 --- a/calctoxdiff.py +++ b/calctoxdiff.py @@ -9,10 +9,9 @@ import matplotlib.pyplot as plt import numpy as np from scipy.stats import ks_2samp -from common import imprt +from common import imprt, IMAGE_MAGICK colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'} -IMAGE_MAGICK = "magick" def main(folder): diff --git a/common.py b/common.py index a219873..1ae9cb7 100644 --- a/common.py +++ b/common.py @@ -5,29 +5,36 @@ import matplotlib.pyplot as plt from loader import dmt +printnoln = lambda text: print(text, end='', flush=True) +rprint = lambda text: print('\r' + text) -def calc_intervals(posts): +DAYS_NEW_USER = 7 +IMAGE_MAGICK = "magick" + + +def calc_intervals(posts, months=3): firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults() lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults() # calc quarter beginning firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) - if firstpost.month not in (1, 4, 7, 10): - firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month]) + if (firstpost.month - 1) % months != 0: + firstpost = firstpost.replace(month=firstpost.month - ((firstpost.month - 1) % months)) lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) - if lastpost.month not in (1, 4, 7, 10): - lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month]) + if (lastpost.month - 1) % months != 0: + lastpost = lastpost.replace(month=lastpost.month - ((lastpost.month - 1) % months)) # add 3 months to last post - if lastpost.month == 10: - lastpost = lastpost.replace(month=1, year=lastpost.year + 1) + if lastpost.month + months > 12: + lastpost = lastpost.replace(month=lastpost.month + months - 12, year=lastpost.year + 1) else: - lastpost = lastpost.replace(month=lastpost.month + 3) + lastpost = lastpost.replace(month=lastpost.month + months) cdate = firstpost intervals = [] while cdate < lastpost: - nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1)) + nextmon = cdate.month + months + nextquarter = cdate.replace(month=nextmon if nextmon <=12 else nextmon-12, year=cdate.year + (0 if nextmon <= 12 else 1)) print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) intervals.append((cdate, nextquarter)) cdate = nextquarter diff --git a/posthist.py b/posthist.py index 5abfefb..c59c639 100644 --- a/posthist.py +++ b/posthist.py @@ -1,34 +1,29 @@ -from datetime import datetime -from datetime import timedelta -import sys import os -from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer -import numpy as np -import matplotlib.pyplot as plt +import sys from collections import defaultdict -from loader import load, dmt, cms -import math -from common import calc_intervals -printnoln = lambda text: print(text, end='', flush=True) -rprint = lambda text: print('\r' + text) +import matplotlib.pyplot as plt +from matplotlib.ticker import MaxNLocator -DAYS_NEW_USER = 7 -OLD_USER_YEAR = 3 +from common import calc_intervals, IMAGE_MAGICK +from loader import load, dmt -analyser = SentimentIntensityAnalyzer() colors = ['red', 'green', 'blue', 'orange', 'deeppink'] -def main(folder): +def main(folder, intervl): users, posts, firstcontrib, sumcontrib = load(folder) - intervals = calc_intervals(posts) + intervals = calc_intervals(posts, intervl) + outputdir = folder + "/output/posthist/" + os.system("mkdir -p " + outputdir) + + activeusercounts = [] + imgmagickcmd = IMAGE_MAGICK for (option_date_from, option_date_to) in intervals: print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))) # filter posts by option_date_from <= creation date <= option_date_to - # newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults()) newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults() postcounts = defaultdict(list) @@ -37,10 +32,8 @@ def main(folder): postcounts[p['OwnerUserId']].append(p) i = i + 1 postcounts = {id: len(pc) for (id, pc) in postcounts.items()} - # print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()]))) + activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys()))) - outputdir = folder + "/output/posthist/" - os.system("mkdir -p " + outputdir) histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") histdata = [pc for pc in postcounts.values()] @@ -48,23 +41,22 @@ def main(folder): plt.hist(histdata, range(max(histdata, default=0) + 1)) plt.yscale('log') plt.ylim(bottom=0) + plt.xlabel("#posts") + plt.ylabel("#users with X posts") + fig.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) plt.title("Histogram for user post count registered between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y")) fig.savefig(histfilename + ".png", bbox_inches='tight') plt.close(fig) + imgmagickcmd += " " + histfilename + ".png" + os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf") - -def computeToxLevel(text): - return analyser.polarity_scores(text) - - -def flatmap(arr): - return [item for sublist in arr for item in sublist] - - -def dumptoxlevels(lvls, filename): - with open(filename, "w") as file: - file.write("from collections import defaultdict\n\n") - file.write("toxlevels = " + str(lvls).replace("", "list", 1) + "\n") + fig = plt.figure(figsize=(16, 12)) + plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts]) + plt.yscale('log') + plt.ylim(bottom=0) + plt.title("Active users") + fig.savefig(outputdir + "activeusers.png", bbox_inches='tight') + plt.close(fig) if __name__ == "__main__": @@ -77,5 +69,20 @@ if __name__ == "__main__": if not os.path.isdir(folder): print(folder + " is not a folder") sys.exit(1) + interval = 3 + if len(sys.argv) >= 3: + if sys.argv[2].startswith("-i"): + interval = sys.argv[2][2:] + try: + interval = int(interval) + except ValueError: + print("-i: int required") + sys.exit(1) + if interval < 1 or interval > 12: + print("-i: only 1 - 12") + sys.exit(1) + else: + print("unknown parameter: " + sys.argv[2]) + sys.exit(1) - main(folder) + main(folder, interval) diff --git a/sentiments.py b/sentiments.py index a112127..34bc264 100644 --- a/sentiments.py +++ b/sentiments.py @@ -5,14 +5,7 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from loader import load, dmt -printnoln = lambda text: print(text, end='', flush=True) -rprint = lambda text: print('\r' + text) - -DAYS_NEW_USER = 7 -OLD_USER_YEAR = 3 - analyser = SentimentIntensityAnalyzer() -colors = ['red', 'green', 'blue', 'orange', 'deeppink'] def main(folder): @@ -22,20 +15,7 @@ def main(folder): os.system("mkdir -p " + outfolder) outfilename = outfolder + "sentiments" - # computer toxic levels - # start = cms() - # printnoln("computing toxic levels: filtering") - - # toxlevels = defaultdict(list) - # for (i, post) in enumerate(posts): - # if (i + 1) % 100 == 0: - # printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts))) - # if (i + 1) == len(posts): - # printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts))) - # for a in post['Answers']: - # toxlevel = computeToxLevel(a['Body']) - # toxlevels[post['Id']].append(toxlevel) - # rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms") + # compute toxic levels toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults() toxlevels = {id: p for (id, p) in toxlevels}