Files
master/analyze_batch.py
wea_ondara 93555d9cbf wip
2020-02-09 11:04:33 +01:00

302 lines
14 KiB
Python

import operator
import os
import sys
from collections import defaultdict
from datetime import timedelta
from math import ceil
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK
from loader import load, dmt, cms
from sentiments import readtoxleveltxt
# Cut-off used to pick the "old" (most posting) users of an interval: posters are
# ordered by their number of posts in the interval and only those from index
# ceil(n * OLD_USER_PERCENTILE) onwards are kept.
OLD_USER_PERCENTILE = 0.95
# Histogram colours of the combined new-user plot; one colour per post count.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def _collect_toxlevels(filteredposts, cachedsentiments):
    """Return the cached sentiment of every timely answer to the given posts.

    Answers created more than DAYS_NEW_USER days after their post are skipped.
    Sentiments are looked up by answer id; a missing entry raises KeyError.
    """
    toxlevels = []
    total = str(len(filteredposts))
    for (i, post) in enumerate(filteredposts):
        printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + total)
        for a in post['Answers']:
            if a['CreationDate'] > post['CreationDate'] + timedelta(days=DAYS_NEW_USER):
                continue
            toxlevels.append(cachedsentiments[a['Id']])
    return toxlevels


def _mean_compound(toxlevels):
    """Return the mean 'compound' score, or 0 when there are no sentiments."""
    if not toxlevels:
        # np.mean([]) would yield NaN and emit a RuntimeWarning.
        return 0
    return np.mean([s['compound'] for s in toxlevels])


def _plot_sentiment_histograms(neg, neu, pos, com, titles):
    """Create a 2x2 figure of log-scaled neg/neu/pos/compound histograms.

    titles holds the four subplot titles in the order neg, neu, pos, compound.
    The caller is responsible for saving and closing the returned figure.
    """
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    panels = [
        (axs[0, 0], neg, np.linspace(0, 1, 1 * 100)),
        (axs[1, 0], neu, np.linspace(0, 1, 1 * 100)),
        (axs[0, 1], pos, np.linspace(0, 1, 1 * 100)),
        (axs[1, 1], com, np.linspace(-1, 1, 2 * 100)),
    ]
    for (ax, values, bins), title in zip(panels, titles):
        ax.set_title(title)
        ax.hist(values, bins)
        ax.set_yscale('log')
    return fig


def main(folder, intervl):
    """Plot answer sentiments per time interval for new and for old users.

    For every interval returned by calc_intervals(posts, intervl) this writes,
    below <folder>/output/batch/:
      * one histogram figure (.png) and one sentiment dump (.py) per post count
        1..5 for users still within DAYS_NEW_USER days of their first
        contribution, plus a combined figure over all post counts,
      * one histogram figure and dump for the most posting ("old") users.
    The images are then handed to the IMAGE_MAGICK command to build PDFs, the
    average compound sentiment per interval is plotted, and the per-answer
    compound scores are dumped to averagesentiment.txt.

    :param folder: dataset folder; <folder>/output/sentiments.txt is read and
        the last path component of folder is embedded in output file names
    :param intervl: interval setting passed to calc_intervals; also embedded
        in output file names
    """
    matplotlib.use('Agg')  # speed up saving of images

    _users, posts, firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/batch/"
    # Create the directory directly instead of through a shell command, so
    # folder names with spaces or shell metacharacters are handled correctly.
    os.makedirs(outputdir, exist_ok=True)

    postcounts = range(1, 5 + 1)
    # Each magick* string is a command line that collects image paths.
    magickpost = {i: IMAGE_MAGICK for i in postcounts}
    magickold = IMAGE_MAGICK
    magickglobal = IMAGE_MAGICK
    # Index 0 holds the old users, index 1..5 the new users with that many posts.
    avgsent = [[] for i in range(0, 5 + 1)]
    avgsentsingle = [[] for i in range(0, 5 + 1)]

    for (option_date_from, option_date_to) in intervals:
        magickdate = IMAGE_MAGICK
        datefrom = option_date_from.strftime("%d-%m-%Y")
        dateto = option_date_to.strftime("%d-%m-%Y")

        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
        print("computing toxic levels: " + datefrom + " to " + dateto)

        # One list per post count, used for the combined plot of this interval.
        gneg = []
        gneu = []
        gpos = []
        gcom = []

        filesuffix = "_" + folder.split("/")[-1] + "_" + datefrom + "_" + dateto + "_i" + str(intervl)
        goutfilenamenewusers = outputdir + "batch_newusers" + filesuffix
        goutfilenameoldusers = outputdir + "batch_oldusers" + filesuffix

        # Group the posts by author, keeping only posts created less than
        # DAYS_NEW_USER days after the author's first contribution.
        start = cms()
        printnoln("sorting posts ...")
        sortedposts = defaultdict(list)
        for post in newposts:
            userid = post['OwnerUserId']
            if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) <= post['CreationDate']:
                continue
            sortedposts[userid].append(post)
        rprint("sorting posts ... took " + str(cms() - start) + "ms")

        # new users: one figure per number of posts ---------------------------------------------
        for option_posts in postcounts:
            start = cms()
            printnoln("computing toxic levels: filtering")
            # Posts of users who wrote exactly option_posts posts, flattened.
            filteredposts = [p for ownposts in sortedposts.values() if len(ownposts) == option_posts for p in ownposts]
            toxlevels = _collect_toxlevels(filteredposts, cachedsentiments)
            progress = "computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts))
            printnoln("\r" + progress + " ... plotting ...")

            # Fixed: the mean is taken when sentiments exist, 0 is stored otherwise
            # (the condition used to be inverted).
            avgsent[option_posts].append(_mean_compound(toxlevels))
            avgsentsingle[option_posts].append([s['compound'] for s in toxlevels])

            outfilename = goutfilenamenewusers + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            neglevelsflat = [item['neg'] for item in toxlevels]
            neulevelsflat = [item['neu'] for item in toxlevels]
            poslevelsflat = [item['pos'] for item in toxlevels]
            comlevelsflat = [item['compound'] for item in toxlevels]
            gneg.append(neglevelsflat)
            gneu.append(neulevelsflat)
            gpos.append(poslevelsflat)
            gcom.append(comlevelsflat)

            fig = _plot_sentiment_histograms(neglevelsflat, neulevelsflat, poslevelsflat, comlevelsflat,
                                             ('Negativity', 'Neutrality', 'Positivity', 'Compound'))
            fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " posts within 1 week of 1st contribution\nPosts created between "
                         + datefrom + " to " + dateto + ", n(q)=" + str(len(filteredposts)) + ", n(a)=" + str(len(toxlevels)))
            printnoln("\r" + progress + " ... plotting ... saving ...")
            fig.savefig(outfilename + ".png", bbox_inches='tight')
            plt.close(fig)
            rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
            magickpost[option_posts] += " " + outfilename + ".png"
            magickdate += " " + outfilename + ".png"

        # All per-post-count images of this interval in one PDF.
        os.system(magickdate + " " + goutfilenamenewusers + ".pdf")

        # global: all post counts of this interval in one figure ---------------------------------
        start = cms()
        printnoln("\rglobal plot post ... plotting ...")
        gfig, gaxs = plt.subplots(2, 2, figsize=(16, 12))
        glabels = [str(option_posts) + " posts" for option_posts in postcounts]
        gcolors = colors[:len(postcounts)]
        gpanels = [
            (gaxs[0, 0], 'Neg', gneg, np.linspace(0, 1, 1 * 100)),
            (gaxs[1, 0], 'Neu', gneu, np.linspace(0, 1, 1 * 100)),
            (gaxs[0, 1], 'Pos', gpos, np.linspace(0, 1, 1 * 100)),
            (gaxs[1, 1], 'Compound', gcom, np.linspace(-1, 1, 2 * 100)),
        ]
        for (gax, gtitle, gvalues, gbins) in gpanels:
            gax.set_title(gtitle)
            gax.hist(gvalues, gbins, color=gcolors, label=glabels)
            gax.legend(loc="upper right")
            gax.set_yscale('log')
        gfig.suptitle("Sentiment of answers to the first X posts within 1 week of 1st contribution\nPosts created between " + datefrom + " to " + dateto)
        printnoln("\rglobal plot post ... plotting ... saving ...")
        gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
        plt.close(gfig)
        rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
        magickglobal += " " + goutfilenamenewusers + ".png"

        # for old users ---------------------------------------------------------------------------------
        start = cms()
        # Count the posts per user in this interval and keep the most active
        # users, i.e. those above the OLD_USER_PERCENTILE cut-off.
        newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
        userposts = {u: 0 for u in newuserids}
        for p in newposts:
            userposts[p['OwnerUserId']] += 1
        userposts = sorted(userposts.items(), key=operator.itemgetter(1))
        oldusers = [k for k, v in userposts]
        oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):])

        filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
        toxlevels = _collect_toxlevels(filteredposts, cachedsentiments)
        progress = "computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts))
        printnoln("\r" + progress + " ... plotting ...")

        # Same fix as for the new users: mean if there is data, 0 otherwise.
        avgsent[0].append(_mean_compound(toxlevels))
        avgsentsingle[0].append([s['compound'] for s in toxlevels])
        dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")

        neglevelsflat = [item['neg'] for item in toxlevels]
        neulevelsflat = [item['neu'] for item in toxlevels]
        poslevelsflat = [item['pos'] for item in toxlevels]
        comlevelsflat = [item['compound'] for item in toxlevels]

        fig = _plot_sentiment_histograms(neglevelsflat, neulevelsflat, poslevelsflat, comlevelsflat,
                                         ('Neg', 'Neu', 'Pos', 'Compound'))
        fig.suptitle("Sentiment of answers to posts by most posting users (" + str(OLD_USER_PERCENTILE * 100) + "%tile)\nPosts created between " +
                     datefrom + " to " + dateto + ", n(q)=" + str(len(filteredposts)) + ", n(a)=" + str(len(toxlevels)))
        printnoln("\r" + progress + " ... plotting ... saving ...")
        fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
        plt.close(fig)
        rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
        magickold += " " + goutfilenameoldusers + ".png"

    # Combine the collected images across all intervals into PDFs.
    os.system(magickglobal + " " + outputdir + "batch_newusers_i" + str(intervl) + ".pdf")
    os.system(magickold + " " + outputdir + "batch_oldusers_i" + str(intervl) + ".pdf")
    for (i, cmd) in magickpost.items():
        os.system(cmd + " " + outputdir + "batch_newusers_i" + str(intervl) + "_" + str(i) + ".pdf")

    # avg sentiment graph
    print("Plotting average sentiments ...")
    fig = plt.figure(figsize=(16, 12))
    intervalstarts = [iv[0] for iv in intervals]
    for i in postcounts:
        plt.plot(intervalstarts, avgsent[i], label="new users (" + str(i) + " posts)")
    plt.plot(intervalstarts, avgsent[0], label="old users (all posts)")
    plt.title("Average sentiments")
    plt.xticks(rotation=90)
    plt.xlabel("time")
    plt.ylabel("sentiment")
    plt.legend(loc="upper right")
    plt.savefig(outputdir + "/averagesentiment-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)

    # dump avgsentsingle and verify that reading it back yields the same data
    dumpavgsentsingle(avgsentsingle, outputdir + "/averagesentiment.txt")
    avgss2 = readavgsentsingle(outputdir + "/averagesentiment.txt")
    if avgsentsingle != avgss2:
        print("wuaaaaaa")
        # Keep both versions for inspection of the mismatch.
        with open(outputdir + "/log", "w") as file:
            file.write(str(avgsentsingle))
            file.write(str(avgss2))
def dumptoxlevels(lvls, filename):
    """Write the sentiment list *lvls* to *filename* as Python source text.

    The file consists of a ``defaultdict`` import line, an empty line and the
    assignment ``toxlevels = <str(lvls)>``. The first occurrence of
    ``<class 'list'>`` in the rendered text, if any, is replaced by ``list``.
    """
    header = "from collections import defaultdict\n\n"
    rendered = str(lvls).replace("<class 'list'>", "list", 1)
    with open(filename, "w") as out:
        out.write(header + "toxlevels = " + rendered + "\n")
def dumpavgsentsingle(avg, filename):
    """Serialise the nested list *avg* into the text file *filename*.

    Every entry of *avg* becomes one line ``<index>:<groups>``. The groups of
    an entry are separated by ``;;`` and the values inside a group by ``;``.
    Lines are joined with a newline; no trailing newline is written.
    """
    with open(filename, "w") as out:
        rows = []
        for idx, groups in enumerate(avg):
            encoded = [';'.join(map(str, group)) for group in groups]
            rows.append(str(idx) + ':' + ';;'.join(encoded))
        out.write('\n'.join(rows))
def readavgsentsingle(filename):
    """Parse a file written by dumpavgsentsingle back into nested lists.

    Every non-blank line has the form ``<index>:<groups>``; groups are
    separated by ``;;`` and the float values inside a group by ``;``. An empty
    group yields an empty list. The index is ignored: entries are returned in
    file order.

    Blank lines are skipped, so an empty file (the dump of an empty list) or a
    file with a trailing newline is accepted instead of raising IndexError.

    :param filename: path of the file to read
    :return: list (one item per line) of lists (one per group) of floats
    :raises IndexError: if a non-blank line contains no ':'
    :raises ValueError: if a value cannot be converted to float
    """
    with open(filename, "r") as file:
        lines = file.read().split('\n')

    result = []
    for line in lines:
        if not line.strip():
            continue
        # Text after the first ':' (same split as before for well-formed lines).
        payload = line.split(':', 2)[1]
        groups = []
        for group in payload.split(';;'):
            groups.append([float(x) for x in group.split(';')] if group != '' else [])
        result.append(groups)
    return result
if __name__ == "__main__":
    # Script entry point: <script> <folder> [-i<interval>]
    argv = sys.argv
    usage = argv[0] + " <folder>"

    # The data folder is mandatory and has to exist.
    if len(argv) < 2:
        print(usage)
        sys.exit(1)
    folder = argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    # The optional second argument "-i<n>" overrides the default interval of 3;
    # n has to be an integer between 1 and 12.
    interval = 3
    if len(argv) >= 3:
        option = argv[2]
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)