243 lines
11 KiB
Python
243 lines
11 KiB
Python
import operator
|
|
import os
|
|
import sys
|
|
from collections import defaultdict
|
|
from datetime import timedelta
|
|
from math import ceil
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
|
|
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK
|
|
from loader import load, dmt, cms
|
|
|
|
# Cut-off used by main() to pick "old" users: authors are sorted ascending by
# their number of posts in an interval and only those from this fraction of
# the list onwards (i.e. the most active ones) are kept.
OLD_USER_PERCENTILE = 0.95
|
|
|
|
# Bar colours for the combined per-interval histogram in main(); sliced to one
# colour per entry of its post-count range.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
|
|
|
|
|
def _collect_sentiments(selectedposts, cachedsentiments):
    """Return the cached sentiment record of every answer to ``selectedposts``.

    Progress is reported once per post through ``printnoln``.  An answer whose
    id is missing from ``cachedsentiments`` is reported and skipped, so the
    result only ever contains records that belong to the answer at hand.
    """
    levels = []
    total = str(len(selectedposts))
    for (i, post) in enumerate(selectedposts):
        printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + total)
        for a in post['Answers']:
            if a['Id'] not in cachedsentiments:
                # str() so a non-string id cannot break the message itself
                print("Sentiment not found for " + str(a['Id']))
                continue
            levels.append(cachedsentiments[a['Id']])
    return levels


def _sentiment_columns(toxlevels):
    """Split sentiment records into their neg, neu, pos and compound value lists."""
    return [[item[key] for item in toxlevels] for key in ('neg', 'neu', 'pos', 'compound')]


def _plot_sentiments(neg, neu, pos, com, title, legend=False, **histargs):
    """Draw the 2x2 Neg/Neu/Pos/Compound histogram figure and return it.

    Each of ``neg``/``neu``/``pos``/``com`` is whatever ``Axes.hist`` accepts
    as data (one flat list, or a list of lists for grouped bars).  ``histargs``
    is forwarded to every ``hist`` call; ``legend`` adds an upper-right legend
    to every panel.  The caller is responsible for saving and closing the figure.
    """
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    panels = (
        (axs[0, 0], 'Neg', neg, np.linspace(0, 1, 1 * 100)),
        (axs[1, 0], 'Neu', neu, np.linspace(0, 1, 1 * 100)),
        (axs[0, 1], 'Pos', pos, np.linspace(0, 1, 1 * 100)),
        (axs[1, 1], 'Compound', com, np.linspace(-1, 1, 2 * 100)),
    )
    for (ax, name, values, bins) in panels:
        ax.set_title(name)
        ax.hist(values, bins, **histargs)
        if legend:
            ax.legend(loc="upper right")
        ax.set_yscale('log')
    fig.suptitle(title)
    return fig


def main(folder, intervl):
    """Plot answer-sentiment histograms per date interval for new and old users.

    ``folder`` is handed to ``load`` and must contain ``output/sentiments.py``
    exposing an ``answers`` mapping (answer id -> record with 'neg', 'neu',
    'pos' and 'compound' entries).  ``intervl`` is forwarded to
    ``calc_intervals`` together with the loaded posts.

    For every non-empty interval this writes into ``<folder>/output/batch/``:
      * one PNG and one ``.py`` dump per post-count limit (1..5) for posts made
        within DAYS_NEW_USER days of the author's first contribution,
      * one combined PNG over all post-count limits and a PDF of the
        per-limit PNGs,
      * one PNG and ``.py`` dump for the most active posters
        (see OLD_USER_PERCENTILE).
    Finally the collected PNGs are merged into PDFs in the current working
    directory by running the IMAGE_MAGICK command from ``common``.

    NOTE(review): output paths are appended unquoted to the IMAGE_MAGICK shell
    command, so a folder name containing spaces will break those calls.
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)

    intervals = calc_intervals(posts, intervl)
    cachedsentiments = imprt(folder + "/output/sentiments.py").answers

    outputdir = folder + "/output/batch/"
    # In-process replacement for `mkdir -p`: no shell, no quoting problems.
    os.makedirs(outputdir, exist_ok=True)

    postcounts = range(1, 5 + 1)
    postlabels = [str(option_posts) + " posts" for option_posts in postcounts]
    dataset = folder.split("/")[-1]

    # Each of these accumulates "<command> <png> <png> ..." and is executed
    # with the target PDF name appended once all PNGs are known.
    magickpost = {i: IMAGE_MAGICK for i in postcounts}
    magickold = IMAGE_MAGICK
    magickglobal = IMAGE_MAGICK

    for (option_date_from, option_date_to) in intervals:
        magickdate = IMAGE_MAGICK

        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
        if len(newposts) == 0:
            continue

        datefrom = option_date_from.strftime("%d-%m-%Y")
        dateto = option_date_to.strftime("%d-%m-%Y")
        print("computing toxic levels: " + datefrom + " to " + dateto)

        # per post-count value lists, plotted side by side in the combined figure
        gneg = []
        gneu = []
        gpos = []
        gcom = []

        goutfilenamenewusers = outputdir + "batch_newusers_" + dataset + "_" + datefrom + "_" + dateto
        goutfilenameoldusers = outputdir + "batch_oldusers_" + dataset + "_" + datefrom + "_" + dateto

        for option_posts in postcounts:
            # compute toxic levels
            start = cms()
            printnoln("computing toxic levels: filtering")
            searchedposts = defaultdict(int)
            filteredposts = []
            for post in newposts:
                userid = post['OwnerUserId']

                # check first contribution
                if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']:
                    continue

                # no more than option_posts posts from one user
                searchedposts[userid] += 1
                if searchedposts[userid] > option_posts:
                    continue

                filteredposts.append(post)

            toxlevels = _collect_sentiments(filteredposts, cachedsentiments)
            progress = "computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts))
            printnoln("\r" + progress + " ... plotting ...")

            outfilename = goutfilenamenewusers + "_" + str(option_posts)
            dumptoxlevels(toxlevels, outfilename + ".py")

            neglevelsflat, neulevelsflat, poslevelsflat, comlevelsflat = _sentiment_columns(toxlevels)

            gneg.append(neglevelsflat)
            gneu.append(neulevelsflat)
            gpos.append(poslevelsflat)
            gcom.append(comlevelsflat)

            fig = _plot_sentiments(
                neglevelsflat, neulevelsflat, poslevelsflat, comlevelsflat,
                "Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
                + datefrom + " to " + dateto)
            printnoln("\r" + progress + " ... plotting ... saving ...")
            fig.savefig(outfilename + ".png", bbox_inches='tight')
            plt.close(fig)
            rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
            magickpost[option_posts] += " " + outfilename + ".png"
            magickdate += " " + outfilename + ".png"
        os.system(magickdate + " " + goutfilenamenewusers + ".pdf")

        # global: all post-count limits of this interval in one figure
        start = cms()
        printnoln("\rglobal plot post ... plotting ...")
        gfig = _plot_sentiments(
            gneg, gneu, gpos, gcom,
            "Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + datefrom + " to " + dateto,
            legend=True, color=colors[:len(postcounts)], label=postlabels)
        printnoln("\rglobal plot post ... plotting ... saving ...")
        gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
        plt.close(gfig)
        rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
        magickglobal += " " + goutfilenamenewusers + ".png"

        # for old users ---------------------------------------------------------------------------------
        start = cms()
        newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
        userposts = {u: 0 for u in newuserids}
        for p in newposts:
            userposts[p['OwnerUserId']] += 1
        # ascending by post count, so the tail of the list holds the most active users
        userposts = sorted(userposts.items(), key=operator.itemgetter(1))
        oldusers = [k for k, v in userposts]
        oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):])
        filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()

        toxlevels = _collect_sentiments(filteredposts, cachedsentiments)
        progress = "computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts))
        printnoln("\r" + progress + " ... plotting ...")

        dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")

        neglevelsflat, neulevelsflat, poslevelsflat, comlevelsflat = _sentiment_columns(toxlevels)

        fig = _plot_sentiments(
            neglevelsflat, neulevelsflat, poslevelsflat, comlevelsflat,
            "Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + datefrom + " to " + dateto)
        printnoln("\r" + progress + " ... plotting ... saving ...")
        fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
        plt.close(fig)
        rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
        magickold += " " + goutfilenameoldusers + ".png"

    os.system(magickglobal + " batch_newusers.pdf")
    os.system(magickold + " batch_oldusers.pdf")
    for (i, cmd) in magickpost.items():
        # keys are the int post counts, so they must be converted before joining
        os.system(cmd + " " + "batch_newusers_" + str(i) + ".pdf")
|
|
|
|
|
|
def dumptoxlevels(lvls, filename):
    """Write ``lvls`` to ``filename`` as a small Python source file.

    The file starts with a ``defaultdict`` import line, followed by a single
    ``toxlevels = ...`` assignment holding ``str(lvls)``.  The first
    ``<class 'list'>`` found in that text is rewritten to ``list``.
    An existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.write("from collections import defaultdict\n\n")
        rendered = str(lvls).replace("<class 'list'>", "list", 1)
        out.writelines(("toxlevels = ", rendered, "\n"))
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: <script> <folder> [-i<N>] with N in 1..12
    # (defaults to 3).  Nothing below runs when the module is imported.
    argv = sys.argv
    usage = argv[0] + " <folder>"
    if len(argv) < 2:
        print(usage)
        sys.exit(1)

    folder = argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 3
    if len(argv) >= 3:
        option = argv[2]
        # the only recognised option is -i with the number glued on, e.g. -i6
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)
|