Files
master/box_sentiment.py
wea_ondara 06085870a1 wip
2020-04-11 14:07:57 +02:00

198 lines
8.8 KiB
Python

import operator
import os
import sys
from collections import defaultdict
from datetime import timedelta
from math import ceil
import matplotlib.pyplot as plt
import numpy as np
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK, FIG_SIZE
from loader import load, dmt, cms
# Fraction of the per-interval user ranking (ascending by post count) that is
# cut off; only the users above this cut are treated as "old" users.
OLD_USER_PERCENTILE = 0.95

# Named plot colors (not referenced in the visible part of this file).
colors = [
    "red",
    "green",
    "blue",
    "orange",
    "deeppink",
]
def _collect_sentiments(posts, cachedsentiments, report=None):
    """Return the cached sentiment entry of every answer to ``posts``.

    Args:
        posts: Sequence of post dicts; each has an ``'Answers'`` list whose
            items carry an ``'Id'``.
        cachedsentiments: Mapping from answer id to its sentiment entry.
        report: Optional callable that receives one progress string per post.

    Returns:
        The entries of all answers present in ``cachedsentiments``, in
        post/answer order. Answers without an entry are reported on stdout
        and skipped.
    """
    sentiments = []
    total = len(posts)
    for index, post in enumerate(posts, start=1):
        if report is not None:
            report("\rcomputing toxic levels: post " + str(index) + "/" + str(total))
        for answer in post['Answers']:
            answer_id = answer['Id']
            if answer_id not in cachedsentiments:
                # Skip instead of appending: appending here would re-use the
                # previous answer's sentiment (or fail on an unbound name for
                # the very first answer).
                print("Sentiment not found for " + str(answer_id))
                continue
            sentiments.append(cachedsentiments[answer_id])
    return sentiments


def _split_by_polarity(toxlevels):
    """Split sentiment entries into (negative, neutral, positive) compound scores.

    Scores below -0.05 count as negative, above 0.05 as positive and
    everything in between (bounds included) as neutral.
    """
    compounds = [item['compound'] for item in toxlevels]
    neg = [score for score in compounds if score < -0.05]
    neu = [score for score in compounds if -0.05 <= score <= 0.05]
    pos = [score for score in compounds if score > 0.05]
    return neg, neu, pos


def _average_or_nan(values):
    """Return the mean of ``values``, or NaN (without a warning) when empty."""
    return np.average(values) if len(values) > 0 else np.nan


def _plot_sentiment_group(filteredposts, cachedsentiments, title_head, datefrom, dateto, outfile, start):
    """Draw the sentiment box plot for one user group in one interval.

    Args:
        filteredposts: Posts of the group within the interval.
        cachedsentiments: Mapping from answer id to its sentiment entry.
        title_head: First line of the plot title.
        datefrom: Formatted interval start, used in the title.
        dateto: Formatted interval end, used in the title.
        outfile: Output path without the ``.png`` extension.
        start: ``cms()`` timestamp taken when work on this group began.

    Returns:
        Tuple ``(negative, neutral, positive, all)`` of average compound
        scores; a category without any answer yields NaN.
    """
    total = str(len(filteredposts))
    done = "computing toxic levels: post " + total + "/" + total

    toxlevels = _collect_sentiments(filteredposts, cachedsentiments, printnoln)
    printnoln("\r" + done + " ... plotting ...")
    neg, neu, pos = _split_by_polarity(toxlevels)
    averages = (
        _average_or_nan(neg),
        _average_or_nan(neu),
        _average_or_nan(pos),
        _average_or_nan([item['compound'] for item in toxlevels]),
    )

    fig, axs = plt.subplots(figsize=FIG_SIZE)
    axs.boxplot([neg, neu, pos])
    axs.set_xticklabels(['negative', 'neutral', 'positive'])
    axs.set_title(title_head + "\nPosts created between " + datefrom + " to " + dateto
                  + ", n=" + total + "\nn = " + str(len(toxlevels)))
    printnoln("\r" + done + " ... plotting ... saving ...")
    fig.savefig(outfile + ".png", bbox_inches='tight')
    plt.close(fig)
    rprint(done + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
    return averages


def _plot_average_trends(labels, averages, title, outfile):
    """Plot one line per sentiment category over the processed intervals.

    Args:
        labels: One x-axis label per processed interval.
        averages: Mapping from legend label to its list of averages; every
            list has the same length as ``labels``.
        title: Plot title.
        outfile: Path of the image to write.
    """
    fig = plt.figure(figsize=FIG_SIZE)
    for label, values in averages.items():
        plt.plot(labels, values, label=label)
    plt.legend(loc="upper right")
    plt.xticks(rotation=90)
    plt.title(title)
    fig.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def _merge_images_to_pdf(images, pdffile):
    """Run the IMAGE_MAGICK command on ``images`` to produce ``pdffile``.

    IMAGE_MAGICK is used as a shell command prefix followed by the input
    images and the output file. Paths are shell-quoted so that folders
    containing spaces or shell metacharacters do not break the command.
    Nothing is run when there are no images.
    """
    import shlex

    if not images:
        return
    quoted = " ".join(shlex.quote(path) for path in images)
    os.system(IMAGE_MAGICK + " " + quoted + " " + shlex.quote(pdffile))


def main(folder, intervl):
    """Create sentiment box plots and average-sentiment charts for a dataset.

    Loads the posts of ``folder`` and the cached answer sentiments from
    ``<folder>/output/sentiments.py``, then, for every interval returned by
    ``calc_intervals`` that contains posts, writes two box plots to
    ``<folder>/output/boxsentiment/``:

    * one for answers to posts created at most ``DAYS_NEW_USER`` days after
      the owner's first contribution ("new users"), and
    * one for answers to posts of the users above the ``OLD_USER_PERCENTILE``
      cut of the interval's post-count ranking ("old users").

    Afterwards the per-interval images are merged into one PDF per group and
    the per-interval averages are drawn as line charts.

    Args:
        folder: Dataset folder.
        intervl: Interval size, passed through to ``calc_intervals``.
    """
    # load() also returns the users and the contribution sums; neither is used here.
    _, posts, firstcontrib, _ = load(folder)
    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    cachedsentiments = imprt(folder + "/output/sentiments.py").answers
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/boxsentiment/"
    os.makedirs(outputdir, exist_ok=True)

    categories = ('negative', 'neutral', 'positive', 'all')
    avgnew = {category: [] for category in categories}
    avgold = {category: [] for category in categories}
    imagesnew = []
    imagesold = []
    # Labels only of the intervals that were actually processed, so the x axis
    # of the trend plots always matches the number of collected averages.
    labels = []

    foldername = folder.split("/")[-1]
    newuserperiod = timedelta(days=DAYS_NEW_USER)

    for (option_date_from, option_date_to) in intervals:
        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
        if len(newposts) == 0:
            continue

        datefrom = option_date_from.strftime("%d-%m-%Y")
        dateto = option_date_to.strftime("%d-%m-%Y")
        labels.append(datefrom + " - " + dateto)
        print("computing toxic levels: " + datefrom + " to " + dateto)
        namesuffix = foldername + "_" + datefrom + "_" + dateto
        goutfilenamenewusers = outputdir + "boxsent_newusers_" + namesuffix
        goutfilenameoldusers = outputdir + "boxsent_oldusers_" + namesuffix

        # for new users ---------------------------------------------------------------------------------
        start = cms()
        printnoln("computing toxic levels: filtering")
        # keep only posts written within the new-user period after the owner's first contribution
        filteredposts = [post for post in newposts
                         if firstcontrib[post['OwnerUserId']] + newuserperiod >= post['CreationDate']]
        averages = _plot_sentiment_group(
            filteredposts, cachedsentiments,
            "Sentiment categorization of answers to posts within 1 week of 1st contribution",
            datefrom, dateto, goutfilenamenewusers, start)
        for category, value in zip(categories, averages):
            avgnew[category].append(value)
        imagesnew.append(goutfilenamenewusers + ".png")

        # for old users ---------------------------------------------------------------------------------
        start = cms()
        newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
        userposts = {u: 0 for u in newuserids}
        for p in newposts:
            userposts[p['OwnerUserId']] += 1
        # rank users ascending by number of posts in this interval and keep the top slice
        ranked = [u for u, _ in sorted(userposts.items(), key=operator.itemgetter(1))]
        oldusers = set(ranked[ceil(len(ranked) * OLD_USER_PERCENTILE):])
        filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
        # The title previously repeated the new-user wording, which does not
        # describe this group.
        averages = _plot_sentiment_group(
            filteredposts, cachedsentiments,
            "Sentiment categorization of answers to posts from old users",
            datefrom, dateto, goutfilenameoldusers, start)
        for category, value in zip(categories, averages):
            avgold[category].append(value)
        imagesold.append(goutfilenameoldusers + ".png")

    _merge_images_to_pdf(imagesnew, outputdir + "boxsent_newusers.pdf")
    _merge_images_to_pdf(imagesold, outputdir + "boxsent_oldusers.pdf")

    # plot new users
    _plot_average_trends(labels, avgnew, "Sentiment categorization for posts from new users",
                         outputdir + "avgsentnewusers.png")
    # plot old users
    _plot_average_trends(labels, avgold, "Sentiment categorization for posts from old users",
                         outputdir + "avgsentoldusers.png")
if __name__ == "__main__":
    # Command-line entry point: <folder> is mandatory, an optional second
    # argument of the form -i<n> (1 <= n <= 12) overrides the default interval.
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 3
    if len(sys.argv) >= 3:
        option = sys.argv[2]
        # Reject anything that is not the -i option before parsing its value.
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)