This commit is contained in:
wea_ondara
2019-07-29 21:34:34 +02:00
parent 1f699f6b56
commit a14b3af21a
6 changed files with 309 additions and 87 deletions

View File

@@ -1,13 +1,15 @@
import operator
import os
import sys
from collections import defaultdict
from datetime import timedelta
from math import ceil
import matplotlib.pyplot as plt
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from common import calc_intervals
from common import calc_intervals, imprt, FigSaver
from loader import load, dmt, cms
# Print `text` without a trailing newline and flush immediately, so that
# progress messages starting with '\r' can overwrite the current console line.
printnoln = lambda text: print(text, end='', flush=True)
@@ -15,8 +17,10 @@ rprint = lambda text: print('\r' + text)
# Presumably the length (in days) of the "new user" window; the plot titles in
# this file speak of posts "within 1 week of 1st contribution". TODO confirm
# against the filtering code, which is not fully visible here.
DAYS_NEW_USER = 7
# Not referenced in the visible part of this file; presumably a threshold in
# years for treating a user as "old". TODO confirm.
OLD_USER_YEAR = 3
# Users are sorted ascending by their number of posts and the first
# ceil(len(users) * OLD_USER_PERCENTILE) of them are dropped; the remaining
# top slice is what the "old users" plots are built from.
OLD_USER_PERCENTILE = 0.95
# Shared VADER analyser instance used by computeToxLevel().
analyser = SentimentIntensityAnalyzer()
# Shared FigSaver (imported from the project's `common` module);
# figsaver.join() is called after the plotting code.
figsaver = FigSaver()
# Bar colours for the combined histograms; sliced as colors[:len(postcounts)],
# i.e. one colour per post-count series.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -24,7 +28,10 @@ def main(folder):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts)
cachedsentiments = {}
cachedsentiments = imprt(folder + "/output/sentiments.py").answers
outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
postcounts = range(1, 5 + 1)
for (option_date_from, option_date_to) in intervals:
@@ -44,8 +51,6 @@ def main(folder):
gpos = []
gcom = []
outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
@@ -55,7 +60,7 @@ def main(folder):
# compute toxic levels
start = cms()
printnoln("computing toxic levels: filtering")
toxlevels = defaultdict(list)
toxlevels = []
searchedposts = defaultdict(int)
filteredposts = []
for (i, post) in enumerate(newposts):
@@ -73,26 +78,22 @@ def main(folder):
filteredposts.append(post)
for (i, post) in enumerate(filteredposts):
if (i + 1) % 100 == 0:
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
if (i + 1) == len(newposts):
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
for a in post['Answers']:
if a['Id'] in cachedsentiments.keys():
toxlevel = cachedsentiments[a['Id']]
else:
toxlevel = computeToxLevel(a['Body'])
cachedsentiments[a['Id']] = toxlevel
toxlevels[post['Id']].append(toxlevel)
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
print("Sentiment not found for " + a['Id'])
toxlevels.append(toxlevel)
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
outfilename = goutfilenamenewusers + "_" + str(option_posts)
dumptoxlevels(toxlevels, outfilename + ".py")
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())]
neulevelsflat = [item['neu'] for item in flatmap(toxlevels.values())]
poslevelsflat = [item['pos'] for item in flatmap(toxlevels.values())]
comlevelsflat = [item['compound'] for item in flatmap(toxlevels.values())]
neglevelsflat = [item['neg'] for item in toxlevels]
neulevelsflat = [item['neu'] for item in toxlevels]
poslevelsflat = [item['pos'] for item in toxlevels]
comlevelsflat = [item['compound'] for item in toxlevels]
gneg.append(neglevelsflat)
gneu.append(neulevelsflat)
@@ -116,10 +117,15 @@ def main(folder):
# plt.show()
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
+ option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, outfilename + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(outfilename + ".png", bbox_inches='tight')
plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
# global
start = cms()
printnoln("\rglobal plot post ... plotting ...")
gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
@@ -132,19 +138,74 @@ def main(folder):
gaxs[1, 0].set_yscale('log')
gaxs[0, 1].set_yscale('log')
gaxs[1, 1].set_yscale('log')
gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
gfig.suptitle(
"Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime(
"%d-%m-%Y"))
# figsaver.save(gfig, goutfilenamenewusers + ".png", bbox_inches='tight')
printnoln("\rglobal plot post ... plotting ... saving ...")
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
plt.close(gfig)
rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
# for old users ---------------------------------------------------------------------------------
start = cms()
newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
userposts = {u: 0 for u in newuserids}
for p in newposts:
userposts[p['OwnerUserId']] += 1
userposts = sorted(userposts.items(), key=operator.itemgetter(1))
oldusers = [k for k, v in userposts]
oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):])
filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
toxlevels = []
for (i, post) in enumerate(filteredposts):
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
for a in post['Answers']:
if a['Id'] in cachedsentiments.keys():
toxlevel = cachedsentiments[a['Id']]
else:
print("Sentiment not found for " + a['Id'])
toxlevels.append(toxlevel)
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")
neglevelsflat = [item['neg'] for item in toxlevels]
neulevelsflat = [item['neu'] for item in toxlevels]
poslevelsflat = [item['pos'] for item in toxlevels]
comlevelsflat = [item['compound'] for item in toxlevels]
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
axs[0, 0].set_title('Neg')
axs[1, 0].set_title('Neu')
axs[0, 1].set_title('Pos')
axs[1, 1].set_title('Compound')
axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100))
axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100))
axs[0, 0].set_yscale('log')
axs[1, 0].set_yscale('log')
axs[0, 1].set_yscale('log')
axs[1, 1].set_yscale('log')
# plt.show()
fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
figsaver.join()
figsaver.join()
def computeToxLevel(text):
    """Return the VADER polarity scores for ``text``.

    Thin wrapper around the module-level ``analyser``
    (a ``SentimentIntensityAnalyzer``): the result of
    ``analyser.polarity_scores(text)`` is returned unchanged.  Elsewhere in
    this file such score dicts are read with the keys ``'neg'``, ``'neu'``,
    ``'pos'`` and ``'compound'``.

    :param text: the text to score (callers pass an answer's ``'Body'``).
    :return: whatever ``analyser.polarity_scores`` returns for ``text``.
    """
    return analyser.polarity_scores(text)
def flatmap(arr):
    """Flatten one level of nesting.

    :param arr: an iterable whose elements are themselves iterable.
    :return: a new list holding the items of every inner iterable, in order.
        Empty inner iterables contribute nothing; an empty ``arr`` yields
        ``[]``.  Only one level is flattened -- nested items are kept as-is.
    """
    return [item for sublist in arr for item in sublist]
def dumptoxlevels(lvls, filename):
with open(filename, "w") as file:
file.write("from collections import defaultdict\n\n")