wip
This commit is contained in:
105
analyze_batch.py
105
analyze_batch.py
@@ -1,13 +1,15 @@
|
||||
import operator
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import timedelta
|
||||
from math import ceil
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||
|
||||
from common import calc_intervals
|
||||
from common import calc_intervals, imprt, FigSaver
|
||||
from loader import load, dmt, cms
|
||||
|
||||
printnoln = lambda text: print(text, end='', flush=True)
|
||||
@@ -15,8 +17,10 @@ rprint = lambda text: print('\r' + text)
|
||||
|
||||
DAYS_NEW_USER = 7
|
||||
OLD_USER_YEAR = 3
|
||||
OLD_USER_PERCENTILE = 0.95
|
||||
|
||||
analyser = SentimentIntensityAnalyzer()
|
||||
figsaver = FigSaver()
|
||||
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||
|
||||
|
||||
@@ -24,7 +28,10 @@ def main(folder):
|
||||
users, posts, firstcontrib, sumcontrib = load(folder)
|
||||
|
||||
intervals = calc_intervals(posts)
|
||||
cachedsentiments = {}
|
||||
cachedsentiments = imprt(folder + "/output/sentiments.py").answers
|
||||
|
||||
outfolder = folder + "/output/batch/"
|
||||
os.system("mkdir -p " + outfolder)
|
||||
|
||||
postcounts = range(1, 5 + 1)
|
||||
for (option_date_from, option_date_to) in intervals:
|
||||
@@ -44,8 +51,6 @@ def main(folder):
|
||||
gpos = []
|
||||
gcom = []
|
||||
|
||||
outfolder = folder + "/output/batch/"
|
||||
os.system("mkdir -p " + outfolder)
|
||||
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
||||
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
||||
|
||||
@@ -55,7 +60,7 @@ def main(folder):
|
||||
# compute toxic levels
|
||||
start = cms()
|
||||
printnoln("computing toxic levels: filtering")
|
||||
toxlevels = defaultdict(list)
|
||||
toxlevels = []
|
||||
searchedposts = defaultdict(int)
|
||||
filteredposts = []
|
||||
for (i, post) in enumerate(newposts):
|
||||
@@ -73,26 +78,22 @@ def main(folder):
|
||||
filteredposts.append(post)
|
||||
|
||||
for (i, post) in enumerate(filteredposts):
|
||||
if (i + 1) % 100 == 0:
|
||||
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
||||
if (i + 1) == len(newposts):
|
||||
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
||||
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
|
||||
for a in post['Answers']:
|
||||
if a['Id'] in cachedsentiments.keys():
|
||||
toxlevel = cachedsentiments[a['Id']]
|
||||
else:
|
||||
toxlevel = computeToxLevel(a['Body'])
|
||||
cachedsentiments[a['Id']] = toxlevel
|
||||
toxlevels[post['Id']].append(toxlevel)
|
||||
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
|
||||
print("Sentiment not found for " + a['Id'])
|
||||
toxlevels.append(toxlevel)
|
||||
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
|
||||
|
||||
outfilename = goutfilenamenewusers + "_" + str(option_posts)
|
||||
dumptoxlevels(toxlevels, outfilename + ".py")
|
||||
|
||||
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())]
|
||||
neulevelsflat = [item['neu'] for item in flatmap(toxlevels.values())]
|
||||
poslevelsflat = [item['pos'] for item in flatmap(toxlevels.values())]
|
||||
comlevelsflat = [item['compound'] for item in flatmap(toxlevels.values())]
|
||||
neglevelsflat = [item['neg'] for item in toxlevels]
|
||||
neulevelsflat = [item['neu'] for item in toxlevels]
|
||||
poslevelsflat = [item['pos'] for item in toxlevels]
|
||||
comlevelsflat = [item['compound'] for item in toxlevels]
|
||||
|
||||
gneg.append(neglevelsflat)
|
||||
gneu.append(neulevelsflat)
|
||||
@@ -116,10 +117,15 @@ def main(folder):
|
||||
# plt.show()
|
||||
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
|
||||
+ option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
||||
# figsaver.save(fig, outfilename + ".png", bbox_inches='tight')
|
||||
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
|
||||
fig.savefig(outfilename + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
||||
|
||||
# global
|
||||
start = cms()
|
||||
printnoln("\rglobal plot post ... plotting ...")
|
||||
gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
||||
gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
||||
gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
||||
@@ -132,19 +138,74 @@ def main(folder):
|
||||
gaxs[1, 0].set_yscale('log')
|
||||
gaxs[0, 1].set_yscale('log')
|
||||
gaxs[1, 1].set_yscale('log')
|
||||
gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
||||
gfig.suptitle(
|
||||
"Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime(
|
||||
"%d-%m-%Y"))
|
||||
# figsaver.save(gfig, goutfilenamenewusers + ".png", bbox_inches='tight')
|
||||
printnoln("\rglobal plot post ... plotting ... saving ...")
|
||||
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
|
||||
plt.close(gfig)
|
||||
rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
||||
|
||||
# for old users ---------------------------------------------------------------------------------
|
||||
start = cms()
|
||||
newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
|
||||
userposts = {u: 0 for u in newuserids}
|
||||
for p in newposts:
|
||||
userposts[p['OwnerUserId']] += 1
|
||||
userposts = sorted(userposts.items(), key=operator.itemgetter(1))
|
||||
oldusers = [k for k, v in userposts]
|
||||
oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):])
|
||||
filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
|
||||
|
||||
toxlevels = []
|
||||
for (i, post) in enumerate(filteredposts):
|
||||
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
|
||||
for a in post['Answers']:
|
||||
if a['Id'] in cachedsentiments.keys():
|
||||
toxlevel = cachedsentiments[a['Id']]
|
||||
else:
|
||||
print("Sentiment not found for " + a['Id'])
|
||||
toxlevels.append(toxlevel)
|
||||
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
|
||||
|
||||
dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")
|
||||
|
||||
neglevelsflat = [item['neg'] for item in toxlevels]
|
||||
neulevelsflat = [item['neu'] for item in toxlevels]
|
||||
poslevelsflat = [item['pos'] for item in toxlevels]
|
||||
comlevelsflat = [item['compound'] for item in toxlevels]
|
||||
|
||||
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
|
||||
axs[0, 0].set_title('Neg')
|
||||
axs[1, 0].set_title('Neu')
|
||||
axs[0, 1].set_title('Pos')
|
||||
axs[1, 1].set_title('Compound')
|
||||
axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100))
|
||||
axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100))
|
||||
axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100))
|
||||
axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100))
|
||||
axs[0, 0].set_yscale('log')
|
||||
axs[1, 0].set_yscale('log')
|
||||
axs[0, 1].set_yscale('log')
|
||||
axs[1, 1].set_yscale('log')
|
||||
|
||||
# plt.show()
|
||||
fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
||||
# figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight')
|
||||
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
|
||||
fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
||||
|
||||
figsaver.join()
|
||||
figsaver.join()
|
||||
|
||||
|
||||
def computeToxLevel(text):
    """Return the sentiment scores of *text*.

    Delegates to ``polarity_scores`` on the module-level ``analyser``
    (a ``SentimentIntensityAnalyzer`` instance) and returns its result
    unchanged.
    NOTE(review): callers index the result with 'neg', 'neu', 'pos' and
    'compound' -- presumably the keys VADER returns; confirm against the
    library documentation.
    """
    return analyser.polarity_scores(text)
|
||||
|
||||
|
||||
def flatmap(arr):
    """Flatten *arr* by exactly one level and return the result as a list.

    *arr* is an iterable of iterables; the items of each inner iterable are
    concatenated in order. Deeper nesting is left untouched, and an empty
    *arr* yields an empty list.
    """
    return [item for sublist in arr for item in sublist]
|
||||
|
||||
|
||||
def dumptoxlevels(lvls, filename):
|
||||
with open(filename, "w") as file:
|
||||
file.write("from collections import defaultdict\n\n")
|
||||
|
||||
Reference in New Issue
Block a user