meh
This commit is contained in:
197
box_sentiment.py
Normal file
197
box_sentiment.py
Normal file
@@ -0,0 +1,197 @@
|
||||
import operator
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import timedelta
|
||||
from math import ceil
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK
|
||||
from loader import load, dmt, cms
|
||||
|
||||
# Cutoff used by main() to pick "old users" per interval: authors are sorted
# ascending by their number of posts in the interval and only those at or
# above this fraction of the ranking (i.e. the most active 5%) are kept.
OLD_USER_PERCENTILE = 0.95
|
||||
|
||||
# Color names, presumably a plotting palette; NOTE(review): not referenced in
# the visible part of this file - confirm whether it is still needed.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||
|
||||
|
||||
def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN when the list is empty.

    ``np.average`` on an empty list only yields NaN after emitting a
    RuntimeWarning; being explicit keeps the output clean and makes the
    "no data for this category" case obvious (it shows as a gap in the plot).
    """
    return np.average(values) if values else float("nan")


def _collect_sentiments(posts, cachedsentiments):
    """Return the cached sentiment record of every answer to ``posts``.

    Answers without a cached sentiment are reported on stdout and skipped,
    so a missing entry can neither raise nor duplicate the previous answer's
    record.
    """
    total = str(len(posts))
    found = []
    for i, post in enumerate(posts):
        printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + total)
        for answer in post['Answers']:
            answer_id = answer['Id']
            if answer_id in cachedsentiments:
                found.append(cachedsentiments[answer_id])
            else:
                # str() because the id type is not known to be a string.
                print("Sentiment not found for " + str(answer_id))
    return found


def _boxplot_group(filteredposts, cachedsentiments, title, outfile, start):
    """Box-plot the answer sentiments for ``filteredposts`` into ``outfile``.

    Args:
        filteredposts: posts whose answers are analysed.
        cachedsentiments: mapping answer id -> sentiment record; each record
            must provide a numeric ``'compound'`` entry.
        title: first part of the plot title (the sample sizes are appended).
        outfile: path of the PNG file to write.
        start: ``cms()`` timestamp used for the "took ...ms" report.

    Returns:
        Tuple ``(negative, neutral, positive, all)`` with the average compound
        score of each category; NaN for a category without any value.
    """
    toxlevels = _collect_sentiments(filteredposts, cachedsentiments)
    progress = ("computing toxic levels: post " + str(len(filteredposts))
                + "/" + str(len(filteredposts)))
    printnoln("\r" + progress + " ... plotting ...")

    # Split the compound scores into three bands around zero (+/- 0.05).
    compound = [item['compound'] for item in toxlevels]
    neg = [c for c in compound if c < -0.05]
    neu = [c for c in compound if -0.05 <= c <= 0.05]
    pos = [c for c in compound if c > 0.05]

    fig, axs = plt.subplots(figsize=(16, 12))
    axs.boxplot([neg, neu, pos])
    axs.set_xticklabels(['negative', 'neutral', 'positive'])
    # First n = number of posts, second n = number of answers with sentiment.
    axs.set_title(title + ", n=" + str(len(filteredposts)) + "\nn = " + str(len(toxlevels)))
    printnoln("\r" + progress + " ... plotting ... saving ...")
    fig.savefig(outfile, bbox_inches='tight')
    plt.close(fig)
    rprint(progress + " ... plotting ... saving ... took " + str(cms() - start) + "ms")

    return tuple(_mean_or_nan(values) for values in (neg, neu, pos, compound))


def _plot_averages(labels, averages, title, outfile):
    """Plot the per-interval category averages as lines and save to ``outfile``.

    ``labels`` and ``averages`` must have one entry per processed interval;
    each entry of ``averages`` is a tuple as returned by ``_boxplot_group``.
    """
    fig = plt.figure(figsize=(16, 12))
    for idx, name in enumerate(('negative', 'neutral', 'positive', 'all')):
        plt.plot(labels, [avg[idx] for avg in averages], label=name)
    plt.legend(loc="upper right")
    plt.xticks(rotation=90)
    plt.title(title)
    fig.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def _merge_to_pdf(pngs, pdf):
    """Run the IMAGE_MAGICK command to combine ``pngs`` into ``pdf``.

    Does nothing when there are no images. File names are shell-quoted
    (POSIX rules) so paths containing spaces or shell metacharacters are
    passed through intact.
    """
    import shlex  # local import: only needed here

    if not pngs:
        return
    os.system(IMAGE_MAGICK + " " + " ".join(shlex.quote(path) for path in pngs + [pdf]))


def main(folder, intervl):
    """Create sentiment box plots and average-sentiment charts for a dataset.

    For every interval returned by ``calc_intervals(posts, intervl)`` that
    contains posts, two box plots of the answers' compound sentiment are
    written to ``<folder>/output/boxsentiment/``:

    * new users: posts created no later than DAYS_NEW_USER days after the
      author's first contribution;
    * old users: posts of the authors ranked at or above OLD_USER_PERCENTILE
      when sorted by number of posts in the interval.

    The per-interval PNGs of each group are merged into one PDF, and one line
    chart per group shows the average score per interval.

    Args:
        folder: dataset folder; must contain ``output/sentiments.py`` exposing
            an ``answers`` mapping of answer id -> sentiment record.
        intervl: interval size, passed unchanged to ``calc_intervals``.
    """
    # Only the posts and the first-contribution dates are needed here.
    _, posts, firstcontrib, _ = load(folder)

    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    cachedsentiments = imprt(folder + "/output/sentiments.py").answers
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/boxsentiment/"
    os.makedirs(outputdir, exist_ok=True)

    dataset = folder.split("/")[-1]
    newpngs = []
    oldpngs = []
    newavgs = []
    oldavgs = []
    # Labels of the intervals that were actually processed. Empty intervals
    # are skipped below, so the labels must be collected alongside the
    # averages to keep x and y the same length when plotting.
    labels = []

    for (option_date_from, option_date_to) in intervals:
        # get questions for option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
        if len(newposts) == 0:
            continue

        date_from = option_date_from.strftime("%d-%m-%Y")
        date_to = option_date_to.strftime("%d-%m-%Y")
        print("computing toxic levels: " + date_from + " to " + date_to)
        labels.append(date_from + " - " + date_to)

        suffix = dataset + "_" + date_from + "_" + date_to + ".png"
        newpng = outputdir + "boxsent_newusers_" + suffix
        oldpng = outputdir + "boxsent_oldusers_" + suffix

        # new users --------------------------------------------------------
        start = cms()
        printnoln("computing toxic levels: filtering")
        deadline = timedelta(days=DAYS_NEW_USER)
        filteredposts = [
            post for post in newposts
            if not firstcontrib[post['OwnerUserId']] + deadline < post['CreationDate']
        ]
        newavgs.append(_boxplot_group(
            filteredposts,
            cachedsentiments,
            "Sentiment categorization of answers to posts within 1 week of 1st contribution\nPosts created between "
            + date_from + " to " + date_to,
            newpng,
            start,
        ))
        newpngs.append(newpng)

        # old users --------------------------------------------------------
        start = cms()
        newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
        userposts = {u: 0 for u in newuserids}
        for p in newposts:
            userposts[p['OwnerUserId']] += 1
        # Ascending by post count; the tail of the ranking is the most active.
        ranked = [u for u, _count in sorted(userposts.items(), key=operator.itemgetter(1))]
        oldusers = set(ranked[ceil(len(ranked) * OLD_USER_PERCENTILE):])
        filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
        oldavgs.append(_boxplot_group(
            filteredposts,
            cachedsentiments,
            "Sentiment categorization of answers to posts from old users\nPosts created between "
            + date_from + " to " + date_to,
            oldpng,
            start,
        ))
        oldpngs.append(oldpng)

    _merge_to_pdf(newpngs, outputdir + "boxsent_newusers.pdf")
    _merge_to_pdf(oldpngs, outputdir + "boxsent_oldusers.pdf")

    _plot_averages(labels, newavgs, "Sentiment categorization for posts from new users",
                   outputdir + "avgsentnewusers.png")
    _plot_averages(labels, oldavgs, "Sentiment categorization for posts from old users",
                   outputdir + "avgsentoldusers.png")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # execute only if run as a script
    #
    # Command line: <folder> [-i<interval>]
    #   <folder>       existing dataset folder handed to main()
    #   -i<interval>   optional integer from 1 to 12 (default 3), handed to
    #                  main() as the interval size
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 3
    if len(sys.argv) >= 3:
        option = sys.argv[2]
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            # The value is glued to the flag, e.g. "-i6".
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)
|
||||
Reference in New Issue
Block a user