This commit is contained in:
wea_ondara
2019-05-21 13:35:04 +02:00
commit 532f3ca381
5 changed files with 870 additions and 0 deletions

123
analyze.py Normal file
View File

@@ -0,0 +1,123 @@
from datetime import datetime
from datetime import timedelta
import sys
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from loader import load, dmt
def printnoln(text):
    """Print *text* without a trailing newline and flush stdout immediately."""
    print(text, end='', flush=True)


def rprint(text):
    """Print *text* preceded by a carriage return, followed by a newline."""
    print('\r' + text)
# Number of days after a user's first contribution during which their posts
# are still taken into account by main().
DAYS_NEW_USER = 7
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably a threshold in years for "old" users -- confirm or remove.
OLD_USER_YEAR = 3
# Single shared sentiment analyser instance, reused by computeToxLevel().
analyser = SentimentIntensityAnalyzer()
def main(folder, option_date_from, option_date_to, option_posts):
    """Plot sentiment histograms for the answers to new users' early posts.

    Loads the data set in *folder*, selects the users whose ``CreationDate``
    lies in ``[option_date_from, option_date_to)`` and the posts owned by
    them. For each such user, at most *option_posts* posts created within
    ``DAYS_NEW_USER`` days of that user's first contribution are considered,
    and the body of every answer attached to those posts is scored with
    ``computeToxLevel``. The 'neg', 'neu', 'pos' and 'compound' scores are
    drawn as a 2x2 grid of histograms and saved as a PNG below
    ``output/analyze/``.

    Args:
        folder: Path of the data set directory handed to ``load``; its last
            path component becomes part of the output file name.
        option_date_from: Inclusive lower bound (datetime) on user creation.
        option_date_to: Exclusive upper bound (datetime) on user creation.
        option_posts: Maximum number of posts considered per user.
    """
    # The fourth value returned by load() is not needed here.
    users, posts, firstcontrib, _ = load(folder)

    # filter users by option_date_from <= creation date < option_date_to
    newusers = dmt(users).filter(
        lambda u: option_date_from <= u['CreationDate'] < option_date_to,
        "filtering users by creation").getresults()
    newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults())

    # get the posts owned by the filtered users
    newposts = dmt(posts).filter(
        lambda p: p['OwnerUserId'] in newuserids,
        "filter posts by first contrib").getresults()

    # compute toxic levels
    print("computing toxic levels")
    toxlevels = defaultdict(list)
    searchedposts = defaultdict(int)
    total = len(newposts)
    for (i, post) in enumerate(newposts):
        # progress output: refresh every 100 posts, finish the line on the last one
        if (i + 1) % 100 == 0:
            printnoln("\rpost #" + str(i + 1) + "/" + str(total))
        if (i + 1) == total:
            rprint("post #" + str(i + 1) + "/" + str(total))

        userid = post['OwnerUserId']

        # skip posts created after the user's "new user" window has ended
        # (presumably firstcontrib maps user id -> datetime of the first
        # contribution; verify against loader.load)
        if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']:
            continue

        # no more than option_posts posts from one user
        searchedposts[userid] += 1
        if searchedposts[userid] > option_posts:
            continue

        for a in post['Answers']:
            toxlevels[userid].append(computeToxLevel(a['Body']))

    # Flatten once instead of once per histogram.
    allscores = flatmap(toxlevels.values())
    bins = np.linspace(-1, 1, 2 * 100)

    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    panels = (
        ((0, 0), 'Neg', 'neg'),
        ((1, 0), 'Neu', 'neu'),
        ((0, 1), 'Pos', 'pos'),
        ((1, 1), 'Compound', 'compound'),
    )
    for position, title, key in panels:
        ax = axs[position]
        ax.set_title(title)
        ax.hist([item[key] for item in allscores], bins)

    # Create the output directory without shelling out, so this also works
    # where no `mkdir -p` is available.
    outdir = os.path.join("output", "analyze")
    os.makedirs(outdir, exist_ok=True)

    # normpath drops a trailing separator so "data/foo/" still yields "foo"
    # rather than an empty name.
    dataset = os.path.basename(os.path.normpath(folder))
    pltfile = os.path.join(
        outdir,
        dataset
        + "_" + option_date_from.strftime("%d-%m-%Y")
        + "_" + option_date_to.strftime("%d-%m-%Y")
        + "_" + str(option_posts) + ".png")
    plt.savefig(pltfile)
    plt.close(fig)
def computeToxLevel(text):
    """Return the shared analyser's polarity scores for *text*.

    The result is whatever ``analyser.polarity_scores`` returns; main() reads
    the 'neg', 'neu', 'pos' and 'compound' entries from it.
    """
    scores = analyser.polarity_scores(text)
    return scores
def flatmap(arr):
    """Concatenate the items of every iterable in *arr* into one new list."""
    flattened = []
    for sublist in arr:
        flattened.extend(sublist)
    return flattened
def parse_options(args, now=None):
    """Parse the optional command line arguments that follow the folder.

    Recognised options are ``--from <%d-%m-%Y>``, ``--to <%d-%m-%Y>`` and
    ``--posts <number>``. Tokens that are not one of these options are
    ignored.

    Args:
        args: Command line tokens after the folder argument.
        now: Reference datetime for the default date range; defaults to
            ``datetime.today()``.

    Returns:
        Tuple ``(date_from, date_to, posts)``. Without options this is
        ``(now - 90 days, now, 2)``.

    Raises:
        ValueError: If an option has no value, a date does not match
            ``%d-%m-%Y``, or the post count is not a non-negative integer.
            The exception message is the text to show to the user.
    """
    if now is None:
        now = datetime.today()
    date_from = now - timedelta(days=3 * 30)
    date_to = now
    posts = 2

    i = 0
    while i < len(args):
        option = args[i]
        if option not in ("--from", "--to", "--posts"):
            i += 1
            continue
        # An option at the very end has no value; report it instead of
        # silently falling back to the default.
        if i + 1 >= len(args):
            raise ValueError(option + " requires a value")
        value = args[i + 1]
        if option == "--posts":
            # isdecimal() only accepts characters int() can convert, unlike
            # isdigit(), which also accepts e.g. superscript digits.
            if not value.isdecimal():
                raise ValueError(value + " is not a number")
            posts = int(value)
        else:
            try:
                parsed = datetime.strptime(value, "%d-%m-%Y")
            except ValueError:
                raise ValueError(value + " is not a valid date") from None
            if option == "--from":
                date_from = parsed
            else:
                date_to = parsed
        i += 2
    return date_from, date_to, posts


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [--from <%d-%m-%Y>] [--to <%d-%m-%Y>] [--posts <#posts e.g. 2>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    try:
        consider_date_from, consider_date_to, consider_posts = parse_options(sys.argv[2:])
    except ValueError as err:
        print(err)
        print(usage)
        sys.exit(1)
    main(folder, consider_date_from, consider_date_to, consider_posts)