124 lines
4.4 KiB
Python
124 lines
4.4 KiB
Python
from datetime import datetime
|
|
from datetime import timedelta
|
|
import sys
|
|
import os
|
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
from collections import defaultdict
|
|
from loader import load, dmt
|
|
|
|
def printnoln(text):
    """Print *text* without a trailing newline and flush stdout right away.

    Used for progress output that is later overwritten on the same line.
    """
    print(text, end='', flush=True)


def rprint(text):
    """Print *text* preceded by a carriage return, followed by a newline.

    The carriage return moves the cursor to the start of the current line so
    that *text* overwrites progress output written by ``printnoln``.
    """
    print('\r' + text)
|
|
|
|
# Length, in days, of the window after a user's first contribution; main()
# skips posts created later than that window.
DAYS_NEW_USER: int = 7

# NOTE(review): not referenced in the visible part of this file; presumably a
# threshold in years for treating a user as "old" -- confirm against callers.
OLD_USER_YEAR: int = 3
|
|
|
|
# Module-level VADER sentiment analyser, built once at import time and reused
# by computeToxLevel() for every scored text.
analyser = SentimentIntensityAnalyzer()
|
|
|
|
|
|
def main(folder, option_date_from, option_date_to, option_posts):
    """Plot sentiment histograms of the answers to new users' early posts.

    Loads the data set in *folder*, keeps the users whose ``CreationDate``
    lies in the half-open interval ``[option_date_from, option_date_to)`` and
    walks over their posts.  A post is considered only if it was created no
    later than ``DAYS_NEW_USER`` days after its owner's first contribution,
    and only the first *option_posts* such posts per user are used.  Every
    answer of a considered post is scored with ``computeToxLevel`` and the
    'neg', 'neu', 'pos' and 'compound' scores are drawn as four histograms
    that are saved as a PNG below ``output/analyze/``.

    Args:
        folder: Path handed to ``load``; its last path component becomes part
            of the output file name.
        option_date_from: Inclusive lower bound for the user creation date.
        option_date_to: Exclusive upper bound for the user creation date.
        option_posts: Maximum number of posts considered per user.
    """
    # The fourth value returned by load() is not needed here.
    users, posts, firstcontrib, _ = load(folder)

    # filter users by option_date_from <= creation date < option_date_to
    newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults()
    newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults())

    # get questions for filtered users
    newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newuserids, "filter posts by first contrib").getresults()

    # compute toxic levels
    print("computing toxic levels")
    toxlevels = defaultdict(list)
    searchedposts = defaultdict(int)
    total = len(newposts)
    new_user_window = timedelta(days=DAYS_NEW_USER)
    for i, post in enumerate(newposts, start=1):
        # Progress: refresh the same line every 100 posts, finish with a newline.
        if i % 100 == 0:
            printnoln("\rpost #" + str(i) + "/" + str(total))
        if i == total:
            rprint("post #" + str(i) + "/" + str(total))
        userid = post['OwnerUserId']

        # check first contribution
        if firstcontrib[userid] + new_user_window < post['CreationDate']:
            continue

        # no more than option_posts posts from one user
        searchedposts[userid] += 1
        if searchedposts[userid] > option_posts:
            continue

        for answer in post['Answers']:
            toxlevels[userid].append(computeToxLevel(answer['Body']))

    # Flatten the per-user score lists once and reuse them for all four plots.
    allscores = flatmap(toxlevels.values())
    neglevelsflat = [item['neg'] for item in allscores]
    neulevelsflat = [item['neu'] for item in allscores]
    poslevelsflat = [item['pos'] for item in allscores]
    comlevelsflat = [item['compound'] for item in allscores]

    bins = np.linspace(-1, 1, 2 * 100)
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    axs[0, 0].set_title('Neg')
    axs[0, 0].hist(neglevelsflat, bins)
    axs[1, 0].set_title('Neu')
    axs[1, 0].hist(neulevelsflat, bins)
    axs[0, 1].set_title('Pos')
    axs[0, 1].hist(poslevelsflat, bins)
    axs[1, 1].set_title('Compound')
    axs[1, 1].hist(comlevelsflat, bins)

    # Create the output directory without spawning a shell.
    os.makedirs("output/analyze/", exist_ok=True)
    # normpath drops a trailing separator, so "data/site/" still yields "site"
    # instead of an empty name.
    foldername = os.path.basename(os.path.normpath(folder))
    pltfile = "output/analyze/" + foldername + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "_" + str(option_posts) + ".png"
    plt.savefig(pltfile)
    plt.close(fig)
|
|
|
|
|
|
def computeToxLevel(text):
    """Score *text* with the module-level sentiment analyser.

    Returns the result of ``analyser.polarity_scores(text)`` unchanged; the
    caller in this file reads its 'neg', 'neu', 'pos' and 'compound' entries.
    """
    scores = analyser.polarity_scores(text)
    return scores
|
|
|
|
|
|
def flatmap(arr):
    """Flatten one level of nesting: return a list with the items of every
    iterable contained in *arr*, in order."""
    flattened = []
    for group in arr:
        flattened.extend(group)
    return flattened
|
|
|
|
|
|
def _parse_cli(argv):
    """Turn the command line *argv* into the arguments expected by ``main``.

    Expected form::

        <prog> <folder> [--from <%d-%m-%Y>] [--to <%d-%m-%Y>] [--posts <n>]

    Defaults: the date range is the last 90 days (3 * 30) up to now, and two
    posts per user.

    Returns:
        A tuple ``(folder, date_from, date_to, posts)``.

    Exits the process with status 1 (after printing a message and/or the
    usage line) when the folder is missing or is not a directory, when an
    option is unknown or has no value, or when a value cannot be parsed.
    """
    usage = argv[0] + " <folder> [--from <%d-%m-%Y>] [--to <%d-%m-%Y>] [--posts <#posts e.g. 2>]"

    def fail(message=None):
        # Report the problem (if any), show the usage line and stop.
        if message is not None:
            print(message)
        print(usage)
        sys.exit(1)

    if len(argv) < 2:
        fail()
    folder = argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    # Read the clock once so the default range is exactly 90 days wide.
    now = datetime.today()
    date_from = now - timedelta(days=3 * 30)
    date_to = now
    posts = 2

    i = 2
    while i < len(argv):
        option = argv[i]
        if option not in ("--from", "--to", "--posts"):
            fail(option + " is not a known option")
        if i + 1 >= len(argv):
            # Previously a trailing option without a value was silently ignored.
            fail(option + " requires a value")
        value = argv[i + 1]

        if option == "--posts":
            # isdecimal() only accepts characters int() can parse, unlike
            # isdigit(), which also accepts e.g. superscript digits.
            if not value.isdecimal():
                fail(value + " is not a number")
            posts = int(value)
        else:
            try:
                parsed = datetime.strptime(value, "%d-%m-%Y")
            except ValueError:
                fail(value + " is not a valid date")
            if option == "--from":
                date_from = parsed
            else:
                date_to = parsed
        i += 2

    return folder, date_from, date_to, posts


if __name__ == "__main__":
    # execute only if run as a script
    main(*_parse_cli(sys.argv))
|