This commit is contained in:
wea_ondara
2019-07-16 11:51:40 +02:00
parent a557cbd5b9
commit bca211551c
4 changed files with 192 additions and 61 deletions

View File

@@ -8,6 +8,7 @@ import matplotlib.pyplot as plt
from collections import defaultdict
from loader import load, dmt, cms
import math
from common import calc_intervals
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
@@ -23,6 +24,7 @@ def main(folder):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts)
cachedsentiments = {}
postcounts = range(1, 5 + 1)
for (option_date_from, option_date_to) in intervals:
@@ -79,7 +81,11 @@ def main(folder):
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
userid = post['OwnerUserId']
for a in post['Answers']:
toxlevel = computeToxLevel(a['Body'])
if a['Id'] in cachedsentiments.keys():
toxlevel = cachedsentiments[a['Id']]
else:
toxlevel = computeToxLevel(a['Body'])
cachedsentiments[a['Id']] = toxlevel
toxlevels[userid].append(toxlevel)
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
@@ -99,23 +105,17 @@ def main(folder):
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
axs[0, 0].set_title('Neg')
axs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100))
axs[1, 0].set_title('Neu')
axs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100))
axs[0, 1].set_title('Pos')
axs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100))
axs[1, 1].set_title('Compound')
axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100))
axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100))
# global
# gaxs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
# gaxs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
# gaxs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
# gaxs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100), label=str(option_posts) + " posts")
# gaxs[0, 0].hist(neglevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
# gaxs[1, 0].hist(neulevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
# gaxs[0, 1].hist(poslevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
# gaxs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), label=str(option_posts) + " posts")
axs[0, 0].set_yscale('log')
axs[1, 0].set_yscale('log')
axs[0, 1].set_yscale('log')
axs[1, 1].set_yscale('log')
# plt.show()
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts\nUsers registered between "
@@ -124,22 +124,18 @@ def main(folder):
plt.close(fig)
# global
gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), alpha=1. / len(postcounts), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[0, 0].hist(gneg, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[1, 0].hist(gneu, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[0, 1].hist(gpos, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
# gaxs[1, 1].hist(gcom, np.linspace(-1, 1, 2 * 100), stacked=True, color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[0, 0].legend(loc="upper right")
gaxs[1, 0].legend(loc="upper right")
gaxs[0, 1].legend(loc="upper right")
gaxs[1, 1].legend(loc="upper right")
gaxs[0, 0].set_yscale('log')
gaxs[1, 0].set_yscale('log')
gaxs[0, 1].set_yscale('log')
gaxs[1, 1].set_yscale('log')
gfig.suptitle("Sentiment of answers to the first X (max) posts\nUsers registered between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
gfig.savefig(goutfilename + ".png", bbox_inches='tight')
plt.close(gfig)
@@ -159,34 +155,6 @@ def dumptoxlevels(lvls, filename):
file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")
def calc_intervals(posts):
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults()
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
# calc quarter beginning
firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if firstpost.month not in (1, 4, 7, 10):
firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month])
lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if lastpost.month not in (1, 4, 7, 10):
lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month])
# add 3 months to last post
if lastpost.month == 10:
lastpost = lastpost.replace(month=1, year=lastpost.year + 1)
else:
lastpost = lastpost.replace(month=lastpost.month + 3)
cdate = firstpost
intervals = []
while cdate < lastpost:
nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1))
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
intervals.append((cdate, nextquarter))
cdate = nextquarter
# sys.exit(0)
return intervals
if __name__ == "__main__":
# execute only if run as a script
usage = sys.argv[0] + " <folder>"

View File

@@ -7,6 +7,9 @@ from scipy.stats import ks_2samp
from collections import defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
def main(folder):
@@ -70,19 +73,40 @@ def plotbypost(onlyfiles):
f2 = l[i + 1]
f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[p][i]) + "; ks neu = " + str(changes_neu[p][i])
+ "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n")
# pval
for (p, l) in files.items():
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
pval = [x.pvalue for x in changes]
plt.plot(x, pval, label=type + ".pval", color=colors[type])
mean = np.mean(pval)
std = np.std(pval)
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with max " + str(p) + " posts")
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.savefig(folder + "/ks_pval_" + str(p) + ".png", bbox_inches='tight')
plt.close(fig)
# stat
for (p, l) in files.items():
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
stat = [x.statistic for x in changes]
pval = [x.pvalue for x in changes]
plt.plot(x, stat, label=type + ".stat")
plt.plot(x, pval, label=type + ".pval")
plt.plot(x, stat, label=type + ".stat", color=colors[type])
mean = np.mean(stat)
std = np.std(stat)
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with max " + str(p) + " posts")
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.savefig(folder + "/ks_" + str(p) + ".png", bbox_inches='tight')
plt.savefig(folder + "/ks_stat_" + str(p) + ".png", bbox_inches='tight')
plt.close(fig)
@@ -134,18 +158,40 @@ def plotbydate(onlyfiles):
f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[d][i]) + "; ks neu = " + str(changes_neu[d][i])
+ "; ks pos = " + str(changes_pos[d][i]) + "; ks com = " + str(changes_com[d][i]) + "\n")
# pval
for (d, l) in files.items():
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
pval = [x.pvalue for x in changes]
plt.plot(x, pval, label=type + ".pval", color=colors[type])
mean = np.mean(pval)
std = np.std(pval)
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.savefig(folder + "/ks_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.close(fig)
# stat
for (d, l) in files.items():
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
stat = [x.statistic for x in changes]
pval = [x.pvalue for x in changes]
plt.plot(x, stat, label=type + ".stat")
plt.plot(x, pval, label=type + ".pval")
plt.plot(x, stat, label=type + ".stat", color=colors[type])
mean = np.mean(stat)
std = np.std(stat)
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.savefig(folder + "/ks_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.savefig(folder + "/ks_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.close(fig)

29
common.py Normal file
View File

@@ -0,0 +1,29 @@
from loader import load, dmt, cms
def calc_intervals(posts):
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults()
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
# calc quarter beginning
firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if firstpost.month not in (1, 4, 7, 10):
firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month])
lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if lastpost.month not in (1, 4, 7, 10):
lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month])
# add 3 months to last post
if lastpost.month == 10:
lastpost = lastpost.replace(month=1, year=lastpost.year + 1)
else:
lastpost = lastpost.replace(month=lastpost.month + 3)
cdate = firstpost
intervals = []
while cdate < lastpost:
nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1))
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
intervals.append((cdate, nextquarter))
cdate = nextquarter
# sys.exit(0)
return intervals

88
posthist.py Normal file
View File

@@ -0,0 +1,88 @@
from datetime import datetime
from datetime import timedelta
import sys
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from loader import load, dmt, cms
import math
from common import calc_intervals
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts)
for (option_date_from, option_date_to) in intervals:
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
# filter posts by option_date_from <= creation date <= option_date_to
newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults())
newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newusers, "filtering posts by users").getresults()
postcounts = defaultdict(list)
i = 0
for p in newposts:
postcounts[p['OwnerUserId']].append(p)
i = i + 1
postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
# print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()])))
os.system("mkdir -p " + folder + "/output")
histfilename = folder + "/output/posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
countfilename = folder + "/output/postcount_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
# fig = plt.figure(figsize=(16, 12))
# plt.plot(userids, [len(pc) for pc in postcounts])
# plt.title("Post count for users between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y"))
# plt.xticks(rotation=90)
# fig.savefig(countfilename + ".png", bbox_inches='tight')
# plt.close(fig)
histdata = [pc for pc in postcounts.values()]
fig = plt.figure(figsize=(16, 12))
plt.hist(histdata, range(max(histdata, default=0) + 1))
plt.yscale('log')
plt.ylim(bottom=0)
plt.title("Histogram for user post count registered between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y"))
fig.savefig(histfilename + ".png", bbox_inches='tight')
plt.close(fig)
def computeToxLevel(text):
return analyser.polarity_scores(text)
def flatmap(arr):
return [item for sublist in arr for item in sublist]
def dumptoxlevels(lvls, filename):
with open(filename, "w") as file:
file.write("from collections import defaultdict\n\n")
file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")
if __name__ == "__main__":
# execute only if run as a script
usage = sys.argv[0] + " <folder>"
if len(sys.argv) < 2:
print(usage)
sys.exit(1)
folder = sys.argv[1]
if not os.path.isdir(folder):
print(folder + " is not a folder")
sys.exit(1)
main(folder)