This commit is contained in:
wea_ondara
2019-07-29 21:34:34 +02:00
parent 1f699f6b56
commit a14b3af21a
6 changed files with 309 additions and 87 deletions

View File

@@ -1,13 +1,15 @@
import operator
import os
import sys
from collections import defaultdict
from datetime import timedelta
from math import ceil
import matplotlib.pyplot as plt
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from common import calc_intervals
from common import calc_intervals, imprt, FigSaver
from loader import load, dmt, cms
printnoln = lambda text: print(text, end='', flush=True)
@@ -15,8 +17,10 @@ rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
OLD_USER_PERCENTILE = 0.95
analyser = SentimentIntensityAnalyzer()
figsaver = FigSaver()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -24,7 +28,10 @@ def main(folder):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts)
cachedsentiments = {}
cachedsentiments = imprt(folder + "/output/sentiments.py").answers
outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
postcounts = range(1, 5 + 1)
for (option_date_from, option_date_to) in intervals:
@@ -44,8 +51,6 @@ def main(folder):
gpos = []
gcom = []
outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
@@ -55,7 +60,7 @@ def main(folder):
# compute toxic levels
start = cms()
printnoln("computing toxic levels: filtering")
toxlevels = defaultdict(list)
toxlevels = []
searchedposts = defaultdict(int)
filteredposts = []
for (i, post) in enumerate(newposts):
@@ -73,26 +78,22 @@ def main(folder):
filteredposts.append(post)
for (i, post) in enumerate(filteredposts):
if (i + 1) % 100 == 0:
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
if (i + 1) == len(newposts):
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
for a in post['Answers']:
if a['Id'] in cachedsentiments.keys():
toxlevel = cachedsentiments[a['Id']]
else:
toxlevel = computeToxLevel(a['Body'])
cachedsentiments[a['Id']] = toxlevel
toxlevels[post['Id']].append(toxlevel)
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
print("Sentiment not found for " + a['Id'])
toxlevels.append(toxlevel)
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
outfilename = goutfilenamenewusers + "_" + str(option_posts)
dumptoxlevels(toxlevels, outfilename + ".py")
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())]
neulevelsflat = [item['neu'] for item in flatmap(toxlevels.values())]
poslevelsflat = [item['pos'] for item in flatmap(toxlevels.values())]
comlevelsflat = [item['compound'] for item in flatmap(toxlevels.values())]
neglevelsflat = [item['neg'] for item in toxlevels]
neulevelsflat = [item['neu'] for item in toxlevels]
poslevelsflat = [item['pos'] for item in toxlevels]
comlevelsflat = [item['compound'] for item in toxlevels]
gneg.append(neglevelsflat)
gneu.append(neulevelsflat)
@@ -116,10 +117,15 @@ def main(folder):
# plt.show()
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
+ option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, outfilename + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(outfilename + ".png", bbox_inches='tight')
plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
# global
start = cms()
printnoln("\rglobal plot post ... plotting ...")
gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
@@ -132,19 +138,74 @@ def main(folder):
gaxs[1, 0].set_yscale('log')
gaxs[0, 1].set_yscale('log')
gaxs[1, 1].set_yscale('log')
gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
gfig.suptitle(
"Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime(
"%d-%m-%Y"))
# figsaver.save(gfig, goutfilenamenewusers + ".png", bbox_inches='tight')
printnoln("\rglobal plot post ... plotting ... saving ...")
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
plt.close(gfig)
rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
# for old users ---------------------------------------------------------------------------------
start = cms()
newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
userposts = {u: 0 for u in newuserids}
for p in newposts:
userposts[p['OwnerUserId']] += 1
userposts = sorted(userposts.items(), key=operator.itemgetter(1))
oldusers = [k for k, v in userposts]
oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):])
filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
toxlevels = []
for (i, post) in enumerate(filteredposts):
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
for a in post['Answers']:
if a['Id'] in cachedsentiments.keys():
toxlevel = cachedsentiments[a['Id']]
else:
print("Sentiment not found for " + a['Id'])
toxlevels.append(toxlevel)
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")
neglevelsflat = [item['neg'] for item in toxlevels]
neulevelsflat = [item['neu'] for item in toxlevels]
poslevelsflat = [item['pos'] for item in toxlevels]
comlevelsflat = [item['compound'] for item in toxlevels]
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
axs[0, 0].set_title('Neg')
axs[1, 0].set_title('Neu')
axs[0, 1].set_title('Pos')
axs[1, 1].set_title('Compound')
axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100))
axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100))
axs[0, 0].set_yscale('log')
axs[1, 0].set_yscale('log')
axs[0, 1].set_yscale('log')
axs[1, 1].set_yscale('log')
# plt.show()
fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
figsaver.join()
figsaver.join()
def computeToxLevel(text):
    """Score ``text`` with the module-level VADER ``analyser``.

    :param text: the text to analyse
    :return: the result of ``analyser.polarity_scores(text)``, unchanged
    """
    scores = analyser.polarity_scores(text)
    return scores
def flatmap(arr):
    """Flatten one level of nesting.

    :param arr: iterable of iterables
    :return: a new list holding the items of every sub-iterable, in order
    """
    flat = []
    for sublist in arr:
        flat.extend(sublist)
    return flat
def dumptoxlevels(lvls, filename):
with open(filename, "w") as file:
file.write("from collections import defaultdict\n\n")

View File

@@ -1,4 +1,3 @@
import importlib
import os
import sys
from collections import defaultdict
@@ -10,27 +9,36 @@ import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp
from common import imprt
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
IMAGE_MAGICK = "magick"
def main(folder):
    """Run the KS comparisons over the sentiment dumps of one dataset.

    Reads the ``*.py`` dumps found in ``<folder>/output/batch/`` and hands
    them to the plotting functions, which write into
    ``<folder>/output/ksbatch/``.

    :param folder: dataset root directory (path string, no trailing slash)
    """
    outputdir = folder + "/output/ksbatch/"
    # In-process equivalent of `mkdir -p`; no shell involved.
    os.makedirs(outputdir, exist_ok=True)

    # `folder` must stay untouched: rebinding it before deriving `srcfolder`
    # would append "/output/batch/" twice.
    srcfolder = folder + "/output/batch/"

    # List the directory once and split the dumps by user group.
    dumps = sorted(f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py"))
    onlyfiles = [srcfolder + f for f in dumps if "newusers" in f]
    oldfiles = [srcfolder + f for f in dumps if "oldusers" in f]

    plotbypost(onlyfiles, outputdir)
    plotbydate(onlyfiles, outputdir)
    plotbydateold(onlyfiles, oldfiles, outputdir)
def plotbypost(onlyfiles, outputdir):
print("plotbypost")
files = defaultdict(list)
for f in onlyfiles:
s = f[:-3].split("_")
files[int(s[4])].append(f)
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[2], "%d-%m-%Y")) for (p, l) in files.items()}
files[int(s[5])].append(f)
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()}
changes_neg = defaultdict(list)
changes_neu = defaultdict(list)
@@ -45,15 +53,15 @@ def plotbypost(onlyfiles, outputdir):
tox1 = imprt(l[i]).toxlevels
tox2 = imprt(l[i + 1]).toxlevels
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())]
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())]
poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())]
comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())]
neglevelsflat1 = [item['neg'] for item in tox1]
neulevelsflat1 = [item['neu'] for item in tox1]
poslevelsflat1 = [item['pos'] for item in tox1]
comlevelsflat1 = [item['compound'] for item in tox1]
neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())]
neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())]
poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())]
comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())]
neglevelsflat2 = [item['neg'] for item in tox2]
neulevelsflat2 = [item['neu'] for item in tox2]
poslevelsflat2 = [item['pos'] for item in tox2]
comlevelsflat2 = [item['compound'] for item in tox2]
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
@@ -74,7 +82,7 @@ def plotbypost(onlyfiles, outputdir):
+ "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n")
# pval
for (p, l) in files.items():
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
pval = [x.pvalue for x in changes]
@@ -92,7 +100,7 @@ def plotbypost(onlyfiles, outputdir):
# stat
for (p, l) in files.items():
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
stat = [x.statistic for x in changes]
@@ -110,11 +118,12 @@ def plotbypost(onlyfiles, outputdir):
def plotbydate(onlyfiles, outputdir):
print("plotbydate")
files = defaultdict(list)
for f in onlyfiles:
s = f[:-3].split("_")
files[(s[2], s[3])].append(f)
files = {d: sorted(l, key=lambda e: e.split("_")[4]) for (d, l) in files.items()}
files[(s[3], s[4])].append(f)
files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()}
changes_neg = defaultdict(list)
changes_neu = defaultdict(list)
@@ -129,15 +138,15 @@ def plotbydate(onlyfiles, outputdir):
tox1 = imprt(l[i]).toxlevels
tox2 = imprt(l[i + 1]).toxlevels
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())]
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())]
poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())]
comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())]
neglevelsflat1 = [item['neg'] for item in tox1]
neulevelsflat1 = [item['neu'] for item in tox1]
poslevelsflat1 = [item['pos'] for item in tox1]
comlevelsflat1 = [item['compound'] for item in tox1]
neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())]
neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())]
poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())]
comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())]
neglevelsflat2 = [item['neg'] for item in tox2]
neulevelsflat2 = [item['neu'] for item in tox2]
poslevelsflat2 = [item['pos'] for item in tox2]
comlevelsflat2 = [item['compound'] for item in tox2]
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
@@ -159,7 +168,7 @@ def plotbydate(onlyfiles, outputdir):
# pval
for (d, l) in files.items():
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
pval = [x.pvalue for x in changes]
@@ -177,7 +186,7 @@ def plotbydate(onlyfiles, outputdir):
# stat
for (d, l) in files.items():
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
stat = [x.statistic for x in changes]
@@ -194,15 +203,122 @@ def plotbydate(onlyfiles, outputdir):
plt.close(fig)
def imprt(file):
    """Execute the Python source file at ``file`` and return it as a module.

    The module is created from a file-location spec and executed on every
    call; this function does not add it to ``sys.modules``.

    :param file: path of the Python source file to load
    :return: the executed module object
    :raises ImportError: if no spec or loader can be built for ``file``
    """
    # A bare `import importlib` does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly here.
    import importlib.util

    spec = importlib.util.spec_from_file_location("module.name", file)
    if spec is None or spec.loader is None:
        raise ImportError("cannot load module from " + str(file))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
def plotbydateold(onlyfiles, oldfiles, outputdir):
    """Compare new-user sentiment dumps against the old-user dump of the same interval.

    For every date interval, each new-user dump (one per post count) is
    tested against that interval's old-user dump with a two-sample KS test
    on the neg/neu/pos/compound scores. The results are written to one
    ``.log`` file per interval, plotted as p-value and statistic charts,
    and the charts are bundled into PDFs via the ``IMAGE_MAGICK`` command.

    File names are parsed from the end of the ``_``-separated name
    (``..._<from>_<to>_<posts>.py`` for new users, ``..._<from>_<to>.py`` for
    old users), so underscores in the directory or dataset name do not shift
    the fields.

    :param onlyfiles: paths of the new-user dumps
    :param oldfiles: paths of the old-user dumps
    :param outputdir: directory receiving logs, images and PDFs
    :raises KeyError: if an interval has new-user dumps but no old-user dump
    """
    print("plotbydateold")

    # Group new-user dumps by (from, to) interval.
    grouped = defaultdict(list)
    for f in onlyfiles:
        s = _ks_name_parts(f)
        grouped[(s[-3], s[-2])].append(f)
    # Order intervals chronologically: "dd-mm-YYYY" -> "YYYY-mm-dd" as sort key.
    dates = sorted(grouped.keys(), key=lambda e: "-".join(reversed(e[0].split("-"))))
    # Within an interval, order dumps by their trailing post-count field.
    files = {d: sorted(grouped[d], key=lambda e: e.split("_")[-1]) for d in dates}

    old_by_date = {}
    for f in oldfiles:
        s = _ks_name_parts(f)
        old_by_date[(s[-2], s[-1])] = f

    # Short series name -> key of the score inside each toxlevel entry.
    score_keys = {"neg": "neg", "neu": "neu", "pos": "pos", "com": "compound"}
    changes = {kind: defaultdict(list) for kind in score_keys}

    for (d, l) in files.items():
        print(d)
        toxold = imprt(old_by_date[d]).toxlevels
        old_levels = {kind: [item[key] for item in toxold] for kind, key in score_keys.items()}

        for newfile in l:
            tox1 = imprt(newfile).toxlevels
            if len(tox1) == 0 or len(toxold) == 0:
                # Nothing to compare; keep a placeholder so indices still line up with `l`.
                for kind in score_keys:
                    changes[kind][d].append(None)
                continue

            for kind, key in score_keys.items():
                new_levels = [item[key] for item in tox1]
                changes[kind][d].append(ks_2samp(new_levels, old_levels[kind]))

    print("logs")
    for (d, l) in files.items():
        f1 = old_by_date[d]
        with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f:
            for i, f2 in enumerate(l):
                if changes["neg"][d][i] is None:
                    continue
                f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes["neg"][d][i]) + "; ks neu = " + str(changes["neu"][d][i])
                        + "; ks pos = " + str(changes["pos"][d][i]) + "; ks com = " + str(changes["com"][d][i]) + "\n")

    print("pval")
    _plot_ks_attribute(files, changes, "pval", outputdir)

    print("stat")
    _plot_ks_attribute(files, changes, "stat", outputdir)


def _ks_name_parts(path):
    """Return the ``_``-separated fields of a dump path with its ``.py`` suffix removed."""
    return path[:-3].split("_")


def _plot_ks_attribute(files, changes, attr, outputdir):
    """Plot one KS result attribute per interval and join the images into a PDF.

    :param files: ordered mapping interval -> sorted list of new-user dump paths
    :param changes: mapping series name -> (interval -> list of KS results or None)
    :param attr: "pval" or "stat"; selects the result field and names the output files
    :param outputdir: directory receiving the PNGs and the PDF
    """
    field = {"pval": "pvalue", "stat": "statistic"}[attr]
    imgmagickcmd = IMAGE_MAGICK
    for (d, l) in files.items():
        print(d)
        # x axis: the post-count field of each new-user dump.
        x = [_ks_name_parts(f)[-1] for f in l]
        fig = plt.figure(figsize=(16, 12))
        for kind, per_date in changes.items():
            points = [(xx, getattr(c, field)) for xx, c in zip(x, per_date[d]) if c is not None]
            xs = [p[0] for p in points]
            ys = [p[1] for p in points]
            plt.plot(xs, ys, label=kind + "." + attr, color=colors[kind])
            if len(points) == 0:
                continue
            mean = np.mean(ys)
            std = np.std(ys)
            # Highlight values at least one standard deviation away from the mean.
            dev = [(xx, v) for (xx, v) in points if v <= mean - std or v >= mean + std]
            plt.plot(xs, [mean] * len(points), color=colors[kind], ls='dashed')
            plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[kind], ls='None', marker='o')
        plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1])
        plt.xticks(rotation=90)
        plt.legend(loc="upper right")
        outfile = outputdir + "/ks_olddate_" + attr + "_" + d[0] + "_" + d[1] + ".png"
        plt.savefig(outfile, bbox_inches='tight')
        plt.close(fig)
        imgmagickcmd += " " + outfile
    # NOTE(review): the command is passed to the shell as one string, so paths
    # containing spaces or shell metacharacters will break it.
    os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_" + attr + ".pdf")
def filecmp(file1, file2):

View File

@@ -1,8 +1,14 @@
import importlib
from threading import Thread, Lock
import matplotlib.pyplot as plt
from loader import dmt
def calc_intervals(posts):
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults()
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'],
"firstpost").getresults()
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
# calc quarter beginning
@@ -25,5 +31,32 @@ def calc_intervals(posts):
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
intervals.append((cdate, nextquarter))
cdate = nextquarter
# sys.exit(0)
return intervals
def imprt(file):
    """Execute the Python source file at ``file`` and return it as a module.

    The module is created from a file-location spec and executed on every
    call; this function does not add it to ``sys.modules``.

    :param file: path of the Python source file to load
    :return: the executed module object
    :raises ImportError: if no spec or loader can be built for ``file``
    """
    # A bare `import importlib` does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly here.
    import importlib.util

    spec = importlib.util.spec_from_file_location("module.name", file)
    if spec is None or spec.loader is None:
        raise ImportError("cannot load module from " + str(file))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
class FigSaver:
    """Save figures on background threads.

    ``save`` starts one thread per figure and returns immediately; ``join``
    waits for every save started so far.
    """

    def __init__(self):
        self.__lock = Lock()  # guards __threads
        self.__threads = []  # threads started by save() that join() has not waited for yet

    def save(self, fig, path, **kwargs):
        """Start saving ``fig`` to ``path`` in a new thread.

        :param fig: figure object providing ``savefig``
        :param path: destination passed to ``fig.savefig``
        :param kwargs: forwarded unchanged to ``fig.savefig``
        """
        thread = Thread(target=self.__dosave, args=(fig, path, kwargs))
        with self.__lock:
            self.__threads.append(thread)
            # Start while holding the lock so join() never sees an unstarted thread.
            thread.start()

    def __dosave(self, fig, path, kwargs):
        """Thread body: write the figure, then close it via pyplot."""
        # NOTE(review): pyplot is used from a worker thread here; confirm this
        # is safe with the configured matplotlib backend.
        try:
            fig.savefig(path, **kwargs)
        finally:
            # Close even when saving fails so the figure is not kept open.
            plt.close(fig)

    def join(self):
        """Block until every save started before this call has finished."""
        # Take the pending threads under the lock, then wait outside it so
        # save() is not blocked for the duration of the join, and finished
        # threads (with their figure references) are dropped.
        with self.__lock:
            pending, self.__threads = self.__threads, []
        for thread in pending:
            thread.join()

View File

@@ -1,5 +1,7 @@
import html
import multiprocessing
import operator
import re
import time
import xml.etree.cElementTree as et
from collections import defaultdict
@@ -7,11 +9,13 @@ from datetime import datetime
from mt import mt
TAG_RE = re.compile(r'<[^>]+>')
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
def dmt(data, progressinterval=1000):
    """Create an ``mt`` instance over ``data`` with default settings.

    The thread count is ``multiprocessing.cpu_count()`` and verbose output is
    disabled.

    :param data: the data handed to ``mt``
    :param progressinterval: forwarded to ``mt`` as its ``progressinterval`` argument
    :return: the new ``mt`` instance
    """
    return mt(multiprocessing.cpu_count(), data, False, progressinterval)
def cms():
    """Return the current ``time.time()`` value converted to whole milliseconds."""
    millis = time.time() * 1000
    return int(round(millis))
@@ -75,6 +79,7 @@ def mapQuestion(item):
question = {tag: getTag(item, tag) for tag in tags}
for tag in datetags:
question[tag] = datetime.fromisoformat(question[tag])
question['Body'] = removetags(html.unescape(question['Body']))
return question
@@ -84,6 +89,7 @@ def mapAnswer(item):
answer = {tag: getTag(item, tag) for tag in tags}
for tag in datetags:
answer[tag] = datetime.fromisoformat(answer[tag])
answer['Body'] = removetags(html.unescape(answer['Body']))
return answer
@@ -93,6 +99,7 @@ def mapComment(item):
comment = {tag: getTag(item, tag) for tag in tags}
for tag in datetags:
comment[tag] = datetime.fromisoformat(comment[tag])
comment['Body'] = removetags(html.unescape(comment['Body']))
return comment
@@ -201,3 +208,7 @@ def tagExists(item, tag):
def setprop(dic, key, value):
    """Store ``value`` under ``key`` in ``dic`` and return ``dic``.

    The container is modified in place; the same object is returned so the
    call can be used inside an expression.
    """
    dic[key] = value

    return dic
def removetags(text):
    """Return ``text`` with every match of the module-level ``TAG_RE`` pattern removed."""
    stripped = TAG_RE.sub('', text)
    return stripped

44
mt.py
View File

@@ -1,10 +1,10 @@
from threading import Thread, Lock
import time
from math import ceil
from threading import Thread, Lock
class mt():
def __init__(self, threads, data, verbose=False):
def __init__(self, threads, data, verbose=False, progressinterval=1000):
self.__running = False
self.__closed = False
self.__data = data
@@ -21,6 +21,7 @@ class mt():
self.__lock = Lock()
self.__results = []
self.__progress = 0
self.__progressinterval = progressinterval
for i in range(self.__threadcount):
self.__results.append([])
self.__threads.append(None)
@@ -35,9 +36,9 @@ class mt():
self.__running = True
self.__final = self.__getresultsmapfilter
self.__type = "filter"
self.__comment = comment if comment is not None else ""
self.__comment = comment
if comment is not None:
print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
self.__starttime = self.__cms()
self.__endtime = None
for i in range(self.__threadcount):
@@ -50,20 +51,22 @@ class mt():
now = self.__cms()
results = []
for j in range(ceil(len(list) / 1000)):
part = list[j * 1000: min((j + 1) * 1000, len(list))]
for j in range(ceil(len(list) / self.__progressinterval)):
part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))]
results += [l for l in part if cond(l)]
with self.__lock:
self.__progress += len(part)
if self.__comment is not None:
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
# results = [l for l in list if cond(l)]
with self.__lock:
self.__results[i] = results
dur = self.__cms() - now
if self.__verbose:
if self.__comment is not None:
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms")
else:
print("Thread " + str(i) + ": filter took " + str(dur) + "ms")
def map(self, func, comment=None):
if self.__closed:
@@ -75,7 +78,7 @@ class mt():
self.__running = True
self.__final = self.__getresultsmapfilter
self.__type = "map"
self.__comment = comment if comment is not None else ""
self.__comment = comment
if comment is not None:
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
self.__starttime = self.__cms()
@@ -89,20 +92,22 @@ class mt():
def __domap(self, i, list, func):
    """Worker body for ``map``: apply ``func`` to every item of this worker's share.

    :param i: worker index; selects the slot in ``self.__results`` to fill
    :param list: the items assigned to this worker
    :param func: callable applied to each item
    """
    now = self.__cms()
    results = []
    step = self.__progressinterval
    # Work in chunks of `step` items so the shared progress counter is
    # updated (and printed) once per chunk rather than once per item.
    for offset in range(0, len(list), step):
        part = list[offset:offset + step]
        results += [func(l) for l in part]
        with self.__lock:
            self.__progress += len(part)
            if self.__comment is not None:
                print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)

    with self.__lock:
        self.__results[i] = results
    dur = self.__cms() - now
    if self.__verbose:
        # This is the map worker, so the timing line must say "map".
        if self.__comment is not None:
            print(self.__comment + ": Thread " + str(i) + ": map took " + str(dur) + "ms")
        else:
            print("Thread " + str(i) + ": map took " + str(dur) + "ms")
def reduce(self, reducer, aggregator, initval, comment=None):
if self.__closed:
@@ -114,9 +119,9 @@ class mt():
self.__running = True
self.__final = lambda: self.__getresultsreduce(aggregator, initval)
self.__type = "reduce"
self.__comment = comment if comment is not None else ""
self.__comment = comment
if comment is not None:
print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
self.__starttime = self.__cms()
self.__endtime = None
for i in range(self.__threadcount):
@@ -129,8 +134,8 @@ class mt():
now = self.__cms()
val = initval()
for j in range(ceil(len(list) / 1000)):
part = list[j * 1000: min((j + 1) * 1000, len(list))]
for j in range(ceil(len(list) / self.__progressinterval)):
part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))]
for k in range(len(part)):
val = reducer(val, part[k])
with self.__lock:
@@ -138,13 +143,14 @@ class mt():
if self.__comment is not None:
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
# for j in range(len(list)):
# val = reducer(val, list[j])
with self.__lock:
self.__results[i] = val
dur = self.__cms() - now
if self.__verbose:
print(self.__comment + ": Thread " + str(i) + ": reduce took " + str(dur) + "ms")
if self.__comment is not None:
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms")
else:
print("Thread " + str(i) + ": filter took " + str(dur) + "ms")
def getresults(self):
self.join()

View File

@@ -36,17 +36,12 @@ def main(folder):
# toxlevel = computeToxLevel(a['Body'])
# toxlevels[post['Id']].append(toxlevel)
# rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms")
toxlevels = dmt(posts).map(lambda p: (p['Id'], computeSentimentForPost(p)), "calculating sentiments").getresults()
toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults()
toxlevels = {id: p for (id, p) in toxlevels}
dumptoxlevels(toxlevels, outfilename + ".py")
def computeSentimentForPost(post):
    """Compute the sentiment of every answer of ``post``.

    :param post: mapping with an ``'Answers'`` list whose entries have ``'Id'`` and ``'Body'``
    :return: dict mapping each answer id to ``computeToxLevel`` of that answer's body
    """
    sentiments = {}
    for answer in post['Answers']:
        sentiments[answer['Id']] = computeToxLevel(answer['Body'])
    return sentiments
def computeToxLevel(text):
    """Score ``text`` with the ``analyser`` object from the enclosing module scope.

    :param text: the text to analyse
    :return: the result of ``analyser.polarity_scores(text)``, unchanged
    """
    scores = analyser.polarity_scores(text)
    return scores