This commit is contained in:
wea_ondara
2019-07-29 21:34:34 +02:00
parent 1f699f6b56
commit a14b3af21a
6 changed files with 309 additions and 87 deletions

View File

@@ -1,13 +1,15 @@
import operator
import os import os
import sys import sys
from collections import defaultdict from collections import defaultdict
from datetime import timedelta from datetime import timedelta
from math import ceil
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from common import calc_intervals from common import calc_intervals, imprt, FigSaver
from loader import load, dmt, cms from loader import load, dmt, cms
printnoln = lambda text: print(text, end='', flush=True) printnoln = lambda text: print(text, end='', flush=True)
@@ -15,8 +17,10 @@ rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7 DAYS_NEW_USER = 7
OLD_USER_YEAR = 3 OLD_USER_YEAR = 3
OLD_USER_PERCENTILE = 0.95
analyser = SentimentIntensityAnalyzer() analyser = SentimentIntensityAnalyzer()
figsaver = FigSaver()
colors = ['red', 'green', 'blue', 'orange', 'deeppink'] colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -24,7 +28,10 @@ def main(folder):
users, posts, firstcontrib, sumcontrib = load(folder) users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts) intervals = calc_intervals(posts)
cachedsentiments = {} cachedsentiments = imprt(folder + "/output/sentiments.py").answers
outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
postcounts = range(1, 5 + 1) postcounts = range(1, 5 + 1)
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
@@ -44,8 +51,6 @@ def main(folder):
gpos = [] gpos = []
gcom = [] gcom = []
outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
@@ -55,7 +60,7 @@ def main(folder):
# computer toxic levels # computer toxic levels
start = cms() start = cms()
printnoln("computing toxic levels: filtering") printnoln("computing toxic levels: filtering")
toxlevels = defaultdict(list) toxlevels = []
searchedposts = defaultdict(int) searchedposts = defaultdict(int)
filteredposts = [] filteredposts = []
for (i, post) in enumerate(newposts): for (i, post) in enumerate(newposts):
@@ -73,26 +78,22 @@ def main(folder):
filteredposts.append(post) filteredposts.append(post)
for (i, post) in enumerate(filteredposts): for (i, post) in enumerate(filteredposts):
if (i + 1) % 100 == 0: printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
if (i + 1) == len(newposts):
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
for a in post['Answers']: for a in post['Answers']:
if a['Id'] in cachedsentiments.keys(): if a['Id'] in cachedsentiments.keys():
toxlevel = cachedsentiments[a['Id']] toxlevel = cachedsentiments[a['Id']]
else: else:
toxlevel = computeToxLevel(a['Body']) print("Sentiment not found for " + a['Id'])
cachedsentiments[a['Id']] = toxlevel toxlevels.append(toxlevel)
toxlevels[post['Id']].append(toxlevel) printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
outfilename = goutfilenamenewusers + "_" + str(option_posts) outfilename = goutfilenamenewusers + "_" + str(option_posts)
dumptoxlevels(toxlevels, outfilename + ".py") dumptoxlevels(toxlevels, outfilename + ".py")
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())] neglevelsflat = [item['neg'] for item in toxlevels]
neulevelsflat = [item['neu'] for item in flatmap(toxlevels.values())] neulevelsflat = [item['neu'] for item in toxlevels]
poslevelsflat = [item['pos'] for item in flatmap(toxlevels.values())] poslevelsflat = [item['pos'] for item in toxlevels]
comlevelsflat = [item['compound'] for item in flatmap(toxlevels.values())] comlevelsflat = [item['compound'] for item in toxlevels]
gneg.append(neglevelsflat) gneg.append(neglevelsflat)
gneu.append(neulevelsflat) gneu.append(neulevelsflat)
@@ -116,10 +117,15 @@ def main(folder):
# plt.show() # plt.show()
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between " fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
+ option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, outfilename + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(outfilename + ".png", bbox_inches='tight') fig.savefig(outfilename + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
# global # global
start = cms()
printnoln("\rglobal plot post ... plotting ...")
gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts]) gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
@@ -132,19 +138,74 @@ def main(folder):
gaxs[1, 0].set_yscale('log') gaxs[1, 0].set_yscale('log')
gaxs[0, 1].set_yscale('log') gaxs[0, 1].set_yscale('log')
gaxs[1, 1].set_yscale('log') gaxs[1, 1].set_yscale('log')
gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) gfig.suptitle(
"Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime(
"%d-%m-%Y"))
# figsaver.save(gfig, goutfilenamenewusers + ".png", bbox_inches='tight')
printnoln("\rglobal plot post ... plotting ... saving ...")
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight') gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
plt.close(gfig) plt.close(gfig)
rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
# for old users ---------------------------------------------------------------------------------
start = cms()
newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
userposts = {u: 0 for u in newuserids}
for p in newposts:
userposts[p['OwnerUserId']] += 1
userposts = sorted(userposts.items(), key=operator.itemgetter(1))
oldusers = [k for k, v in userposts]
oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):])
filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
toxlevels = []
for (i, post) in enumerate(filteredposts):
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
for a in post['Answers']:
if a['Id'] in cachedsentiments.keys():
toxlevel = cachedsentiments[a['Id']]
else:
print("Sentiment not found for " + a['Id'])
toxlevels.append(toxlevel)
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")
neglevelsflat = [item['neg'] for item in toxlevels]
neulevelsflat = [item['neu'] for item in toxlevels]
poslevelsflat = [item['pos'] for item in toxlevels]
comlevelsflat = [item['compound'] for item in toxlevels]
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
axs[0, 0].set_title('Neg')
axs[1, 0].set_title('Neu')
axs[0, 1].set_title('Pos')
axs[1, 1].set_title('Compound')
axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100))
axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100))
axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100))
axs[0, 0].set_yscale('log')
axs[1, 0].set_yscale('log')
axs[0, 1].set_yscale('log')
axs[1, 1].set_yscale('log')
# plt.show()
fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
figsaver.join()
figsaver.join()
def computeToxLevel(text):
    """Score ``text`` with the module-level sentiment analyser.

    :param text: text to analyse
    :return: whatever ``analyser.polarity_scores`` returns for ``text``
             (presumably a dict with 'neg', 'neu', 'pos' and 'compound'
             entries, matching the keys read elsewhere in this file -
             TODO confirm against the vaderSentiment documentation)
    """
    return analyser.polarity_scores(text)
def flatmap(arr):
return [item for sublist in arr for item in sublist]
def dumptoxlevels(lvls, filename): def dumptoxlevels(lvls, filename):
with open(filename, "w") as file: with open(filename, "w") as file:
file.write("from collections import defaultdict\n\n") file.write("from collections import defaultdict\n\n")

View File

@@ -1,4 +1,3 @@
import importlib
import os import os
import sys import sys
from collections import defaultdict from collections import defaultdict
@@ -10,27 +9,36 @@ import matplotlib.pyplot as plt
import numpy as np import numpy as np
from scipy.stats import ks_2samp from scipy.stats import ks_2samp
from common import imprt
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'} colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
IMAGE_MAGICK = "magick"
def main(folder):
    """Run the new-vs-old user KS comparison for one dataset folder.

    Reads the sentiment dumps (``*.py`` files) from ``<folder>/output/batch/``
    and writes logs and plots to ``<folder>/output/ksbatch/``.

    :param folder: dataset root folder (no trailing slash expected, since
                   sub-paths are appended with a leading "/")
    """
    outputdir = folder + "/output/ksbatch/"
    # os.makedirs replaces the former `mkdir -p` shell call: it needs no shell,
    # works on paths containing spaces or shell metacharacters, and is portable.
    os.makedirs(outputdir, exist_ok=True)
    srcfolder = folder + "/output/batch/"

    def dumps(marker):
        # Sorted full paths of the regular ".py" files whose name contains `marker`.
        return sorted(srcfolder + f for f in listdir(srcfolder)
                      if isfile(join(srcfolder, f)) and f.endswith(".py") and marker in f)

    onlyfiles = dumps("newusers")
    # The per-post and per-date comparisons are currently disabled:
    # plotbypost(onlyfiles, outputdir)
    # plotbydate(onlyfiles, outputdir)

    oldfiles = dumps("oldusers")
    plotbydateold(onlyfiles, oldfiles, outputdir)
def plotbypost(onlyfiles, outputdir): def plotbypost(onlyfiles, outputdir):
print("plotbypost")
files = defaultdict(list) files = defaultdict(list)
for f in onlyfiles: for f in onlyfiles:
s = f[:-3].split("_") s = f[:-3].split("_")
files[int(s[4])].append(f) files[int(s[5])].append(f)
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[2], "%d-%m-%Y")) for (p, l) in files.items()} files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()}
changes_neg = defaultdict(list) changes_neg = defaultdict(list)
changes_neu = defaultdict(list) changes_neu = defaultdict(list)
@@ -45,15 +53,15 @@ def plotbypost(onlyfiles, outputdir):
tox1 = imprt(l[i]).toxlevels tox1 = imprt(l[i]).toxlevels
tox2 = imprt(l[i + 1]).toxlevels tox2 = imprt(l[i + 1]).toxlevels
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] neglevelsflat1 = [item['neg'] for item in tox1]
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] neulevelsflat1 = [item['neu'] for item in tox1]
poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())] poslevelsflat1 = [item['pos'] for item in tox1]
comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())] comlevelsflat1 = [item['compound'] for item in tox1]
neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())] neglevelsflat2 = [item['neg'] for item in tox2]
neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())] neulevelsflat2 = [item['neu'] for item in tox2]
poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())] poslevelsflat2 = [item['pos'] for item in tox2]
comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())] comlevelsflat2 = [item['compound'] for item in tox2]
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
@@ -74,7 +82,7 @@ def plotbypost(onlyfiles, outputdir):
+ "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n") + "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n")
# pval # pval
for (p, l) in files.items(): for (p, l) in files.items():
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)] x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items(): for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
pval = [x.pvalue for x in changes] pval = [x.pvalue for x in changes]
@@ -92,7 +100,7 @@ def plotbypost(onlyfiles, outputdir):
# stat # stat
for (p, l) in files.items(): for (p, l) in files.items():
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)] x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items(): for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
stat = [x.statistic for x in changes] stat = [x.statistic for x in changes]
@@ -110,11 +118,12 @@ def plotbypost(onlyfiles, outputdir):
def plotbydate(onlyfiles, outputdir): def plotbydate(onlyfiles, outputdir):
print("plotbydate")
files = defaultdict(list) files = defaultdict(list)
for f in onlyfiles: for f in onlyfiles:
s = f[:-3].split("_") s = f[:-3].split("_")
files[(s[2], s[3])].append(f) files[(s[3], s[4])].append(f)
files = {d: sorted(l, key=lambda e: e.split("_")[4]) for (d, l) in files.items()} files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()}
changes_neg = defaultdict(list) changes_neg = defaultdict(list)
changes_neu = defaultdict(list) changes_neu = defaultdict(list)
@@ -129,15 +138,15 @@ def plotbydate(onlyfiles, outputdir):
tox1 = imprt(l[i]).toxlevels tox1 = imprt(l[i]).toxlevels
tox2 = imprt(l[i + 1]).toxlevels tox2 = imprt(l[i + 1]).toxlevels
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] neglevelsflat1 = [item['neg'] for item in tox1]
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] neulevelsflat1 = [item['neu'] for item in tox1]
poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())] poslevelsflat1 = [item['pos'] for item in tox1]
comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())] comlevelsflat1 = [item['compound'] for item in tox1]
neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())] neglevelsflat2 = [item['neg'] for item in tox2]
neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())] neulevelsflat2 = [item['neu'] for item in tox2]
poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())] poslevelsflat2 = [item['pos'] for item in tox2]
comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())] comlevelsflat2 = [item['compound'] for item in tox2]
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
@@ -159,7 +168,7 @@ def plotbydate(onlyfiles, outputdir):
# pval # pval
for (d, l) in files.items(): for (d, l) in files.items():
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)] x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
pval = [x.pvalue for x in changes] pval = [x.pvalue for x in changes]
@@ -177,7 +186,7 @@ def plotbydate(onlyfiles, outputdir):
# stat # stat
for (d, l) in files.items(): for (d, l) in files.items():
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)] x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items(): for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
stat = [x.statistic for x in changes] stat = [x.statistic for x in changes]
@@ -194,15 +203,122 @@ def plotbydate(onlyfiles, outputdir):
plt.close(fig) plt.close(fig)
def plotbydateold(onlyfiles, oldfiles, outputdir):
    """Compare new-user sentiment against old-user sentiment per date interval.

    For every date interval, each new-user dump (one per post count) is tested
    against the old-user dump of the same interval with a two-sample KS test,
    separately for the neg/neu/pos/compound scores. Results are written as one
    ``ks_olddate_<from>_<to>.log`` per interval, one p-value plot and one
    statistic plot per interval, and two PDFs bundling those plots.

    :param onlyfiles: paths of new-user dumps, named ``..._<from>_<to>_<posts>.py``
    :param oldfiles: paths of old-user dumps, named ``..._<from>_<to>.py``
    :param outputdir: directory receiving logs, PNGs and PDFs
    :raises KeyError: if an interval has new-user dumps but no old-user dump
    """
    print("plotbydateold")

    # Filename fields are taken from the END of the "_"-split name. The dates
    # and the post count never contain "_", so this stays correct even when a
    # directory or dataset name in the path does.
    files = defaultdict(list)
    for f in onlyfiles:
        s = f[:-3].split("_")
        files[(s[-3], s[-2])].append(f)
    # Order intervals chronologically: "dd-mm-YYYY" -> "YYYY-mm-dd" sorts as text.
    dates = sorted(files, key=lambda e: "-".join(reversed(e[0].split("-"))))
    files = {d: sorted(files[d], key=lambda e: e.split("_")[-1]) for d in dates}
    oldfiles = {tuple(f[:-3].split("_")[-2:]): f for f in oldfiles}

    # short label (used in logs, plot labels and `colors`) -> key in a toxlevel entry
    keys = {"neg": "neg", "neu": "neu", "pos": "pos", "com": "compound"}
    changes = {label: defaultdict(list) for label in keys}

    for (d, l) in files.items():
        print(d)
        toxold = imprt(oldfiles[d]).toxlevels
        oldlevels = {label: [item[key] for item in toxold] for label, key in keys.items()}

        for newfile in l:
            tox1 = imprt(newfile).toxlevels
            if len(tox1) == 0 or len(toxold) == 0:
                # ks_2samp cannot handle an empty sample; keep a placeholder so
                # the result lists stay aligned with `l`.
                for label in keys:
                    changes[label][d].append(None)
                continue
            for label, key in keys.items():
                changes[label][d].append(ks_2samp([item[key] for item in tox1], oldlevels[label]))

    print("logs")
    for (d, l) in files.items():
        f1 = oldfiles[d]
        with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f:
            for i, f2 in enumerate(l):
                if changes["neg"][d][i] is None:
                    continue
                f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes["neg"][d][i]) + "; ks neu = " + str(changes["neu"][d][i])
                        + "; ks pos = " + str(changes["pos"][d][i]) + "; ks com = " + str(changes["com"][d][i]) + "\n")

    print("pval")
    _plotksold(files, changes, "pvalue", "pval", outputdir)

    print("stat")
    _plotksold(files, changes, "statistic", "stat", outputdir)


def _plotksold(files, changes, attr, tag, outputdir):
    """Plot one KS result attribute per date interval and bundle the PNGs into a PDF.

    :param files: ``{(from, to): [new-user dump paths]}`` in plotting order
    :param changes: ``{label: {(from, to): [KS result or None, ...]}}``
    :param attr: attribute read from each KS result ("pvalue" or "statistic")
    :param tag: short name used in plot labels and output filenames
    :param outputdir: directory receiving the PNGs and the PDF
    """
    import subprocess

    outfiles = []
    for (d, l) in files.items():
        print(d)
        x = [f[:-3].split("_")[-1] for f in l]
        fig = plt.figure(figsize=(16, 12))
        for label, perdate in changes.items():
            series = [(xx, getattr(c, attr)) for xx, c in zip(x, perdate[d]) if c is not None]
            xs = [p[0] for p in series]
            ys = [p[1] for p in series]
            plt.plot(xs, ys, label=label + "." + tag, color=colors[label])
            if len(series) == 0:
                continue
            mean = np.mean(ys)
            std = np.std(ys)
            # Mark points at least one standard deviation away from the mean.
            dev = [(xx, s) for (xx, s) in series if s <= mean - std or s >= mean + std]
            plt.plot(xs, [mean] * len(series), color=colors[label], ls='dashed')
            plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[label], ls='None', marker='o')
        plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1])
        plt.xticks(rotation=90)
        plt.legend(loc="upper right")
        outfile = outputdir + "/ks_olddate_" + tag + "_" + d[0] + "_" + d[1] + ".png"
        plt.savefig(outfile, bbox_inches='tight')
        plt.close(fig)
        outfiles.append(outfile)

    if not outfiles:
        return
    # Argument-list invocation (no shell) keeps paths with spaces or shell
    # metacharacters intact. A missing ImageMagick binary is reported instead
    # of aborting, matching the previous best-effort os.system behaviour.
    try:
        subprocess.run([IMAGE_MAGICK] + outfiles + [outputdir + "/ks_olddate_" + tag + ".pdf"])
    except OSError as e:
        print("could not run " + IMAGE_MAGICK + ": " + str(e))
def filecmp(file1, file2): def filecmp(file1, file2):

View File

@@ -1,8 +1,14 @@
import importlib
from threading import Thread, Lock
import matplotlib.pyplot as plt
from loader import dmt from loader import dmt
def calc_intervals(posts): def calc_intervals(posts):
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults() firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'],
"firstpost").getresults()
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults() lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
# calc quarter beginning # calc quarter beginning
@@ -25,5 +31,32 @@ def calc_intervals(posts):
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
intervals.append((cdate, nextquarter)) intervals.append((cdate, nextquarter))
cdate = nextquarter cdate = nextquarter
# sys.exit(0)
return intervals return intervals
def imprt(file):
    """Execute the Python source file at ``file`` and return it as a module.

    NOTE: this runs the file's code. Only use it on files produced by this
    project, never on untrusted input.

    :param file: path to a Python source file
    :return: the freshly executed module object
    :raises ImportError: if no loader can be determined for ``file``
                         (e.g. it has no recognised Python suffix)
    """
    # A plain `import importlib` does not guarantee that the `util` submodule
    # is loaded, so import it explicitly.
    import importlib.util

    spec = importlib.util.spec_from_file_location("module.name", file)
    if spec is None or spec.loader is None:
        raise ImportError("cannot load module from " + str(file))
    foo = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(foo)
    return foo
class FigSaver():
    """Save figures on background threads.

    Each ``save`` call starts one thread that writes the figure and then
    closes it; ``join`` waits for all saves started so far.

    NOTE(review): this calls ``savefig``/``plt.close`` from worker threads;
    confirm the plotting backend in use tolerates that.
    """

    def __init__(self):
        self.__lock = Lock()
        self.__threads = []

    def save(self, fig, path, **kwargs):
        """Start saving ``fig`` to ``path`` in the background.

        :param fig: object providing ``savefig(path, **kwargs)``
        :param path: destination file path
        :param kwargs: forwarded unchanged to ``fig.savefig``
        """
        thread = Thread(target=self.__dosave, args=(fig, path, kwargs))
        with self.__lock:
            # Register and start under the lock so join() never sees a thread
            # that has been recorded but not yet started.
            self.__threads.append(thread)
            thread.start()

    def __dosave(self, fig, path, kwargs):
        try:
            fig.savefig(path, **kwargs)
        finally:
            # Close the figure even when saving fails, so it is not leaked.
            plt.close(fig)

    def join(self):
        """Block until every save started before this call has finished."""
        # Take the pending threads and reset the list under the lock, then
        # wait outside it: concurrent save() calls are not blocked while we
        # wait, and finished threads are not kept around forever.
        with self.__lock:
            pending, self.__threads = self.__threads, []
        for thread in pending:
            thread.join()

View File

@@ -1,5 +1,7 @@
import html
import multiprocessing import multiprocessing
import operator import operator
import re
import time import time
import xml.etree.cElementTree as et import xml.etree.cElementTree as et
from collections import defaultdict from collections import defaultdict
@@ -7,11 +9,13 @@ from datetime import datetime
from mt import mt from mt import mt
TAG_RE = re.compile(r'<[^>]+>')
printnoln = lambda text: print(text, end='', flush=True) printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text) rprint = lambda text: print('\r' + text)
def dmt(data, progressinterval=1000):
    """Create a non-verbose ``mt`` worker over ``data`` with one thread per CPU.

    :param data: the collection handed to ``mt``
    :param progressinterval: forwarded to ``mt`` as its progress interval
    :return: the new ``mt`` instance
    """
    return mt(multiprocessing.cpu_count(), data, False, progressinterval)
def cms():
    """Return the current ``time.time()`` value in milliseconds, rounded to an int."""
    return int(round(time.time() * 1000))
@@ -75,6 +79,7 @@ def mapQuestion(item):
question = {tag: getTag(item, tag) for tag in tags} question = {tag: getTag(item, tag) for tag in tags}
for tag in datetags: for tag in datetags:
question[tag] = datetime.fromisoformat(question[tag]) question[tag] = datetime.fromisoformat(question[tag])
question['Body'] = removetags(html.unescape(question['Body']))
return question return question
@@ -84,6 +89,7 @@ def mapAnswer(item):
answer = {tag: getTag(item, tag) for tag in tags} answer = {tag: getTag(item, tag) for tag in tags}
for tag in datetags: for tag in datetags:
answer[tag] = datetime.fromisoformat(answer[tag]) answer[tag] = datetime.fromisoformat(answer[tag])
answer['Body'] = removetags(html.unescape(answer['Body']))
return answer return answer
@@ -93,6 +99,7 @@ def mapComment(item):
comment = {tag: getTag(item, tag) for tag in tags} comment = {tag: getTag(item, tag) for tag in tags}
for tag in datetags: for tag in datetags:
comment[tag] = datetime.fromisoformat(comment[tag]) comment[tag] = datetime.fromisoformat(comment[tag])
comment['Body'] = removetags(html.unescape(comment['Body']))
return comment return comment
@@ -201,3 +208,7 @@ def tagExists(item, tag):
def setprop(dic, key, value):
    """Store ``value`` under ``key`` in ``dic`` (in place) and return ``dic``.

    Returning the same mapping makes the assignment usable inside expressions.
    """
    dic[key] = value
    return dic
def removetags(text):
    """Return ``text`` with every match of the module-level ``TAG_RE`` removed."""
    stripped = TAG_RE.sub('', text)
    return stripped

46
mt.py
View File

@@ -1,10 +1,10 @@
from threading import Thread, Lock
import time import time
from math import ceil from math import ceil
from threading import Thread, Lock
class mt(): class mt():
def __init__(self, threads, data, verbose=False): def __init__(self, threads, data, verbose=False, progressinterval=1000):
self.__running = False self.__running = False
self.__closed = False self.__closed = False
self.__data = data self.__data = data
@@ -21,6 +21,7 @@ class mt():
self.__lock = Lock() self.__lock = Lock()
self.__results = [] self.__results = []
self.__progress = 0 self.__progress = 0
self.__progressinterval = progressinterval
for i in range(self.__threadcount): for i in range(self.__threadcount):
self.__results.append([]) self.__results.append([])
self.__threads.append(None) self.__threads.append(None)
@@ -35,9 +36,9 @@ class mt():
self.__running = True self.__running = True
self.__final = self.__getresultsmapfilter self.__final = self.__getresultsmapfilter
self.__type = "filter" self.__type = "filter"
self.__comment = comment if comment is not None else "" self.__comment = comment
if comment is not None: if comment is not None:
print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
self.__starttime = self.__cms() self.__starttime = self.__cms()
self.__endtime = None self.__endtime = None
for i in range(self.__threadcount): for i in range(self.__threadcount):
@@ -50,20 +51,22 @@ class mt():
now = self.__cms() now = self.__cms()
results = [] results = []
for j in range(ceil(len(list) / 1000)): for j in range(ceil(len(list) / self.__progressinterval)):
part = list[j * 1000: min((j + 1) * 1000, len(list))] part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))]
results += [l for l in part if cond(l)] results += [l for l in part if cond(l)]
with self.__lock: with self.__lock:
self.__progress += len(part) self.__progress += len(part)
if self.__comment is not None: if self.__comment is not None:
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
# results = [l for l in list if cond(l)]
with self.__lock: with self.__lock:
self.__results[i] = results self.__results[i] = results
dur = self.__cms() - now dur = self.__cms() - now
if self.__verbose: if self.__verbose:
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms") if self.__comment is not None:
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms")
else:
print("Thread " + str(i) + ": filter took " + str(dur) + "ms")
def map(self, func, comment=None): def map(self, func, comment=None):
if self.__closed: if self.__closed:
@@ -75,7 +78,7 @@ class mt():
self.__running = True self.__running = True
self.__final = self.__getresultsmapfilter self.__final = self.__getresultsmapfilter
self.__type = "map" self.__type = "map"
self.__comment = comment if comment is not None else "" self.__comment = comment
if comment is not None: if comment is not None:
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
self.__starttime = self.__cms() self.__starttime = self.__cms()
@@ -89,20 +92,22 @@ class mt():
def __domap(self, i, list, func):
    """Apply ``func`` to every item of ``list`` and store the output in slot ``i``.

    The items are processed in chunks of ``self.__progressinterval``. After
    each chunk the shared progress counter is advanced while holding
    ``self.__lock`` and, if a comment was set, the progress line is reprinted.
    When ``self.__verbose`` is set, the elapsed time is printed at the end.

    :param i: key under which the mapped list is written to ``self.__results``
    :param list: the items to process (name kept for caller compatibility,
        even though it shadows the builtin)
    :param func: callable applied to each item
    """
    now = self.__cms()
    interval = self.__progressinterval
    results = []
    # Slicing clamps at the end of the sequence, so no explicit min() is needed.
    for start in range(0, len(list), interval):
        part = list[start:start + interval]
        results += [func(l) for l in part]
        with self.__lock:
            self.__progress += len(part)
            if self.__comment is not None:
                print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
    with self.__lock:
        self.__results[i] = results
    dur = self.__cms() - now
    if self.__verbose:
        # Label the timing line with this operation ("map"), not "filter".
        prefix = "" if self.__comment is None else self.__comment + ": "
        print(prefix + "Thread " + str(i) + ": map took " + str(dur) + "ms")
def reduce(self, reducer, aggregator, initval, comment=None): def reduce(self, reducer, aggregator, initval, comment=None):
if self.__closed: if self.__closed:
@@ -114,9 +119,9 @@ class mt():
self.__running = True self.__running = True
self.__final = lambda: self.__getresultsreduce(aggregator, initval) self.__final = lambda: self.__getresultsreduce(aggregator, initval)
self.__type = "reduce" self.__type = "reduce"
self.__comment = comment if comment is not None else "" self.__comment = comment
if comment is not None: if comment is not None:
print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
self.__starttime = self.__cms() self.__starttime = self.__cms()
self.__endtime = None self.__endtime = None
for i in range(self.__threadcount): for i in range(self.__threadcount):
@@ -129,8 +134,8 @@ class mt():
now = self.__cms() now = self.__cms()
val = initval() val = initval()
for j in range(ceil(len(list) / 1000)): for j in range(ceil(len(list) / self.__progressinterval)):
part = list[j * 1000: min((j + 1) * 1000, len(list))] part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))]
for k in range(len(part)): for k in range(len(part)):
val = reducer(val, part[k]) val = reducer(val, part[k])
with self.__lock: with self.__lock:
@@ -138,13 +143,14 @@ class mt():
if self.__comment is not None: if self.__comment is not None:
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
# for j in range(len(list)):
# val = reducer(val, list[j])
with self.__lock: with self.__lock:
self.__results[i] = val self.__results[i] = val
dur = self.__cms() - now dur = self.__cms() - now
if self.__verbose: if self.__verbose:
print(self.__comment + ": Thread " + str(i) + ": reduce took " + str(dur) + "ms") if self.__comment is not None:
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms")
else:
print("Thread " + str(i) + ": filter took " + str(dur) + "ms")
def getresults(self): def getresults(self):
self.join() self.join()

View File

@@ -36,17 +36,12 @@ def main(folder):
# toxlevel = computeToxLevel(a['Body']) # toxlevel = computeToxLevel(a['Body'])
# toxlevels[post['Id']].append(toxlevel) # toxlevels[post['Id']].append(toxlevel)
# rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms") # rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms")
toxlevels = dmt(posts).map(lambda p: (p['Id'], computeSentimentForPost(p)), "calculating sentiments").getresults() toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults()
toxlevels = {id: p for (id, p) in toxlevels} toxlevels = {id: p for (id, p) in toxlevels}
dumptoxlevels(toxlevels, outfilename + ".py") dumptoxlevels(toxlevels, outfilename + ".py")
def computeSentimentForPost(post):
    """Map each answer id of ``post`` to the score of that answer's body.

    Reads ``post['Answers']`` and, for every answer, its ``'Id'`` and
    ``'Body'`` entries; the scoring itself is delegated to
    ``computeToxLevel``.
    """
    scores = {}
    for answer in post['Answers']:
        answer_id = answer['Id']
        scores[answer_id] = computeToxLevel(answer['Body'])
    return scores
def computeToxLevel(text):
    """Return what the module-level ``analyser`` reports from ``polarity_scores(text)``."""
    scores = analyser.polarity_scores(text)
    return scores