wip
This commit is contained in:
105
analyze_batch.py
105
analyze_batch.py
@@ -1,13 +1,15 @@
|
|||||||
|
import operator
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||||
|
|
||||||
from common import calc_intervals
|
from common import calc_intervals, imprt, FigSaver
|
||||||
from loader import load, dmt, cms
|
from loader import load, dmt, cms
|
||||||
|
|
||||||
printnoln = lambda text: print(text, end='', flush=True)
|
printnoln = lambda text: print(text, end='', flush=True)
|
||||||
@@ -15,8 +17,10 @@ rprint = lambda text: print('\r' + text)
|
|||||||
|
|
||||||
DAYS_NEW_USER = 7
|
DAYS_NEW_USER = 7
|
||||||
OLD_USER_YEAR = 3
|
OLD_USER_YEAR = 3
|
||||||
|
OLD_USER_PERCENTILE = 0.95
|
||||||
|
|
||||||
analyser = SentimentIntensityAnalyzer()
|
analyser = SentimentIntensityAnalyzer()
|
||||||
|
figsaver = FigSaver()
|
||||||
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||||
|
|
||||||
|
|
||||||
@@ -24,7 +28,10 @@ def main(folder):
|
|||||||
users, posts, firstcontrib, sumcontrib = load(folder)
|
users, posts, firstcontrib, sumcontrib = load(folder)
|
||||||
|
|
||||||
intervals = calc_intervals(posts)
|
intervals = calc_intervals(posts)
|
||||||
cachedsentiments = {}
|
cachedsentiments = imprt(folder + "/output/sentiments.py").answers
|
||||||
|
|
||||||
|
outfolder = folder + "/output/batch/"
|
||||||
|
os.system("mkdir -p " + outfolder)
|
||||||
|
|
||||||
postcounts = range(1, 5 + 1)
|
postcounts = range(1, 5 + 1)
|
||||||
for (option_date_from, option_date_to) in intervals:
|
for (option_date_from, option_date_to) in intervals:
|
||||||
@@ -44,8 +51,6 @@ def main(folder):
|
|||||||
gpos = []
|
gpos = []
|
||||||
gcom = []
|
gcom = []
|
||||||
|
|
||||||
outfolder = folder + "/output/batch/"
|
|
||||||
os.system("mkdir -p " + outfolder)
|
|
||||||
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
||||||
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
||||||
|
|
||||||
@@ -55,7 +60,7 @@ def main(folder):
|
|||||||
# computer toxic levels
|
# computer toxic levels
|
||||||
start = cms()
|
start = cms()
|
||||||
printnoln("computing toxic levels: filtering")
|
printnoln("computing toxic levels: filtering")
|
||||||
toxlevels = defaultdict(list)
|
toxlevels = []
|
||||||
searchedposts = defaultdict(int)
|
searchedposts = defaultdict(int)
|
||||||
filteredposts = []
|
filteredposts = []
|
||||||
for (i, post) in enumerate(newposts):
|
for (i, post) in enumerate(newposts):
|
||||||
@@ -73,26 +78,22 @@ def main(folder):
|
|||||||
filteredposts.append(post)
|
filteredposts.append(post)
|
||||||
|
|
||||||
for (i, post) in enumerate(filteredposts):
|
for (i, post) in enumerate(filteredposts):
|
||||||
if (i + 1) % 100 == 0:
|
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
|
||||||
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
|
||||||
if (i + 1) == len(newposts):
|
|
||||||
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
|
||||||
for a in post['Answers']:
|
for a in post['Answers']:
|
||||||
if a['Id'] in cachedsentiments.keys():
|
if a['Id'] in cachedsentiments.keys():
|
||||||
toxlevel = cachedsentiments[a['Id']]
|
toxlevel = cachedsentiments[a['Id']]
|
||||||
else:
|
else:
|
||||||
toxlevel = computeToxLevel(a['Body'])
|
print("Sentiment not found for " + a['Id'])
|
||||||
cachedsentiments[a['Id']] = toxlevel
|
toxlevels.append(toxlevel)
|
||||||
toxlevels[post['Id']].append(toxlevel)
|
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
|
||||||
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
|
|
||||||
|
|
||||||
outfilename = goutfilenamenewusers + "_" + str(option_posts)
|
outfilename = goutfilenamenewusers + "_" + str(option_posts)
|
||||||
dumptoxlevels(toxlevels, outfilename + ".py")
|
dumptoxlevels(toxlevels, outfilename + ".py")
|
||||||
|
|
||||||
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())]
|
neglevelsflat = [item['neg'] for item in toxlevels]
|
||||||
neulevelsflat = [item['neu'] for item in flatmap(toxlevels.values())]
|
neulevelsflat = [item['neu'] for item in toxlevels]
|
||||||
poslevelsflat = [item['pos'] for item in flatmap(toxlevels.values())]
|
poslevelsflat = [item['pos'] for item in toxlevels]
|
||||||
comlevelsflat = [item['compound'] for item in flatmap(toxlevels.values())]
|
comlevelsflat = [item['compound'] for item in toxlevels]
|
||||||
|
|
||||||
gneg.append(neglevelsflat)
|
gneg.append(neglevelsflat)
|
||||||
gneu.append(neulevelsflat)
|
gneu.append(neulevelsflat)
|
||||||
@@ -116,10 +117,15 @@ def main(folder):
|
|||||||
# plt.show()
|
# plt.show()
|
||||||
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
|
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
|
||||||
+ option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
+ option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
||||||
|
# figsaver.save(fig, outfilename + ".png", bbox_inches='tight')
|
||||||
|
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
|
||||||
fig.savefig(outfilename + ".png", bbox_inches='tight')
|
fig.savefig(outfilename + ".png", bbox_inches='tight')
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
|
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
||||||
|
|
||||||
# global
|
# global
|
||||||
|
start = cms()
|
||||||
|
printnoln("\rglobal plot post ... plotting ...")
|
||||||
gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
gaxs[0, 0].hist(gneg, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
||||||
gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
gaxs[1, 0].hist(gneu, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
||||||
gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
gaxs[0, 1].hist(gpos, np.linspace(0, 1, 1 * 100), color=colors[:len(postcounts)], label=[str(option_posts) + " posts" for option_posts in postcounts])
|
||||||
@@ -132,19 +138,74 @@ def main(folder):
|
|||||||
gaxs[1, 0].set_yscale('log')
|
gaxs[1, 0].set_yscale('log')
|
||||||
gaxs[0, 1].set_yscale('log')
|
gaxs[0, 1].set_yscale('log')
|
||||||
gaxs[1, 1].set_yscale('log')
|
gaxs[1, 1].set_yscale('log')
|
||||||
gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
gfig.suptitle(
|
||||||
|
"Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime(
|
||||||
|
"%d-%m-%Y"))
|
||||||
|
# figsaver.save(gfig, goutfilenamenewusers + ".png", bbox_inches='tight')
|
||||||
|
printnoln("\rglobal plot post ... plotting ... saving ...")
|
||||||
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
|
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
|
||||||
plt.close(gfig)
|
plt.close(gfig)
|
||||||
|
rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
||||||
|
|
||||||
|
# for old users ---------------------------------------------------------------------------------
|
||||||
|
start = cms()
|
||||||
|
newuserids = set(dmt(newposts).map(lambda p: p['OwnerUserId']).getresults())
|
||||||
|
userposts = {u: 0 for u in newuserids}
|
||||||
|
for p in newposts:
|
||||||
|
userposts[p['OwnerUserId']] += 1
|
||||||
|
userposts = sorted(userposts.items(), key=operator.itemgetter(1))
|
||||||
|
oldusers = [k for k, v in userposts]
|
||||||
|
oldusers = set(oldusers[ceil(len(oldusers) * OLD_USER_PERCENTILE):])
|
||||||
|
filteredposts = dmt(newposts).filter(lambda p: p['OwnerUserId'] in oldusers).getresults()
|
||||||
|
|
||||||
|
toxlevels = []
|
||||||
|
for (i, post) in enumerate(filteredposts):
|
||||||
|
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
|
||||||
|
for a in post['Answers']:
|
||||||
|
if a['Id'] in cachedsentiments.keys():
|
||||||
|
toxlevel = cachedsentiments[a['Id']]
|
||||||
|
else:
|
||||||
|
print("Sentiment not found for " + a['Id'])
|
||||||
|
toxlevels.append(toxlevel)
|
||||||
|
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
|
||||||
|
|
||||||
|
dumptoxlevels(toxlevels, goutfilenameoldusers + ".py")
|
||||||
|
|
||||||
|
neglevelsflat = [item['neg'] for item in toxlevels]
|
||||||
|
neulevelsflat = [item['neu'] for item in toxlevels]
|
||||||
|
poslevelsflat = [item['pos'] for item in toxlevels]
|
||||||
|
comlevelsflat = [item['compound'] for item in toxlevels]
|
||||||
|
|
||||||
|
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
|
||||||
|
axs[0, 0].set_title('Neg')
|
||||||
|
axs[1, 0].set_title('Neu')
|
||||||
|
axs[0, 1].set_title('Pos')
|
||||||
|
axs[1, 1].set_title('Compound')
|
||||||
|
axs[0, 0].hist(neglevelsflat, np.linspace(0, 1, 1 * 100))
|
||||||
|
axs[1, 0].hist(neulevelsflat, np.linspace(0, 1, 1 * 100))
|
||||||
|
axs[0, 1].hist(poslevelsflat, np.linspace(0, 1, 1 * 100))
|
||||||
|
axs[1, 1].hist(comlevelsflat, np.linspace(-1, 1, 2 * 100))
|
||||||
|
axs[0, 0].set_yscale('log')
|
||||||
|
axs[1, 0].set_yscale('log')
|
||||||
|
axs[0, 1].set_yscale('log')
|
||||||
|
axs[1, 1].set_yscale('log')
|
||||||
|
|
||||||
|
# plt.show()
|
||||||
|
fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
||||||
|
# figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight')
|
||||||
|
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
|
||||||
|
fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
|
||||||
|
plt.close(fig)
|
||||||
|
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
||||||
|
|
||||||
|
figsaver.join()
|
||||||
|
figsaver.join()
|
||||||
|
|
||||||
|
|
||||||
def computeToxLevel(text):
|
def computeToxLevel(text):
|
||||||
return analyser.polarity_scores(text)
|
return analyser.polarity_scores(text)
|
||||||
|
|
||||||
|
|
||||||
def flatmap(arr):
|
|
||||||
return [item for sublist in arr for item in sublist]
|
|
||||||
|
|
||||||
|
|
||||||
def dumptoxlevels(lvls, filename):
|
def dumptoxlevels(lvls, filename):
|
||||||
with open(filename, "w") as file:
|
with open(filename, "w") as file:
|
||||||
file.write("from collections import defaultdict\n\n")
|
file.write("from collections import defaultdict\n\n")
|
||||||
|
|||||||
188
calctoxdiff.py
188
calctoxdiff.py
@@ -1,4 +1,3 @@
|
|||||||
import importlib
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@@ -10,27 +9,36 @@ import matplotlib.pyplot as plt
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.stats import ks_2samp
|
from scipy.stats import ks_2samp
|
||||||
|
|
||||||
|
from common import imprt
|
||||||
|
|
||||||
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
|
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
|
||||||
|
IMAGE_MAGICK = "magick"
|
||||||
|
|
||||||
|
|
||||||
def main(folder):
|
def main(folder):
|
||||||
outputdir = folder + "/output/ksbatch/"
|
outputdir = folder + "/output/ksbatch/"
|
||||||
os.system("mkdir -p " + outputdir)
|
os.system("mkdir -p " + outputdir)
|
||||||
folder = folder + "/output/batch/"
|
srcfolder = folder + "/output/batch/"
|
||||||
|
|
||||||
onlyfiles = [folder + f for f in listdir(folder) if isfile(join(folder, f)) and f.endswith(".py")]
|
onlyfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "newusers" in f]
|
||||||
onlyfiles = sorted(onlyfiles)
|
onlyfiles = sorted(onlyfiles)
|
||||||
|
|
||||||
plotbypost(onlyfiles, outputdir)
|
# plotbypost(onlyfiles, outputdir)
|
||||||
plotbydate(onlyfiles, outputdir)
|
# plotbydate(onlyfiles, outputdir)
|
||||||
|
|
||||||
|
oldfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "oldusers" in f]
|
||||||
|
oldfiles = sorted(oldfiles)
|
||||||
|
|
||||||
|
plotbydateold(onlyfiles, oldfiles, outputdir)
|
||||||
|
|
||||||
|
|
||||||
def plotbypost(onlyfiles, outputdir):
|
def plotbypost(onlyfiles, outputdir):
|
||||||
|
print("plotbypost")
|
||||||
files = defaultdict(list)
|
files = defaultdict(list)
|
||||||
for f in onlyfiles:
|
for f in onlyfiles:
|
||||||
s = f[:-3].split("_")
|
s = f[:-3].split("_")
|
||||||
files[int(s[4])].append(f)
|
files[int(s[5])].append(f)
|
||||||
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[2], "%d-%m-%Y")) for (p, l) in files.items()}
|
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()}
|
||||||
|
|
||||||
changes_neg = defaultdict(list)
|
changes_neg = defaultdict(list)
|
||||||
changes_neu = defaultdict(list)
|
changes_neu = defaultdict(list)
|
||||||
@@ -45,15 +53,15 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
tox1 = imprt(l[i]).toxlevels
|
tox1 = imprt(l[i]).toxlevels
|
||||||
tox2 = imprt(l[i + 1]).toxlevels
|
tox2 = imprt(l[i + 1]).toxlevels
|
||||||
|
|
||||||
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())]
|
neglevelsflat1 = [item['neg'] for item in tox1]
|
||||||
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())]
|
neulevelsflat1 = [item['neu'] for item in tox1]
|
||||||
poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())]
|
poslevelsflat1 = [item['pos'] for item in tox1]
|
||||||
comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())]
|
comlevelsflat1 = [item['compound'] for item in tox1]
|
||||||
|
|
||||||
neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())]
|
neglevelsflat2 = [item['neg'] for item in tox2]
|
||||||
neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())]
|
neulevelsflat2 = [item['neu'] for item in tox2]
|
||||||
poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())]
|
poslevelsflat2 = [item['pos'] for item in tox2]
|
||||||
comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())]
|
comlevelsflat2 = [item['compound'] for item in tox2]
|
||||||
|
|
||||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
||||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
||||||
@@ -74,7 +82,7 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
+ "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n")
|
+ "; ks pos = " + str(changes_pos[p][i]) + "; ks com = " + str(changes_com[p][i]) + "\n")
|
||||||
# pval
|
# pval
|
||||||
for (p, l) in files.items():
|
for (p, l) in files.items():
|
||||||
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
||||||
pval = [x.pvalue for x in changes]
|
pval = [x.pvalue for x in changes]
|
||||||
@@ -92,7 +100,7 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
|
|
||||||
# stat
|
# stat
|
||||||
for (p, l) in files.items():
|
for (p, l) in files.items():
|
||||||
x = [l[i].split("_")[2] + " -\n" + l[i + 1].split("_")[2] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
||||||
stat = [x.statistic for x in changes]
|
stat = [x.statistic for x in changes]
|
||||||
@@ -110,11 +118,12 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
|
|
||||||
|
|
||||||
def plotbydate(onlyfiles, outputdir):
|
def plotbydate(onlyfiles, outputdir):
|
||||||
|
print("plotbydate")
|
||||||
files = defaultdict(list)
|
files = defaultdict(list)
|
||||||
for f in onlyfiles:
|
for f in onlyfiles:
|
||||||
s = f[:-3].split("_")
|
s = f[:-3].split("_")
|
||||||
files[(s[2], s[3])].append(f)
|
files[(s[3], s[4])].append(f)
|
||||||
files = {d: sorted(l, key=lambda e: e.split("_")[4]) for (d, l) in files.items()}
|
files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()}
|
||||||
|
|
||||||
changes_neg = defaultdict(list)
|
changes_neg = defaultdict(list)
|
||||||
changes_neu = defaultdict(list)
|
changes_neu = defaultdict(list)
|
||||||
@@ -129,15 +138,15 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
tox1 = imprt(l[i]).toxlevels
|
tox1 = imprt(l[i]).toxlevels
|
||||||
tox2 = imprt(l[i + 1]).toxlevels
|
tox2 = imprt(l[i + 1]).toxlevels
|
||||||
|
|
||||||
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())]
|
neglevelsflat1 = [item['neg'] for item in tox1]
|
||||||
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())]
|
neulevelsflat1 = [item['neu'] for item in tox1]
|
||||||
poslevelsflat1 = [item['pos'] for item in flatmap(tox1.values())]
|
poslevelsflat1 = [item['pos'] for item in tox1]
|
||||||
comlevelsflat1 = [item['compound'] for item in flatmap(tox1.values())]
|
comlevelsflat1 = [item['compound'] for item in tox1]
|
||||||
|
|
||||||
neglevelsflat2 = [item['neg'] for item in flatmap(tox2.values())]
|
neglevelsflat2 = [item['neg'] for item in tox2]
|
||||||
neulevelsflat2 = [item['neu'] for item in flatmap(tox2.values())]
|
neulevelsflat2 = [item['neu'] for item in tox2]
|
||||||
poslevelsflat2 = [item['pos'] for item in flatmap(tox2.values())]
|
poslevelsflat2 = [item['pos'] for item in tox2]
|
||||||
comlevelsflat2 = [item['compound'] for item in flatmap(tox2.values())]
|
comlevelsflat2 = [item['compound'] for item in tox2]
|
||||||
|
|
||||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
||||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
||||||
@@ -159,7 +168,7 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
|
|
||||||
# pval
|
# pval
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
pval = [x.pvalue for x in changes]
|
pval = [x.pvalue for x in changes]
|
||||||
@@ -177,7 +186,7 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
|
|
||||||
# stat
|
# stat
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
x = [l[i].split("_")[4][:-3] + "-" + l[i + 1].split("_")[4][:-3] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
stat = [x.statistic for x in changes]
|
stat = [x.statistic for x in changes]
|
||||||
@@ -194,15 +203,122 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
def imprt(file):
|
def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||||
spec = importlib.util.spec_from_file_location("module.name", file)
|
print("plotbydateold")
|
||||||
foo = importlib.util.module_from_spec(spec)
|
files = defaultdict(list)
|
||||||
spec.loader.exec_module(foo)
|
for f in onlyfiles:
|
||||||
return foo
|
s = f[:-3].split("_")
|
||||||
|
files[(s[3], s[4])].append(f)
|
||||||
|
dates = sorted(files.keys(), key=lambda e: "-".join(reversed(e[0].split("-"))))
|
||||||
|
files = {d: files[d] for d in dates}
|
||||||
|
files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()}
|
||||||
|
oldfiles = {(f[:-3].split("_")[3], f[:-3].split("_")[4]): f for f in oldfiles}
|
||||||
|
|
||||||
|
changes_neg = defaultdict(list)
|
||||||
|
changes_neu = defaultdict(list)
|
||||||
|
changes_pos = defaultdict(list)
|
||||||
|
changes_com = defaultdict(list)
|
||||||
|
|
||||||
def flatmap(arr):
|
for (d, l) in files.items():
|
||||||
return [item for sublist in arr for item in sublist]
|
print(d)
|
||||||
|
toxold = imprt(oldfiles[d]).toxlevels
|
||||||
|
|
||||||
|
neglevelsold = [item['neg'] for item in toxold]
|
||||||
|
neulevelsold = [item['neu'] for item in toxold]
|
||||||
|
poslevelsold = [item['pos'] for item in toxold]
|
||||||
|
comlevelsold = [item['compound'] for item in toxold]
|
||||||
|
|
||||||
|
for i in range(len(l)):
|
||||||
|
tox1 = imprt(l[i]).toxlevels
|
||||||
|
if len(tox1) == 0 or len(toxold) == 0:
|
||||||
|
changes_neg[d].append(None)
|
||||||
|
changes_neu[d].append(None)
|
||||||
|
changes_pos[d].append(None)
|
||||||
|
changes_com[d].append(None)
|
||||||
|
continue
|
||||||
|
|
||||||
|
neglevelsflat1 = [item['neg'] for item in tox1]
|
||||||
|
neulevelsflat1 = [item['neu'] for item in tox1]
|
||||||
|
poslevelsflat1 = [item['pos'] for item in tox1]
|
||||||
|
comlevelsflat1 = [item['compound'] for item in tox1]
|
||||||
|
|
||||||
|
ksneg = ks_2samp(neglevelsflat1, neglevelsold)
|
||||||
|
ksneu = ks_2samp(neulevelsflat1, neulevelsold)
|
||||||
|
kspos = ks_2samp(poslevelsflat1, poslevelsold)
|
||||||
|
kscom = ks_2samp(comlevelsflat1, comlevelsold)
|
||||||
|
|
||||||
|
changes_neg[d].append(ksneg)
|
||||||
|
changes_neu[d].append(ksneu)
|
||||||
|
changes_pos[d].append(kspos)
|
||||||
|
changes_com[d].append(kscom)
|
||||||
|
|
||||||
|
print("logs")
|
||||||
|
for (d, l) in files.items():
|
||||||
|
# print(d)
|
||||||
|
# print("neg is: " + str(len(changes_neg[d])) + " should: " + str(len(l)))
|
||||||
|
# print("neu is: " + str(len(changes_neu[d])) + " should: " + str(len(l)))
|
||||||
|
# print("pos is: " + str(len(changes_pos[d])) + " should: " + str(len(l)))
|
||||||
|
# print("com is: " + str(len(changes_com[d])) + " should: " + str(len(l)))
|
||||||
|
f1 = oldfiles[d]
|
||||||
|
with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
||||||
|
for i in range(len(l)):
|
||||||
|
if changes_neg[d][i] is None:
|
||||||
|
continue
|
||||||
|
f2 = l[i]
|
||||||
|
f.write(f1 + " -> " + f2 + ": ks neg = " + str(changes_neg[d][i]) + "; ks neu = " + str(changes_neu[d][i])
|
||||||
|
+ "; ks pos = " + str(changes_pos[d][i]) + "; ks com = " + str(changes_com[d][i]) + "\n")
|
||||||
|
|
||||||
|
# pval
|
||||||
|
print("pval")
|
||||||
|
imgmagickcmd = IMAGE_MAGICK
|
||||||
|
for (d, l) in files.items():
|
||||||
|
print(d)
|
||||||
|
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
|
||||||
|
fig = plt.figure(figsize=(16, 12))
|
||||||
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
|
pval = [(xx, c.pvalue) for xx, c in zip(x, changes) if c is not None]
|
||||||
|
plt.plot([p[0] for p in pval], [p[1] for p in pval], label=type + ".pval", color=colors[type])
|
||||||
|
if len(pval) == 0:
|
||||||
|
continue
|
||||||
|
mean = np.mean([p[1] for p in pval])
|
||||||
|
std = np.std([p[1] for p in pval])
|
||||||
|
dev = [(xx, s) for (xx, s) in pval if s <= mean - std or s >= mean + std]
|
||||||
|
plt.plot([p[0] for p in pval], [mean] * len(pval), color=colors[type], ls='dashed')
|
||||||
|
plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o')
|
||||||
|
plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1])
|
||||||
|
plt.xticks(rotation=90)
|
||||||
|
plt.legend(loc="upper right")
|
||||||
|
outfile = outputdir + "/ks_olddate_pval_" + d[0] + "_" + d[1] + ".png"
|
||||||
|
plt.savefig(outfile, bbox_inches='tight')
|
||||||
|
plt.close(fig)
|
||||||
|
imgmagickcmd += " " + outfile
|
||||||
|
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval.pdf")
|
||||||
|
|
||||||
|
# stat
|
||||||
|
print("stat")
|
||||||
|
imgmagickcmd = IMAGE_MAGICK
|
||||||
|
for (d, l) in files.items():
|
||||||
|
print(d)
|
||||||
|
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
|
||||||
|
fig = plt.figure(figsize=(16, 12))
|
||||||
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
|
stat = [(xx, c.statistic) for xx, c in zip(x, changes) if c is not None]
|
||||||
|
plt.plot([p[0] for p in stat], [p[1] for p in stat], label=type + ".stat", color=colors[type])
|
||||||
|
if len(stat) == 0:
|
||||||
|
continue
|
||||||
|
mean = np.mean([p[1] for p in stat])
|
||||||
|
std = np.std([p[1] for p in stat])
|
||||||
|
dev = [(xx, s) for (xx, s) in stat if s <= mean - std or s >= mean + std]
|
||||||
|
plt.plot([p[0] for p in stat], [mean] * len(stat), color=colors[type], ls='dashed')
|
||||||
|
plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev], color=colors[type], ls='None', marker='o')
|
||||||
|
plt.title("KS 2-sided test with new and old users between " + d[0] + " and " + d[1])
|
||||||
|
plt.xticks(rotation=90)
|
||||||
|
plt.legend(loc="upper right")
|
||||||
|
outfile = outputdir + "/ks_olddate_stat_" + d[0] + "_" + d[1] + ".png"
|
||||||
|
plt.savefig(outfile, bbox_inches='tight')
|
||||||
|
plt.close(fig)
|
||||||
|
imgmagickcmd += " " + outfile
|
||||||
|
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat.pdf")
|
||||||
|
|
||||||
|
|
||||||
def filecmp(file1, file2):
|
def filecmp(file1, file2):
|
||||||
|
|||||||
37
common.py
37
common.py
@@ -1,8 +1,14 @@
|
|||||||
|
import importlib
|
||||||
|
from threading import Thread, Lock
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
from loader import dmt
|
from loader import dmt
|
||||||
|
|
||||||
|
|
||||||
def calc_intervals(posts):
|
def calc_intervals(posts):
|
||||||
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], "firstpost").getresults()
|
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'],
|
||||||
|
"firstpost").getresults()
|
||||||
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
|
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
|
||||||
|
|
||||||
# calc quarter beginning
|
# calc quarter beginning
|
||||||
@@ -25,5 +31,32 @@ def calc_intervals(posts):
|
|||||||
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
|
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
|
||||||
intervals.append((cdate, nextquarter))
|
intervals.append((cdate, nextquarter))
|
||||||
cdate = nextquarter
|
cdate = nextquarter
|
||||||
# sys.exit(0)
|
|
||||||
return intervals
|
return intervals
|
||||||
|
|
||||||
|
|
||||||
|
def imprt(file):
|
||||||
|
spec = importlib.util.spec_from_file_location("module.name", file)
|
||||||
|
foo = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(foo)
|
||||||
|
return foo
|
||||||
|
|
||||||
|
|
||||||
|
class FigSaver():
|
||||||
|
def __init__(self):
|
||||||
|
self.__lock = Lock()
|
||||||
|
self.__threads = []
|
||||||
|
|
||||||
|
def save(self, fig, path, **kwargs):
|
||||||
|
thread = Thread(target=self.__dosave, args=(fig, path, kwargs))
|
||||||
|
with self.__lock:
|
||||||
|
self.__threads.append(thread)
|
||||||
|
thread.start()
|
||||||
|
|
||||||
|
def __dosave(self, fig, path, kwargs):
|
||||||
|
fig.savefig(path, **kwargs)
|
||||||
|
plt.close(fig)
|
||||||
|
|
||||||
|
def join(self):
|
||||||
|
with self.__lock:
|
||||||
|
for thread in self.__threads:
|
||||||
|
thread.join()
|
||||||
|
|||||||
13
loader.py
13
loader.py
@@ -1,5 +1,7 @@
|
|||||||
|
import html
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import operator
|
import operator
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
import xml.etree.cElementTree as et
|
import xml.etree.cElementTree as et
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@@ -7,11 +9,13 @@ from datetime import datetime
|
|||||||
|
|
||||||
from mt import mt
|
from mt import mt
|
||||||
|
|
||||||
|
TAG_RE = re.compile(r'<[^>]+>')
|
||||||
|
|
||||||
printnoln = lambda text: print(text, end='', flush=True)
|
printnoln = lambda text: print(text, end='', flush=True)
|
||||||
rprint = lambda text: print('\r' + text)
|
rprint = lambda text: print('\r' + text)
|
||||||
|
|
||||||
|
|
||||||
def dmt(data): return mt(multiprocessing.cpu_count(), data, False)
|
def dmt(data, progressinterval=1000): return mt(multiprocessing.cpu_count(), data, False, progressinterval)
|
||||||
|
|
||||||
|
|
||||||
def cms(): return int(round(time.time() * 1000))
|
def cms(): return int(round(time.time() * 1000))
|
||||||
@@ -75,6 +79,7 @@ def mapQuestion(item):
|
|||||||
question = {tag: getTag(item, tag) for tag in tags}
|
question = {tag: getTag(item, tag) for tag in tags}
|
||||||
for tag in datetags:
|
for tag in datetags:
|
||||||
question[tag] = datetime.fromisoformat(question[tag])
|
question[tag] = datetime.fromisoformat(question[tag])
|
||||||
|
question['Body'] = removetags(html.unescape(question['Body']))
|
||||||
return question
|
return question
|
||||||
|
|
||||||
|
|
||||||
@@ -84,6 +89,7 @@ def mapAnswer(item):
|
|||||||
answer = {tag: getTag(item, tag) for tag in tags}
|
answer = {tag: getTag(item, tag) for tag in tags}
|
||||||
for tag in datetags:
|
for tag in datetags:
|
||||||
answer[tag] = datetime.fromisoformat(answer[tag])
|
answer[tag] = datetime.fromisoformat(answer[tag])
|
||||||
|
answer['Body'] = removetags(html.unescape(answer['Body']))
|
||||||
return answer
|
return answer
|
||||||
|
|
||||||
|
|
||||||
@@ -93,6 +99,7 @@ def mapComment(item):
|
|||||||
comment = {tag: getTag(item, tag) for tag in tags}
|
comment = {tag: getTag(item, tag) for tag in tags}
|
||||||
for tag in datetags:
|
for tag in datetags:
|
||||||
comment[tag] = datetime.fromisoformat(comment[tag])
|
comment[tag] = datetime.fromisoformat(comment[tag])
|
||||||
|
comment['Body'] = removetags(html.unescape(comment['Body']))
|
||||||
return comment
|
return comment
|
||||||
|
|
||||||
|
|
||||||
@@ -201,3 +208,7 @@ def tagExists(item, tag):
|
|||||||
def setprop(dic, key, value):
|
def setprop(dic, key, value):
|
||||||
dic[key] = value
|
dic[key] = value
|
||||||
return dic
|
return dic
|
||||||
|
|
||||||
|
|
||||||
|
def removetags(text):
    """Return *text* with every substring matched by ``TAG_RE`` removed.

    ``TAG_RE`` is the module-level pattern ``<[^>]+>``, so this drops
    angle-bracket markup tags and keeps the text between them.
    """
    stripped = TAG_RE.sub('', text)
    return stripped
|
||||||
|
|||||||
44
mt.py
44
mt.py
@@ -1,10 +1,10 @@
|
|||||||
from threading import Thread, Lock
|
|
||||||
import time
|
import time
|
||||||
from math import ceil
|
from math import ceil
|
||||||
|
from threading import Thread, Lock
|
||||||
|
|
||||||
|
|
||||||
class mt():
|
class mt():
|
||||||
def __init__(self, threads, data, verbose=False):
|
def __init__(self, threads, data, verbose=False, progressinterval=1000):
|
||||||
self.__running = False
|
self.__running = False
|
||||||
self.__closed = False
|
self.__closed = False
|
||||||
self.__data = data
|
self.__data = data
|
||||||
@@ -21,6 +21,7 @@ class mt():
|
|||||||
self.__lock = Lock()
|
self.__lock = Lock()
|
||||||
self.__results = []
|
self.__results = []
|
||||||
self.__progress = 0
|
self.__progress = 0
|
||||||
|
self.__progressinterval = progressinterval
|
||||||
for i in range(self.__threadcount):
|
for i in range(self.__threadcount):
|
||||||
self.__results.append([])
|
self.__results.append([])
|
||||||
self.__threads.append(None)
|
self.__threads.append(None)
|
||||||
@@ -35,9 +36,9 @@ class mt():
|
|||||||
self.__running = True
|
self.__running = True
|
||||||
self.__final = self.__getresultsmapfilter
|
self.__final = self.__getresultsmapfilter
|
||||||
self.__type = "filter"
|
self.__type = "filter"
|
||||||
self.__comment = comment if comment is not None else ""
|
self.__comment = comment
|
||||||
if comment is not None:
|
if comment is not None:
|
||||||
print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
||||||
self.__starttime = self.__cms()
|
self.__starttime = self.__cms()
|
||||||
self.__endtime = None
|
self.__endtime = None
|
||||||
for i in range(self.__threadcount):
|
for i in range(self.__threadcount):
|
||||||
@@ -50,20 +51,22 @@ class mt():
|
|||||||
now = self.__cms()
|
now = self.__cms()
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for j in range(ceil(len(list) / 1000)):
|
for j in range(ceil(len(list) / self.__progressinterval)):
|
||||||
part = list[j * 1000: min((j + 1) * 1000, len(list))]
|
part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))]
|
||||||
results += [l for l in part if cond(l)]
|
results += [l for l in part if cond(l)]
|
||||||
with self.__lock:
|
with self.__lock:
|
||||||
self.__progress += len(part)
|
self.__progress += len(part)
|
||||||
if self.__comment is not None:
|
if self.__comment is not None:
|
||||||
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
|
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
|
||||||
|
|
||||||
# results = [l for l in list if cond(l)]
|
|
||||||
with self.__lock:
|
with self.__lock:
|
||||||
self.__results[i] = results
|
self.__results[i] = results
|
||||||
dur = self.__cms() - now
|
dur = self.__cms() - now
|
||||||
if self.__verbose:
|
if self.__verbose:
|
||||||
|
if self.__comment is not None:
|
||||||
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms")
|
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms")
|
||||||
|
else:
|
||||||
|
print("Thread " + str(i) + ": filter took " + str(dur) + "ms")
|
||||||
|
|
||||||
def map(self, func, comment=None):
|
def map(self, func, comment=None):
|
||||||
if self.__closed:
|
if self.__closed:
|
||||||
@@ -75,7 +78,7 @@ class mt():
|
|||||||
self.__running = True
|
self.__running = True
|
||||||
self.__final = self.__getresultsmapfilter
|
self.__final = self.__getresultsmapfilter
|
||||||
self.__type = "map"
|
self.__type = "map"
|
||||||
self.__comment = comment if comment is not None else ""
|
self.__comment = comment
|
||||||
if comment is not None:
|
if comment is not None:
|
||||||
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
||||||
self.__starttime = self.__cms()
|
self.__starttime = self.__cms()
|
||||||
@@ -89,20 +92,22 @@ class mt():
|
|||||||
def __domap(self, i, list, func):
    """Worker body for ``map``: apply *func* to each element of *list*.

    The input is processed in parts of ``self.__progressinterval`` elements;
    after each part the shared progress counter is advanced and, when a
    comment was given, a progress line is printed. The mapped values are
    stored in ``self.__results[i]``.

    Args:
        i: Index of this worker; selects the slot in ``self.__results``.
        list: The elements this worker processes (name kept from the
            original signature although it shadows the builtin).
        func: Callable applied to every element.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view - confirm against the real file before applying.
    now = self.__cms()
    results = []
    for j in range(ceil(len(list) / self.__progressinterval)):
        part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))]
        results += [func(l) for l in part]
        with self.__lock:
            # The progress counter is shared by all workers.
            self.__progress += len(part)
            if self.__comment is not None:
                print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)

    with self.__lock:
        self.__results[i] = results
    dur = self.__cms() - now
    if self.__verbose:
        # This is the map worker, so report "map" (the message previously
        # said "filter", copied from the filter worker).
        if self.__comment is not None:
            print(self.__comment + ": Thread " + str(i) + ": map took " + str(dur) + "ms")
        else:
            print("Thread " + str(i) + ": map took " + str(dur) + "ms")
|
||||||
|
|
||||||
def reduce(self, reducer, aggregator, initval, comment=None):
|
def reduce(self, reducer, aggregator, initval, comment=None):
|
||||||
if self.__closed:
|
if self.__closed:
|
||||||
@@ -114,9 +119,9 @@ class mt():
|
|||||||
self.__running = True
|
self.__running = True
|
||||||
self.__final = lambda: self.__getresultsreduce(aggregator, initval)
|
self.__final = lambda: self.__getresultsreduce(aggregator, initval)
|
||||||
self.__type = "reduce"
|
self.__type = "reduce"
|
||||||
self.__comment = comment if comment is not None else ""
|
self.__comment = comment
|
||||||
if comment is not None:
|
if comment is not None:
|
||||||
print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
||||||
self.__starttime = self.__cms()
|
self.__starttime = self.__cms()
|
||||||
self.__endtime = None
|
self.__endtime = None
|
||||||
for i in range(self.__threadcount):
|
for i in range(self.__threadcount):
|
||||||
@@ -129,8 +134,8 @@ class mt():
|
|||||||
now = self.__cms()
|
now = self.__cms()
|
||||||
val = initval()
|
val = initval()
|
||||||
|
|
||||||
for j in range(ceil(len(list) / 1000)):
|
for j in range(ceil(len(list) / self.__progressinterval)):
|
||||||
part = list[j * 1000: min((j + 1) * 1000, len(list))]
|
part = list[j * self.__progressinterval: min((j + 1) * self.__progressinterval, len(list))]
|
||||||
for k in range(len(part)):
|
for k in range(len(part)):
|
||||||
val = reducer(val, part[k])
|
val = reducer(val, part[k])
|
||||||
with self.__lock:
|
with self.__lock:
|
||||||
@@ -138,13 +143,14 @@ class mt():
|
|||||||
if self.__comment is not None:
|
if self.__comment is not None:
|
||||||
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
|
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
|
||||||
|
|
||||||
# for j in range(len(list)):
|
|
||||||
# val = reducer(val, list[j])
|
|
||||||
with self.__lock:
|
with self.__lock:
|
||||||
self.__results[i] = val
|
self.__results[i] = val
|
||||||
dur = self.__cms() - now
|
dur = self.__cms() - now
|
||||||
if self.__verbose:
|
if self.__verbose:
|
||||||
print(self.__comment + ": Thread " + str(i) + ": reduce took " + str(dur) + "ms")
|
if self.__comment is not None:
|
||||||
|
print(self.__comment + ": Thread " + str(i) + ": filter took " + str(dur) + "ms")
|
||||||
|
else:
|
||||||
|
print("Thread " + str(i) + ": filter took " + str(dur) + "ms")
|
||||||
|
|
||||||
def getresults(self):
|
def getresults(self):
|
||||||
self.join()
|
self.join()
|
||||||
|
|||||||
@@ -36,17 +36,12 @@ def main(folder):
|
|||||||
# toxlevel = computeToxLevel(a['Body'])
|
# toxlevel = computeToxLevel(a['Body'])
|
||||||
# toxlevels[post['Id']].append(toxlevel)
|
# toxlevels[post['Id']].append(toxlevel)
|
||||||
# rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms")
|
# rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms")
|
||||||
toxlevels = dmt(posts).map(lambda p: (p['Id'], computeSentimentForPost(p)), "calculating sentiments").getresults()
|
toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults()
|
||||||
toxlevels = {id: p for (id, p) in toxlevels}
|
toxlevels = {id: p for (id, p) in toxlevels}
|
||||||
|
|
||||||
dumptoxlevels(toxlevels, outfilename + ".py")
|
dumptoxlevels(toxlevels, outfilename + ".py")
|
||||||
|
|
||||||
|
|
||||||
def computeSentimentForPost(post):
|
|
||||||
anwsers = {a['Id']: computeToxLevel(a['Body']) for a in post['Answers']}
|
|
||||||
return anwsers
|
|
||||||
|
|
||||||
|
|
||||||
def computeToxLevel(text):
|
def computeToxLevel(text):
|
||||||
return analyser.polarity_scores(text)
|
return analyser.polarity_scores(text)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user