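# Source listing metadata (from the repository web view):
#   file:     master/calctoxdiff.py
#   commit:   wea_ondara a14b3af21a "wip"
#   date:     2019-07-29 21:34:34 +02:00
#   size:     350 lines, 15 KiB
#   language: Python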

import os
import sys
from collections import defaultdict
from datetime import datetime
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp
from common import imprt
# Line colour used for each sentiment series in every plot of this module.
# 'com' is the short name used for the 'compound' score of a toxlevels item.
colors = {
    'neg': 'red',
    'neu': 'green',
    'pos': 'blue',
    'com': 'orange',
}

# Command used to invoke ImageMagick when the per-date PNGs are merged
# into a single PDF.
IMAGE_MAGICK = "magick"
def main(folder):
    """Compare new-user batches against old-user batches found under *folder*.

    Reads the ``.py`` batch files in ``<folder>/output/batch/`` whose names
    contain ``newusers`` / ``oldusers`` and hands both sorted lists to
    ``plotbydateold``, which writes its logs and plots to
    ``<folder>/output/ksbatch/``.

    Args:
        folder: Path of the data folder (must already contain
            ``output/batch/``).

    Raises:
        OSError: If the output directory cannot be created or the batch
            directory cannot be listed.
    """
    outputdir = folder + "/output/ksbatch/"
    # Create the directory directly instead of shelling out to `mkdir -p`:
    # a shell command built by concatenation breaks on paths containing
    # spaces or shell metacharacters and does not work without a POSIX shell.
    os.makedirs(outputdir, exist_ok=True)
    srcfolder = folder + "/output/batch/"

    def batch_files(marker):
        """Return the sorted paths of batch files whose name contains *marker*."""
        return sorted(
            srcfolder + name
            for name in listdir(srcfolder)
            if isfile(join(srcfolder, name))
            and name.endswith(".py")
            and marker in name
        )

    onlyfiles = batch_files("newusers")
    oldfiles = batch_files("oldusers")

    # The two new-user-only analyses are disabled in this revision:
    # plotbypost(onlyfiles, outputdir)
    # plotbydate(onlyfiles, outputdir)
    plotbydateold(onlyfiles, oldfiles, outputdir)
def plotbypost(onlyfiles, outputdir):
    """Run KS tests between date-consecutive batches that share a post count.

    Each path in *onlyfiles* is split on ``"_"``: field 5 (with the ``.py``
    suffix removed) is parsed as an integer post count and field 3 as a
    ``%d-%m-%Y`` date. Files are grouped by post count, ordered by date, and
    every file is compared with its successor using a two-sample
    Kolmogorov-Smirnov test on the ``neg``, ``neu``, ``pos`` and ``compound``
    values of the ``toxlevels`` items loaded through ``imprt``.

    For every post count this writes ``ks_post_<n>.log``,
    ``ks_post_pval_<n>.png`` and ``ks_post_stat_<n>.png`` into *outputdir*.

    Args:
        onlyfiles: Paths of the batch files to analyse.
        outputdir: Directory the logs and plots are written to.

    Raises:
        ValueError: If a file name does not carry an integer in field 5 or
            a ``%d-%m-%Y`` date in field 3.
    """
    print("plotbypost")
    # Series name (also the key into `colors`) -> key inside a toxlevels item.
    score_keys = {"neg": "neg", "neu": "neu", "pos": "pos", "com": "compound"}

    def date_of(path):
        """Parse the date stored in field 3 of *path*."""
        return datetime.strptime(path.split("_")[3], "%d-%m-%Y")

    def load_levels(path):
        """Load *path* and return one flat list of values per series."""
        tox = imprt(path).toxlevels
        return {kind: [item[key] for item in tox] for kind, key in score_keys.items()}

    grouped = defaultdict(list)
    for path in onlyfiles:
        grouped[int(path[:-3].split("_")[5])].append(path)
    files = {posts: sorted(paths, key=date_of) for posts, paths in grouped.items()}

    # results[posts][kind][i] is the KS result for paths[i] -> paths[i + 1],
    # or None when one of the two samples is empty (same convention as
    # plotbydateold; ks_2samp has no meaningful result for empty input).
    results = {}
    for posts, paths in files.items():
        per_kind = {kind: [] for kind in score_keys}
        results[posts] = per_kind
        if len(paths) < 2:
            continue
        print(posts)
        # Keep the previous file's values so every file is loaded only once.
        previous = load_levels(paths[0])
        for path in paths[1:]:
            current = load_levels(path)
            comparable = bool(previous["neg"]) and bool(current["neg"])
            for kind in score_keys:
                per_kind[kind].append(
                    ks_2samp(previous[kind], current[kind]) if comparable else None
                )
            previous = current

    for posts, paths in files.items():
        per_kind = results[posts]
        with open(outputdir + "/ks_post_" + str(posts) + ".log", "w") as log:
            for i, (older, newer) in enumerate(zip(paths, paths[1:])):
                if per_kind["neg"][i] is None:
                    continue
                summary = "; ".join(
                    "ks " + kind + " = " + str(per_kind[kind][i]) for kind in score_keys
                )
                log.write(older + " -> " + newer + ": " + summary + "\n")

    def plot_series(attr, suffix):
        """Plot attribute *attr* of every KS result, one figure per post count."""
        for posts, paths in files.items():
            labels = [
                older.split("_")[3] + " -\n" + newer.split("_")[3]
                for older, newer in zip(paths, paths[1:])
            ]
            fig = plt.figure(figsize=(16, 12))
            for kind, tests in results[posts].items():
                points = [
                    (label, getattr(test, attr))
                    for label, test in zip(labels, tests)
                    if test is not None
                ]
                xs = [label for label, _ in points]
                ys = [value for _, value in points]
                plt.plot(xs, ys, label=kind + "." + suffix, color=colors[kind])
                if not points:
                    # Nothing to average; np.mean([]) would only emit a
                    # RuntimeWarning and yield nan.
                    continue
                mean = np.mean(ys)
                std = np.std(ys)
                # Highlight values at least one standard deviation off the mean.
                outliers = [
                    (label, value)
                    for label, value in points
                    if value <= mean - std or value >= mean + std
                ]
                plt.plot(xs, [mean] * len(ys), color=colors[kind], ls='dashed')
                plt.plot(
                    [label for label, _ in outliers],
                    [value for _, value in outliers],
                    color=colors[kind], ls='None', marker='o',
                )
            plt.title("KS 2-sided test with max " + str(posts) + " posts")
            plt.xticks(rotation=90)
            plt.legend(loc="upper right")
            plt.savefig(
                outputdir + "/ks_post_" + suffix + "_" + str(posts) + ".png",
                bbox_inches='tight',
            )
            plt.close(fig)

    plot_series("pvalue", "pval")
    plot_series("statistic", "stat")
def plotbydate(onlyfiles, outputdir):
    """Run KS tests between post-count-consecutive batches of one date range.

    Each path in *onlyfiles* is split on ``"_"``: fields 3 and 4 (with the
    ``.py`` suffix removed) identify the date range and field 5 the post
    count. Files are grouped by date range, ordered by post count, and every
    file is compared with its successor using a two-sample
    Kolmogorov-Smirnov test on the ``neg``, ``neu``, ``pos`` and ``compound``
    values of the ``toxlevels`` items loaded through ``imprt``.

    For every date range this writes ``ks_date_<from>_<to>.log``,
    ``ks_date_pval_<from>_<to>.png`` and ``ks_date_stat_<from>_<to>.png``
    into *outputdir*.

    Args:
        onlyfiles: Paths of the batch files to analyse.
        outputdir: Directory the logs and plots are written to.
    """
    print("plotbydate")
    # Series name (also the key into `colors`) -> key inside a toxlevels item.
    score_keys = {"neg": "neg", "neu": "neu", "pos": "pos", "com": "compound"}

    def post_count_key(path):
        """Sort key for field 5: numeric when possible, textual otherwise.

        Field 5 is parsed as an integer by plotbypost, so comparing it as a
        string would put e.g. "100" before "20". Non-numeric values keep
        their textual order and sort after all numeric ones.
        """
        token = path[:-3].split("_")[5]
        try:
            return (0, int(token), token)
        except ValueError:
            return (1, 0, token)

    def load_levels(path):
        """Load *path* and return one flat list of values per series."""
        tox = imprt(path).toxlevels
        return {kind: [item[key] for item in tox] for kind, key in score_keys.items()}

    grouped = defaultdict(list)
    for path in onlyfiles:
        fields = path[:-3].split("_")
        grouped[(fields[3], fields[4])].append(path)
    files = {
        dates: sorted(paths, key=post_count_key) for dates, paths in grouped.items()
    }

    # results[dates][kind][i] is the KS result for paths[i] -> paths[i + 1],
    # or None when one of the two samples is empty (same convention as
    # plotbydateold; ks_2samp has no meaningful result for empty input).
    results = {}
    for dates, paths in files.items():
        per_kind = {kind: [] for kind in score_keys}
        results[dates] = per_kind
        if len(paths) < 2:
            continue
        print(dates)
        # Keep the previous file's values so every file is loaded only once.
        previous = load_levels(paths[0])
        for path in paths[1:]:
            current = load_levels(path)
            comparable = bool(previous["neg"]) and bool(current["neg"])
            for kind in score_keys:
                per_kind[kind].append(
                    ks_2samp(previous[kind], current[kind]) if comparable else None
                )
            previous = current

    for dates, paths in files.items():
        per_kind = results[dates]
        logname = outputdir + "/ks_date_" + dates[0] + "_" + dates[1] + ".log"
        with open(logname, "w") as log:
            for i, (smaller, larger) in enumerate(zip(paths, paths[1:])):
                if per_kind["neg"][i] is None:
                    continue
                summary = "; ".join(
                    "ks " + kind + " = " + str(per_kind[kind][i]) for kind in score_keys
                )
                log.write(smaller + " -> " + larger + ": " + summary + "\n")

    def plot_series(attr, suffix):
        """Plot attribute *attr* of every KS result, one figure per date range."""
        for dates, paths in files.items():
            labels = [
                smaller.split("_")[5][:-3] + "-" + larger.split("_")[5][:-3]
                for smaller, larger in zip(paths, paths[1:])
            ]
            fig = plt.figure(figsize=(16, 12))
            for kind, tests in results[dates].items():
                points = [
                    (label, getattr(test, attr))
                    for label, test in zip(labels, tests)
                    if test is not None
                ]
                xs = [label for label, _ in points]
                ys = [value for _, value in points]
                plt.plot(xs, ys, label=kind + "." + suffix, color=colors[kind])
                if not points:
                    # Nothing to average; np.mean([]) would only emit a
                    # RuntimeWarning and yield nan.
                    continue
                mean = np.mean(ys)
                std = np.std(ys)
                # Highlight values at least one standard deviation off the mean.
                outliers = [
                    (label, value)
                    for label, value in points
                    if value <= mean - std or value >= mean + std
                ]
                plt.plot(xs, [mean] * len(ys), color=colors[kind], ls='dashed')
                plt.plot(
                    [label for label, _ in outliers],
                    [value for _, value in outliers],
                    color=colors[kind], ls='None', marker='o',
                )
            plt.title("KS 2-sided test with between " + dates[0] + " and " + dates[1])
            plt.xticks(rotation=90)
            plt.legend(loc="upper right")
            plt.savefig(
                outputdir + "/ks_date_" + suffix + "_" + dates[0] + "_" + dates[1] + ".png",
                bbox_inches='tight',
            )
            plt.close(fig)

    plot_series("pvalue", "pval")
    plot_series("statistic", "stat")
def plotbydateold(onlyfiles, oldfiles, outputdir):
    """Run KS tests of new-user batches against the old-user batch per date range.

    Paths are split on ``"_"`` after removing the ``.py`` suffix: fields 3
    and 4 identify the date range and, for new-user files, field 5 the post
    count. For every date range each new-user file is compared with the
    single old-user file of the same range using a two-sample
    Kolmogorov-Smirnov test on the ``neg``, ``neu``, ``pos`` and ``compound``
    values of the ``toxlevels`` items loaded through ``imprt``. Comparisons
    where either sample is empty are skipped.

    For every date range this writes ``ks_olddate_<from>_<to>.log``,
    ``ks_olddate_pval_<from>_<to>.png`` and ``ks_olddate_stat_<from>_<to>.png``
    into *outputdir*; the PNGs of each kind are then merged into
    ``ks_olddate_pval.pdf`` / ``ks_olddate_stat.pdf`` with ImageMagick.

    Args:
        onlyfiles: Paths of the new-user batch files.
        oldfiles: Paths of the old-user batch files (one per date range).
        outputdir: Directory the logs, plots and PDFs are written to.

    Raises:
        KeyError: If a date range of *onlyfiles* has no file in *oldfiles*.
    """
    import shlex
    import subprocess

    print("plotbydateold")
    # Series name (also the key into `colors`) -> key inside a toxlevels item.
    score_keys = {"neg": "neg", "neu": "neu", "pos": "pos", "com": "compound"}

    def post_count_key(path):
        """Sort key for field 5: numeric when possible, textual otherwise.

        Field 5 is parsed as an integer by plotbypost, so comparing it as a
        string would put e.g. "100" before "20". Non-numeric values keep
        their textual order and sort after all numeric ones.
        """
        token = path[:-3].split("_")[5]
        try:
            return (0, int(token), token)
        except ValueError:
            return (1, 0, token)

    def load_levels(path):
        """Load *path* and return one flat list of values per series."""
        tox = imprt(path).toxlevels
        return {kind: [item[key] for item in tox] for kind, key in score_keys.items()}

    grouped = defaultdict(list)
    for path in onlyfiles:
        fields = path[:-3].split("_")
        grouped[(fields[3], fields[4])].append(path)
    # Order the ranges chronologically: reversing the "-"-separated parts of
    # the start date puts its last part first before the string comparison.
    ordered = sorted(grouped, key=lambda dates: "-".join(reversed(dates[0].split("-"))))
    files = {dates: sorted(grouped[dates], key=post_count_key) for dates in ordered}

    old_by_dates = {}
    for path in oldfiles:
        fields = path[:-3].split("_")
        old_by_dates[(fields[3], fields[4])] = path

    # results[dates][kind][i] is the KS result for paths[i] vs. the old-user
    # file, or None when one of the two samples is empty.
    results = {}
    for dates, paths in files.items():
        print(dates)
        per_kind = {kind: [] for kind in score_keys}
        results[dates] = per_kind
        old = load_levels(old_by_dates[dates])
        old_is_empty = not old["neg"]
        for path in paths:
            new = load_levels(path)
            comparable = not old_is_empty and bool(new["neg"])
            for kind in score_keys:
                per_kind[kind].append(
                    ks_2samp(new[kind], old[kind]) if comparable else None
                )

    print("logs")
    for dates, paths in files.items():
        per_kind = results[dates]
        old_path = old_by_dates[dates]
        logname = outputdir + "/ks_olddate_" + dates[0] + "_" + dates[1] + ".log"
        with open(logname, "w") as log:
            for i, new_path in enumerate(paths):
                if per_kind["neg"][i] is None:
                    continue
                summary = "; ".join(
                    "ks " + kind + " = " + str(per_kind[kind][i]) for kind in score_keys
                )
                log.write(old_path + " -> " + new_path + ": " + summary + "\n")

    def plot_series(attr, suffix):
        """Plot attribute *attr* per date range and merge the PNGs into a PDF."""
        print(suffix)
        pngs = []
        for dates, paths in files.items():
            print(dates)
            labels = [path[:-3].split("_")[5] for path in paths]
            fig = plt.figure(figsize=(16, 12))
            for kind, tests in results[dates].items():
                points = [
                    (label, getattr(test, attr))
                    for label, test in zip(labels, tests)
                    if test is not None
                ]
                xs = [label for label, _ in points]
                ys = [value for _, value in points]
                plt.plot(xs, ys, label=kind + "." + suffix, color=colors[kind])
                if not points:
                    continue
                mean = np.mean(ys)
                std = np.std(ys)
                # Highlight values at least one standard deviation off the mean.
                outliers = [
                    (label, value)
                    for label, value in points
                    if value <= mean - std or value >= mean + std
                ]
                plt.plot(xs, [mean] * len(ys), color=colors[kind], ls='dashed')
                plt.plot(
                    [label for label, _ in outliers],
                    [value for _, value in outliers],
                    color=colors[kind], ls='None', marker='o',
                )
            plt.title(
                "KS 2-sided test with new and old users between "
                + dates[0] + " and " + dates[1]
            )
            plt.xticks(rotation=90)
            plt.legend(loc="upper right")
            outfile = (
                outputdir + "/ks_olddate_" + suffix + "_"
                + dates[0] + "_" + dates[1] + ".png"
            )
            plt.savefig(outfile, bbox_inches='tight')
            plt.close(fig)
            pngs.append(outfile)

        if not pngs:
            return
        # Pass the paths as an argument list instead of one shell string so
        # that file names containing spaces or shell metacharacters work.
        command = shlex.split(IMAGE_MAGICK) + pngs
        command.append(outputdir + "/ks_olddate_" + suffix + ".pdf")
        try:
            subprocess.run(command, check=False)
        except OSError as err:
            # The PDF is a convenience; a missing ImageMagick must not abort
            # the run after all plots have already been written.
            print("could not run " + IMAGE_MAGICK + ": " + str(err))

    plot_series("pvalue", "pval")
    plot_series("statistic", "stat")
def filecmp(file1, file2):
    """Three-way compare two file names by the date in their field 2.

    Both names are split on ``"_"`` and field 2 is parsed as a ``%d-%m-%Y``
    date. Identical names compare equal without being parsed.

    Args:
        file1: First file name.
        file2: Second file name.

    Returns:
        -1 if the date of *file1* is earlier, 1 if it is later, 0 if the
        dates (or the names) are equal.

    Raises:
        IndexError: If a differing name has fewer than three fields.
        ValueError: If field 2 of a differing name is not a ``%d-%m-%Y`` date.
    """
    if file1 == file2:
        return 0
    first, second = (
        datetime.strptime(name.split("_")[2], "%d-%m-%Y") for name in (file1, file2)
    )
    # Sign of the difference: bool arithmetic yields -1, 0 or 1.
    return (first > second) - (first < second)
if __name__ == "__main__":
    # Script entry point: the data folder is the first command-line argument.
    if len(sys.argv) < 2:
        usage = sys.argv[0] + " <folder>"
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if os.path.isdir(folder):
        main(folder)
    else:
        print(folder + " is not a folder")
        sys.exit(1)