wip
This commit is contained in:
145
calctoxdiff.py
145
calctoxdiff.py
@@ -14,29 +14,29 @@ from common import imprt, IMAGE_MAGICK
|
||||
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
|
||||
|
||||
|
||||
def main(folder):
|
||||
def main(folder, intervl):
|
||||
outputdir = folder + "/output/ksbatch/"
|
||||
os.system("mkdir -p " + outputdir)
|
||||
srcfolder = folder + "/output/batch/"
|
||||
|
||||
onlyfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "newusers" in f]
|
||||
onlyfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "newusers" in f and "i" + str(intervl) in f]
|
||||
onlyfiles = sorted(onlyfiles)
|
||||
|
||||
plotbypost(onlyfiles, outputdir)
|
||||
plotbydate(onlyfiles, outputdir)
|
||||
plotbypost(onlyfiles, outputdir, intervl)
|
||||
plotbydate(onlyfiles, outputdir, intervl)
|
||||
|
||||
oldfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "oldusers" in f]
|
||||
oldfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "oldusers" in f and "i" + str(intervl) in f]
|
||||
oldfiles = sorted(oldfiles)
|
||||
|
||||
plotbydateold(onlyfiles, oldfiles, outputdir)
|
||||
plotbydateold(onlyfiles, oldfiles, outputdir, intervl)
|
||||
|
||||
|
||||
def plotbypost(onlyfiles, outputdir):
|
||||
def plotbypost(onlyfiles, outputdir, intervl):
|
||||
print("plotbypost")
|
||||
files = defaultdict(list)
|
||||
for f in onlyfiles:
|
||||
s = f[:-3].split("_")
|
||||
files[int(s[5])].append(f)
|
||||
files[int(s[6])].append(f)
|
||||
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()}
|
||||
|
||||
changes_neg = defaultdict(list)
|
||||
@@ -62,10 +62,10 @@ def plotbypost(onlyfiles, outputdir):
|
||||
poslevelsflat2 = [item['pos'] for item in tox2]
|
||||
comlevelsflat2 = [item['compound'] for item in tox2]
|
||||
|
||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
||||
kspos = ks_2samp(poslevelsflat1, poslevelsflat2)
|
||||
kscom = ks_2samp(comlevelsflat1, comlevelsflat2)
|
||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) if len(neglevelsflat1) > 0 and len(neglevelsflat2) > 0 else "no values"
|
||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) if len(neulevelsflat1) > 0 and len(neulevelsflat2) > 0 else "no values"
|
||||
kspos = ks_2samp(poslevelsflat1, poslevelsflat2) if len(poslevelsflat1) > 0 and len(poslevelsflat2) > 0 else "no values"
|
||||
kscom = ks_2samp(comlevelsflat1, comlevelsflat2) if len(comlevelsflat1) > 0 and len(comlevelsflat2) > 0 else "no values"
|
||||
|
||||
changes_neg[p].append(ksneg)
|
||||
changes_neu[p].append(ksneu)
|
||||
@@ -73,7 +73,7 @@ def plotbypost(onlyfiles, outputdir):
|
||||
changes_com[p].append(kscom)
|
||||
|
||||
for (p, l) in files.items():
|
||||
with open(outputdir + "/ks_post_" + str(p) + ".log", "w") as f:
|
||||
with open(outputdir + "/ks_post_i" + str(intervl) + "_" + str(p) + ".log", "w") as f:
|
||||
for i in range(len(l) - 1):
|
||||
f1 = l[i]
|
||||
f2 = l[i + 1]
|
||||
@@ -85,11 +85,12 @@ def plotbypost(onlyfiles, outputdir):
|
||||
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
||||
pval = [x.pvalue for x in changes]
|
||||
pval = [x.pvalue if not isinstance(x, str) else None for x in changes]
|
||||
pvalnotnull = [x for x in pval if x is not None]
|
||||
plt.plot(x, pval, label=type + ".pval", color=colors[type])
|
||||
mean = np.mean(pval)
|
||||
std = np.std(pval)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
|
||||
mean = np.mean(pvalnotnull)
|
||||
std = np.std(pvalnotnull)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, pval) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
|
||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||
plt.title("KS 2-sided test with max " + str(p) + " posts")
|
||||
@@ -97,10 +98,10 @@ def plotbypost(onlyfiles, outputdir):
|
||||
plt.xlabel("Comparision: time frame X - time frame X+1")
|
||||
plt.ylabel("p-value")
|
||||
plt.legend(loc="upper right")
|
||||
plt.savefig(outputdir + "/ks_post_pval_" + str(p) + ".png", bbox_inches='tight')
|
||||
plt.savefig(outputdir + "/ks_post_pval_i" + str(intervl) + "_" + str(p) + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
magick += " " + outputdir + "/ks_post_pval_" + str(p) + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_post_pval.pdf")
|
||||
magick += " " + outputdir + "/ks_post_pval_i" + str(intervl) + "_" + str(p) + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_post_pval_i" + str(intervl) + ".pdf")
|
||||
|
||||
# stat
|
||||
magick = IMAGE_MAGICK
|
||||
@@ -108,11 +109,12 @@ def plotbypost(onlyfiles, outputdir):
|
||||
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
||||
stat = [x.statistic for x in changes]
|
||||
stat = [x.statistic if not isinstance(x, str) else None for x in changes]
|
||||
statnotnull = [x for x in stat if x is not None]
|
||||
plt.plot(x, stat, label=type + ".stat", color=colors[type])
|
||||
mean = np.mean(stat)
|
||||
std = np.std(stat)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
|
||||
mean = np.mean(statnotnull)
|
||||
std = np.std(statnotnull)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, stat) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
|
||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||
plt.title("KS 2-sided test with max " + str(p) + " posts")
|
||||
@@ -120,13 +122,13 @@ def plotbypost(onlyfiles, outputdir):
|
||||
plt.xlabel("Comparision: time frame X - time frame X+1")
|
||||
plt.ylabel("stat value")
|
||||
plt.legend(loc="upper right")
|
||||
plt.savefig(outputdir + "/ks_post_stat_" + str(p) + ".png", bbox_inches='tight')
|
||||
plt.savefig(outputdir + "/ks_post_stat_i" + str(intervl) + "_" + str(p) + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
magick += " " + outputdir + "/ks_post_stat_" + str(p) + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_post_stat.pdf")
|
||||
magick += " " + outputdir + "/ks_post_stat_i" + str(intervl) + "_" + str(p) + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_post_stat_i" + str(intervl) + ".pdf")
|
||||
|
||||
|
||||
def plotbydate(onlyfiles, outputdir):
|
||||
def plotbydate(onlyfiles, outputdir, intervl):
|
||||
print("plotbydate")
|
||||
files = defaultdict(list)
|
||||
for f in onlyfiles:
|
||||
@@ -159,10 +161,10 @@ def plotbydate(onlyfiles, outputdir):
|
||||
poslevelsflat2 = [item['pos'] for item in tox2]
|
||||
comlevelsflat2 = [item['compound'] for item in tox2]
|
||||
|
||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
||||
kspos = ks_2samp(poslevelsflat1, poslevelsflat2)
|
||||
kscom = ks_2samp(comlevelsflat1, comlevelsflat2)
|
||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) if len(neglevelsflat1) > 0 and len(neglevelsflat2) > 0 else "no values"
|
||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) if len(neulevelsflat1) > 0 and len(neulevelsflat2) > 0 else "no values"
|
||||
kspos = ks_2samp(poslevelsflat1, poslevelsflat2) if len(poslevelsflat1) > 0 and len(poslevelsflat2) > 0 else "no values"
|
||||
kscom = ks_2samp(comlevelsflat1, comlevelsflat2) if len(comlevelsflat1) > 0 and len(comlevelsflat2) > 0 else "no values"
|
||||
|
||||
changes_neg[d].append(ksneg)
|
||||
changes_neu[d].append(ksneu)
|
||||
@@ -170,7 +172,7 @@ def plotbydate(onlyfiles, outputdir):
|
||||
changes_com[d].append(kscom)
|
||||
|
||||
for (d, l) in files.items():
|
||||
with open(outputdir + "/ks_date_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
||||
with open(outputdir + "/ks_date_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
||||
for i in range(len(l) - 1):
|
||||
f1 = l[i]
|
||||
f2 = l[i + 1]
|
||||
@@ -180,14 +182,15 @@ def plotbydate(onlyfiles, outputdir):
|
||||
# pval
|
||||
magick = IMAGE_MAGICK
|
||||
for (d, l) in files.items():
|
||||
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
|
||||
x = [l[i].split("_")[6][:-3] + "-" + l[i + 1].split("_")[6][:-3] for i in range(len(l) - 1)]
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||
pval = [x.pvalue for x in changes]
|
||||
pval = [x.pvalue if not isinstance(x, str) else None for x in changes]
|
||||
pvalnotnull = [x for x in pval if x is not None]
|
||||
plt.plot(x, pval, label=type + ".pval", color=colors[type])
|
||||
mean = np.mean(pval)
|
||||
std = np.std(pval)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
|
||||
mean = np.mean(pvalnotnull)
|
||||
std = np.std(pvalnotnull)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, pval) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
|
||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
|
||||
@@ -195,22 +198,23 @@ def plotbydate(onlyfiles, outputdir):
|
||||
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
|
||||
plt.ylabel("p-value")
|
||||
plt.legend(loc="upper right")
|
||||
plt.savefig(outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
||||
plt.savefig(outputdir + "/ks_date_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
magick += " " + outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_date_pval.pdf")
|
||||
magick += " " + outputdir + "/ks_date_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_date_pval_i" + str(intervl) + ".pdf")
|
||||
|
||||
# stat
|
||||
magick = IMAGE_MAGICK
|
||||
for (d, l) in files.items():
|
||||
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
|
||||
x = [l[i].split("_")[6][:-3] + "-" + l[i + 1].split("_")[6][:-3] for i in range(len(l) - 1)]
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||
stat = [x.statistic for x in changes]
|
||||
stat = [x.statistic if not isinstance(x, str) else None for x in changes]
|
||||
statnotnull = [x for x in stat if x is not None]
|
||||
plt.plot(x, stat, label=type + ".stat", color=colors[type])
|
||||
mean = np.mean(stat)
|
||||
std = np.std(stat)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
|
||||
mean = np.mean(statnotnull)
|
||||
std = np.std(statnotnull)
|
||||
dev = [(xx, s) for (xx, s) in zip(x, stat) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
|
||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
|
||||
@@ -218,13 +222,13 @@ def plotbydate(onlyfiles, outputdir):
|
||||
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
|
||||
plt.ylabel("stat value")
|
||||
plt.legend(loc="upper right")
|
||||
plt.savefig(outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
||||
plt.savefig(outputdir + "/ks_date_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
magick += " " + outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_date_stat.pdf")
|
||||
magick += " " + outputdir + "/ks_date_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||
os.system(magick + " " + outputdir + "/ks_date_stat_i" + str(intervl) + ".pdf")
|
||||
|
||||
|
||||
def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||
def plotbydateold(onlyfiles, oldfiles, outputdir, intervl):
|
||||
print("plotbydateold")
|
||||
files = defaultdict(list)
|
||||
for f in onlyfiles:
|
||||
@@ -232,7 +236,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||
files[(s[3], s[4])].append(f)
|
||||
dates = sorted(files.keys(), key=lambda e: "-".join(reversed(e[0].split("-"))))
|
||||
files = {d: files[d] for d in dates}
|
||||
files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()}
|
||||
files = {d: sorted(l, key=lambda e: e.split("_")[6]) for (d, l) in files.items()}
|
||||
oldfiles = {(f[:-3].split("_")[3], f[:-3].split("_")[4]): f for f in oldfiles}
|
||||
|
||||
changes_neg = defaultdict(list)
|
||||
@@ -263,10 +267,10 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||
poslevelsflat1 = [item['pos'] for item in tox1]
|
||||
comlevelsflat1 = [item['compound'] for item in tox1]
|
||||
|
||||
ksneg = ks_2samp(neglevelsflat1, neglevelsold)
|
||||
ksneu = ks_2samp(neulevelsflat1, neulevelsold)
|
||||
kspos = ks_2samp(poslevelsflat1, poslevelsold)
|
||||
kscom = ks_2samp(comlevelsflat1, comlevelsold)
|
||||
ksneg = ks_2samp(neglevelsflat1, neglevelsold) if len(neglevelsflat1) > 0 and len(neglevelsold) > 0 else "no values"
|
||||
ksneu = ks_2samp(neulevelsflat1, neulevelsold) if len(neulevelsflat1) > 0 and len(neulevelsold) > 0 else "no values"
|
||||
kspos = ks_2samp(poslevelsflat1, poslevelsold) if len(poslevelsflat1) > 0 and len(poslevelsold) > 0 else "no values"
|
||||
kscom = ks_2samp(comlevelsflat1, comlevelsold) if len(comlevelsflat1) > 0 and len(comlevelsold) > 0 else "no values"
|
||||
|
||||
changes_neg[d].append(ksneg)
|
||||
changes_neu[d].append(ksneu)
|
||||
@@ -276,7 +280,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||
print("logs")
|
||||
for (d, l) in files.items():
|
||||
f1 = oldfiles[d]
|
||||
with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
||||
with open(outputdir + "/ks_olddate_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
||||
for i in range(len(l)):
|
||||
if changes_neg[d][i] is None:
|
||||
continue
|
||||
@@ -289,7 +293,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||
imgmagickcmd = IMAGE_MAGICK
|
||||
for (d, l) in files.items():
|
||||
print(d)
|
||||
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
|
||||
x = [l[i][:-3].split("_")[6] for i in range(len(l))]
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||
pval = [(xx, c.pvalue) for xx, c in zip(x, changes) if c is not None]
|
||||
@@ -306,18 +310,18 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||
plt.xlabel("Comparision: new users X (max) posts - old users posts")
|
||||
plt.ylabel("p-value")
|
||||
plt.legend(loc="upper right")
|
||||
outfile = outputdir + "/ks_olddate_pval_" + d[0] + "_" + d[1] + ".png"
|
||||
outfile = outputdir + "/ks_olddate_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||
plt.savefig(outfile, bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
imgmagickcmd += " " + outfile
|
||||
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval.pdf")
|
||||
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval_i" + str(intervl) + ".pdf")
|
||||
|
||||
# stat
|
||||
print("stat")
|
||||
imgmagickcmd = IMAGE_MAGICK
|
||||
for (d, l) in files.items():
|
||||
print(d)
|
||||
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
|
||||
x = [l[i][:-3].split("_")[6] for i in range(len(l))]
|
||||
fig = plt.figure(figsize=(16, 12))
|
||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||
stat = [(xx, c.statistic) for xx, c in zip(x, changes) if c is not None]
|
||||
@@ -334,11 +338,11 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
||||
plt.xlabel("Comparision: new users X (max) posts - old users posts")
|
||||
plt.ylabel("stat value")
|
||||
plt.legend(loc="upper right")
|
||||
outfile = outputdir + "/ks_olddate_stat_" + d[0] + "_" + d[1] + ".png"
|
||||
outfile = outputdir + "/ks_olddate_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||
plt.savefig(outfile, bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
imgmagickcmd += " " + outfile
|
||||
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat.pdf")
|
||||
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat_i" + str(intervl) + ".pdf")
|
||||
|
||||
|
||||
def filecmp(file1, file2):
|
||||
@@ -365,5 +369,20 @@ if __name__ == "__main__":
|
||||
if not os.path.isdir(folder):
|
||||
print(folder + " is not a folder")
|
||||
sys.exit(1)
|
||||
interval = 3
|
||||
if len(sys.argv) >= 3:
|
||||
if sys.argv[2].startswith("-i"):
|
||||
interval = sys.argv[2][2:]
|
||||
try:
|
||||
interval = int(interval)
|
||||
except ValueError:
|
||||
print("-i: int required")
|
||||
sys.exit(1)
|
||||
if interval < 1 or interval > 12:
|
||||
print("-i: only 1 - 12")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print("unknown parameter: " + sys.argv[2])
|
||||
sys.exit(1)
|
||||
|
||||
main(folder)
|
||||
main(folder, interval)
|
||||
|
||||
Reference in New Issue
Block a user