This commit is contained in:
wea_ondara
2019-12-25 13:49:57 +01:00
parent 2c1524a335
commit 19f5835e3a
2 changed files with 103 additions and 84 deletions

View File

@@ -14,29 +14,29 @@ from common import imprt, IMAGE_MAGICK
# Line colour for each sentiment component; looked up as colors[type] by the plotting loops.
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
def main(folder, intervl=3):
    """Generate all KS-test logs and plots for the batch files of one interval.

    Batch result files are read from ``<folder>/output/batch/`` and the
    results are written to ``<folder>/output/ksbatch/``.

    Args:
        folder: Base folder of the data set.
        intervl: Interval whose batch files (marked ``i<intervl>`` in the
            file name) are processed. Defaults to 3, the same default the
            command-line entry point uses, so ``main(folder)`` keeps working.
    """
    import re  # local import: only needed for the interval filter below

    outputdir = folder + "/output/ksbatch/"
    # Create the directory directly instead of shelling out to `mkdir -p`:
    # the folder comes from the command line and may contain spaces or
    # shell metacharacters.
    os.makedirs(outputdir, exist_ok=True)
    srcfolder = folder + "/output/batch/"

    # A plain substring test ("i1" in name) would also select the files of
    # intervals 10, 11 and 12; require that no further digit follows.
    interval_marker = re.compile("i" + str(intervl) + r"(?!\d)")

    def batchfiles(usergroup):
        """Return the sorted paths of the .py batch files for *usergroup*."""
        return sorted(
            srcfolder + f
            for f in listdir(srcfolder)
            if isfile(join(srcfolder, f))
            and f.endswith(".py")
            and usergroup in f
            and interval_marker.search(f)
        )

    onlyfiles = batchfiles("newusers")
    plotbypost(onlyfiles, outputdir, intervl)
    plotbydate(onlyfiles, outputdir, intervl)

    oldfiles = batchfiles("oldusers")
    plotbydateold(onlyfiles, oldfiles, outputdir, intervl)
def plotbypost(onlyfiles, outputdir):
def plotbypost(onlyfiles, outputdir, intervl):
print("plotbypost")
files = defaultdict(list)
for f in onlyfiles:
s = f[:-3].split("_")
files[int(s[5])].append(f)
files[int(s[6])].append(f)
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()}
changes_neg = defaultdict(list)
@@ -62,10 +62,10 @@ def plotbypost(onlyfiles, outputdir):
poslevelsflat2 = [item['pos'] for item in tox2]
comlevelsflat2 = [item['compound'] for item in tox2]
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
kspos = ks_2samp(poslevelsflat1, poslevelsflat2)
kscom = ks_2samp(comlevelsflat1, comlevelsflat2)
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) if len(neglevelsflat1) > 0 and len(neglevelsflat2) > 0 else "no values"
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) if len(neulevelsflat1) > 0 and len(neulevelsflat2) > 0 else "no values"
kspos = ks_2samp(poslevelsflat1, poslevelsflat2) if len(poslevelsflat1) > 0 and len(poslevelsflat2) > 0 else "no values"
kscom = ks_2samp(comlevelsflat1, comlevelsflat2) if len(comlevelsflat1) > 0 and len(comlevelsflat2) > 0 else "no values"
changes_neg[p].append(ksneg)
changes_neu[p].append(ksneu)
@@ -73,7 +73,7 @@ def plotbypost(onlyfiles, outputdir):
changes_com[p].append(kscom)
for (p, l) in files.items():
with open(outputdir + "/ks_post_" + str(p) + ".log", "w") as f:
with open(outputdir + "/ks_post_i" + str(intervl) + "_" + str(p) + ".log", "w") as f:
for i in range(len(l) - 1):
f1 = l[i]
f2 = l[i + 1]
@@ -85,11 +85,12 @@ def plotbypost(onlyfiles, outputdir):
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
pval = [x.pvalue for x in changes]
pval = [x.pvalue if not isinstance(x, str) else None for x in changes]
pvalnotnull = [x for x in pval if x is not None]
plt.plot(x, pval, label=type + ".pval", color=colors[type])
mean = np.mean(pval)
std = np.std(pval)
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
mean = np.mean(pvalnotnull)
std = np.std(pvalnotnull)
dev = [(xx, s) for (xx, s) in zip(x, pval) if s is not None and (s <= mean - std or s >= mean + std)]
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with max " + str(p) + " posts")
@@ -97,10 +98,10 @@ def plotbypost(onlyfiles, outputdir):
plt.xlabel("Comparision: time frame X - time frame X+1")
plt.ylabel("p-value")
plt.legend(loc="upper right")
plt.savefig(outputdir + "/ks_post_pval_" + str(p) + ".png", bbox_inches='tight')
plt.savefig(outputdir + "/ks_post_pval_i" + str(intervl) + "_" + str(p) + ".png", bbox_inches='tight')
plt.close(fig)
magick += " " + outputdir + "/ks_post_pval_" + str(p) + ".png"
os.system(magick + " " + outputdir + "/ks_post_pval.pdf")
magick += " " + outputdir + "/ks_post_pval_i" + str(intervl) + "_" + str(p) + ".png"
os.system(magick + " " + outputdir + "/ks_post_pval_i" + str(intervl) + ".pdf")
# stat
magick = IMAGE_MAGICK
@@ -108,11 +109,12 @@ def plotbypost(onlyfiles, outputdir):
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
stat = [x.statistic for x in changes]
stat = [x.statistic if not isinstance(x, str) else None for x in changes]
statnotnull = [x for x in stat if x is not None]
plt.plot(x, stat, label=type + ".stat", color=colors[type])
mean = np.mean(stat)
std = np.std(stat)
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
mean = np.mean(statnotnull)
std = np.std(statnotnull)
dev = [(xx, s) for (xx, s) in zip(x, stat) if s is not None and (s <= mean - std or s >= mean + std)]
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with max " + str(p) + " posts")
@@ -120,13 +122,13 @@ def plotbypost(onlyfiles, outputdir):
plt.xlabel("Comparision: time frame X - time frame X+1")
plt.ylabel("stat value")
plt.legend(loc="upper right")
plt.savefig(outputdir + "/ks_post_stat_" + str(p) + ".png", bbox_inches='tight')
plt.savefig(outputdir + "/ks_post_stat_i" + str(intervl) + "_" + str(p) + ".png", bbox_inches='tight')
plt.close(fig)
magick += " " + outputdir + "/ks_post_stat_" + str(p) + ".png"
os.system(magick + " " + outputdir + "/ks_post_stat.pdf")
magick += " " + outputdir + "/ks_post_stat_i" + str(intervl) + "_" + str(p) + ".png"
os.system(magick + " " + outputdir + "/ks_post_stat_i" + str(intervl) + ".pdf")
def plotbydate(onlyfiles, outputdir):
def plotbydate(onlyfiles, outputdir, intervl):
print("plotbydate")
files = defaultdict(list)
for f in onlyfiles:
@@ -159,10 +161,10 @@ def plotbydate(onlyfiles, outputdir):
poslevelsflat2 = [item['pos'] for item in tox2]
comlevelsflat2 = [item['compound'] for item in tox2]
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
kspos = ks_2samp(poslevelsflat1, poslevelsflat2)
kscom = ks_2samp(comlevelsflat1, comlevelsflat2)
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) if len(neglevelsflat1) > 0 and len(neglevelsflat2) > 0 else "no values"
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) if len(neulevelsflat1) > 0 and len(neulevelsflat2) > 0 else "no values"
kspos = ks_2samp(poslevelsflat1, poslevelsflat2) if len(poslevelsflat1) > 0 and len(poslevelsflat2) > 0 else "no values"
kscom = ks_2samp(comlevelsflat1, comlevelsflat2) if len(comlevelsflat1) > 0 and len(comlevelsflat2) > 0 else "no values"
changes_neg[d].append(ksneg)
changes_neu[d].append(ksneu)
@@ -170,7 +172,7 @@ def plotbydate(onlyfiles, outputdir):
changes_com[d].append(kscom)
for (d, l) in files.items():
with open(outputdir + "/ks_date_" + d[0] + "_" + d[1] + ".log", "w") as f:
with open(outputdir + "/ks_date_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log", "w") as f:
for i in range(len(l) - 1):
f1 = l[i]
f2 = l[i + 1]
@@ -180,14 +182,15 @@ def plotbydate(onlyfiles, outputdir):
# pval
magick = IMAGE_MAGICK
for (d, l) in files.items():
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
x = [l[i].split("_")[6][:-3] + "-" + l[i + 1].split("_")[6][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
pval = [x.pvalue for x in changes]
pval = [x.pvalue if not isinstance(x, str) else None for x in changes]
pvalnotnull = [x for x in pval if x is not None]
plt.plot(x, pval, label=type + ".pval", color=colors[type])
mean = np.mean(pval)
std = np.std(pval)
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
mean = np.mean(pvalnotnull)
std = np.std(pvalnotnull)
dev = [(xx, s) for (xx, s) in zip(x, pval) if s is not None and (s <= mean - std or s >= mean + std)]
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
@@ -195,22 +198,23 @@ def plotbydate(onlyfiles, outputdir):
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
plt.ylabel("p-value")
plt.legend(loc="upper right")
plt.savefig(outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.savefig(outputdir + "/ks_date_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.close(fig)
magick += " " + outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png"
os.system(magick + " " + outputdir + "/ks_date_pval.pdf")
magick += " " + outputdir + "/ks_date_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
os.system(magick + " " + outputdir + "/ks_date_pval_i" + str(intervl) + ".pdf")
# stat
magick = IMAGE_MAGICK
for (d, l) in files.items():
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
x = [l[i].split("_")[6][:-3] + "-" + l[i + 1].split("_")[6][:-3] for i in range(len(l) - 1)]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
stat = [x.statistic for x in changes]
stat = [x.statistic if not isinstance(x, str) else None for x in changes]
statnotnull = [x for x in stat if x is not None]
plt.plot(x, stat, label=type + ".stat", color=colors[type])
mean = np.mean(stat)
std = np.std(stat)
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
mean = np.mean(statnotnull)
std = np.std(statnotnull)
dev = [(xx, s) for (xx, s) in zip(x, stat) if s is not None and (s <= mean - std or s >= mean + std)]
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
@@ -218,13 +222,13 @@ def plotbydate(onlyfiles, outputdir):
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
plt.ylabel("stat value")
plt.legend(loc="upper right")
plt.savefig(outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.savefig(outputdir + "/ks_date_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.close(fig)
magick += " " + outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png"
os.system(magick + " " + outputdir + "/ks_date_stat.pdf")
magick += " " + outputdir + "/ks_date_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
os.system(magick + " " + outputdir + "/ks_date_stat_i" + str(intervl) + ".pdf")
def plotbydateold(onlyfiles, oldfiles, outputdir):
def plotbydateold(onlyfiles, oldfiles, outputdir, intervl):
print("plotbydateold")
files = defaultdict(list)
for f in onlyfiles:
@@ -232,7 +236,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
files[(s[3], s[4])].append(f)
dates = sorted(files.keys(), key=lambda e: "-".join(reversed(e[0].split("-"))))
files = {d: files[d] for d in dates}
files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()}
files = {d: sorted(l, key=lambda e: e.split("_")[6]) for (d, l) in files.items()}
oldfiles = {(f[:-3].split("_")[3], f[:-3].split("_")[4]): f for f in oldfiles}
changes_neg = defaultdict(list)
@@ -263,10 +267,10 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
poslevelsflat1 = [item['pos'] for item in tox1]
comlevelsflat1 = [item['compound'] for item in tox1]
ksneg = ks_2samp(neglevelsflat1, neglevelsold)
ksneu = ks_2samp(neulevelsflat1, neulevelsold)
kspos = ks_2samp(poslevelsflat1, poslevelsold)
kscom = ks_2samp(comlevelsflat1, comlevelsold)
ksneg = ks_2samp(neglevelsflat1, neglevelsold) if len(neglevelsflat1) > 0 and len(neglevelsold) > 0 else "no values"
ksneu = ks_2samp(neulevelsflat1, neulevelsold) if len(neulevelsflat1) > 0 and len(neulevelsold) > 0 else "no values"
kspos = ks_2samp(poslevelsflat1, poslevelsold) if len(poslevelsflat1) > 0 and len(poslevelsold) > 0 else "no values"
kscom = ks_2samp(comlevelsflat1, comlevelsold) if len(comlevelsflat1) > 0 and len(comlevelsold) > 0 else "no values"
changes_neg[d].append(ksneg)
changes_neu[d].append(ksneu)
@@ -276,7 +280,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
print("logs")
for (d, l) in files.items():
f1 = oldfiles[d]
with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f:
with open(outputdir + "/ks_olddate_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log", "w") as f:
for i in range(len(l)):
if changes_neg[d][i] is None:
continue
@@ -289,7 +293,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
imgmagickcmd = IMAGE_MAGICK
for (d, l) in files.items():
print(d)
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
x = [l[i][:-3].split("_")[6] for i in range(len(l))]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
pval = [(xx, c.pvalue) for xx, c in zip(x, changes) if c is not None]
@@ -306,18 +310,18 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
plt.xlabel("Comparision: new users X (max) posts - old users posts")
plt.ylabel("p-value")
plt.legend(loc="upper right")
outfile = outputdir + "/ks_olddate_pval_" + d[0] + "_" + d[1] + ".png"
outfile = outputdir + "/ks_olddate_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
plt.savefig(outfile, bbox_inches='tight')
plt.close(fig)
imgmagickcmd += " " + outfile
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval.pdf")
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval_i" + str(intervl) + ".pdf")
# stat
print("stat")
imgmagickcmd = IMAGE_MAGICK
for (d, l) in files.items():
print(d)
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
x = [l[i][:-3].split("_")[6] for i in range(len(l))]
fig = plt.figure(figsize=(16, 12))
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
stat = [(xx, c.statistic) for xx, c in zip(x, changes) if c is not None]
@@ -334,11 +338,11 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
plt.xlabel("Comparision: new users X (max) posts - old users posts")
plt.ylabel("stat value")
plt.legend(loc="upper right")
outfile = outputdir + "/ks_olddate_stat_" + d[0] + "_" + d[1] + ".png"
outfile = outputdir + "/ks_olddate_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
plt.savefig(outfile, bbox_inches='tight')
plt.close(fig)
imgmagickcmd += " " + outfile
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat.pdf")
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat_i" + str(intervl) + ".pdf")
def filecmp(file1, file2):
@@ -365,5 +369,20 @@ if __name__ == "__main__":
if not os.path.isdir(folder):
print(folder + " is not a folder")
sys.exit(1)
interval = 3
if len(sys.argv) >= 3:
if sys.argv[2].startswith("-i"):
interval = sys.argv[2][2:]
try:
interval = int(interval)
except ValueError:
print("-i: int required")
sys.exit(1)
if interval < 1 or interval > 12:
print("-i: only 1 - 12")
sys.exit(1)
else:
print("unknown parameter: " + sys.argv[2])
sys.exit(1)
main(folder)
main(folder, interval)