"""Kolmogorov-Smirnov comparison of batched sentiment levels.

Reads the ``*.py`` batch files under ``<folder>/output/batch/``, loads their
``toxlevels`` through ``common.imprt`` and runs two-sample KS tests on the
``neg`` / ``neu`` / ``pos`` / ``compound`` scores.  Logs and plots are written
to ``<folder>/output/ksbatch/``.

Batch file names are split on ``_`` after dropping the ``.py`` suffix.  The
code relies on field 3 being a ``dd-mm-YYYY`` date, field 4 being a second
date string (presumably the end of the range -- it is only used as a label)
and field 5 being an integer post count.
"""
import os
import shlex
import sys
from collections import defaultdict
from datetime import datetime
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp

from common import imprt, IMAGE_MAGICK

colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}

# Short series name -> key of that score inside each ``toxlevels`` item.
# The insertion order is also the order used for plots and log lines.
_LEVEL_KEYS = {'neg': 'neg', 'neu': 'neu', 'pos': 'pos', 'com': 'compound'}

# Plot/file suffix -> attribute of the ``ks_2samp`` result shown under it.
_METRICS = {'pval': 'pvalue', 'stat': 'statistic'}

_DATE_FORMAT = "%d-%m-%Y"


def _name_fields(path):
    """Return the "_"-separated fields of a batch file name.

    Only the base name is split, so underscores in the directory part of
    ``path`` cannot shift the field indices.  The last three characters
    (the ".py" suffix) are dropped first.
    """
    return os.path.basename(path)[:-3].split("_")


def _date_range(path):
    """Return the (field 3, field 4) date strings of a batch file name."""
    fields = _name_fields(path)
    return fields[3], fields[4]


def _start_date(path):
    """Return field 3 of a batch file name parsed as a ``dd-mm-YYYY`` date."""
    return datetime.strptime(_name_fields(path)[3], _DATE_FORMAT)


def _post_count(path):
    """Return field 5 of a batch file name as an integer."""
    return int(_name_fields(path)[5])


def _load_levels(path):
    """Load ``toxlevels`` from ``path`` and return one value list per series."""
    tox = imprt(path).toxlevels
    return {name: [item[key] for item in tox]
            for name, key in _LEVEL_KEYS.items()}


def _ks_all(levels_a, levels_b):
    """Run ``ks_2samp`` on every series of two level dicts.

    Returns a ``{series: result}`` dict, or ``None`` when either side has no
    samples (``ks_2samp`` rejects empty input).
    """
    # Every series list is built from the same toxlevels items, so checking
    # a single series is enough to detect an empty sample.
    if not levels_a['neg'] or not levels_b['neg']:
        return None
    return {name: ks_2samp(levels_a[name], levels_b[name])
            for name in _LEVEL_KEYS}


def _ks_consecutive(paths):
    """Return the KS results for each pair of neighbouring files in ``paths``.

    Each file is loaded once.  The result has ``len(paths) - 1`` entries
    (none for fewer than two files); an entry is ``None`` when one of the two
    files has no samples.
    """
    if len(paths) < 2:
        return []
    results = []
    previous = _load_levels(paths[0])
    for path in paths[1:]:
        current = _load_levels(path)
        results.append(_ks_all(previous, current))
        previous = current
    return results


def _write_log(logfile, pairs, results):
    """Write one line per compared ``(source, target)`` pair to ``logfile``.

    Pairs whose result is ``None`` (no comparison was possible) are omitted.
    """
    with open(logfile, "w") as log:
        for (source, target), result in zip(pairs, results):
            if result is None:
                continue
            summary = "; ".join("ks " + name + " = " + str(result[name])
                                for name in _LEVEL_KEYS)
            log.write(source + " -> " + target + ": " + summary + "\n")


def _plot_metric(labels, results, suffix, title, outfile):
    """Plot one KS metric for all four series and save the figure.

    ``suffix`` is a key of ``_METRICS`` ("pval" or "stat").  For every series
    the values are drawn as a line, their mean as a dashed line, and values
    at least one standard deviation away from the mean as round markers.
    Entries of ``results`` that are ``None`` are left out.
    """
    attribute = _METRICS[suffix]
    fig = plt.figure(figsize=(16, 12))
    for name in _LEVEL_KEYS:
        points = [(label, getattr(result[name], attribute))
                  for label, result in zip(labels, results)
                  if result is not None]
        xs = [label for label, _ in points]
        ys = [value for _, value in points]
        plt.plot(xs, ys, label=name + "." + suffix, color=colors[name])
        if not points:
            # Nothing to average; the mean of an empty list would be NaN.
            continue
        mean = np.mean(ys)
        std = np.std(ys)
        outliers = [(label, value) for label, value in points
                    if value <= mean - std or value >= mean + std]
        plt.plot(xs, [mean] * len(ys), color=colors[name], ls='dashed')
        plt.plot([label for label, _ in outliers],
                 [value for _, value in outliers],
                 color=colors[name], ls='None', marker='o')
    plt.title(title)
    plt.xticks(rotation=90)
    plt.legend(loc="upper right")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def _images_to_pdf(images, pdf_path):
    """Run the ``IMAGE_MAGICK`` command on ``images`` with ``pdf_path`` last."""
    if not images:
        return
    # IMAGE_MAGICK is kept as a raw shell fragment (it may carry options);
    # only the paths are quoted so that spaces in them survive the shell.
    arguments = [shlex.quote(path) for path in images + [pdf_path]]
    os.system(IMAGE_MAGICK + " " + " ".join(arguments))


def _batch_files(srcfolder, marker):
    """Return the sorted ".py" files in ``srcfolder`` whose name has ``marker``."""
    return sorted(join(srcfolder, name) for name in listdir(srcfolder)
                  if isfile(join(srcfolder, name))
                  and name.endswith(".py") and marker in name)


def main(folder):
    """Run all three KS analyses for the batch files found under ``folder``.

    Input is read from ``<folder>/output/batch`` and output is written to
    ``<folder>/output/ksbatch``, which is created if it does not exist.
    """
    outputdir = join(folder, "output", "ksbatch")
    os.makedirs(outputdir, exist_ok=True)
    srcfolder = join(folder, "output", "batch")
    onlyfiles = _batch_files(srcfolder, "newusers")
    plotbypost(onlyfiles, outputdir)
    plotbydate(onlyfiles, outputdir)
    oldfiles = _batch_files(srcfolder, "oldusers")
    plotbydateold(onlyfiles, oldfiles, outputdir)


def plotbypost(onlyfiles, outputdir):
    """Compare consecutive date ranges for each post count.

    Files are grouped by post count (name field 5) and ordered by start date
    (name field 3).  Neighbouring files in each group are KS-tested; results
    go to ``ks_post_<count>.log`` and to the ``ks_post_pval_<count>.png`` /
    ``ks_post_stat_<count>.png`` plots in ``outputdir``.
    """
    print("plotbypost")
    groups = defaultdict(list)
    for path in onlyfiles:
        groups[_post_count(path)].append(path)
    groups = {posts: sorted(paths, key=_start_date)
              for posts, paths in groups.items()}

    results = {}
    for posts, paths in groups.items():
        if len(paths) >= 2:
            print(posts)
        results[posts] = _ks_consecutive(paths)

    for posts, paths in groups.items():
        _write_log(join(outputdir, "ks_post_" + str(posts) + ".log"),
                   list(zip(paths, paths[1:])), results[posts])

    for suffix in _METRICS:
        for posts, paths in groups.items():
            labels = [_name_fields(first)[3] + " -\n" + _name_fields(second)[3]
                      for first, second in zip(paths, paths[1:])]
            _plot_metric(
                labels, results[posts], suffix,
                "KS 2-sided test with max " + str(posts) + " posts",
                join(outputdir,
                     "ks_post_" + suffix + "_" + str(posts) + ".png"))


def plotbydate(onlyfiles, outputdir):
    """Compare consecutive post counts within each date range.

    Files are grouped by date range (name fields 3 and 4) and ordered by
    numeric post count (name field 5).  Neighbouring files in each group are
    KS-tested; results go to ``ks_date_<from>_<to>.log`` and to the
    ``ks_date_pval_...png`` / ``ks_date_stat_...png`` plots in ``outputdir``.
    """
    print("plotbydate")
    groups = defaultdict(list)
    for path in onlyfiles:
        groups[_date_range(path)].append(path)
    # Numeric ordering: a plain string sort would put "100" before "20".
    groups = {dates: sorted(paths, key=_post_count)
              for dates, paths in groups.items()}

    results = {}
    for dates, paths in groups.items():
        if len(paths) >= 2:
            print(dates)
        results[dates] = _ks_consecutive(paths)

    for dates, paths in groups.items():
        _write_log(
            join(outputdir, "ks_date_" + dates[0] + "_" + dates[1] + ".log"),
            list(zip(paths, paths[1:])), results[dates])

    for suffix in _METRICS:
        for dates, paths in groups.items():
            labels = [_name_fields(first)[5] + "-" + _name_fields(second)[5]
                      for first, second in zip(paths, paths[1:])]
            _plot_metric(
                labels, results[dates], suffix,
                "KS 2-sided test with between " + dates[0] + " and " + dates[1],
                join(outputdir, "ks_date_" + suffix + "_"
                     + dates[0] + "_" + dates[1] + ".png"))


def plotbydateold(onlyfiles, oldfiles, outputdir):
    """Compare every new-user batch with the old-user batch of its date range.

    ``onlyfiles`` are grouped by date range and ordered by numeric post
    count; each file is KS-tested against the entry of ``oldfiles`` carrying
    the same date range.  Date ranges without such an entry are reported on
    stderr and skipped.  Output in ``outputdir``: ``ks_olddate_<from>_<to>.log``,
    one ``ks_olddate_pval_...png`` / ``ks_olddate_stat_...png`` per date range,
    and the combined ``ks_olddate_pval.pdf`` / ``ks_olddate_stat.pdf`` built
    with the ``IMAGE_MAGICK`` command.
    """
    print("plotbydateold")
    by_range = defaultdict(list)
    for path in onlyfiles:
        by_range[_date_range(path)].append(path)
    old_by_range = {_date_range(path): path for path in oldfiles}

    # Order ranges chronologically by rewriting dd-mm-YYYY as YYYY-mm-dd.
    ordered = sorted(by_range,
                     key=lambda rng: "-".join(reversed(rng[0].split("-"))))
    groups = {}
    for dates in ordered:
        if dates not in old_by_range:
            print("no oldusers file for " + dates[0] + "_" + dates[1]
                  + ", skipping", file=sys.stderr)
            continue
        groups[dates] = sorted(by_range[dates], key=_post_count)

    results = {}
    for dates, paths in groups.items():
        print(dates)
        old_levels = _load_levels(old_by_range[dates])
        results[dates] = [_ks_all(_load_levels(path), old_levels)
                          for path in paths]

    print("logs")
    for dates, paths in groups.items():
        old_path = old_by_range[dates]
        _write_log(
            join(outputdir,
                 "ks_olddate_" + dates[0] + "_" + dates[1] + ".log"),
            [(old_path, path) for path in paths], results[dates])

    for suffix in _METRICS:
        print(suffix)
        images = []
        for dates, paths in groups.items():
            print(dates)
            labels = [_name_fields(path)[5] for path in paths]
            outfile = join(outputdir, "ks_olddate_" + suffix + "_"
                           + dates[0] + "_" + dates[1] + ".png")
            _plot_metric(
                labels, results[dates], suffix,
                "KS 2-sided test with new and old users between "
                + dates[0] + " and " + dates[1],
                outfile)
            images.append(outfile)
        _images_to_pdf(images,
                       join(outputdir, "ks_olddate_" + suffix + ".pdf"))


def filecmp(file1, file2):
    """Three-way comparison of two file names by their embedded date.

    Returns -1, 0 or 1.  The date is taken from "_"-separated field 2 of the
    given strings and parsed as ``dd-mm-YYYY``.

    NOTE(review): nothing in this file calls this function, and it reads
    field 2 while the plotting functions read field 3 -- it presumably
    targets a different file-name layout; confirm before reusing it.
    """
    if file1 == file2:
        return 0
    first = datetime.strptime(file1.split("_")[2], _DATE_FORMAT)
    second = datetime.strptime(file2.split("_")[2], _DATE_FORMAT)
    return (first > second) - (first < second)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    main(folder)