"""Kolmogorov-Smirnov comparisons of sentiment levels between batch files.

Batch files are read from ``<folder>/output/batch/``; logs, PNG plots and the
PDFs assembled from those plots are written to ``<folder>/output/ksbatch/``.
"""
import os
import re
import shlex
import sys
from collections import defaultdict
from datetime import datetime
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp

from analyze_batch import readavgsentsingle
from common import imprt, IMAGE_MAGICK, calc_intervals
from loader import load

colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}

# Label used in plots/logs -> key read from every entry of ``toxlevels``.
# The insertion order is the order in which the series are plotted and logged.
_SENTIMENT_KEYS = {'neg': 'neg', 'neu': 'neu', 'pos': 'pos', 'com': 'compound'}

# Placeholder stored instead of a KS result when one of the samples is empty.
_NO_VALUES = "no values"

# (attribute of the KS result, short name used in file names/labels, y label)
_PLOT_VARIANTS = (
    ("pvalue", "pval", "p-value"),
    ("statistic", "stat", "stat value"),
)


def _matches_interval(name, intervl):
    """Return True if *name* contains ``i<intervl>`` not followed by a digit.

    A plain substring test would let interval 1 also match ``i10``..``i12``.
    """
    pattern = re.escape("i" + str(intervl)) + r"(?!\d)"
    return re.search(pattern, name) is not None


def _batch_files(srcfolder, marker, intervl):
    """Return the sorted ``.py`` files in *srcfolder* for *marker* and *intervl*.

    Each returned entry is ``srcfolder + filename``.
    """
    return sorted(
        srcfolder + f
        for f in listdir(srcfolder)
        if isfile(join(srcfolder, f))
        and f.endswith(".py")
        and marker in f
        and _matches_interval(f, intervl)
    )


def _posts_key(path):
    """Return token 6 of *path* (extension stripped, split on ``_``) as an int.

    NOTE(review): the plot labels treat this token as the "max posts" value;
    sorting on it numerically keeps e.g. 10 after 9 instead of before 2.
    """
    return int(path[:-3].split("_")[6])


def _group_by_date(paths):
    """Group *paths* by their (token 3, token 4) pair.

    Tokens come from the path without its last three characters, split on
    ``_``. Groups are ordered by token 3, which is rearranged from
    ``DD-MM-YYYY`` to ``YYYY-MM-DD`` for comparison; the files inside every
    group are ordered numerically by token 6.
    """
    grouped = defaultdict(list)
    for path in paths:
        tokens = path[:-3].split("_")
        grouped[(tokens[3], tokens[4])].append(path)
    ordered = sorted(grouped, key=lambda e: "-".join(reversed(e[0].split("-"))))
    return {d: sorted(grouped[d], key=_posts_key) for d in ordered}


def _sentiment_columns(tox):
    """Return ``{label: [values]}`` with one list per sentiment label."""
    return {kind: [item[key] for item in tox] for kind, key in _SENTIMENT_KEYS.items()}


def _ks_by_kind(cols1, cols2):
    """Run ``ks_2samp`` per sentiment label on two column dicts.

    A label whose sample is empty on either side yields ``"no values"``.
    """
    return {
        kind: ks_2samp(cols1[kind], cols2[kind]) if cols1[kind] and cols2[kind] else _NO_VALUES
        for kind in _SENTIMENT_KEYS
    }


def _pairwise_changes(files):
    """Compare every file with its successor inside each group of *files*.

    Returns ``{label: defaultdict(list)}`` where the list stored under a group
    key holds one result per consecutive file pair. Groups with fewer than two
    files are skipped. Every file is loaded once and reused for the next pair.
    """
    changes = {kind: defaultdict(list) for kind in _SENTIMENT_KEYS}
    for key, paths in files.items():
        if len(paths) < 2:
            continue
        print(key)
        previous = _sentiment_columns(imprt(paths[0]).toxlevels)
        for path in paths[1:]:
            current = _sentiment_columns(imprt(path).toxlevels)
            for kind, result in _ks_by_kind(previous, current).items():
                changes[kind][key].append(result)
            previous = current
    return changes


def _log_line(f1, f2, results):
    """Format one log line for the comparison of *f1* with *f2*."""
    parts = ["ks " + kind + " = " + str(results[kind]) for kind in _SENTIMENT_KEYS]
    return f1 + " -> " + f2 + ": " + "; ".join(parts) + "\n"


def _write_pairwise_logs(files, changes, logpath):
    """Write one log file per group; *logpath* maps a group key to its path."""
    for key, paths in files.items():
        with open(logpath(key), "w") as f:
            for i, (f1, f2) in enumerate(zip(paths, paths[1:])):
                f.write(_log_line(f1, f2, {kind: changes[kind][key][i] for kind in _SENTIMENT_KEYS}))


def _result_value(result, attr):
    """Return ``result.<attr>``, or None for a placeholder (None or str)."""
    if result is None or isinstance(result, str):
        return None
    return getattr(result, attr)


def _plot_ks_figure(x, results, attr, label_suffix, title, xlabel, ylabel, outfile,
                    drop_missing=False):
    """Plot one figure with a series per sentiment label and save it.

    For every label the values are drawn as a line, their mean as a dashed
    line, and values at least one standard deviation away from the mean as
    round markers. With *drop_missing* the placeholder entries are removed
    before plotting; otherwise they are plotted as None at their x position.
    """
    fig = plt.figure(figsize=(16, 12))
    for kind in _SENTIMENT_KEYS:
        points = [(xx, _result_value(r, attr)) for xx, r in zip(x, results[kind])]
        if drop_missing:
            points = [pt for pt in points if pt[1] is not None]
        xs = [xx for xx, _ in points]
        plt.plot(xs, [v for _, v in points], label=kind + label_suffix, color=colors[kind])
        present = [v for _, v in points if v is not None]
        if not present:
            # Nothing to average; avoids numpy's empty-slice warning.
            continue
        mean = np.mean(present)
        std = np.std(present)
        outliers = [(xx, v) for xx, v in points
                    if v is not None and (v <= mean - std or v >= mean + std)]
        plt.plot(xs, [mean] * len(points), color=colors[kind], ls='dashed')
        plt.plot([xx for xx, _ in outliers], [v for _, v in outliers],
                 color=colors[kind], ls='None', marker='o')
    plt.title(title)
    plt.xticks(rotation=90)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc="upper right")
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def _images_to_pdf(images, pdf):
    """Run the ``IMAGE_MAGICK`` command on *images* with *pdf* as last argument.

    The paths are shell-quoted so that folders containing spaces or shell
    metacharacters do not break the command.
    """
    arguments = [shlex.quote(path) for path in list(images) + [pdf]]
    os.system(" ".join([IMAGE_MAGICK] + arguments))


def main(folder, intervl):
    """Run all KS analyses for *folder* using interval *intervl*.

    Creates ``<folder>/output/ksbatch/`` and fills it from the files found in
    ``<folder>/output/batch/``.
    """
    _users, posts, _firstcontrib, _sumcontrib = load(folder)

    outputdir = folder + "/output/ksbatch/"
    os.makedirs(outputdir, exist_ok=True)

    srcfolder = folder + "/output/batch/"

    g(srcfolder + "/averagesentiment.txt", outputdir, calc_intervals(posts, intervl))

    onlyfiles = _batch_files(srcfolder, "newusers", intervl)
    plotbypost(onlyfiles, outputdir, intervl)
    plotbydate(onlyfiles, outputdir, intervl)

    oldfiles = _batch_files(srcfolder, "oldusers", intervl)
    plotbydateold(onlyfiles, oldfiles, outputdir, intervl)


class fake:
    """Placeholder whose constructor accepts and ignores two arguments."""

    def __init__(self, p, s):
        pass


def g(srcfile, outputdir, intervals):
    """Plot KS results of groups 1-5 of *srcfile* against group 0.

    ``readavgsentsingle(srcfile)`` is indexed as ``[group][slot]``; for every
    slot the sample of group ``i`` (1..5) is compared with the sample of
    group 0. A slot with an empty sample on either side yields NaN. Two
    figures are saved in *outputdir*: one with the p-values and one with the
    statistics, both using the first element of each interval as x value.
    """
    print("ks global")
    avgss2 = readavgsentsingle(srcfile)
    reference = avgss2[0]

    single = []
    for i in range(1, 6):
        group = avgss2[i]
        single.append([
            ks_2samp(ref, group[j]) if len(group[j]) > 0 and len(ref) else float("nan")
            for j, ref in enumerate(reference)
        ])

    starts = [iv[0] for iv in intervals]
    for attr, short, ylabel in (("pvalue", "pval", "pvalue"), ("statistic", "stat", "statistic")):
        fig = plt.figure(figsize=(16, 12))
        for idx, results in enumerate(single):
            plt.plot(starts,
                     [r if isinstance(r, float) else getattr(r, attr) for r in results],
                     label=str(idx + 1) + " posts - most posters")
        plt.title("KS 2-sided test for sentiments (X posts to 95%tile posters)")
        plt.xticks(rotation=90)
        plt.xlabel("Comparision: time frame X - time frame X+1")
        plt.ylabel(ylabel)
        plt.legend(loc="upper right")
        plt.savefig(outputdir + "/ks_averagesentiments_" + short + ".png", bbox_inches='tight')
        plt.close(fig)


def plotbypost(onlyfiles, outputdir, intervl):
    """Compare consecutive time frames for every post-count group.

    Files are grouped by token 6 (as int) and ordered by the date in token 3
    (``DD-MM-YYYY``). Per group a log and two plots (p-value, statistic) are
    written; the plots of all groups are then combined into one PDF each.
    """
    print("plotbypost")
    grouped = defaultdict(list)
    for path in onlyfiles:
        grouped[_posts_key(path)].append(path)
    files = {
        p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y"))
        for p, l in grouped.items()
    }

    changes = _pairwise_changes(files)

    _write_pairwise_logs(
        files, changes,
        lambda p: outputdir + "/ks_post_i" + str(intervl) + "_" + str(p) + ".log")

    for attr, short, ylabel in _PLOT_VARIANTS:
        images = []
        for p, l in files.items():
            x = [a.split("_")[3] + " -\n" + b.split("_")[3] for a, b in zip(l, l[1:])]
            outfile = outputdir + "/ks_post_" + short + "_i" + str(intervl) + "_" + str(p) + ".png"
            _plot_ks_figure(
                x, {kind: changes[kind][p] for kind in _SENTIMENT_KEYS}, attr, "." + short,
                "KS 2-sided test with max " + str(p) + " posts",
                "Comparision: time frame X - time frame X+1", ylabel, outfile)
            images.append(outfile)
        _images_to_pdf(images, outputdir + "/ks_post_" + short + "_i" + str(intervl) + ".pdf")


def plotbydate(onlyfiles, outputdir, intervl):
    """Compare consecutive post-count files inside every date range.

    Files are grouped by (token 3, token 4) and ordered numerically by
    token 6. Per date range a log and two plots (p-value, statistic) are
    written; the plots of all ranges are then combined into one PDF each.
    """
    print("plotbydate")
    files = _group_by_date(onlyfiles)

    changes = _pairwise_changes(files)

    _write_pairwise_logs(
        files, changes,
        lambda d: outputdir + "/ks_date_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log")

    for attr, short, ylabel in _PLOT_VARIANTS:
        images = []
        for d, l in files.items():
            x = [a.split("_")[6][:-3] + "-" + b.split("_")[6][:-3] for a, b in zip(l, l[1:])]
            outfile = (outputdir + "/ks_date_" + short + "_i" + str(intervl)
                       + "_" + d[0] + "_" + d[1] + ".png")
            _plot_ks_figure(
                x, {kind: changes[kind][d] for kind in _SENTIMENT_KEYS}, attr, "." + short,
                "KS 2-sided test with between " + d[0] + " and " + d[1],
                "Comparision: X (max) posts - X+1 (max) posts", ylabel, outfile)
            images.append(outfile)
        _images_to_pdf(images, outputdir + "/ks_date_" + short + "_i" + str(intervl) + ".pdf")


def plotbydateold(onlyfiles, oldfiles, outputdir, intervl):
    """Compare every file of *onlyfiles* with the *oldfiles* entry of its date range.

    *oldfiles* is indexed by (token 3, token 4); a date range without a
    matching entry raises ``KeyError``. A comparison where either side has no
    ``toxlevels`` entries is recorded as None and left out of logs and plots.
    """
    print("plotbydateold")
    files = _group_by_date(onlyfiles)
    oldfiles = {(f[:-3].split("_")[3], f[:-3].split("_")[4]): f for f in oldfiles}

    changes = {kind: defaultdict(list) for kind in _SENTIMENT_KEYS}
    for d, l in files.items():
        print(d)
        toxold = imprt(oldfiles[d]).toxlevels
        oldcolumns = _sentiment_columns(toxold)
        for path in l:
            tox1 = imprt(path).toxlevels
            if len(tox1) == 0 or len(toxold) == 0:
                for kind in _SENTIMENT_KEYS:
                    changes[kind][d].append(None)
                continue
            # Both sides are non-empty here, so every label gets a real result.
            for kind, result in _ks_by_kind(_sentiment_columns(tox1), oldcolumns).items():
                changes[kind][d].append(result)

    print("logs")
    for d, l in files.items():
        f1 = oldfiles[d]
        logfile = outputdir + "/ks_olddate_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log"
        with open(logfile, "w") as f:
            for i, f2 in enumerate(l):
                if changes['neg'][d][i] is None:
                    continue
                f.write(_log_line(f1, f2, {kind: changes[kind][d][i] for kind in _SENTIMENT_KEYS}))

    for attr, short, ylabel in _PLOT_VARIANTS:
        print(short)
        images = []
        for d, l in files.items():
            print(d)
            x = [path[:-3].split("_")[6] for path in l]
            outfile = (outputdir + "/ks_olddate_" + short + "_i" + str(intervl)
                       + "_" + d[0] + "_" + d[1] + ".png")
            _plot_ks_figure(
                x, {kind: changes[kind][d] for kind in _SENTIMENT_KEYS}, attr, "." + short,
                "KS 2-sided test with new and old users between " + d[0] + " and " + d[1],
                "Comparision: new users X (max) posts - old users posts", ylabel, outfile,
                drop_missing=True)
            images.append(outfile)
        _images_to_pdf(images, outputdir + "/ks_olddate_" + short + "_i" + str(intervl) + ".pdf")


def filecmp(file1, file2):
    """Three-way compare two names by the ``DD-MM-YYYY`` date in their token 2.

    Returns 0 for identical names or equal dates, -1 if the date of *file1*
    is earlier and 1 if it is later. Tokens are the ``_``-separated parts.
    """
    if file1 == file2:
        return 0
    d1 = datetime.strptime(file1.split("_")[2], "%d-%m-%Y")
    d2 = datetime.strptime(file2.split("_")[2], "%d-%m-%Y")
    return (d1 > d2) - (d1 < d2)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 3
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
    main(folder, interval)