Files
master/calctoxdiff.py
wea_ondara cd0239f39c wip
2020-01-24 15:50:32 +01:00

431 lines
20 KiB
Python

import os
import sys
from collections import defaultdict
from datetime import datetime
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp
from analyze_batch import readavgsentsingle
from common import imprt, IMAGE_MAGICK, calc_intervals
from loader import load
# Plot colour per sentiment series; the plot functions below look a series up
# by its short name ('neg', 'neu', 'pos', 'com') when drawing its lines/markers.
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
def main(folder, intervl):
    """Run every KS comparison for one data folder.

    Loads the data set with ``load(folder)``, reads the batch results from
    ``<folder>/output/batch/`` and writes all logs and plots to
    ``<folder>/output/ksbatch/`` (created if missing).

    :param folder: data folder passed to ``load`` and used as path prefix
    :param intervl: interval size; forwarded to ``calc_intervals`` and used to
        select batch files whose name contains ``"i" + str(intervl)``
    """
    # Only the posts are needed here; the other three values are unused.
    _, posts, _, _ = load(folder)

    outputdir = folder + "/output/ksbatch/"
    # Create the directory through the os API rather than a shell command:
    # works with spaces/metacharacters in the path and raises on real errors.
    os.makedirs(outputdir, exist_ok=True)

    srcfolder = folder + "/output/batch/"
    g(srcfolder + "/averagesentiment.txt", outputdir, calc_intervals(posts, intervl))

    interval_tag = "i" + str(intervl)

    def batch_files(user_kind):
        """Sorted paths of the ``.py`` batch files for one user kind and this interval."""
        # NOTE(review): the interval is matched as a plain substring, so e.g.
        # "i1" would also match a name containing "i12" - confirm against the
        # naming scheme of the batch files.
        return sorted(
            srcfolder + name
            for name in listdir(srcfolder)
            if isfile(join(srcfolder, name))
            and name.endswith(".py")
            and user_kind in name
            and interval_tag in name
        )

    onlyfiles = batch_files("newusers")
    plotbypost(onlyfiles, outputdir, intervl)
    plotbydate(onlyfiles, outputdir, intervl)

    oldfiles = batch_files("oldusers")
    plotbydateold(onlyfiles, oldfiles, outputdir, intervl)
def g(srcfile, outputdir, intervals):
    """Plot KS results of the average sentiments read from ``srcfile``.

    ``readavgsentsingle(srcfile)`` is expected to return at least six
    sequences of per-time-frame samples.  Sequence 0 is tested against
    sequences 1..5, time frame by time frame, with a two-sample KS test
    (judging by the plot labels, sequence 0 holds the most active posters and
    sequence i the users with i posts - TODO confirm).  A time frame where
    either sample is empty yields NaN instead of a test result.

    Two figures are written to ``outputdir``:
    ``ks_averagesentiments_pval.png`` and ``ks_averagesentiments_stat.png``.

    :param srcfile: path handed to ``readavgsentsingle``
    :param outputdir: directory receiving the two PNG files
    :param intervals: sequence whose items' first element is used as x value
    """
    print("ks global")
    avgss2 = readavgsentsingle(srcfile)

    def mean_or_nan(sample):
        return np.mean(sample) if len(sample) > 0 else float("nan")

    kscom = []  # NOTE(review): filled below but not read anywhere in this function
    single = []
    for group in range(1, 6):
        kscom.append(ks_2samp([mean_or_nan(sample) for sample in avgss2[0]],
                              [mean_or_nan(sample) for sample in avgss2[group]]))
        per_frame = [
            ks_2samp(avgss2[0][frame], avgss2[group][frame])
            if len(avgss2[group][frame]) > 0 and len(avgss2[0][frame])
            else float("nan")
            for frame in range(len(avgss2[0]))
        ]
        single.append(per_frame)

    def draw(pick, ylabel, filename):
        """Draw one line per compared group and save the figure as ``filename``."""
        fig = plt.figure(figsize=(16, 12))
        for idx, row in enumerate(single):
            # NaN placeholders are floats and are plotted as they are.
            plt.plot([iv[0] for iv in intervals],
                     [res if isinstance(res, float) else pick(res) for res in row],
                     label=str(idx + 1) + " posts - most posters")
        plt.title("KS 2-sided test for sentiments (X posts to 95%tile posters)")
        plt.xticks(rotation=90)
        plt.xlabel("Comparision: time frame X - time frame X+1")
        plt.ylabel(ylabel)
        plt.legend(loc="upper right")
        plt.savefig(outputdir + filename, bbox_inches='tight')
        plt.close(fig)

    draw(lambda res: res.pvalue, "pvalue", "/ks_averagesentiments_pval.png")
    draw(lambda res: res.statistic, "statistic", "/ks_averagesentiments_stat.png")
def plotbypost(onlyfiles, outputdir, intervl):
    """KS-compare consecutive time frames, grouped by post count.

    Each path in ``onlyfiles`` is split on "_" (after dropping the last three
    characters); field 6 is parsed as the integer group key.  Every group is
    ordered by the ``%d-%m-%Y`` date in field 3 and each file is compared
    with its successor using a two-sample KS test on the 'neg', 'neu', 'pos'
    and 'compound' values of the imported module's ``toxlevels``.  A pair
    where one side has no values is recorded as the string "no values".

    Output in ``outputdir``: one ``ks_post_i<intervl>_<group>.log`` per group,
    one p-value and one statistic PNG per group, and per kind one invocation
    of the ``IMAGE_MAGICK`` command over all PNGs with a ``.pdf`` target.

    :param onlyfiles: paths of the batch result files
    :param outputdir: directory receiving logs and plots
    :param intervl: interval size, only used inside the output file names
    """
    print("plotbypost")
    series = ("neg", "neu", "pos", "com")
    # series name -> key of the value inside one toxlevels entry
    tox_key = {"neg": 'neg', "neu": 'neu', "pos": 'pos', "com": 'compound'}

    grouped = defaultdict(list)
    for path in onlyfiles:
        fields = path[:-3].split("_")
        grouped[int(fields[6])].append(path)

    def frame_date(path):
        return datetime.strptime(path.split("_")[3], "%d-%m-%Y")

    files = {posts: sorted(paths, key=frame_date) for posts, paths in grouped.items()}

    def compare(sample_a, sample_b):
        if len(sample_a) > 0 and len(sample_b) > 0:
            return ks_2samp(sample_a, sample_b)
        return "no values"

    changes = {name: defaultdict(list) for name in series}
    for posts, paths in files.items():
        if len(paths) < 2:
            continue
        print(posts)
        for earlier, later in zip(paths, paths[1:]):
            tox1 = imprt(earlier).toxlevels
            tox2 = imprt(later).toxlevels
            levels1 = {name: [item[tox_key[name]] for item in tox1] for name in series}
            levels2 = {name: [item[tox_key[name]] for item in tox2] for name in series}
            results = [compare(levels1[name], levels2[name]) for name in series]
            for name, result in zip(series, results):
                changes[name][posts].append(result)

    for posts, paths in files.items():
        logname = outputdir + "/ks_post_i" + str(intervl) + "_" + str(posts) + ".log"
        with open(logname, "w") as log:
            for idx, (f1, f2) in enumerate(zip(paths, paths[1:])):
                log.write("".join([
                    f1, " -> ", f2,
                    ": ks neg = ", str(changes["neg"][posts][idx]),
                    "; ks neu = ", str(changes["neu"][posts][idx]),
                    "; ks pos = ", str(changes["pos"][posts][idx]),
                    "; ks com = ", str(changes["com"][posts][idx]),
                    "\n",
                ]))

    def render(pick, label_suffix, ylabel, file_stem):
        """Plot one value kind for every group and pass the PNGs to IMAGE_MAGICK."""
        magick = IMAGE_MAGICK
        for posts, paths in files.items():
            x = [a.split("_")[3] + " -\n" + b.split("_")[3] for a, b in zip(paths, paths[1:])]
            fig = plt.figure(figsize=(16, 12))
            for name in series:
                values = [None if isinstance(res, str) else pick(res) for res in changes[name][posts]]
                present = [v for v in values if v is not None]
                plt.plot(x, values, label=name + label_suffix, color=colors[name])
                mean = np.mean(present)
                std = np.std(present)
                # points at least one standard deviation away from the mean
                dev = [(xx, v) for xx, v in zip(x, values)
                       if v is not None and (v <= mean - std or v >= mean + std)]
                plt.plot(x, [mean] * len(values), color=colors[name], ls='dashed')
                plt.plot([xx for xx, _ in dev], [v for _, v in dev],
                         color=colors[name], ls='None', marker='o')
            plt.title("KS 2-sided test with max " + str(posts) + " posts")
            plt.xticks(rotation=90)
            plt.xlabel("Comparision: time frame X - time frame X+1")
            plt.ylabel(ylabel)
            plt.legend(loc="upper right")
            png = outputdir + file_stem + str(intervl) + "_" + str(posts) + ".png"
            plt.savefig(png, bbox_inches='tight')
            plt.close(fig)
            magick += " " + png
        os.system(magick + " " + outputdir + file_stem + str(intervl) + ".pdf")

    # pval
    render(lambda res: res.pvalue, ".pval", "p-value", "/ks_post_pval_i")
    # stat
    render(lambda res: res.statistic, ".stat", "stat value", "/ks_post_stat_i")
def plotbydate(onlyfiles, outputdir, intervl):
    """KS-compare neighbouring files inside each time frame.

    Each path in ``onlyfiles`` is split on "_" (after dropping the last three
    characters); fields 3 and 4 together form the time-frame key.  Frames are
    ordered by the reversed "-"-parts of field 3 and, inside a frame, files
    are ordered by field 5 of the "_"-split path.  Every file is compared
    with its successor using a two-sample KS test on the 'neg', 'neu', 'pos'
    and 'compound' values of the imported module's ``toxlevels``.  A pair
    where one side has no values is recorded as the string "no values".

    Output in ``outputdir``: one ``ks_date_i<intervl>_<start>_<end>.log`` per
    frame, one p-value and one statistic PNG per frame, and per kind one
    invocation of the ``IMAGE_MAGICK`` command over all PNGs with a ``.pdf``
    target.

    :param onlyfiles: paths of the batch result files
    :param outputdir: directory receiving logs and plots
    :param intervl: interval size, only used inside the output file names
    """
    print("plotbydate")
    series = ("neg", "neu", "pos", "com")
    # series name -> key of the value inside one toxlevels entry
    tox_key = {"neg": 'neg', "neu": 'neu', "pos": 'pos', "com": 'compound'}

    grouped = defaultdict(list)
    for path in onlyfiles:
        fields = path[:-3].split("_")
        grouped[(fields[3], fields[4])].append(path)

    def frame_order(frame):
        # presumably turns dd-mm-yyyy into yyyy-mm-dd so that string order
        # follows date order - TODO confirm the date format
        return "-".join(reversed(frame[0].split("-")))

    # NOTE(review): plotbydateold orders the same groups by field 6; confirm
    # that field 5 is the intended sort key here.
    files = {
        frame: sorted(grouped[frame], key=lambda path: path.split("_")[5])
        for frame in sorted(grouped.keys(), key=frame_order)
    }

    def compare(sample_a, sample_b):
        if len(sample_a) > 0 and len(sample_b) > 0:
            return ks_2samp(sample_a, sample_b)
        return "no values"

    changes = {name: defaultdict(list) for name in series}
    for frame, paths in files.items():
        if len(paths) < 2:
            continue
        print(frame)
        for current, following in zip(paths, paths[1:]):
            tox1 = imprt(current).toxlevels
            tox2 = imprt(following).toxlevels
            levels1 = {name: [item[tox_key[name]] for item in tox1] for name in series}
            levels2 = {name: [item[tox_key[name]] for item in tox2] for name in series}
            results = [compare(levels1[name], levels2[name]) for name in series]
            for name, result in zip(series, results):
                changes[name][frame].append(result)

    for frame, paths in files.items():
        logname = outputdir + "/ks_date_i" + str(intervl) + "_" + frame[0] + "_" + frame[1] + ".log"
        with open(logname, "w") as log:
            for idx, (f1, f2) in enumerate(zip(paths, paths[1:])):
                log.write("".join([
                    f1, " -> ", f2,
                    ": ks neg = ", str(changes["neg"][frame][idx]),
                    "; ks neu = ", str(changes["neu"][frame][idx]),
                    "; ks pos = ", str(changes["pos"][frame][idx]),
                    "; ks com = ", str(changes["com"][frame][idx]),
                    "\n",
                ]))

    def render(pick, label_suffix, ylabel, file_stem):
        """Plot one value kind for every frame and pass the PNGs to IMAGE_MAGICK."""
        magick = IMAGE_MAGICK
        for frame, paths in files.items():
            x = [a.split("_")[6][:-3] + "-" + b.split("_")[6][:-3] for a, b in zip(paths, paths[1:])]
            fig = plt.figure(figsize=(16, 12))
            for name in series:
                values = [None if isinstance(res, str) else pick(res) for res in changes[name][frame]]
                present = [v for v in values if v is not None]
                plt.plot(x, values, label=name + label_suffix, color=colors[name])
                mean = np.mean(present)
                std = np.std(present)
                # points at least one standard deviation away from the mean
                dev = [(xx, v) for xx, v in zip(x, values)
                       if v is not None and (v <= mean - std or v >= mean + std)]
                plt.plot(x, [mean] * len(values), color=colors[name], ls='dashed')
                plt.plot([xx for xx, _ in dev], [v for _, v in dev],
                         color=colors[name], ls='None', marker='o')
            plt.title("KS 2-sided test with between " + frame[0] + " and " + frame[1])
            plt.xticks(rotation=90)
            plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
            plt.ylabel(ylabel)
            plt.legend(loc="upper right")
            png = outputdir + file_stem + str(intervl) + "_" + frame[0] + "_" + frame[1] + ".png"
            plt.savefig(png, bbox_inches='tight')
            plt.close(fig)
            magick += " " + png
        os.system(magick + " " + outputdir + file_stem + str(intervl) + ".pdf")

    # pval
    render(lambda res: res.pvalue, ".pval", "p-value", "/ks_date_pval_i")
    # stat
    render(lambda res: res.statistic, ".stat", "stat value", "/ks_date_stat_i")
def plotbydateold(onlyfiles, oldfiles, outputdir, intervl):
    """KS-compare each file of a time frame with that frame's old-users file.

    Paths are split on "_" (after dropping the last three characters); fields
    3 and 4 together form the time-frame key.  Frames are ordered by the
    reversed "-"-parts of field 3 and, inside a frame, the ``onlyfiles``
    entries are ordered by field 6 of the "_"-split path.  Every such file is
    compared with the ``oldfiles`` entry of the same frame using a two-sample
    KS test on the 'neg', 'neu', 'pos' and 'compound' values of the imported
    ``toxlevels``.  If either ``toxlevels`` is empty, ``None`` is recorded and
    the file is left out of the log and the plots.

    Output in ``outputdir``: one ``ks_olddate_i<intervl>_<start>_<end>.log``
    per frame, one p-value and one statistic PNG per frame, and per kind one
    invocation of the ``IMAGE_MAGICK`` command over all PNGs with a ``.pdf``
    target.

    :param onlyfiles: paths of the new-users batch result files
    :param oldfiles: paths of the old-users batch result files
    :param outputdir: directory receiving logs and plots
    :param intervl: interval size, only used inside the output file names
    :raises KeyError: if a time frame of ``onlyfiles`` has no ``oldfiles`` entry
    """
    print("plotbydateold")
    series = ("neg", "neu", "pos", "com")
    # series name -> key of the value inside one toxlevels entry
    tox_key = {"neg": 'neg', "neu": 'neu', "pos": 'pos', "com": 'compound'}

    grouped = defaultdict(list)
    for path in onlyfiles:
        fields = path[:-3].split("_")
        grouped[(fields[3], fields[4])].append(path)

    def frame_order(frame):
        # presumably turns dd-mm-yyyy into yyyy-mm-dd so that string order
        # follows date order - TODO confirm the date format
        return "-".join(reversed(frame[0].split("-")))

    files = {
        frame: sorted(grouped[frame], key=lambda path: path.split("_")[6])
        for frame in sorted(grouped.keys(), key=frame_order)
    }

    # time frame -> old-users file; a later path replaces an earlier one
    old_by_frame = {}
    for path in oldfiles:
        old_by_frame[(path[:-3].split("_")[3], path[:-3].split("_")[4])] = path

    def compare(sample_new, sample_old):
        if len(sample_new) > 0 and len(sample_old) > 0:
            return ks_2samp(sample_new, sample_old)
        return "no values"

    changes = {name: defaultdict(list) for name in series}
    for frame, paths in files.items():
        print(frame)
        toxold = imprt(old_by_frame[frame]).toxlevels
        old_levels = {name: [item[tox_key[name]] for item in toxold] for name in series}
        for path in paths:
            tox1 = imprt(path).toxlevels
            if len(tox1) == 0 or len(toxold) == 0:
                # nothing to compare: keep the position with a None marker
                for name in series:
                    changes[name][frame].append(None)
                continue
            new_levels = {name: [item[tox_key[name]] for item in tox1] for name in series}
            results = [compare(new_levels[name], old_levels[name]) for name in series]
            for name, result in zip(series, results):
                changes[name][frame].append(result)

    print("logs")
    for frame, paths in files.items():
        f1 = old_by_frame[frame]
        logname = outputdir + "/ks_olddate_i" + str(intervl) + "_" + frame[0] + "_" + frame[1] + ".log"
        with open(logname, "w") as log:
            for idx, f2 in enumerate(paths):
                if changes["neg"][frame][idx] is None:
                    continue
                log.write("".join([
                    f1, " -> ", f2,
                    ": ks neg = ", str(changes["neg"][frame][idx]),
                    "; ks neu = ", str(changes["neu"][frame][idx]),
                    "; ks pos = ", str(changes["pos"][frame][idx]),
                    "; ks com = ", str(changes["com"][frame][idx]),
                    "\n",
                ]))

    def render(pick, label_suffix, ylabel, file_stem):
        """Plot one value kind for every frame and pass the PNGs to IMAGE_MAGICK."""
        imgmagickcmd = IMAGE_MAGICK
        for frame, paths in files.items():
            print(frame)
            x = [path[:-3].split("_")[6] for path in paths]
            fig = plt.figure(figsize=(16, 12))
            for name in series:
                points = [(xx, pick(res)) for xx, res in zip(x, changes[name][frame]) if res is not None]
                plt.plot([pt[0] for pt in points], [pt[1] for pt in points],
                         label=name + label_suffix, color=colors[name])
                if len(points) == 0:
                    continue
                mean = np.mean([pt[1] for pt in points])
                std = np.std([pt[1] for pt in points])
                # points at least one standard deviation away from the mean
                dev = [(xx, v) for xx, v in points if v <= mean - std or v >= mean + std]
                plt.plot([pt[0] for pt in points], [mean] * len(points),
                         color=colors[name], ls='dashed')
                plt.plot([dx[0] for dx in dev], [dx[1] for dx in dev],
                         color=colors[name], ls='None', marker='o')
            plt.title("KS 2-sided test with new and old users between " + frame[0] + " and " + frame[1])
            plt.xticks(rotation=90)
            plt.xlabel("Comparision: new users X (max) posts - old users posts")
            plt.ylabel(ylabel)
            plt.legend(loc="upper right")
            outfile = outputdir + file_stem + str(intervl) + "_" + frame[0] + "_" + frame[1] + ".png"
            plt.savefig(outfile, bbox_inches='tight')
            plt.close(fig)
            imgmagickcmd += " " + outfile
        os.system(imgmagickcmd + " " + outputdir + file_stem + str(intervl) + ".pdf")

    # pval
    print("pval")
    render(lambda res: res.pvalue, ".pval", "p-value", "/ks_olddate_pval_i")
    # stat
    print("stat")
    render(lambda res: res.statistic, ".stat", "stat value", "/ks_olddate_stat_i")
def filecmp(file1, file2):
    """Three-way comparison of two file names by their embedded date.

    The date is the third "_"-separated field of each name, in
    ``%d-%m-%Y`` format.  Identical names compare equal without being parsed.

    :return: -1 if ``file1`` is dated earlier, 1 if later, 0 otherwise
    """
    if file1 == file2:
        return 0
    fields1, fields2 = file1.split("_"), file2.split("_")
    date1 = datetime.strptime(fields1[2], "%d-%m-%Y")
    date2 = datetime.strptime(fields2[2], "%d-%m-%Y")
    # bool arithmetic: (later) - (earlier) gives 1, -1 or 0
    return (date1 > date2) - (date1 < date2)
if __name__ == "__main__":
    # execute only if run as a script
    # Command line: <folder> [-i<N>], where N is an integer from 1 to 12
    # written directly after "-i" (e.g. "-i6"); without it the interval is 3.
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 3
    if len(sys.argv) >= 3:
        option = sys.argv[2]
        if not option.startswith("-i"):
            print("unknown parameter: " + option)
            sys.exit(1)
        try:
            interval = int(option[2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)