wip
This commit is contained in:
@@ -54,8 +54,21 @@ def main(folder, intervl):
|
|||||||
gpos = []
|
gpos = []
|
||||||
gcom = []
|
gcom = []
|
||||||
|
|
||||||
goutfilenamenewusers = outputdir + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
goutfilenamenewusers = outputdir + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "_i" + str(intervl)
|
||||||
goutfilenameoldusers = outputdir + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
goutfilenameoldusers = outputdir + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "_i" + str(intervl)
|
||||||
|
|
||||||
|
start = cms()
|
||||||
|
printnoln("sorting posts ...")
|
||||||
|
sortedposts = defaultdict(list)
|
||||||
|
for (i, post) in enumerate(newposts):
|
||||||
|
userid = post['OwnerUserId']
|
||||||
|
|
||||||
|
# check first contribution
|
||||||
|
if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sortedposts[userid].append(post)
|
||||||
|
rprint("sorting posts ... took " + str(cms() - start) + "ms")
|
||||||
|
|
||||||
for option_posts in postcounts:
|
for option_posts in postcounts:
|
||||||
# print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts))
|
# print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts))
|
||||||
@@ -64,22 +77,8 @@ def main(folder, intervl):
|
|||||||
start = cms()
|
start = cms()
|
||||||
printnoln("computing toxic levels: filtering")
|
printnoln("computing toxic levels: filtering")
|
||||||
toxlevels = []
|
toxlevels = []
|
||||||
searchedposts = defaultdict(int)
|
filteredposts = [posts for (_, posts) in sortedposts.items() if len(posts) == option_posts]
|
||||||
filteredposts = []
|
filteredposts = [p for posts in filteredposts for p in posts]
|
||||||
for (i, post) in enumerate(newposts):
|
|
||||||
userid = post['OwnerUserId']
|
|
||||||
|
|
||||||
# check first contribution
|
|
||||||
if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# no more than option_posts posts from one user
|
|
||||||
searchedposts[userid] += 1
|
|
||||||
if searchedposts[userid] > option_posts:
|
|
||||||
continue
|
|
||||||
|
|
||||||
filteredposts.append(post)
|
|
||||||
|
|
||||||
for (i, post) in enumerate(filteredposts):
|
for (i, post) in enumerate(filteredposts):
|
||||||
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
|
printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts)))
|
||||||
for a in post['Answers']:
|
for a in post['Answers']:
|
||||||
@@ -87,6 +86,7 @@ def main(folder, intervl):
|
|||||||
toxlevel = cachedsentiments[a['Id']]
|
toxlevel = cachedsentiments[a['Id']]
|
||||||
else:
|
else:
|
||||||
print("Sentiment not found for " + a['Id'])
|
print("Sentiment not found for " + a['Id'])
|
||||||
|
continue
|
||||||
toxlevels.append(toxlevel)
|
toxlevels.append(toxlevel)
|
||||||
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
|
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...")
|
||||||
|
|
||||||
@@ -206,10 +206,10 @@ def main(folder, intervl):
|
|||||||
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
|
||||||
magickold += " " + goutfilenameoldusers + ".png"
|
magickold += " " + goutfilenameoldusers + ".png"
|
||||||
|
|
||||||
os.system(magickglobal + " " + outputdir + "batch_newusers.pdf")
|
os.system(magickglobal + " " + outputdir + "batch_newusers_i" + str(intervl) + ".pdf")
|
||||||
os.system(magickold + " " + outputdir + "batch_oldusers.pdf")
|
os.system(magickold + " " + outputdir + "batch_oldusers_i" + str(intervl) + ".pdf")
|
||||||
for (i, cmd) in magickpost.items():
|
for (i, cmd) in magickpost.items():
|
||||||
os.system(cmd + " " + outputdir + "batch_newusers_" + str(i) + ".pdf")
|
os.system(cmd + " " + outputdir + "batch_newusers_i" + str(intervl) + "_" + str(i) + ".pdf")
|
||||||
|
|
||||||
|
|
||||||
def dumptoxlevels(lvls, filename):
|
def dumptoxlevels(lvls, filename):
|
||||||
|
|||||||
145
calctoxdiff.py
145
calctoxdiff.py
@@ -14,29 +14,29 @@ from common import imprt, IMAGE_MAGICK
|
|||||||
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
|
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
|
||||||
|
|
||||||
|
|
||||||
def main(folder):
|
def main(folder, intervl):
|
||||||
outputdir = folder + "/output/ksbatch/"
|
outputdir = folder + "/output/ksbatch/"
|
||||||
os.system("mkdir -p " + outputdir)
|
os.system("mkdir -p " + outputdir)
|
||||||
srcfolder = folder + "/output/batch/"
|
srcfolder = folder + "/output/batch/"
|
||||||
|
|
||||||
onlyfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "newusers" in f]
|
onlyfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "newusers" in f and "i" + str(intervl) in f]
|
||||||
onlyfiles = sorted(onlyfiles)
|
onlyfiles = sorted(onlyfiles)
|
||||||
|
|
||||||
plotbypost(onlyfiles, outputdir)
|
plotbypost(onlyfiles, outputdir, intervl)
|
||||||
plotbydate(onlyfiles, outputdir)
|
plotbydate(onlyfiles, outputdir, intervl)
|
||||||
|
|
||||||
oldfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "oldusers" in f]
|
oldfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "oldusers" in f and "i" + str(intervl) in f]
|
||||||
oldfiles = sorted(oldfiles)
|
oldfiles = sorted(oldfiles)
|
||||||
|
|
||||||
plotbydateold(onlyfiles, oldfiles, outputdir)
|
plotbydateold(onlyfiles, oldfiles, outputdir, intervl)
|
||||||
|
|
||||||
|
|
||||||
def plotbypost(onlyfiles, outputdir):
|
def plotbypost(onlyfiles, outputdir, intervl):
|
||||||
print("plotbypost")
|
print("plotbypost")
|
||||||
files = defaultdict(list)
|
files = defaultdict(list)
|
||||||
for f in onlyfiles:
|
for f in onlyfiles:
|
||||||
s = f[:-3].split("_")
|
s = f[:-3].split("_")
|
||||||
files[int(s[5])].append(f)
|
files[int(s[6])].append(f)
|
||||||
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()}
|
files = {p: sorted(l, key=lambda e: datetime.strptime(e.split("_")[3], "%d-%m-%Y")) for (p, l) in files.items()}
|
||||||
|
|
||||||
changes_neg = defaultdict(list)
|
changes_neg = defaultdict(list)
|
||||||
@@ -62,10 +62,10 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
poslevelsflat2 = [item['pos'] for item in tox2]
|
poslevelsflat2 = [item['pos'] for item in tox2]
|
||||||
comlevelsflat2 = [item['compound'] for item in tox2]
|
comlevelsflat2 = [item['compound'] for item in tox2]
|
||||||
|
|
||||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) if len(neglevelsflat1) > 0 and len(neglevelsflat2) > 0 else "no values"
|
||||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) if len(neulevelsflat1) > 0 and len(neulevelsflat2) > 0 else "no values"
|
||||||
kspos = ks_2samp(poslevelsflat1, poslevelsflat2)
|
kspos = ks_2samp(poslevelsflat1, poslevelsflat2) if len(poslevelsflat1) > 0 and len(poslevelsflat2) > 0 else "no values"
|
||||||
kscom = ks_2samp(comlevelsflat1, comlevelsflat2)
|
kscom = ks_2samp(comlevelsflat1, comlevelsflat2) if len(comlevelsflat1) > 0 and len(comlevelsflat2) > 0 else "no values"
|
||||||
|
|
||||||
changes_neg[p].append(ksneg)
|
changes_neg[p].append(ksneg)
|
||||||
changes_neu[p].append(ksneu)
|
changes_neu[p].append(ksneu)
|
||||||
@@ -73,7 +73,7 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
changes_com[p].append(kscom)
|
changes_com[p].append(kscom)
|
||||||
|
|
||||||
for (p, l) in files.items():
|
for (p, l) in files.items():
|
||||||
with open(outputdir + "/ks_post_" + str(p) + ".log", "w") as f:
|
with open(outputdir + "/ks_post_i" + str(intervl) + "_" + str(p) + ".log", "w") as f:
|
||||||
for i in range(len(l) - 1):
|
for i in range(len(l) - 1):
|
||||||
f1 = l[i]
|
f1 = l[i]
|
||||||
f2 = l[i + 1]
|
f2 = l[i + 1]
|
||||||
@@ -85,11 +85,12 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
||||||
pval = [x.pvalue for x in changes]
|
pval = [x.pvalue if not isinstance(x, str) else None for x in changes]
|
||||||
|
pvalnotnull = [x for x in pval if x is not None]
|
||||||
plt.plot(x, pval, label=type + ".pval", color=colors[type])
|
plt.plot(x, pval, label=type + ".pval", color=colors[type])
|
||||||
mean = np.mean(pval)
|
mean = np.mean(pvalnotnull)
|
||||||
std = np.std(pval)
|
std = np.std(pvalnotnull)
|
||||||
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
|
dev = [(xx, s) for (xx, s) in zip(x, pval) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||||
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
|
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
|
||||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||||
plt.title("KS 2-sided test with max " + str(p) + " posts")
|
plt.title("KS 2-sided test with max " + str(p) + " posts")
|
||||||
@@ -97,10 +98,10 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
plt.xlabel("Comparision: time frame X - time frame X+1")
|
plt.xlabel("Comparision: time frame X - time frame X+1")
|
||||||
plt.ylabel("p-value")
|
plt.ylabel("p-value")
|
||||||
plt.legend(loc="upper right")
|
plt.legend(loc="upper right")
|
||||||
plt.savefig(outputdir + "/ks_post_pval_" + str(p) + ".png", bbox_inches='tight')
|
plt.savefig(outputdir + "/ks_post_pval_i" + str(intervl) + "_" + str(p) + ".png", bbox_inches='tight')
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
magick += " " + outputdir + "/ks_post_pval_" + str(p) + ".png"
|
magick += " " + outputdir + "/ks_post_pval_i" + str(intervl) + "_" + str(p) + ".png"
|
||||||
os.system(magick + " " + outputdir + "/ks_post_pval.pdf")
|
os.system(magick + " " + outputdir + "/ks_post_pval_i" + str(intervl) + ".pdf")
|
||||||
|
|
||||||
# stat
|
# stat
|
||||||
magick = IMAGE_MAGICK
|
magick = IMAGE_MAGICK
|
||||||
@@ -108,11 +109,12 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[3] + " -\n" + l[i + 1].split("_")[3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
for type, changes in {"neg": changes_neg[p], "neu": changes_neu[p], "pos": changes_pos[p], "com": changes_com[p]}.items():
|
||||||
stat = [x.statistic for x in changes]
|
stat = [x.statistic if not isinstance(x, str) else None for x in changes]
|
||||||
|
statnotnull = [x for x in stat if x is not None]
|
||||||
plt.plot(x, stat, label=type + ".stat", color=colors[type])
|
plt.plot(x, stat, label=type + ".stat", color=colors[type])
|
||||||
mean = np.mean(stat)
|
mean = np.mean(statnotnull)
|
||||||
std = np.std(stat)
|
std = np.std(statnotnull)
|
||||||
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
|
dev = [(xx, s) for (xx, s) in zip(x, stat) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||||
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
|
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
|
||||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||||
plt.title("KS 2-sided test with max " + str(p) + " posts")
|
plt.title("KS 2-sided test with max " + str(p) + " posts")
|
||||||
@@ -120,13 +122,13 @@ def plotbypost(onlyfiles, outputdir):
|
|||||||
plt.xlabel("Comparision: time frame X - time frame X+1")
|
plt.xlabel("Comparision: time frame X - time frame X+1")
|
||||||
plt.ylabel("stat value")
|
plt.ylabel("stat value")
|
||||||
plt.legend(loc="upper right")
|
plt.legend(loc="upper right")
|
||||||
plt.savefig(outputdir + "/ks_post_stat_" + str(p) + ".png", bbox_inches='tight')
|
plt.savefig(outputdir + "/ks_post_stat_i" + str(intervl) + "_" + str(p) + ".png", bbox_inches='tight')
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
magick += " " + outputdir + "/ks_post_stat_" + str(p) + ".png"
|
magick += " " + outputdir + "/ks_post_stat_i" + str(intervl) + "_" + str(p) + ".png"
|
||||||
os.system(magick + " " + outputdir + "/ks_post_stat.pdf")
|
os.system(magick + " " + outputdir + "/ks_post_stat_i" + str(intervl) + ".pdf")
|
||||||
|
|
||||||
|
|
||||||
def plotbydate(onlyfiles, outputdir):
|
def plotbydate(onlyfiles, outputdir, intervl):
|
||||||
print("plotbydate")
|
print("plotbydate")
|
||||||
files = defaultdict(list)
|
files = defaultdict(list)
|
||||||
for f in onlyfiles:
|
for f in onlyfiles:
|
||||||
@@ -159,10 +161,10 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
poslevelsflat2 = [item['pos'] for item in tox2]
|
poslevelsflat2 = [item['pos'] for item in tox2]
|
||||||
comlevelsflat2 = [item['compound'] for item in tox2]
|
comlevelsflat2 = [item['compound'] for item in tox2]
|
||||||
|
|
||||||
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2)
|
ksneg = ks_2samp(neglevelsflat1, neglevelsflat2) if len(neglevelsflat1) > 0 and len(neglevelsflat2) > 0 else "no values"
|
||||||
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2)
|
ksneu = ks_2samp(neulevelsflat1, neulevelsflat2) if len(neulevelsflat1) > 0 and len(neulevelsflat2) > 0 else "no values"
|
||||||
kspos = ks_2samp(poslevelsflat1, poslevelsflat2)
|
kspos = ks_2samp(poslevelsflat1, poslevelsflat2) if len(poslevelsflat1) > 0 and len(poslevelsflat2) > 0 else "no values"
|
||||||
kscom = ks_2samp(comlevelsflat1, comlevelsflat2)
|
kscom = ks_2samp(comlevelsflat1, comlevelsflat2) if len(comlevelsflat1) > 0 and len(comlevelsflat2) > 0 else "no values"
|
||||||
|
|
||||||
changes_neg[d].append(ksneg)
|
changes_neg[d].append(ksneg)
|
||||||
changes_neu[d].append(ksneu)
|
changes_neu[d].append(ksneu)
|
||||||
@@ -170,7 +172,7 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
changes_com[d].append(kscom)
|
changes_com[d].append(kscom)
|
||||||
|
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
with open(outputdir + "/ks_date_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
with open(outputdir + "/ks_date_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
||||||
for i in range(len(l) - 1):
|
for i in range(len(l) - 1):
|
||||||
f1 = l[i]
|
f1 = l[i]
|
||||||
f2 = l[i + 1]
|
f2 = l[i + 1]
|
||||||
@@ -180,14 +182,15 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
# pval
|
# pval
|
||||||
magick = IMAGE_MAGICK
|
magick = IMAGE_MAGICK
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[6][:-3] + "-" + l[i + 1].split("_")[6][:-3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
pval = [x.pvalue for x in changes]
|
pval = [x.pvalue if not isinstance(x, str) else None for x in changes]
|
||||||
|
pvalnotnull = [x for x in pval if x is not None]
|
||||||
plt.plot(x, pval, label=type + ".pval", color=colors[type])
|
plt.plot(x, pval, label=type + ".pval", color=colors[type])
|
||||||
mean = np.mean(pval)
|
mean = np.mean(pvalnotnull)
|
||||||
std = np.std(pval)
|
std = np.std(pvalnotnull)
|
||||||
dev = [(xx, s) for (xx, s) in zip(x, pval) if s <= mean - std or s >= mean + std]
|
dev = [(xx, s) for (xx, s) in zip(x, pval) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||||
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
|
plt.plot(x, [mean] * len(pval), color=colors[type], ls='dashed')
|
||||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||||
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
|
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
|
||||||
@@ -195,22 +198,23 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
|
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
|
||||||
plt.ylabel("p-value")
|
plt.ylabel("p-value")
|
||||||
plt.legend(loc="upper right")
|
plt.legend(loc="upper right")
|
||||||
plt.savefig(outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
plt.savefig(outputdir + "/ks_date_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
magick += " " + outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png"
|
magick += " " + outputdir + "/ks_date_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||||
os.system(magick + " " + outputdir + "/ks_date_pval.pdf")
|
os.system(magick + " " + outputdir + "/ks_date_pval_i" + str(intervl) + ".pdf")
|
||||||
|
|
||||||
# stat
|
# stat
|
||||||
magick = IMAGE_MAGICK
|
magick = IMAGE_MAGICK
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
x = [l[i].split("_")[5][:-3] + "-" + l[i + 1].split("_")[5][:-3] for i in range(len(l) - 1)]
|
x = [l[i].split("_")[6][:-3] + "-" + l[i + 1].split("_")[6][:-3] for i in range(len(l) - 1)]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
stat = [x.statistic for x in changes]
|
stat = [x.statistic if not isinstance(x, str) else None for x in changes]
|
||||||
|
statnotnull = [x for x in stat if x is not None]
|
||||||
plt.plot(x, stat, label=type + ".stat", color=colors[type])
|
plt.plot(x, stat, label=type + ".stat", color=colors[type])
|
||||||
mean = np.mean(stat)
|
mean = np.mean(statnotnull)
|
||||||
std = np.std(stat)
|
std = np.std(statnotnull)
|
||||||
dev = [(xx, s) for (xx, s) in zip(x, stat) if s <= mean - std or s >= mean + std]
|
dev = [(xx, s) for (xx, s) in zip(x, stat) if s is not None and (s <= mean - std or s >= mean + std)]
|
||||||
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
|
plt.plot(x, [mean] * len(stat), color=colors[type], ls='dashed')
|
||||||
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
plt.plot([xx for (xx, s) in dev], [s for (xx, s) in dev], color=colors[type], ls='None', marker='o')
|
||||||
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
|
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
|
||||||
@@ -218,13 +222,13 @@ def plotbydate(onlyfiles, outputdir):
|
|||||||
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
|
plt.xlabel("Comparision: X (max) posts - X+1 (max) posts")
|
||||||
plt.ylabel("stat value")
|
plt.ylabel("stat value")
|
||||||
plt.legend(loc="upper right")
|
plt.legend(loc="upper right")
|
||||||
plt.savefig(outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
plt.savefig(outputdir + "/ks_date_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
magick += " " + outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png"
|
magick += " " + outputdir + "/ks_date_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||||
os.system(magick + " " + outputdir + "/ks_date_stat.pdf")
|
os.system(magick + " " + outputdir + "/ks_date_stat_i" + str(intervl) + ".pdf")
|
||||||
|
|
||||||
|
|
||||||
def plotbydateold(onlyfiles, oldfiles, outputdir):
|
def plotbydateold(onlyfiles, oldfiles, outputdir, intervl):
|
||||||
print("plotbydateold")
|
print("plotbydateold")
|
||||||
files = defaultdict(list)
|
files = defaultdict(list)
|
||||||
for f in onlyfiles:
|
for f in onlyfiles:
|
||||||
@@ -232,7 +236,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
|||||||
files[(s[3], s[4])].append(f)
|
files[(s[3], s[4])].append(f)
|
||||||
dates = sorted(files.keys(), key=lambda e: "-".join(reversed(e[0].split("-"))))
|
dates = sorted(files.keys(), key=lambda e: "-".join(reversed(e[0].split("-"))))
|
||||||
files = {d: files[d] for d in dates}
|
files = {d: files[d] for d in dates}
|
||||||
files = {d: sorted(l, key=lambda e: e.split("_")[5]) for (d, l) in files.items()}
|
files = {d: sorted(l, key=lambda e: e.split("_")[6]) for (d, l) in files.items()}
|
||||||
oldfiles = {(f[:-3].split("_")[3], f[:-3].split("_")[4]): f for f in oldfiles}
|
oldfiles = {(f[:-3].split("_")[3], f[:-3].split("_")[4]): f for f in oldfiles}
|
||||||
|
|
||||||
changes_neg = defaultdict(list)
|
changes_neg = defaultdict(list)
|
||||||
@@ -263,10 +267,10 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
|||||||
poslevelsflat1 = [item['pos'] for item in tox1]
|
poslevelsflat1 = [item['pos'] for item in tox1]
|
||||||
comlevelsflat1 = [item['compound'] for item in tox1]
|
comlevelsflat1 = [item['compound'] for item in tox1]
|
||||||
|
|
||||||
ksneg = ks_2samp(neglevelsflat1, neglevelsold)
|
ksneg = ks_2samp(neglevelsflat1, neglevelsold) if len(neglevelsflat1) > 0 and len(neglevelsold) > 0 else "no values"
|
||||||
ksneu = ks_2samp(neulevelsflat1, neulevelsold)
|
ksneu = ks_2samp(neulevelsflat1, neulevelsold) if len(neulevelsflat1) > 0 and len(neulevelsold) > 0 else "no values"
|
||||||
kspos = ks_2samp(poslevelsflat1, poslevelsold)
|
kspos = ks_2samp(poslevelsflat1, poslevelsold) if len(poslevelsflat1) > 0 and len(poslevelsold) > 0 else "no values"
|
||||||
kscom = ks_2samp(comlevelsflat1, comlevelsold)
|
kscom = ks_2samp(comlevelsflat1, comlevelsold) if len(comlevelsflat1) > 0 and len(comlevelsold) > 0 else "no values"
|
||||||
|
|
||||||
changes_neg[d].append(ksneg)
|
changes_neg[d].append(ksneg)
|
||||||
changes_neu[d].append(ksneu)
|
changes_neu[d].append(ksneu)
|
||||||
@@ -276,7 +280,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
|||||||
print("logs")
|
print("logs")
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
f1 = oldfiles[d]
|
f1 = oldfiles[d]
|
||||||
with open(outputdir + "/ks_olddate_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
with open(outputdir + "/ks_olddate_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".log", "w") as f:
|
||||||
for i in range(len(l)):
|
for i in range(len(l)):
|
||||||
if changes_neg[d][i] is None:
|
if changes_neg[d][i] is None:
|
||||||
continue
|
continue
|
||||||
@@ -289,7 +293,7 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
|||||||
imgmagickcmd = IMAGE_MAGICK
|
imgmagickcmd = IMAGE_MAGICK
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
print(d)
|
print(d)
|
||||||
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
|
x = [l[i][:-3].split("_")[6] for i in range(len(l))]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
pval = [(xx, c.pvalue) for xx, c in zip(x, changes) if c is not None]
|
pval = [(xx, c.pvalue) for xx, c in zip(x, changes) if c is not None]
|
||||||
@@ -306,18 +310,18 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
|||||||
plt.xlabel("Comparision: new users X (max) posts - old users posts")
|
plt.xlabel("Comparision: new users X (max) posts - old users posts")
|
||||||
plt.ylabel("p-value")
|
plt.ylabel("p-value")
|
||||||
plt.legend(loc="upper right")
|
plt.legend(loc="upper right")
|
||||||
outfile = outputdir + "/ks_olddate_pval_" + d[0] + "_" + d[1] + ".png"
|
outfile = outputdir + "/ks_olddate_pval_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||||
plt.savefig(outfile, bbox_inches='tight')
|
plt.savefig(outfile, bbox_inches='tight')
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
imgmagickcmd += " " + outfile
|
imgmagickcmd += " " + outfile
|
||||||
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval.pdf")
|
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_pval_i" + str(intervl) + ".pdf")
|
||||||
|
|
||||||
# stat
|
# stat
|
||||||
print("stat")
|
print("stat")
|
||||||
imgmagickcmd = IMAGE_MAGICK
|
imgmagickcmd = IMAGE_MAGICK
|
||||||
for (d, l) in files.items():
|
for (d, l) in files.items():
|
||||||
print(d)
|
print(d)
|
||||||
x = [l[i][:-3].split("_")[5] for i in range(len(l))]
|
x = [l[i][:-3].split("_")[6] for i in range(len(l))]
|
||||||
fig = plt.figure(figsize=(16, 12))
|
fig = plt.figure(figsize=(16, 12))
|
||||||
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
for type, changes in {"neg": changes_neg[d], "neu": changes_neu[d], "pos": changes_pos[d], "com": changes_com[d]}.items():
|
||||||
stat = [(xx, c.statistic) for xx, c in zip(x, changes) if c is not None]
|
stat = [(xx, c.statistic) for xx, c in zip(x, changes) if c is not None]
|
||||||
@@ -334,11 +338,11 @@ def plotbydateold(onlyfiles, oldfiles, outputdir):
|
|||||||
plt.xlabel("Comparision: new users X (max) posts - old users posts")
|
plt.xlabel("Comparision: new users X (max) posts - old users posts")
|
||||||
plt.ylabel("stat value")
|
plt.ylabel("stat value")
|
||||||
plt.legend(loc="upper right")
|
plt.legend(loc="upper right")
|
||||||
outfile = outputdir + "/ks_olddate_stat_" + d[0] + "_" + d[1] + ".png"
|
outfile = outputdir + "/ks_olddate_stat_i" + str(intervl) + "_" + d[0] + "_" + d[1] + ".png"
|
||||||
plt.savefig(outfile, bbox_inches='tight')
|
plt.savefig(outfile, bbox_inches='tight')
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
imgmagickcmd += " " + outfile
|
imgmagickcmd += " " + outfile
|
||||||
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat.pdf")
|
os.system(imgmagickcmd + " " + outputdir + "/ks_olddate_stat_i" + str(intervl) + ".pdf")
|
||||||
|
|
||||||
|
|
||||||
def filecmp(file1, file2):
|
def filecmp(file1, file2):
|
||||||
@@ -365,5 +369,20 @@ if __name__ == "__main__":
|
|||||||
if not os.path.isdir(folder):
|
if not os.path.isdir(folder):
|
||||||
print(folder + " is not a folder")
|
print(folder + " is not a folder")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
interval = 3
|
||||||
|
if len(sys.argv) >= 3:
|
||||||
|
if sys.argv[2].startswith("-i"):
|
||||||
|
interval = sys.argv[2][2:]
|
||||||
|
try:
|
||||||
|
interval = int(interval)
|
||||||
|
except ValueError:
|
||||||
|
print("-i: int required")
|
||||||
|
sys.exit(1)
|
||||||
|
if interval < 1 or interval > 12:
|
||||||
|
print("-i: only 1 - 12")
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
print("unknown parameter: " + sys.argv[2])
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
main(folder)
|
main(folder, interval)
|
||||||
|
|||||||
Reference in New Issue
Block a user