diff --git a/calctoxdiff.py b/calctoxdiff.py
index bcfa073..2f263d7 100644
--- a/calctoxdiff.py
+++ b/calctoxdiff.py
@@ -9,16 +9,22 @@ import matplotlib.pyplot as plt
 import numpy as np
 from scipy.stats import ks_2samp
 
-from common import imprt, IMAGE_MAGICK
+from analyze_batch import readavgsentsingle
+from common import imprt, IMAGE_MAGICK, calc_intervals
+from loader import load
 
 colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
 
 
 def main(folder, intervl):
+    users, posts, firstcontrib, sumcontrib = load(folder)
+
     outputdir = folder + "/output/ksbatch/"
     os.system("mkdir -p " + outputdir)
 
     srcfolder = folder + "/output/batch/"
+    g(srcfolder + "/averagesentiment.txt", outputdir, calc_intervals(posts, intervl))
+
     onlyfiles = [srcfolder + f for f in listdir(srcfolder) if isfile(join(srcfolder, f)) and f.endswith(".py") and "newusers" in f and "i" + str(intervl) in f]
     onlyfiles = sorted(onlyfiles)
 
@@ -31,6 +37,70 @@ def main(folder, intervl):
     plotbydateold(onlyfiles, oldfiles, outputdir, intervl)
 
 
+class fake:
+    """Placeholder whose constructor takes two arguments and discards them.
+
+    NOTE(review): nothing visible in this change uses it - confirm it is needed.
+    """
+
+    def __init__(self, p, s):
+        pass
+
+
+def _plot_ks(single, intervals, attr, outfile):
+    """Plot attribute `attr` of every KS result in `single` and save to `outfile`.
+
+    `attr` ("pvalue" or "statistic") doubles as the y-axis label.
+    """
+    fig = plt.figure(figsize=(16, 12))
+    xs = [iv[0] for iv in intervals]
+    for nr, results in enumerate(single, start=1):
+        # Entries without data are a bare float NaN and are plotted as such.
+        ys = [r if isinstance(r, float) else getattr(r, attr) for r in results]
+        plt.plot(xs, ys, label=str(nr) + " posts - most posters")
+    plt.title("KS 2-sided test for sentiments (X posts to 95%tile posters)")
+    plt.xticks(rotation=90)
+    plt.xlabel("Comparision: time frame X - time frame X+1")
+    plt.ylabel(attr)
+    plt.legend(loc="upper right")
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+
+def g(srcfile, outputdir, intervals):
+    """Compare sentiment samples with two-sample KS tests and plot the results.
+
+    The data from ``readavgsentsingle(srcfile)`` is indexed as
+    ``data[group][interval]``. Groups 1-5 are each tested against group 0,
+    one test per interval; an interval where either sample is empty yields
+    ``float("nan")`` instead of a test result. The p-values go to
+    ``ks_averagesentiments_pval.png`` and the statistics to
+    ``ks_averagesentiments_stat.png``, both below ``outputdir``.
+
+    NOTE(review): group 0 is presumably the "most posters" group named in the
+    plot labels - confirm against readavgsentsingle.
+
+    :param srcfile: path passed to readavgsentsingle
+    :param outputdir: directory prefix for the two PNG files
+    :param intervals: sequence whose items' first element is the x value;
+        presumably one item per interval in the data - confirm with caller
+    """
+    print("ks global")
+    avgss2 = readavgsentsingle(srcfile)
+    reference = avgss2[0]
+
+    single = []
+    for i in range(1, 6):
+        group = avgss2[i]
+        single.append([
+            ks_2samp(ref, group[j]) if len(group[j]) > 0 and len(ref) > 0 else float("nan")
+            for j, ref in enumerate(reference)
+        ])
+
+    _plot_ks(single, intervals, "pvalue", outputdir + "/ks_averagesentiments_pval.png")
+    _plot_ks(single, intervals, "statistic", outputdir + "/ks_averagesentiments_stat.png")
+
+
 def plotbypost(onlyfiles, outputdir, intervl):
     print("plotbypost")
     files = defaultdict(list)
diff --git a/summary b/summary
index f42660f..9a74ca1 100644
--- a/summary
+++ b/summary
@@ -9,7 +9,7 @@ Data: The data sets are aquired from archive.org [https://archive.org/download/s
 - math.stackexchange.com (kaputt timeout)
 - mathoverflow.net
 - serverfault.com
-- stats.stackexchange.com (kaputt analyse_batch letzter plot, 42, 37 datapoints)
+- stats.stackexchange.com
 - stackoverflow.com (not yet)
 - superuser.com
 - tex.stackexchange.com