diff --git a/analyze_batch.py b/analyze_batch.py index 9f8cdf6..d790de7 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -29,11 +29,11 @@ def main(folder): postcounts = range(1, 5 + 1) for (option_date_from, option_date_to) in intervals: # filter users by option_date_from <= creation date <= option_date_to - newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults() - newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults()) + # newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults() + # newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults()) # get questions for filtered users - newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newuserids, "filter posts by selected users").getresults() + newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults() if len(newposts) == 0: continue print("computing toxic levels: " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) @@ -48,7 +48,8 @@ def main(folder): gpos = [] gcom = [] - outfolder = "output/batch/" + folder.split("/")[-1] + "/" + outfolder = folder + "/output/batch/" + os.system("mkdir -p " + outfolder) goutfilename = outfolder + "batch_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") for option_posts in postcounts: @@ -90,7 +91,6 @@ def main(folder): rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms") outfilename = goutfilename + "_" + str(option_posts) - os.system("mkdir -p " + outfolder) dumptoxlevels(toxlevels, outfilename + ".py") neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())] @@ -118,7 +118,7 @@ def main(folder): axs[1, 1].set_yscale('log') # plt.show() - fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts\nUsers registered between " + fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) fig.savefig(outfilename + ".png", bbox_inches='tight') plt.close(fig) @@ -136,7 +136,7 @@ def main(folder): gaxs[1, 0].set_yscale('log') gaxs[0, 1].set_yscale('log') gaxs[1, 1].set_yscale('log') - gfig.suptitle("Sentiment of answers to the first X (max) posts\nUsers registered between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) gfig.savefig(goutfilename + ".png", bbox_inches='tight') plt.close(gfig) diff --git a/calctoxdiff.py b/calctoxdiff.py index ee75ead..eea7047 100644 --- a/calctoxdiff.py +++ b/calctoxdiff.py @@ -13,20 +13,18 @@ colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'} def main(folder): - if folder.endswith("/"): - folder = folder[:-1] - onlyfiles = [f for f in listdir(folder)] - onlyfiles = [f for f in onlyfiles if isfile(join(folder, f))] - onlyfiles = [f for f in onlyfiles if f.endswith(".py")] - # onlyfiles = [f[:-3] for f in onlyfiles] - # onlyfiles = [f.replace(".", "\.") for f in onlyfiles] + outputdir = folder + "/output/ksbatch/" + os.system("mkdir -p " + outputdir) + folder = folder + "/output/batch/" + onlyfiles = [folder + f for f in listdir(folder) if isfile(join(folder, f)) and f.endswith(".py")] onlyfiles = sorted(onlyfiles) - plotbypost(onlyfiles) - plotbydate(onlyfiles) + + plotbypost(onlyfiles, outputdir) + plotbydate(onlyfiles, outputdir) -def plotbypost(onlyfiles): +def plotbypost(onlyfiles, outputdir): files = defaultdict(list) for f in onlyfiles: s = f[:-3].split("_") @@ -43,8 +41,8 @@ def plotbypost(onlyfiles): continue print(p) for i in range(len(l) - 1): - tox1 = imprt(folder + "/" + l[i]).toxlevels - tox2 = imprt(folder + "/" + l[i + 1]).toxlevels + tox1 = imprt(l[i]).toxlevels + tox2 = imprt(l[i + 1]).toxlevels neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] @@ -67,7 +65,7 @@ def plotbypost(onlyfiles): changes_com[p].append(kscom) for (p, l) in files.items(): - with open(folder + "/ks_" + str(p) + ".log", "w") as f: + with open(outputdir + "/ks_post_" + str(p) + ".log", "w") as f: for i in range(len(l) - 1): f1 = l[i] f2 = l[i + 1] @@ -88,7 +86,7 @@ def plotbypost(onlyfiles): plt.title("KS 2-sided test with max " + str(p) + " posts") plt.xticks(rotation=90) plt.legend(loc="upper right") - plt.savefig(folder + "/ks_pval_" + str(p) + ".png", bbox_inches='tight') + plt.savefig(outputdir + "/ks_post_pval_" + str(p) + ".png", bbox_inches='tight') plt.close(fig) # stat @@ -106,11 +104,11 @@ def plotbypost(onlyfiles): plt.title("KS 2-sided test with max " + str(p) + " posts") plt.xticks(rotation=90) plt.legend(loc="upper right") - plt.savefig(folder + "/ks_stat_" + str(p) + ".png", bbox_inches='tight') + plt.savefig(outputdir + "/ks_post_stat_" + str(p) + ".png", bbox_inches='tight') plt.close(fig) -def plotbydate(onlyfiles): +def plotbydate(onlyfiles, outputdir): files = defaultdict(list) for f in onlyfiles: s = f[:-3].split("_") @@ -127,8 +125,8 @@ def plotbydate(onlyfiles): continue print(d) for i in range(len(l) - 1): - tox1 = imprt(folder + "/" + l[i]).toxlevels - tox2 = imprt(folder + "/" + l[i + 1]).toxlevels + tox1 = imprt(l[i]).toxlevels + tox2 = imprt(l[i + 1]).toxlevels neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] @@ -151,7 +149,7 @@ def plotbydate(onlyfiles): changes_com[d].append(kscom) for (d, l) in files.items(): - with open(folder + "/ks_" + d[0] + "_" + d[1] + ".log", "w") as f: + with open(outputdir + "/ks_date_" + d[0] + "_" + d[1] + ".log", "w") as f: for i in range(len(l) - 1): f1 = l[i] f2 = l[i + 1] @@ -173,7 +171,7 @@ def plotbydate(onlyfiles): plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) plt.xticks(rotation=90) plt.legend(loc="upper right") - plt.savefig(folder + "/ks_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') + plt.savefig(outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') plt.close(fig) # stat @@ -191,7 +189,7 @@ def plotbydate(onlyfiles): plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) plt.xticks(rotation=90) plt.legend(loc="upper right") - plt.savefig(folder + "/ks_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') + plt.savefig(outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') plt.close(fig) diff --git a/posthist.py b/posthist.py index 03091be..5abfefb 100644 --- a/posthist.py +++ b/posthist.py @@ -28,8 +28,8 @@ def main(folder): print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))) # filter posts by option_date_from <= creation date <= option_date_to - newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults()) - newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newusers, "filtering posts by users").getresults() + # newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults()) + newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults() postcounts = defaultdict(list) i = 0 @@ -39,16 +39,9 @@ def main(folder): postcounts = {id: len(pc) for (id, pc) in postcounts.items()} # print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()]))) - os.system("mkdir -p " + folder + "/output") - histfilename = folder + "/output/posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") - countfilename = folder + "/output/postcount_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") - - # fig = plt.figure(figsize=(16, 12)) - # plt.plot(userids, [len(pc) for pc in postcounts]) - # plt.title("Post count for users between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y")) - # plt.xticks(rotation=90) - # fig.savefig(countfilename + ".png", bbox_inches='tight') - # plt.close(fig) + outputdir = folder + "/output/posthist/" + os.system("mkdir -p " + outputdir) + histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") histdata = [pc for pc in postcounts.values()] fig = plt.figure(figsize=(16, 12))