This commit is contained in:
wea_ondara
2019-07-18 13:58:27 +02:00
parent bca211551c
commit 3d4b1f26ba
3 changed files with 31 additions and 40 deletions

View File

@@ -29,11 +29,11 @@ def main(folder):
postcounts = range(1, 5 + 1) postcounts = range(1, 5 + 1)
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
# filter users by option_date_from <= creation date < option_date_to # filter users by option_date_from <= creation date < option_date_to
newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults() # newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults()
newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults()) # newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults())
# get questions for filtered users # get questions for filtered users
newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newuserids, "filter posts by selected users").getresults() newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
if len(newposts) == 0: if len(newposts) == 0:
continue continue
print("computing toxic levels: " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) print("computing toxic levels: " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
@@ -48,7 +48,8 @@ def main(folder):
gpos = [] gpos = []
gcom = [] gcom = []
outfolder = "output/batch/" + folder.split("/")[-1] + "/" outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
goutfilename = outfolder + "batch_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") goutfilename = outfolder + "batch_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
for option_posts in postcounts: for option_posts in postcounts:
@@ -90,7 +91,6 @@ def main(folder):
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms") rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
outfilename = goutfilename + "_" + str(option_posts) outfilename = goutfilename + "_" + str(option_posts)
os.system("mkdir -p " + outfolder)
dumptoxlevels(toxlevels, outfilename + ".py") dumptoxlevels(toxlevels, outfilename + ".py")
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())] neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())]
@@ -118,7 +118,7 @@ def main(folder):
axs[1, 1].set_yscale('log') axs[1, 1].set_yscale('log')
# plt.show() # plt.show()
fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts\nUsers registered between " fig.suptitle("Sentiment of answers to the first " + str(option_posts) + " (max) posts within 1 week of 1st contribution\nPosts created between "
+ option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
fig.savefig(outfilename + ".png", bbox_inches='tight') fig.savefig(outfilename + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
@@ -136,7 +136,7 @@ def main(folder):
gaxs[1, 0].set_yscale('log') gaxs[1, 0].set_yscale('log')
gaxs[0, 1].set_yscale('log') gaxs[0, 1].set_yscale('log')
gaxs[1, 1].set_yscale('log') gaxs[1, 1].set_yscale('log')
gfig.suptitle("Sentiment of answers to the first X (max) posts\nUsers registered between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
gfig.savefig(goutfilename + ".png", bbox_inches='tight') gfig.savefig(goutfilename + ".png", bbox_inches='tight')
plt.close(gfig) plt.close(gfig)

View File

@@ -13,20 +13,18 @@ colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
def main(folder): def main(folder):
if folder.endswith("/"): outputdir = folder + "/output/ksbatch/"
folder = folder[:-1] os.system("mkdir -p " + outputdir)
onlyfiles = [f for f in listdir(folder)] folder = folder + "/output/batch/"
onlyfiles = [f for f in onlyfiles if isfile(join(folder, f))]
onlyfiles = [f for f in onlyfiles if f.endswith(".py")]
# onlyfiles = [f[:-3] for f in onlyfiles]
# onlyfiles = [f.replace(".", "\.") for f in onlyfiles]
onlyfiles = [folder + f for f in listdir(folder) if isfile(join(folder, f)) and f.endswith(".py")]
onlyfiles = sorted(onlyfiles) onlyfiles = sorted(onlyfiles)
plotbypost(onlyfiles)
plotbydate(onlyfiles) plotbypost(onlyfiles, outputdir)
plotbydate(onlyfiles, outputdir)
def plotbypost(onlyfiles): def plotbypost(onlyfiles, outputdir):
files = defaultdict(list) files = defaultdict(list)
for f in onlyfiles: for f in onlyfiles:
s = f[:-3].split("_") s = f[:-3].split("_")
@@ -43,8 +41,8 @@ def plotbypost(onlyfiles):
continue continue
print(p) print(p)
for i in range(len(l) - 1): for i in range(len(l) - 1):
tox1 = imprt(folder + "/" + l[i]).toxlevels tox1 = imprt(l[i]).toxlevels
tox2 = imprt(folder + "/" + l[i + 1]).toxlevels tox2 = imprt(l[i + 1]).toxlevels
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())]
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())]
@@ -67,7 +65,7 @@ def plotbypost(onlyfiles):
changes_com[p].append(kscom) changes_com[p].append(kscom)
for (p, l) in files.items(): for (p, l) in files.items():
with open(folder + "/ks_" + str(p) + ".log", "w") as f: with open(outputdir + "/ks_post_" + str(p) + ".log", "w") as f:
for i in range(len(l) - 1): for i in range(len(l) - 1):
f1 = l[i] f1 = l[i]
f2 = l[i + 1] f2 = l[i + 1]
@@ -88,7 +86,7 @@ def plotbypost(onlyfiles):
plt.title("KS 2-sided test with max " + str(p) + " posts") plt.title("KS 2-sided test with max " + str(p) + " posts")
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.legend(loc="upper right") plt.legend(loc="upper right")
plt.savefig(folder + "/ks_pval_" + str(p) + ".png", bbox_inches='tight') plt.savefig(outputdir + "/ks_post_pval_" + str(p) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
# stat # stat
@@ -106,11 +104,11 @@ def plotbypost(onlyfiles):
plt.title("KS 2-sided test with max " + str(p) + " posts") plt.title("KS 2-sided test with max " + str(p) + " posts")
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.legend(loc="upper right") plt.legend(loc="upper right")
plt.savefig(folder + "/ks_stat_" + str(p) + ".png", bbox_inches='tight') plt.savefig(outputdir + "/ks_post_stat_" + str(p) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
def plotbydate(onlyfiles): def plotbydate(onlyfiles, outputdir):
files = defaultdict(list) files = defaultdict(list)
for f in onlyfiles: for f in onlyfiles:
s = f[:-3].split("_") s = f[:-3].split("_")
@@ -127,8 +125,8 @@ def plotbydate(onlyfiles):
continue continue
print(d) print(d)
for i in range(len(l) - 1): for i in range(len(l) - 1):
tox1 = imprt(folder + "/" + l[i]).toxlevels tox1 = imprt(l[i]).toxlevels
tox2 = imprt(folder + "/" + l[i + 1]).toxlevels tox2 = imprt(l[i + 1]).toxlevels
neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())] neglevelsflat1 = [item['neg'] for item in flatmap(tox1.values())]
neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())] neulevelsflat1 = [item['neu'] for item in flatmap(tox1.values())]
@@ -151,7 +149,7 @@ def plotbydate(onlyfiles):
changes_com[d].append(kscom) changes_com[d].append(kscom)
for (d, l) in files.items(): for (d, l) in files.items():
with open(folder + "/ks_" + d[0] + "_" + d[1] + ".log", "w") as f: with open(outputdir + "/ks_date_" + d[0] + "_" + d[1] + ".log", "w") as f:
for i in range(len(l) - 1): for i in range(len(l) - 1):
f1 = l[i] f1 = l[i]
f2 = l[i + 1] f2 = l[i + 1]
@@ -173,7 +171,7 @@ def plotbydate(onlyfiles):
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.legend(loc="upper right") plt.legend(loc="upper right")
plt.savefig(folder + "/ks_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') plt.savefig(outputdir + "/ks_date_pval_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
# stat # stat
@@ -191,7 +189,7 @@ def plotbydate(onlyfiles):
plt.title("KS 2-sided test with between " + d[0] + " and " + d[1]) plt.title("KS 2-sided test with between " + d[0] + " and " + d[1])
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.legend(loc="upper right") plt.legend(loc="upper right")
plt.savefig(folder + "/ks_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight') plt.savefig(outputdir + "/ks_date_stat_" + d[0] + "_" + d[1] + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)

View File

@@ -28,8 +28,8 @@ def main(folder):
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))) print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
# filter posts by option_date_from <= creation date < option_date_to # filter posts by option_date_from <= creation date < option_date_to
newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults()) # newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults())
newposts = dmt(posts).filter(lambda p: p['OwnerUserId'] in newusers, "filtering posts by users").getresults() newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
postcounts = defaultdict(list) postcounts = defaultdict(list)
i = 0 i = 0
@@ -39,16 +39,9 @@ def main(folder):
postcounts = {id: len(pc) for (id, pc) in postcounts.items()} postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
# print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()]))) # print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()])))
os.system("mkdir -p " + folder + "/output") outputdir = folder + "/output/posthist/"
histfilename = folder + "/output/posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") os.system("mkdir -p " + outputdir)
countfilename = folder + "/output/postcount_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
# fig = plt.figure(figsize=(16, 12))
# plt.plot(userids, [len(pc) for pc in postcounts])
# plt.title("Post count for users between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y"))
# plt.xticks(rotation=90)
# fig.savefig(countfilename + ".png", bbox_inches='tight')
# plt.close(fig)
histdata = [pc for pc in postcounts.values()] histdata = [pc for pc in postcounts.values()]
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=(16, 12))