diff --git a/analyze_batch.py b/analyze_batch.py index d790de7..a286d33 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -28,11 +28,7 @@ def main(folder): postcounts = range(1, 5 + 1) for (option_date_from, option_date_to) in intervals: - # filter users by option_date_from <= creation date <= option_date_to - # newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults() - # newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults()) - - # get questions for filtered users + # get questions for option_date_from <= creation date < option_date_to newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults() if len(newposts) == 0: continue @@ -50,7 +46,8 @@ def main(folder): outfolder = folder + "/output/batch/" os.system("mkdir -p " + outfolder) - goutfilename = outfolder + "batch_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") for option_posts in postcounts: # print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts)) @@ -80,17 +77,16 @@ def main(folder): printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts))) if (i + 1) == len(newposts): printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts))) - userid = post['OwnerUserId'] for a in post['Answers']: if a['Id'] in cachedsentiments.keys(): toxlevel = cachedsentiments[a['Id']] else: toxlevel = computeToxLevel(a['Body']) cachedsentiments[a['Id']] = toxlevel - toxlevels[userid].append(toxlevel) + toxlevels[post['Id']].append(toxlevel) rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms") - outfilename = goutfilename + "_" + str(option_posts) + outfilename = goutfilenamenewusers + "_" + str(option_posts) dumptoxlevels(toxlevels, outfilename + ".py") neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())] @@ -137,7 +133,7 @@ def main(folder): gaxs[0, 1].set_yscale('log') gaxs[1, 1].set_yscale('log') gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) - gfig.savefig(goutfilename + ".png", bbox_inches='tight') + gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight') plt.close(gfig) diff --git a/mt.py b/mt.py index 02d75e0..58ec26e 100644 --- a/mt.py +++ b/mt.py @@ -1,5 +1,6 @@ from threading import Thread, Lock import time +from math import ceil class mt(): @@ -19,6 +20,7 @@ class mt(): self.__threads = [] self.__lock = Lock() self.__results = [] + self.__progress = 0 for i in range(self.__threadcount): self.__results.append([]) self.__threads.append(None) @@ -29,6 +31,7 @@ class mt(): if self.__running: self.join() self.__data = self.getresults() + self.__progress = 0 self.__running = True self.__final = self.__getresultsmapfilter self.__type = "filter" @@ -45,7 +48,17 @@ class mt(): def __dofilter(self, i, list, cond): now = self.__cms() - results = [l for l in list if cond(l)] + + results = [] + for j in range(ceil(len(list) / 1000)): + part = list[j * 1000: min((j + 1) * 1000, len(list))] + results += [l for l in part if cond(l)] + with self.__lock: + self.__progress += len(part) + if self.__comment is not None: + print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) + + # results = [l for l in list if cond(l)] with self.__lock: self.__results[i] = results dur = self.__cms() - now @@ -58,12 +71,13 @@ class mt(): if self.__running: self.join() self.__data = self.getresults() + self.__progress = 0 self.__running = True self.__final = self.__getresultsmapfilter self.__type = "map" self.__comment = comment if comment is not None else "" if comment is not None: - print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) + print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True) self.__starttime = self.__cms() self.__endtime = None for i in range(self.__threadcount): @@ -74,7 +88,16 @@ class mt(): def __domap(self, i, list, func): now = self.__cms() - results = [func(l) for l in list] + results = [] + for j in range(ceil(len(list) / 1000)): + part = list[j * 1000: min((j + 1) * 1000, len(list))] + results += [func(l) for l in part] + with self.__lock: + self.__progress += len(part) + if self.__comment is not None: + print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) + + # results = [func(l) for l in list] with self.__lock: self.__results[i] = results dur = self.__cms() - now @@ -87,6 +110,7 @@ class mt(): if self.__running: self.join() self.__data = self.getresults() + self.__progress = 0 self.__running = True self.__final = lambda: self.__getresultsreduce(aggregator, initval) self.__type = "reduce" @@ -104,8 +128,18 @@ class mt(): def __doreduce(self, i, list, reducer, initval): now = self.__cms() val = initval() - for j in range(len(list)): - val = reducer(val, list[j]) + + for j in range(ceil(len(list) / 1000)): + part = list[j * 1000: min((j + 1) * 1000, len(list))] + for k in range(len(part)): + val = reducer(val, part[k]) + with self.__lock: + self.__progress += len(part) + if self.__comment is not None: + print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True) + + # for j in range(len(list)): + # val = reducer(val, list[j]) with self.__lock: self.__results[i] = val dur = self.__cms() - now @@ -136,13 +170,15 @@ class mt(): self.__threads[i].join() self.__threads[i] = None if self.__endtime is None: - self.__endtime = self.__cms(); + self.__endtime = self.__cms() if self.__comment is not None: dur = self.__endtime - self.__starttime if self.__verbose: - print(self.__comment + ": #" + str(len(self.__data)) + (" -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms") + print(self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + ( + " -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms") else: - print("\r" + self.__comment + ": #" + str(len(self.__data)) + (" -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms") + print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + ( + " -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms") return self def close(self): diff --git a/sentiments.py b/sentiments.py new file mode 100644 index 0000000..1115de3 --- /dev/null +++ b/sentiments.py @@ -0,0 +1,76 @@ +import os +import sys +from collections import defaultdict + +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer + +from loader import load, cms, dmt + +printnoln = lambda text: print(text, end='', flush=True) +rprint = lambda text: print('\r' + text) + +DAYS_NEW_USER = 7 +OLD_USER_YEAR = 3 + +analyser = SentimentIntensityAnalyzer() +colors = ['red', 'green', 'blue', 'orange', 'deeppink'] + + +def main(folder): + users, posts, firstcontrib, sumcontrib = load(folder) + + outfolder = folder + "/output/" + os.system("mkdir -p " + outfolder) + outfilename = outfolder + "sentiments" + + # computer toxic levels + # start = cms() + # printnoln("computing toxic levels: filtering") + + # toxlevels = defaultdict(list) + # for (i, post) in enumerate(posts): + # if (i + 1) % 100 == 0: + # printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts))) + # if (i + 1) == len(posts): + # printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts))) + # for a in post['Answers']: + # toxlevel = computeToxLevel(a['Body']) + # toxlevels[post['Id']].append(toxlevel) + # rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms") + toxlevels = dmt(posts).map(lambda p: (p['Id'], computeSentimentForPost(p)), "calculating sentiments").getresults() + toxlevels = {id: p for (id, p) in toxlevels} + + dumptoxlevels(toxlevels, outfilename + ".py") + + +def computeSentimentForPost(post): + anwsers = {a['Id']: computeToxLevel(a['Body']) for a in post['Answers']} + return anwsers + + +def computeToxLevel(text): + return analyser.polarity_scores(text) + + +def dumptoxlevels(lvls, filename): + answers = dict() + for p in lvls.values(): + for id, a in p.items(): + answers[id] = a + with open(filename, "w") as file: + file.write("posts = " + str(lvls) + "\n") + file.write("answers = " + str(answers) + "\n") + + +if __name__ == "__main__": + # execute only if run as a script + usage = sys.argv[0] + " " + if len(sys.argv) < 2: + print(usage) + sys.exit(1) + folder = sys.argv[1] + if not os.path.isdir(folder): + print(folder + " is not a folder") + sys.exit(1) + + main(folder)