wip
This commit is contained in:
@@ -28,11 +28,7 @@ def main(folder):
|
|||||||
|
|
||||||
postcounts = range(1, 5 + 1)
|
postcounts = range(1, 5 + 1)
|
||||||
for (option_date_from, option_date_to) in intervals:
|
for (option_date_from, option_date_to) in intervals:
|
||||||
# filter users by option_date_from <= creation date <= option_date_to
|
# get questions for option_date_from <= creation date < option_date_to
|
||||||
# newusers = dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").getresults()
|
|
||||||
# newuserids = set(dmt(newusers).map(lambda u: u['Id'], "get user id list").getresults())
|
|
||||||
|
|
||||||
# get questions for filtered users
|
|
||||||
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
|
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
|
||||||
if len(newposts) == 0:
|
if len(newposts) == 0:
|
||||||
continue
|
continue
|
||||||
@@ -50,7 +46,8 @@ def main(folder):
|
|||||||
|
|
||||||
outfolder = folder + "/output/batch/"
|
outfolder = folder + "/output/batch/"
|
||||||
os.system("mkdir -p " + outfolder)
|
os.system("mkdir -p " + outfolder)
|
||||||
goutfilename = outfolder + "batch_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
||||||
|
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
|
||||||
|
|
||||||
for option_posts in postcounts:
|
for option_posts in postcounts:
|
||||||
# print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts))
|
# print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts))
|
||||||
@@ -80,17 +77,16 @@ def main(folder):
|
|||||||
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
||||||
if (i + 1) == len(newposts):
|
if (i + 1) == len(newposts):
|
||||||
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(filteredposts)))
|
||||||
userid = post['OwnerUserId']
|
|
||||||
for a in post['Answers']:
|
for a in post['Answers']:
|
||||||
if a['Id'] in cachedsentiments.keys():
|
if a['Id'] in cachedsentiments.keys():
|
||||||
toxlevel = cachedsentiments[a['Id']]
|
toxlevel = cachedsentiments[a['Id']]
|
||||||
else:
|
else:
|
||||||
toxlevel = computeToxLevel(a['Body'])
|
toxlevel = computeToxLevel(a['Body'])
|
||||||
cachedsentiments[a['Id']] = toxlevel
|
cachedsentiments[a['Id']] = toxlevel
|
||||||
toxlevels[userid].append(toxlevel)
|
toxlevels[post['Id']].append(toxlevel)
|
||||||
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
|
rprint("computing toxic levels: post #" + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... took " + str(cms() - start) + "ms")
|
||||||
|
|
||||||
outfilename = goutfilename + "_" + str(option_posts)
|
outfilename = goutfilenamenewusers + "_" + str(option_posts)
|
||||||
dumptoxlevels(toxlevels, outfilename + ".py")
|
dumptoxlevels(toxlevels, outfilename + ".py")
|
||||||
|
|
||||||
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())]
|
neglevelsflat = [item['neg'] for item in flatmap(toxlevels.values())]
|
||||||
@@ -137,7 +133,7 @@ def main(folder):
|
|||||||
gaxs[0, 1].set_yscale('log')
|
gaxs[0, 1].set_yscale('log')
|
||||||
gaxs[1, 1].set_yscale('log')
|
gaxs[1, 1].set_yscale('log')
|
||||||
gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
gfig.suptitle("Sentiment of answers to the first X (max) posts within 1 week of 1st contribution\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
|
||||||
gfig.savefig(goutfilename + ".png", bbox_inches='tight')
|
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
|
||||||
plt.close(gfig)
|
plt.close(gfig)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
52
mt.py
52
mt.py
@@ -1,5 +1,6 @@
|
|||||||
from threading import Thread, Lock
|
from threading import Thread, Lock
|
||||||
import time
|
import time
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
|
|
||||||
class mt():
|
class mt():
|
||||||
@@ -19,6 +20,7 @@ class mt():
|
|||||||
self.__threads = []
|
self.__threads = []
|
||||||
self.__lock = Lock()
|
self.__lock = Lock()
|
||||||
self.__results = []
|
self.__results = []
|
||||||
|
self.__progress = 0
|
||||||
for i in range(self.__threadcount):
|
for i in range(self.__threadcount):
|
||||||
self.__results.append([])
|
self.__results.append([])
|
||||||
self.__threads.append(None)
|
self.__threads.append(None)
|
||||||
@@ -29,6 +31,7 @@ class mt():
|
|||||||
if self.__running:
|
if self.__running:
|
||||||
self.join()
|
self.join()
|
||||||
self.__data = self.getresults()
|
self.__data = self.getresults()
|
||||||
|
self.__progress = 0
|
||||||
self.__running = True
|
self.__running = True
|
||||||
self.__final = self.__getresultsmapfilter
|
self.__final = self.__getresultsmapfilter
|
||||||
self.__type = "filter"
|
self.__type = "filter"
|
||||||
@@ -45,7 +48,17 @@ class mt():
|
|||||||
|
|
||||||
def __dofilter(self, i, list, cond):
|
def __dofilter(self, i, list, cond):
|
||||||
now = self.__cms()
|
now = self.__cms()
|
||||||
results = [l for l in list if cond(l)]
|
|
||||||
|
results = []
|
||||||
|
for j in range(ceil(len(list) / 1000)):
|
||||||
|
part = list[j * 1000: min((j + 1) * 1000, len(list))]
|
||||||
|
results += [l for l in part if cond(l)]
|
||||||
|
with self.__lock:
|
||||||
|
self.__progress += len(part)
|
||||||
|
if self.__comment is not None:
|
||||||
|
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
|
||||||
|
|
||||||
|
# results = [l for l in list if cond(l)]
|
||||||
with self.__lock:
|
with self.__lock:
|
||||||
self.__results[i] = results
|
self.__results[i] = results
|
||||||
dur = self.__cms() - now
|
dur = self.__cms() - now
|
||||||
@@ -58,12 +71,13 @@ class mt():
|
|||||||
if self.__running:
|
if self.__running:
|
||||||
self.join()
|
self.join()
|
||||||
self.__data = self.getresults()
|
self.__data = self.getresults()
|
||||||
|
self.__progress = 0
|
||||||
self.__running = True
|
self.__running = True
|
||||||
self.__final = self.__getresultsmapfilter
|
self.__final = self.__getresultsmapfilter
|
||||||
self.__type = "map"
|
self.__type = "map"
|
||||||
self.__comment = comment if comment is not None else ""
|
self.__comment = comment if comment is not None else ""
|
||||||
if comment is not None:
|
if comment is not None:
|
||||||
print(self.__comment + ": #" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
print(self.__comment + ": 0/" + str(len(self.__data)) + " ...", end='\n' if self.__verbose else '', flush=True)
|
||||||
self.__starttime = self.__cms()
|
self.__starttime = self.__cms()
|
||||||
self.__endtime = None
|
self.__endtime = None
|
||||||
for i in range(self.__threadcount):
|
for i in range(self.__threadcount):
|
||||||
@@ -74,7 +88,16 @@ class mt():
|
|||||||
|
|
||||||
def __domap(self, i, list, func):
|
def __domap(self, i, list, func):
|
||||||
now = self.__cms()
|
now = self.__cms()
|
||||||
results = [func(l) for l in list]
|
results = []
|
||||||
|
for j in range(ceil(len(list) / 1000)):
|
||||||
|
part = list[j * 1000: min((j + 1) * 1000, len(list))]
|
||||||
|
results += [func(l) for l in part]
|
||||||
|
with self.__lock:
|
||||||
|
self.__progress += len(part)
|
||||||
|
if self.__comment is not None:
|
||||||
|
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
|
||||||
|
|
||||||
|
# results = [func(l) for l in list]
|
||||||
with self.__lock:
|
with self.__lock:
|
||||||
self.__results[i] = results
|
self.__results[i] = results
|
||||||
dur = self.__cms() - now
|
dur = self.__cms() - now
|
||||||
@@ -87,6 +110,7 @@ class mt():
|
|||||||
if self.__running:
|
if self.__running:
|
||||||
self.join()
|
self.join()
|
||||||
self.__data = self.getresults()
|
self.__data = self.getresults()
|
||||||
|
self.__progress = 0
|
||||||
self.__running = True
|
self.__running = True
|
||||||
self.__final = lambda: self.__getresultsreduce(aggregator, initval)
|
self.__final = lambda: self.__getresultsreduce(aggregator, initval)
|
||||||
self.__type = "reduce"
|
self.__type = "reduce"
|
||||||
@@ -104,8 +128,18 @@ class mt():
|
|||||||
def __doreduce(self, i, list, reducer, initval):
|
def __doreduce(self, i, list, reducer, initval):
|
||||||
now = self.__cms()
|
now = self.__cms()
|
||||||
val = initval()
|
val = initval()
|
||||||
for j in range(len(list)):
|
|
||||||
val = reducer(val, list[j])
|
for j in range(ceil(len(list) / 1000)):
|
||||||
|
part = list[j * 1000: min((j + 1) * 1000, len(list))]
|
||||||
|
for k in range(len(part)):
|
||||||
|
val = reducer(val, part[k])
|
||||||
|
with self.__lock:
|
||||||
|
self.__progress += len(part)
|
||||||
|
if self.__comment is not None:
|
||||||
|
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + " ...", end='', flush=True)
|
||||||
|
|
||||||
|
# for j in range(len(list)):
|
||||||
|
# val = reducer(val, list[j])
|
||||||
with self.__lock:
|
with self.__lock:
|
||||||
self.__results[i] = val
|
self.__results[i] = val
|
||||||
dur = self.__cms() - now
|
dur = self.__cms() - now
|
||||||
@@ -136,13 +170,15 @@ class mt():
|
|||||||
self.__threads[i].join()
|
self.__threads[i].join()
|
||||||
self.__threads[i] = None
|
self.__threads[i] = None
|
||||||
if self.__endtime is None:
|
if self.__endtime is None:
|
||||||
self.__endtime = self.__cms();
|
self.__endtime = self.__cms()
|
||||||
if self.__comment is not None:
|
if self.__comment is not None:
|
||||||
dur = self.__endtime - self.__starttime
|
dur = self.__endtime - self.__starttime
|
||||||
if self.__verbose:
|
if self.__verbose:
|
||||||
print(self.__comment + ": #" + str(len(self.__data)) + (" -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms")
|
print(self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + (
|
||||||
|
" -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms")
|
||||||
else:
|
else:
|
||||||
print("\r" + self.__comment + ": #" + str(len(self.__data)) + (" -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms")
|
print("\r" + self.__comment + ": " + str(self.__progress) + "/" + str(len(self.__data)) + (
|
||||||
|
" -> #" + str(sum([len(l) for l in self.__results])) if self.__type == "filter" else "") + " ... took " + str(dur) + "ms")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
|
|||||||
76
sentiments.py
Normal file
76
sentiments.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||||
|
|
||||||
|
from loader import load, cms, dmt
|
||||||
|
|
||||||
|
def printnoln(text):
    """Print ``text`` without a trailing newline, flushing stdout right away."""
    print(text, end='', flush=True)


def rprint(text):
    """Print ``text`` preceded by a carriage return."""
    print('\r' + text)


# Thresholds, presumably in days and years respectively (going by their
# names); neither is referenced by the code visible in this file.
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3

# One shared analyser instance, used by computeToxLevel().
analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
|
||||||
|
|
||||||
|
|
||||||
|
def main(folder):
    """Compute sentiment scores for the answers of every post and dump them.

    Loads the dataset from ``folder``, maps every post to a
    ``(post id, {answer id: scores})`` pair via ``dmt(posts).map`` and
    ``computeSentimentForPost``, and writes the collected result to
    ``<folder>/output/sentiments.py`` with ``dumptoxlevels``.

    Args:
        folder: Path of the dataset folder passed to ``load``; the output
            directory is created beneath it.

    Raises:
        OSError: If the output directory cannot be created.
    """
    # Only the posts are used here; the other values returned by load()
    # are ignored.
    _users, posts, _firstcontrib, _sumcontrib = load(folder)

    outfolder = folder + "/output/"
    # os.makedirs instead of shelling out to "mkdir -p": no shell parsing of
    # the (command-line supplied) path, works without a mkdir binary, and a
    # failure raises instead of being silently ignored.
    os.makedirs(outfolder, exist_ok=True)
    outfilename = outfolder + "sentiments"

    # Compute the sentiment levels of all answers, one post per map call.
    scored = dmt(posts).map(lambda p: (p['Id'], computeSentimentForPost(p)), "calculating sentiments").getresults()
    # Each element is a (post id, per-answer scores) pair.
    toxlevels = dict(scored)

    dumptoxlevels(toxlevels, outfilename + ".py")
|
||||||
|
|
||||||
|
|
||||||
|
def computeSentimentForPost(post):
    """Score every answer attached to ``post``.

    Returns a dict keyed by each answer's ``'Id'`` whose value is the result
    of ``computeToxLevel`` applied to that answer's ``'Body'``.
    """
    scores = {}
    for answer in post['Answers']:
        scores[answer['Id']] = computeToxLevel(answer['Body'])
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
def computeToxLevel(text):
    """Return the module-level analyser's polarity scores for ``text``."""
    scores = analyser.polarity_scores(text)
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
def dumptoxlevels(lvls, filename):
    """Write the sentiment levels to ``filename`` as two Python assignments.

    The file receives a ``posts = ...`` line holding ``str(lvls)`` and an
    ``answers = ...`` line holding one flat dict that merges the per-post
    dicts found in ``lvls.values()``.

    Args:
        lvls: Mapping whose values are mappings of answer id to scores.
        filename: Path of the file to create or overwrite.
    """
    answers = {}
    for post_answers in lvls.values():
        # On duplicate answer ids the later post wins, exactly as an
        # item-by-item copy would behave.
        answers.update(post_answers)

    # The output is a Python source file, which Python reads as UTF-8 by
    # default, so write it as UTF-8 regardless of the platform's locale.
    with open(filename, "w", encoding="utf-8") as out:
        out.write("posts = " + str(lvls) + "\n")
        out.write("answers = " + str(answers) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument must name an
    # existing folder, which is then handed to main().
    usage = sys.argv[0] + " <folder>"
    has_folder_arg = len(sys.argv) >= 2
    if not has_folder_arg:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if os.path.isdir(folder):
        main(folder)
    else:
        print(folder + " is not a folder")
        sys.exit(1)
|
||||||
Reference in New Issue
Block a user