This commit is contained in:
wea_ondara
2019-08-11 16:47:52 +02:00
parent aacf71fad8
commit 0536f5db5f
5 changed files with 98 additions and 89 deletions

View File

@@ -7,34 +7,33 @@ from math import ceil
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from common import calc_intervals, imprt, FigSaver from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK
from loader import load, dmt, cms from loader import load, dmt, cms
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
OLD_USER_PERCENTILE = 0.95 OLD_USER_PERCENTILE = 0.95
analyser = SentimentIntensityAnalyzer()
figsaver = FigSaver()
colors = ['red', 'green', 'blue', 'orange', 'deeppink'] colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder): def main(folder, intervl):
users, posts, firstcontrib, sumcontrib = load(folder) users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts) intervals = calc_intervals(posts, intervl)
cachedsentiments = imprt(folder + "/output/sentiments.py").answers cachedsentiments = imprt(folder + "/output/sentiments.py").answers
outfolder = folder + "/output/batch/" outputdir = folder + "/output/batch/"
os.system("mkdir -p " + outfolder) os.system("mkdir -p " + outputdir)
postcounts = range(1, 5 + 1) postcounts = range(1, 5 + 1)
magickpost = {i: IMAGE_MAGICK for i in postcounts}
magickold = IMAGE_MAGICK
magickglobal = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
magickdate = IMAGE_MAGICK
# get questions for option_date_from <= creation date < option_date_to # get questions for option_date_from <= creation date < option_date_to
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults() newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
if len(newposts) == 0: if len(newposts) == 0:
@@ -51,8 +50,8 @@ def main(folder):
gpos = [] gpos = []
gcom = [] gcom = []
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") goutfilenamenewusers = outputdir + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") goutfilenameoldusers = outputdir + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
for option_posts in postcounts: for option_posts in postcounts:
# print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts)) # print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts))
@@ -122,6 +121,9 @@ def main(folder):
fig.savefig(outfilename + ".png", bbox_inches='tight') fig.savefig(outfilename + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms") rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
magickpost[option_posts] += " " + outfilename + ".png"
magickdate += " " + outfilename + ".png"
os.system(magickdate + " " + goutfilenamenewusers + ".pdf")
# global # global
start = cms() start = cms()
@@ -146,6 +148,7 @@ def main(folder):
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight') gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
plt.close(gfig) plt.close(gfig)
rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms") rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
magickglobal += " " + goutfilenamenewusers + ".png"
# for old users --------------------------------------------------------------------------------- # for old users ---------------------------------------------------------------------------------
start = cms() start = cms()
@@ -192,18 +195,16 @@ def main(folder):
# plt.show() # plt.show()
fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...") printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight') fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms") rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
magickold += " " + goutfilenameoldusers + ".png"
figsaver.join() os.system(magickglobal + " batch_newusers.pdf")
figsaver.join() os.system(magickold + " batch_oldusers.pdf")
for (i, cmd) in magickpost.items():
os.system(cmd + " " + "batch_newusers_" + i + ".pdf")
def computeToxLevel(text):
return analyser.polarity_scores(text)
def dumptoxlevels(lvls, filename): def dumptoxlevels(lvls, filename):
@@ -222,5 +223,20 @@ if __name__ == "__main__":
if not os.path.isdir(folder): if not os.path.isdir(folder):
print(folder + " is not a folder") print(folder + " is not a folder")
sys.exit(1) sys.exit(1)
interval = 3
if len(sys.argv) >= 3:
if sys.argv[2].startswith("-i"):
interval = sys.argv[2][2:]
try:
interval = int(interval)
except ValueError:
print("-i: int required")
sys.exit(1)
if interval < 1 or interval > 12:
print("-i: only 1 - 12")
sys.exit(1)
else:
print("unknown parameter: " + sys.argv[2])
sys.exit(1)
main(folder) main(folder, interval)

View File

@@ -9,10 +9,9 @@ import matplotlib.pyplot as plt
import numpy as np import numpy as np
from scipy.stats import ks_2samp from scipy.stats import ks_2samp
from common import imprt from common import imprt, IMAGE_MAGICK
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'} colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
IMAGE_MAGICK = "magick"
def main(folder): def main(folder):

View File

@@ -5,29 +5,36 @@ import matplotlib.pyplot as plt
from loader import dmt from loader import dmt
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
def calc_intervals(posts): DAYS_NEW_USER = 7
IMAGE_MAGICK = "magick"
def calc_intervals(posts, months=3):
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'], firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'],
"firstpost").getresults() "firstpost").getresults()
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults() lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
# calc quarter beginning # calc quarter beginning
firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if firstpost.month not in (1, 4, 7, 10): if (firstpost.month - 1) % months != 0:
firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month]) firstpost = firstpost.replace(month=firstpost.month - ((firstpost.month - 1) % months))
lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0) lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if lastpost.month not in (1, 4, 7, 10): if (lastpost.month - 1) % months != 0:
lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month]) lastpost = lastpost.replace(month=lastpost.month - ((lastpost.month - 1) % months))
# add 3 months to last post # add 3 months to last post
if lastpost.month == 10: if lastpost.month + months > 12:
lastpost = lastpost.replace(month=1, year=lastpost.year + 1) lastpost = lastpost.replace(month=lastpost.month + months - 12, year=lastpost.year + 1)
else: else:
lastpost = lastpost.replace(month=lastpost.month + 3) lastpost = lastpost.replace(month=lastpost.month + months)
cdate = firstpost cdate = firstpost
intervals = [] intervals = []
while cdate < lastpost: while cdate < lastpost:
nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1)) nextmon = cdate.month + months
nextquarter = cdate.replace(month=nextmon if nextmon <=12 else nextmon-12, year=cdate.year + (0 if nextmon <= 12 else 1))
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y")) print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
intervals.append((cdate, nextquarter)) intervals.append((cdate, nextquarter))
cdate = nextquarter cdate = nextquarter

View File

@@ -1,34 +1,29 @@
from datetime import datetime
from datetime import timedelta
import sys
import os import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer import sys
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict from collections import defaultdict
from loader import load, dmt, cms
import math
from common import calc_intervals
printnoln = lambda text: print(text, end='', flush=True) import matplotlib.pyplot as plt
rprint = lambda text: print('\r' + text) from matplotlib.ticker import MaxNLocator
DAYS_NEW_USER = 7 from common import calc_intervals, IMAGE_MAGICK
OLD_USER_YEAR = 3 from loader import load, dmt
analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink'] colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder): def main(folder, intervl):
users, posts, firstcontrib, sumcontrib = load(folder) users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts) intervals = calc_intervals(posts, intervl)
outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir)
activeusercounts = []
imgmagickcmd = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))) print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
# filter posts by option_date_from <= creation date <= option_date_to # filter posts by option_date_from <= creation date <= option_date_to
# newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults())
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults() newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
postcounts = defaultdict(list) postcounts = defaultdict(list)
@@ -37,10 +32,8 @@ def main(folder):
postcounts[p['OwnerUserId']].append(p) postcounts[p['OwnerUserId']].append(p)
i = i + 1 i = i + 1
postcounts = {id: len(pc) for (id, pc) in postcounts.items()} postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
# print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()]))) activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir)
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
histdata = [pc for pc in postcounts.values()] histdata = [pc for pc in postcounts.values()]
@@ -48,23 +41,22 @@ def main(folder):
plt.hist(histdata, range(max(histdata, default=0) + 1)) plt.hist(histdata, range(max(histdata, default=0) + 1))
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=0) plt.ylim(bottom=0)
plt.xlabel("#posts")
plt.ylabel("#users with X posts")
fig.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.title("Histogram for user post count registered between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y")) plt.title("Histogram for user post count registered between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y"))
fig.savefig(histfilename + ".png", bbox_inches='tight') fig.savefig(histfilename + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
imgmagickcmd += " " + histfilename + ".png"
os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
fig = plt.figure(figsize=(16, 12))
def computeToxLevel(text): plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
return analyser.polarity_scores(text) plt.yscale('log')
plt.ylim(bottom=0)
plt.title("Active users")
def flatmap(arr): fig.savefig(outputdir + "activeusers.png", bbox_inches='tight')
return [item for sublist in arr for item in sublist] plt.close(fig)
def dumptoxlevels(lvls, filename):
with open(filename, "w") as file:
file.write("from collections import defaultdict\n\n")
file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")
if __name__ == "__main__": if __name__ == "__main__":
@@ -77,5 +69,20 @@ if __name__ == "__main__":
if not os.path.isdir(folder): if not os.path.isdir(folder):
print(folder + " is not a folder") print(folder + " is not a folder")
sys.exit(1) sys.exit(1)
interval = 3
if len(sys.argv) >= 3:
if sys.argv[2].startswith("-i"):
interval = sys.argv[2][2:]
try:
interval = int(interval)
except ValueError:
print("-i: int required")
sys.exit(1)
if interval < 1 or interval > 12:
print("-i: only 1 - 12")
sys.exit(1)
else:
print("unknown parameter: " + sys.argv[2])
sys.exit(1)
main(folder) main(folder, interval)

View File

@@ -5,14 +5,7 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from loader import load, dmt from loader import load, dmt
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
analyser = SentimentIntensityAnalyzer() analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder): def main(folder):
@@ -22,20 +15,7 @@ def main(folder):
os.system("mkdir -p " + outfolder) os.system("mkdir -p " + outfolder)
outfilename = outfolder + "sentiments" outfilename = outfolder + "sentiments"
# computer toxic levels # compute toxic levels
# start = cms()
# printnoln("computing toxic levels: filtering")
# toxlevels = defaultdict(list)
# for (i, post) in enumerate(posts):
# if (i + 1) % 100 == 0:
# printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts)))
# if (i + 1) == len(posts):
# printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts)))
# for a in post['Answers']:
# toxlevel = computeToxLevel(a['Body'])
# toxlevels[post['Id']].append(toxlevel)
# rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms")
toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults() toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults()
toxlevels = {id: p for (id, p) in toxlevels} toxlevels = {id: p for (id, p) in toxlevels}