This commit is contained in:
wea_ondara
2019-08-11 16:47:52 +02:00
parent aacf71fad8
commit 0536f5db5f
5 changed files with 98 additions and 89 deletions

View File

@@ -7,34 +7,33 @@ from math import ceil
import matplotlib.pyplot as plt
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from common import calc_intervals, imprt, FigSaver
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK
from loader import load, dmt, cms
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
OLD_USER_PERCENTILE = 0.95
analyser = SentimentIntensityAnalyzer()
figsaver = FigSaver()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder):
def main(folder, intervl):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts)
intervals = calc_intervals(posts, intervl)
cachedsentiments = imprt(folder + "/output/sentiments.py").answers
outfolder = folder + "/output/batch/"
os.system("mkdir -p " + outfolder)
outputdir = folder + "/output/batch/"
os.system("mkdir -p " + outputdir)
postcounts = range(1, 5 + 1)
magickpost = {i: IMAGE_MAGICK for i in postcounts}
magickold = IMAGE_MAGICK
magickglobal = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals:
magickdate = IMAGE_MAGICK
# get questions for option_date_from <= creation date < option_date_to
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
if len(newposts) == 0:
@@ -51,8 +50,8 @@ def main(folder):
gpos = []
gcom = []
goutfilenamenewusers = outfolder + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
goutfilenameoldusers = outfolder + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
goutfilenamenewusers = outputdir + "batch_newusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
goutfilenameoldusers = outputdir + "batch_oldusers_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
for option_posts in postcounts:
# print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y") + " - #posts: " + str(option_posts))
@@ -122,6 +121,9 @@ def main(folder):
fig.savefig(outfilename + ".png", bbox_inches='tight')
plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
magickpost[option_posts] += " " + outfilename + ".png"
magickdate += " " + outfilename + ".png"
os.system(magickdate + " " + goutfilenamenewusers + ".pdf")
# global
start = cms()
@@ -146,6 +148,7 @@ def main(folder):
gfig.savefig(goutfilenamenewusers + ".png", bbox_inches='tight')
plt.close(gfig)
rprint("global plot post ... plotting ... saving ... took " + str(cms() - start) + "ms")
magickglobal += " " + goutfilenamenewusers + ".png"
# for old users ---------------------------------------------------------------------------------
start = cms()
@@ -192,18 +195,16 @@ def main(folder):
# plt.show()
fig.suptitle("Sentiment of answers to posts by most posting users (95%tile)\nPosts created between " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
# figsaver.save(fig, goutfilenameoldusers + ".png", bbox_inches='tight')
printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ...")
fig.savefig(goutfilenameoldusers + ".png", bbox_inches='tight')
plt.close(fig)
rprint("computing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ... saving ... took " + str(cms() - start) + "ms")
magickold += " " + goutfilenameoldusers + ".png"
figsaver.join()
figsaver.join()
def computeToxLevel(text):
    """Return the sentiment scores of ``text``.

    Thin wrapper that forwards ``text`` to ``polarity_scores`` of the
    module-level ``analyser`` (a ``SentimentIntensityAnalyzer`` instance)
    and hands back whatever that call returns.
    NOTE(review): presumably a dict of neg/neu/pos/compound scores --
    confirm against the vaderSentiment documentation.
    """
    scores = analyser.polarity_scores(text)
    return scores
os.system(magickglobal + " batch_newusers.pdf")
os.system(magickold + " batch_oldusers.pdf")
for (i, cmd) in magickpost.items():
os.system(cmd + " " + "batch_newusers_" + i + ".pdf")
def dumptoxlevels(lvls, filename):
@@ -222,5 +223,20 @@ if __name__ == "__main__":
if not os.path.isdir(folder):
print(folder + " is not a folder")
sys.exit(1)
interval = 3
if len(sys.argv) >= 3:
if sys.argv[2].startswith("-i"):
interval = sys.argv[2][2:]
try:
interval = int(interval)
except ValueError:
print("-i: int required")
sys.exit(1)
if interval < 1 or interval > 12:
print("-i: only 1 - 12")
sys.exit(1)
else:
print("unknown parameter: " + sys.argv[2])
sys.exit(1)
main(folder)
main(folder, interval)

View File

@@ -9,10 +9,9 @@ import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp
from common import imprt
from common import imprt, IMAGE_MAGICK
colors = {'neg': 'red', 'neu': 'green', 'pos': 'blue', 'com': 'orange'}
IMAGE_MAGICK = "magick"
def main(folder):

View File

@@ -5,29 +5,36 @@ import matplotlib.pyplot as plt
from loader import dmt
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
def calc_intervals(posts):
DAYS_NEW_USER = 7
IMAGE_MAGICK = "magick"
def calc_intervals(posts, months=3):
firstpost = dmt(posts).reduce(lambda acc, e: acc if acc < e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc < e else e, lambda: posts[0]['CreationDate'],
"firstpost").getresults()
lastpost = dmt(posts).reduce(lambda acc, e: acc if acc > e['CreationDate'] else e['CreationDate'], lambda acc, e: acc if acc > e else e, lambda: posts[0]['CreationDate'], "lastpost").getresults()
# calc interval beginning (first month of the `months`-sized interval)
firstpost = firstpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if firstpost.month not in (1, 4, 7, 10):
firstpost = firstpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[firstpost.month])
if (firstpost.month - 1) % months != 0:
firstpost = firstpost.replace(month=firstpost.month - ((firstpost.month - 1) % months))
lastpost = lastpost.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
if lastpost.month not in (1, 4, 7, 10):
lastpost = lastpost.replace(month={1: 1, 2: 1, 3: 1, 4: 4, 5: 4, 6: 4, 7: 7, 8: 7, 9: 7, 10: 10, 11: 10, 12: 10}[lastpost.month])
if (lastpost.month - 1) % months != 0:
lastpost = lastpost.replace(month=lastpost.month - ((lastpost.month - 1) % months))
# add one interval of `months` months to last post
if lastpost.month == 10:
lastpost = lastpost.replace(month=1, year=lastpost.year + 1)
if lastpost.month + months > 12:
lastpost = lastpost.replace(month=lastpost.month + months - 12, year=lastpost.year + 1)
else:
lastpost = lastpost.replace(month=lastpost.month + 3)
lastpost = lastpost.replace(month=lastpost.month + months)
cdate = firstpost
intervals = []
while cdate < lastpost:
nextquarter = cdate.replace(month=(cdate.month + 3) % 12, year=cdate.year + (0 if cdate.month + 3 < 12 else 1))
nextmon = cdate.month + months
nextquarter = cdate.replace(month=nextmon if nextmon <=12 else nextmon-12, year=cdate.year + (0 if nextmon <= 12 else 1))
print("adding interval: " + cdate.strftime("%d-%m-%Y") + " - " + nextquarter.strftime("%d-%m-%Y"))
intervals.append((cdate, nextquarter))
cdate = nextquarter

View File

@@ -1,34 +1,29 @@
from datetime import datetime
from datetime import timedelta
import sys
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib.pyplot as plt
import sys
from collections import defaultdict
from loader import load, dmt, cms
import math
from common import calc_intervals
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
from common import calc_intervals, IMAGE_MAGICK
from loader import load, dmt
analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder):
def main(folder, intervl):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts)
intervals = calc_intervals(posts, intervl)
outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir)
activeusercounts = []
imgmagickcmd = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals:
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
# filter posts by option_date_from <= creation date < option_date_to
# newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults())
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
postcounts = defaultdict(list)
@@ -37,10 +32,8 @@ def main(folder):
postcounts[p['OwnerUserId']].append(p)
i = i + 1
postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
# print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()])))
activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir)
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
histdata = [pc for pc in postcounts.values()]
@@ -48,23 +41,22 @@ def main(folder):
plt.hist(histdata, range(max(histdata, default=0) + 1))
plt.yscale('log')
plt.ylim(bottom=0)
plt.xlabel("#posts")
plt.ylabel("#users with X posts")
fig.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.title("Histogram for user post count registered between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y"))
fig.savefig(histfilename + ".png", bbox_inches='tight')
plt.close(fig)
imgmagickcmd += " " + histfilename + ".png"
os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
def computeToxLevel(text):
    """Return the sentiment scores of ``text``.

    Thin wrapper that forwards ``text`` to ``polarity_scores`` of the
    module-level ``analyser`` (a ``SentimentIntensityAnalyzer`` instance)
    and hands back whatever that call returns.
    NOTE(review): presumably a dict of neg/neu/pos/compound scores --
    confirm against the vaderSentiment documentation.
    """
    scores = analyser.polarity_scores(text)
    return scores
def flatmap(arr):
    """Flatten ``arr`` by exactly one level.

    ``arr`` is an iterable of iterables; the items of each inner iterable
    are concatenated, in order, into a single new list which is returned.
    Deeper nesting is left untouched.
    """
    flattened = []
    for sublist in arr:
        # extend() appends every item of the inner iterable in order
        flattened.extend(sublist)
    return flattened
def dumptoxlevels(lvls, filename):
    """Dump ``lvls`` into ``filename`` as Python source text.

    The file is (over)written with a ``from collections import defaultdict``
    header followed by a single ``toxlevels = ...`` assignment built from
    ``str(lvls)``.

    lvls     -- object whose ``str()`` form is written out
                (presumably a ``defaultdict(list)`` -- verify against caller)
    filename -- path of the file to create or overwrite
    """
    with open(filename, "w") as out:
        out.write("from collections import defaultdict\n\n")
        rendered = str(lvls)
        # Only the first "<class 'list'>" is rewritten to the bare name
        # ``list``; any later occurrence is left as-is.
        rendered = rendered.replace("<class 'list'>", "list", 1)
        out.write("toxlevels = " + rendered + "\n")
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
plt.yscale('log')
plt.ylim(bottom=0)
plt.title("Active users")
fig.savefig(outputdir + "activeusers.png", bbox_inches='tight')
plt.close(fig)
if __name__ == "__main__":
@@ -77,5 +69,20 @@ if __name__ == "__main__":
if not os.path.isdir(folder):
print(folder + " is not a folder")
sys.exit(1)
interval = 3
if len(sys.argv) >= 3:
if sys.argv[2].startswith("-i"):
interval = sys.argv[2][2:]
try:
interval = int(interval)
except ValueError:
print("-i: int required")
sys.exit(1)
if interval < 1 or interval > 12:
print("-i: only 1 - 12")
sys.exit(1)
else:
print("unknown parameter: " + sys.argv[2])
sys.exit(1)
main(folder)
main(folder, interval)

View File

@@ -5,14 +5,7 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from loader import load, dmt
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder):
@@ -22,20 +15,7 @@ def main(folder):
os.system("mkdir -p " + outfolder)
outfilename = outfolder + "sentiments"
# computer toxic levels
# start = cms()
# printnoln("computing toxic levels: filtering")
# toxlevels = defaultdict(list)
# for (i, post) in enumerate(posts):
# if (i + 1) % 100 == 0:
# printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts)))
# if (i + 1) == len(posts):
# printnoln("\rcomputing toxic levels: post #" + str(i + 1) + "/" + str(len(posts)))
# for a in post['Answers']:
# toxlevel = computeToxLevel(a['Body'])
# toxlevels[post['Id']].append(toxlevel)
# rprint("computing toxic levels: post #" + str(len(posts)) + "/" + str(len(posts)) + " ... took " + str(cms() - start) + "ms")
# compute toxic levels
toxlevels = dmt(posts, 10).map(lambda p: (p['Id'], {a['Id']: computeToxLevel(a['Body']) for a in p['Answers']}), "calculating sentiments").getresults()
toxlevels = {id: p for (id, p) in toxlevels}