This commit is contained in:
wea_ondara
2019-08-11 16:47:52 +02:00
parent aacf71fad8
commit 0536f5db5f
5 changed files with 98 additions and 89 deletions

View File

@@ -1,34 +1,29 @@
from datetime import datetime
from datetime import timedelta
import sys
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib.pyplot as plt
import sys
from collections import defaultdict
from loader import load, dmt, cms
import math
from common import calc_intervals
printnoln = lambda text: print(text, end='', flush=True)
rprint = lambda text: print('\r' + text)
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
DAYS_NEW_USER = 7
OLD_USER_YEAR = 3
from common import calc_intervals, IMAGE_MAGICK
from loader import load, dmt
analyser = SentimentIntensityAnalyzer()
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder):
def main(folder, intervl):
users, posts, firstcontrib, sumcontrib = load(folder)
intervals = calc_intervals(posts)
intervals = calc_intervals(posts, intervl)
outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir)
activeusercounts = []
imgmagickcmd = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals:
print((option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")))
# filter posts by option_date_from <= creation date <= option_date_to
# newusers = set(dmt(users).filter(lambda u: option_date_from <= u['CreationDate'] < option_date_to, "filtering users by creation").map(lambda u: u['Id'], "getting user ids").getresults())
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
postcounts = defaultdict(list)
@@ -37,10 +32,8 @@ def main(folder):
postcounts[p['OwnerUserId']].append(p)
i = i + 1
postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
# print("i: " + str(i) + " expected: " + str(len(newposts)) + " is: " + str(sum([pc for pc in postcounts.values()])))
activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
outputdir = folder + "/output/posthist/"
os.system("mkdir -p " + outputdir)
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y")
histdata = [pc for pc in postcounts.values()]
@@ -48,23 +41,22 @@ def main(folder):
plt.hist(histdata, range(max(histdata, default=0) + 1))
plt.yscale('log')
plt.ylim(bottom=0)
plt.xlabel("#posts")
plt.ylabel("#users with X posts")
fig.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.title("Histogram for user post count registered between " + option_date_from.strftime("%d-%m-%Y") + " and " + option_date_to.strftime("%d-%m-%Y"))
fig.savefig(histfilename + ".png", bbox_inches='tight')
plt.close(fig)
imgmagickcmd += " " + histfilename + ".png"
os.system(imgmagickcmd + " " + outputdir + "/posthist.pdf")
def computeToxLevel(text):
return analyser.polarity_scores(text)
def flatmap(arr):
return [item for sublist in arr for item in sublist]
def dumptoxlevels(lvls, filename):
with open(filename, "w") as file:
file.write("from collections import defaultdict\n\n")
file.write("toxlevels = " + str(lvls).replace("<class 'list'>", "list", 1) + "\n")
fig = plt.figure(figsize=(16, 12))
plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts])
plt.yscale('log')
plt.ylim(bottom=0)
plt.title("Active users")
fig.savefig(outputdir + "activeusers.png", bbox_inches='tight')
plt.close(fig)
if __name__ == "__main__":
@@ -77,5 +69,20 @@ if __name__ == "__main__":
if not os.path.isdir(folder):
print(folder + " is not a folder")
sys.exit(1)
interval = 3
if len(sys.argv) >= 3:
if sys.argv[2].startswith("-i"):
interval = sys.argv[2][2:]
try:
interval = int(interval)
except ValueError:
print("-i: int required")
sys.exit(1)
if interval < 1 or interval > 12:
print("-i: only 1 - 12")
sys.exit(1)
else:
print("unknown parameter: " + sys.argv[2])
sys.exit(1)
main(folder)
main(folder, interval)