Files
master/posthist.py
wea_ondara 6dd5145c42 wip
2020-04-11 16:07:08 +02:00

199 lines
9.2 KiB
Python

import os
import sys
from collections import defaultdict
from datetime import timedelta
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER, FIG_SIZE
from loader import load, dmt
from sentiments import readtoxleveltxt
# Named matplotlib colours available for plotting.
# NOTE(review): not referenced anywhere in the visible part of this file.
colors = ['red', 'green', 'blue', 'orange', 'deeppink']
def main(folder, intervl):
    """Create per-interval post histograms, time-series plots and a stats file.

    The data set is loaded from ``folder`` and split into date intervals by
    ``calc_intervals``. For every interval the function collects the number of
    questions, answers, active users, newly registered users, the activity of
    new users and the answers (with their cached sentiment) given to new
    users. All output is written below ``<folder>/output/posthist/``:

    * one ``posthist_*.png`` histogram per interval, bundled into
      ``posthist-i<intervl>.pdf`` via the ``IMAGE_MAGICK`` command,
    * ``activeusers``, ``answerstonewusers``, ``sentimentstonewusers``,
      ``activitynewusers`` and ``postsanswers`` line plots,
    * ``stats.txt`` with overall data set figures (also printed to stdout).

    A contribution counts as coming from a "new user" when it was created
    less than ``DAYS_NEW_USER`` days after the reference date used at the
    respective call site (first contribution or account creation).

    :param folder: data set directory; ``<folder>/output/sentiments.txt``
        must already exist.
    :param intervl: interval length forwarded to ``calc_intervals`` and used
        in the output file names (presumably months -- confirm against
        ``calc_intervals``).
    """
    users, posts, firstcontrib, _ = load(folder)

    intervals = calc_intervals(posts, intervl)
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")

    outputdir = folder + "/output/posthist/"
    # Equivalent of `mkdir -p` without spawning a shell.
    os.makedirs(outputdir, exist_ok=True)

    new_user_window = timedelta(days=DAYS_NEW_USER)

    def interval_starts(series):
        # x axis: start date of every ((from, to), value) entry
        return [span[0] for (span, _value) in series]

    def series_values(series):
        # y axis: value of every ((from, to), value) entry
        return [value for (_span, value) in series]

    activeusercounts = []
    newusercounts = []
    answerstonewusers = []
    sentimentstonewusers = []
    activitynewusers = []
    questionsininterval = []
    answersininterval = []
    imgmagickcmd = IMAGE_MAGICK

    for (option_date_from, option_date_to) in intervals:
        span = (option_date_from, option_date_to)
        label_from = option_date_from.strftime("%d-%m-%Y")
        label_to = option_date_to.strftime("%d-%m-%Y")
        print(label_from, label_to)

        # post histograms
        # filter posts by option_date_from <= creation date < option_date_to
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
        questionsininterval.append((span, len(newposts)))

        newanswers = dmt(posts).map(lambda p: [a for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to], "filtering answers by date") \
            .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []).getresults()
        answersininterval.append((span, len(newanswers)))

        # number of questions per user in this interval
        postcounts = defaultdict(int)
        for p in newposts:
            postcounts[p['OwnerUserId']] += 1

        activeusercounts.append((span, len(postcounts)))
        newusercounts.append((span, sum(1 for u in users if option_date_from <= u['CreationDate'] < option_date_to)))

        # activity (questions + answers) of new users inside this interval
        activitynewusersinmonth = defaultdict(int)
        for p in newposts:
            if firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate']:
                activitynewusersinmonth[p['OwnerUserId']] += 1
        for p in posts:
            for a in p['Answers']:
                # Only answers written inside the current interval count, and
                # they are credited to the answer's author, not to the owner
                # of the question they belong to.
                if not option_date_from <= a['CreationDate'] < option_date_to:
                    continue
                if firstcontrib[a['OwnerUserId']] + new_user_window > a['CreationDate']:
                    activitynewusersinmonth[a['OwnerUserId']] += 1
        activitysum = sum(activitynewusersinmonth.values())
        activitynewusers.append((span, activitysum / len(activitynewusersinmonth) if len(activitynewusersinmonth) > 0 else float("nan")))

        histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + label_from + "_" + label_to + "-i" + str(intervl)

        histdata = list(postcounts.values())

        fig = plt.figure(figsize=FIG_SIZE)
        plt.hist(histdata, range(max(histdata, default=0) + 1))
        plt.yscale('log')
        plt.ylim(bottom=0)
        plt.xlabel("#posts")
        plt.ylabel("#users with X posts")
        fig.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
        plt.title("Histogram for user post count between " + label_from + " and " + label_to)
        fig.savefig(histfilename + ".png", bbox_inches='tight')
        plt.close(fig)
        # remember the image so it ends up in the combined pdf
        imgmagickcmd += " " + histfilename + ".png"

        # answers to new users
        answers = (dmt(posts).map(lambda q: [a for a in q['Answers']
                                             if option_date_from <= a['CreationDate'] < option_date_to  # answer in interval
                                             and firstcontrib[q['OwnerUserId']] + new_user_window > q['CreationDate']  # question asked while its owner was a new user
                                             and q['CreationDate'] + new_user_window > a['CreationDate']])  # answer given within the same window after the question
                   .getresults())
        count = sum(len(a) for a in answers)
        answerstonewusers.append((span, count))

        # classify the cached compound sentiment score of each such answer
        sent = [cachedsentiments[a['Id']] for al in answers for a in al]
        sentbad = sum(1 for s in sent if s['compound'] < -0.05)
        sentneu = sum(1 for s in sent if -0.05 <= s['compound'] <= 0.05)
        sentgood = sum(1 for s in sent if s['compound'] > 0.05)
        sentimentstonewusers.append((span, (sent, sentbad, sentneu, sentgood)))

    # gen pdf for post histograms
    # NOTE: the command line is built by plain string concatenation, so paths
    # containing spaces or shell metacharacters will not survive this call.
    os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf")

    # plot posts diagram
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(interval_starts(activeusercounts), series_values(activeusercounts), label="active users")
    plt.plot(interval_starts(newusercounts), series_values(newusercounts), label='newly registered users')
    plt.xlabel('time')
    plt.ylabel('#users')
    plt.yscale('log')
    plt.title("Active users")
    plt.legend(loc="upper right")
    fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)

    # plot answers to new users diagram
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(interval_starts(answerstonewusers), series_values(answerstonewusers))
    plt.xlabel('time')
    plt.ylabel('#answers per question of a new user')
    plt.yscale('log')
    plt.title("Answers to new users")
    fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)

    # plot sentiments of answers to new users diagram
    # each value is (sentiments, #negative, #neutral, #positive)
    sentiment_x = interval_starts(sentimentstonewusers)
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(sentiment_x, [v[1] for v in series_values(sentimentstonewusers)], label="Neg. answer")
    plt.plot(sentiment_x, [v[2] for v in series_values(sentimentstonewusers)], label="Neu. answer")
    plt.plot(sentiment_x, [v[3] for v in series_values(sentimentstonewusers)], label="Pos. answer")
    plt.xlabel('time')
    plt.ylabel('sentiment')
    plt.yscale('log')
    plt.ylim(bottom=1)
    plt.legend(loc="upper right")
    plt.title("Sentiments of answers to new users")
    fig.savefig(outputdir + "sentimentstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)

    # plot activity for new users
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(interval_starts(activitynewusers), series_values(activitynewusers), label="activity")
    plt.xlabel('time')
    plt.ylabel('#questions or answers created by a new user')
    plt.legend(loc="upper right")
    plt.title("Average activity per new user")
    fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)

    # plot number of questions and answers per interval
    fig = plt.figure(figsize=FIG_SIZE)
    plt.plot(interval_starts(questionsininterval), series_values(questionsininterval), label="questions")
    plt.plot(interval_starts(answersininterval), series_values(answersininterval), label="answer")
    plt.xlabel('time')
    plt.ylabel('quantity')
    plt.legend(loc="upper right")
    # this figure shows raw question/answer counts, not new-user activity
    plt.title("Questions and answers per interval")
    fig.savefig(outputdir + "postsanswers-i" + str(intervl) + ".png", bbox_inches='tight')
    plt.close(fig)

    # print data set stats
    useridmapping = {u['Id']: u for u in users}
    newuserposts = dmt(posts).filter(lambda q: q['CreationDate'] < useridmapping[q['OwnerUserId']]['CreationDate'] + new_user_window).getresults()
    newuserlist = {q['OwnerUserId'] for q in newuserposts}
    # guard the two figures that would otherwise raise on an empty data set
    lastactive = str(activeusercounts[-1]) if activeusercounts else "n/a"
    questionspernewuser = len(newuserposts) / len(newuserlist) if newuserlist else float("nan")

    statlines = [
        "users: " + str(len(users)),
        "questions: " + str(len(posts)),
        "answers: " + str(sum(dmt(posts).map(lambda q: len(q['Answers'])).getresults())),
        "active user last month: " + lastactive,
        "questions from new users: " + str(len(newuserposts)),
        "questions from new users/new user: " + str(questionspernewuser),
    ]
    stats = "".join(line + "\n" for line in statlines)

    print(stats)
    with open(outputdir + "/stats.txt", "w") as file:
        file.write(stats)
if __name__ == "__main__":
    # Command line entry point: <script> <folder> [-i<N>]
    #   <folder>  existing data set directory handed to main()
    #   -i<N>     optional interval length, integer in 1..12 (default 3)
    usage = sys.argv[0] + " <folder>"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 3
    if len(sys.argv) >= 3:
        # reject anything that is not the -i option up front
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        # the value is glued to the flag, e.g. "-i6"
        try:
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if not 1 <= interval <= 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)