From 06085870a1d7e8f361e1339c4943886291456c38 Mon Sep 17 00:00:00 2001 From: wea_ondara Date: Sat, 11 Apr 2020 14:07:57 +0200 Subject: [PATCH] wip --- analyze_batch.py | 10 ++++---- box_sentiment.py | 10 ++++---- common.py | 2 ++ its.py | 4 +-- posthist.py | 67 +++++++++++++++++++++++++++++++++++------------- votes.py | 6 ++--- 6 files changed, 66 insertions(+), 33 deletions(-) diff --git a/analyze_batch.py b/analyze_batch.py index 1566322..7513961 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -9,7 +9,7 @@ import matplotlib.pyplot as plt import matplotlib import numpy as np -from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK +from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK, FIG_SIZE from loader import load, dmt, cms from sentiments import readtoxleveltxt @@ -51,7 +51,7 @@ def main(folder, intervl): # get questions for option_date_from <= creation date < option_date_to newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults() print("computing toxic levels: " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) - gfig, gaxs = plt.subplots(2, 2, figsize=(16, 12)) + gfig, gaxs = plt.subplots(2, 2, figsize=FIG_SIZE) gaxs[0, 0].set_title('Neg') gaxs[1, 0].set_title('Neu') gaxs[0, 1].set_title('Pos') @@ -116,7 +116,7 @@ def main(folder, intervl): gpos.append(poslevelsflat) gcom.append(comlevelsflat) - fig, axs = plt.subplots(2, 2, figsize=(16, 12)) + fig, axs = plt.subplots(2, 2, figsize=FIG_SIZE) axs[0, 0].set_title('Negativity') axs[1, 0].set_title('Neutrality') axs[0, 1].set_title('Positivity') @@ -198,7 +198,7 @@ def main(folder, intervl): poslevelsflat = [item['pos'] for item in toxlevels] comlevelsflat = [item['compound'] for item in toxlevels] - fig, axs = plt.subplots(2, 2, figsize=(16, 12)) + fig, axs = plt.subplots(2, 2, figsize=FIG_SIZE) axs[0, 0].set_title('Neg') axs[1, 0].set_title('Neu') axs[0, 1].set_title('Pos') @@ -227,7 +227,7 @@ def main(folder, intervl): # avg sentiment graph print("Plotting average sentiments ...") - fig = plt.figure(figsize=(16, 12)) + fig = plt.figure(figsize=FIG_SIZE) for i in postcounts: plt.plot([iv[0] for iv in intervals], avgsent[i], label="new users (" + str(i) + " posts)") plt.plot([iv[0] for iv in intervals], avgsent[0], label="old users (all posts)") diff --git a/box_sentiment.py b/box_sentiment.py index ae191e7..49f2276 100644 --- a/box_sentiment.py +++ b/box_sentiment.py @@ -8,7 +8,7 @@ from math import ceil import matplotlib.pyplot as plt import numpy as np -from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK +from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK, FIG_SIZE from loader import load, dmt, cms OLD_USER_PERCENTILE = 0.95 @@ -85,7 +85,7 @@ def main(folder, intervl): avgnewpos.append(np.average(pos)) avgnewall.append(np.average([item['compound'] for item in toxlevels])) - fig, axs = plt.subplots(figsize=(16, 12)) + fig, axs = plt.subplots(figsize=FIG_SIZE) axs.boxplot([neg, neu, pos]) axs.set_xticklabels(['negative', 'neutral', 'positive']) axs.set_title("Sentiment categorization of answers to posts within 1 week of 1st contribution\nPosts created between " @@ -126,7 +126,7 @@ def main(folder, intervl): avgoldpos.append(np.average(pos)) avgoldall.append(np.average([item['compound'] for item in toxlevels])) - fig, axs = plt.subplots(figsize=(16, 12)) + fig, axs = plt.subplots(figsize=FIG_SIZE) axs.boxplot([neg, neu, pos]) axs.set_xticklabels(['negative', 'neutral', 'positive']) axs.set_title("Sentiment categorization of answers to posts within 1 week of 1st contribution\nPosts created between " @@ -142,7 +142,7 @@ def main(folder, intervl): os.system(magickold + " " + outputdir + "boxsent_oldusers.pdf") # plot new users - fig = plt.figure(figsize=(16, 12)) + fig = plt.figure(figsize=FIG_SIZE) x = [f.strftime("%d-%m-%Y") + " - " + t.strftime("%d-%m-%Y") for (f, t) in intervals] plt.plot(x, avgnewneg, label='negative') plt.plot(x, avgnewneu, label='neutral') @@ -155,7 +155,7 @@ def main(folder, intervl): plt.close(fig) # plot old users - fig = plt.figure(figsize=(16, 12)) + fig = plt.figure(figsize=FIG_SIZE) x = [f.strftime("%d-%m-%Y") + " - " + t.strftime("%d-%m-%Y") for (f, t) in intervals] plt.plot(x, avgoldneg, label='negative') plt.plot(x, avgoldneu, label='neutral') diff --git a/common.py b/common.py index 4c5b66b..eca186f 100644 --- a/common.py +++ b/common.py @@ -11,6 +11,8 @@ rprint = lambda text: print('\r' + text) DAYS_NEW_USER = 7 IMAGE_MAGICK = "magick" CHANGE_DATE = datetime.fromisoformat("2018-08-21T21:00:00") +FIG_SIZE = (8,6) +# FIG_LAYOUT = def calc_intervals(posts, months=3): diff --git a/its.py b/its.py index ccf5140..4d9932c 100644 --- a/its.py +++ b/its.py @@ -7,7 +7,7 @@ from datetime import datetime from datetime import timedelta from dateutil.relativedelta import relativedelta -from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, difftime +from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime from loader import load, dmt, cms from sentiments import readtoxleveltxt @@ -118,7 +118,7 @@ def main(folder, intervl): with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file: file.write(str(res.summary())) - fig = plt.figure(figsize=(16, 12)) + fig = plt.figure(figsize=FIG_SIZE) plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment") plt.grid(True) for i in range(len(data)): diff --git a/posthist.py b/posthist.py index 65c027a..49f3574 100644 --- a/posthist.py +++ b/posthist.py @@ -6,7 +6,7 @@ from datetime import timedelta import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator -from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER +from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER, FIG_SIZE from loader import load, dmt from sentiments import readtoxleveltxt @@ -22,9 +22,12 @@ def main(folder, intervl): os.system("mkdir -p " + outputdir) activeusercounts = [] + newusercounts = [] answerstonewusers = [] sentimentstonewusers = [] activitynewusers = [] + questionsininterval = [] + answersininterval = [] imgmagickcmd = IMAGE_MAGICK for (option_date_from, option_date_to) in intervals: print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")) @@ -32,6 +35,10 @@ def main(folder, intervl): # post histograms # filter posts by option_date_from <= creation date <= option_date_to newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults() + questionsininterval.append(((option_date_from, option_date_to), len(newposts))) + newanswers = dmt(posts).map(lambda p: [a for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to], "filtering answers by date") \ + .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []).getresults() + answersininterval.append(((option_date_from, option_date_to), len(newanswers))) postcounts = defaultdict(list) i = 0 @@ -42,6 +49,7 @@ def main(folder, intervl): # postcounts[p['OwnerUserId']].append(a) postcounts = {id: len(pc) for (id, pc) in postcounts.items()} activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys()))) + newusercounts.append(((option_date_from, option_date_to), len([u for u in users if option_date_from <= u['CreationDate'] < option_date_to]))) activitynewusersinmonth = defaultdict(int) for p in newposts: @@ -57,7 +65,7 @@ def main(folder, intervl): histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl) histdata = [pc for pc in postcounts.values()] - fig = plt.figure(figsize=(16, 12)) + fig = plt.figure(figsize=FIG_SIZE) plt.hist(histdata, range(max(histdata, default=0) + 1)) plt.yscale('log') plt.ylim(bottom=0) @@ -71,9 +79,9 @@ def main(folder, intervl): # answers to new users answers = (dmt(posts).map(lambda q: [a for a in q['Answers'] - if option_date_from <= a['CreationDate'] < option_date_to # answer in interval - and firstcontrib[q['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > q['CreationDate'] # post created within 1 week of 1st contrib - and q['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']]) # answer created within 1 week of post + if option_date_from <= a['CreationDate'] < option_date_to # answer in interval + and firstcontrib[q['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > q['CreationDate'] # post created within 1 week of 1st contrib + and q['CreationDate'] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']]) # answer created within 1 week of post .getresults()) count = sum([len(a) for a in answers]) answerstonewusers.append(((option_date_from, option_date_to), count)) @@ -87,32 +95,34 @@ def main(folder, intervl): os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf") # plot posts diagram - fig = plt.figure(figsize=(16, 12)) - plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts]) + fig = plt.figure(figsize=FIG_SIZE) + plt.plot([u[0] for (u, y) in activeusercounts], [y for (u, y) in activeusercounts], label="active users") + plt.plot([u[0] for (u, y) in newusercounts], [y for (u, y) in newusercounts], label='newly registered users') plt.xlabel('time') - plt.ylabel('#active users') + plt.ylabel('#users') plt.yscale('log') - plt.ylim(bottom=1) + # plt.ylim(bottom=1) plt.title("Active users") + plt.legend(loc="upper right") fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight') plt.close(fig) # plot answers to new users diagram - fig = plt.figure(figsize=(16, 12)) - plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers]) + fig = plt.figure(figsize=FIG_SIZE) + plt.plot([u[0] for (u, y) in answerstonewusers], [y for (u, y) in answerstonewusers]) plt.xlabel('time') plt.ylabel('#answers per question of a new user') plt.yscale('log') - plt.ylim(bottom=1) + # plt.ylim(bottom=1) plt.title("Answers to new users") fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight') plt.close(fig) # plot sentiments of answers to new users diagram - fig = plt.figure(figsize=(16, 12)) - plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer") - plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer") - plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer") + fig = plt.figure(figsize=FIG_SIZE) + plt.plot([u[0] for (u, y) in sentimentstonewusers], [b for (u, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer") + plt.plot([u[0] for (u, y) in sentimentstonewusers], [n for (u, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer") + plt.plot([u[0] for (u, y) in sentimentstonewusers], [g for (u, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer") plt.xlabel('time') plt.ylabel('sentiment') plt.yscale('log') @@ -123,8 +133,8 @@ def main(folder, intervl): plt.close(fig) # plot activity for new users - fig = plt.figure(figsize=(16, 12)) - plt.plot([x[0] for (x, y) in activitynewusers], [y for x, y in activitynewusers], label="activity") + fig = plt.figure(figsize=FIG_SIZE) + plt.plot([u[0] for (u, y) in activitynewusers], [y for u, y in activitynewusers], label="activity") plt.xlabel('time') plt.ylabel('#questions or answers created by a new user') plt.legend(loc="upper right") @@ -132,6 +142,27 @@ def main(folder, intervl): fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight') plt.close(fig) + # plot activity for new users + fig = plt.figure(figsize=FIG_SIZE) + plt.plot([u[0] for (u, y) in questionsininterval], [y for u, y in questionsininterval], label="questions") + plt.plot([u[0] for (u, y) in answersininterval], [y for u, y in answersininterval], label="answer") + plt.xlabel('time') + plt.ylabel('quantity') + plt.legend(loc="upper right") + plt.title("Average activity per new user") + fig.savefig(outputdir + "postsanswers-i" + str(intervl) + ".png", bbox_inches='tight') + plt.close(fig) + + #print data set stats + print("users: " + str(len(users))) + print("questions: " + str(len(posts))) + print("answers: " + str(sum(dmt(posts).map(lambda q: len(q['Answers'])).getresults()))) + print("active user last month: " + str(activeusercounts[-1])) + useridmapping = {u['Id']: u for u in users} + newuserposts = dmt(posts).filter(lambda q: q['CreationDate'] < useridmapping[q['OwnerUserId']]['CreationDate'] + timedelta(days=DAYS_NEW_USER)).getresults() + newuserlist = set([q['OwnerUserId'] for q in newuserposts]) + print("questions from new users: " + str(len(newuserposts))) + print("questions from new users/new user: " + str(len(newuserposts) / len(newuserlist))) if __name__ == "__main__": # execute only if run as a script diff --git a/votes.py b/votes.py index 2f163ea..3746282 100644 --- a/votes.py +++ b/votes.py @@ -8,7 +8,7 @@ from datetime import datetime from datetime import timedelta from dateutil.relativedelta import relativedelta -from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER +from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE from loader import load, dmt, cms, readVotes from sentiments import readtoxleveltxt @@ -57,7 +57,7 @@ def main(folder, intervl): scoresingle[i] = float("nan") print("Plotting ...") - fig, ax = plt.subplots(figsize=(16, 12)) + fig, ax = plt.subplots(figsize=FIG_SIZE) data = [np.mean(x) for x in datasingle] l1 = ax.plot([i[0] for i in intervals], data, label="average sentiment") ax2 = ax.twinx() @@ -93,7 +93,7 @@ def main(folder, intervl): # votes over time votes = readVotes(folder) - fig = plt.figure(figsize=(16, 12)) + fig = plt.figure(figsize=FIG_SIZE) ivs = [(datetime.fromisoformat("2010-01-01T00:00:00"), datetime.fromisoformat(str(y) + "-01-01T00:00:00")) for y in range(2011, 2020)] for interval in ivs: print(interval[0].strftime("%d-%m-%Y") + " to " + interval[1].strftime("%d-%m-%Y"))