This commit is contained in:
wea_ondara
2020-04-11 14:07:57 +02:00
parent 4195c83ef8
commit 06085870a1
6 changed files with 66 additions and 33 deletions

View File

@@ -9,7 +9,7 @@ import matplotlib.pyplot as plt
import matplotlib import matplotlib
import numpy as np import numpy as np
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK, FIG_SIZE
from loader import load, dmt, cms from loader import load, dmt, cms
from sentiments import readtoxleveltxt from sentiments import readtoxleveltxt
@@ -51,7 +51,7 @@ def main(folder, intervl):
# get questions for option_date_from <= creation date < option_date_to # get questions for option_date_from <= creation date < option_date_to
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults() newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filter posts by dates").getresults()
print("computing toxic levels: " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) print("computing toxic levels: " + option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))
gfig, gaxs = plt.subplots(2, 2, figsize=(16, 12)) gfig, gaxs = plt.subplots(2, 2, figsize=FIG_SIZE)
gaxs[0, 0].set_title('Neg') gaxs[0, 0].set_title('Neg')
gaxs[1, 0].set_title('Neu') gaxs[1, 0].set_title('Neu')
gaxs[0, 1].set_title('Pos') gaxs[0, 1].set_title('Pos')
@@ -116,7 +116,7 @@ def main(folder, intervl):
gpos.append(poslevelsflat) gpos.append(poslevelsflat)
gcom.append(comlevelsflat) gcom.append(comlevelsflat)
fig, axs = plt.subplots(2, 2, figsize=(16, 12)) fig, axs = plt.subplots(2, 2, figsize=FIG_SIZE)
axs[0, 0].set_title('Negativity') axs[0, 0].set_title('Negativity')
axs[1, 0].set_title('Neutrality') axs[1, 0].set_title('Neutrality')
axs[0, 1].set_title('Positivity') axs[0, 1].set_title('Positivity')
@@ -198,7 +198,7 @@ def main(folder, intervl):
poslevelsflat = [item['pos'] for item in toxlevels] poslevelsflat = [item['pos'] for item in toxlevels]
comlevelsflat = [item['compound'] for item in toxlevels] comlevelsflat = [item['compound'] for item in toxlevels]
fig, axs = plt.subplots(2, 2, figsize=(16, 12)) fig, axs = plt.subplots(2, 2, figsize=FIG_SIZE)
axs[0, 0].set_title('Neg') axs[0, 0].set_title('Neg')
axs[1, 0].set_title('Neu') axs[1, 0].set_title('Neu')
axs[0, 1].set_title('Pos') axs[0, 1].set_title('Pos')
@@ -227,7 +227,7 @@ def main(folder, intervl):
# avg sentiment graph # avg sentiment graph
print("Plotting average sentiments ...") print("Plotting average sentiments ...")
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
for i in postcounts: for i in postcounts:
plt.plot([iv[0] for iv in intervals], avgsent[i], label="new users (" + str(i) + " posts)") plt.plot([iv[0] for iv in intervals], avgsent[i], label="new users (" + str(i) + " posts)")
plt.plot([iv[0] for iv in intervals], avgsent[0], label="old users (all posts)") plt.plot([iv[0] for iv in intervals], avgsent[0], label="old users (all posts)")

View File

@@ -8,7 +8,7 @@ from math import ceil
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK from common import calc_intervals, imprt, printnoln, rprint, DAYS_NEW_USER, IMAGE_MAGICK, FIG_SIZE
from loader import load, dmt, cms from loader import load, dmt, cms
OLD_USER_PERCENTILE = 0.95 OLD_USER_PERCENTILE = 0.95
@@ -85,7 +85,7 @@ def main(folder, intervl):
avgnewpos.append(np.average(pos)) avgnewpos.append(np.average(pos))
avgnewall.append(np.average([item['compound'] for item in toxlevels])) avgnewall.append(np.average([item['compound'] for item in toxlevels]))
fig, axs = plt.subplots(figsize=(16, 12)) fig, axs = plt.subplots(figsize=FIG_SIZE)
axs.boxplot([neg, neu, pos]) axs.boxplot([neg, neu, pos])
axs.set_xticklabels(['negative', 'neutral', 'positive']) axs.set_xticklabels(['negative', 'neutral', 'positive'])
axs.set_title("Sentiment categorization of answers to posts within 1 week of 1st contribution\nPosts created between " axs.set_title("Sentiment categorization of answers to posts within 1 week of 1st contribution\nPosts created between "
@@ -126,7 +126,7 @@ def main(folder, intervl):
avgoldpos.append(np.average(pos)) avgoldpos.append(np.average(pos))
avgoldall.append(np.average([item['compound'] for item in toxlevels])) avgoldall.append(np.average([item['compound'] for item in toxlevels]))
fig, axs = plt.subplots(figsize=(16, 12)) fig, axs = plt.subplots(figsize=FIG_SIZE)
axs.boxplot([neg, neu, pos]) axs.boxplot([neg, neu, pos])
axs.set_xticklabels(['negative', 'neutral', 'positive']) axs.set_xticklabels(['negative', 'neutral', 'positive'])
axs.set_title("Sentiment categorization of answers to posts within 1 week of 1st contribution\nPosts created between " axs.set_title("Sentiment categorization of answers to posts within 1 week of 1st contribution\nPosts created between "
@@ -142,7 +142,7 @@ def main(folder, intervl):
os.system(magickold + " " + outputdir + "boxsent_oldusers.pdf") os.system(magickold + " " + outputdir + "boxsent_oldusers.pdf")
# plot new users # plot new users
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
x = [f.strftime("%d-%m-%Y") + " - " + t.strftime("%d-%m-%Y") for (f, t) in intervals] x = [f.strftime("%d-%m-%Y") + " - " + t.strftime("%d-%m-%Y") for (f, t) in intervals]
plt.plot(x, avgnewneg, label='negative') plt.plot(x, avgnewneg, label='negative')
plt.plot(x, avgnewneu, label='neutral') plt.plot(x, avgnewneu, label='neutral')
@@ -155,7 +155,7 @@ def main(folder, intervl):
plt.close(fig) plt.close(fig)
# plot old users # plot old users
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
x = [f.strftime("%d-%m-%Y") + " - " + t.strftime("%d-%m-%Y") for (f, t) in intervals] x = [f.strftime("%d-%m-%Y") + " - " + t.strftime("%d-%m-%Y") for (f, t) in intervals]
plt.plot(x, avgoldneg, label='negative') plt.plot(x, avgoldneg, label='negative')
plt.plot(x, avgoldneu, label='neutral') plt.plot(x, avgoldneu, label='neutral')

View File

@@ -11,6 +11,8 @@ rprint = lambda text: print('\r' + text)
DAYS_NEW_USER = 7 DAYS_NEW_USER = 7
IMAGE_MAGICK = "magick" IMAGE_MAGICK = "magick"
CHANGE_DATE = datetime.fromisoformat("2018-08-21T21:00:00") CHANGE_DATE = datetime.fromisoformat("2018-08-21T21:00:00")
FIG_SIZE = (8,6)
# FIG_LAYOUT =
def calc_intervals(posts, months=3): def calc_intervals(posts, months=3):

4
its.py
View File

@@ -7,7 +7,7 @@ from datetime import datetime
from datetime import timedelta from datetime import timedelta
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, difftime from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE, difftime
from loader import load, dmt, cms from loader import load, dmt, cms
from sentiments import readtoxleveltxt from sentiments import readtoxleveltxt
@@ -118,7 +118,7 @@ def main(folder, intervl):
with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file: with open(outputdir + "/summary_threshold" + str(ti) + "-i" + str(intervl) + ".txt", "w") as file:
file.write(str(res.summary())) file.write(str(res.summary()))
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment") plt.plot([difftime(i[0]) for i in intervals], data, label="average sentiment")
plt.grid(True) plt.grid(True)
for i in range(len(data)): for i in range(len(data)):

View File

@@ -6,7 +6,7 @@ from datetime import timedelta
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator from matplotlib.ticker import MaxNLocator
from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER, FIG_SIZE
from loader import load, dmt from loader import load, dmt
from sentiments import readtoxleveltxt from sentiments import readtoxleveltxt
@@ -22,9 +22,12 @@ def main(folder, intervl):
os.system("mkdir -p " + outputdir) os.system("mkdir -p " + outputdir)
activeusercounts = [] activeusercounts = []
newusercounts = []
answerstonewusers = [] answerstonewusers = []
sentimentstonewusers = [] sentimentstonewusers = []
activitynewusers = [] activitynewusers = []
questionsininterval = []
answersininterval = []
imgmagickcmd = IMAGE_MAGICK imgmagickcmd = IMAGE_MAGICK
for (option_date_from, option_date_to) in intervals: for (option_date_from, option_date_to) in intervals:
print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y")) print(option_date_from.strftime("%d-%m-%Y"), option_date_to.strftime("%d-%m-%Y"))
@@ -32,6 +35,10 @@ def main(folder, intervl):
# post histograms # post histograms
# filter posts by option_date_from <= creation date < option_date_to # filter posts by option_date_from <= creation date < option_date_to
newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults() newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to, "filtering posts by date").getresults()
questionsininterval.append(((option_date_from, option_date_to), len(newposts)))
newanswers = dmt(posts).map(lambda p: [a for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to], "filtering answers by date") \
.reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []).getresults()
answersininterval.append(((option_date_from, option_date_to), len(newanswers)))
postcounts = defaultdict(list) postcounts = defaultdict(list)
i = 0 i = 0
@@ -42,6 +49,7 @@ def main(folder, intervl):
# postcounts[p['OwnerUserId']].append(a) # postcounts[p['OwnerUserId']].append(a)
postcounts = {id: len(pc) for (id, pc) in postcounts.items()} postcounts = {id: len(pc) for (id, pc) in postcounts.items()}
activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys()))) activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys())))
newusercounts.append(((option_date_from, option_date_to), len([u for u in users if option_date_from <= u['CreationDate'] < option_date_to])))
activitynewusersinmonth = defaultdict(int) activitynewusersinmonth = defaultdict(int)
for p in newposts: for p in newposts:
@@ -57,7 +65,7 @@ def main(folder, intervl):
histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl) histfilename = outputdir + "posthist_" + folder.split("/")[-1] + "_" + option_date_from.strftime("%d-%m-%Y") + "_" + option_date_to.strftime("%d-%m-%Y") + "-i" + str(intervl)
histdata = [pc for pc in postcounts.values()] histdata = [pc for pc in postcounts.values()]
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
plt.hist(histdata, range(max(histdata, default=0) + 1)) plt.hist(histdata, range(max(histdata, default=0) + 1))
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=0) plt.ylim(bottom=0)
@@ -87,32 +95,34 @@ def main(folder, intervl):
os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf") os.system(imgmagickcmd + " " + outputdir + "/posthist-i" + str(intervl) + ".pdf")
# plot posts diagram # plot posts diagram
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
plt.plot([x[0] for (x, y) in activeusercounts], [y for (x, y) in activeusercounts]) plt.plot([u[0] for (u, y) in activeusercounts], [y for (u, y) in activeusercounts], label="active users")
plt.plot([u[0] for (u, y) in newusercounts], [y for (u, y) in newusercounts], label='newly registered users')
plt.xlabel('time') plt.xlabel('time')
plt.ylabel('#active users') plt.ylabel('#users')
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=1) # plt.ylim(bottom=1)
plt.title("Active users") plt.title("Active users")
plt.legend(loc="upper right")
fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight') fig.savefig(outputdir + "activeusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
# plot answers to new users diagram # plot answers to new users diagram
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
plt.plot([x[0] for (x, y) in answerstonewusers], [y for (x, y) in answerstonewusers]) plt.plot([u[0] for (u, y) in answerstonewusers], [y for (u, y) in answerstonewusers])
plt.xlabel('time') plt.xlabel('time')
plt.ylabel('#answers per question of a new user') plt.ylabel('#answers per question of a new user')
plt.yscale('log') plt.yscale('log')
plt.ylim(bottom=1) # plt.ylim(bottom=1)
plt.title("Answers to new users") plt.title("Answers to new users")
fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight') fig.savefig(outputdir + "answerstonewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
# plot sentiments of answers to new users diagram # plot sentiments of answers to new users diagram
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
plt.plot([x[0] for (x, y) in sentimentstonewusers], [b for (x, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer") plt.plot([u[0] for (u, y) in sentimentstonewusers], [b for (u, [y, b, n, g]) in sentimentstonewusers], label="Neg. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [n for (x, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer") plt.plot([u[0] for (u, y) in sentimentstonewusers], [n for (u, [y, b, n, g]) in sentimentstonewusers], label="Neu. answer")
plt.plot([x[0] for (x, y) in sentimentstonewusers], [g for (x, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer") plt.plot([u[0] for (u, y) in sentimentstonewusers], [g for (u, [y, b, n, g]) in sentimentstonewusers], label="Pos. answer")
plt.xlabel('time') plt.xlabel('time')
plt.ylabel('sentiment') plt.ylabel('sentiment')
plt.yscale('log') plt.yscale('log')
@@ -123,8 +133,8 @@ def main(folder, intervl):
plt.close(fig) plt.close(fig)
# plot activity for new users # plot activity for new users
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
plt.plot([x[0] for (x, y) in activitynewusers], [y for x, y in activitynewusers], label="activity") plt.plot([u[0] for (u, y) in activitynewusers], [y for u, y in activitynewusers], label="activity")
plt.xlabel('time') plt.xlabel('time')
plt.ylabel('#questions or answers created by a new user') plt.ylabel('#questions or answers created by a new user')
plt.legend(loc="upper right") plt.legend(loc="upper right")
@@ -132,6 +142,27 @@ def main(folder, intervl):
fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight') fig.savefig(outputdir + "activitynewusers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig) plt.close(fig)
# plot number of questions and answers per interval
fig = plt.figure(figsize=FIG_SIZE)
plt.plot([u[0] for (u, y) in questionsininterval], [y for u, y in questionsininterval], label="questions")
plt.plot([u[0] for (u, y) in answersininterval], [y for u, y in answersininterval], label="answer")
plt.xlabel('time')
plt.ylabel('quantity')
plt.legend(loc="upper right")
plt.title("Average activity per new user")
fig.savefig(outputdir + "postsanswers-i" + str(intervl) + ".png", bbox_inches='tight')
plt.close(fig)
# print data set stats
print("users: " + str(len(users)))
print("questions: " + str(len(posts)))
print("answers: " + str(sum(dmt(posts).map(lambda q: len(q['Answers'])).getresults())))
print("active user last month: " + str(activeusercounts[-1]))
useridmapping = {u['Id']: u for u in users}
newuserposts = dmt(posts).filter(lambda q: q['CreationDate'] < useridmapping[q['OwnerUserId']]['CreationDate'] + timedelta(days=DAYS_NEW_USER)).getresults()
newuserlist = set([q['OwnerUserId'] for q in newuserposts])
print("questions from new users: " + str(len(newuserposts)))
print("questions from new users/new user: " + str(len(newuserposts) / len(newuserlist)))
if __name__ == "__main__": if __name__ == "__main__":
# execute only if run as a script # execute only if run as a script

View File

@@ -8,7 +8,7 @@ from datetime import datetime
from datetime import timedelta from datetime import timedelta
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER, FIG_SIZE
from loader import load, dmt, cms, readVotes from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt from sentiments import readtoxleveltxt
@@ -57,7 +57,7 @@ def main(folder, intervl):
scoresingle[i] = float("nan") scoresingle[i] = float("nan")
print("Plotting ...") print("Plotting ...")
fig, ax = plt.subplots(figsize=(16, 12)) fig, ax = plt.subplots(figsize=FIG_SIZE)
data = [np.mean(x) for x in datasingle] data = [np.mean(x) for x in datasingle]
l1 = ax.plot([i[0] for i in intervals], data, label="average sentiment") l1 = ax.plot([i[0] for i in intervals], data, label="average sentiment")
ax2 = ax.twinx() ax2 = ax.twinx()
@@ -93,7 +93,7 @@ def main(folder, intervl):
# votes over time # votes over time
votes = readVotes(folder) votes = readVotes(folder)
fig = plt.figure(figsize=(16, 12)) fig = plt.figure(figsize=FIG_SIZE)
ivs = [(datetime.fromisoformat("2010-01-01T00:00:00"), datetime.fromisoformat(str(y) + "-01-01T00:00:00")) for y in range(2011, 2020)] ivs = [(datetime.fromisoformat("2010-01-01T00:00:00"), datetime.fromisoformat(str(y) + "-01-01T00:00:00")) for y in range(2011, 2020)]
for interval in ivs: for interval in ivs:
print(interval[0].strftime("%d-%m-%Y") + " to " + interval[1].strftime("%d-%m-%Y")) print(interval[0].strftime("%d-%m-%Y") + " to " + interval[1].strftime("%d-%m-%Y"))