From da8896eadd243d0d2b4bb0815a93e91bee6d3497 Mon Sep 17 00:00:00 2001
From: wea_ondara
Date: Mon, 27 Jan 2020 11:58:19 +0100
Subject: [PATCH] wip

---
Notes (free-form text between the three-dash line and the diff; it is not
part of the commit message):

  Formatting
    * This patch was recovered from a copy in which every newline and all
      indentation had been collapsed. Line breaks were restored so that
      each hunk matches the counts in its "@@" header and in the diffstat.
    * Indentation, and the position of blank context lines, were inferred
      from Python syntax. Check them against loader.py and votes.py before
      applying (or apply with whitespace differences ignored).
    * The "From:" header carries no e-mail address in the recovered text.

  loader.py
    * readVotes(folder): parses folder + "/Votes.xml" with et.iterparse,
      keeps the elements whose tag is "row", maps each one through mapvote
      and returns the mapped results.
    * mapvote(item): reads PostId, VoteTypeId and CreationDate via getTag
      and converts CreationDate with datetime.fromisoformat. When
      CreationDate is None it only prints a message and leaves None in the
      returned dict.

  votes.py
    * Imports readVotes.
    * nan filter: assigns datasingle[i] instead of rebinding datasingle,
      and tests scoresingle[i] before overwriting scoresingle[i].
    * "n=" labels: len() is no longer called on entries replaced by nan.
    * The "score (votes)" label is now set on ax2; the first figure is
      saved as average_votes-i<intervl>.png.
    * New "votes over time" figure, saved as
      average_votes_over_time-i<intervl>.png: for each of nine cumulative
      ranges starting 2010-01-01, plots the per-interval score
      (VoteTypeId "2" counts +1, "3" counts -1, anything else 0) divided
      by the number of matching posts.

  Review observations (to confirm; nothing below is changed by this patch)
    * A vote whose CreationDate stayed None would reach the comparison
      interval[0] <= v['CreationDate'] < interval[1]; comparing a datetime
      with None raises TypeError.
    * The post filter keeps posts with
      firstcontrib + DAYS_NEW_USER days <= CreationDate, i.e. posts made
      at or after the end of that window, while the plot title says
      "new users". Confirm the intended comparison direction.
    * intervalposts / intervalpostsids do not depend on the outer
      "interval" variable, yet are recomputed for every entry of ivs;
      they could be computed once per (option_date_from, option_date_to).
    * The commented-out "if all(...)" / "continue" pair is leftover code
      (the "if" has no trailing colon).

 loader.py | 27 +++++++++++++++++++++++++++
 votes.py  | 44 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/loader.py b/loader.py
index 2e2e026..100c2b7 100644
--- a/loader.py
+++ b/loader.py
@@ -58,6 +58,33 @@ def load(folder):
     return users, posts, firstcontrib, sumcontrib
 
 
+def readVotes(folder):
+    file = folder + "/Votes.xml"
+    prefix = "readVotes: "
+    printnoln(prefix + "reading xml file ...")
+
+    now = cms()
+    items = [elem for event, elem in et.iterparse(file) if elem.tag == "row"]
+    rprint(prefix + "reading xml file ... took " + str(cms() - now) + "ms")
+
+    votes = dmt(items).map(mapvote, prefix + "mapping votes").getresults()
+
+    print(prefix + "done")
+    return votes
+
+
+def mapvote(item):
+    tags = ['PostId', 'VoteTypeId', 'CreationDate']
+    datetags = ['CreationDate']
+    vote = {tag: getTag(item, tag) for tag in tags}
+    for tag in datetags:
+        if vote[tag] is not None:
+            vote[tag] = datetime.fromisoformat(vote[tag])
+        else:
+            print("map vote: tag " + tag + " is None: " + str(vote))
+    return vote
+
+
 def computesumcontrib(posts):
     x1 = dmt(posts).map(lambda q: q['OwnerUserId'], "calc sum contrib q").getresults()
     x2 = dmt(posts).map(lambda q: [a['OwnerUserId'] for a in q['Answers']], "calc sum contrib a").getresults()
diff --git a/votes.py b/votes.py
index 7983fb9..4399e9e 100644
--- a/votes.py
+++ b/votes.py
@@ -9,7 +9,7 @@ from datetime import timedelta
 from dateutil.relativedelta import relativedelta
 
 from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
-from loader import load, dmt, cms
+from loader import load, dmt, cms, readVotes
 from sentiments import readtoxleveltxt
 
 colors = ['red', 'green', 'blue', 'orange', 'deeppink']
@@ -51,8 +51,8 @@ def main(folder, intervl):
     # filter nan entries
     for i in range(len(datasingle)):
         if len(datasingle[i]) == 0:
-            datasingle = float("nan")
-        if len(datasingle[i]) == 0:
+            datasingle[i] = float("nan")
+        if len(scoresingle[i]) == 0:
             scoresingle[i] = float("nan")
 
     print("Plotting ...")
@@ -79,14 +79,44 @@ def main(folder, intervl):
             va = "bottom"
         else:
             va = "top"
-        ax.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + str(len(datasingle[i])), ha="center", va=va)
-    plt.title("Average sentiments for new users")
+        ax.text(intervals[i][0], data[i], ("n=" if i == 0 else "") + (str(len(datasingle[i])) if str(datasingle[i]) != "nan" else ""), ha="center", va=va)
+    plt.title("Average sentiments and score for new users")
     plt.xticks(rotation=90)
     ax.set_xlabel("months")
     ax.set_ylabel("sentiment")
-    ax.set_ylabel("score (votes)")
+    ax2.set_ylabel("score (votes)")
     plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
-    outfile = outputdir + "/average_sentiments-i" + str(intervl) + ".png"
+    outfile = outputdir + "/average_votes-i" + str(intervl) + ".png"
+    plt.savefig(outfile, bbox_inches='tight')
+    plt.close(fig)
+
+    # votes over time
+    votes = readVotes(folder)
+    fig = plt.figure(figsize=(16, 12))
+    ivs = [(datetime.fromisoformat("2010-01-01T00:00:00"), datetime.fromisoformat(str(y) + "-01-01T00:00:00")) for y in range(2011, 2020)]
+    for interval in ivs:
+        print(interval[0].strftime("%d-%m-%Y") + " to " + interval[1].strftime("%d-%m-%Y"))
+        ivvotes = dmt(votes).filter(lambda v: interval[0] <= v['CreationDate'] < interval[1]).getresults()
+        scores = []
+        for (option_date_from, option_date_to) in intervals:
+            if option_date_to > interval[1]:
+                continue
+            intervalposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
+                                              and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate']).getresults()
+            intervalpostsids = set(dmt(intervalposts).map(lambda p: p['Id']).getresults())
+            intervalvotes = dmt(ivvotes).filter(lambda v: v['PostId'] in intervalpostsids).getresults()
+            intervalscore = sum(dmt(intervalvotes).map(lambda v: 1 if v['VoteTypeId'] == "2" else (-1 if v['VoteTypeId'] == "3" else 0)).getresults())
+            intervalscore = intervalscore / len(intervalpostsids) if len(intervalpostsids) != 0 else float("nan")
+            scores.append(((option_date_from, option_date_to), intervalscore))
+        # if all(str(score) == "nan" for iv, score in scores)
+        #     continue
+        plt.plot([iv[0] for iv, score in scores], [score for iv, score in scores], label=str(interval[0].year) + " - " + str(interval[1].year))
+    plt.title("Average score for new users over time")
+    plt.xlabel("months")
+    plt.ylabel("score")
+    plt.legend(loc="upper right")
+    plt.grid(True)
+    outfile = outputdir + "/average_votes_over_time-i" + str(intervl) + ".png"
     plt.savefig(outfile, bbox_inches='tight')
     plt.close(fig)
 