"""Plot average sentiment and vote score per time interval for a data set.

Usage: <script> <folder> [-i<interval>]

Two PNG files are written to ``<folder>/output/votes/``:

* ``average_votes-i<interval>.png`` -- mean answer sentiment and mean post
  score per interval.
* ``average_votes_over_time-i<interval>.png`` -- mean vote score per interval,
  one curve per cumulative vote window (2010 up to each year 2011..2019).
"""
import sys
import matplotlib.pyplot as plt
import numpy as np
import os
import statsmodels.api as sm
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from common import calc_intervals, printnoln, rprint, DAYS_NEW_USER
from loader import load, dmt, cms, readVotes
from sentiments import readtoxleveltxt

colors = ['red', 'green', 'blue', 'orange', 'deeppink']
thresholds = [3, 4, 5, 6]
changedate = datetime.fromisoformat("2018-09-01T00:00:00")


def _label_valign(data, i):
    """Return the vertical alignment for the text label of point ``i``.

    Local maxima get their label above the point ("bottom"), local minima
    below it ("top"); end points are compared with their single neighbour.
    A series with fewer than two points has no neighbour to compare with,
    so its label is centred (the original code raised IndexError here).
    """
    last = len(data) - 1
    if last < 1:
        return "center"
    if i == 0:
        return "bottom" if data[1] < data[0] else "top"
    if i == last:
        return "bottom" if data[last - 1] < data[last] else "top"
    if data[i - 1] < data[i] and data[i + 1] < data[i]:
        return "bottom"
    if data[i - 1] > data[i] and data[i + 1] > data[i]:
        return "top"
    return "center"


def main(folder, intervl):
    """Create the vote/sentiment plots for the data set in ``folder``.

    Args:
        folder: Data set directory. ``<folder>/output/sentiments.txt`` is
            read and the plots are written to ``<folder>/output/votes/``.
        intervl: Interval size passed to ``calc_intervals`` (the command
            line restricts it to 1..12).
    """
    _users, posts, firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)

    start = cms()
    printnoln("reading sentiments ...")
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")
    rprint("reading sentiments ... took " + str(cms() - start) + "ms")

    outputdir = folder + "/output/votes/"
    # Create the directory without going through a shell: portable, and safe
    # for folder names that contain spaces or shell metacharacters.
    os.makedirs(outputdir, exist_ok=True)

    datasingle = []
    scoresingle = []
    for (option_date_from, option_date_to) in intervals:
        print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y"))

        # Scores of the posts created inside the interval.
        # NOTE(review): the plot titles say "new users", but this keeps posts
        # created at least DAYS_NEW_USER days after the owner's first
        # contribution -- confirm the intended direction of the comparison.
        scores = (dmt(posts)
                  .filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                          and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate'])
                  .map(lambda p: int(p['Score']))
                  .getresults())

        # Compound sentiment of every answer created inside the interval,
        # flattened into a single list.
        # NOTE(review): the first-contribution lookup uses the owner of the
        # post ``p``, not of the answer ``a`` -- confirm this is intended.
        filtered = (dmt(posts)
                    .map(lambda p: [cachedsentiments[a['Id']]['compound']
                                    for a in p['Answers']
                                    if option_date_from <= a['CreationDate'] < option_date_to
                                    and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']])
                    .filter(lambda p: p != [])
                    .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: [])
                    .getresults())

        scoresingle.append(scores)
        datasingle.append(filtered)

    # Remember the sample sizes for the point labels, then replace empty
    # intervals by NaN so that they show up as gaps in the plot.
    counts = [len(x) for x in datasingle]
    datasingle = [x if len(x) != 0 else float("nan") for x in datasingle]
    scoresingle = [x if len(x) != 0 else float("nan") for x in scoresingle]

    print("Plotting ...")
    fig, ax = plt.subplots(figsize=(16, 12))
    xs = [iv[0] for iv in intervals]
    data = [np.mean(x) for x in datasingle]
    l1 = ax.plot(xs, data, label="average sentiment")
    ax2 = ax.twinx()
    l2 = ax2.plot(xs, [np.mean(x) for x in scoresingle], label="average score (votes)", color="red")
    plt.grid(True)
    for i, value in enumerate(data):
        # Only the first label carries the "n=" prefix; empty intervals get
        # no count at all.
        label = ("n=" if i == 0 else "") + (str(counts[i]) if counts[i] != 0 else "")
        ax.text(xs[i], value, label, ha="center", va=_label_valign(data, i))
    plt.title("Average sentiments and score for new users")
    plt.xticks(rotation=90)
    ax.set_xlabel("months")
    ax.set_ylabel("sentiment")
    ax2.set_ylabel("score (votes)")
    plt.legend(l1 + l2, [l.get_label() for l in l1 + l2], loc="upper right")
    outfile = outputdir + "/average_votes-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)

    # votes over time
    votes = readVotes(folder)
    fig = plt.figure(figsize=(16, 12))
    # Cumulative vote windows: 2010-01-01 up to the start of 2011 .. 2019.
    window_start = datetime.fromisoformat("2010-01-01T00:00:00")
    ivs = [(window_start, datetime.fromisoformat(str(y) + "-01-01T00:00:00")) for y in range(2011, 2020)]
    for interval in ivs:
        print(interval[0].strftime("%d-%m-%Y") + " to " + interval[1].strftime("%d-%m-%Y"))
        ivvotes = dmt(votes).filter(lambda v: interval[0] <= v['CreationDate'] < interval[1]).getresults()
        scores = []
        for (option_date_from, option_date_to) in intervals:
            # Skip post intervals that end after the vote window does.
            if option_date_to > interval[1]:
                continue
            intervalposts = (dmt(posts)
                             .filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to
                                     and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate'])
                             .getresults())
            intervalpostsids = set(dmt(intervalposts).map(lambda p: p['Id']).getresults())
            intervalvotes = dmt(ivvotes).filter(lambda v: v['PostId'] in intervalpostsids).getresults()
            # VoteTypeId "2" counts +1 and "3" counts -1 (presumably up/down
            # votes -- verify against the votes schema); other types count 0.
            intervalscore = sum(dmt(intervalvotes)
                                .map(lambda v: 1 if v['VoteTypeId'] == "2" else (-1 if v['VoteTypeId'] == "3" else 0))
                                .getresults())
            # Average per post; NaN when the interval contains no posts.
            intervalscore = intervalscore / len(intervalpostsids) if len(intervalpostsids) != 0 else float("nan")
            scores.append(((option_date_from, option_date_to), intervalscore))
        plt.plot([iv[0] for iv, score in scores], [score for iv, score in scores],
                 label=str(interval[0].year) + " - " + str(interval[1].year))
    plt.title("Average score for new users over time")
    plt.xlabel("months")
    plt.ylabel("score")
    plt.legend(loc="upper right")
    plt.grid(True)
    outfile = outputdir + "/average_votes_over_time-i" + str(intervl) + ".png"
    plt.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


if __name__ == "__main__":
    # execute only if run as a script
    usage = sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)
    interval = 1
    if len(sys.argv) >= 3:
        if sys.argv[2].startswith("-i"):
            # Everything after the "-i" prefix must be an integer in 1..12.
            interval = sys.argv[2][2:]
            try:
                interval = int(interval)
            except ValueError:
                print("-i: int required")
                sys.exit(1)
            if interval < 1 or interval > 12:
                print("-i: only 1 - 12")
                sys.exit(1)
        else:
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)

    main(folder, interval)