"""Generate per-interval posting statistics and plots for one data set folder.

The script loads users and posts via ``loader.load``, splits the data into
intervals via ``common.calc_intervals`` and writes histograms, line plots and
a ``stats.txt`` summary to ``<folder>/output/posthist/``.
"""
import os
import shlex
import sys
from collections import Counter, defaultdict
from datetime import timedelta

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from common import calc_intervals, IMAGE_MAGICK, DAYS_NEW_USER, FIG_SIZE
from loader import load, dmt
from sentiments import readtoxleveltxt

colors = ['red', 'green', 'blue', 'orange', 'deeppink']


def _save_post_histogram(histfilename, histdata, from_label, to_label):
    """Save the histogram of posts-per-user for one interval as ``histfilename``.png."""
    fig = plt.figure(figsize=FIG_SIZE)
    # One bin per integer post count. The last bin of a histogram is closed on
    # both sides, so the edges must run one past the maximum; otherwise the two
    # highest counts would share a bin. This also keeps the bins valid
    # (two edges) when the interval has no posts at all.
    plt.hist(histdata, range(max(histdata, default=0) + 2))
    plt.yscale('log')
    # NOTE(review): a bottom of 0 cannot be represented on a log axis - confirm intent.
    plt.ylim(bottom=0)
    plt.xlabel("#posts")
    plt.ylabel("#users with X posts")
    fig.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.title("Histogram for user post count between " + from_label + " and " + to_label)
    fig.savefig(histfilename + ".png", bbox_inches='tight')
    plt.close(fig)


def _save_line_plot(filename, series, ylabel, title, log_scale=False, bottom=None, legend=True):
    """Plot one line per entry of ``series`` over time and save the figure.

    ``series`` is a list of ``(points, label)`` pairs, where ``points`` is a
    list of ``((date_from, date_to), value)`` tuples; the interval start is
    used as the x coordinate.
    """
    fig = plt.figure(figsize=FIG_SIZE)
    for points, label in series:
        plt.plot([interval[0] for interval, _ in points],
                 [value for _, value in points],
                 label=label)
    plt.xlabel('time')
    plt.ylabel(ylabel)
    if log_scale:
        plt.yscale('log')
    if bottom is not None:
        plt.ylim(bottom=bottom)
    plt.title(title)
    if legend:
        plt.legend(loc="upper right")
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)


def main(folder, intervl):
    """Compute per-interval statistics for ``folder`` and write plots and stats.

    :param folder: data set folder; results go to ``<folder>/output/posthist/``
        and sentiments are read from ``<folder>/output/sentiments.txt``.
    :param intervl: interval length handed to ``calc_intervals`` (the command
        line restricts it to 1-12; presumably months - confirm in ``common``).
    """
    users, posts, firstcontrib, _sumcontrib = load(folder)
    intervals = calc_intervals(posts, intervl)
    (_, cachedsentiments) = readtoxleveltxt(folder + "/output/sentiments.txt")

    outputdir = folder + "/output/posthist/"
    # Create the directory without going through a shell, so folder names
    # containing spaces or shell metacharacters are handled correctly.
    os.makedirs(outputdir, exist_ok=True)

    new_user_window = timedelta(days=DAYS_NEW_USER)

    activeusercounts = []
    newusercounts = []
    answerstonewusers = []
    sentimentstonewusers = []
    activitynewusers = []
    questionsininterval = []
    answersininterval = []
    histogramfiles = []

    for (option_date_from, option_date_to) in intervals:
        interval = (option_date_from, option_date_to)
        from_label = option_date_from.strftime("%d-%m-%Y")
        to_label = option_date_to.strftime("%d-%m-%Y")
        print(from_label, to_label)

        # questions created in [option_date_from, option_date_to)
        newposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to,
                                     "filtering posts by date").getresults()
        questionsininterval.append((interval, len(newposts)))

        # answers created in [option_date_from, option_date_to), flattened over all questions
        newanswers = dmt(posts).map(lambda p: [a for a in p['Answers']
                                               if option_date_from <= a['CreationDate'] < option_date_to],
                                    "filtering answers by date") \
            .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []).getresults()
        answersininterval.append((interval, len(newanswers)))

        # number of questions per user in this interval
        postcounts = Counter(p['OwnerUserId'] for p in newposts)
        activeusercounts.append((interval, len(postcounts)))
        newusercounts.append((interval,
                              len([u for u in users
                                   if option_date_from <= u['CreationDate'] < option_date_to])))

        # Activity of new users: questions and answers written within
        # DAYS_NEW_USER days of the author's first contribution.
        activitynewusersinmonth = defaultdict(int)
        for p in newposts:
            if firstcontrib[p['OwnerUserId']] + new_user_window > p['CreationDate']:
                activitynewusersinmonth[p['OwnerUserId']] += 1
        # Only answers of the current interval count, and each answer is
        # credited to its own author (previously every answer of every post
        # was counted in every interval and credited to the question owner).
        for a in newanswers:
            if firstcontrib[a['OwnerUserId']] + new_user_window > a['CreationDate']:
                activitynewusersinmonth[a['OwnerUserId']] += 1
        if activitynewusersinmonth:
            averageactivity = sum(activitynewusersinmonth.values()) / len(activitynewusersinmonth)
        else:
            averageactivity = float("nan")
        activitynewusers.append((interval, averageactivity))

        # post histogram for this interval
        histfilename = (outputdir + "posthist_" + folder.split("/")[-1] + "_"
                        + from_label + "_" + to_label + "-i" + str(intervl))
        _save_post_histogram(histfilename, list(postcounts.values()), from_label, to_label)
        histogramfiles.append(histfilename + ".png")

        # answers to new users
        answers = (dmt(posts).map(lambda q: [a for a in q['Answers']
                                             # answer in interval
                                             if option_date_from <= a['CreationDate'] < option_date_to
                                             # question created within DAYS_NEW_USER days of 1st contribution
                                             and firstcontrib[q['OwnerUserId']] + new_user_window > q['CreationDate']
                                             # answer created within DAYS_NEW_USER days of the question
                                             and q['CreationDate'] + new_user_window > a['CreationDate']])
                   .getresults())
        answerstonewusers.append((interval, sum(len(al) for al in answers)))

        # classify the cached sentiment of each such answer by its compound score
        sent = [cachedsentiments[a['Id']] for al in answers for a in al]
        sentbad = len([1 for s in sent if s['compound'] < -0.05])
        sentneu = len([1 for s in sent if -0.05 <= s['compound'] <= 0.05])
        sentgood = len([1 for s in sent if s['compound'] > 0.05])
        sentimentstonewusers.append((interval, (sent, sentbad, sentneu, sentgood)))

    # gen pdf for post histograms; paths are quoted so the shell sees each as one argument
    if histogramfiles:
        pdfpath = outputdir + "posthist-i" + str(intervl) + ".pdf"
        os.system(" ".join([IMAGE_MAGICK] + [shlex.quote(f) for f in histogramfiles + [pdfpath]]))

    suffix = "-i" + str(intervl) + ".png"

    # plot posts diagram
    _save_line_plot(outputdir + "activeusers" + suffix,
                    [(activeusercounts, "active users"),
                     (newusercounts, 'newly registered users')],
                    '#users', "Active users", log_scale=True)

    # plot answers to new users diagram
    _save_line_plot(outputdir + "answerstonewusers" + suffix,
                    [(answerstonewusers, None)],
                    '#answers per question of a new user', "Answers to new users",
                    log_scale=True, legend=False)

    # plot sentiments of answers to new users diagram
    _save_line_plot(outputdir + "sentimentstonewusers" + suffix,
                    [([(iv, bad) for iv, (_, bad, _, _) in sentimentstonewusers], "Neg. answer"),
                     ([(iv, neu) for iv, (_, _, neu, _) in sentimentstonewusers], "Neu. answer"),
                     ([(iv, good) for iv, (_, _, _, good) in sentimentstonewusers], "Pos. answer")],
                    'sentiment', "Sentiments of answers to new users",
                    log_scale=True, bottom=1)

    # plot activity for new users
    _save_line_plot(outputdir + "activitynewusers" + suffix,
                    [(activitynewusers, "activity")],
                    '#questions or answers created by a new user',
                    "Average activity per new user")

    # plot number of questions and answers per interval
    _save_line_plot(outputdir + "postsanswers" + suffix,
                    [(questionsininterval, "questions"),
                     (answersininterval, "answer")],
                    'quantity', "Questions and answers per interval")

    # print data set stats
    useridmapping = {u['Id']: u for u in users}
    newuserposts = dmt(posts).filter(
        lambda q: q['CreationDate'] < useridmapping[q['OwnerUserId']]['CreationDate'] + new_user_window
    ).getresults()
    newuserlist = {q['OwnerUserId'] for q in newuserposts}
    # Guard against data sets without intervals or without new-user questions.
    lastactive = activeusercounts[-1] if activeusercounts else None
    questionspernewuser = len(newuserposts) / len(newuserlist) if newuserlist else float("nan")

    stats = "".join([
        "users: " + str(len(users)) + "\n",
        "questions: " + str(len(posts)) + "\n",
        "answers: " + str(sum(dmt(posts).map(lambda q: len(q['Answers'])).getresults())) + "\n",
        "active user last month: " + str(lastactive) + "\n",
        "questions from new users: " + str(len(newuserposts)) + "\n",
        "questions from new users/new user: " + str(questionspernewuser) + "\n",
    ])

    print(stats)
    with open(outputdir + "/stats.txt", "w") as file:
        file.write(stats)


if __name__ == "__main__":
    # execute only if run as a script
    usage = "usage: " + sys.argv[0] + " <folder> [-i<interval>]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(folder + " is not a folder")
        sys.exit(1)

    interval = 3  # default when no -i option is given
    if len(sys.argv) >= 3:
        if not sys.argv[2].startswith("-i"):
            print("unknown parameter: " + sys.argv[2])
            sys.exit(1)
        try:
            interval = int(sys.argv[2][2:])
        except ValueError:
            print("-i: int required")
            sys.exit(1)
        if interval < 1 or interval > 12:
            print("-i: only 1 - 12")
            sys.exit(1)

    main(folder, interval)