diff --git a/analyze_batch.py b/analyze_batch.py index 6d100ad..6dde225 100644 --- a/analyze_batch.py +++ b/analyze_batch.py @@ -72,7 +72,7 @@ def main(folder, intervl): userid = post['OwnerUserId'] # check first contribution - if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) < post['CreationDate']: + if firstcontrib[userid] + timedelta(days=DAYS_NEW_USER) <= post['CreationDate']: continue sortedposts[userid].append(post) @@ -90,11 +90,11 @@ def main(folder, intervl): for (i, post) in enumerate(filteredposts): printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts))) for a in post['Answers']: - if a['Id'] in cachedsentiments.keys(): - toxlevel = cachedsentiments[a['Id']] - else: - print("Sentiment not found for " + a['Id']) - continue + # if a['Id'] in cachedsentiments.keys(): + toxlevel = cachedsentiments[a['Id']] + # else: + # print("Sentiment not found for " + a['Id']) + # continue toxlevels.append(toxlevel) printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...") @@ -178,10 +178,10 @@ def main(folder, intervl): for (i, post) in enumerate(filteredposts): printnoln("\rcomputing toxic levels: post " + str(i + 1) + "/" + str(len(filteredposts))) for a in post['Answers']: - if a['Id'] in cachedsentiments.keys(): - toxlevel = cachedsentiments[a['Id']] - else: - print("Sentiment not found for " + a['Id']) + # if a['Id'] in cachedsentiments.keys(): + toxlevel = cachedsentiments[a['Id']] + # else: + # print("Sentiment not found for " + a['Id']) toxlevels.append(toxlevel) printnoln("\rcomputing toxic levels: post " + str(len(filteredposts)) + "/" + str(len(filteredposts)) + " ... plotting ...") avgsent[0].append(np.mean([s['compound'] for s in toxlevels]) if len(toxlevels) == 0 else 0) diff --git a/calctoxdiff.py b/calctoxdiff.py index c0fe678..203cfce 100644 --- a/calctoxdiff.py +++ b/calctoxdiff.py @@ -59,8 +59,8 @@ def g(srcfile, outputdir, intervals): plt.ylabel("pvalue") plt.legend(loc="upper right") plt.savefig(outputdir + "/ks_averagesentiments_pval.png", bbox_inches='tight') - plt.close(fig) + fig = plt.figure(figsize=(16, 12)) for i in range(len(single)): plt.plot([iv[0] for iv in intervals], [s if isinstance(s, float) else s.statistic for s in single[i]], label=str(i + 1) + " posts - most posters") diff --git a/its.py b/its.py index 787a20c..88047ee 100644 --- a/its.py +++ b/its.py @@ -42,8 +42,8 @@ def main(folder, intervl): print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) # avg sentiments filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound'] - for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to - and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']]) + for a in p['Answers'] if option_date_from <= p['CreationDate'] < option_date_to + and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > a['CreationDate']]) .filter(lambda p: p != []) .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []) .getresults()) diff --git a/loader.py b/loader.py index b7ca843..c2ad1d3 100644 --- a/loader.py +++ b/loader.py @@ -72,15 +72,21 @@ def readVotes(folder): print(prefix + "done") return votes + VOTE_TAGS = ['PostId', 'VoteTypeId', 'CreationDate'] +VOTE_DTAGS = ['CreationDate'] +VOTE_ITAGS = ['PostId', 'VoteTypeId'] + + def mapvote(item): - datetags = ['CreationDate'] vote = {tag: getTag(item, tag) for tag in VOTE_TAGS} - for tag in datetags: + for tag in VOTE_DTAGS: if vote[tag] is not None: vote[tag] = datetime.fromisoformat(vote[tag]) else: print("map vote: tag " + tag + " is None: " + str(vote)) + for tag in VOTE_ITAGS: + vote[tag] = int(vote[tag]) return vote @@ -112,8 +118,12 @@ def computefirstcontrib(posts): firstcontrib = {id: min(ldate) for (id, ldate) in firstcontrib.items()} return firstcontrib + USER_TAGS = ['Id', 'CreationDate'] USER_DTAGS = ['CreationDate'] +USER_ITAGS = ['Id'] + + def mapuser(item): user = {tag: getTag(item, tag) for tag in USER_TAGS} for tag in USER_DTAGS: @@ -121,24 +131,38 @@ def mapuser(item): user[tag] = datetime.fromisoformat(user[tag]) else: print("map user: tag " + tag + " is None: " + str(user)) + for tag in USER_ITAGS: + user[tag] = int(user[tag]) return user + Q_TAGS = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score'] Q_DTAGS = ['CreationDate'] +Q_ITAGS = ['Id', 'OwnerUserId', 'Score'] Q_BODY = 'Body' + + def mapQuestion(item): question = {tag: getTag(item, tag) for tag in Q_TAGS} for tag in Q_DTAGS: question[tag] = datetime.fromisoformat(question[tag]) + for tag in Q_ITAGS: + question[tag] = int(question[tag]) if question[tag] is not None else None question[Q_BODY] = removetags(html.unescape(question[Q_BODY])) return question + A_TAGS = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score'] A_DTAGS = ['CreationDate'] +A_ITAGS = ['Id', 'ParentId', 'OwnerUserId', 'Score'] + + def mapAnswer(item): answer = {tag: getTag(item, tag) for tag in A_TAGS} for tag in A_DTAGS: answer[tag] = datetime.fromisoformat(answer[tag]) + for tag in A_ITAGS: + answer[tag] = int(answer[tag]) if answer[tag] is not None else None answer['Body'] = removetags(html.unescape(answer['Body'])) return answer diff --git a/posthist.py b/posthist.py index c17ef05..ee36c24 100644 --- a/posthist.py +++ b/posthist.py @@ -43,7 +43,7 @@ def main(folder, intervl): postcounts = {id: len(pc) for (id, pc) in postcounts.items()} activeusercounts.append(((option_date_from, option_date_to), len(postcounts.keys()))) - activitynewusersinmonth = defaultdict(int) + activitynewusersinmonth = defaultdict(int) # TODO match month exactly for p in newposts: if firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) > p['CreationDate']: activitynewusersinmonth[p['OwnerUserId']] += 1 diff --git a/sentiments.py b/sentiments.py index 628fd64..9791f47 100644 --- a/sentiments.py +++ b/sentiments.py @@ -101,7 +101,7 @@ def readtoxleveltxt(filename): ra = line.split(";") ra = [l.split(":") for l in ra] # print("i1: " + str(ra[0:5])) - ra = {id: {"neg": float(neg), "neu": float(neu), "pos": float(pos), "compound": float(compound)} for [id, neg, neu, pos, compound] in ra} + ra = {int(id): {"neg": float(neg), "neu": float(neu), "pos": float(pos), "compound": float(compound)} for [id, neg, neu, pos, compound] in ra} # print("i1: " + str(ra)[0:500]) return rq, ra diff --git a/votes.py b/votes.py index 4399e9e..36cc8fb 100644 --- a/votes.py +++ b/votes.py @@ -36,12 +36,12 @@ def main(folder, intervl): print(option_date_from.strftime("%d-%m-%Y") + " to " + option_date_to.strftime("%d-%m-%Y")) # avg sentiments scores = (dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to - and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate']) - .map(lambda p: int(p['Score'])) + and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) >= p['CreationDate']) + .map(lambda p: p['Score']) .getresults()) filtered = (dmt(posts).map(lambda p: [cachedsentiments[a['Id']]['compound'] - for a in p['Answers'] if option_date_from <= a['CreationDate'] < option_date_to - and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= a['CreationDate']]) + for a in p['Answers'] if option_date_from <= p['CreationDate'] < option_date_to + and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) >= a['CreationDate']]) .filter(lambda p: p != []) .reduce(lambda a, b: a + b, lambda a, b: a + b, lambda: []) .getresults()) @@ -102,10 +102,10 @@ def main(folder, intervl): if option_date_to > interval[1]: continue intervalposts = dmt(posts).filter(lambda p: option_date_from <= p['CreationDate'] < option_date_to - and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) <= p['CreationDate']).getresults() + and firstcontrib[p['OwnerUserId']] + timedelta(days=DAYS_NEW_USER) >= p['CreationDate']).getresults() intervalpostsids = set(dmt(intervalposts).map(lambda p: p['Id']).getresults()) intervalvotes = dmt(ivvotes).filter(lambda v: v['PostId'] in intervalpostsids).getresults() - intervalscore = sum(dmt(intervalvotes).map(lambda v: 1 if v['VoteTypeId'] == "2" else (-1 if v['VoteTypeId'] == "3" else 0)).getresults()) + intervalscore = sum(dmt(intervalvotes).map(lambda v: 1 if v['VoteTypeId'] == 2 else (-1 if v['VoteTypeId'] == 3 else 0)).getresults()) intervalscore = intervalscore / len(intervalpostsids) if len(intervalpostsids) != 0 else float("nan") scores.append(((option_date_from, option_date_to), intervalscore)) # if all(str(score) == "nan" for iv, score in scores)