From c2695e0d49a35c4612b6357ca62b14235f190438 Mon Sep 17 00:00:00 2001 From: wea_ondara Date: Mon, 27 Jan 2020 12:17:42 +0100 Subject: [PATCH] wip --- loader.py | 67 ++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/loader.py b/loader.py index 100c2b7..b7ca843 100644 --- a/loader.py +++ b/loader.py @@ -72,11 +72,10 @@ def readVotes(folder): print(prefix + "done") return votes - +VOTE_TAGS = ['PostId', 'VoteTypeId', 'CreationDate'] def mapvote(item): - tags = ['PostId', 'VoteTypeId', 'CreationDate'] datetags = ['CreationDate'] - vote = {tag: getTag(item, tag) for tag in tags} + vote = {tag: getTag(item, tag) for tag in VOTE_TAGS} for tag in datetags: if vote[tag] is not None: vote[tag] = datetime.fromisoformat(vote[tag]) @@ -113,47 +112,45 @@ def computefirstcontrib(posts): firstcontrib = {id: min(ldate) for (id, ldate) in firstcontrib.items()} return firstcontrib - +USER_TAGS = ['Id', 'CreationDate'] +USER_DTAGS = ['CreationDate'] def mapuser(item): - tags = ['Id', 'CreationDate'] - datetags = ['CreationDate'] - user = {tag: getTag(item, tag) for tag in tags} - for tag in datetags: + user = {tag: getTag(item, tag) for tag in USER_TAGS} + for tag in USER_DTAGS: if user[tag] is not None: user[tag] = datetime.fromisoformat(user[tag]) else: print("map user: tag " + tag + " is None: " + str(user)) return user - +Q_TAGS = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score'] +Q_DTAGS = ['CreationDate'] +Q_BODY = 'Body' def mapQuestion(item): - tags = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score'] - datetags = ['CreationDate'] - question = {tag: getTag(item, tag) for tag in tags} - for tag in datetags: + question = {tag: getTag(item, tag) for tag in Q_TAGS} + for tag in Q_DTAGS: question[tag] = datetime.fromisoformat(question[tag]) - question['Body'] = removetags(html.unescape(question['Body'])) + question[Q_BODY] = removetags(html.unescape(question[Q_BODY])) return question - +A_TAGS = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score'] +A_DTAGS = ['CreationDate'] def mapAnswer(item): - tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score'] - datetags = ['CreationDate'] - answer = {tag: getTag(item, tag) for tag in tags} - for tag in datetags: + answer = {tag: getTag(item, tag) for tag in A_TAGS} + for tag in A_DTAGS: answer[tag] = datetime.fromisoformat(answer[tag]) answer['Body'] = removetags(html.unescape(answer['Body'])) return answer -def mapComment(item): - tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId'] - datetags = ['CreationDate'] - comment = {tag: getTag(item, tag) for tag in tags} - for tag in datetags: - comment[tag] = datetime.fromisoformat(comment[tag]) - comment['Body'] = removetags(html.unescape(comment['Body'])) - return comment +# def mapComment(item): +# tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId'] +# datetags = ['CreationDate'] +# comment = {tag: getTag(item, tag) for tag in tags} +# for tag in datetags: +# comment[tag] = datetime.fromisoformat(comment[tag]) +# comment['Body'] = removetags(html.unescape(comment['Body'])) +# return comment def readUsers(file): @@ -240,14 +237,14 @@ def readAnswers(items): return answers -def readComments(items): - prefix = "readComments: " - comments = dmt(items).filter(lambda item: getTag(item, 'PostTypeId') == "3", prefix + "filter out comments") \ - .map(mapComment, prefix + "mapping comments") \ - .filter(lambda c: c['OwnerUserId'] is not None, prefix + "filter out broken comments").getresults() - - print(prefix + "comments read: " + str(len(comments))) - return comments +# def readComments(items): +# prefix = "readComments: " +# comments = dmt(items).filter(lambda item: getTag(item, 'PostTypeId') == "3", prefix + "filter out comments") \ +# .map(mapComment, prefix + "mapping comments") \ +# .filter(lambda c: c['OwnerUserId'] is not None, prefix + "filter out broken comments").getresults() +# +# print(prefix + "comments read: " + str(len(comments))) +# return comments def getTag(item, tag):