This commit is contained in:
wea_ondara
2020-01-27 12:17:42 +01:00
parent da8896eadd
commit c2695e0d49

View File

@@ -72,11 +72,10 @@ def readVotes(folder):
print(prefix + "done") print(prefix + "done")
return votes return votes
VOTE_TAGS = ['PostId', 'VoteTypeId', 'CreationDate']
def mapvote(item): def mapvote(item):
tags = ['PostId', 'VoteTypeId', 'CreationDate']
datetags = ['CreationDate'] datetags = ['CreationDate']
vote = {tag: getTag(item, tag) for tag in tags} vote = {tag: getTag(item, tag) for tag in VOTE_TAGS}
for tag in datetags: for tag in datetags:
if vote[tag] is not None: if vote[tag] is not None:
vote[tag] = datetime.fromisoformat(vote[tag]) vote[tag] = datetime.fromisoformat(vote[tag])
@@ -113,47 +112,45 @@ def computefirstcontrib(posts):
firstcontrib = {id: min(ldate) for (id, ldate) in firstcontrib.items()} firstcontrib = {id: min(ldate) for (id, ldate) in firstcontrib.items()}
return firstcontrib return firstcontrib
# Attributes copied from a user row, and the subset of them that is parsed
# into datetime objects.
USER_TAGS = ['Id', 'CreationDate']
USER_DTAGS = ['CreationDate']


def mapuser(item):
    """Build a user dict from ``item``.

    Every attribute named in USER_TAGS is read with ``getTag``. Attributes
    named in USER_DTAGS are then converted with ``datetime.fromisoformat``;
    when such an attribute is None it is left as None and a diagnostic line
    is printed instead.

    Returns the resulting dict, keyed by attribute name.
    """
    user = {}
    for tag in USER_TAGS:
        user[tag] = getTag(item, tag)

    for tag in USER_DTAGS:
        raw = user[tag]
        if raw is None:
            # Keep the None in place; only report it.
            print(f"map user: tag {tag} is None: {user}")
            continue
        user[tag] = datetime.fromisoformat(raw)
    return user
# Attributes copied from a question row. Q_DTAGS are parsed into datetime
# objects; Q_BODY is HTML-unescaped and passed through removetags.
Q_TAGS = ['Id', 'CreationDate', 'Body', 'Title', 'OwnerUserId', 'OwnerDisplayName', 'Score']
Q_DTAGS = ['CreationDate']
Q_BODY = 'Body'


def mapQuestion(item):
    """Build a question dict from ``item``.

    Every attribute named in Q_TAGS is read with ``getTag``. Date attributes
    are converted with ``datetime.fromisoformat`` and the body is
    HTML-unescaped and then passed through ``removetags``.

    A date attribute or body that is None is left as None rather than being
    handed to ``fromisoformat`` / ``html.unescape`` (both raise TypeError on
    None); a missing date is additionally reported on stdout, matching the
    handling in ``mapuser``.
    NOTE(review): assumes getTag yields None for an absent attribute, as the
    existing None check in mapuser suggests - confirm.

    Returns the resulting dict, keyed by attribute name.
    """
    question = {tag: getTag(item, tag) for tag in Q_TAGS}
    for tag in Q_DTAGS:
        if question[tag] is not None:
            question[tag] = datetime.fromisoformat(question[tag])
        else:
            print("map question: tag " + tag + " is None: " + str(question))
    if question[Q_BODY] is not None:
        question[Q_BODY] = removetags(html.unescape(question[Q_BODY]))
    return question
# Attributes copied from an answer row. A_DTAGS are parsed into datetime
# objects; A_BODY is HTML-unescaped and passed through removetags.
A_TAGS = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId', 'Score']
A_DTAGS = ['CreationDate']
A_BODY = 'Body'


def mapAnswer(item):
    """Build an answer dict from ``item``.

    Every attribute named in A_TAGS is read with ``getTag``. Date attributes
    are converted with ``datetime.fromisoformat`` and the body is
    HTML-unescaped and then passed through ``removetags``.

    A date attribute or body that is None is left as None rather than being
    handed to ``fromisoformat`` / ``html.unescape`` (both raise TypeError on
    None); a missing date is additionally reported on stdout, matching the
    handling in ``mapuser``.
    NOTE(review): assumes getTag yields None for an absent attribute, as the
    existing None check in mapuser suggests - confirm.

    Returns the resulting dict, keyed by attribute name.
    """
    answer = {tag: getTag(item, tag) for tag in A_TAGS}
    for tag in A_DTAGS:
        if answer[tag] is not None:
            answer[tag] = datetime.fromisoformat(answer[tag])
        else:
            print("map answer: tag " + tag + " is None: " + str(answer))
    if answer[A_BODY] is not None:
        answer[A_BODY] = removetags(html.unescape(answer[A_BODY]))
    return answer
def mapComment(item): # def mapComment(item):
tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId'] # tags = ['Id', 'ParentId', 'CreationDate', 'Body', 'OwnerUserId']
datetags = ['CreationDate'] # datetags = ['CreationDate']
comment = {tag: getTag(item, tag) for tag in tags} # comment = {tag: getTag(item, tag) for tag in tags}
for tag in datetags: # for tag in datetags:
comment[tag] = datetime.fromisoformat(comment[tag]) # comment[tag] = datetime.fromisoformat(comment[tag])
comment['Body'] = removetags(html.unescape(comment['Body'])) # comment['Body'] = removetags(html.unescape(comment['Body']))
return comment # return comment
def readUsers(file): def readUsers(file):
@@ -240,14 +237,14 @@ def readAnswers(items):
return answers return answers
def readComments(items): # def readComments(items):
prefix = "readComments: " # prefix = "readComments: "
comments = dmt(items).filter(lambda item: getTag(item, 'PostTypeId') == "3", prefix + "filter out comments") \ # comments = dmt(items).filter(lambda item: getTag(item, 'PostTypeId') == "3", prefix + "filter out comments") \
.map(mapComment, prefix + "mapping comments") \ # .map(mapComment, prefix + "mapping comments") \
.filter(lambda c: c['OwnerUserId'] is not None, prefix + "filter out broken comments").getresults() # .filter(lambda c: c['OwnerUserId'] is not None, prefix + "filter out broken comments").getresults()
#
print(prefix + "comments read: " + str(len(comments))) # print(prefix + "comments read: " + str(len(comments)))
return comments # return comments
def getTag(item, tag): def getTag(item, tag):