This commit is contained in:
wea_ondara
2019-07-29 21:34:34 +02:00
parent 1f699f6b56
commit a14b3af21a
6 changed files with 309 additions and 87 deletions

View File

@@ -1,5 +1,7 @@
import html
import multiprocessing
import operator
import re
import time
import xml.etree.cElementTree as et
from collections import defaultdict
@@ -7,11 +9,13 @@ from datetime import datetime
from mt import mt
# Matches a single markup tag: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')
def printnoln(text):
    """Print ``text`` without a trailing newline and flush stdout immediately.

    :param text: the object to print (anything ``print`` accepts).
    :return: ``None``.
    """
    # A real ``def`` (instead of a lambda bound to a name) gives the function
    # a proper ``__name__`` for tracebacks and profiling output.
    return print(text, end='', flush=True)
def rprint(text):
    """Print ``text`` preceded by a carriage return and followed by a newline.

    The leading ``'\\r'`` moves the cursor back to the start of the current
    line before ``text`` is written.

    :param text: the string to print; it is concatenated to ``'\\r'``, so a
        non-string argument raises ``TypeError``.
    :return: ``None``.
    """
    # A real ``def`` (instead of a lambda bound to a name) gives the function
    # a proper ``__name__`` for tracebacks and profiling output.
    return print('\r' + text)
def dmt(data, progressinterval=1000):
    """Call ``mt`` on ``data`` using this machine's CPU count.

    Thin convenience wrapper around ``mt`` (imported from the ``mt`` module).
    It forwards ``multiprocessing.cpu_count()`` as the first argument
    (presumably the worker count -- confirm against ``mt``), then ``data``,
    a fixed ``False`` flag whose meaning is defined by ``mt``, and
    ``progressinterval``.

    :param data: the data handed to ``mt`` unchanged.
    :param progressinterval: forwarded to ``mt`` as its fourth argument;
        defaults to 1000 so existing ``dmt(data)`` callers keep working.
    :return: whatever ``mt`` returns.
    """
    # Single definition: the earlier one-argument ``dmt`` was immediately
    # shadowed by this one and therefore dead code.
    return mt(multiprocessing.cpu_count(), data, False, progressinterval)
def cms():
    """Return the current wall-clock time as integer milliseconds.

    The value is ``time.time()`` scaled to milliseconds and rounded to the
    nearest whole number.
    """
    millis = round(time.time() * 1000)
    return int(millis)
@@ -75,6 +79,7 @@ def mapQuestion(item):
question = {tag: getTag(item, tag) for tag in tags}
for tag in datetags:
question[tag] = datetime.fromisoformat(question[tag])
question['Body'] = removetags(html.unescape(question['Body']))
return question
@@ -84,6 +89,7 @@ def mapAnswer(item):
answer = {tag: getTag(item, tag) for tag in tags}
for tag in datetags:
answer[tag] = datetime.fromisoformat(answer[tag])
answer['Body'] = removetags(html.unescape(answer['Body']))
return answer
@@ -93,6 +99,7 @@ def mapComment(item):
comment = {tag: getTag(item, tag) for tag in tags}
for tag in datetags:
comment[tag] = datetime.fromisoformat(comment[tag])
comment['Body'] = removetags(html.unescape(comment['Body']))
return comment
@@ -201,3 +208,7 @@ def tagExists(item, tag):
def setprop(dic, key, value):
    """Store ``value`` under ``key`` in ``dic`` and return ``dic``.

    The container is modified in place; the same object is returned so the
    call can be used inside an expression.

    :param dic: any object supporting item assignment (``dic[key] = value``).
    :param key: the key (or index) to assign.
    :param value: the value to store.
    :return: the same ``dic`` object that was passed in.
    """
    dic[key] = value  # in-place update, no copy is made
    return dic
def removetags(text):
    """Return ``text`` with every match of ``TAG_RE`` removed.

    Each substring matched by the module-level ``TAG_RE`` pattern is replaced
    by the empty string; all other characters are left untouched.

    :param text: the string to strip.
    :return: a new string without the matched tags.
    """
    stripped = TAG_RE.sub('', text)
    return stripped