-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathutils.py
63 lines (47 loc) · 1.94 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os, re
def header_info(xml_path):
    """Print the first lines of the file at ``xml_path`` via the ``head`` utility.

    The path is handed to ``head`` as a single argument (no shell is
    involved), so spaces or shell metacharacters in the path are treated
    literally instead of being interpreted as shell syntax.

    Args:
        xml_path: Path of the file to preview.

    Returns:
        None. The output of ``head`` goes straight to this process's stdout.
    """
    import subprocess

    # "--" ends option parsing so a path starting with "-" is not read as a flag.
    # check=False: a failing `head` (e.g. missing file) only reports on stderr.
    subprocess.run(["head", "--", str(xml_path)], check=False)
def handle_unicode_errors(txt):
    """Round-trip ``txt`` through UTF-8, replacing characters that cannot be encoded.

    Characters that UTF-8 cannot encode are substituted by the ``'replace'``
    error handler during encoding; the resulting bytes are then decoded back
    into a string.
    """
    encoded = txt.encode('utf-8', 'replace')
    cleaned = encoded.decode()
    return cleaned
def is_question(elem_attribs):
    """Return True when the element's ``PostTypeId`` attribute equals ``"1"``.

    A ``PostTypeId`` of ``None`` (or any other value) yields False.
    """
    post_type = elem_attribs["PostTypeId"]
    if post_type is None:
        return False
    return bool(post_type == "1")
def is_answer(elem_attribs):
    """Return True when the element's ``PostTypeId`` attribute equals ``"2"``.

    A ``PostTypeId`` of ``None`` (or any other value) yields False.
    """
    post_type = elem_attribs["PostTypeId"]
    if post_type is None:
        return False
    return bool(post_type == "2")
def filter_newlines(text):
    """Collapse every run of three or more newlines in ``text`` down to two."""
    newline_runs = re.compile("\n{3,}")
    return newline_runs.sub("\n\n", text)
def is_accepted_answer(a_attribs, q_attribs):
    """Return whether the answer is the accepted answer of the question.

    Args:
        a_attribs: Attribute mapping of the answer; its ``"Id"`` is compared.
        q_attribs: Attribute mapping of the question; its
            ``"AcceptedAnswerId"`` is compared.

    Returns:
        True when the question's ``AcceptedAnswerId`` is not None and equals
        the answer's ``Id``; False in every other case (always a bool, never
        an implicit None).

    Raises:
        AssertionError: if ``q_attribs`` is not a question or ``a_attribs``
            is not an answer.
    """
    # Kept as asserts so callers still see AssertionError on misuse.
    assert is_question(q_attribs), "Must be a question to have an accepted answer"
    assert is_answer(a_attribs), "Must be an answer to be an accepted answer"
    accepted_id = q_attribs["AcceptedAnswerId"]
    if accepted_id is None:
        # No accepted answer recorded; the answer's Id is not consulted.
        return False
    return bool(accepted_id == a_attribs["Id"])
def has_answers(elem_attribs):
    """Return True when the question's ``AnswerCount`` converts to a non-zero int.

    An ``AnswerCount`` of ``None`` yields False. Raises AssertionError when
    ``elem_attribs`` is not a question.
    """
    assert is_question(elem_attribs), "Must be a question to have answers"
    answer_count = elem_attribs["AnswerCount"]
    if answer_count is None:
        return False
    return int(answer_count) != 0
def trim_attribs(elem_attribs, attrib_type="question"):
    """Strip non-useful data from an attribs dict and return what remains.

    Args:
        elem_attribs: Attribute mapping of a question or an answer.
        attrib_type: ``"question"`` (default) or ``"answer"``.

    Returns:
        For ``"question"``: ``elem_attribs`` itself, trimmed IN PLACE to the
        kept keys and extended with ``"ParsedAnswers"`` (0) and ``"Answers"``
        (empty dict).
        For ``"answer"``: a NEW dict holding only ``Id``, ``Body`` and
        ``Score``; ``elem_attribs`` is left unmodified.

    Raises:
        ValueError: if ``attrib_type`` is neither ``"question"`` nor
            ``"answer"``.
    """
    if attrib_type == "question":
        to_keep = ('Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId')
        # Snapshot the keys to drop first: a dict cannot be resized while iterating it.
        to_delete = [key for key in elem_attribs if key not in to_keep]
        for key in to_delete:
            elem_attribs.pop(key, None)
        elem_attribs["ParsedAnswers"] = 0
        elem_attribs["Answers"] = {}
        # Return the trimmed mapping so both branches honour "returns remaining".
        return elem_attribs
    if attrib_type == "answer":
        to_keep = ('Id', 'Body', 'Score')
        return {key: elem_attribs[key] for key in to_keep}
    raise ValueError('Unrecognized attribute type - please specify either question or answer')