-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathold_methods.py
147 lines (110 loc) · 4.98 KB
/
old_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def extract_footnotes(docx_file):
    """Flatten the footnotes of a .docx file into a single list.

    Args:
        docx_file (str): Path to the .docx file.

    Returns:
        list: Every item found three levels deep inside the nested
        ``footnotes`` structure exposed by ``docx2python``, in iteration
        order.
    """
    parsed = docx2python(docx_file)
    # Wrap the nested footnote structure in an object array, then walk
    # three levels down and collect whatever sits at the third level.
    nested = np.array(parsed.footnotes, dtype=object)
    return [entry for outer in nested for middle in outer for entry in middle]
def get_footnotes_as_list(docx_file):
    """Return the footnotes of one .docx document in sorted order.

    Args:
        docx_file (str): Path to the .docx file.

    Returns:
        list: Values of the single ``Footnote`` column built from the
        extracted footnotes, sorted ascending.
    """
    rows = extract_footnotes(docx_file)
    # Passing the rows through a one-column frame is kept on purpose:
    # presumably it unwraps one-element rows into plain values (confirm
    # against the shape docx2python actually returns).
    frame = pd.DataFrame(rows, columns=['Footnote'])
    return sorted(frame['Footnote'].tolist())
def get_base_footnotes_as_list():
    """Return the sorted footnotes of the base document.

    The document read is the one named by the module-level
    ``BASE_DOCUMENT`` constant.

    Returns:
        list: Sorted footnotes of the base document.
    """
    base_footnotes = get_footnotes_as_list(BASE_DOCUMENT)
    return base_footnotes
def get_variant_footnotes_as_list(docx_file):
    """Return the sorted footnotes of a variant document.

    Args:
        docx_file (str): File name of the variant .docx; it is joined onto
            ``DOCX_FILES_DIRECTORY`` to build the path that is read.

    Returns:
        list: Sorted footnotes of the variant document.
    """
    return get_footnotes_as_list(
        os.path.join(DOCX_FILES_DIRECTORY, docx_file))
def find_missing_footnotes(variant_footnotes_list, original_footnotes_list):
    """Return the footnotes that occur only in the variant document.

    Args:
        variant_footnotes_list (list): Footnotes from the variant document.
        original_footnotes_list (list): Footnotes from the original document.

    Returns:
        set: Footnotes present in the variant list but absent from the
        original list.
    """
    variant_only = set(variant_footnotes_list)
    in_original = set(original_footnotes_list)
    return variant_only - in_original
def find_additional_footnotes(original_footnotes_list, variant_footnotes_list):
    """Return the footnotes that occur only in the original document.

    Args:
        original_footnotes_list (list): Footnotes from the original document.
        variant_footnotes_list (list): Footnotes from the variant document.

    Returns:
        set: Footnotes present in the original list but absent from the
        variant list.
    """
    original_only = set(original_footnotes_list)
    in_variant = set(variant_footnotes_list)
    return original_only - in_variant
def create_csv_for_missing_footnotes():
    """Write one CSV per variant listing footnotes the base document lacks.

    For every name in ``docx_files`` the variant's footnotes are compared
    with the base document's, and those found only in the variant are
    written to ``./CSV/MissingFootnotes/Missing_Footnotes_<name>.csv``.

    Returns:
        None. A completion message is printed to stdout.
    """
    output_directory = "./CSV/MissingFootnotes"
    # Create the target folder up front so to_csv does not fail when the
    # folder has not been created by hand.
    os.makedirs(output_directory, exist_ok=True)
    # The base document is the same for every variant: parse it once
    # instead of once per loop iteration.
    original_footnotes_list = get_base_footnotes_as_list()
    for docx_file in docx_files:
        variant_footnotes_list = get_variant_footnotes_as_list(docx_file)
        missing_footnotes = find_missing_footnotes(
            variant_footnotes_list, original_footnotes_list)
        # A set iterates in arbitrary order; sort so the CSV rows are
        # reproducible from run to run.
        missing_footnotes_df = pd.DataFrame(sorted(missing_footnotes))
        missing_footnotes_df.to_csv(
            output_directory + "/Missing_Footnotes_" + docx_file + ".csv",
            sep=",")
    print("CSVs finished")
def create_csv_for_additional_footnotes():
    """Write one CSV per variant listing footnotes only the base document has.

    For every name in ``docx_files`` the base document's footnotes are
    compared with the variant's, and those found only in the base are
    written to
    ``./CSV/AdditionalFootnotes/Additional_Footnotes_<name>.csv``.

    Returns:
        None. A completion message is printed to stdout.
    """
    output_directory = "./CSV/AdditionalFootnotes"
    # Create the target folder up front so to_csv does not fail when the
    # folder has not been created by hand.
    os.makedirs(output_directory, exist_ok=True)
    # The base document is the same for every variant: parse it once
    # instead of once per loop iteration.
    original_footnotes_list = get_base_footnotes_as_list()
    for docx_file in docx_files:
        variant_footnotes_list = get_variant_footnotes_as_list(docx_file)
        additional_footnotes = find_additional_footnotes(
            original_footnotes_list, variant_footnotes_list)
        # A set iterates in arbitrary order; sort so the CSV rows are
        # reproducible from run to run.
        additional_footnotes_df = pd.DataFrame(sorted(additional_footnotes))
        additional_footnotes_df.to_csv(
            output_directory + "/Additional_Footnotes_" + docx_file + ".csv",
            sep=",")
    print("CSVs finished")
def create_footnotes_comparison_csv(docx_file):
    """Write a CSV comparing the base document's footnotes with a variant's.

    The CSV has three columns: the base ("original") footnotes, the
    variant footnotes, and a marker for each original footnote that is
    ``MATCH`` when the same footnote occurs in the variant and ``MISSING``
    otherwise. The file is written to
    ``./CSV/ComparisonCSV/Comparison_CSV<docx_file>.csv``.

    Args:
        docx_file (str): File name of the variant .docx document.

    Returns:
        The value returned by ``DataFrame.to_csv``, which is None when a
        file path is supplied.
    """
    variant_footnotes_list = get_variant_footnotes_as_list(docx_file)
    original_footnotes_list = get_base_footnotes_as_list()

    # One marker per original footnote. Comparing every original against
    # every variant would yield len(original) * len(variant) markers and
    # inflate the frame far beyond the footnote columns.
    variant_lookup = set(variant_footnotes_list)
    match_markers = [
        'MATCH' if footnote in variant_lookup else 'MISSING'
        for footnote in original_footnotes_list
    ]

    # Wrapping each column in a Series lets columns of different lengths
    # share one frame; the shorter ones are padded with NaN.
    comparison_df = pd.DataFrame({
        'Original Footnotes': pd.Series(original_footnotes_list),
        'Variant Footnotes': pd.Series(variant_footnotes_list),
        'Missing Footer?': pd.Series(match_markers),
    })

    output_directory = "./CSV/ComparisonCSV"
    # Create the target folder up front so to_csv does not fail when the
    # folder has not been created by hand.
    os.makedirs(output_directory, exist_ok=True)
    return comparison_df.to_csv(
        output_directory + "/Comparison_CSV" + docx_file + ".csv", sep=",")
def compare_base_document_with_variant():
    """Create the footnote comparison CSV for every file in ``docx_files``.

    Returns:
        None. A completion message is printed to stdout.
    """
    for variant_name in docx_files:
        create_footnotes_comparison_csv(variant_name)
    print("CSVs finished")
    return None