-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcombine_hocr.py
88 lines (72 loc) · 3.55 KB
/
combine_hocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import re
import argparse
from bs4 import BeautifulSoup as Soup
# Default batch directory to process; the command-line entry point replaces it
# with the value of --chronamdir when that option is supplied.
BATCH_LOCATION = "E:\\backups\\chronam\\dlc_flavory_ver01"
def addXBboxValue(offset, item):
    """Shift the x coordinates of an element's ``bbox`` title property.

    The ``title`` attribute is treated as a ``;``-separated list of
    properties.  For the property whose name is exactly ``bbox`` the second
    and fourth tokens (x0 and x1) are increased by *offset*; every other
    property is kept as-is.  The attribute is rewritten in place with the
    properties joined by ``"; "``.

    Args:
        offset: Integer amount added to both x coordinates.
        item: Object with mapping-style attribute access (``item.get('title')``
            and ``item['title'] = ...``), e.g. a BeautifulSoup tag.

    Returns:
        None.  *item* is modified in place; an item without a ``title`` is
        left untouched.

    Raises:
        IndexError, ValueError: If the ``bbox`` property does not carry at
            least three integer-parsable coordinates in the expected slots.
    """
    title = item.get('title')
    if title is None:
        # Nothing to shift on elements that carry no title attribute.
        return

    titleValues = []
    for titleItem in title.split(";"):
        prop = titleItem.strip()
        if not prop:
            # Skip empty segments (e.g. produced by a trailing ';').
            continue
        values = prop.split()
        # Compare the property *name*, not a substring: a plain
        # '"bbox" in ...' test would also hit properties such as
        # 'x_bboxes' or an image path that merely contains "bbox".
        if values[0] == "bbox":
            values[1] = str(int(values[1]) + offset)
            values[3] = str(int(values[3]) + offset)
            titleValues.append(" ".join(values))
        else:
            # Stripped so that re-joining with "; " does not accumulate
            # extra spaces each time the title is processed.
            titleValues.append(prop)
    item['title'] = "; ".join(titleValues)
def fixPageTitle(soup, xendingValue):
    """Rewrite the title of the first ``<div>`` in the body for a combined page.

    The ``title`` attribute of ``soup.html.body.div`` is treated as a
    ``;``-separated list of properties and rewritten in place:

    * ``bbox``: x0 is set to ``0`` and x1 to *xendingValue*; the y values
      are kept.
    * ``image``: in the file-name part of the path, everything from the
      first underscore onwards is replaced by ``.tiff``.  Both ``\\`` and
      ``/`` are accepted as path separators, and a closing quote around
      the path is preserved.
    * anything else is kept unchanged.

    Args:
        soup: Parsed document exposing ``soup.html.body.div`` with
            mapping-style attribute access.
        xendingValue: New right-hand x coordinate of the page bbox.

    Returns:
        None.  The document is modified in place; a page element without a
        ``title`` is left untouched.

    Raises:
        IndexError: If the ``bbox`` property has fewer than four tokens.
    """
    page = soup.html.body.div
    title = page.get('title')
    if title is None:
        return

    titleValues = []
    for titleItem in title.split(";"):
        prop = titleItem.strip()
        if not prop:
            # Skip empty segments (e.g. produced by a trailing ';').
            continue
        # Dispatch on the property name itself; substring checks would
        # misfire on e.g. an image path that contains "bbox".
        key = prop.split(None, 1)[0]
        if key == "bbox":
            values = prop.split()
            values[1] = "0"
            values[3] = str(xendingValue)
            titleValues.append(" ".join(values))
        elif key == "image":
            # Isolate the file name after the last separator of either kind
            # so underscores in directory names are never touched.
            cut = max(prop.rfind("\\"), prop.rfind("/")) + 1
            directory, filename = prop[:cut], prop[cut:]
            renamed = re.sub(r"_.*", ".tiff", filename)
            # The substitution swallows a trailing quote together with the
            # suffix; put it back so a quoted path stays balanced.
            if renamed != filename and filename[-1:] in ('"', "'"):
                renamed += filename[-1]
            titleValues.append(directory + renamed)
        else:
            titleValues.append(prop)
    page['title'] = "; ".join(titleValues)
def combineHocrFiles(batch_dir):
    """Merge per-column hOCR files into one hOCR file per page.

    Walks *batch_dir* recursively.  In every directory, ``.hocr`` files are
    grouped by the first four characters of their name.  For each group the
    ``<div>`` elements of every column file are shifted right by that
    column's x offset and appended to a combined document, whose page title
    is then fixed up and written to ``<prefix>.hocr`` in the same directory.

    A per-column file name is split on ``_``: the second field carries the
    column's x offset after a 6-character prefix, and the third field
    carries the column's x end after a 4-character prefix and before the
    5-character ``.hocr`` extension.  Files that do not parse this way (for
    example a combined file written by an earlier run) are ignored.
    # presumably names look like ``0001_xstart120_xend980.hocr`` - confirm.

    Args:
        batch_dir: Root directory to scan.

    Returns:
        None.  Progress is printed and combined files are written to disk.

    NOTE(review): only the ``<div>`` elements themselves get their bbox
    shifted; bbox values on nested non-div elements are not adjusted here.
    Confirm that this is intended.
    """
    for root, _dirs, files in os.walk(batch_dir):
        # page prefix -> list of (x offset, x end, full path)
        columnsByPage = {}
        for name in files:
            if not name.endswith(".hocr"):
                continue
            parts = name.split("_")
            try:
                offset = int(parts[1][6:])
                xEnd = int(parts[2][4:-5])
            except (IndexError, ValueError):
                # Not a per-column file; skipping it keeps reruns from
                # choking on previously written combined output.
                continue
            columnsByPage.setdefault(name[:4], []).append(
                (offset, xEnd, os.path.join(root, name))
            )

        # Sorted so pages, and the columns within a page, are processed in
        # a deterministic left-to-right order regardless of directory order.
        for fileNumber in sorted(columnsByPage):
            columns = sorted(columnsByPage[fileNumber])
            hocrFilenames = [path for _offset, _xEnd, path in columns]
            print("DEBUG: processing files", hocrFilenames)

            # The first column serves as the template for the combined page.
            with open(hocrFilenames[0], 'r', encoding='utf8') as f:
                combinedSoup = Soup(f.read(), "xml")
            # Remove all the findings so they can be re-added, shifted,
            # while iterating over each column file below.
            for child in combinedSoup.html.body.div('div'):
                child.decompose()

            maxOffset = 0
            for offset, xEnd, hocrFilename in columns:
                maxOffset = max(maxOffset, xEnd)
                with open(hocrFilename, 'r', encoding='utf8') as f:
                    soup = Soup(f.read(), "xml")
                # Skip the page div itself: take every div below it, shift
                # its x values, then move it into the combined document.
                for item in soup.html.body.div.find_all('div'):
                    addXBboxValue(offset, item)
                    combinedSoup.html.body.div.append(item)

            fixPageTitle(combinedSoup, maxOffset)

            # Write out the updated DOM next to the column files.
            combinedPath = os.path.join(root, fileNumber + ".hocr")
            print("check out", combinedPath)
            with open(combinedPath, 'w', encoding='utf8') as f:
                f.write(combinedSoup.prettify())
            # TODO: decide whether the per-column hocr files should be deleted.
if __name__ == "__main__":
    # Command-line entry point: optionally override the default batch
    # directory, then combine every page found beneath it.
    cli = argparse.ArgumentParser(
        description='combine the hocr files for each column into 1 combined hocr file with updated x coordinates.',
    )
    cli.add_argument(
        '--chronamdir',
        help='The directory of the chronicling america batch',
    )
    requestedDir = cli.parse_args().chronamdir
    # A missing or empty option keeps the built-in default location.
    BATCH_LOCATION = requestedDir or BATCH_LOCATION
    combineHocrFiles(BATCH_LOCATION)
    print('finished')