-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathepub_reader.py
126 lines (108 loc) · 4.41 KB
/
epub_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import ebooklib
from bs4 import BeautifulSoup as bs
from ebooklib import epub
from markdownify import markdownify as md
def chap2text(chap):
    """Return the text content of one chapter's HTML as a single string.

    Args:
        chap: Chapter markup, passed unchanged to BeautifulSoup with the
            ``html.parser`` backend.

    Returns:
        The text nodes of the document, in document order, each followed by
        one space. Nodes whose direct parent tag is in the skip list below
        are left out.
    """
    # Text sitting directly inside these tags is not chapter content.
    # There may be more elements you don't want, such as "style", etc.
    blacklist = frozenset([
        '[document]', 'noscript', 'header', 'html',
        'meta', 'head', 'input', 'script',
    ])
    soup = bs(chap, 'html.parser')
    # ``string=True`` is the current spelling of the older ``text=True``
    # filter: it selects every text node in the tree.
    # A single join avoids rebuilding the result string on every node.
    return ''.join(
        '{} '.format(node)
        for node in soup.find_all(string=True)
        if node.parent.name not in blacklist
    )
def thtml2ttext(thtml):
    """Convert an iterable of chapter HTML documents to a list of texts.

    Each element of ``thtml`` is run through ``chap2text``; the results are
    returned as a new list in the same order.
    """
    return [chap2text(markup) for markup in thtml]
def epub2text(epub_path):
    """Read the EPUB at ``epub_path`` and return its chapters as text.

    The document items are loaded as HTML by ``epub2thtml`` and then
    converted one by one by ``thtml2ttext``; the result is a list with one
    string per document item.
    """
    return thtml2ttext(epub2thtml(epub_path))
def epub2thtml(epub_path):
    """Return the content of every document item in the EPUB at ``epub_path``.

    The book is opened with ``epub.read_epub`` and its items are visited in
    the order ``get_items()`` yields them; only items whose type is
    ``ebooklib.ITEM_DOCUMENT`` contribute their ``get_content()`` result.
    """
    parsed = epub.read_epub(epub_path)
    return [
        entry.get_content()
        for entry in parsed.get_items()
        if entry.get_type() == ebooklib.ITEM_DOCUMENT
    ]
class EpubReader:
    """Read text from an EPUB file, one document item at a time.

    With a non-empty ``epub_path`` the book is opened through
    ``epub.read_epub`` and the ids of its document items are queued, ordered
    by their position in the book's spine. Each call to
    ``get_next_item_text`` removes the next id from that queue and returns
    the item's text.

    With the default empty path no file is opened: ``book`` is ``None``, the
    queues are empty, and the getters return their "nothing" values.
    """

    # Text directly inside these tags is not book content.
    _SKIPPED_PARENTS = frozenset([
        '[document]', 'noscript', 'header', 'html',
        'meta', 'head', 'input', 'script', 'style',
    ])

    # (needle, replacement) pairs applied in order to every text node.
    # The 'Cover @page ...' needle ends with the bare 'page ...' needle, so
    # it has to be applied first; otherwise the shorter needle consumes its
    # tail and a stray 'Cover @' is left in the output.
    _TEXT_REPLACEMENTS = (
        ('body {padding:0;} img {height: 100%; max-width: 100%;} div {text-align: center; page-break-after: always;}',
         '\n'),
        ('Cover of ', ''),
        ('Cover body { margin: 0; padding:0 } div.cover { text-align: center; text-indent: 0px; margin:0; vertical-align:middle; } img { max-width:100%; height:100%; border: 0; }',
         ''),
        ('Cover @page {padding: 0pt; margin:0pt} body { text-align: center; padding:0pt; margin: 0pt; }',
         ''),
        ('page {padding: 0pt; margin:0pt} body { text-align: center; padding:0pt; margin: 0pt; }',
         ''),
        ('Annotation', '\n'),
    )

    def __init__(self, epub_path=''):
        """Open ``epub_path`` if given, otherwise create an empty reader.

        Args:
            epub_path: Path handed to ``epub.read_epub``. An empty string
                (the default) skips opening a file.
        """
        self.epub_path = epub_path
        if epub_path != '':
            self.book = epub.read_epub(epub_path)
            self.spine_ids = self._get_spine_ids()
            self.item_ids = self._get_item_ids()
            self.images = self._get_item_images()
            # sort list of docs ids in order they follow in spine_ids
            self.item_ids.sort(key=self._sort_by_spine)
        else:
            self.book = None
            # Keep the attributes defined so that the public methods work
            # (and report "no more items") on a reader without a book.
            self.spine_ids = []
            self.item_ids = []
            self.images = []

    def _get_item_ids(self):
        """Return the ids of all items of type ``ITEM_DOCUMENT`` in the book."""
        return [
            elem.id
            for elem in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        ]

    def _get_spine_ids(self):
        """Return the first element of every entry in the book's spine."""
        return [sp[0] for sp in self.book.spine]

    def get_booktitle(self):
        """Return the book's title, or ``''`` when no book is loaded."""
        if self.book is None:
            return ''
        return self.book.title

    def get_toc(self):
        """Return the book's table of contents, or ``''`` when no book is loaded."""
        if self.book is None:
            return ''
        return self.book.toc

    def _sort_by_spine(self, item):
        """Sort key: position of ``item`` in ``spine_ids``, or 0 if absent.

        Used to order document ids the way the spine lists them, since epub
        readers read the book in spine order.
        """
        try:
            return self.spine_ids.index(item)
        except ValueError:
            return 0

    @classmethod
    def _clean_text(cls, text):
        """Apply every ``_TEXT_REPLACEMENTS`` pair to ``text``, in order."""
        for needle, replacement in cls._TEXT_REPLACEMENTS:
            text = text.replace(needle, replacement)
        return text

    def get_next_item_text(self):
        """Return the text of the next queued document item.

        The first id is removed from ``item_ids``, so repeated calls walk
        through the book once.

        Returns:
            ``None`` when no ids are left. Otherwise a string built from the
            item's text nodes in document order, skipping nodes whose parent
            tag is in ``_SKIPPED_PARENTS``; each node is cleaned with
            ``_clean_text`` and followed by a space and a newline.
        """
        if not self.item_ids:
            return None
        item_id = self.item_ids.pop(0)
        item_doc = self.book.get_item_with_id(item_id)
        soup = bs(item_doc.content.decode('utf-8'), 'html.parser')
        # ``string=True`` is the current spelling of the older ``text=True``
        # filter: it selects every text node in the tree.
        return ''.join(
            '{} \n'.format(self._clean_text(node))
            for node in soup.find_all(string=True)
            if node.parent.name not in self._SKIPPED_PARENTS
        )

    def _get_item_images(self):
        """Return the list of image ids; currently always empty.

        Image extraction is not implemented. An earlier draft iterated the
        book's ``ebooklib.ITEM_IMAGE`` items, collected their ids and wrote
        each item's content to ``./files/<book uid>/<file_name>``.
        """
        return []