-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocument_class.py
37 lines (36 loc) · 1.27 KB
/
document_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
class Document:
    """Container for the metadata and text forms of one corpus document.

    Every attribute except ``filename`` starts out empty (``""`` or ``[]``);
    this class performs no parsing itself. The ``<X *>`` markers in the
    comments below name the corpus header tag each field corresponds to.
    NOTE(review): the fields are presumably populated by the preprocessing
    code (the comments reference ``preprocess.clean_text()``) -- confirm
    against the caller.
    """

    def __init__(self, filename) -> None:
        """Create an empty document record tied to ``filename``.

        Args:
            filename: stored unchanged on ``self.filename``; it is not
                opened or validated here.
        """
        ########################################
        # metadata
        ########################################
        # name of the document: <N *>
        self.name = ""
        # date of the original: <O *>
        self.o_date = ""
        # date of the manuscript: <M *>
        self.m_date = ""
        # dialect: <D *>
        self.dialect = ""
        # section of the corpus: <C *>
        self.corpus_section = ""
        # text style: <V *>
        self.style = ""
        # relationship to translated work
        self.translate_relation = ""
        # original language of foreign work
        self.original_language = ""
        # filename
        self.filename = filename

        ########################################
        # text
        ########################################
        # the text as in the corpus
        self.raw_text = ""
        # cleaned text; see preprocess.clean_text()
        self.cleaned_text = ""
        # cleaned text with no foreign languages; see preprocess.clean_text()
        self.cleaned_text_no_foreign = ""
        # the document as characters (fresh list per instance)
        self.chars = []
        # chars with no foreign languages (fresh list per instance)
        self.chars_no_foreign = []