Skip to content

Commit

Permalink
Merge pull request #53 from HazyResearch/simpletokenizer-patch
Browse files Browse the repository at this point in the history
fix simpletokenizer
  • Loading branch information
senwu authored May 17, 2018
2 parents 12101d7 + d20bc30 commit 06cbe7b
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
10 changes: 8 additions & 2 deletions fonduer/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,20 @@ def parse(self, document, contents):
if not len(text.strip()):
continue
words = text.split()
char_offsets = [0] + list(np.cumsum([len(x) + 1
for x in words]))[:-1]
char_offsets = [0] + [int(_) for _ in np.cumsum([len(x) + 1
for x in words])[:-1]]
text = ' '.join(words)
stable_id = construct_stable_id(document, 'phrase', i, i)
yield {
'text': text,
'words': words,
'pos_tags': [''] * len(words),
'ner_tags': [''] * len(words),
'lemmas': [''] * len(words),
'dep_parents': [0] * len(words),
'dep_labels': [''] * len(words),
'char_offsets': char_offsets,
'abs_char_offsets': char_offsets,
'stable_id': stable_id
}
i += 1
Expand Down
48 changes: 45 additions & 3 deletions tests/parser/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,54 @@ def test_parse_structure(caplog):
assert len(phrases) == 45


def test_simple_tokenizer(caplog):
    """Run OmniParser with lingual parsing disabled and check one document.

    Parses the sample HTML/PDF documents with ``lingual=False``, then inspects
    the first phrase of the second document (ordered by name): its structural
    attributes must be populated, while every lingual attribute must hold
    placeholder values (empty strings, or 0 for dependency parents).
    """
    caplog.set_level(logging.INFO)
    log = logging.getLogger(__name__)
    db_url = 'postgres://localhost:5432/' + ATTRIBUTE
    session = Meta.init(db_url).Session()

    n_procs = 2
    doc_limit = 2
    html_dir = 'tests/data/html_simple/'
    pdf_dir = 'tests/data/pdf_simple/'

    # Feed the HTML documents through the parser with lingual features off.
    html_docs = HTMLPreprocessor(html_dir, max_docs=doc_limit)
    parser = OmniParser(
        structural=True,
        lingual=False,
        visual=True,
        pdf_path=pdf_dir,
    )
    parser.apply(html_docs, parallelism=n_procs)

    # Pick the second document when sorted by name.
    ordered_docs = session.query(Document).order_by(Document.name).all()
    doc = ordered_docs[1]

    log.info("Doc: {}".format(doc))
    for idx, phrase in enumerate(doc.phrases):
        log.info(" Phrase[{}]: {}".format(idx, phrase.text))

    header = doc.phrases[0]

    # Structural attributes come from the HTML and must still be populated.
    assert header.xpath == '/html/body/h1'
    assert header.html_tag == 'h1'
    assert header.html_attrs == ['id=sample-markdown']

    # Lingual attributes: one placeholder per word of the two-word header.
    blanks = ['', '']
    assert header.ner_tags == blanks
    assert header.dep_labels == blanks
    assert header.dep_parents == [0, 0]
    assert header.lemmas == blanks
    assert header.pos_tags == blanks

    assert len(doc.phrases) == 44


def test_parse_document_md(caplog):
"""Unit test of OmniParser on a single document.
This tests both the structural and visual parse of the document. This
also serves as a test of single-threaded parsing.
"""
caplog.set_level(logging.INFO)
logger = logging.getLogger(__name__)
session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

Expand All @@ -99,8 +141,8 @@ def test_parse_document_md(caplog):
doc = session.query(Document).order_by(Document.name).all()[1]

logger.info("Doc: {}".format(doc))
for phrase in doc.phrases:
logger.info(" Phrase: {}".format(phrase.text))
for i, phrase in enumerate(doc.phrases):
logger.info(" Phrase[{}]: {}".format(i, phrase.text))

header = doc.phrases[0]
# Test structural attributes
Expand All @@ -119,7 +161,7 @@ def test_parse_document_md(caplog):
assert header.ner_tags == ['O', 'O']
assert header.dep_labels == ['compound', 'ROOT']

# 44 phrases expected in the "md" document.
# 45 phrases expected in the "md" document.
assert len(doc.phrases) == 45


Expand Down

0 comments on commit 06cbe7b

Please sign in to comment.