diff --git a/fonduer/matchers.py b/fonduer/matchers.py
index 6a7be511e..06801f394 100644
--- a/fonduer/matchers.py
+++ b/fonduer/matchers.py
@@ -228,11 +228,22 @@ def init(self):
         self.attrib = self.opts.get('attrib', WORDS)
         self.sep = self.opts.get('sep', " ")
 
+        # Optional matching modes, both read from opts:
+        #   search     -- when True, RegexMatchSpan uses re.search instead
+        #                 of re.match (default: False).
+        #   full_match -- when True, the pattern is anchored to the end of
+        #                 the text by appending '$' (default: True).
+        # NOTE: search=True alone still gets the '$' anchor; pass
+        # full_match=False as well to match anywhere in the text.
+        self.search = self.opts.get('search', False)
+        self.full_match = self.opts.get('full_match', True)
+
         # Compile regex matcher
-        # NOTE: Enforce full span matching by ensuring that regex ends with $.
+        # NOTE: When full_match is set, enforce full span matching by ensuring
+        # that regex ends with $.
         # Group self.rgx first so that $ applies to all components of an 'OR'
         # expression. (e.g., we want r'(a|b)$' instead of r'a|b$')
-        self.rgx = self.rgx if self.rgx.endswith('$') else (
-            '(' + self.rgx + ')$')
+        if self.full_match and not self.rgx.endswith('$'):
+            self.rgx = '(' + self.rgx + ')$'
         self.r = re.compile(
             self.rgx, flags=(re.I if self.ignore_case else 0) | re.UNICODE)
@@ -242,12 +253,17 @@ def _f(self, c):
 
 
 class RegexMatchSpan(RegexMatch):
-    """Matches regex pattern on **full concatenated span**"""
+    """Match a regex pattern against the **full concatenated span**.
+
+    By default the pattern must match from the start of the span text
+    (``re.match``). If the ``search`` option is True, the pattern may
+    start anywhere in the span text (``re.search``).
+    """
 
     def _f(self, c):
-        return True if self.r.match(
-            c.get_attrib_span(self.attrib,
-                              sep=self.sep)) is not None else False
+        span_text = c.get_attrib_span(self.attrib, sep=self.sep)
+        matcher = self.r.search if self.search else self.r.match
+        return matcher(span_text) is not None
 
 
 class RegexMatchEach(RegexMatch):
diff --git a/fonduer/parser/parser.py b/fonduer/parser/parser.py
index c61481a9b..aacbddcd1 100644
--- a/fonduer/parser/parser.py
+++ b/fonduer/parser/parser.py
@@ -266,6 +266,49 @@ def parse_node(node, table_info=None, figure_info=None):
                         '='.join(x) for x in list(
                             context_node.attrib.items())
                     ]
+
+                    # Fold CSS rules from the document's <style> sheet that
+                    # target this element's class(es) into its 'style' entry
+                    # in html_attrs. Only the first <style> element under
+                    # <head> is consulted, and only plain '.classname {...}'
+                    # rules are recognized.
+                    head = root.find('head')
+                    sheet = head.find('style') if head is not None else None
+                    css_text = sheet.text if sheet is not None else None
+                    class_value = context_node.attrib.get('class')
+                    if css_text and class_value:
+                        attrs = parts['html_attrs']
+                        # Index of an existing inline 'style=...' entry.
+                        style_index = next(
+                            (i for i, attr in enumerate(attrs)
+                             if attr.startswith('style=')), None)
+                        for class_name in class_value.split():
+                            # Escape the class name so that regex
+                            # metacharacters in it are matched literally.
+                            selector = r'\.' + re.escape(class_name)
+                            rule = re.search(
+                                selector + r'\s*\{(.*?)\}', css_text,
+                                re.DOTALL)
+                            if rule is None:
+                                continue
+                            # Drop line breaks/tabs, then collapse the
+                            # remaining whitespace runs to single spaces.
+                            css = rule.group(1)
+                            for char in '\r\n\t':
+                                css = css.replace(char, '')
+                            css = re.sub(r'\s+', ' ', css.strip())
+                            if style_index is None:
+                                style_index = len(attrs)
+                                attrs.append('style=' + css)
+                            else:
+                                # Keep the declarations separated when the
+                                # inline style lacks a trailing ';'.
+                                inline = attrs[style_index].rstrip()
+                                if not inline.endswith((';', '=')):
+                                    inline += ';'
+                                if not inline.endswith('='):
+                                    inline += ' '
+                                attrs[style_index] = inline + css
                 if self.tabular:
                     parent = table_info.parent
                     parts = table_info.apply_tabular(
diff --git a/tests/data/html_extended/ext_diseases.html b/tests/data/html_extended/ext_diseases.html
new file mode 100644
index 000000000..8600336ce
--- /dev/null
+++ b/tests/data/html_extended/ext_diseases.html
@@ -0,0 +1,90 @@
+
+
+
+
+
+ + + +Here is
a line break
I don't have Brain Canceror the hiccups
+See Table 1
Below.Disease | +Location | +Year | +|
---|---|---|---|
Polio and BC546 is −55OC cold. | +-Dublin to Milwaukee |
+ 2001 | +|
+
|
+ whooping cough | +2009 | +|
Scurvy | +Annapolis | +Junction and Storage Temperature −55 to 150 o ? C | +
In between the tables there is a nasty case of heart attack
+Problem | +Cause | +Cost | +
---|---|---|
Arthritis | +Pokemon Go | +Free | +
YellowFever | +Unicorns | +$17.75 | +
Hypochondria | +Fear | +$100 | +
And here is a final sentence with warts.
+
+
diff --git a/tests/data/pdf_extended/ext_diseases.pdf b/tests/data/pdf_extended/ext_diseases.pdf
new file mode 100644
index 000000000..6af968ab9
Binary files /dev/null and b/tests/data/pdf_extended/ext_diseases.pdf differ
diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py
index a30f1b18d..67138808f 100644
--- a/tests/parser/test_parser.py
+++ b/tests/parser/test_parser.py
@@ -246,3 +246,58 @@ def test_spacy_integration(caplog):
 
     assert session.query(Document).count() == 2
     assert session.query(Phrase).count() == 81
+
+
+def test_parse_style(caplog):
+    """Test style tag parsing: class rules should show up in html_attrs."""
+    caplog.set_level(logging.INFO)
+    logger = logging.getLogger(__name__)
+    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()
+
+    max_docs = 1
+    docs_path = 'tests/data/html_extended/ext_diseases.html'
+    pdf_path = 'tests/data/pdf_extended/ext_diseases.pdf'
+
+    # Preprocessor for the Docs
+    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)
+
+    # Grab the document, text tuple from the preprocessor
+    doc, text = next(preprocessor.generate())
+    logger.info(" Text: {}".format(text))
+
+    # Create an OmniParserUDF
+    omni_udf = OmniParserUDF(
+        True,  # structural
+        [],  # blacklist, empty so that style is not blacklisted
+        ["span", "br"],  # flatten
+        '',  # flatten delim
+        True,  # lingual
+        True,  # strip
+        [],  # replace
+        True,  # tabular
+        True,  # visual
+        pdf_path,  # pdf path
+        Spacy())  # lingual parser
+
+    # Grab the phrases parsed by the OmniParser
+    phrases = list(omni_udf.parse_structure(doc, text))
+
+    # Debug output only; logged at INFO to match the level set above
+    logger.info("Doc: {}".format(doc))
+    for phrase in phrases:
+        logger.info(" Phrase: {}".format(phrase.html_attrs))
+
+    # Expected html_attrs, keyed by phrase index
+    expected_attrs = {
+        7: [
+            'class=col-header',
+            'hobbies=work:hard;play:harder',
+            'type=phenotype',
+            'style=background: #f1f1f1; color: aquamarine; font-size: 18px;',
+        ],
+        10: ['class=row-header', 'style=background: #f1f1f1;'],
+        12: ['class=cell', 'style=text-align: center;'],
+    }
+
+    # Assert each phrase separately so that a failure names the phrase
+    for index, attrs in sorted(expected_attrs.items()):
+        assert phrases[index].html_attrs == attrs