Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extending styles parsing and RegEx search #52

Merged
merged 11 commits into from
May 23, 2018
25 changes: 19 additions & 6 deletions fonduer/matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,18 @@ def init(self):
self.attrib = self.opts.get('attrib', WORDS)
self.sep = self.opts.get('sep', " ")

# Extending the RegexMatch to handle search(instead of only match)
# and adding a toggle for full span match.
# Default values are set to False and True for search flag and full
# span matching flag respectively.
self.search = self.opts.get('search', False)
self.full_match = self.opts.get('full_match', True)
Copy link
Contributor

@lukehsiao lukehsiao May 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to have both of these flags? It seems like these should never both be true. Only one or the other would be true at one time, if I understand correctly.

I would prefer just having self.search.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.full_match toggles whether $ is appended to the regex.

Eg:

phrase = 'Invoice#:2387621387'
r1 = re.compile(r'Invoice')
r2 = re.compile(r'(Invoice)$')

r1.search(phrase) # returns <_sre.SRE_Match object; span=(0, 7), match='Invoice'>
r2.search(phrase) # returns None

This happens because $ anchors the match to the end of the string, whereas the expression may occur anywhere in the span, not only at its end.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Then this looks good to me, thanks!


# Compile regex matcher
# NOTE: Enforce full span matching by ensuring that regex ends with $.
# Group self.rgx first so that $ applies to all components of an 'OR'
# expression. (e.g., we want r'(a|b)$' instead of r'a|b$')
self.rgx = self.rgx if self.rgx.endswith('$') else (
self.rgx = self.rgx if self.rgx.endswith('$') or not self.full_match else (

This comment was marked as resolved.

'(' + self.rgx + ')$')
self.r = re.compile(
self.rgx, flags=(re.I if self.ignore_case else 0) | re.UNICODE)
Expand All @@ -242,12 +249,18 @@ def _f(self, c):


class RegexMatchSpan(RegexMatch):
"""Matches regex pattern on **full concatenated span**"""

"""
Matches regex pattern on **full concatenated span**
If search flag is set to True:
Search regex pattern in **full concatenated span**
"""
def _f(self, c):
return True if self.r.match(
c.get_attrib_span(self.attrib,
sep=self.sep)) is not None else False
if self.search:
return True if self.r.search(
c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False
else:
return True if self.r.match(
c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False


class RegexMatchEach(RegexMatch):
Expand Down
26 changes: 26 additions & 0 deletions fonduer/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,32 @@ def parse_node(node, table_info=None, figure_info=None):
'='.join(x) for x in list(
context_node.attrib.items())
]

# Extending html style attribute with the styles
# from inline style class for the element.
cur_style_index = None
for index, attr in enumerate(parts['html_attrs']):
if attr.find('style') >= 0:
cur_style_index = index
break
styles = root.find('head').find('style')
if styles is not None:
for x in list(context_node.attrib.items()):
if x[0] == 'class':
exp = r'(.' + x[1] + ')([\n\s\r]*)\{(.*?)\}'
r = re.compile(exp, re.DOTALL)
if r.search(styles.text) is not None:
if cur_style_index is not None:
parts['html_attrs'][cur_style_index] += r.search(styles.text).group(3)\
.replace('\r', '').replace('\n', '').replace('\t', '')
else:
parts['html_attrs'].extend([
'style=' + re.sub(
r'\s{1,}', ' ', r.search(styles.text).group(3).
replace('\r', '').replace('\n', '').replace('\t', '').strip()
)
])
break
if self.tabular:
parent = table_info.parent
parts = table_info.apply_tabular(
Expand Down
90 changes: 90 additions & 0 deletions tests/data/html_extended/ext_diseases.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<meta charset="utf-8">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<style>
.row-header{
background: #f1f1f1;
}
.col-header{
background: #f1f1f1;
color: aquamarine;
font-size: 18px;
}
.cell{
text-align: center;
}
</style>
</head>
<body>
<h1>Types of viruses, coughs, and colds</h1>
<p>Here is<br/>a line break</p>
<p>I don't have <span>Brain Cancer</span>or the hiccups</p>
<h1><span><p>See Table 1</p> Below.</span></h1>
<h2>Common Ailments</h2>
<table>
<tbody animal="donkey">
<tr></tr>
<tr hobbies="run:fast;jump:high" letter="Q" >
<th class="col-header" type="phenotype" hobbies="work:hard;play:harder" >Disease</th>
<th class="col-header" day="Monday">Location</th>
<th class="col-header">Year</th>
</tr>
<tr>
<th class="row-header">Polio and BC546 is <span>−</span>55<span>O</span>C cold.</th>
<td class="cell" style="width:53pt"><p class="s6" style="padding-top: 1pt">-<span class="s5">Dublin to Milwaukee</span></p></td>
<td class="cell">2001</td>
</tr>
<tr>
<th>
<table>
<tr>
<td class="row-header"> I don't like TIPL761 or Chicken Pox or pizza. Shingles is also bad. </td>
</tr>
</table>
</th>
<td class="cell">whooping cough</td>
<td class="cell">2009</td>
</tr>
<tr>
<th class="row-header">Scurvy</th>
<td class="cell">Annapolis</td>
<td class="cell"> Junction and Storage Temperature −55 to 150 o ? C</td> <!--dash is u'/u2212'-->
</tr>
</tbody>
<caption>
Table 1: Infectious diseases and where to find them.
</caption>
</table>
<p> In between the tables there is a nasty case of heart attack </p>
<table>
<tbody>
<tr>
<th class="col-header">Problem</th>
<th class="col-header">Cause</th>
<th class="col-header">Cost</th>
</tr>
<tr>
<th class="row-header">Arthritis</th>
<td class="cell">Pokemon Go</td>
<td class="cell">Free</td>
</tr>
<tr>
<th class="row-header">Yellow<i>Fever</i></th>
<td class="cell">Unicorns</td>
<td class="cell">$17.75</td>
</tr>
<tr>
<th class="row-header">Hypochondria</th>
<td class="cell">Fear</td>
<td class="cell">$100</td>
</tr>
</tbody>
<caption>
Table 2: Three ways to get Pneumonia and how much they cost.
</caption>
</table>
<p> And here is a final sentence with warts. </p>
</body>
</html>
Binary file added tests/data/pdf_extended/ext_diseases.pdf
Binary file not shown.
63 changes: 63 additions & 0 deletions tests/parser/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,3 +246,66 @@ def test_spacy_integration(caplog):

assert session.query(Document).count() == 2
assert session.query(Phrase).count() == 81


def test_parse_style(caplog):
    """Test style tag parsing.

    Parses the ``ext_diseases`` HTML fixture with an empty blacklist and
    checks that selected phrases carry a ``style=...`` entry in their
    ``html_attrs`` alongside their ``class=...`` entry.

    :param caplog: pytest log-capture fixture; its level is set to INFO.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    # NOTE(review): ``session`` is not used below; the call is kept because
    # Meta.init(...) presumably performs the database setup -- confirm.
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_extended/ext_diseases.html'
    pdf_path = 'tests/data/pdf_extended/ext_diseases.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab the document, text tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info(" Text: {}".format(text))

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,  # structural
        [],  # blacklist, empty so that style is not blacklisted
        ["span", "br"],  # flatten
        '',  # flatten delim
        True,  # lingual
        True,  # strip
        [],  # replace
        True,  # tabular
        True,  # visual
        pdf_path,  # pdf path
        Spacy())  # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: {}".format(doc))
    for phrase in phrases:
        logger.warning(" Phrase: {}".format(phrase.html_attrs))

    # Phrases for testing: position in ``phrases`` and the exact
    # ``html_attrs`` list expected at that position.
    sub_phrases = [
        {
            'index': 7,
            'attr': [
                'class=col-header',
                'hobbies=work:hard;play:harder',
                'type=phenotype',
                'style=background: #f1f1f1; color: aquamarine; font-size: 18px;'
            ]
        },
        {
            'index': 10,
            'attr': ['class=row-header', 'style=background: #f1f1f1;']
        },
        {
            'index': 12,
            'attr': ['class=cell', 'style=text-align: center;']
        }
    ]

    # Assertions
    # Check each phrase on its own so a failure reports which index
    # mismatched and what was actually parsed, instead of a bare
    # ``assert False`` from a single all(...) expression.
    for expected in sub_phrases:
        actual_attrs = phrases[expected['index']].html_attrs
        assert actual_attrs == expected['attr'], (
            "html_attrs mismatch at phrase index {}: got {!r}, "
            "expected {!r}".format(
                expected['index'], actual_attrs, expected['attr']))