Skip to content

Commit

Permalink
Merge pull request #52 from Prabh06/master
Browse files Browse the repository at this point in the history
Extending styles parsing and RegEx search
  • Loading branch information
lukehsiao authored May 23, 2018
2 parents 06cbe7b + d435623 commit 3b20d74
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 6 deletions.
25 changes: 19 additions & 6 deletions fonduer/matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,18 @@ def init(self):
self.attrib = self.opts.get('attrib', WORDS)
self.sep = self.opts.get('sep', " ")

# Extend RegexMatch to support re.search (instead of only re.match)
# and add a toggle for full span matching.
# Defaults: search=False (use match) and full_match=True (enforce full
# span matching).
self.search = self.opts.get('search', False)
self.full_match = self.opts.get('full_match', True)

# Compile regex matcher
# NOTE: When full_match is True, enforce full span matching by ensuring
# that the regex ends with $. Group self.rgx first so that $ applies to all
# components of an 'OR' expression. (e.g., we want r'(a|b)$' instead of
# r'a|b$'). When full_match is False the pattern is compiled unchanged.
self.rgx = self.rgx if self.rgx.endswith('$') else (
self.rgx = self.rgx if self.rgx.endswith('$') or not self.full_match else (
'(' + self.rgx + ')$')
self.r = re.compile(
self.rgx, flags=(re.I if self.ignore_case else 0) | re.UNICODE)
Expand All @@ -242,12 +249,18 @@ def _f(self, c):


class RegexMatchSpan(RegexMatch):
    """Match a regex pattern against the **full concatenated span**.

    By default the compiled pattern is applied with ``match``, i.e. it must
    match starting at the beginning of the span text. If the ``search``
    option is set to True, the pattern is applied with ``search`` instead
    and may match anywhere in the span text.
    """

    def _f(self, c):
        """Return True if the compiled pattern matches the span text of ``c``.

        ``c`` must provide ``get_attrib_span(attrib, sep=...)``; presumably
        this returns the span's ``attrib`` values joined by ``sep`` -- confirm
        against the candidate class.
        """
        span_text = c.get_attrib_span(self.attrib, sep=self.sep)
        # Pick the regex entry point once instead of duplicating the call.
        apply_pattern = self.r.search if self.search else self.r.match
        return apply_pattern(span_text) is not None


class RegexMatchEach(RegexMatch):
Expand Down
26 changes: 26 additions & 0 deletions fonduer/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,32 @@ def parse_node(node, table_info=None, figure_info=None):
'='.join(x) for x in list(
context_node.attrib.items())
]

# Extending html style attribute with the styles
# from inline style class for the element.
cur_style_index = None
for index, attr in enumerate(parts['html_attrs']):
if attr.find('style') >= 0:
cur_style_index = index
break
styles = root.find('head').find('style')
if styles is not None:
for x in list(context_node.attrib.items()):
if x[0] == 'class':
exp = r'(.' + x[1] + ')([\n\s\r]*)\{(.*?)\}'
r = re.compile(exp, re.DOTALL)
if r.search(styles.text) is not None:
if cur_style_index is not None:
parts['html_attrs'][cur_style_index] += r.search(styles.text).group(3)\
.replace('\r', '').replace('\n', '').replace('\t', '')
else:
parts['html_attrs'].extend([
'style=' + re.sub(
r'\s{1,}', ' ', r.search(styles.text).group(3).
replace('\r', '').replace('\n', '').replace('\t', '').strip()
)
])
break
if self.tabular:
parent = table_info.parent
parts = table_info.apply_tabular(
Expand Down
90 changes: 90 additions & 0 deletions tests/data/html_extended/ext_diseases.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<meta charset="utf-8">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<style>
.row-header{
background: #f1f1f1;
}
.col-header{
background: #f1f1f1;
color: aquamarine;
font-size: 18px;
}
.cell{
text-align: center;
}
</style>
</head>
<body>
<h1>Types of viruses, coughs, and colds</h1>
<p>Here is<br/>a line break</p>
<p>I don't have <span>Brain Cancer</span>or the hiccups</p>
<h1><span><p>See Table 1</p> Below.</span></h1>
<h2>Common Ailments</h2>
<table>
<tbody animal="donkey">
<tr></tr>
<tr hobbies="run:fast;jump:high" letter="Q" >
<th class="col-header" type="phenotype" hobbies="work:hard;play:harder" >Disease</th>
<th class="col-header" day="Monday">Location</th>
<th class="col-header">Year</th>
</tr>
<tr>
<th class="row-header">Polio and BC546 is <span></span>55<span>O</span>C cold.</th>
<td class="cell" style="width:53pt"><p class="s6" style="padding-top: 1pt">-<span class="s5">Dublin to Milwaukee</span></p></td>
<td class="cell">2001</td>
</tr>
<tr>
<th>
<table>
<tr>
<td class="row-header"> I don't like TIPL761 or Chicken Pox or pizza. Shingles is also bad. </td>
</tr>
</table>
</th>
<td class="cell">whooping cough</td>
<td class="cell">2009</td>
</tr>
<tr>
<th class="row-header">Scurvy</th>
<td class="cell">Annapolis</td>
<td class="cell"> Junction and Storage Temperature −55 to 150 o ? C</td> <!--dash is u'/u2212'-->
</tr>
</tbody>
<caption>
Table 1: Infectious diseases and where to find them.
</caption>
</table>
<p> In between the tables there is a nasty case of heart attack </p>
<table>
<tbody>
<tr>
<th class="col-header">Problem</th>
<th class="col-header">Cause</th>
<th class="col-header">Cost</th>
</tr>
<tr>
<th class="row-header">Arthritis</th>
<td class="cell">Pokemon Go</td>
<td class="cell">Free</td>
</tr>
<tr>
<th class="row-header">Yellow<i>Fever</i></th>
<td class="cell">Unicorns</td>
<td class="cell">$17.75</td>
</tr>
<tr>
<th class="row-header">Hypochondria</th>
<td class="cell">Fear</td>
<td class="cell">$100</td>
</tr>
</tbody>
<caption>
Table 2: Three ways to get Pneumonia and how much they cost.
</caption>
</table>
<p> And here is a final sentence with warts. </p>
</body>
</html>
Binary file added tests/data/pdf_extended/ext_diseases.pdf
Binary file not shown.
63 changes: 63 additions & 0 deletions tests/parser/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,3 +246,66 @@ def test_spacy_integration(caplog):

assert session.query(Document).count() == 2
assert session.query(Phrase).count() == 81


def test_parse_style(caplog):
    """Test style tag parsing.

    Parses ``ext_diseases.html`` and checks that selected phrases carry the
    expected ``html_attrs``, including a ``style=...`` entry matching the
    rule declared for the element's class in the document's <style> tag.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    # NOTE(review): `session` is not used below; presumably Meta.init(...)
    # is required for its setup side effects -- confirm before removing.
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_extended/ext_diseases.html'
    pdf_path = 'tests/data/pdf_extended/ext_diseases.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab the document, text tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info(" Text: %s", text)

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,  # structural
        [],  # blacklist, empty so that style is not blacklisted
        ["span", "br"],  # flatten
        '',  # flatten delim
        True,  # lingual
        True,  # strip
        [],  # replace
        True,  # tabular
        True,  # visual
        pdf_path,  # pdf path
        Spacy())  # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: %s", doc)
    for phrase in phrases:
        logger.warning(" Phrase: %s", phrase.html_attrs)

    # (phrase index, expected html_attrs) pairs to verify
    expected_attrs = [
        (7, [
            'class=col-header',
            'hobbies=work:hard;play:harder',
            'type=phenotype',
            'style=background: #f1f1f1; color: aquamarine; font-size: 18px;',
        ]),
        (10, ['class=row-header', 'style=background: #f1f1f1;']),
        (12, ['class=cell', 'style=text-align: center;']),
    ]

    # Assert each phrase separately so a failure reports which phrase and
    # which attribute list differed, instead of a bare `all(...) is False`.
    for index, attrs in expected_attrs:
        assert phrases[index].html_attrs == attrs, (
            "html_attrs mismatch for phrase {}".format(index))

0 comments on commit 3b20d74

Please sign in to comment.