Merge pull request #52 from Prabh06/master

Extending styles parsing and RegEx search
HazyResearch · May 23, 2018 · 3b20d74 · 3b20d74
2 parents 06cbe7b + d435623
commit 3b20d74
Show file tree

Hide file tree

Showing 5 changed files with 198 additions and 6 deletions.
diff --git a/fonduer/matchers.py b/fonduer/matchers.py
@@ -228,11 +228,18 @@ def init(self):
         self.attrib = self.opts.get('attrib', WORDS)
         self.sep = self.opts.get('sep', " ")
 
+        # Extend RegexMatch to support re.search (in addition to re.match)
+        # and add a toggle for full-span matching.
+        # Defaults: search=False (use match) and full_match=True (anchor the
+        # pattern with a trailing $ unless it already ends with one).
+        self.search = self.opts.get('search', False)
+        self.full_match = self.opts.get('full_match', True)
+
         # Compile regex matcher
         # NOTE: Enforce full span matching by ensuring that regex ends with $.
         # Group self.rgx first so that $ applies to all components of an 'OR'
         # expression. (e.g., we want r'(a|b)$' instead of r'a|b$')
-        self.rgx = self.rgx if self.rgx.endswith('$') else (
+        self.rgx = self.rgx if self.rgx.endswith('$') or not self.full_match else (
             '(' + self.rgx + ')$')
         self.r = re.compile(
             self.rgx, flags=(re.I if self.ignore_case else 0) | re.UNICODE)
@@ -242,12 +249,18 @@ def _f(self, c):
 
 
 class RegexMatchSpan(RegexMatch):
-    """Matches regex pattern on **full concatenated span**"""
-
+    """
+        Matches regex pattern on **full concatenated span** (re.match).
+        If the search flag is set to True:
+            Searches for the pattern within the **full concatenated span** (re.search).
+    """
     def _f(self, c):
-        return True if self.r.match(
-            c.get_attrib_span(self.attrib,
-                              sep=self.sep)) is not None else False
+        if self.search:
+            return True if self.r.search(
+                c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False
+        else:
+            return True if self.r.match(
+                c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False
 
 
 class RegexMatchEach(RegexMatch):

diff --git a/fonduer/parser/parser.py b/fonduer/parser/parser.py
@@ -266,6 +266,32 @@ def parse_node(node, table_info=None, figure_info=None):
                                         '='.join(x) for x in list(
                                             context_node.attrib.items())
                                     ]
+
+                                    # Extend the element's html style attribute with the rules
+                                    # declared for its class in the <head> <style> block.
+                                    cur_style_index = None
+                                    for index, attr in enumerate(parts['html_attrs']):
+                                        if attr.find('style') >= 0:
+                                            cur_style_index = index
+                                            break
+                                    styles = root.find('head').find('style')
+                                    if styles is not None:
+                                        for x in list(context_node.attrib.items()):
+                                            if x[0] == 'class':
+                                                exp = r'(.' + x[1] + ')([\n\s\r]*)\{(.*?)\}'
+                                                r = re.compile(exp, re.DOTALL)
+                                                if r.search(styles.text) is not None:
+                                                    if cur_style_index is not None:
+                                                        parts['html_attrs'][cur_style_index] += r.search(styles.text).group(3)\
+                                                            .replace('\r', '').replace('\n', '').replace('\t', '')
+                                                    else:
+                                                        parts['html_attrs'].extend([
+                                                            'style=' + re.sub(
+                                                                r'\s{1,}', ' ', r.search(styles.text).group(3).
+                                                                replace('\r', '').replace('\n', '').replace('\t', '').strip()
+                                                            )
+                                                        ])
+                                                break
                                 if self.tabular:
                                     parent = table_info.parent
                                     parts = table_info.apply_tabular(

diff --git a/tests/data/html_extended/ext_diseases.html b/tests/data/html_extended/ext_diseases.html
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<meta charset="utf-8">
+<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+  <style>
+    .row-header{
+      background: #f1f1f1;
+    }
+    .col-header{
+      background: #f1f1f1;
+      color: aquamarine;
+      font-size: 18px;
+    }
+    .cell{
+      text-align: center;
+    }
+  </style>
+ </head>
+ <body>
+    <h1>Types of viruses, coughs, and colds</h1>
+    <p>Here is<br/>a line break</p>
+    <p>I don't have <span>Brain Cancer</span>or the hiccups</p>
+    <h1><span><p>See Table 1</p> Below.</span></h1>
+    <h2>Common Ailments</h2>
+  <table>
+   <tbody animal="donkey">
+    <tr></tr>
+    <tr hobbies="run:fast;jump:high" letter="Q" >
+     <th class="col-header" type="phenotype" hobbies="work:hard;play:harder" >Disease</th>
+     <th class="col-header" day="Monday">Location</th>
+     <th class="col-header">Year</th>
+    </tr>
+    <tr>
+     <th class="row-header">Polio and BC546 is <span>−</span>55<span>O</span>C cold.</th>
+     <td class="cell" style="width:53pt"><p class="s6" style="padding-top: 1pt">-<span class="s5">Dublin to Milwaukee</span></p></td>
+     <td class="cell">2001</td>
+    </tr>
+    <tr>
+     <th>
+       <table>
+        <tr>
+          <td  class="row-header"> I don't like TIPL761 or Chicken Pox or pizza. Shingles is also bad. </td>
+        </tr> 
+       </table>
+    </th>
+     <td class="cell">whooping cough</td>
+     <td class="cell">2009</td>
+    </tr>
+    <tr>
+     <th class="row-header">Scurvy</th>
+     <td class="cell">Annapolis</td>
+     <td class="cell"> Junction and Storage Temperature −55 to 150 o ? C</td> <!--dash is u'\u2212'-->
+    </tr>
+   </tbody>
+   <caption>
+    Table 1: Infectious diseases and where to find them.
+   </caption>
+  </table>
+  <p> In between the tables there is a nasty case of heart attack </p>
+  <table>
+   <tbody>
+    <tr>
+     <th class="col-header">Problem</th>
+     <th class="col-header">Cause</th>
+     <th class="col-header">Cost</th>
+    </tr>
+    <tr>
+     <th class="row-header">Arthritis</th>
+     <td class="cell">Pokemon Go</td>
+     <td class="cell">Free</td>
+    </tr>
+    <tr>
+     <th class="row-header">Yellow<i>Fever</i></th>
+     <td class="cell">Unicorns</td>
+     <td class="cell">$17.75</td>
+    </tr>
+    <tr>
+     <th class="row-header">Hypochondria</th>
+     <td class="cell">Fear</td>
+     <td class="cell">$100</td>
+    </tr>
+   </tbody>
+   <caption>
+    Table 2: Three ways to get Pneumonia and how much they cost.
+   </caption>
+  </table>
+  <p> And here is a final sentence with warts. </p>
+ </body>
+</html>
diff --git a/tests/data/pdf_extended/ext_diseases.pdf b/tests/data/pdf_extended/ext_diseases.pdf
diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py
@@ -246,3 +246,66 @@ def test_spacy_integration(caplog):
 
     assert session.query(Document).count() == 2
     assert session.query(Phrase).count() == 81
+
+
+def test_parse_style(caplog):
+    """Test style tag parsing."""
+    caplog.set_level(logging.INFO)
+    logger = logging.getLogger(__name__)
+    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()
+
+    max_docs = 1
+    docs_path = 'tests/data/html_extended/ext_diseases.html'
+    pdf_path = 'tests/data/pdf_extended/ext_diseases.pdf'
+
+    # Preprocessor for the Docs
+    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)
+
+    # Grab the document, text tuple from the preprocessor
+    doc, text = next(preprocessor.generate())
+    logger.info("    Text: {}".format(text))
+
+    # Create an OmniParserUDF
+    omni_udf = OmniParserUDF(
+        True,           # structural
+        [],             # blacklist, empty so that style is not blacklisted
+        ["span", "br"],  # flatten
+        '',             # flatten delim
+        True,           # lingual
+        True,           # strip
+        [],             # replace
+        True,           # tabular
+        True,           # visual
+        pdf_path,       # pdf path
+        Spacy())        # lingual parser
+
+    # Grab the phrases parsed by the OmniParser
+    phrases = list(omni_udf.parse_structure(doc, text))
+
+    logger.warning("Doc: {}".format(doc))
+    for phrase in phrases:
+        logger.warning("    Phrase: {}".format(phrase.html_attrs))
+
+    # Phrases for testing
+    sub_phrases = [
+        {
+            'index': 7,
+            'attr': [
+                'class=col-header',
+                'hobbies=work:hard;play:harder',
+                'type=phenotype',
+                'style=background: #f1f1f1; color: aquamarine; font-size: 18px;'
+            ]
+        },
+        {
+            'index': 10,
+            'attr': ['class=row-header', 'style=background: #f1f1f1;']
+        },
+        {
+            'index': 12,
+            'attr': ['class=cell', 'style=text-align: center;']
+        }
+    ]
+
+    # Assertions
+    assert(all(phrases[p['index']].html_attrs == p['attr'] for p in sub_phrases))