HazyResearch · lukehsiao · May 23, 2018 · May 14, 2018 · May 14, 2018 · May 15, 2018
diff --git a/fonduer/matchers.py b/fonduer/matchers.py
@@ -228,11 +228,18 @@ def init(self):
         self.attrib = self.opts.get('attrib', WORDS)
         self.sep = self.opts.get('sep', " ")
 
+        # Extend RegexMatch to support re.search (instead of only re.match)
+        # and add a toggle for full-span matching.
+        # Defaults: search=False (use re.match) and full_match=True (append
+        # a trailing '$' to the pattern unless it already ends with one).
+        self.search = self.opts.get('search', False)
+        self.full_match = self.opts.get('full_match', True)
+
         # Compile regex matcher
         # NOTE: Enforce full span matching by ensuring that regex ends with $.
         # Group self.rgx first so that $ applies to all components of an 'OR'
         # expression. (e.g., we want r'(a|b)$' instead of r'a|b$')
-        self.rgx = self.rgx if self.rgx.endswith('$') else (
+        self.rgx = self.rgx if self.rgx.endswith('$') or not self.full_match else (
             '(' + self.rgx + ')$')
         self.r = re.compile(
             self.rgx, flags=(re.I if self.ignore_case else 0) | re.UNICODE)
@@ -242,12 +249,18 @@ def _f(self, c):
 
 
 class RegexMatchSpan(RegexMatch):
-    """Matches regex pattern on **full concatenated span**"""
-
+    """Match a regex pattern against the **full concatenated span**.
+
+    When the ``search`` option is True the compiled pattern is applied
+    with ``search`` instead of ``match`` on the span text.
+    """
     def _f(self, c):
-        return True if self.r.match(
-            c.get_attrib_span(self.attrib,
-                              sep=self.sep)) is not None else False
+        # Fetch the span text for the configured attribute once.
+        span_text = c.get_attrib_span(self.attrib, sep=self.sep)
+        # ``search`` scans the whole text; ``match`` anchors at its start.
+        if self.search:
+            return self.r.search(span_text) is not None
+        return self.r.match(span_text) is not None
 
 
 class RegexMatchEach(RegexMatch):

diff --git a/fonduer/parser/parser.py b/fonduer/parser/parser.py
@@ -260,6 +260,28 @@ def parse_node(node, table_info=None, figure_info=None):
                                         '='.join(x) for x in list(
                                             context_node.attrib.items())
                                     ]
+
+                                    # Fold the CSS rule declared for this element's class in the
+                                    # document's <head><style> sheet into its style attribute.
+                                    style_idx = next(
+                                        (i for i, attr in enumerate(parts['html_attrs'])
+                                         if attr.startswith('style=')), None)
+                                    head = root.find('head')
+                                    sheet = head.find('style') if head is not None else None
+                                    css_class = context_node.attrib.get('class')
+                                    if sheet is not None and sheet.text and css_class:
+                                        # Match '.<class> { ... }' with the dot and class name
+                                        # taken literally; DOTALL lets the body span lines.
+                                        rule = re.search(
+                                            r'\.' + re.escape(css_class) + r'\s*\{(.*?)\}',
+                                            sheet.text, re.DOTALL)
+                                        if rule is not None:
+                                            css = re.sub(r'[\r\n\t]', '', rule.group(1))
+                                            if style_idx is not None:
+                                                parts['html_attrs'][style_idx] += css
+                                            else:
+                                                # html_attrs is a list: append, don't overwrite it.
+                                                parts['html_attrs'].append('style=' + css)
                                 if self.tabular:
                                     parent = table_info.parent
                                     parts = table_info.apply_tabular(