-
Notifications
You must be signed in to change notification settings - Fork 78
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extending styles parsing and RegEx search #52
Changes from 4 commits
54ac34d
b74f343
7638aa0
37187c2
41fda5c
a6f2a8e
4531f3a
2107c62
7e2977b
446440f
d435623
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -228,11 +228,18 @@ def init(self): | |
self.attrib = self.opts.get('attrib', WORDS) | ||
self.sep = self.opts.get('sep', " ") | ||
|
||
# Extending the RegexMatch to handle search(instead of only match) | ||
# and adding a toggle for full span match. | ||
# Default values are set to False and True for search flag and full | ||
# span matching flag respectively. | ||
self.search = self.opts.get('search', False) | ||
self.full_match = self.opts.get('full_match', True) | ||
|
||
# Compile regex matcher | ||
# NOTE: Enforce full span matching by ensuring that regex ends with $. | ||
# Group self.rgx first so that $ applies to all components of an 'OR' | ||
# expression. (e.g., we want r'(a|b)$' instead of r'a|b$') | ||
self.rgx = self.rgx if self.rgx.endswith('$') else ( | ||
self.rgx = self.rgx if self.rgx.endswith('$') or not self.full_match else ( | ||
This comment was marked as resolved.
Sorry, something went wrong. |
||
'(' + self.rgx + ')$') | ||
self.r = re.compile( | ||
self.rgx, flags=(re.I if self.ignore_case else 0) | re.UNICODE) | ||
|
@@ -242,12 +249,18 @@ def _f(self, c): | |
|
||
|
||
class RegexMatchSpan(RegexMatch): | ||
"""Matches regex pattern on **full concatenated span**""" | ||
|
||
""" | ||
Matches regex pattern on **full concatenated span** | ||
If search flag is set to True: | ||
Search regex pattern in **full concatenated span** | ||
""" | ||
def _f(self, c): | ||
return True if self.r.match( | ||
c.get_attrib_span(self.attrib, | ||
sep=self.sep)) is not None else False | ||
if not self.search: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor nit, it feels a little smoother to read to use the affirmative case first if self.search:
return True if self.r.search(
c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False
else:
return True if self.r.match(
c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, done. |
||
return True if self.r.match( | ||
c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False | ||
else: | ||
return True if self.r.search( | ||
c.get_attrib_span(self.attrib, sep=self.sep)) is not None else False | ||
|
||
|
||
class RegexMatchEach(RegexMatch): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -260,6 +260,28 @@ def parse_node(node, table_info=None, figure_info=None): | |
'='.join(x) for x in list( | ||
context_node.attrib.items()) | ||
] | ||
|
||
# Extending html style attribute with the styles | ||
# from inline style class for the element. | ||
cur_style_index = None | ||
for index, attr in enumerate(parts['html_attrs']): | ||
if attr.find('style') >= 0: | ||
cur_style_index = index | ||
break | ||
styles = root.find('head').find('style') | ||
if styles is not None: | ||
for x in list(context_node.attrib.items()): | ||
if x[0] == 'class': | ||
exp = r'(.' + x[1] + ')([\n\s\r]*)\{(.*?)\}' | ||
r = re.compile(exp, re.DOTALL) | ||
if r.search(styles.text) is not None: | ||
if cur_style_index is not None: | ||
parts['html_attrs'][cur_style_index] += r.search(styles.text).group(3)\ | ||
.replace('\r', '').replace('\n', '').replace('\t', '') | ||
else: | ||
parts['html_attrs'] = 'style=' + r.search(styles.text).group(3)\ | ||
.replace('\r', '').replace('\n', '').replace('\t', '') | ||
break | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not super obvious to me what the use case for this code is. It looks to me like if an element has style attributes, then your extending the element's styles with the first style class in the head of the document that you find. Is that correct? Could you explain why you needed this code? Also, could you add a test case for this code? This would be added to test_parser.py. For example, although The easiest way to do this would be to add a new, simplified HTML document into def test_parse_style(caplog):
"""Test style tag parsing."""
caplog.set_level(logging.INFO)
logger = logging.getLogger(__name__)
session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()
max_docs = 1
docs_path = 'tests/data/html/[your new document].html'
# Preprocessor for the Docs
preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)
# Grab the document, text tuple from the preprocessor
doc, text = next(preprocessor.generate())
logger.info(" Text: {}".format(text))
# Create an OmniParserUDF
omni_udf = OmniParserUDF(
True, # structural
[], # blacklist, empty so that style is not blacklisted
["span", "br"], # flatten
'', # flatten delim
True, # lingual
True, # strip
[], # replace
True, # tabular
True, # visual
pdf_path, # pdf path
Spacy()) # lingual parser
# Grab the phrases parsed by the OmniParser
phrases = list(omni_udf.parse_structure(doc, text))
# Add your assertions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If an html element There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will add test case and check in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, makes sense. Thanks! |
||
if self.tabular: | ||
parent = table_info.parent | ||
parts = table_info.apply_tabular( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it necessary to have both of these flags? It seems like these should never both be true. Only one or the other would be true at one time, if I understand correctly.
I would prefer just having
self.search
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
self.full_match
is to toggle appending$
to regexEg:
This is happening because $ matches the end of the string but the expression can be part of the span not at the end.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. Then this looks good to me, thanks!