Skip to content

Commit

Permalink
Fix issue with emphasis and whitespace
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-s committed Aug 11, 2017
1 parent 54976f1 commit b2765e2
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 5 deletions.
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ The AUTHORS/Contributors are (and/or have been):
* Andres Rey
* Ciprian Miclaus
* Toshihiro Kamiya <[email protected]>
* Jonathan Sundqvist <[email protected]>

Maintainer:

Expand Down
51 changes: 46 additions & 5 deletions html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl
self.stressed = False
self.preceding_stressed = False
self.preceding_data = None
self.current_tag = None

try:
del unifiable_n[name2cp('nbsp')]
Expand Down Expand Up @@ -276,6 +280,7 @@ def handle_emphasis(self, start, tag_style, parent_style):
self.quiet -= 1

def handle_tag(self, tag, attrs, start):
self.current_tag = tag
# attrs is None for endtags
if attrs is None:
attrs = {}
Expand Down Expand Up @@ -368,15 +373,37 @@ def handle_tag(self, tag, attrs, start):
self.blockquote -= 1
self.p()

def no_preceding_space(self):
    """Report whether the last recorded text chunk ends in a non-space.

    Reads ``self.preceding_data`` and inspects only its final character.
    Callers use the result to decide whether an opening emphasis, strong
    or strike marker needs a separating space in front of it.

    Returns:
        True if ``self.preceding_data`` is non-empty and its last character
        is not whitespace; False otherwise, including when
        ``self.preceding_data`` is ``None`` or an empty string.
    """
    preceding = self.preceding_data
    if not preceding:
        # None or '' -- there is no preceding text for a marker to abut.
        return False
    # Always hand back a real bool instead of falling through to None.
    return re.match(r'[^\s]', preceding[-1]) is not None

if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
self.o(self.emphasis_mark)
if start and no_preceding_space(self):
emphasis = ' ' + self.emphasis_mark
else:
emphasis = self.emphasis_mark

self.o(emphasis)
if start:
self.stressed = True
if tag in ['strong', 'b'] and not self.ignore_emphasis:
self.o(self.strong_mark)
if tag in ['del', 'strike', 's']:
if start and no_preceding_space(self):
strong = ' ' + self.strong_mark
else:
strong = self.strong_mark

self.o(strong)
if start:
self.o('~~')
self.stressed = True
if tag in ['del', 'strike', 's']:
if start and no_preceding_space(self):
strike = ' ~~'
else:
self.o('~~')
strike = '~~'

self.o(strike)
if start:
self.stressed = True

if self.google_doc:
if not self.inheader:
Expand Down Expand Up @@ -761,6 +788,19 @@ def o(self, data, puredata=0, force=0):
self.outcount += 1

def handle_data(self, data, entity_char=False):

if self.stressed:
data = data.strip()
self.stressed = False
self.preceding_stressed = True
elif (self.preceding_stressed
and re.match(r'[^\s.!?]', data[0])
and not hn(self.current_tag)
and self.current_tag not in ['a', 'code', 'pre']):
# matches when data starts with anything other than whitespace or '.', '!', '?'
data = ' ' + data
self.preceding_stressed = False

if self.style:
self.style_def.update(dumb_css_parser(data))

Expand All @@ -778,6 +818,7 @@ def handle_data(self, data, entity_char=False):

if not self.code and not self.pre and not entity_char:
data = escape_md_section(data, snob=self.escape_snob)
self.preceding_data = data
self.o(data, 1)

def unknown_decl(self, data): # pragma: no cover
Expand Down
7 changes: 7 additions & 0 deletions test/edge_case_emphasis.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<p><em> emphasis </em></p>
<p><em>emphasis: </em>some text</p>
<p><em>repeat: </em>again</p>
<p>separate<em> emphasis</em> some more text</p>
<p><em>emphasis</em>.</p>
<p><em>emphasis</em>?</p>
<p><em>emphasis</em>!</p>
14 changes: 14 additions & 0 deletions test/edge_case_emphasis.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
_emphasis_

_emphasis:_ some text

_repeat:_ again

separate _emphasis_ some more text

_emphasis_.

_emphasis_?

_emphasis_!

0 comments on commit b2765e2

Please sign in to comment.