Fix issue with emphasis and whitespace

Alir3z4 · Aug 11, 2017 · b2765e2
1 parent 54976f1
commit b2765e2
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 5 deletions.
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -23,6 +23,7 @@ The AUTHORS/Contributors are (and/or have been):
 * Andres Rey
 * Ciprian Miclaus
 * Toshihiro Kamiya <[email protected]>
+* Jonathan Sundqvist <[email protected]>
 
 Maintainer:
 

diff --git a/html2text/__init__.py b/html2text/__init__.py
@@ -119,6 +119,10 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
         self.abbr_data = None  # last inner HTML (for abbr being defined)
         self.abbr_list = {}  # stack of abbreviations to write later
         self.baseurl = baseurl
+        self.stressed = False
+        self.preceding_stressed = False
+        self.preceding_data = None
+        self.current_tag = None
 
         try:
             del unifiable_n[name2cp('nbsp')]
@@ -276,6 +280,7 @@ def handle_emphasis(self, start, tag_style, parent_style):
                 self.quiet -= 1
 
     def handle_tag(self, tag, attrs, start):
+        self.current_tag = tag
         # attrs is None for endtags
         if attrs is None:
             attrs = {}
@@ -368,15 +373,37 @@ def handle_tag(self, tag, attrs, start):
                 self.blockquote -= 1
                 self.p()
 
+        def no_preceding_space(self):
+            if self.preceding_data and re.match(r'[^\s]', self.preceding_data[-1]):
+                return True
+
         if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
-            self.o(self.emphasis_mark)
+            if start and no_preceding_space(self):
+                emphasis = ' ' + self.emphasis_mark
+            else:
+                emphasis = self.emphasis_mark
+
+            self.o(emphasis)
+            if start:
+                self.stressed = True
         if tag in ['strong', 'b'] and not self.ignore_emphasis:
-            self.o(self.strong_mark)
-        if tag in ['del', 'strike', 's']:
+            if start and no_preceding_space(self):
+                strong = ' ' + self.strong_mark
+            else:
+                strong = self.strong_mark
+
+            self.o(strong)
             if start:
-                self.o('~~')
+                self.stressed = True
+        if tag in ['del', 'strike', 's']:
+            if start and no_preceding_space(self):
+                strike = ' ~~'
             else:
-                self.o('~~')
+                strike = '~~'
+
+            self.o(strike)
+            if start:
+                self.stressed = True
 
         if self.google_doc:
             if not self.inheader:
@@ -761,6 +788,19 @@ def o(self, data, puredata=0, force=0):
             self.outcount += 1
 
     def handle_data(self, data, entity_char=False):
+
+        if self.stressed:
+            data = data.strip()
+            self.stressed = False
+            self.preceding_stressed = True
+        elif (self.preceding_stressed
+              and re.match(r'[^\s.!?]', data[0])
+              and not hn(self.current_tag)
+              and self.current_tag not in ['a', 'code', 'pre']):
+            # should match a letter or common punctuation
+            data = ' ' + data
+            self.preceding_stressed = False
+
         if self.style:
             self.style_def.update(dumb_css_parser(data))
 
@@ -778,6 +818,7 @@ def handle_data(self, data, entity_char=False):
 
         if not self.code and not self.pre and not entity_char:
             data = escape_md_section(data, snob=self.escape_snob)
+        self.preceding_data = data
         self.o(data, 1)
 
     def unknown_decl(self, data):  # pragma: no cover

diff --git a/test/edge_case_emphasis.html b/test/edge_case_emphasis.html
@@ -0,0 +1,7 @@
+<p><em> emphasis </em></p>
+<p><em>emphasis: </em>some text</p>
+<p><em>repeat: </em>again</p>
+<p>separate<em> emphasis</em> some more text</p>
+<p><em>emphasis</em>.</p>
+<p><em>emphasis</em>?</p>
+<p><em>emphasis</em>!</p>
diff --git a/test/edge_case_emphasis.md b/test/edge_case_emphasis.md
@@ -0,0 +1,14 @@
+_emphasis_
+
+_emphasis:_ some text
+
+_repeat:_ again
+
+separate _emphasis_ some more text
+
+_emphasis_.
+
+_emphasis_?
+
+_emphasis_!
+