Skip to content

Commit

Permalink
Fix for miyuchina#89, List Items separated by tab character not parsed correctly.
Browse files Browse the repository at this point in the history

Also fixes failing examples 312 and 313 in the CommonMark 0.30 spec, due to the way leading space
is now checked for list items.

The direct cause of the reported bug was that only spaces and not tabs were considered
valid separators for list item markers. Another problem was that the implemented tab expansion,
where tabs were always expanded to four spaces, did not work according to the spec, which states
that tabs should be expanded to the nearest tab stop (of width 4).

This fix uses `expandtabs()` to implement the tab stops correctly and moves extraction
of content into the `parse_marker()` and `parse_continuation()` methods. This lets
us implement use cases like "list interrupts a paragraph" and "list item continuation"
in a less error-prone way.
  • Loading branch information
anderskaplan committed Oct 30, 2022
1 parent 10cc468 commit 666a754
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 59 deletions.
125 changes: 69 additions & 56 deletions mistletoe/block_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,14 @@ def read(cls, lines):
and not Quote.start(next_line)):

# check if next_line starts List
list_pair = ListItem.parse_marker(next_line)
marker_tuple = ListItem.parse_marker(next_line)
if (len(next_line) - len(next_line.lstrip()) < 4
and list_pair is not None):
prepend, leader = list_pair
# non-empty list item
if next_line[:prepend].endswith(' '):
# unordered list, or ordered list starting from 1
if not leader[:-1].isdigit() or leader[:-1] == '1':
and marker_tuple is not None):
_, leader, content = marker_tuple
# to break a paragraph, the first line may not be empty,
# and the list must be unordered or start from 1.
if not content.strip() == '':
if not leader[0].isdigit() or leader in ['1.', '1)']:
break

# check if next_line starts HTMLBlock other than type 7
Expand Down Expand Up @@ -512,17 +512,32 @@ class ListItem(BlockToken):
Not included in the parsing process, but called by List.
"""
repr_attributes = ("leader", "prepend", "loose")
pattern = re.compile(r'\s*(\d{0,9}[.)]|[+\-*])(\s*$|\s+)')
pattern = re.compile(r' {0,3}(\d{0,9}[.)]|[+\-*])(\s*$|\s+)')
continuation_pattern = re.compile(r'([ \t]*)(\S.*\n|\n)')

def __init__(self, parse_buffer, prepend, leader):
    """
    Build a ListItem from an already-tokenized parse buffer.

    Args:
        parse_buffer: tokenized block content of the item; its `loose`
            flag records whether the item belongs to a loose list.
        prepend (int): start position of the content, i.e. the indentation
            required for continuation lines (see parse_marker).
        leader (str): the bullet or ordered-list marker that opened the item.
    """
    self.leader = leader
    self.prepend = prepend
    # turn the collected block-level tokens into children of this item
    self.children = tokenizer.make_tokens(parse_buffer)
    self.loose = parse_buffer.loose

@staticmethod
def in_continuation(line, prepend):
return line.strip() == '' or len(line) - len(line.lstrip()) >= prepend
@classmethod
def parse_continuation(cls, line, prepend):
    """
    Return the content of `line` (i.e. the line with the prepend stripped
    off) iff the line is a valid continuation line for a list item whose
    content starts at column `prepend`; return None otherwise.
    Note that the list item may still continue even if this test doesn't
    pass, due to lazy continuation.
    """
    match = cls.continuation_pattern.match(line)
    if match is None:
        return None
    rest = match.group(2)
    # a blank line continues the item regardless of its indentation
    if rest == '\n':
        return '\n'
    # expand leading tabs to tab stops of width 4, per the CommonMark spec
    indent = match.group(1).expandtabs(4)
    if len(indent) < prepend:
        return None
    return indent[prepend:] + rest

@staticmethod
def other_token(line):
Expand All @@ -534,65 +549,64 @@ def other_token(line):
@classmethod
def parse_marker(cls, line):
    """
    Returns a tuple (prepend, leader, content) iff the line has a valid leader and a
    valid number of spaces between leader and content: between 1 and 4 according to
    the spec. Or if the content is empty, in which case there need not be any spaces.
    The return value is None if the line doesn't have a valid marker.
    The leader is a bullet list marker or an ordered list marker.
    The prepend is the start position of the content, i.e., the indentation required
    for continuation lines.
    """
    match_obj = cls.pattern.match(line.removesuffix('\n'))
    if match_obj is None:
        return None  # no valid leader
    # measure the marker width with tabs expanded to tab stops of width 4
    prepend = len(match_obj.group(0).expandtabs(4))
    leader = match_obj.group(1)
    if prepend == len(line.rstrip('\n')):
        # the line holds only the leader (plus trailing whitespace):
        # content is empty and would start one column after the leader
        prepend = match_obj.end(1) + 1
        content = ''
    else:
        content = line[match_obj.end(0):]
        n_spaces = prepend - match_obj.end(1)
        if n_spaces > 4:
            # more than 4 spaces after the leader: only one space belongs to
            # the marker; the rest is content (start of an indented code block)
            prepend -= n_spaces - 1
            content = ' ' * (n_spaces - 1) + content
    return prepend, leader, content

@classmethod
def read(cls, lines, prev_marker=None):
next_marker = None
lines.anchor()
prepend = -1
leader = None
line_buffer = []

# first line
line = next(lines)
prepend, leader = prev_marker if prev_marker else cls.parse_marker(line)
line = line.replace(leader+'\t', leader+' ', 1).replace('\t', ' ')
empty_first_line = line[prepend:].strip() == ''
if not empty_first_line:
line_buffer.append(line[prepend:])
next_line = lines.peek()
if empty_first_line and next_line is not None and next_line.strip() == '':
parse_buffer = tokenizer.tokenize_block([next(lines)], _token_types)
next_line = lines.peek()
if next_line is not None:
marker_info = cls.parse_marker(next_line)
if marker_info is not None:
next_marker = marker_info
return (parse_buffer, prepend, leader), next_marker
prepend, leader, content = prev_marker if prev_marker else cls.parse_marker(line)
if content.strip() == '':
prepend = len(leader) + 1
blanks = 1
while next_line is not None and next_line.strip() == '':
blanks += 1
next(lines)
next_line = lines.peek()
if blanks > 1:
parse_buffer = tokenizer.ParseBuffer()
parse_buffer.loose = True
next_marker = cls.parse_marker(next_line) if next_line is not None else None
return (parse_buffer, prepend, leader), next_marker
else:
line_buffer.append(content)

# loop
newline = 0
while True:
# no more lines
if next_line is None:
# strip off newlines
# strip off trailing newlines
if newline:
lines.backstep()
del line_buffer[-newline:]
break
next_line = next_line.replace('\t', ' ')
# not in continuation
if not cls.in_continuation(next_line, prepend):

continuation = cls.parse_continuation(next_line, prepend)
if not continuation:
# directly followed by another token
if cls.other_token(next_line):
if newline:
Expand All @@ -609,14 +623,13 @@ def read(cls, lines, prev_marker=None):
lines.backstep()
del line_buffer[-newline:]
break
# lazy continuation
continuation = next_line

# continuation
line_buffer.append(continuation)
newline = newline + 1 if continuation == '\n' else 0
next(lines)
line = next_line
stripped = line.lstrip(' ')
diff = len(line) - len(stripped)
if diff > prepend:
stripped = ' ' * (diff - prepend) + stripped
line_buffer.append(stripped)
newline = newline + 1 if next_line.strip() == '' else 0
next_line = lines.peek()

# block-level tokens are parsed here, so that footnotes can be
Expand Down
83 changes: 80 additions & 3 deletions test/test_block_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,17 +142,19 @@ def test_read(self):
class TestListItem(unittest.TestCase):
def test_parse_marker(self):
    """parse_marker accepts valid bullet/ordered markers (indented by at
    most 3 spaces, or a bare marker) and rejects invalid ones."""
    lines = ['- foo\n',
             ' * bar\n',
             '  + baz\n',
             '1. item 1\n',
             '2) item 2\n',
             '123456789. item x\n',
             '*\n']
    for line in lines:
        self.assertTrue(block_token.ListItem.parse_marker(line))
    bad_lines = ['> foo\n',
                 '1item 1\n',
                 '2| item 2\n',
                 '1234567890. item x\n',      # ordered marker longer than 9 digits
                 '    * too many spaces\n']   # 4+ spaces of indentation
    for line in bad_lines:
        self.assertFalse(block_token.ListItem.parse_marker(line))

Expand Down Expand Up @@ -204,6 +206,81 @@ def test_tight_list(self):
list_item = block_token.tokenize(lines)[0].children[0]
self.assertEqual(list_item.loose, False)

def test_tabbed_list_items(self):
    """Tabs after a list marker and in continuation lines are treated
    per the CommonMark tab-stop rules (regression test for miyuchina#89)."""
    # according to the CommonMark spec:
    # in contexts where spaces help to define block structure, tabs behave as if they
    # were replaced by spaces with a tab stop of 4 characters.
    lines = ['title\n',
             '*\ttabbed item long line\n',
             '\n',  # break lazy continuation
             ' continuation 1\n',
             '* second list item\n',
             '\n',  # break lazy continuation
             '\tcontinuation 2\n']
    tokens = block_token.tokenize(lines)
    # the paragraph and the list are separate top-level tokens
    self.assertEqual(len(tokens), 2)
    self.assertIsInstance(tokens[0], block_token.Paragraph)
    self.assertIsInstance(tokens[1], block_token.List)
    # each item keeps its own (space- or tab-indented) continuation line
    self.assertTrue('tabbed item long line' in tokens[1].children[0])
    self.assertTrue('continuation 1' in tokens[1].children[0])
    self.assertTrue('second list item' in tokens[1].children[1])
    self.assertTrue('continuation 2' in tokens[1].children[1])

def test_list_items_starting_with_blank_line(self):
    """A list item whose first line holds only the marker may have its
    content begin on the following line."""
    lines = ['-\n',
             ' foo\n',
             '-\n',
             ' ```\n',
             ' bar\n',
             ' ```\n',
             '-\n',
             ' baz\n']
    tokens = block_token.tokenize(lines)
    self.assertEqual(len(tokens), 1)
    self.assertIsInstance(tokens[0], block_token.List)
    # the content type of each item is determined by its second line
    self.assertIsInstance(tokens[0].children[0].children[0], block_token.Paragraph)
    self.assertIsInstance(tokens[0].children[1].children[0], block_token.CodeFence)
    self.assertIsInstance(tokens[0].children[2].children[0], block_token.BlockCode)
    self.assertTrue('foo' in tokens[0].children[0].children[0])
    self.assertTrue('bar' in tokens[0].children[1].children[0])
    self.assertEqual('baz\n', tokens[0].children[2].children[0].children[0].content)

def test_a_list_item_can_begin_with_at_most_one_blank_line(self):
    """A marker followed by more than one blank line ends the item; the
    later indented text becomes a separate paragraph."""
    document = ['-\n',
                '\n',
                ' foo\n']
    result = block_token.tokenize(document)
    self.assertEqual(2, len(result))
    list_token, paragraph = result
    self.assertIsInstance(list_token, block_token.List)
    self.assertIsInstance(paragraph, block_token.Paragraph)
    self.assertTrue('foo' in paragraph.children[0])

def test_empty_list_item_in_the_middle(self):
    """An empty item between two non-empty items is preserved, and the
    blank line following it makes the list loose."""
    result = block_token.tokenize(['* a\n',
                                   '*\n',
                                   '\n',
                                   '* c\n'])
    self.assertEqual(1, len(result))
    the_list = result[0]
    self.assertIsInstance(the_list, block_token.List)
    self.assertEqual(3, len(the_list.children))
    self.assertTrue(the_list.loose)

def test_list_with_code_block(self):
    """Indented code blocks inside a list item are recognized relative to
    the item's content column, with extra indentation kept as content."""
    lines = ['1. indented code\n',
             '\n',
             ' paragraph\n',
             '\n',
             ' more code\n']
    tokens = block_token.tokenize(lines)
    self.assertEqual(len(tokens), 1)
    self.assertIsInstance(tokens[0], block_token.List)
    self.assertEqual(len(tokens[0].children), 1)
    # first child is the code block; the leading space beyond the code
    # indent is preserved in its content
    self.assertIsInstance(tokens[0].children[0].children[0], block_token.BlockCode)
    self.assertEqual(' indented code\n', tokens[0].children[0].children[0].children[0].content)
    self.assertIsInstance(tokens[0].children[0].children[1], block_token.Paragraph)
    self.assertIsInstance(tokens[0].children[0].children[2], block_token.BlockCode)


class TestList(unittest.TestCase):
def test_different_markers(self):
Expand Down

0 comments on commit 666a754

Please sign in to comment.