Skip to content

Commit

Permalink
Fix for miyuchina#89, List Items separated by tab character not parsed correctly.
Browse files Browse the repository at this point in the history

Also fixes failing examples 312 and 313 in the CommonMark 0.30 spec, due to the way leading space
is now checked for list items.

The direct cause of the reported bug was that only spaces and not tabs were considered
valid separators for list item markers. Another problem was that the implemented tab expansion,
where tabs were always expanded to four spaces, did not work according to the spec, which states
that tabs should be expanded to the nearest tab stop (of width 4).

This fix uses `expandtabs()` to implement the tab stops correctly and moves extraction
of content into the `parse_marker()` and `parse_continuation()` methods. This lets
us implement use cases like "list interrupts a paragraph" and "list item continuation"
in a less error-prone way.
  • Loading branch information
anderskaplan committed Oct 30, 2022
1 parent 10cc468 commit 666a754
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 59 deletions.
125 changes: 69 additions & 56 deletions mistletoe/block_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,14 @@ def read(cls, lines):
and not Quote.start(next_line)):

# check if next_line starts List
list_pair = ListItem.parse_marker(next_line)
marker_tuple = ListItem.parse_marker(next_line)
if (len(next_line) - len(next_line.lstrip()) < 4
and list_pair is not None):
prepend, leader = list_pair
# non-empty list item
if next_line[:prepend].endswith(' '):
# unordered list, or ordered list starting from 1
if not leader[:-1].isdigit() or leader[:-1] == '1':
and marker_tuple is not None):
_, leader, content = marker_tuple
# to break a paragraph, the first line may not be empty,
# and the list must be unordered or start from 1.
if not content.strip() == '':
if not leader[0].isdigit() or leader in ['1.', '1)']:
break

# check if next_line starts HTMLBlock other than type 7
Expand Down Expand Up @@ -512,17 +512,32 @@ class ListItem(BlockToken):
Not included in the parsing process, but called by List.
"""
repr_attributes = ("leader", "prepend", "loose")
pattern = re.compile(r'\s*(\d{0,9}[.)]|[+\-*])(\s*$|\s+)')
pattern = re.compile(r' {0,3}(\d{0,9}[.)]|[+\-*])(\s*$|\s+)')
continuation_pattern = re.compile(r'([ \t]*)(\S.*\n|\n)')

def __init__(self, parse_buffer, prepend, leader):
    """
    Build a ListItem from an already-tokenized parse buffer.

    Args:
        parse_buffer: tokenized block content of the item; its `loose`
            flag records whether the item belongs to a loose list.
        prepend (int): start position of the content, i.e. the indentation
            required for continuation lines (see parse_marker).
        leader (str): the bullet or ordered-list marker that opened the item.
    """
    self.leader = leader
    self.prepend = prepend
    # turn the collected block-level tokens into children of this item
    self.children = tokenizer.make_tokens(parse_buffer)
    self.loose = parse_buffer.loose

@staticmethod
def in_continuation(line, prepend):
return line.strip() == '' or len(line) - len(line.lstrip()) >= prepend
@classmethod
def parse_continuation(cls, line, prepend):
    """
    Return the content of `line` (i.e. the line with the prepend stripped
    off) iff the line is a valid continuation line for a list item whose
    content starts at column `prepend`; return None otherwise.
    Note that the list item may still continue even if this test doesn't
    pass, due to lazy continuation.
    """
    match = cls.continuation_pattern.match(line)
    if match is None:
        return None
    rest = match.group(2)
    # a blank line continues the item regardless of its indentation
    if rest == '\n':
        return '\n'
    # expand leading tabs to tab stops of width 4, per the CommonMark spec
    indent = match.group(1).expandtabs(4)
    if len(indent) < prepend:
        return None
    return indent[prepend:] + rest

@staticmethod
def other_token(line):
Expand All @@ -534,65 +549,64 @@ def other_token(line):
@classmethod
def parse_marker(cls, line):
    """
    Returns a tuple (prepend, leader, content) iff the line has a valid leader and a
    valid number of spaces between leader and content: between 1 and 4 according to
    the spec. Or if the content is empty, in which case there need not be any spaces.
    The return value is None if the line doesn't have a valid marker.
    The leader is a bullet list marker or an ordered list marker.
    The prepend is the start position of the content, i.e., the indentation required
    for continuation lines.
    """
    match_obj = cls.pattern.match(line.removesuffix('\n'))
    if match_obj is None:
        return None  # no valid leader
    # measure the marker width with tabs expanded to tab stops of width 4
    prepend = len(match_obj.group(0).expandtabs(4))
    leader = match_obj.group(1)
    if prepend == len(line.rstrip('\n')):
        # the line holds only the leader (plus trailing whitespace):
        # content is empty and would start one column after the leader
        prepend = match_obj.end(1) + 1
        content = ''
    else:
        content = line[match_obj.end(0):]
        n_spaces = prepend - match_obj.end(1)
        if n_spaces > 4:
            # more than 4 spaces after the leader: only one space belongs to
            # the marker; the rest is content (start of an indented code block)
            prepend -= n_spaces - 1
            content = ' ' * (n_spaces - 1) + content
    return prepend, leader, content

@classmethod
def read(cls, lines, prev_marker=None):
next_marker = None
lines.anchor()
prepend = -1
leader = None
line_buffer = []

# first line
line = next(lines)
prepend, leader = prev_marker if prev_marker else cls.parse_marker(line)
line = line.replace(leader+'\t', leader+' ', 1).replace('\t', ' ')
empty_first_line = line[prepend:].strip() == ''
if not empty_first_line:
line_buffer.append(line[prepend:])
next_line = lines.peek()
if empty_first_line and next_line is not None and next_line.strip() == '':
parse_buffer = tokenizer.tokenize_block([next(lines)], _token_types)
next_line = lines.peek()
if next_line is not None:
marker_info = cls.parse_marker(next_line)
if marker_info is not None:
next_marker = marker_info
return (parse_buffer, prepend, leader), next_marker
prepend, leader, content = prev_marker if prev_marker else cls.parse_marker(line)
if content.strip() == '':
prepend = len(leader) + 1
blanks = 1
while next_line is not None and next_line.strip() == '':
blanks += 1
next(lines)
next_line = lines.peek()
if blanks > 1:
parse_buffer = tokenizer.ParseBuffer()
parse_buffer.loose = True
next_marker = cls.parse_marker(next_line) if next_line is not None else None
return (parse_buffer, prepend, leader), next_marker
else:
line_buffer.append(content)

# loop
newline = 0
while True:
# no more lines
if next_line is None:
# strip off newlines
# strip off trailing newlines
if newline:
lines.backstep()
del line_buffer[-newline:]
break
next_line = next_line.replace('\t', ' ')
# not in continuation
if not cls.in_continuation(next_line, prepend):

continuation = cls.parse_continuation(next_line, prepend)
if not continuation:
# directly followed by another token
if cls.other_token(next_line):
if newline:
Expand All @@ -609,14 +623,13 @@ def read(cls, lines, prev_marker=None):
lines.backstep()
del line_buffer[-newline:]
break
# lazy continuation
continuation = next_line

# continuation
line_buffer.append(continuation)
newline = newline + 1 if continuation == '\n' else 0
next(lines)
line = next_line
stripped = line.lstrip(' ')
diff = len(line) - len(stripped)
if diff > prepend:
stripped = ' ' * (diff - prepend) + stripped
line_buffer.append(stripped)
newline = newline + 1 if next_line.strip() == '' else 0
next_line = lines.peek()

# block-level tokens are parsed here, so that footnotes can be
Expand Down
83 changes: 80 additions & 3 deletions test/test_block_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,17 +142,19 @@ def test_read(self):
class TestListItem(unittest.TestCase):
def test_parse_marker(self):
    """parse_marker accepts valid bullet/ordered markers (indented by at
    most 3 spaces, or a bare marker) and rejects invalid ones."""
    lines = ['- foo\n',
             ' * bar\n',
             '  + baz\n',
             '1. item 1\n',
             '2) item 2\n',
             '123456789. item x\n',
             '*\n']
    for line in lines:
        self.assertTrue(block_token.ListItem.parse_marker(line))
    bad_lines = ['> foo\n',
                 '1item 1\n',
                 '2| item 2\n',
                 '1234567890. item x\n',      # ordered marker longer than 9 digits
                 '    * too many spaces\n']   # 4+ spaces of indentation
    for line in bad_lines:
        self.assertFalse(block_token.ListItem.parse_marker(line))

Expand Down Expand Up @@ -204,6 +206,81 @@ def test_tight_list(self):
list_item = block_token.tokenize(lines)[0].children[0]
self.assertEqual(list_item.loose, False)

def test_tabbed_list_items(self):
    """Tabs after a list marker and in continuation lines are treated
    per the CommonMark tab-stop rules (regression test for miyuchina#89)."""
    # according to the CommonMark spec:
    # in contexts where spaces help to define block structure, tabs behave as if they
    # were replaced by spaces with a tab stop of 4 characters.
    lines = ['title\n',
             '*\ttabbed item long line\n',
             '\n',  # break lazy continuation
             ' continuation 1\n',
             '* second list item\n',
             '\n',  # break lazy continuation
             '\tcontinuation 2\n']
    tokens = block_token.tokenize(lines)
    # the paragraph and the list are separate top-level tokens
    self.assertEqual(len(tokens), 2)
    self.assertIsInstance(tokens[0], block_token.Paragraph)
    self.assertIsInstance(tokens[1], block_token.List)
    # each item keeps its own (space- or tab-indented) continuation line
    self.assertTrue('tabbed item long line' in tokens[1].children[0])
    self.assertTrue('continuation 1' in tokens[1].children[0])
    self.assertTrue('second list item' in tokens[1].children[1])
    self.assertTrue('continuation 2' in tokens[1].children[1])

def test_list_items_starting_with_blank_line(self):
    """A list item whose first line holds only the marker may have its
    content begin on the following line."""
    lines = ['-\n',
             ' foo\n',
             '-\n',
             ' ```\n',
             ' bar\n',
             ' ```\n',
             '-\n',
             ' baz\n']
    tokens = block_token.tokenize(lines)
    self.assertEqual(len(tokens), 1)
    self.assertIsInstance(tokens[0], block_token.List)
    # the content type of each item is determined by its second line
    self.assertIsInstance(tokens[0].children[0].children[0], block_token.Paragraph)
    self.assertIsInstance(tokens[0].children[1].children[0], block_token.CodeFence)
    self.assertIsInstance(tokens[0].children[2].children[0], block_token.BlockCode)
    self.assertTrue('foo' in tokens[0].children[0].children[0])
    self.assertTrue('bar' in tokens[0].children[1].children[0])
    self.assertEqual('baz\n', tokens[0].children[2].children[0].children[0].content)

def test_a_list_item_can_begin_with_at_most_one_blank_line(self):
    """A marker followed by more than one blank line ends the item; the
    later indented text becomes a separate paragraph."""
    document = ['-\n',
                '\n',
                ' foo\n']
    result = block_token.tokenize(document)
    self.assertEqual(2, len(result))
    list_token, paragraph = result
    self.assertIsInstance(list_token, block_token.List)
    self.assertIsInstance(paragraph, block_token.Paragraph)
    self.assertTrue('foo' in paragraph.children[0])

def test_empty_list_item_in_the_middle(self):
    """An empty item between two non-empty items is preserved, and the
    blank line following it makes the list loose."""
    result = block_token.tokenize(['* a\n',
                                   '*\n',
                                   '\n',
                                   '* c\n'])
    self.assertEqual(1, len(result))
    the_list = result[0]
    self.assertIsInstance(the_list, block_token.List)
    self.assertEqual(3, len(the_list.children))
    self.assertTrue(the_list.loose)

def test_list_with_code_block(self):
    """Indented code blocks inside a list item are recognized relative to
    the item's content column, with extra indentation kept as content."""
    lines = ['1. indented code\n',
             '\n',
             ' paragraph\n',
             '\n',
             ' more code\n']
    tokens = block_token.tokenize(lines)
    self.assertEqual(len(tokens), 1)
    self.assertIsInstance(tokens[0], block_token.List)
    self.assertEqual(len(tokens[0].children), 1)
    # first child is the code block; the leading space beyond the code
    # indent is preserved in its content
    self.assertIsInstance(tokens[0].children[0].children[0], block_token.BlockCode)
    self.assertEqual(' indented code\n', tokens[0].children[0].children[0].children[0].content)
    self.assertIsInstance(tokens[0].children[0].children[1], block_token.Paragraph)
    self.assertIsInstance(tokens[0].children[0].children[2], block_token.BlockCode)


class TestList(unittest.TestCase):
def test_different_markers(self):
Expand Down

0 comments on commit 666a754

Please sign in to comment.