Skip to content

Commit

Permalink
Fix for #89, List Items separated by tab character not parsed correct…
Browse files Browse the repository at this point in the history
…ly. (#164)

Also fixes failing examples 312 and 313 in the CommonMark 0.30 spec due to the way leading space is now checked for list items.

The 1st problem was that `# check if next_line starts List` considered only markers followed
by a space and not by a tab. Another problem was that the implemented tab expansion
to tab stops (of width 4 as per spec) could work only in a limited scope.

This fix uses `expandtabs()` to implement the tab stops correctly and moves extraction
of content into the `parse_marker()` and `parse_continuation()` methods. This lets
us implement use cases like "list interrupts a paragraph" and "list item continuation"
in a less error-prone way.
  • Loading branch information
anderskaplan authored Oct 30, 2022
1 parent 808abf4 commit 3335e5e
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 69 deletions.
150 changes: 84 additions & 66 deletions mistletoe/block_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,13 @@ def read(cls, lines):
and not Quote.start(next_line)):

# check if next_line starts List
list_pair = ListItem.parse_marker(next_line)
if (len(next_line) - len(next_line.lstrip()) < 4
and list_pair is not None):
prepend, leader = list_pair
# non-empty list item
if next_line[:prepend].endswith(' '):
# unordered list, or ordered list starting from 1
if not leader[:-1].isdigit() or leader[:-1] == '1':
marker_tuple = ListItem.parse_marker(next_line)
if (marker_tuple is not None):
_, leader, content = marker_tuple
# to break a paragraph, the first line may not be empty,
# and the list must be unordered or start from 1.
if not content.strip() == '':
if not leader[0].isdigit() or leader in ['1.', '1)']:
break

# check if next_line starts HTMLBlock other than type 7
Expand Down Expand Up @@ -512,17 +511,32 @@ class ListItem(BlockToken):
Not included in the parsing process, but called by List.
"""
repr_attributes = ("leader", "prepend", "loose")
pattern = re.compile(r'\s*(\d{0,9}[.)]|[+\-*])(\s*$|\s+)')
pattern = re.compile(r' {0,3}(\d{0,9}[.)]|[+\-*])($|\s+)')
continuation_pattern = re.compile(r'([ \t]*)(\S.*\n|\n)')

def __init__(self, parse_buffer, prepend, leader):
self.leader = leader
self.prepend = prepend
self.children = tokenizer.make_tokens(parse_buffer)
self.loose = parse_buffer.loose

@staticmethod
def in_continuation(line, prepend):
return line.strip() == '' or len(line) - len(line.lstrip()) >= prepend
@classmethod
def parse_continuation(cls, line, prepend):
"""
Returns content (i.e. the line with the prepend stripped off) iff the line
is a valid continuation line for a list item with the given prepend length,
otherwise None.
Note that the list item may still continue even if this test doesn't pass
due to lazy continuation.
"""
match_obj = cls.continuation_pattern.match(line)
if match_obj is None:
return None
if match_obj.group(2) == '\n':
return '\n'
expanded_spaces = match_obj.group(1).expandtabs(4)
return expanded_spaces[prepend:] + match_obj.group(2) if len(expanded_spaces) >= prepend else None

@staticmethod
def other_token(line):
Expand All @@ -534,89 +548,93 @@ def other_token(line):
@classmethod
def parse_marker(cls, line):
"""
Returns a pair (prepend, leader) iff the line has a valid leader.
Returns a tuple (prepend, leader, content) iff the line has a valid leader and at
least one space separating leader and content, or if the content is empty, in which
case there need not be any spaces.
The return value is None if the line doesn't have a valid marker.
The leader is a bullet list marker, or an ordered list marker.
The prepend is the start position of the content, i.e., the indentation required
for continuation lines.
"""
match_obj = cls.pattern.match(line)
if match_obj is None:
return None # no valid leader
return None
prepend = len(match_obj.group(0).expandtabs(4))
leader = match_obj.group(1)
content = match_obj.group(0).replace(leader+'\t', leader+' ', 1)
# reassign prepend and leader
prepend = len(content)
if prepend == len(line.rstrip('\n')):
prepend = match_obj.end(1) + 1
else:
spaces = match_obj.group(2)
if spaces.startswith('\t'):
spaces = spaces.replace('\t', ' ', 1)
spaces = spaces.replace('\t', ' ')
n_spaces = len(spaces)
if n_spaces > 4:
prepend = match_obj.end(1) + 1
return prepend, leader
content = line[match_obj.end(0):]
n_spaces = prepend - match_obj.end(1)
if n_spaces > 4:
# if there are more than 4 spaces after the leader, we treat them as part of the content
# with the exception of the first (marker separator) space.
prepend -= n_spaces - 1
content = ' ' * (n_spaces - 1) + content
return prepend, leader, content

@classmethod
def read(cls, lines, prev_marker=None):
next_marker = None
lines.anchor()
prepend = -1
leader = None
line_buffer = []

# first line
line = next(lines)
prepend, leader = prev_marker if prev_marker else cls.parse_marker(line)
line = line.replace(leader+'\t', leader+' ', 1).replace('\t', ' ')
empty_first_line = line[prepend:].strip() == ''
if not empty_first_line:
line_buffer.append(line[prepend:])
next_line = lines.peek()
if empty_first_line and next_line is not None and next_line.strip() == '':
parse_buffer = tokenizer.tokenize_block([next(lines)], _token_types)
next_line = lines.peek()
if next_line is not None:
marker_info = cls.parse_marker(next_line)
if marker_info is not None:
next_marker = marker_info
return (parse_buffer, prepend, leader), next_marker
prepend, leader, content = prev_marker if prev_marker else cls.parse_marker(line)
if content.strip() == '':
# item starting with a blank line: look for the next non-blank line
prepend = len(leader) + 1
blanks = 1
while next_line is not None and next_line.strip() == '':
blanks += 1
next(lines)
next_line = lines.peek()
# if the line following the list marker is also empty, then this is an empty
# list item.
if blanks > 1:
parse_buffer = tokenizer.ParseBuffer()
parse_buffer.loose = True
next_marker = cls.parse_marker(next_line) if next_line is not None else None
return (parse_buffer, prepend, leader), next_marker
else:
line_buffer.append(content)

# loop
newline = 0
# loop over the following lines, looking for the end of the list item
newline_count = 0
while True:
# no more lines
if next_line is None:
# strip off newlines
if newline:
# list item ends here because we have reached the end of content
if newline_count:
lines.backstep()
del line_buffer[-newline:]
del line_buffer[-newline_count:]
break
next_line = next_line.replace('\t', ' ')
# not in continuation
if not cls.in_continuation(next_line, prepend):
# directly followed by another token

continuation = cls.parse_continuation(next_line, prepend)
if not continuation:
# the line doesn't have the indentation to show that it belongs to
# the list item, but it should be included anyway by lazy continuation...
# ...unless it's the start of another token
if cls.other_token(next_line):
if newline:
if newline_count:
lines.backstep()
del line_buffer[-newline:]
del line_buffer[-newline_count:]
break
# next_line is a new list item
# ...or it's a new list item
marker_info = cls.parse_marker(next_line)
if marker_info is not None:
next_marker = marker_info
break
# not another item, has newlines -> not continuation
if newline:
# ...or the line above it was blank
if newline_count:
lines.backstep()
del line_buffer[-newline:]
del line_buffer[-newline_count:]
break
continuation = next_line

line_buffer.append(continuation)
newline_count = newline_count + 1 if continuation == '\n' else 0
next(lines)
line = next_line
stripped = line.lstrip(' ')
diff = len(line) - len(stripped)
if diff > prepend:
stripped = ' ' * (diff - prepend) + stripped
line_buffer.append(stripped)
newline = newline + 1 if next_line.strip() == '' else 0
next_line = lines.peek()

# block-level tokens are parsed here, so that footnotes can be
Expand Down
83 changes: 80 additions & 3 deletions test/test_block_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,17 +142,19 @@ def test_read(self):
class TestListItem(unittest.TestCase):
def test_parse_marker(self):
lines = ['- foo\n',
'* bar\n',
' * bar\n',
' + baz\n',
'1. item 1\n',
'2) item 2\n',
'123456789. item x\n']
'123456789. item x\n',
'*\n']
for line in lines:
self.assertTrue(block_token.ListItem.parse_marker(line))
bad_lines = ['> foo\n',
'1item 1\n',
'2| item 2\n',
'1234567890. item x\n']
'1234567890. item x\n',
' * too many spaces\n']
for line in bad_lines:
self.assertFalse(block_token.ListItem.parse_marker(line))

Expand Down Expand Up @@ -204,6 +206,81 @@ def test_tight_list(self):
list_item = block_token.tokenize(lines)[0].children[0]
self.assertEqual(list_item.loose, False)

def test_tabbed_list_items(self):
# according to the CommonMark spec:
# in contexts where spaces help to define block structure, tabs behave as if they
# were replaced by spaces with a tab stop of 4 characters.
lines = ['title\n',
'*\ttabbed item long line\n',
'\n', # break lazy continuation
' continuation 1\n',
'* second list item\n',
'\n', # break lazy continuation
'\tcontinuation 2\n']
tokens = block_token.tokenize(lines)
self.assertEqual(len(tokens), 2)
self.assertIsInstance(tokens[0], block_token.Paragraph)
self.assertIsInstance(tokens[1], block_token.List)
self.assertTrue('tabbed item long line' in tokens[1].children[0])
self.assertTrue('continuation 1' in tokens[1].children[0])
self.assertTrue('second list item' in tokens[1].children[1])
self.assertTrue('continuation 2' in tokens[1].children[1])

def test_list_items_starting_with_blank_line(self):
lines = ['-\n',
' foo\n',
'-\n',
' ```\n',
' bar\n',
' ```\n',
'-\n',
' baz\n']
tokens = block_token.tokenize(lines)
self.assertEqual(len(tokens), 1)
self.assertIsInstance(tokens[0], block_token.List)
self.assertIsInstance(tokens[0].children[0].children[0], block_token.Paragraph)
self.assertIsInstance(tokens[0].children[1].children[0], block_token.CodeFence)
self.assertIsInstance(tokens[0].children[2].children[0], block_token.BlockCode)
self.assertTrue('foo' in tokens[0].children[0].children[0])
self.assertEqual('bar\n', tokens[0].children[1].children[0].children[0].content)
self.assertEqual('baz\n', tokens[0].children[2].children[0].children[0].content)

def test_a_list_item_may_begin_with_at_most_one_blank_line(self):
lines = ['-\n',
'\n',
' foo\n']
tokens = block_token.tokenize(lines)
self.assertEqual(len(tokens), 2)
self.assertIsInstance(tokens[0], block_token.List)
self.assertIsInstance(tokens[1], block_token.Paragraph)
self.assertTrue('foo' in tokens[1].children[0])

def test_empty_list_item_in_the_middle(self):
lines = ['* a\n',
'*\n',
'\n',
'* c\n']
tokens = block_token.tokenize(lines)
self.assertEqual(len(tokens), 1)
self.assertIsInstance(tokens[0], block_token.List)
self.assertEqual(len(tokens[0].children), 3)
self.assertTrue(tokens[0].loose)

def test_list_with_code_block(self):
lines = ['1. indented code\n',
'\n',
' paragraph\n',
'\n',
' more code\n']
tokens = block_token.tokenize(lines)
self.assertEqual(len(tokens), 1)
self.assertIsInstance(tokens[0], block_token.List)
self.assertEqual(len(tokens[0].children), 1)
self.assertIsInstance(tokens[0].children[0].children[0], block_token.BlockCode)
self.assertEqual(' indented code\n', tokens[0].children[0].children[0].children[0].content)
self.assertIsInstance(tokens[0].children[0].children[1], block_token.Paragraph)
self.assertIsInstance(tokens[0].children[0].children[2], block_token.BlockCode)


class TestList(unittest.TestCase):
def test_different_markers(self):
Expand Down

0 comments on commit 3335e5e

Please sign in to comment.