diff --git a/mistletoe/block_token.py b/mistletoe/block_token.py index 241431ee..ea720baa 100644 --- a/mistletoe/block_token.py +++ b/mistletoe/block_token.py @@ -324,14 +324,14 @@ def read(cls, lines): and not Quote.start(next_line)): # check if next_line starts List - list_pair = ListItem.parse_marker(next_line) + marker_tuple = ListItem.parse_marker(next_line) if (len(next_line) - len(next_line.lstrip()) < 4 - and list_pair is not None): - prepend, leader = list_pair - # non-empty list item - if next_line[:prepend].endswith(' '): - # unordered list, or ordered list starting from 1 - if not leader[:-1].isdigit() or leader[:-1] == '1': + and marker_tuple is not None): + _, leader, content = marker_tuple + # to break a paragraph, the first line may not be empty, + # and the list must be unordered or start from 1. + if not content.strip() == '': + if not leader[0].isdigit() or leader in ['1.', '1)']: break # check if next_line starts HTMLBlock other than type 7 @@ -512,7 +512,8 @@ class ListItem(BlockToken): Not included in the parsing process, but called by List. """ repr_attributes = ("leader", "prepend", "loose") - pattern = re.compile(r'\s*(\d{0,9}[.)]|[+\-*])(\s*$|\s+)') + pattern = re.compile(r' {0,3}(\d{0,9}[.)]|[+\-*])(\s*$|\s+)') + continuation_pattern = re.compile(r'([ \t]*)(\S.*\n|\n)') def __init__(self, parse_buffer, prepend, leader): self.leader = leader @@ -520,9 +521,23 @@ def __init__(self, parse_buffer, prepend, leader): self.children = tokenizer.make_tokens(parse_buffer) self.loose = parse_buffer.loose - @staticmethod - def in_continuation(line, prepend): - return line.strip() == '' or len(line) - len(line.lstrip()) >= prepend + @classmethod + def parse_continuation(cls, line, prepend): + """ + Returns content (i.e. the line with the prepend stripped off) iff the line + is a valid continuation line for a list item with the given prepend length, + otherwise None. + + Note that the list item may still continue even if this test doesn't pass + due to lazy continuation. + """ + match_obj = cls.continuation_pattern.match(line) + if match_obj is None: + return None + if match_obj.group(2) == '\n': + return '\n' + expanded_spaces = match_obj.group(1).expandtabs(4) + return expanded_spaces[prepend:] + match_obj.group(2) if len(expanded_spaces) >= prepend else None @staticmethod def other_token(line): @@ -534,65 +549,64 @@ def other_token(line): @classmethod def parse_marker(cls, line): """ - Returns a pair (prepend, leader) iff the line has a valid leader. + Returns a tuple (prepend, leader, content) iff the line has a valid leader and a + valid number of spaces between leader and content: between 1 and 4 according to + the spec. Or if the content is empty, in which case there need not be any spaces. + The return value is None if the line doesn't have a valid marker. + + The leader is a bullet list marker or an ordered list marker. + + The prepend is the start position of the content, i.e., the indentation required + for continuation lines. """ - match_obj = cls.pattern.match(line) + match_obj = cls.pattern.match(line.removesuffix('\n')) if match_obj is None: - return None # no valid leader + return None + prepend = len(match_obj.group(0).expandtabs(4)) leader = match_obj.group(1) - content = match_obj.group(0).replace(leader+'\t', leader+' ', 1) - # reassign prepend and leader - prepend = len(content) - if prepend == len(line.rstrip('\n')): - prepend = match_obj.end(1) + 1 - else: - spaces = match_obj.group(2) - if spaces.startswith('\t'): - spaces = spaces.replace('\t', ' ', 1) - spaces = spaces.replace('\t', ' ') - n_spaces = len(spaces) - if n_spaces > 4: - prepend = match_obj.end(1) + 1 - return prepend, leader + content = line[match_obj.end(0):] + n_spaces = prepend - match_obj.end(1) + if n_spaces > 4: + prepend -= n_spaces - 1 + content = ' ' * (n_spaces - 1) + content + return prepend, leader, content @classmethod def read(cls, lines, prev_marker=None): next_marker = None lines.anchor() - prepend = -1 - leader = None line_buffer = [] # first line line = next(lines) - prepend, leader = prev_marker if prev_marker else cls.parse_marker(line) - line = line.replace(leader+'\t', leader+' ', 1).replace('\t', ' ') - empty_first_line = line[prepend:].strip() == '' - if not empty_first_line: - line_buffer.append(line[prepend:]) next_line = lines.peek() - if empty_first_line and next_line is not None and next_line.strip() == '': - parse_buffer = tokenizer.tokenize_block([next(lines)], _token_types) - next_line = lines.peek() - if next_line is not None: - marker_info = cls.parse_marker(next_line) - if marker_info is not None: - next_marker = marker_info - return (parse_buffer, prepend, leader), next_marker + prepend, leader, content = prev_marker if prev_marker else cls.parse_marker(line) + if content.strip() == '': + prepend = len(leader) + 1 + blanks = 1 + while next_line is not None and next_line.strip() == '': + blanks += 1 + next(lines) + next_line = lines.peek() + if blanks > 1: + parse_buffer = tokenizer.ParseBuffer() + parse_buffer.loose = True + next_marker = cls.parse_marker(next_line) if next_line is not None else None + return (parse_buffer, prepend, leader), next_marker + else: + line_buffer.append(content) - # loop newline = 0 while True: - # no more lines if next_line is None: - # strip off newlines + # strip off trailing newlines if newline: lines.backstep() del line_buffer[-newline:] break - next_line = next_line.replace('\t', ' ') - # not in continuation - if not cls.in_continuation(next_line, prepend): + + continuation = cls.parse_continuation(next_line, prepend) + if not continuation: # directly followed by another token if cls.other_token(next_line): if newline: @@ -609,14 +623,13 @@ def read(cls, lines, prev_marker=None): lines.backstep() del line_buffer[-newline:] break + # lazy continuation + continuation = next_line + + # continuation + line_buffer.append(continuation) + newline = newline + 1 if continuation == '\n' else 0 next(lines) - line = next_line - stripped = line.lstrip(' ') - diff = len(line) - len(stripped) - if diff > prepend: - stripped = ' ' * (diff - prepend) + stripped - line_buffer.append(stripped) - newline = newline + 1 if next_line.strip() == '' else 0 next_line = lines.peek() # block-level tokens are parsed here, so that footnotes can be diff --git a/test/test_block_token.py b/test/test_block_token.py index dd243458..87804d2c 100644 --- a/test/test_block_token.py +++ b/test/test_block_token.py @@ -142,17 +142,19 @@ def test_read(self): class TestListItem(unittest.TestCase): def test_parse_marker(self): lines = ['- foo\n', - '* bar\n', + ' * bar\n', ' + baz\n', '1. item 1\n', '2) item 2\n', - '123456789. item x\n'] + '123456789. item x\n', + '*\n'] for line in lines: self.assertTrue(block_token.ListItem.parse_marker(line)) bad_lines = ['> foo\n', '1item 1\n', '2| item 2\n', - '1234567890. item x\n'] + '1234567890. item x\n', + ' * too many spaces\n'] for line in bad_lines: self.assertFalse(block_token.ListItem.parse_marker(line)) @@ -204,6 +206,81 @@ def test_tight_list(self): list_item = block_token.tokenize(lines)[0].children[0] self.assertEqual(list_item.loose, False) + def test_tabbed_list_items(self): + # according to the CommonMark spec: + # in contexts where spaces help to define block structure, tabs behave as if they + # were replaced by spaces with a tab stop of 4 characters. + lines = ['title\n', + '*\ttabbed item long line\n', + '\n', # break lazy continuation + ' continuation 1\n', + '* second list item\n', + '\n', # break lazy continuation + '\tcontinuation 2\n'] + tokens = block_token.tokenize(lines) + self.assertEqual(len(tokens), 2) + self.assertIsInstance(tokens[0], block_token.Paragraph) + self.assertIsInstance(tokens[1], block_token.List) + self.assertTrue('tabbed item long line' in tokens[1].children[0]) + self.assertTrue('continuation 1' in tokens[1].children[0]) + self.assertTrue('second list item' in tokens[1].children[1]) + self.assertTrue('continuation 2' in tokens[1].children[1]) + + def test_list_items_starting_with_blank_line(self): + lines = ['-\n', + ' foo\n', + '-\n', + ' ```\n', + ' bar\n', + ' ```\n', + '-\n', + ' baz\n'] + tokens = block_token.tokenize(lines) + self.assertEqual(len(tokens), 1) + self.assertIsInstance(tokens[0], block_token.List) + self.assertIsInstance(tokens[0].children[0].children[0], block_token.Paragraph) + self.assertIsInstance(tokens[0].children[1].children[0], block_token.CodeFence) + self.assertIsInstance(tokens[0].children[2].children[0], block_token.BlockCode) + self.assertTrue('foo' in tokens[0].children[0].children[0]) + self.assertTrue('bar' in tokens[0].children[1].children[0]) + self.assertEqual('baz\n', tokens[0].children[2].children[0].children[0].content) + + def test_a_list_item_can_begin_with_at_most_one_blank_line(self): + lines = ['-\n', + '\n', + ' foo\n'] + tokens = block_token.tokenize(lines) + self.assertEqual(len(tokens), 2) + self.assertIsInstance(tokens[0], block_token.List) + self.assertIsInstance(tokens[1], block_token.Paragraph) + self.assertTrue('foo' in tokens[1].children[0]) + + def test_empty_list_item_in_the_middle(self): + lines = ['* a\n', + '*\n', + '\n', + '* c\n'] + tokens = block_token.tokenize(lines) + self.assertEqual(len(tokens), 1) + self.assertIsInstance(tokens[0], block_token.List) + self.assertEqual(len(tokens[0].children), 3) + self.assertTrue(tokens[0].loose) + + def test_list_with_code_block(self): + lines = ['1. indented code\n', + '\n', + ' paragraph\n', + '\n', + ' more code\n'] + tokens = block_token.tokenize(lines) + self.assertEqual(len(tokens), 1) + self.assertIsInstance(tokens[0], block_token.List) + self.assertEqual(len(tokens[0].children), 1) + self.assertIsInstance(tokens[0].children[0].children[0], block_token.BlockCode) + self.assertEqual(' indented code\n', tokens[0].children[0].children[0].children[0].content) + self.assertIsInstance(tokens[0].children[0].children[1], block_token.Paragraph) + self.assertIsInstance(tokens[0].children[0].children[2], block_token.BlockCode) + class TestList(unittest.TestCase): def test_different_markers(self):