diff options
Diffstat (limited to 'src/mistune/list_parser.py')
-rw-r--r-- | src/mistune/list_parser.py | 250 |
1 files changed, 250 insertions, 0 deletions
"""Block-level parser for ordered and unordered lists."""

import re
from .core import BlockState
from .util import (
    strip_end,
    expand_tab,
    expand_leading_tab,
)
# List parsing is complex enough that it lives in its own module.

# Matches the first line of a list:
#   list_1 - up to three spaces of indentation before the marker
#   list_2 - the marker: a bullet (*, +, -) or 1-9 digits followed by . or )
#   list_3 - the rest of the line: only whitespace, or whitespace then text
LIST_PATTERN = (
    r'^(?P<list_1> {0,3})'
    r'(?P<list_2>[\*\+-]|\d{1,9}[.)])'
    r'(?P<list_3>[ \t]*|[ \t].+)$'
)

# Captures the run of leading spaces in front of the first non-space character.
_LINE_HAS_TEXT = re.compile(r'( *)\S')

# Number of leading spaces after the marker at which the item content is
# treated as indented code instead of widening the content indent.
_INDENT_CODE_PREFIX = ' ' * 5


def parse_list(block, m: re.Match, state: BlockState) -> int:
    """Parse tokens for ordered and unordered list.

    :param block: the block parser; this function reads ``max_nested_level``,
        ``list_rules``, ``specification``, ``BLANK_LINE``, ``parse_method``
        and ``parse`` from it (directly or through ``_parse_list_item``).
    :param m: a match of ``LIST_PATTERN`` for the first line of the list.
    :param state: the current block state; its cursor and tokens are updated.
    :return: a source position, either the end position reported by
        ``state.append_paragraph()`` / ``block.parse_method()`` or the
        final ``state.cursor``.
    """
    text = m.group('list_3')
    if not text.strip():
        # Example 285
        # an empty list item cannot interrupt a paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

    marker = m.group('list_2')
    # bullets are a single character; ordered markers are digits + delimiter
    ordered = len(marker) > 1
    depth = state.depth()
    token = {
        'type': 'list',
        'children': [],
        'tight': True,
        'bullet': marker[-1],
        'attrs': {
            'depth': depth,
            'ordered': ordered,
        },
    }
    if ordered:
        start = int(marker[:-1])
        if start != 1:
            # Example 304
            # we allow only lists starting with 1 to interrupt paragraphs
            end_pos = state.append_paragraph()
            if end_pos:
                return end_pos
            token['attrs']['start'] = start

    # step over the matched line and the character that follows it
    state.cursor = m.end() + 1
    groups = (m.group('list_1'), marker, text)

    if depth >= block.max_nested_level - 1:
        # nesting limit reached: parse item content without the list rule,
        # working on a copy so block.list_rules itself is not mutated
        rules = list(block.list_rules)
        rules.remove('list')
    else:
        rules = block.list_rules

    bullet = _get_list_bullet(marker[-1])
    # each call consumes one item and hands back the marker groups of the
    # next sibling item, or None when the list is finished
    while groups:
        groups = _parse_list_item(block, bullet, groups, token, state, rules)

    end_pos = token.pop('_end_pos', None)
    _transform_tight_list(token)
    if end_pos:
        # another block interrupted the list and has already appended its
        # own tokens; insert the list at the index recorded before that
        index = token.pop('_tok_index')
        state.tokens.insert(index, token)
        return end_pos

    state.append_token(token)
    return state.cursor


def _transform_tight_list(token):
    """Turn paragraphs of a tight list into ``block_text`` tokens, in place.

    Nested lists found directly inside the items are processed recursively;
    a nested list is only rewritten when it is itself marked tight.
    """
    if not token['tight']:
        return

    for list_item in token['children']:
        for tok in list_item['children']:
            if tok['type'] == 'paragraph':
                tok['type'] = 'block_text'
            elif tok['type'] == 'list':
                _transform_tight_list(tok)


def _parse_list_item(block, bullet, groups, token, state, rules):
    """Consume one list item and append it to ``token['children']``.

    :param block: the block parser (see ``parse_list``).
    :param bullet: regex fragment for this list's marker, as returned by
        ``_get_list_bullet``.
    :param groups: ``(spaces, marker, text)`` of the item's first line.
    :param token: the list token being built; ``tight``, ``children`` and
        the temporary ``_tok_index`` / ``_end_pos`` keys may be updated.
    :param state: the current block state; its cursor is advanced.
    :param rules: rule names used to parse the item's content.
    :return: the ``(spaces, marker, text)`` groups of the next sibling item,
        or ``None`` when no sibling follows.
    """
    spaces, marker, text = groups

    leading_width = len(spaces) + len(marker)
    text, continue_width = _compile_continue_width(text, leading_width)
    item_pattern = _compile_list_item_pattern(bullet, leading_width)
    pairs = [
        ('thematic_break', block.specification['thematic_break']),
        ('fenced_code', block.specification['fenced_code']),
        ('axt_heading', block.specification['axt_heading']),
        ('block_quote', block.specification['block_quote']),
        ('block_html', block.specification['block_html']),
        ('list', block.specification['list']),
    ]
    if leading_width < 3:
        # NOTE(review): this rewrites the first '3' in each pattern,
        # presumably the ' {0,3}' indent quantifier - confirm against
        # the patterns in block.specification
        _repl_w = str(leading_width)
        pairs = [(n, p.replace('3', _repl_w, 1)) for n, p in pairs]

    pairs.insert(1, ('list_item', item_pattern))
    # every alternative must start right after a newline
    regex = '|'.join(r'(?P<%s>(?<=\n)%s)' % pair for pair in pairs)
    sc = re.compile(regex, re.M)

    src = ''
    next_group = None
    prev_blank_line = False
    pos = state.cursor

    continue_space = ' ' * continue_width
    while pos < state.cursor_max:
        pos = state.find_line_end()
        line = state.get_text(pos)
        if block.BLANK_LINE.match(line):
            src += '\n'
            prev_blank_line = True
            state.cursor = pos
            continue

        line = expand_leading_tab(line)
        if line.startswith(continue_space):
            if prev_blank_line and not text and not src.strip():
                # Example 280
                # A list item can begin with at most one blank line
                break

            src += line
            prev_blank_line = False
            state.cursor = pos
            continue

        m = sc.match(state.src, state.cursor)
        if m:
            tok_type = m.lastgroup
            if tok_type == 'list_item':
                # a blank line between two sibling items makes the list loose
                if prev_blank_line:
                    token['tight'] = False
                next_group = (
                    m.group('listitem_1'),
                    m.group('listitem_2'),
                    m.group('listitem_3')
                )
                state.cursor = m.end() + 1
                break
            # remember where the list token belongs before the interrupting
            # block appends its own tokens
            tok_index = len(state.tokens)
            end_pos = block.parse_method(m, state)
            if end_pos:
                token['_tok_index'] = tok_index
                token['_end_pos'] = end_pos
                break

        if prev_blank_line and not line.startswith(continue_space):
            # not a continue line, and previous line is blank
            break

        # otherwise the line is kept as part of this item
        src += line
        state.cursor = pos

    text += _clean_list_item_text(src, continue_width)
    child = state.child_state(strip_end(text))

    block.parse(child, rules)

    if token['tight'] and _is_loose_list(child.tokens):
        token['tight'] = False

    token['children'].append({
        'type': 'list_item',
        'children': child.tokens,
    })
    return next_group


def _get_list_bullet(c):
    """Return the regex fragment that matches sibling markers of a list.

    ``c`` is the last character of the first item's marker: ``.`` or ``)``
    for ordered lists, otherwise the bullet character itself. Ordered
    markers require 1-9 digits, the same as ``LIST_PATTERN``, so a bare
    ``.`` or ``)`` is never taken for a list item.
    """
    if c == '.':
        bullet = r'\d{1,9}\.'
    elif c == ')':
        bullet = r'\d{1,9}\)'
    elif c == '*':
        bullet = r'\*'
    elif c == '+':
        bullet = r'\+'
    else:
        bullet = '-'
    return bullet


def _compile_list_item_pattern(bullet, leading_width):
    """Build the regex source that matches the next sibling list item.

    The allowed indentation before the marker is ``leading_width`` spaces,
    capped at three.
    """
    leading_width = min(leading_width, 3)
    return (
        r'^(?P<listitem_1> {0,' + str(leading_width) + '})'
        r'(?P<listitem_2>' + bullet + ')'
        r'(?P<listitem_3>[ \t]*|[ \t][^\n]+)$'
    )


def _compile_continue_width(text, leading_width):
    """Work out the content indent of a list item.

    :param text: the remainder of the item's first line after the marker.
    :param leading_width: width of the indentation plus the marker.
    :return: ``(text, continue_width)`` where ``text`` is the first line's
        content with the marker padding removed (newline terminated, or
        ``''`` when the line has no content) and ``continue_width`` is the
        number of leading spaces a following line needs in order to belong
        to this item.
    """
    text = expand_leading_tab(text, 3)
    text = expand_tab(text)

    m2 = _LINE_HAS_TEXT.match(text)
    if m2:
        # indent code, startswith 5 spaces: only one space belongs to the
        # marker, the rest stays in the content
        if text.startswith(_INDENT_CODE_PREFIX):
            space_width = 1
        else:
            space_width = len(m2.group(1))

        text = text[space_width:] + '\n'
    else:
        # no content on the marker line
        space_width = 1
        text = ''

    continue_width = leading_width + space_width
    return text, continue_width


def _clean_list_item_text(src, continue_width):
    """Strip the item's content indent from every line of ``src``.

    Lines that do not start with ``continue_width`` spaces are kept as is.
    """
    # according to Example 7, tab should be treated as 3 spaces
    rv = []
    trim_space = ' ' * continue_width
    for line in src.split('\n'):
        if line.startswith(trim_space):
            # drop only the leading indent
            line = line.replace(trim_space, '', 1)
            # according to CommonMark Example 5
            # tab should be treated as 4 spaces
            line = expand_tab(line)
        rv.append(line)

    return '\n'.join(rv)


def _is_loose_list(tokens):
    """Return True when an item's child tokens make its list loose.

    That is the case when the tokens contain a ``blank_line`` or more than
    one ``paragraph``.
    """
    paragraph_count = 0
    for tok in tokens:
        if tok['type'] == 'blank_line':
            return True
        if tok['type'] == 'paragraph':
            paragraph_count += 1
            if paragraph_count > 1:
                return True
    return False