Diffstat (limited to 'src/mistune/list_parser.py')
-rw-r--r--  src/mistune/list_parser.py  250
1 file changed, 250 insertions(+), 0 deletions(-)
diff --git a/src/mistune/list_parser.py b/src/mistune/list_parser.py
new file mode 100644
index 0000000..b5ff866
--- /dev/null
+++ b/src/mistune/list_parser.py
@@ -0,0 +1,250 @@
+import re
+from .core import BlockState
+from .util import (
+    strip_end,
+    expand_tab,
+    expand_leading_tab,
+)
+# Lists are complex, so the list parser is split into its own file.
+
+LIST_PATTERN = (
+    r'^(?P<list_1> {0,3})'
+    r'(?P<list_2>[\*\+-]|\d{1,9}[.)])'
+    r'(?P<list_3>[ \t]*|[ \t].+)$'
+)
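+# Example: for the line "  1. first item" the pattern captures
+#   list_1 = "  "           (up to 3 spaces of indentation)
+#   list_2 = "1."           (bullet "*", "+", "-" or an ordered marker like "1." / "1)")
+#   list_3 = " first item"  (everything after the marker, possibly empty)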
+
+_LINE_HAS_TEXT = re.compile(r'( *)\S')
+
+
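+# Builds the list token for the first marker match, then keeps calling
+# _parse_list_item() until no further sibling item is found.  The token
+# starts out tight and is marked loose when blank lines or multiple
+# paragraphs show up inside an item.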
+def parse_list(block, m: re.Match, state: BlockState) -> int:
+    """Parse tokens for ordered and unordered list."""
+    text = m.group('list_3')
+    if not text.strip():
+        # Example 285
+        # an empty list item cannot interrupt a paragraph
+        end_pos = state.append_paragraph()
+        if end_pos:
+            return end_pos
+
+    marker = m.group('list_2')
+    ordered = len(marker) > 1
+    depth = state.depth()
+    token = {
+        'type': 'list',
+        'children': [],
+        'tight': True,
+        'bullet': marker[-1],
+        'attrs': {
+            'depth': depth,
+            'ordered': ordered,
+        },
+    }
+    if ordered:
+        start = int(marker[:-1])
+        if start != 1:
+            # Example 304
+            # we allow only lists starting with 1 to interrupt paragraphs
+            end_pos = state.append_paragraph()
+            if end_pos:
+                return end_pos
+            token['attrs']['start'] = start
+
+    state.cursor = m.end() + 1
+    groups = (m.group('list_1'), marker, text)
+
+    if depth >= block.max_nested_level - 1:
+        rules = list(block.list_rules)
+        rules.remove('list')
+    else:
+        rules = block.list_rules
+
+    bullet = _get_list_bullet(marker[-1])
+    while groups:
+        groups = _parse_list_item(block, bullet, groups, token, state, rules)
+
+    end_pos = token.pop('_end_pos', None)
+    _transform_tight_list(token)
+    if end_pos:
+        index = token.pop('_tok_index')
+        state.tokens.insert(index, token)
+        return end_pos
+
+    state.append_token(token)
+    return state.cursor
+
+
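+# In a tight list every direct paragraph of a list item is rendered as
+# plain "block_text" (no <p> wrapper); nested lists are handled recursively.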
+def _transform_tight_list(token):
+    if token['tight']:
+        # reset tight list item
+        for list_item in token['children']:
+            for tok in list_item['children']:
+                if tok['type'] == 'paragraph':
+                    tok['type'] = 'block_text'
+                elif tok['type'] == 'list':
+                    _transform_tight_list(tok)
+
+
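+# Consumes the source lines that belong to a single list item, parses them
+# in a child state with the given block rules, and appends a "list_item"
+# token.  When the line scanner finds a sibling marker, the matched groups
+# for the next item are returned so parse_list() can continue the loop.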
+def _parse_list_item(block, bullet, groups, token, state, rules):
+    spaces, marker, text = groups
+
+    leading_width = len(spaces) + len(marker)
+    text, continue_width = _compile_continue_width(text, leading_width)
+    item_pattern = _compile_list_item_pattern(bullet, leading_width)
+    pairs = [
+        ('thematic_break', block.specification['thematic_break']),
+        ('fenced_code', block.specification['fenced_code']),
+        ('axt_heading', block.specification['axt_heading']),
+        ('block_quote', block.specification['block_quote']),
+        ('block_html', block.specification['block_html']),
+        ('list', block.specification['list']),
+    ]
+    if leading_width < 3:
+        _repl_w = str(leading_width)
+        pairs = [(n, p.replace('3', _repl_w, 1)) for n, p in pairs]
+
+    pairs.insert(1, ('list_item', item_pattern))
+    regex = '|'.join(r'(?P<%s>(?<=\n)%s)' % pair for pair in pairs)
+    sc = re.compile(regex, re.M)
+
+    src = ''
+    next_group = None
+    prev_blank_line = False
+    pos = state.cursor
+
+    continue_space = ' ' * continue_width
+    while pos < state.cursor_max:
+        pos = state.find_line_end()
+        line = state.get_text(pos)
+        if block.BLANK_LINE.match(line):
+            src += '\n'
+            prev_blank_line = True
+            state.cursor = pos
+            continue
+
+        line = expand_leading_tab(line)
+        if line.startswith(continue_space):
+            if prev_blank_line and not text and not src.strip():
+                # Example 280
+                # A list item can begin with at most one blank line
+                break
+
+            src += line
+            prev_blank_line = False
+            state.cursor = pos
+            continue
+
+        m = sc.match(state.src, state.cursor)
+        if m:
+            tok_type = m.lastgroup
+            if tok_type == 'list_item':
+                if prev_blank_line:
+                    token['tight'] = False
+                next_group = (
+                    m.group('listitem_1'),
+                    m.group('listitem_2'),
+                    m.group('listitem_3')
+                )
+                state.cursor = m.end() + 1
+                break
+            tok_index = len(state.tokens)
+            end_pos = block.parse_method(m, state)
+            if end_pos:
+                token['_tok_index'] = tok_index
+                token['_end_pos'] = end_pos
+                break
+
+        if prev_blank_line and not line.startswith(continue_space):
+            # not a continue line, and previous line is blank
+            break
+
+        src += line
+        state.cursor = pos
+
+    text += _clean_list_item_text(src, continue_width)
+    child = state.child_state(strip_end(text))
+
+    block.parse(child, rules)
+
+    if token['tight'] and _is_loose_list(child.tokens):
+        token['tight'] = False
+
+    token['children'].append({
+        'type': 'list_item',
+        'children': child.tokens,
+    })
+    if next_group:
+        return next_group
+
+
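+# Regex fragment that matches only markers of the same family as the first
+# item: "." and ")" ordered markers, or the exact "*", "+" or "-" bullet.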
+def _get_list_bullet(c):
+    if c == '.':
+        bullet = r'\d{0,9}\.'
+    elif c == ')':
+        bullet = r'\d{0,9}\)'
+    elif c == '*':
+        bullet = r'\*'
+    elif c == '+':
+        bullet = r'\+'
+    else:
+        bullet = '-'
+    return bullet
+
+
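+# Pattern for a sibling list item: its marker may be indented by at most
+# min(leading_width, 3) spaces.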
+def _compile_list_item_pattern(bullet, leading_width):
+    if leading_width > 3:
+        leading_width = 3
+    return (
+        r'^(?P<listitem_1> {0,' + str(leading_width) + '})'
+        r'(?P<listitem_2>' + bullet + ')'
+        r'(?P<listitem_3>[ \t]*|[ \t][^\n]+)$'
+    )
+
+
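+# Computes how far continuation lines must be indented to stay inside the
+# item: the marker width plus the spaces before the first content, or a
+# single space when the item text is empty or starts with indented code.
+# Also returns the first line's text with that indentation removed.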
+def _compile_continue_width(text, leading_width):
+    text = expand_leading_tab(text, 3)
+    text = expand_tab(text)
+
+    m2 = _LINE_HAS_TEXT.match(text)
+    if m2:
+        # indented code: content starts with 5 or more spaces after the marker
+        if text.startswith('     '):
+            space_width = 1
+        else:
+            space_width = len(m2.group(1))
+
+        text = text[space_width:] + '\n'
+    else:
+        space_width = 1
+        text = ''
+
+    continue_width = leading_width + space_width
+    return text, continue_width
+
+
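+# Strips the item's continuation indentation from every collected line and
+# expands the remaining tabs; lines that are indented less are kept as-is.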
+def _clean_list_item_text(src, continue_width):
+    # according to Example 7, tab should be treated as 3 spaces
+    rv = []
+    trim_space = ' ' * continue_width
+    lines = src.split('\n')
+    for line in lines:
+        if line.startswith(trim_space):
+            line = line.replace(trim_space, '', 1)
+            # according to CommonMark Example 5
+            # tab should be treated as 4 spaces
+            line = expand_tab(line)
+            rv.append(line)
+        else:
+            rv.append(line)
+
+    return '\n'.join(rv)
+
+
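+# An item makes the list loose when its tokens contain a blank line or more
+# than one paragraph.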
+def _is_loose_list(tokens):
+    paragraph_count = 0
+    for tok in tokens:
+        if tok['type'] == 'blank_line':
+            return True
+        if tok['type'] == 'paragraph':
+            paragraph_count += 1
+            if paragraph_count > 1:
+                return True