1 files changed, 250 insertions, 0 deletions
diff --git a/src/mistune/list_parser.py b/src/mistune/list_parser.py
new file mode 100644
index 0000000..b5ff866
--- /dev/null
+++ b/src/mistune/list_parser.py
@@ -0,0 +1,250 @@
+import re
+from .core import BlockState
+from .util import (
+    strip_end,
+    expand_tab,
+    expand_leading_tab,
+)
+# List parsing is complex enough that it lives in its own module.
+
+# Matches the first line of a list item. Named groups:
+#   list_1 - zero to three spaces of leading indentation
+#   list_2 - the marker: ``*``, ``+``, ``-``, or 1-9 digits followed by
+#            ``.`` or ``)``
+#   list_3 - the rest of the line: either only spaces/tabs, or one
+#            space/tab followed by the first-line content of the item
+LIST_PATTERN = (
+    r'^(?P<list_1> {0,3})'
+    r'(?P<list_2>[\*\+-]|\d{1,9}[.)])'
+    r'(?P<list_3>[ \t]*|[ \t].+)$'
+)
+
+# Zero or more leading spaces (captured) immediately followed by a
+# non-whitespace character.
+_LINE_HAS_TEXT = re.compile(r'( *)\S')
+
+
+def parse_list(block, m: re.Match, state: BlockState) -> int:
+    """Parse an ordered or unordered list that starts at match ``m``.
+
+    A ``list`` token is built from the matched first item and every sibling
+    item that follows it, then added to ``state.tokens``.
+
+    :param block: block parser; ``list_rules`` and ``max_nested_level`` are
+        read here and the parser is handed on to ``_parse_list_item``.
+    :param m: match whose ``list_1``/``list_2``/``list_3`` groups hold the
+        indentation, the marker and the first-line text of the first item.
+    :param state: block state; its cursor and tokens are updated.
+    :return: the truthy position reported by ``state.append_paragraph()``
+        or by a block rule that ended the list, otherwise ``state.cursor``.
+    """
+    first_line = m.group('list_3')
+    if not first_line.strip():
+        # Example 285
+        # an empty list item cannot interrupt a paragraph
+        paragraph_end = state.append_paragraph()
+        if paragraph_end:
+            return paragraph_end
+
+    marker = m.group('list_2')
+    delimiter = marker[-1]
+    # bullet markers are a single character, ordered ones carry digits too
+    is_ordered = len(marker) > 1
+    depth = state.depth()
+    attrs = {
+        'depth': depth,
+        'ordered': is_ordered,
+    }
+    token = {
+        'type': 'list',
+        'children': [],
+        'tight': True,
+        'bullet': delimiter,
+        'attrs': attrs,
+    }
+
+    if is_ordered:
+        start = int(marker[:-1])
+        if start != 1:
+            # Example 304
+            # only ordered lists starting with 1 may interrupt a paragraph
+            paragraph_end = state.append_paragraph()
+            if paragraph_end:
+                return paragraph_end
+            attrs['start'] = start
+
+    state.cursor = m.end() + 1
+
+    # at the nesting limit the item bodies are parsed without the list rule
+    at_nesting_limit = depth >= block.max_nested_level - 1
+    if at_nesting_limit:
+        rules = list(block.list_rules)
+        rules.remove('list')
+    else:
+        rules = block.list_rules
+
+    sibling_bullet = _get_list_bullet(delimiter)
+    groups = (m.group('list_1'), marker, first_line)
+    while groups:
+        groups = _parse_list_item(
+            block, sibling_bullet, groups, token, state, rules
+        )
+
+    interrupted_at = token.pop('_end_pos', None)
+    _transform_tight_list(token)
+    if not interrupted_at:
+        state.append_token(token)
+        return state.cursor
+
+    # another block rule ended the list: place the list at the token index
+    # that was recorded before that rule ran
+    state.tokens.insert(token.pop('_tok_index'), token)
+    return interrupted_at
+
+
+def _transform_tight_list(token):
+    """Retype the paragraphs of a tight list as ``block_text``, in place.
+
+    Nothing happens when ``token['tight']`` is falsy. Otherwise every
+    ``paragraph`` that is a direct child of a list item becomes
+    ``block_text`` and nested ``list`` children are handled recursively.
+    """
+    if not token['tight']:
+        return
+
+    for item in token['children']:
+        for child in item['children']:
+            kind = child['type']
+            if kind == 'paragraph':
+                child['type'] = 'block_text'
+            elif kind == 'list':
+                _transform_tight_list(child)
+
+
+def _parse_list_item(block, bullet, groups, token, state, rules):
+    """Parse one list item and append it to ``token['children']``.
+
+    Starting at ``state.cursor``, lines are collected into the item's
+    source until a sibling item, another block rule, a non-continuation
+    line that follows a blank line, or ``state.cursor_max`` is reached.
+    The collected source is then parsed with ``rules`` in a child state.
+
+    :param block: block parser providing ``specification``, ``BLANK_LINE``,
+        ``parse_method`` and ``parse``.
+    :param bullet: regex fragment matching the markers of sibling items.
+    :param groups: ``(spaces, marker, text)`` of this item's first line.
+    :param token: the enclosing ``list`` token; its ``tight`` flag may be
+        cleared, and ``_tok_index``/``_end_pos`` are set on it when another
+        block rule ends the list.
+    :param state: block state whose cursor is advanced.
+    :param rules: rule names passed to ``block.parse`` for the item body.
+    :return: the ``(spaces, marker, text)`` groups of the next sibling
+        item, or ``None`` (implicitly) when no sibling item was found.
+    """
+    spaces, marker, text = groups
+
+    # columns taken up by the indentation plus the marker itself
+    leading_width = len(spaces) + len(marker)
+    text, continue_width = _compile_continue_width(text, leading_width)
+    item_pattern = _compile_list_item_pattern(bullet, leading_width)
+    # block rules that are tried on lines which are not indented enough to
+    # continue this item
+    pairs = [
+        ('thematic_break', block.specification['thematic_break']),
+        ('fenced_code', block.specification['fenced_code']),
+        ('axt_heading', block.specification['axt_heading']),
+        ('block_quote', block.specification['block_quote']),
+        ('block_html', block.specification['block_html']),
+        ('list', block.specification['list']),
+    ]
+    if leading_width < 3:
+        # NOTE(review): swaps the first '3' in each pattern for
+        # leading_width; presumably that '3' is the ``{0,3}`` indent
+        # quantifier of the rule - confirm against block.specification
+        _repl_w = str(leading_width)
+        pairs = [(n, p.replace('3', _repl_w, 1)) for n, p in pairs]
+
+    # the sibling-item pattern sits right after thematic_break, so a line
+    # matching both alternatives is reported as a thematic break
+    pairs.insert(1, ('list_item', item_pattern))
+    # every alternative may only match directly after a newline
+    regex = '|'.join(r'(?P<%s>(?<=\n)%s)' % pair for pair in pairs)
+    sc = re.compile(regex, re.M)
+
+    # raw lines collected for this item after its first line
+    src = ''
+    next_group = None
+    prev_blank_line = False
+    pos = state.cursor
+
+    continue_space = ' ' * continue_width
+    while pos < state.cursor_max:
+        # presumably the end of the current line and the text from the
+        # cursor up to it - confirm against BlockState
+        pos = state.find_line_end()
+        line = state.get_text(pos)
+        if block.BLANK_LINE.match(line):
+            # blank lines are kept as a bare newline and never end the item
+            # on their own
+            src += '\n'
+            prev_blank_line = True
+            state.cursor = pos
+            continue
+
+        line = expand_leading_tab(line)
+        if line.startswith(continue_space):
+            # indented far enough to belong to this item
+            if prev_blank_line and not text and not src.strip():
+                # Example 280
+                # A list item can begin with at most one blank line
+                break
+
+            src += line
+            prev_blank_line = False
+            state.cursor = pos
+            continue
+
+        # not indented enough: look for a sibling item or another block
+        m = sc.match(state.src, state.cursor)
+        if m:
+            tok_type = m.lastgroup
+            if tok_type == 'list_item':
+                # a blank line between sibling items makes the list loose
+                if prev_blank_line:
+                    token['tight'] = False
+                next_group = (
+                    m.group('listitem_1'),
+                    m.group('listitem_2'),
+                    m.group('listitem_3')
+                )
+                # the +1 steps over the character that follows the matched
+                # line (its newline, when there is one)
+                state.cursor = m.end() + 1
+                break
+            # remember the token index before the other rule runs
+            tok_index = len(state.tokens)
+            end_pos = block.parse_method(m, state)
+            if end_pos:
+                # a truthy end_pos ends the list; parse_list reads these
+                # two keys back from the list token
+                token['_tok_index'] = tok_index
+                token['_end_pos'] = end_pos
+                break
+
+        # the startswith test is always true at this point, because
+        # continuation lines were fully handled in the branch above
+        if prev_blank_line and not line.startswith(continue_space):
+            # not a continue line, and previous line is blank
+            break
+
+        # otherwise the line is kept as part of this item even though it is
+        # not indented to the continuation width
+        src += line
+        state.cursor = pos
+
+    # strip the continuation indent, then parse the body in a child state
+    text += _clean_list_item_text(src, continue_width)
+    child = state.child_state(strip_end(text))
+
+    block.parse(child, rules)
+
+    if token['tight'] and _is_loose_list(child.tokens):
+        token['tight'] = False
+
+    token['children'].append({
+        'type': 'list_item',
+        'children': child.tokens,
+    })
+    # falls through to an implicit None when there is no sibling item
+    if next_group:
+        return next_group
+
+
+def _get_list_bullet(c):
+    """Return the regex fragment that matches the markers of sibling items.
+
+    :param c: last character of the first item's marker: ``.`` or ``)`` for
+        an ordered list, ``*``, ``+`` or ``-`` for a bullet list.
+    :return: regex source for a marker of the same kind. Ordered markers
+        require 1-9 digits before the delimiter, the same range that
+        ``LIST_PATTERN`` accepts, so a bare ``.`` or ``)`` is never taken
+        as a list marker. Any character other than ``.``, ``)``, ``*`` and
+        ``+`` yields ``'-'``.
+    """
+    if c == '.':
+        # 1-9 digits are mandatory (CommonMark ordered list marker)
+        bullet = r'\d{1,9}\.'
+    elif c == ')':
+        bullet = r'\d{1,9}\)'
+    elif c == '*':
+        bullet = r'\*'
+    elif c == '+':
+        bullet = r'\+'
+    else:
+        bullet = '-'
+    return bullet
+
+
+def _compile_list_item_pattern(bullet, leading_width):
+    """Build the regex source that matches a sibling list item line.
+
+    :param bullet: regex fragment for the marker, used as ``listitem_2``.
+    :param leading_width: upper bound for the leading spaces captured as
+        ``listitem_1``; values above 3 are capped at 3.
+    :return: the pattern as a string, with the rest of the line captured
+        as ``listitem_3``.
+    """
+    max_indent = min(leading_width, 3)
+    pieces = (
+        r'^(?P<listitem_1> {0,', str(max_indent), '})',
+        r'(?P<listitem_2>', bullet, ')',
+        r'(?P<listitem_3>[ \t]*|[ \t][^\n]+)$',
+    )
+    return ''.join(pieces)
+
+
+def _compile_continue_width(text, leading_width):
+    """Split off the padding after a list marker and size the item indent.
+
+    Tabs in ``text`` are first expanded with ``expand_leading_tab(text, 3)``
+    and ``expand_tab``.
+
+    :param text: what follows the marker on the item's first line.
+    :param leading_width: width of the indentation plus the marker.
+    :return: ``(text, continue_width)``. ``text`` is the first-line content
+        without the padding and with a trailing newline, or ``''`` when the
+        line has no content. ``continue_width`` is ``leading_width`` plus
+        the padding width.
+    """
+    expanded = expand_tab(expand_leading_tab(text, 3))
+
+    found = _LINE_HAS_TEXT.match(expanded)
+    if found is None:
+        # no content after the marker: count a single space of padding
+        return '', leading_width + 1
+
+    if expanded.startswith('     '):
+        # indent code, startswith 5 spaces: only one space is padding
+        padding = 1
+    else:
+        padding = len(found.group(1))
+
+    return expanded[padding:] + '\n', leading_width + padding
+
+
+def _clean_list_item_text(src, continue_width):
+    """Strip the continuation indent from the lines of a list item.
+
+    :param src: collected item source, lines separated by ``'\\n'``.
+    :param continue_width: number of leading spaces to remove.
+    :return: ``src`` with that prefix removed from every line that starts
+        with it; lines without the prefix are returned unchanged.
+    """
+    prefix = ' ' * continue_width
+    prefix_len = len(prefix)
+
+    def _dedent(line):
+        if not line.startswith(prefix):
+            return line
+        # according to CommonMark Example 5
+        # tab should be treated as 4 spaces
+        return expand_tab(line[prefix_len:])
+
+    return '\n'.join(_dedent(line) for line in src.split('\n'))
+
+
+def _is_loose_list(tokens) -> bool:
+    """Tell whether the tokens of one list item make the list loose.
+
+    :param tokens: token dicts of a parsed list item, each with a ``type``.
+    :return: ``True`` when a ``blank_line`` token is present or when there
+        is more than one ``paragraph`` token, otherwise ``False``.
+    """
+    paragraph_count = 0
+    for tok in tokens:
+        tok_type = tok['type']
+        if tok_type == 'blank_line':
+            return True
+        if tok_type == 'paragraph':
+            paragraph_count += 1
+            if paragraph_count > 1:
+                return True
+    # explicit result instead of falling off the end with None
+    return False