summaryrefslogblamecommitdiff
path: root/src/mistune/list_parser.py
blob: b5ff866e13b6f3c35ed838cf4b1acd4b67c47207 (plain) (tree)
























































































































































































































































                                                                             
import re
from .core import BlockState
from .util import (
    strip_end,
    expand_tab,
    expand_leading_tab,
)
# List parsing is complex, so the list parser lives in its own module.

# Regex source (kept as a string, not compiled here) for the first line of a
# list item. Named groups:
#   list_1 - zero to three spaces of indentation before the marker
#   list_2 - the marker: one of `*`, `+`, `-`, or 1-9 digits followed by
#            `.` or `)`
#   list_3 - the rest of the line: either only spaces/tabs, or a single
#            space/tab followed by at least one more character
LIST_PATTERN = (
    r'^(?P<list_1> {0,3})'
    r'(?P<list_2>[\*\+-]|\d{1,9}[.)])'
    r'(?P<list_3>[ \t]*|[ \t].+)$'
)

# Matches any run of leading spaces (captured as group 1) followed by a
# non-whitespace character, i.e. the string has visible text after its indent.
_LINE_HAS_TEXT = re.compile(r'( *)\S')


def parse_list(block, m: re.Match, state: BlockState) -> int:
    """Build a ``list`` token for the list whose first item matched ``m``.

    :param block: block parser; supplies ``max_nested_level``, ``list_rules``
        and the hooks used while parsing each item.
    :param m: match carrying the ``list_1`` / ``list_2`` / ``list_3`` groups
        of the first list item line.
    :param state: block state that is advanced and receives the token.
    :return: the position reported by ``state.append_paragraph()`` when the
        line is folded into a paragraph, the end position recorded on the
        token by an item parser, or ``state.cursor`` otherwise.
    """
    marker = m.group('list_2')
    first_text = m.group('list_3')
    if not first_text.strip():
        # Example 285
        # an empty list item cannot interrupt a paragraph
        paragraph_end = state.append_paragraph()
        if paragraph_end:
            return paragraph_end

    # bullet markers are a single character; ordered ones are digits + delimiter
    is_ordered = len(marker) > 1
    depth = state.depth()
    attrs = {
        'depth': depth,
        'ordered': is_ordered,
    }
    token = {
        'type': 'list',
        'children': [],
        'tight': True,
        'bullet': marker[-1],
        'attrs': attrs,
    }
    if is_ordered:
        start = int(marker[:-1])
        if start != 1:
            # Example 304
            # we allow only lists starting with 1 to interrupt paragraphs
            paragraph_end = state.append_paragraph()
            if paragraph_end:
                return paragraph_end
            attrs['start'] = start

    state.cursor = m.end() + 1

    if depth < block.max_nested_level - 1:
        rules = block.list_rules
    else:
        # nesting limit reached: parse item bodies without the 'list' rule
        rules = list(block.list_rules)
        rules.remove('list')

    bullet = _get_list_bullet(marker[-1])
    pending = (m.group('list_1'), marker, first_text)
    # each call consumes one item and hands back the groups of the next one
    while pending:
        pending = _parse_list_item(block, bullet, pending, token, state, rules)

    end_pos = token.pop('_end_pos', None)
    _transform_tight_list(token)
    if not end_pos:
        state.append_token(token)
        return state.cursor

    # an item parser recorded an end position together with the index at
    # which this list token has to be inserted
    state.tokens.insert(token.pop('_tok_index'), token)
    return end_pos


def _transform_tight_list(token):
    """Rewrite paragraphs of a tight list to ``block_text``, in place.

    Nothing happens when ``token['tight']`` is falsy. Otherwise every
    ``paragraph`` child of every list item becomes ``block_text`` and nested
    ``list`` children are processed recursively by the same rule.
    """
    if not token['tight']:
        return

    for item in token['children']:
        for child in item['children']:
            kind = child['type']
            if kind == 'list':
                _transform_tight_list(child)
            elif kind == 'paragraph':
                child['type'] = 'block_text'


def _parse_list_item(block, bullet, groups, token, state, rules):
    """Parse one list item and append it to ``token['children']``.

    :param block: block parser; this function reads ``block.specification``,
        ``block.BLANK_LINE`` and calls ``block.parse_method`` / ``block.parse``.
    :param bullet: regex fragment matching the markers of sibling items.
    :param groups: ``(spaces, marker, text)`` of the item's first line.
    :param token: the enclosing ``list`` token. It is mutated: ``'tight'`` may
        be set to ``False``, a ``list_item`` child is appended, and
        ``'_tok_index'`` / ``'_end_pos'`` are set when ``block.parse_method``
        returns a truthy end position.
    :param state: block state; ``state.cursor`` is advanced past the lines
        consumed by this item.
    :param rules: rule names passed to ``block.parse`` for the item's content.
    :return: the ``(spaces, marker, text)`` groups of the next sibling item
        when one was matched, otherwise ``None`` (implicit).
    """
    spaces, marker, text = groups

    # width of the indentation plus the marker itself
    leading_width = len(spaces) + len(marker)
    text, continue_width = _compile_continue_width(text, leading_width)
    item_pattern = _compile_list_item_pattern(bullet, leading_width)
    # block patterns that are tried on lines which are not indented by
    # ``continue_width`` spaces (see the loop below)
    pairs = [
        ('thematic_break', block.specification['thematic_break']),
        ('fenced_code', block.specification['fenced_code']),
        ('axt_heading', block.specification['axt_heading']),
        ('block_quote', block.specification['block_quote']),
        ('block_html', block.specification['block_html']),
        ('list', block.specification['list']),
    ]
    if leading_width < 3:
        # Replace the first literal '3' of every pattern with the leading
        # width. NOTE(review): presumably this is the upper bound of a
        # leading ` {0,3}` indent in each specification pattern -- confirm.
        _repl_w = str(leading_width)
        pairs = [(n, p.replace('3', _repl_w, 1)) for n, p in pairs]

    # Alternatives are tried left to right, so thematic_break takes precedence
    # over the sibling-item pattern, which takes precedence over the rest.
    pairs.insert(1, ('list_item', item_pattern))
    # every alternative is a named group that must be preceded by a newline
    regex = '|'.join(r'(?P<%s>(?<=\n)%s)' % pair for pair in pairs)
    sc = re.compile(regex, re.M)

    # raw text collected for this item after its first line
    src = ''
    next_group = None
    prev_blank_line = False
    pos = state.cursor

    continue_space = ' ' * continue_width
    while pos < state.cursor_max:
        # NOTE(review): find_line_end()/get_text() presumably yield the end of
        # the line at state.cursor and the text up to it -- confirm in BlockState.
        pos = state.find_line_end()
        line = state.get_text(pos)
        if block.BLANK_LINE.match(line):
            # blank lines are kept as bare newlines and remembered
            src += '\n'
            prev_blank_line = True
            state.cursor = pos
            continue

        line = expand_leading_tab(line)
        if line.startswith(continue_space):
            if prev_blank_line and not text and not src.strip():
                # Example 280
                # A list item can begin with at most one blank line
                break

            # indented far enough: the line belongs to this item
            src += line
            prev_blank_line = False
            state.cursor = pos
            continue

        # not indented enough: check whether another block starts here
        m = sc.match(state.src, state.cursor)
        if m:
            tok_type = m.lastgroup
            if tok_type == 'list_item':
                # a sibling item; a blank line before it makes the list loose
                if prev_blank_line:
                    token['tight'] = False
                next_group = (
                    m.group('listitem_1'),
                    m.group('listitem_2'),
                    m.group('listitem_3')
                )
                state.cursor = m.end() + 1
                break
            # remember where the list token would have to be inserted if the
            # other block's parser reports an end position
            tok_index = len(state.tokens)
            end_pos = block.parse_method(m, state)
            if end_pos:
                token['_tok_index'] = tok_index
                token['_end_pos'] = end_pos
                break

        # the startswith() test is always true here, because indented lines
        # were already handled (and `continue`d) above
        if prev_blank_line and not line.startswith(continue_space):
            # not a continue line, and previous line is blank
            break

        # otherwise the unindented line is still added to the item's text
        src += line
        state.cursor = pos

    text += _clean_list_item_text(src, continue_width)
    child = state.child_state(strip_end(text))

    block.parse(child, rules)

    # the item's own content can also make the whole list loose
    if token['tight'] and _is_loose_list(child.tokens):
        token['tight'] = False

    token['children'].append({
        'type': 'list_item',
        'children': child.tokens,
    })
    if next_group:
        return next_group


def _get_list_bullet(c):
    """Return the regex fragment that matches sibling markers of a list.

    :param c: last character of the first item's marker: ``.`` or ``)`` for
        ordered lists, ``*`` or ``+`` for bullet lists. Any other value is
        treated as ``-``.
    :return: regex source for the marker of following items of the same list.
    """
    if c in ('.', ')'):
        # An ordered marker needs at least one digit, exactly as in
        # LIST_PATTERN (``\d{1,9}[.)]``); with ``{0,9}`` a line starting
        # with a bare ``.`` or ``)`` was accepted as a sibling list item.
        return r'\d{1,9}' + '\\' + c
    if c in ('*', '+'):
        # regex metacharacters have to be escaped
        return '\\' + c
    return '-'


def _compile_list_item_pattern(bullet, leading_width):
    """Return the regex source that recognises a sibling list item line.

    The pattern exposes the groups ``listitem_1`` (indentation),
    ``listitem_2`` (marker, matched by ``bullet``) and ``listitem_3`` (rest of
    the line). The allowed indentation is ``leading_width`` spaces, capped
    at three.
    """
    max_indent = min(leading_width, 3)
    pieces = [
        r'^(?P<listitem_1> {0,', str(max_indent), '})',
        r'(?P<listitem_2>', bullet, ')',
        r'(?P<listitem_3>[ \t]*|[ \t][^\n]+)$',
    ]
    return ''.join(pieces)


def _compile_continue_width(text, leading_width):
    """Work out the first-line text of an item and its continuation indent.

    :param text: what follows the list marker on the item's first line.
    :param leading_width: width of the indentation plus the marker.
    :return: ``(text, continue_width)`` -- the first-line content with a
        trailing newline (``''`` when the line carries no text) and the
        number of columns following lines must be indented by.
    """
    expanded = expand_tab(expand_leading_tab(text, 3))

    found = _LINE_HAS_TEXT.match(expanded)
    if found is None:
        # nothing but whitespace after the marker
        return '', leading_width + 1

    # indent code, startswith 5 spaces: only one space belongs to the marker
    if expanded.startswith(' ' * 5):
        gap = 1
    else:
        gap = len(found.group(1))
    return expanded[gap:] + '\n', leading_width + gap


def _clean_list_item_text(src, continue_width):
    """Strip the continuation indent from every line of an item's raw text.

    Lines that start with ``continue_width`` spaces lose that prefix and then
    have their tabs expanded; all other lines are kept unchanged.
    """
    # according to Example 7, tab should be treated as 3 spaces
    prefix = ' ' * continue_width
    cut = len(prefix)
    # according to CommonMark Example 5
    # tab should be treated as 4 spaces
    cleaned = [
        expand_tab(row[cut:]) if row.startswith(prefix) else row
        for row in src.split('\n')
    ]
    return '\n'.join(cleaned)


def _is_loose_list(tokens):
    """Tell whether the tokens of one list item make the list loose.

    :param tokens: iterable of token dicts, each with a ``'type'`` key.
    :return: ``True`` when a ``blank_line`` token is present or when there is
        more than one ``paragraph`` token, ``False`` otherwise.
    """
    paragraph_count = 0
    for tok in tokens:
        tok_type = tok['type']
        if tok_type == 'blank_line':
            return True
        if tok_type == 'paragraph':
            paragraph_count += 1
            if paragraph_count > 1:
                return True
    # explicit result for tight content instead of falling off the end (None)
    return False