import re
from .core import BlockState
from .util import (
strip_end,
expand_tab,
expand_leading_tab,
)
# List parsing is complex, so the list parser is split out into its own file.
# Pattern for the first line of a list item, in three named groups:
#   list_1 - up to three spaces of indentation
#   list_2 - the marker: a bullet (*, + or -) or 1-9 digits followed by . or )
#   list_3 - the rest of the line: only blanks, or a blank followed by text
LIST_PATTERN = (
    r'^(?P<list_1> {0,3})'
    r'(?P<list_2>[\*\+-]|\d{1,9}[.)])'
    r'(?P<list_3>[ \t]*|[ \t].+)$'
)

# Matches a line containing a non-blank character; group 1 captures the
# leading spaces in front of it.
_LINE_HAS_TEXT = re.compile(r'( *)\S')
def parse_list(block, m: re.Match, state: BlockState) -> int:
    """Parse an ordered or unordered list whose first item is matched by ``m``.

    Builds a ``list`` token, fills its children by parsing items one after
    another, and stores the token on ``state``. Returns the position at which
    block parsing should resume.
    """
    first_text = m.group('list_3')
    if not first_text.strip():
        # CommonMark example 285: an empty list item cannot interrupt a
        # paragraph. A truthy result from append_paragraph() is handed back
        # as the resume position.
        resume_at = state.append_paragraph()
        if resume_at:
            return resume_at

    marker = m.group('list_2')
    # bullet markers are one character; ordered ones are digits + delimiter
    is_ordered = len(marker) > 1
    depth = state.depth()
    attrs = {
        'depth': depth,
        'ordered': is_ordered,
    }
    token = {
        'type': 'list',
        'children': [],
        'tight': True,
        'bullet': marker[-1],
        'attrs': attrs,
    }

    if is_ordered:
        start = int(marker[:-1])
        if start != 1:
            # CommonMark example 304: only an ordered list starting at 1
            # may interrupt a paragraph.
            resume_at = state.append_paragraph()
            if resume_at:
                return resume_at
            attrs['start'] = start

    state.cursor = m.end() + 1

    rules = block.list_rules
    if depth >= block.max_nested_level - 1:
        # nesting limit reached: parse item bodies without the 'list' rule
        rules = list(rules)
        rules.remove('list')

    bullet = _get_list_bullet(marker[-1])
    pending = (m.group('list_1'), marker, first_text)
    while pending:
        # each call parses one item and returns the groups of the next
        # sibling item, or None when the list is finished
        pending = _parse_list_item(block, bullet, pending, token, state, rules)

    end_pos = token.pop('_end_pos', None)
    _transform_tight_list(token)
    if not end_pos:
        state.append_token(token)
        return state.cursor

    # Another block was parsed while reading an item; put the list token
    # at the index recorded before that block's tokens were added.
    state.tokens.insert(token.pop('_tok_index'), token)
    return end_pos
def _transform_tight_list(token):
    """Turn paragraphs of a tight list into ``block_text`` tokens, in place.

    Does nothing when ``token['tight']`` is falsy. Nested ``list`` tokens
    found directly inside the items are processed recursively, each
    according to its own ``tight`` flag.
    """
    if not token['tight']:
        return

    for item in token['children']:
        for child in item['children']:
            kind = child['type']
            if kind == 'paragraph':
                child['type'] = 'block_text'
            elif kind == 'list':
                _transform_tight_list(child)
def _parse_list_item(block, bullet, groups, token, state, rules):
    """Parse a single list item and append it to ``token['children']``.

    ``groups`` is the ``(spaces, marker, text)`` triple taken from the item's
    first line. Following lines are consumed from ``state`` until the item
    ends; the collected text is then parsed with ``rules`` in a child state.

    Side effects: advances ``state.cursor``, may set ``token['tight']`` to
    False, and may store ``'_tok_index'`` / ``'_end_pos'`` on ``token`` (both
    are popped again by ``parse_list``).

    Returns the ``(spaces, marker, text)`` triple of the next sibling item
    when one follows directly, otherwise ``None`` (implicitly).
    """
    spaces, marker, text = groups
    # width of the indentation plus the marker itself
    leading_width = len(spaces) + len(marker)
    text, continue_width = _compile_continue_width(text, leading_width)
    item_pattern = _compile_list_item_pattern(bullet, leading_width)
    # block rules that are checked on every line that is not a plain
    # continuation of this item
    pairs = [
        ('thematic_break', block.specification['thematic_break']),
        ('fenced_code', block.specification['fenced_code']),
        ('axt_heading', block.specification['axt_heading']),
        ('block_quote', block.specification['block_quote']),
        ('block_html', block.specification['block_html']),
        ('list', block.specification['list']),
    ]
    if leading_width < 3:
        # Replace the first '3' of every pattern with the item's leading
        # width -- presumably the upper bound of a ' {0,3}' indent prefix;
        # NOTE(review): confirm against block.specification.
        _repl_w = str(leading_width)
        pairs = [(n, p.replace('3', _repl_w, 1)) for n, p in pairs]
    # In the alternation below earlier groups win, so a sibling list item
    # is tried right after thematic_break and before every other rule.
    pairs.insert(1, ('list_item', item_pattern))
    # every alternative may only match directly after a newline
    regex = '|'.join(r'(?P<%s>(?<=\n)%s)' % pair for pair in pairs)
    sc = re.compile(regex, re.M)
    src = ''  # raw text of the lines following the item's first line
    next_group = None  # groups of the next sibling item, if one is found
    prev_blank_line = False
    pos = state.cursor
    # a line starting with this many spaces belongs to the item
    continue_space = ' ' * continue_width
    while pos < state.cursor_max:
        pos = state.find_line_end()
        line = state.get_text(pos)
        if block.BLANK_LINE.match(line):
            # blank lines are kept as bare newlines and remembered
            src += '\n'
            prev_blank_line = True
            state.cursor = pos
            continue
        line = expand_leading_tab(line)
        if line.startswith(continue_space):
            if prev_blank_line and not text and not src.strip():
                # Example 280
                # A list item can begin with at most one blank line
                break
            src += line
            prev_blank_line = False
            state.cursor = pos
            continue
        # not indented enough for this item: see whether another block
        # rule matches at the current cursor
        m = sc.match(state.src, state.cursor)
        if m:
            tok_type = m.lastgroup
            if tok_type == 'list_item':
                # a sibling item; a blank line in front of it makes the
                # whole list loose
                if prev_blank_line:
                    token['tight'] = False
                next_group = (
                    m.group('listitem_1'),
                    m.group('listitem_2'),
                    m.group('listitem_3')
                )
                # presumably the +1 skips the newline ending the matched line
                state.cursor = m.end() + 1
                break
            # Any other rule: let the block parser handle it. The token
            # count is recorded first so the caller knows where the list
            # token belongs relative to what parse_method adds.
            tok_index = len(state.tokens)
            end_pos = block.parse_method(m, state)
            if end_pos:
                token['_tok_index'] = tok_index
                token['_end_pos'] = end_pos
                break
        if prev_blank_line and not line.startswith(continue_space):
            # not a continue line, and previous line is blank
            break
        # otherwise the line is appended to the item even though it is not
        # indented to the item's content column
        src += line
        state.cursor = pos
    # strip the item's indentation and parse its content in a child state
    text += _clean_list_item_text(src, continue_width)
    child = state.child_state(strip_end(text))
    block.parse(child, rules)
    if token['tight'] and _is_loose_list(child.tokens):
        token['tight'] = False
    token['children'].append({
        'type': 'list_item',
        'children': child.tokens,
    })
    if next_group:
        return next_group
def _get_list_bullet(c):
    """Return the regex fragment that matches sibling markers of kind ``c``.

    ``c`` is the last character of the first item's marker. For ordered
    lists (``.`` or ``)``) the fragment matches 1-9 digits followed by the
    same delimiter; for ``*`` and ``+`` it matches that bullet; any other
    value yields ``-``.
    """
    # An ordered marker needs at least one digit, the same as LIST_PATTERN
    # and the CommonMark definition (1-9 digits). Allowing zero digits
    # would let a bare '.' or ')' start a sibling item.
    digits = r'\d{1,9}'
    bullets = {
        '.': digits + r'\.',
        ')': digits + r'\)',
        '*': r'\*',
        '+': r'\+',
    }
    return bullets.get(c, '-')
def _compile_list_item_pattern(bullet, leading_width):
    """Build the regex source that recognises a sibling list item.

    The pattern has three named groups: ``listitem_1`` (indentation of at
    most ``leading_width`` spaces, capped at 3), ``listitem_2`` (the marker,
    matched by ``bullet``) and ``listitem_3`` (the rest of the line).
    """
    max_indent = min(leading_width, 3)
    pieces = (
        r'^(?P<listitem_1> {0,', str(max_indent), r'})',
        r'(?P<listitem_2>', bullet, r')',
        r'(?P<listitem_3>[ \t]*|[ \t][^\n]+)$',
    )
    return ''.join(pieces)
def _compile_continue_width(text, leading_width):
    """Work out where the content of a list item starts.

    ``text`` is what follows the marker on the item's first line and
    ``leading_width`` is the width of the indentation plus the marker.

    Returns ``(text, continue_width)``: the first line's content with the
    separating spaces removed (newline-terminated, or ``''`` when the line
    holds no text), and the number of columns a following line must be
    indented by to belong to the item.
    """
    # tabs are expanded first so that widths can be counted in spaces
    text = expand_leading_tab(text, 3)
    text = expand_tab(text)

    m2 = _LINE_HAS_TEXT.match(text)
    if m2 is None:
        # no text after the marker: content starts one column after it
        return '', leading_width + 1

    if text.startswith(' ' * 5):
        # Five or more spaces after the marker: only the first space
        # separates marker and content, the rest stays in the text (it is
        # indented code). The previous check tested a single space, which
        # is true for any non-blank text here and made the branch below
        # unreachable.
        space_width = 1
    else:
        # otherwise every space in front of the text is part of the indent
        space_width = len(m2.group(1))

    return text[space_width:] + '\n', leading_width + space_width
def _clean_list_item_text(src, continue_width):
    """Remove the list item's indentation from every line of ``src``.

    A line that starts with ``continue_width`` spaces loses that prefix and
    then has its remaining tabs expanded (CommonMark example 5 treats a tab
    as 4 spaces). Any other line is kept unchanged.
    """
    indent = ' ' * continue_width
    cleaned = [
        expand_tab(raw[len(indent):]) if raw.startswith(indent) else raw
        for raw in src.split('\n')
    ]
    return '\n'.join(cleaned)
def _is_loose_list(tokens):
    """Tell whether the tokens of a list item make the list loose.

    Returns ``True`` as soon as a ``blank_line`` token or a second
    ``paragraph`` token is seen; returns ``None`` otherwise.
    """
    seen_paragraph = False
    for tok in tokens:
        kind = tok['type']
        if kind == 'blank_line':
            return True
        if kind == 'paragraph':
            if seen_paragraph:
                return True
            seen_paragraph = True
    return None