import re
from typing import Optional, List, Tuple
from .util import (
    unikey,
    escape_url,
    expand_tab,
    expand_leading_tab,
)
from .core import Parser, BlockState
from .helpers import (
    LINK_LABEL,
    HTML_TAGNAME,
    HTML_ATTRIBUTES,
    BLOCK_TAGS,
    PRE_TAGS,
    unescape_char,
    parse_link_href,
    parse_link_title,
)
from .list_parser import parse_list, LIST_PATTERN

# Strips up to four leading spaces from every line of an indented code block.
_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.M)
# Strips the optional closing run of ``#`` from an AXT heading.
_AXT_HEADING_TRIM = re.compile(r'(\s+|^)#+\s*$')
# Strips the single optional space that follows a ``>`` marker.
_BLOCK_QUOTE_TRIM = re.compile(r'^ ?', flags=re.M)
# Strips the leading ``>`` marker (and the spaces before it) from each line.
_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.M)

# Matches text whose last line is blank.
_LINE_BLANK_END = re.compile(r'\n[ \t]*\n$')
# Matches optional trailing blanks up to and including the newline.
_BLANK_TO_LINE = re.compile(r'[ \t]*\n')

_BLOCK_TAGS_PATTERN = '|'.join(BLOCK_TAGS) + '|' + '|'.join(PRE_TAGS)
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r'[ \t]*>[ \t]*(?:\n|$)')
_CLOSE_TAG_END = re.compile(r'[ \t]*>[ \t]*(?:\n|$)')
# One or more consecutive lines that each start with a ``>`` marker.
_STRICT_BLOCK_QUOTE = re.compile(r'( {0,3}>[^\n]*(?:\n|$))+')


class BlockParser(Parser):
    """Block-level parser.

    ``SPECIFICATION`` maps every rule name to its regular expression, and
    each rule ``name`` is handled by the method ``parse_<name>``. Inner
    groups are named ``<prefix>_<n>`` and are read back by the matching
    ``parse_*`` method through ``m.group(...)``.
    """

    BLANK_LINE = re.compile(r'(^[ \t\v\f]*\n)+', re.M)

    # NOTE(review): everything after the opening ``(`` of RAW_HTML, the whole
    # of BLOCK_HTML, and the first two SPECIFICATION entries were lost from
    # this copy of the file (an HTML-tag stripper removed every ``<...>``
    # span). They are reconstructed below from the names the surviving code
    # still references -- confirm against the pristine file.
    RAW_HTML = (
        r'^ {0,3}('
        r'</?' + HTML_TAGNAME + r'|'
        r'<!--|'  # comment
        r'<\?|'  # processing instruction
        r'<![A-Z]|'  # declaration
        r'<!\[CDATA\[)'
    )

    BLOCK_HTML = (
        r'^ {0,3}(?:'
        r'</?(?:' + _BLOCK_TAGS_PATTERN + r')(?:[ \t]+|\n|$)'
        r'|<!--'  # comment
        r'|<\?'  # processing instruction
        r'|<![A-Z]'  # declaration
        r'|<!\[CDATA\[)'
    )

    SPECIFICATION = {
        'blank_line': r'(^[ \t\v\f]*\n)+',
        'axt_heading': r'^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*|[ \t]+.*?)$',
        'setex_heading': r'^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$',
        'fenced_code': (
            r'^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})'
            r'[ \t]*(?P<fenced_3>.*?)$'
        ),
        'indent_code': (
            r'^(?: {4}| *\t)[^\n]+(?:\n+|$)'
            r'((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*'
        ),
        'thematic_break': r'^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$',
        'ref_link': r'^ {0,3}\[(?P<reflink_1>' + LINK_LABEL + r')\]:',
        'block_quote': r'^ {0,3}>(?P<quote_1>.*?)$',
        'list': LIST_PATTERN,
        'block_html': BLOCK_HTML,
        'raw_html': RAW_HTML,
    }

    DEFAULT_RULES = (
        'fenced_code',
        'indent_code',
        'axt_heading',
        'setex_heading',
        'thematic_break',
        'block_quote',
        'list',
        'ref_link',
        'raw_html',
        'blank_line',
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]]=None,
        list_rules: Optional[List[str]]=None,
        max_nested_level: int=6
    ):
        """Create a block parser.

        :param block_quote_rules: rule names used inside block quotes;
            defaults to a copy of ``DEFAULT_RULES``.
        :param list_rules: rule names stored as ``self.list_rules``;
            defaults to a copy of ``DEFAULT_RULES``.
        :param max_nested_level: nesting depth from which block quotes stop
            being parsed inside block quotes.
        """
        super(BlockParser, self).__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
        # register default parse methods
        self._methods = {
            name: getattr(self, 'parse_' + name) for name in self.SPECIFICATION
        }

    def parse_blank_line(self, m: re.Match, state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({'type': 'blank_line'})
        return m.end()

    def parse_thematic_break(self, m: re.Match, state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({'type': 'thematic_break'})
        # $ does not count '\n'
        return m.end() + 1

    def parse_indent_code(self, m: re.Match, state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces."""
        # it is a part of the paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub('', code)
        code = code.strip('\n')
        state.append_token({'type': 'block_code', 'raw': code, 'style': 'indent'})
        return m.end()

    def parse_fenced_code(self, m: re.Match, state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        :return: the position after the closing fence (or the end of the
            source when the fence is never closed), or ``None`` when the
            opening line is not a valid fence.
        """
        spaces = m.group('fenced_1')
        marker = m.group('fenced_2')
        info = m.group('fenced_3')

        c = marker[0]
        if info and c == '`':
            # CommonMark Example 145
            # Info strings for backtick code blocks cannot contain backticks
            if info.find(c) != -1:
                return

        # The closing fence uses the same character and is at least as long
        # as the opening one.
        _end = re.compile(
            r'^ {0,3}' + c + '{' + str(len(marker)) + r',}[ \t]*(?:\n|$)', re.M)
        # ``$`` does not consume the newline, so skip it explicitly.
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start:m2.start()]
            end_pos = m2.end()
        else:
            # Unclosed fence: everything up to the end of the source is code.
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # Remove from each code line at most the indentation of the fence.
            _trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
            code = _trim_pattern.sub('', code)

        token = {'type': 'block_code', 'raw': code, 'style': 'fenced', 'marker': marker}
        if info:
            info = unescape_char(info)
            token['attrs'] = {'info': info.strip()}

        state.append_token(token)
        return end_pos

    def parse_axt_heading(self, m: re.Match, state: BlockState) -> int:
        """Parse token for AXT heading. An AXT heading is started with 1 to 6
        symbol of ``#``."""
        level = len(m.group('axt_1'))
        text = m.group('axt_2').strip()
        # remove last #
        if text:
            text = _AXT_HEADING_TRIM.sub('', text)

        token = {'type': 'heading', 'text': text, 'attrs': {'level': level}, 'style': 'axt'}
        state.append_token(token)
        return m.end() + 1

    def parse_setex_heading(self, m: re.Match, state: BlockState) -> Optional[int]:
        """Parse token for setex style heading. A setex heading syntax looks like:

        .. code-block:: markdown

            H1 title
            ========

        When the previous token is a paragraph it is converted in place into
        a heading; otherwise the line is retried as a thematic break or a
        list, and ``None`` is returned if neither matches.
        """
        last_token = state.last_token()
        if last_token and last_token['type'] == 'paragraph':
            level = 1 if m.group('setext_1') == '=' else 2
            last_token['type'] = 'heading'
            last_token['style'] = 'setext'
            last_token['attrs'] = {'level': level}
            return m.end() + 1

        sc = self.compile_sc(['thematic_break', 'list'])
        m = sc.match(state.src, state.cursor)
        if m:
            return self.parse_method(m, state)

    def parse_ref_link(self, m: re.Match, state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'title': "Optional title",
            }
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group('reflink_1')
        key = unikey(label)
        if not key:
            return

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return

        # The title is searched for no further than the next blank line.
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # Only blanks may follow the title on its line; otherwise the
            # title is discarded.
            m = _BLANK_TO_LINE.match(state.src, title_pos)
            if m:
                title_pos = m.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # Without a title, only blanks may follow the href on its line.
            m = _BLANK_TO_LINE.match(state.src, href_pos)
            if m:
                href_pos = m.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return

        # The first definition of a label wins; later ones are ignored.
        if key not in state.env['ref_links']:
            href = unescape_char(href)
            data = {'url': escape_url(href), 'label': label}
            if title:
                data['title'] = title
            state.env['ref_links'][key] = data
        return end_pos

    def extract_block_quote(
        self, m: re.Match, state: BlockState
    ) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past the consumed lines.

        :return: ``(text, end_pos)`` where ``text`` is the quote content with
            the ``>`` markers removed, and ``end_pos`` is the position
            returned by the rule that interrupted the quote, or ``None``
            when no rule interrupted it.
        """
        # cleanup at first to detect if it is code block
        text = m.group('quote_1') + '\n'
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub('', text)

        sc = self.compile_sc(['blank_line', 'indent_code', 'fenced_code'])
        # When the first line is blank or starts a code block, only lines
        # carrying a ``>`` marker are accepted (no lazy continuation).
        require_marker = bool(sc.match(text))

        state.cursor = m.end() + 1

        end_pos = None
        if require_marker:
            m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m:
                quote = m.group(0)
                quote = _BLOCK_QUOTE_LEADING.sub('', quote)
                quote = expand_leading_tab(quote, 3)
                quote = _BLOCK_QUOTE_TRIM.sub('', quote)
                text += quote
                state.cursor = m.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc([
                'blank_line', 'thematic_break', 'fenced_code',
                'list', 'block_html',
            ])
            while state.cursor < state.cursor_max:
                m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m:
                    quote = m.group(0)
                    quote = _BLOCK_QUOTE_LEADING.sub('', quote)
                    quote = expand_leading_tab(quote, 3)
                    quote = _BLOCK_QUOTE_TRIM.sub('', quote)
                    text += quote
                    state.cursor = m.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m = break_sc.match(state.src, state.cursor)
                if m:
                    end_pos = self.parse_method(m, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(text), end_pos

    def parse_block_quote(self, m: re.Match, state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        if state.depth() >= self.max_nested_level - 1:
            # Too deep: stop recognising block quotes inside this one. A
            # filter is used rather than ``list.remove`` so that a custom
            # ``block_quote_rules`` without 'block_quote' does not raise
            # ValueError.
            rules = [name for name in self.block_quote_rules if name != 'block_quote']
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {'type': 'block_quote', 'children': child.tokens}
        if end_pos:
            # The interrupting rule has already appended its own token, so
            # the quote goes in through ``prepend_token`` instead.
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: re.Match, state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: re.Match, state: BlockState) -> Optional[int]:
        """Parse a ``block_html`` match; delegates to :meth:`parse_raw_html`."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: re.Match, state: BlockState) -> Optional[int]:
        """Parse token for an HTML block.

        The "rule N" comments follow the numbering of the CommonMark HTML
        block start conditions.

        NOTE(review): the string literals compared against ``marker`` and
        the ``close_tag`` / ``open_tag`` branch were stripped from this copy
        of the file; they are reconstructed from the surviving rule comments
        and variable names -- confirm against the pristine file.

        :return: the end position of the HTML block, or ``None`` when the
            match does not start an HTML block.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == '<!--':
            return _parse_html_to_end(state, '-->', m.end())

        # rule 3
        if marker == '<?':
            return _parse_html_to_end(state, '?>', m.end())

        # rule 5
        if marker == '<![CDATA[':
            return _parse_html_to_end(state, ']]>', m.end())

        # rule 4
        if marker.startswith('<!'):
            return _parse_html_to_end(state, '>', m.end())

        close_tag = None
        open_tag = None
        if marker.startswith('</'):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                end_tag = '</' + open_tag + '>'
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or \
           (close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)):
            return _parse_html_to_newline(state, self.BLANK_LINE)

    def parse(self, state: BlockState, rules: Optional[List[str]]=None) -> None:
        """Tokenize ``state.src`` from ``state.cursor`` to the end.

        Text that no rule matches is added as paragraph text.

        :param state: the block state to fill with tokens.
        :param rules: rule names passed to ``compile_sc``.
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            # Anything between the cursor and the match is paragraph text.
            end_pos = m.start()
            if end_pos > state.cursor:
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos = self.parse_method(m, state)
            if end_pos:
                state.cursor = end_pos
            else:
                # The rule declined the match: treat the line as paragraph.
                end_pos = state.find_line_end()
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

        if state.cursor < state.cursor_max:
            text = state.src[state.cursor:]
            state.add_paragraph(text)
            state.cursor = state.cursor_max


def _parse_html_to_end(state, end_marker, start_pos):
    """Append a ``block_html`` token running through the line that contains
    ``end_marker`` (searched from ``start_pos``), or to the end of the source
    when the marker is absent. Returns the end position of the block."""
    marker_pos = state.src.find(end_marker, start_pos)
    if marker_pos == -1:
        text = state.src[state.cursor:]
        end_pos = state.cursor_max
    else:
        text = state.get_text(marker_pos)
        # Move to the marker so the rest of its line is included as well.
        state.cursor = marker_pos
        end_pos = state.find_line_end()
        text += state.get_text(end_pos)

    state.append_token({'type': 'block_html', 'raw': text})
    return end_pos


def _parse_html_to_newline(state, newline):
    """Append a ``block_html`` token running up to the first match of the
    ``newline`` pattern, or to the end of the source when nothing matches.
    Returns the end position of the block."""
    m = newline.search(state.src, state.cursor)
    if m:
        end_pos = m.start()
        text = state.get_text(end_pos)
    else:
        text = state.src[state.cursor:]
        end_pos = state.cursor_max

    state.append_token({'type': 'block_html', 'raw': text})
    return end_pos