summaryrefslogtreecommitdiff
path: root/src/mistune/block_parser.py
diff options
context:
space:
mode:
authorArturs Artamonovs <dos21h@gmail.com>2023-01-29 10:30:54 +0000
committerArturs Artamonovs <dos21h@gmail.com>2023-01-29 10:30:54 +0000
commit66fa71a8f11b6ce5e8471b533f67cc3a1fdb85a8 (patch)
tree7aed7f385826a3bd88c76a373e28c6cfae4f396e /src/mistune/block_parser.py
parent129c1201ea5c4418f0f89ad932633c7cea2439b7 (diff)
downloadmd-site-66fa71a8f11b6ce5e8471b533f67cc3a1fdb85a8.tar.gz
md-site-66fa71a8f11b6ce5e8471b533f67cc3a1fdb85a8.zip
Update to new mistune, removed old mistune, rewrite to python3
Diffstat (limited to 'src/mistune/block_parser.py')
-rw-r--r--src/mistune/block_parser.py486
1 files changed, 486 insertions, 0 deletions
diff --git a/src/mistune/block_parser.py b/src/mistune/block_parser.py
new file mode 100644
index 0000000..1ed79ec
--- /dev/null
+++ b/src/mistune/block_parser.py
@@ -0,0 +1,486 @@
+import re
+from typing import Optional, List, Tuple
+from .util import (
+ unikey,
+ escape_url,
+ expand_tab,
+ expand_leading_tab,
+)
+from .core import Parser, BlockState
+from .helpers import (
+ LINK_LABEL,
+ HTML_TAGNAME,
+ HTML_ATTRIBUTES,
+ BLOCK_TAGS,
+ PRE_TAGS,
+ unescape_char,
+ parse_link_href,
+ parse_link_title,
+)
+from .list_parser import parse_list, LIST_PATTERN
+
+_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.M)
+_AXT_HEADING_TRIM = re.compile(r'(\s+|^)#+\s*$')
+_BLOCK_QUOTE_TRIM = re.compile(r'^ ?', flags=re.M)
+_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.M)
+
+_LINE_BLANK_END = re.compile(r'\n[ \t]*\n$')
+_BLANK_TO_LINE = re.compile(r'[ \t]*\n')
+
+_BLOCK_TAGS_PATTERN = '|'.join(BLOCK_TAGS) + '|' + '|'.join(PRE_TAGS)
+_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r'[ \t]*>[ \t]*(?:\n|$)')
+_CLOSE_TAG_END = re.compile(r'[ \t]*>[ \t]*(?:\n|$)')
+_STRICT_BLOCK_QUOTE = re.compile(r'( {0,3}>[^\n]*(?:\n|$))+')
+
+
+class BlockParser(Parser):
+ BLANK_LINE = re.compile(r'(^[ \t\v\f]*\n)+', re.M)
+
+ RAW_HTML = (
+ r'^ {0,3}('
+ r'</?' + HTML_TAGNAME + r'|'
+ r'<!--|' # comment
+ r'<\?|' # script
+ r'<![A-Z]|'
+ r'<!\[CDATA\[)'
+ )
+
+ BLOCK_HTML = (
+ r'^ {0,3}(?:'
+ r'(?:</?' + _BLOCK_TAGS_PATTERN + r'(?:[ \t]+|\n|$))'
+ r'|<!--' # comment
+ r'|<\?' # script
+ r'|<![A-Z]'
+ r'|<!\[CDATA\[)'
+ )
+
+ SPECIFICATION = {
+ 'blank_line': r'(^[ \t\v\f]*\n)+',
+ 'axt_heading': r'^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*|[ \t]+.*?)$',
+ 'setex_heading': r'^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$',
+ 'fenced_code': (
+ r'^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})'
+ r'[ \t]*(?P<fenced_3>.*?)$'
+ ),
+ 'indent_code': (
+ r'^(?: {4}| *\t)[^\n]+(?:\n+|$)'
+ r'((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*'
+ ),
+ 'thematic_break': r'^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$',
+ 'ref_link': r'^ {0,3}\[(?P<reflink_1>' + LINK_LABEL + r')\]:',
+ 'block_quote': r'^ {0,3}>(?P<quote_1>.*?)$',
+ 'list': LIST_PATTERN,
+ 'block_html': BLOCK_HTML,
+ 'raw_html': RAW_HTML,
+ }
+
+ DEFAULT_RULES = (
+ 'fenced_code',
+ 'indent_code',
+ 'axt_heading',
+ 'setex_heading',
+ 'thematic_break',
+ 'block_quote',
+ 'list',
+ 'ref_link',
+ 'raw_html',
+ 'blank_line',
+ )
+
+ def __init__(
+ self,
+ block_quote_rules: Optional[List[str]]=None,
+ list_rules: Optional[List[str]]=None,
+ max_nested_level: int=6
+ ):
+ super(BlockParser, self).__init__()
+
+ if block_quote_rules is None:
+ block_quote_rules = list(self.DEFAULT_RULES)
+
+ if list_rules is None:
+ list_rules = list(self.DEFAULT_RULES)
+
+ self.block_quote_rules = block_quote_rules
+ self.list_rules = list_rules
+ self.max_nested_level = max_nested_level
+ # register default parse methods
+ self._methods = {
+ name: getattr(self, 'parse_' + name) for name in self.SPECIFICATION
+ }
+
+ def parse_blank_line(self, m: re.Match, state: BlockState) -> int:
+ """Parse token for blank lines."""
+ state.append_token({'type': 'blank_line'})
+ return m.end()
+
+ def parse_thematic_break(self, m: re.Match, state: BlockState) -> int:
+ """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
+ state.append_token({'type': 'thematic_break'})
+ # $ does not count '\n'
+ return m.end() + 1
+
+ def parse_indent_code(self, m: re.Match, state: BlockState) -> int:
+ """Parse token for code block which is indented by 4 spaces."""
+ # it is a part of the paragraph
+ end_pos = state.append_paragraph()
+ if end_pos:
+ return end_pos
+
+ code = m.group(0)
+ code = expand_leading_tab(code)
+ code = _INDENT_CODE_TRIM.sub('', code)
+ code = code.strip('\n')
+ state.append_token({'type': 'block_code', 'raw': code, 'style': 'indent'})
+ return m.end()
+
+ def parse_fenced_code(self, m: re.Match, state: BlockState) -> Optional[int]:
+ """Parse token for fenced code block. A fenced code block is started with
+ 3 or more backtick(`) or tilde(~).
+
+ An example of a fenced code block:
+
+ .. code-block:: markdown
+
+ ```python
+ def markdown(text):
+ return mistune.html(text)
+ ```
+ """
+ spaces = m.group('fenced_1')
+ marker = m.group('fenced_2')
+ info = m.group('fenced_3')
+
+ c = marker[0]
+ if info and c == '`':
+ # CommonMark Example 145
+ # Info strings for backtick code blocks cannot contain backticks
+ if info.find(c) != -1:
+ return
+
+ _end = re.compile(
+ r'^ {0,3}' + c + '{' + str(len(marker)) + r',}[ \t]*(?:\n|$)', re.M)
+ cursor_start = m.end() + 1
+
+ m2 = _end.search(state.src, cursor_start)
+ if m2:
+ code = state.src[cursor_start:m2.start()]
+ end_pos = m2.end()
+ else:
+ code = state.src[cursor_start:]
+ end_pos = state.cursor_max
+
+ if spaces and code:
+ _trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
+ code = _trim_pattern.sub('', code)
+
+ token = {'type': 'block_code', 'raw': code, 'style': 'fenced', 'marker': marker}
+ if info:
+ info = unescape_char(info)
+ token['attrs'] = {'info': info.strip()}
+
+ state.append_token(token)
+ return end_pos
+
+ def parse_axt_heading(self, m: re.Match, state: BlockState) -> int:
+ """Parse token for AXT heading. An AXT heading is started with 1 to 6
+ symbol of ``#``."""
+ level = len(m.group('axt_1'))
+ text = m.group('axt_2').strip()
+ # remove last #
+ if text:
+ text = _AXT_HEADING_TRIM.sub('', text)
+
+ token = {'type': 'heading', 'text': text, 'attrs': {'level': level}, 'style': 'axt'}
+ state.append_token(token)
+ return m.end() + 1
+
+ def parse_setex_heading(self, m: re.Match, state: BlockState) -> Optional[int]:
+ """Parse token for setex style heading. A setex heading syntax looks like:
+
+ .. code-block:: markdown
+
+ H1 title
+ ========
+ """
+ last_token = state.last_token()
+ if last_token and last_token['type'] == 'paragraph':
+ level = 1 if m.group('setext_1') == '=' else 2
+ last_token['type'] = 'heading'
+ last_token['style'] = 'setext'
+ last_token['attrs'] = {'level': level}
+ return m.end() + 1
+
+ sc = self.compile_sc(['thematic_break', 'list'])
+ m = sc.match(state.src, state.cursor)
+ if m:
+ return self.parse_method(m, state)
+
+ def parse_ref_link(self, m: re.Match, state: BlockState) -> Optional[int]:
+ """Parse link references and save the link information into ``state.env``.
+
+ Here is an example of a link reference:
+
+ .. code-block:: markdown
+
+ a [link][example]
+
+ [example]: https://example.com "Optional title"
+
+ This method will save the link reference into ``state.env`` as::
+
+ state.env['ref_links']['example'] = {
+ 'url': 'https://example.com',
+ 'title': "Optional title",
+ }
+ """
+ end_pos = state.append_paragraph()
+ if end_pos:
+ return end_pos
+
+ label = m.group('reflink_1')
+ key = unikey(label)
+ if not key:
+ return
+
+ href, href_pos = parse_link_href(state.src, m.end(), block=True)
+ if href is None:
+ return
+
+ _blank = self.BLANK_LINE.search(state.src, href_pos)
+ if _blank:
+ max_pos = _blank.start()
+ else:
+ max_pos = state.cursor_max
+
+ title, title_pos = parse_link_title(state.src, href_pos, max_pos)
+ if title_pos:
+ m = _BLANK_TO_LINE.match(state.src, title_pos)
+ if m:
+ title_pos = m.end()
+ else:
+ title_pos = None
+ title = None
+
+ if title_pos is None:
+ m = _BLANK_TO_LINE.match(state.src, href_pos)
+ if m:
+ href_pos = m.end()
+ else:
+ href_pos = None
+ href = None
+
+ end_pos = title_pos or href_pos
+ if not end_pos:
+ return
+
+ if key not in state.env['ref_links']:
+ href = unescape_char(href)
+ data = {'url': escape_url(href), 'label': label}
+ if title:
+ data['title'] = title
+ state.env['ref_links'][key] = data
+ return end_pos
+
+ def extract_block_quote(self, m: re.Match, state: BlockState) -> Tuple[str, int]:
+ """Extract text and cursor end position of a block quote."""
+
+ # cleanup at first to detect if it is code block
+ text = m.group('quote_1') + '\n'
+ text = expand_leading_tab(text, 3)
+ text = _BLOCK_QUOTE_TRIM.sub('', text)
+
+ sc = self.compile_sc(['blank_line', 'indent_code', 'fenced_code'])
+ require_marker = bool(sc.match(text))
+
+ state.cursor = m.end() + 1
+
+ end_pos = None
+ if require_marker:
+ m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
+ if m:
+ quote = m.group(0)
+ quote = _BLOCK_QUOTE_LEADING.sub('', quote)
+ quote = expand_leading_tab(quote, 3)
+ quote = _BLOCK_QUOTE_TRIM.sub('', quote)
+ text += quote
+ state.cursor = m.end()
+ else:
+ prev_blank_line = False
+ break_sc = self.compile_sc([
+ 'blank_line', 'thematic_break', 'fenced_code',
+ 'list', 'block_html',
+ ])
+ while state.cursor < state.cursor_max:
+ m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
+ if m:
+ quote = m.group(0)
+ quote = _BLOCK_QUOTE_LEADING.sub('', quote)
+ quote = expand_leading_tab(quote, 3)
+ quote = _BLOCK_QUOTE_TRIM.sub('', quote)
+ text += quote
+ state.cursor = m.end()
+ if not quote.strip():
+ prev_blank_line = True
+ else:
+ prev_blank_line = bool(_LINE_BLANK_END.search(quote))
+ continue
+
+ if prev_blank_line:
+ # CommonMark Example 249
+ # because of laziness, a blank line is needed between
+ # a block quote and a following paragraph
+ break
+
+ m = break_sc.match(state.src, state.cursor)
+ if m:
+ end_pos = self.parse_method(m, state)
+ if end_pos:
+ break
+
+ # lazy continuation line
+ pos = state.find_line_end()
+ line = state.get_text(pos)
+ line = expand_leading_tab(line, 3)
+ text += line
+ state.cursor = pos
+
+ # according to CommonMark Example 6, the second tab should be
+ # treated as 4 spaces
+ return expand_tab(text), end_pos
+
+ def parse_block_quote(self, m: re.Match, state: BlockState) -> int:
+ """Parse token for block quote. Here is an example of the syntax:
+
+ .. code-block:: markdown
+
+ > a block quote starts
+ > with right arrows
+ """
+ text, end_pos = self.extract_block_quote(m, state)
+ # scan children state
+ child = state.child_state(text)
+ if state.depth() >= self.max_nested_level - 1:
+ rules = list(self.block_quote_rules)
+ rules.remove('block_quote')
+ else:
+ rules = self.block_quote_rules
+
+ self.parse(child, rules)
+ token = {'type': 'block_quote', 'children': child.tokens}
+ if end_pos:
+ state.prepend_token(token)
+ return end_pos
+ state.append_token(token)
+ return state.cursor
+
+ def parse_list(self, m: re.Match, state: BlockState) -> int:
+ """Parse tokens for ordered and unordered list."""
+ return parse_list(self, m, state)
+
+ def parse_block_html(self, m: re.Match, state: BlockState) -> Optional[int]:
+ return self.parse_raw_html(m, state)
+
+ def parse_raw_html(self, m: re.Match, state: BlockState) -> Optional[int]:
+ marker = m.group(0).strip()
+
+ # rule 2
+ if marker == '<!--':
+ return _parse_html_to_end(state, '-->', m.end())
+
+ # rule 3
+ if marker == '<?':
+ return _parse_html_to_end(state, '?>', m.end())
+
+ # rule 5
+ if marker == '<![CDATA[':
+ return _parse_html_to_end(state, ']]>', m.end())
+
+ # rule 4
+ if marker.startswith('<!'):
+ return _parse_html_to_end(state, '>', m.end())
+
+ close_tag = None
+ open_tag = None
+ if marker.startswith('</'):
+ close_tag = marker[2:].lower()
+ # rule 6
+ if close_tag in BLOCK_TAGS:
+ return _parse_html_to_newline(state, self.BLANK_LINE)
+ else:
+ open_tag = marker[1:].lower()
+ # rule 1
+ if open_tag in PRE_TAGS:
+ end_tag = '</' + open_tag + '>'
+ return _parse_html_to_end(state, end_tag, m.end())
+ # rule 6
+ if open_tag in BLOCK_TAGS:
+ return _parse_html_to_newline(state, self.BLANK_LINE)
+
+ # Blocks of type 7 may not interrupt a paragraph.
+ end_pos = state.append_paragraph()
+ if end_pos:
+ return end_pos
+
+ # rule 7
+ start_pos = m.end()
+ end_pos = state.find_line_end()
+ if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or \
+ (close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)):
+ return _parse_html_to_newline(state, self.BLANK_LINE)
+
+ def parse(self, state: BlockState, rules: Optional[List[str]]=None) -> None:
+ sc = self.compile_sc(rules)
+
+ while state.cursor < state.cursor_max:
+ m = sc.search(state.src, state.cursor)
+ if not m:
+ break
+
+ end_pos = m.start()
+ if end_pos > state.cursor:
+ text = state.get_text(end_pos)
+ state.add_paragraph(text)
+ state.cursor = end_pos
+
+ end_pos = self.parse_method(m, state)
+ if end_pos:
+ state.cursor = end_pos
+ else:
+ end_pos = state.find_line_end()
+ text = state.get_text(end_pos)
+ state.add_paragraph(text)
+ state.cursor = end_pos
+
+ if state.cursor < state.cursor_max:
+ text = state.src[state.cursor:]
+ state.add_paragraph(text)
+ state.cursor = state.cursor_max
+
+
+def _parse_html_to_end(state, end_marker, start_pos):
+ marker_pos = state.src.find(end_marker, start_pos)
+ if marker_pos == -1:
+ text = state.src[state.cursor:]
+ end_pos = state.cursor_max
+ else:
+ text = state.get_text(marker_pos)
+ state.cursor = marker_pos
+ end_pos = state.find_line_end()
+ text += state.get_text(end_pos)
+
+ state.append_token({'type': 'block_html', 'raw': text})
+ return end_pos
+
+
+def _parse_html_to_newline(state, newline):
+ m = newline.search(state.src, state.cursor)
+ if m:
+ end_pos = m.start()
+ text = state.get_text(end_pos)
+ else:
+ text = state.src[state.cursor:]
+ end_pos = state.cursor_max
+
+ state.append_token({'type': 'block_html', 'raw': text})
+ return end_pos