diff options
Diffstat (limited to 'src/mistune/inline_parser.py')
-rw-r--r-- | src/mistune/inline_parser.py | 390 |
1 files changed, 390 insertions, 0 deletions
diff --git a/src/mistune/inline_parser.py b/src/mistune/inline_parser.py new file mode 100644 index 0000000..0375a74 --- /dev/null +++ b/src/mistune/inline_parser.py @@ -0,0 +1,390 @@ +import re +from typing import Optional, List, Dict, Any +from .core import Parser, InlineState +from .util import ( + escape, + escape_url, + unikey, +) +from .helpers import ( + PREVENT_BACKSLASH, + PUNCTUATION, + HTML_TAGNAME, + HTML_ATTRIBUTES, + unescape_char, + parse_link, + parse_link_label, + parse_link_text, +) + +PAREN_END_RE = re.compile(r'\s*\)') + +AUTO_EMAIL = ( + r'''<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]''' + r'(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?' + r'(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>' +) + +INLINE_HTML = ( + r'<' + HTML_TAGNAME + HTML_ATTRIBUTES + r'\s*/?>|' # open tag + r'</' + HTML_TAGNAME + r'\s*>|' # close tag + r'<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|' # comment + r'<\?[\s\S]+?\?>|' # script like <?php?> + r'<![A-Z][\s\S]+?>|' # doctype + r'<!\[CDATA[\s\S]+?\]\]>' # cdata +) + +EMPHASIS_END_RE = { + '*': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*(?!\*)'), + '_': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])_(?!_)\b'), + + '**': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*(?!\*)'), + '__': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])__(?!_)\b'), + + '***': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*\*(?!\*)'), + '___': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])___(?!_)\b'), +} + + +class InlineParser(Parser): + sc_flag = 0 + state_cls = InlineState + + #: linebreak leaves two spaces at the end of line + STD_LINEBREAK = r'(?:\\| {2,})\n\s*' + + #: every new line becomes <br> + HARD_LINEBREAK = r' *\n\s*' + + # we only need to find the start pattern of an inline token + SPECIFICATION = { + # e.g. \`, \$ + 'escape': r'(?:\\' + PUNCTUATION + ')+', + + # `code, ```code + 'codespan': r'`{1,}', + + # *w, **w, _w, __w + 'emphasis': r'\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])', + + # [link], ![img] + 'link': r'!?\[', + + # <https://example.com>. regex copied from commonmark.js + 'auto_link': r'<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>', + 'auto_email': AUTO_EMAIL, + + 'inline_html': INLINE_HTML, + + 'linebreak': STD_LINEBREAK, + 'softbreak': HARD_LINEBREAK, + + 'prec_auto_link': r'<[A-Za-z][A-Za-z\d.+-]{1,31}:', + 'prec_inline_html': r'</?' + HTML_TAGNAME + r'|<!|<\?', + } + DEFAULT_RULES = ( + 'escape', + 'codespan', + 'emphasis', + 'link', + 'auto_link', + 'auto_email', + 'inline_html', + 'linebreak', + ) + + def __init__(self, hard_wrap=False): + super(InlineParser, self).__init__() + + self.hard_wrap = hard_wrap + # lazy add linebreak + if hard_wrap: + self.specification['linebreak'] = self.HARD_LINEBREAK + else: + self.rules.append('softbreak') + + self._methods = { + name: getattr(self, 'parse_' + name) for name in self.rules + } + + def parse_escape(self, m: re.Match, state: InlineState) -> int: + text = m.group(0) + text = unescape_char(text) + state.append_token({ + 'type': 'text', + 'raw': text, + }) + return m.end() + + def parse_link(self, m: re.Match, state: InlineState) -> Optional[int]: + pos = m.end() + + marker = m.group(0) + is_image = marker[0] == '!' + if is_image and state.in_image: + state.append_token({'type': 'text', 'raw': marker}) + return pos + elif not is_image and state.in_link: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + text = None + label, end_pos = parse_link_label(state.src, pos) + if label is None: + text, end_pos = parse_link_text(state.src, pos) + if text is None: + return + + if text is None: + text = label + + if end_pos >= len(state.src) and label is None: + return + + rules = ['codespan', 'prec_auto_link', 'prec_inline_html'] + prec_pos = self.precedence_scan(m, state, end_pos, rules) + if prec_pos: + return prec_pos + + if end_pos < len(state.src): + c = state.src[end_pos] + if c == '(': + # standard link [text](<url> "title") + attrs, pos2 = parse_link(state.src, end_pos + 1) + if pos2: + token = self.__parse_link_token(is_image, text, attrs, state) + state.append_token(token) + return pos2 + + elif c == '[': + # standard ref link [text][label] + label2, pos2 = parse_link_label(state.src, end_pos + 1) + if pos2: + end_pos = pos2 + if label2: + label = label2 + + if label is None: + return + + ref_links = state.env.get('ref_links') + if not ref_links: + return + + key = unikey(label) + env = ref_links.get(key) + if env: + attrs = {'url': env['url'], 'title': env.get('title')} + token = self.__parse_link_token(is_image, text, attrs, state) + token['ref'] = key + token['label'] = label + state.append_token(token) + return end_pos + + def __parse_link_token(self, is_image, text, attrs, state): + new_state = state.copy() + new_state.src = text + if is_image: + new_state.in_image = True + token = { + 'type': 'image', + 'children': self.render(new_state), + 'attrs': attrs, + } + else: + new_state.in_link = True + token = { + 'type': 'link', + 'children': self.render(new_state), + 'attrs': attrs, + } + return token + + def parse_auto_link(self, m: re.Match, state: InlineState) -> int: + text = m.group(0) + pos = m.end() + if state.in_link: + self.process_text(text, state) + return pos + + text = text[1:-1] + self._add_auto_link(text, text, state) + return pos + + def parse_auto_email(self, m: re.Match, state: InlineState) -> int: + text = m.group(0) + pos = m.end() + if state.in_link: + self.process_text(text, state) + return pos + + text = text[1:-1] + url = 'mailto:' + text + self._add_auto_link(url, text, state) + return pos + + def _add_auto_link(self, url, text, state): + state.append_token({ + 'type': 'link', + 'children': [{'type': 'text', 'raw': text}], + 'attrs': {'url': escape_url(url)}, + }) + + def parse_emphasis(self, m: re.Match, state: InlineState) -> int: + pos = m.end() + + marker = m.group(0) + mlen = len(marker) + if mlen == 1 and state.in_emphasis: + state.append_token({'type': 'text', 'raw': marker}) + return pos + elif mlen == 2 and state.in_strong: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + _end_re = EMPHASIS_END_RE[marker] + m1 = _end_re.search(state.src, pos) + if not m1: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + end_pos = m1.end() + text = state.src[pos:end_pos-mlen] + + prec_pos = self.precedence_scan(m, state, end_pos) + if prec_pos: + return prec_pos + + new_state = state.copy() + new_state.src = text + if mlen == 1: + new_state.in_emphasis = True + children = self.render(new_state) + state.append_token({'type': 'emphasis', 'children': children}) + elif mlen == 2: + new_state.in_strong = True + children = self.render(new_state) + state.append_token({'type': 'strong', 'children': children}) + else: + new_state.in_emphasis = True + new_state.in_strong = True + + children = [{ + 'type': 'strong', + 'children': self.render(new_state) + }] + state.append_token({ + 'type': 'emphasis', + 'children': children, + }) + return end_pos + + def parse_codespan(self, m: re.Match, state: InlineState) -> int: + marker = m.group(0) + # require same marker with same length at end + + pattern = re.compile(r'(.*?(?:[^`]))' + marker + r'(?!`)', re.S) + + pos = m.end() + m = pattern.match(state.src, pos) + if m: + end_pos = m.end() + code = m.group(1) + # Line endings are treated like spaces + code = code.replace('\n', ' ') + if len(code.strip()): + if code.startswith(' ') and code.endswith(' '): + code = code[1:-1] + state.append_token({'type': 'codespan', 'raw': escape(code)}) + return end_pos + else: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + def parse_linebreak(self, m: re.Match, state: InlineState) -> int: + state.append_token({'type': 'linebreak'}) + return m.end() + + def parse_softbreak(self, m: re.Match, state: InlineState) -> int: + state.append_token({'type': 'softbreak'}) + return m.end() + + def parse_inline_html(self, m: re.Match, state: InlineState) -> int: + end_pos = m.end() + html = m.group(0) + state.append_token({'type': 'inline_html', 'raw': html}) + if html.startswith(('<a ', '<a>', '<A ', '<A>')): + state.in_link = True + elif html.startswith(('</a ', '</a>', '</A ', '</A>')): + state.in_link = False + return end_pos + + def process_text(self, text: str, state: InlineState): + state.append_token({'type': 'text', 'raw': text}) + + def parse(self, state: InlineState) -> List[Dict[str, Any]]: + pos = 0 + sc = self.compile_sc() + while pos < len(state.src): + m = sc.search(state.src, pos) + if not m: + break + + end_pos = m.start() + if end_pos > pos: + hole = state.src[pos:end_pos] + self.process_text(hole, state) + + new_pos = self.parse_method(m, state) + if not new_pos: + # move cursor 1 character forward + pos = end_pos + 1 + hole = state.src[end_pos:pos] + self.process_text(hole, state) + else: + pos = new_pos + + if pos == 0: + # special case, just pure text + self.process_text(state.src, state) + elif pos < len(state.src): + self.process_text(state.src[pos:], state) + return state.tokens + + def precedence_scan(self, m: re.Match, state: InlineState, end_pos: int, rules=None): + if rules is None: + rules = ['codespan', 'link', 'prec_auto_link', 'prec_inline_html'] + + mark_pos = m.end() + sc = self.compile_sc(rules) + m1 = sc.search(state.src, mark_pos, end_pos) + if not m1: + return + + rule_name = m1.lastgroup.replace('prec_', '') + sc = self.compile_sc([rule_name]) + m2 = sc.match(state.src, m1.start()) + if not m2: + return + + func = self._methods[rule_name] + new_state = state.copy() + new_state.src = state.src + m2_pos = func(m2, new_state) + if not m2_pos or m2_pos < end_pos: + return + + raw_text = state.src[m.start():m2.start()] + state.append_token({'type': 'text', 'raw': raw_text}) + for token in new_state.tokens: + state.append_token(token) + return m2_pos + + def render(self, state: InlineState): + self.parse(state) + return state.tokens + + def __call__(self, s, env): + state = self.state_cls(env) + state.src = s + return self.render(state) |