diff options
author | Arturs Artamonovs <dos21h@gmail.com> | 2023-01-29 10:30:54 +0000 |
---|---|---|
committer | Arturs Artamonovs <dos21h@gmail.com> | 2023-01-29 10:30:54 +0000 |
commit | 66fa71a8f11b6ce5e8471b533f67cc3a1fdb85a8 (patch) | |
tree | 7aed7f385826a3bd88c76a373e28c6cfae4f396e /src | |
parent | 129c1201ea5c4418f0f89ad932633c7cea2439b7 (diff) | |
download | md-site-66fa71a8f11b6ce5e8471b533f67cc3a1fdb85a8.tar.gz md-site-66fa71a8f11b6ce5e8471b533f67cc3a1fdb85a8.zip |
Update to new mistune, removed old mistune, rewrite to python3
Diffstat (limited to 'src')
37 files changed, 4488 insertions, 1254 deletions
@@ -1,112 +1,114 @@ -#!/usr/bin/python2 +#!/usr/bin/python3 import os import os.path import sys + +#from jinja2 import Environment, PackageLoader, Template, FileSystemLoader from jinja2 import Environment, PackageLoader, Template, FileSystemLoader import mistune #global name for templates -article = {} - -class TocMixin(): - """TOC mixin for Renderer, mix this with Renderer:: - - class TocRenderer(TocMixin, Renderer): - pass - - toc = TocRenderer() - md = mistune.Markdown(renderer=toc) - - # required in this order - toc.reset_toc() # initial the status - md.parse(text) # parse for headers - toc.render_toc(level=3) # render TOC HTML - """ - - def reset_toc(self): - self.toc_tree = [] - self.toc_count = 0 - self.title = None - self.img_count = -1 - - def header(self, text, level, raw=None): - rv = "" - if level != 1: - rv = '<h%d id="toc-%d">%s</h%d>\n' % ( - level, self.toc_count, text, level - ) - else: - self.title = text - self.toc_tree.append((self.toc_count, text, level, raw)) - self.toc_count += 1 - return rv - - #Lets do previev image - def image(self, src, title, alt_text): - prev_img = src.split(".") - prev_img = prev_img[0]+"_prev."+prev_img[1] - self.img_count += 1 - return "<a href="+src+"><img src=\""+prev_img+"\" alt=\""+alt_text+"\"></a>" - - def render_toc(self, level=3): - """Render TOC to HTML. 
- - :param level: render toc to the given level - """ - return ''.join(self._iter_toc(level)) - - def _iter_toc(self, level): - first_level = None - last_level = None - cnt = 0 - - if (self.title != None): - yield '<h1>%s</h1>\n' % (self.title) - - yield '<ul id="table-of-content">\n' - - for toc in self.toc_tree: - index, text, l, raw = toc - index = cnt - - - if l > level: - # ignore this level - continue - - if first_level is None: - # based on first level - first_level = l - last_level = l - #yield '<li><a href="#toc-%d">%s</a>' % (cnt, text) - yield '' - elif last_level == l: - yield '</li>\n<li><a href="#toc-%d">%s</a>' % (cnt, text) - elif last_level == l - 1: - last_level = l - yield '<ul>\n<li><a href="#toc-%d">%s</a>' % (cnt, text) - elif last_level > l: - # close indention - yield '</li>' - while last_level > l: - yield '</ul>\n</li>\n' - last_level -= 1 - yield '<li><a href="#toc-%d">%s</a>' % (cnt, text) - cnt = cnt + 1 - - # close tags - yield '</li>\n' - while last_level > first_level: - yield '</ul>\n<!--</li>-->\n' - last_level -= 1 - - yield '</ul>\n' - self.toc_count = 0 - -class TocRenderer(TocMixin, mistune.Renderer): - pass +article = {}# + +#class TocMixin(): +# """TOC mixin for Renderer, mix this with Renderer::# + +# class TocRenderer(TocMixin, Renderer): +# pass# + +# toc = TocRenderer() +# md = mistune.Markdown(renderer=toc)# + +# # required in this order +# toc.reset_toc() # initial the status +# md.parse(text) # parse for headers +# toc.render_toc(level=3) # render TOC HTML +# """# + +# def reset_toc(self): +# self.toc_tree = [] +# self.toc_count = 0 +# self.title = None +# self.img_count = -1# + +# def header(self, text, level, raw=None): +# rv = "" +# if level != 1: +# rv = '<h%d id="toc-%d">%s</h%d>\n' % ( +# level, self.toc_count, text, level +# ) +# else: +# self.title = text +# self.toc_tree.append((self.toc_count, text, level, raw)) +# self.toc_count += 1 +# return rv# + +# #Lets do previev image +# def image(self, src, title, 
alt_text): +# prev_img = src.split(".") +# prev_img = prev_img[0]+"_prev."+prev_img[1] +# self.img_count += 1 +# return "<a href="+src+"><img src=\""+prev_img+"\" alt=\""+alt_text+"\"></a>"# + +# def render_toc(self, level=3): +# """Render TOC to HTML.# + +# :param level: render toc to the given level +# """ +# return ''.join(self._iter_toc(level))# + +# def _iter_toc(self, level): +# first_level = None +# last_level = None +# cnt = 0# + +# if (self.title != None): +# yield '<h1>%s</h1>\n' % (self.title)# + +# yield '<ul id="table-of-content">\n'# + +# for toc in self.toc_tree: +# index, text, l, raw = toc +# index = cnt +# # + +# if l > level: +# # ignore this level +# continue# + +# if first_level is None: +# # based on first level +# first_level = l +# last_level = l +# #yield '<li><a href="#toc-%d">%s</a>' % (cnt, text) +# yield '' +# elif last_level == l: +# yield '</li>\n<li><a href="#toc-%d">%s</a>' % (cnt, text) +# elif last_level == l - 1: +# last_level = l +# yield '<ul>\n<li><a href="#toc-%d">%s</a>' % (cnt, text) +# elif last_level > l: +# # close indention +# yield '</li>' +# while last_level > l: +# yield '</ul>\n</li>\n' +# last_level -= 1 +# yield '<li><a href="#toc-%d">%s</a>' % (cnt, text) +# cnt = cnt + 1# + +# # close tags +# yield '</li>\n' +# while last_level > first_level: +# yield '</ul>\n<!--</li>-->\n' +# last_level -= 1# + +# yield '</ul>\n' +# self.toc_count = 0# + +#class TocRenderer(TocMixin, mistune.Renderer): +# pass ################################################################################ #get first tags and use them to configure some bits @@ -129,27 +131,30 @@ def get_tags(data): ################################################################################ #check if there is input file if len(sys.argv) < 2: - print "Need more arguments" + print("ERROR: Exit . 
Need more arguments") sys.exit(0) md_fn = sys.argv[1] #template loader loader = FileSystemLoader( "/home/fam/downloads/source/repos/md-site/src/templ" ) -templ_env = Environment( loader = loader ) +templ_env = Environment( loader = loader )# -t = templ_env.get_template("main.thtml") +t = templ_env.get_template("main.thtml")# -#get md file +##get md file f = open( md_fn, "r" ) data = f.read() data = get_tags(data) -toc = TocRenderer() -md = mistune.Markdown(rule=True,renderer=toc) +#toc = TocRenderer() +## Create AST rendered + +## Create AST -> Markdown renderer + +## Mistune generate MTL +md_rend = mistune.html(data) -toc.reset_toc() -md.parse(data) -md_rend_toc = toc.render_toc(level=3) -md_rend = md_rend_toc + md.render( data ) +#print t.render( article=article, block = md_rend ) +print(t.render( article=article, block = md_rend )) -print t.render( article=article, block = md_rend )
\ No newline at end of file +f.close()
\ No newline at end of file diff --git a/src/mistune.py b/src/mistune.py deleted file mode 100644 index 4c711b2..0000000 --- a/src/mistune.py +++ /dev/null @@ -1,1143 +0,0 @@ -# coding: utf-8 -""" - mistune - ~~~~~~~ - - The fastest markdown parser in pure Python with renderer feature. - - :copyright: (c) 2014 - 2015 by Hsiaoming Yang. -""" - -import re -import inspect - -__version__ = '0.7.1' -__author__ = 'Hsiaoming Yang <me@lepture.com>' -__all__ = [ - 'BlockGrammar', 'BlockLexer', - 'InlineGrammar', 'InlineLexer', - 'Renderer', 'Markdown', - 'markdown', 'escape', -] - - -_key_pattern = re.compile(r'\s+') -_escape_pattern = re.compile(r'&(?!#?\w+;)') -_newline_pattern = re.compile(r'\r\n|\r') -_block_quote_leading_pattern = re.compile(r'^ *> ?', flags=re.M) -_block_code_leadning_pattern = re.compile(r'^ {4}', re.M) -_inline_tags = [ - 'a', 'em', 'strong', 'small', 's', 'cite', 'q', 'dfn', 'abbr', 'data', - 'time', 'code', 'var', 'samp', 'kbd', 'sub', 'sup', 'i', 'b', 'u', 'mark', - 'ruby', 'rt', 'rp', 'bdi', 'bdo', 'span', 'br', 'wbr', 'ins', 'del', - 'img', 'font', -] -_pre_tags = ['pre', 'script', 'style'] -_valid_end = r'(?!:/|[^\w\s@]*@)\b' -_valid_attr = r'''"[^"]*"|'[^']*'|[^'">]''' -_block_tag = r'(?!(?:%s)\b)\w+%s' % ('|'.join(_inline_tags), _valid_end) - - -def _pure_pattern(regex): - pattern = regex.pattern - if pattern.startswith('^'): - pattern = pattern[1:] - return pattern - - -def _keyify(key): - return _key_pattern.sub(' ', key.lower()) - - -def escape(text, quote=False, smart_amp=True): - """Replace special characters "&", "<" and ">" to HTML-safe sequences. - - The original cgi.escape will always escape "&", but you can control - this one for a smart escape amp. - - :param quote: if set to True, " and ' will be escaped. - :param smart_amp: if set to False, & will always be escaped. 
- """ - if smart_amp: - text = _escape_pattern.sub('&', text) - else: - text = text.replace('&', '&') - text = text.replace('<', '<') - text = text.replace('>', '>') - if quote: - text = text.replace('"', '"') - text = text.replace("'", ''') - return text - - -def preprocessing(text, tab=4): - text = _newline_pattern.sub('\n', text) - text = text.replace('\t', ' ' * tab) - text = text.replace('\u00a0', ' ') - text = text.replace('\u2424', '\n') - pattern = re.compile(r'^ +$', re.M) - return pattern.sub('', text) - - -class BlockGrammar(object): - """Grammars for block level tokens.""" - - def_links = re.compile( - r'^ *\[([^^\]]+)\]: *' # [key]: - r'<?([^\s>]+)>?' # <link> or link - r'(?: +["(]([^\n]+)[")])? *(?:\n+|$)' - ) - def_footnotes = re.compile( - r'^\[\^([^\]]+)\]: *(' - r'[^\n]*(?:\n+|$)' # [^key]: - r'(?: {1,}[^\n]*(?:\n+|$))*' - r')' - ) - - newline = re.compile(r'^\n+') - block_code = re.compile(r'^( {4}[^\n]+\n*)+') - fences = re.compile( - r'^ *(`{3,}|~{3,}) *(\S+)? *\n' # ```lang - r'([\s\S]+?)\s*' - r'\1 *(?:\n+|$)' # ``` - ) - hrule = re.compile(r'^ {0,3}[-*_](?: *[-*_]){2,} *(?:\n+|$)') - heading = re.compile(r'^ *(#{1,6}) *([^\n]+?) *#* *(?:\n+|$)') - lheading = re.compile(r'^([^\n]+)\n *(=|-)+ *(?:\n+|$)') - block_quote = re.compile(r'^( *>[^\n]+(\n[^\n]+)*\n*)+') - list_block = re.compile( - r'^( *)([*+-]|\d+\.) [\s\S]+?' - r'(?:' - r'\n+(?=\1?(?:[-*_] *){3,}(?:\n+|$))' # hrule - r'|\n+(?=%s)' # def links - r'|\n+(?=%s)' # def footnotes - r'|\n{2,}' - r'(?! )' - r'(?!\1(?:[*+-]|\d+\.) )\n*' - r'|' - r'\s*$)' % ( - _pure_pattern(def_links), - _pure_pattern(def_footnotes), - ) - ) - list_item = re.compile( - r'^(( *)(?:[*+-]|\d+\.) [^\n]*' - r'(?:\n(?!\2(?:[*+-]|\d+\.) )[^\n]*)*)', - flags=re.M - ) - list_bullet = re.compile(r'^ *(?:[*+-]|\d+\.) +') - paragraph = re.compile( - r'^((?:[^\n]+\n?(?!' 
- r'%s|%s|%s|%s|%s|%s|%s|%s|%s' - r'))+)\n*' % ( - _pure_pattern(fences).replace(r'\1', r'\2'), - _pure_pattern(list_block).replace(r'\1', r'\3'), - _pure_pattern(hrule), - _pure_pattern(heading), - _pure_pattern(lheading), - _pure_pattern(block_quote), - _pure_pattern(def_links), - _pure_pattern(def_footnotes), - '<' + _block_tag, - ) - ) - block_html = re.compile( - r'^ *(?:%s|%s|%s) *(?:\n{2,}|\s*$)' % ( - r'<!--[\s\S]*?-->', - r'<(%s)((?:%s)*?)>([\s\S]+?)<\/\1>' % (_block_tag, _valid_attr), - r'<%s(?:%s)*?>' % (_block_tag, _valid_attr), - ) - ) - table = re.compile( - r'^ *\|(.+)\n *\|( *[-:]+[-| :]*)\n((?: *\|.*(?:\n|$))*)\n*' - ) - nptable = re.compile( - r'^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*' - ) - text = re.compile(r'^[^\n]+') - - -class BlockLexer(object): - """Block level lexer for block grammars.""" - grammar_class = BlockGrammar - - default_rules = [ - 'newline', 'hrule', 'block_code', 'fences', 'heading', - 'nptable', 'lheading', 'block_quote', - 'list_block', 'block_html', 'def_links', - 'def_footnotes', 'table', 'paragraph', 'text' - ] - - list_rules = ( - 'newline', 'block_code', 'fences', 'lheading', 'hrule', - 'block_quote', 'list_block', 'block_html', 'text', - ) - - footnote_rules = ( - 'newline', 'block_code', 'fences', 'heading', - 'nptable', 'lheading', 'hrule', 'block_quote', - 'list_block', 'block_html', 'table', 'paragraph', 'text' - ) - - def __init__(self, rules=None, **kwargs): - self.tokens = [] - self.def_links = {} - self.def_footnotes = {} - - if not rules: - rules = self.grammar_class() - - self.rules = rules - - def __call__(self, text, rules=None): - return self.parse(text, rules) - - def parse(self, text, rules=None): - text = text.rstrip('\n') - - if not rules: - rules = self.default_rules - - def manipulate(text): - for key in rules: - rule = getattr(self.rules, key) - m = rule.match(text) - if not m: - continue - getattr(self, 'parse_%s' % key)(m) - return m - return False # pragma: no cover - - 
while text: - m = manipulate(text) - if m is not False: - text = text[len(m.group(0)):] - continue - if text: # pragma: no cover - raise RuntimeError('Infinite loop at: %s' % text) - return self.tokens - - def parse_newline(self, m): - length = len(m.group(0)) - if length > 1: - self.tokens.append({'type': 'newline'}) - - def parse_block_code(self, m): - # clean leading whitespace - code = _block_code_leadning_pattern.sub('', m.group(0)) - self.tokens.append({ - 'type': 'code', - 'lang': None, - 'text': code, - }) - - def parse_fences(self, m): - self.tokens.append({ - 'type': 'code', - 'lang': m.group(2), - 'text': m.group(3), - }) - - def parse_heading(self, m): - self.tokens.append({ - 'type': 'heading', - 'level': len(m.group(1)), - 'text': m.group(2), - }) - - def parse_lheading(self, m): - """Parse setext heading.""" - self.tokens.append({ - 'type': 'heading', - 'level': 1 if m.group(2) == '=' else 2, - 'text': m.group(1), - }) - - def parse_hrule(self, m): - self.tokens.append({'type': 'hrule'}) - - def parse_list_block(self, m): - bull = m.group(2) - self.tokens.append({ - 'type': 'list_start', - 'ordered': '.' 
in bull, - }) - cap = m.group(0) - self._process_list_item(cap, bull) - self.tokens.append({'type': 'list_end'}) - - def _process_list_item(self, cap, bull): - cap = self.rules.list_item.findall(cap) - - _next = False - length = len(cap) - - for i in range(length): - item = cap[i][0] - - # remove the bullet - space = len(item) - item = self.rules.list_bullet.sub('', item) - - # outdent - if '\n ' in item: - space = space - len(item) - pattern = re.compile(r'^ {1,%d}' % space, flags=re.M) - item = pattern.sub('', item) - - # determin whether item is loose or not - loose = _next - if not loose and re.search(r'\n\n(?!\s*$)', item): - loose = True - - rest = len(item) - if i != length - 1 and rest: - _next = item[rest-1] == '\n' - if not loose: - loose = _next - - if loose: - t = 'loose_item_start' - else: - t = 'list_item_start' - - self.tokens.append({'type': t}) - # recurse - self.parse(item, self.list_rules) - self.tokens.append({'type': 'list_item_end'}) - - def parse_block_quote(self, m): - self.tokens.append({'type': 'block_quote_start'}) - # clean leading > - cap = _block_quote_leading_pattern.sub('', m.group(0)) - self.parse(cap) - self.tokens.append({'type': 'block_quote_end'}) - - def parse_def_links(self, m): - key = _keyify(m.group(1)) - self.def_links[key] = { - 'link': m.group(2), - 'title': m.group(3), - } - - def parse_def_footnotes(self, m): - key = _keyify(m.group(1)) - if key in self.def_footnotes: - # footnote is already defined - return - - self.def_footnotes[key] = 0 - - self.tokens.append({ - 'type': 'footnote_start', - 'key': key, - }) - - text = m.group(2) - - if '\n' in text: - lines = text.split('\n') - whitespace = None - for line in lines[1:]: - space = len(line) - len(line.lstrip()) - if space and (not whitespace or space < whitespace): - whitespace = space - newlines = [lines[0]] - for line in lines[1:]: - newlines.append(line[whitespace:]) - text = '\n'.join(newlines) - - self.parse(text, self.footnote_rules) - - self.tokens.append({ - 
'type': 'footnote_end', - 'key': key, - }) - - def parse_table(self, m): - item = self._process_table(m) - - cells = re.sub(r'(?: *\| *)?\n$', '', m.group(3)) - cells = cells.split('\n') - for i, v in enumerate(cells): - v = re.sub(r'^ *\| *| *\| *$', '', v) - cells[i] = re.split(r' *\| *', v) - - item['cells'] = cells - self.tokens.append(item) - - def parse_nptable(self, m): - item = self._process_table(m) - - cells = re.sub(r'\n$', '', m.group(3)) - cells = cells.split('\n') - for i, v in enumerate(cells): - cells[i] = re.split(r' *\| *', v) - - item['cells'] = cells - self.tokens.append(item) - - def _process_table(self, m): - header = re.sub(r'^ *| *\| *$', '', m.group(1)) - header = re.split(r' *\| *', header) - align = re.sub(r' *|\| *$', '', m.group(2)) - align = re.split(r' *\| *', align) - - for i, v in enumerate(align): - if re.search(r'^ *-+: *$', v): - align[i] = 'right' - elif re.search(r'^ *:-+: *$', v): - align[i] = 'center' - elif re.search(r'^ *:-+ *$', v): - align[i] = 'left' - else: - align[i] = None - - item = { - 'type': 'table', - 'header': header, - 'align': align, - } - return item - - def parse_block_html(self, m): - tag = m.group(1) - if not tag: - text = m.group(0) - self.tokens.append({ - 'type': 'close_html', - 'text': text - }) - else: - attr = m.group(2) - text = m.group(3) - self.tokens.append({ - 'type': 'open_html', - 'tag': tag, - 'extra': attr, - 'text': text - }) - - def parse_paragraph(self, m): - text = m.group(1).rstrip('\n') - self.tokens.append({'type': 'paragraph', 'text': text}) - - def parse_text(self, m): - text = m.group(0) - self.tokens.append({'type': 'text', 'text': text}) - - -class InlineGrammar(object): - """Grammars for inline level tokens.""" - - escape = re.compile(r'^\\([\\`*{}\[\]()#+\-.!_>~|])') # \* \+ \! .... 
- inline_html = re.compile( - r'^(?:%s|%s|%s)' % ( - r'<!--[\s\S]*?-->', - r'<(\w+%s)((?:%s)*?)>([\s\S]*?)<\/\1>' % (_valid_end, _valid_attr), - r'<\w+%s(?:%s)*?>' % (_valid_end, _valid_attr), - ) - ) - autolink = re.compile(r'^<([^ >]+(@|:)[^ >]+)>') - link = re.compile( - r'^!?\[(' - r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*' - r')\]\(' - r'''\s*(<)?([\s\S]*?)(?(2)>)(?:\s+['"]([\s\S]*?)['"])?\s*''' - r'\)' - ) - reflink = re.compile( - r'^!?\[(' - r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*' - r')\]\s*\[([^^\]]*)\]' - ) - nolink = re.compile(r'^!?\[((?:\[[^\]]*\]|[^\[\]])*)\]') - url = re.compile(r'''^(https?:\/\/[^\s<]+[^<.,:;"')\]\s])''') - double_emphasis = re.compile( - r'^_{2}([\s\S]+?)_{2}(?!_)' # __word__ - r'|' - r'^\*{2}([\s\S]+?)\*{2}(?!\*)' # **word** - ) - emphasis = re.compile( - r'^\b_((?:__|[\s\S])+?)_\b' # _word_ - r'|' - r'^\*((?:\*\*|[\s\S])+?)\*(?!\*)' # *word* - ) - code = re.compile(r'^(`+)\s*([\s\S]*?[^`])\s*\1(?!`)') # `code` - linebreak = re.compile(r'^ {2,}\n(?!\s*$)') - strikethrough = re.compile(r'^~~(?=\S)([\s\S]+?\S)~~') # ~~word~~ - footnote = re.compile(r'^\[\^([^\]]+)\]') - text = re.compile(r'^[\s\S]+?(?=[\\<!\[_*`~]|https?://| {2,}\n|$)') - - def hard_wrap(self): - """Grammar for hard wrap linebreak. You don't need to add two - spaces at the end of a line. 
- """ - self.linebreak = re.compile(r'^ *\n(?!\s*$)') - self.text = re.compile( - r'^[\s\S]+?(?=[\\<!\[_*`~]|https?://| *\n|$)' - ) - - -class InlineLexer(object): - """Inline level lexer for inline grammars.""" - grammar_class = InlineGrammar - - default_rules = [ - 'escape', 'inline_html', 'autolink', 'url', - 'footnote', 'link', 'reflink', 'nolink', - 'double_emphasis', 'emphasis', 'code', - 'linebreak', 'strikethrough', 'text', - ] - inline_html_rules = [ - 'escape', 'autolink', 'url', 'link', 'reflink', - 'nolink', 'double_emphasis', 'emphasis', 'code', - 'linebreak', 'strikethrough', 'text', - ] - - def __init__(self, renderer, rules=None, **kwargs): - self.renderer = renderer - self.links = {} - self.footnotes = {} - self.footnote_index = 0 - - if not rules: - rules = self.grammar_class() - - self.rules = rules - - self._in_link = False - self._in_footnote = False - - kwargs.update(self.renderer.options) - self._parse_inline_html = kwargs.get('parse_inline_html') - - def __call__(self, text, rules=None): - return self.output(text, rules) - - def setup(self, links, footnotes): - self.footnote_index = 0 - self.links = links or {} - self.footnotes = footnotes or {} - - def output(self, text, rules=None): - text = text.rstrip('\n') - if not rules: - rules = list(self.default_rules) - - if self._in_footnote and 'footnote' in rules: - rules.remove('footnote') - - output = self.renderer.placeholder() - - def manipulate(text): - for key in rules: - pattern = getattr(self.rules, key) - m = pattern.match(text) - if not m: - continue - self.line_match = m - out = getattr(self, 'output_%s' % key)(m) - if out is not None: - return m, out - return False # pragma: no cover - - self.line_started = False - while text: - ret = manipulate(text) - self.line_started = True - if ret is not False: - m, out = ret - output += out - text = text[len(m.group(0)):] - continue - if text: # pragma: no cover - raise RuntimeError('Infinite loop at: %s' % text) - - return output - - def 
output_escape(self, m): - return m.group(1) - - def output_autolink(self, m): - link = m.group(1) - if m.group(2) == '@': - is_email = True - else: - is_email = False - return self.renderer.autolink(link, is_email) - - def output_url(self, m): - link = m.group(1) - if self._in_link: - return self.renderer.text(link) - return self.renderer.autolink(link, False) - - def output_inline_html(self, m): - tag = m.group(1) - if self._parse_inline_html and tag in _inline_tags: - text = m.group(3) - if tag == 'a': - self._in_link = True - text = self.output(text, rules=self.inline_html_rules) - self._in_link = False - else: - text = self.output(text, rules=self.inline_html_rules) - extra = m.group(2) or '' - html = '<%s%s>%s</%s>' % (tag, extra, text, tag) - else: - html = m.group(0) - return self.renderer.inline_html(html) - - def output_footnote(self, m): - key = _keyify(m.group(1)) - if key not in self.footnotes: - return None - if self.footnotes[key]: - return None - self.footnote_index += 1 - self.footnotes[key] = self.footnote_index - return self.renderer.footnote_ref(key, self.footnote_index) - - def output_link(self, m): - return self._process_link(m, m.group(3), m.group(4)) - - def output_reflink(self, m): - key = _keyify(m.group(2) or m.group(1)) - if key not in self.links: - return None - ret = self.links[key] - return self._process_link(m, ret['link'], ret['title']) - - def output_nolink(self, m): - key = _keyify(m.group(1)) - if key not in self.links: - return None - ret = self.links[key] - return self._process_link(m, ret['link'], ret['title']) - - def _process_link(self, m, link, title=None): - line = m.group(0) - text = m.group(1) - if line[0] == '!': - return self.renderer.image(link, title, text) - - self._in_link = True - text = self.output(text) - self._in_link = False - return self.renderer.link(link, title, text) - - def output_double_emphasis(self, m): - text = m.group(2) or m.group(1) - text = self.output(text) - return 
self.renderer.double_emphasis(text) - - def output_emphasis(self, m): - text = m.group(2) or m.group(1) - text = self.output(text) - return self.renderer.emphasis(text) - - def output_code(self, m): - text = m.group(2) - return self.renderer.codespan(text) - - def output_linebreak(self, m): - return self.renderer.linebreak() - - def output_strikethrough(self, m): - text = self.output(m.group(1)) - return self.renderer.strikethrough(text) - - def output_text(self, m): - text = m.group(0) - return self.renderer.text(text) - - -class Renderer(object): - """The default HTML renderer for rendering Markdown. - """ - - def __init__(self, **kwargs): - self.options = kwargs - - def placeholder(self): - """Returns the default, empty output value for the renderer. - - All renderer methods use the '+=' operator to append to this value. - Default is a string so rendering HTML can build up a result string with - the rendered Markdown. - - Can be overridden by Renderer subclasses to be types like an empty - list, allowing the renderer to create a tree-like structure to - represent the document (which can then be reprocessed later into a - separate format like docx or pdf). - """ - return '' - - def block_code(self, code, lang=None): - """Rendering block level code. ``pre > code``. - - :param code: text content of the code block. - :param lang: language of the given code. - """ - code = code.rstrip('\n') - if not lang: - code = escape(code, smart_amp=False) - return '<pre><code>%s\n</code></pre>\n' % code - code = escape(code, quote=True, smart_amp=False) - #return '<pre><code class="lang-%s">%s\n</code></pre>\n' % (lang, code) - return '<pre><code class="%s">%s\n</code></pre>\n' % (lang, code) - - - def block_quote(self, text): - """Rendering <blockquote> with the given text. - - :param text: text content of the blockquote. - """ - return '<blockquote>%s\n</blockquote>\n' % text.rstrip('\n') - - def block_html(self, html): - """Rendering block level pure html content. 
- - :param html: text content of the html snippet. - """ - if self.options.get('skip_style') and \ - html.lower().startswith('<style'): - return '' - if self.options.get('escape'): - return escape(html) - return html - - def header(self, text, level, raw=None): - """Rendering header/heading tags like ``<h1>`` ``<h2>``. - - :param text: rendered text content for the header. - :param level: a number for the header level, for example: 1. - :param raw: raw text content of the header. - """ - return '<h%d>%s</h%d>\n' % (level, text, level) - - def hrule(self): - """Rendering method for ``<hr>`` tag.""" - if self.options.get('use_xhtml'): - return '<hr />\n' - return '<hr>\n' - - def list(self, body, ordered=True): - """Rendering list tags like ``<ul>`` and ``<ol>``. - - :param body: body contents of the list. - :param ordered: whether this list is ordered or not. - """ - tag = 'ul' - if ordered: - tag = 'ol' - return '<%s>\n%s</%s>\n' % (tag, body, tag) - - def list_item(self, text): - """Rendering list item snippet. Like ``<li>``.""" - return '<li>%s</li>\n' % text - - def paragraph(self, text): - """Rendering paragraph tags. Like ``<p>``.""" - return '<p>%s</p>\n' % text.strip(' ') - - def table(self, header, body): - """Rendering table element. Wrap header and body in it. - - :param header: header part of the table. - :param body: body part of the table. - """ - return ( - '<table>\n<thead>%s</thead>\n' - '<tbody>\n%s</tbody>\n</table>\n' - ) % (header, body) - - def table_row(self, content): - """Rendering a table row. Like ``<tr>``. - - :param content: content of current table row. - """ - return '<tr>\n%s</tr>\n' % content - - def table_cell(self, content, **flags): - """Rendering a table cell. Like ``<th>`` ``<td>``. - - :param content: content of current table cell. - :param header: whether this is header or not. - :param align: align of current table cell. 
- """ - if flags['header']: - tag = 'th' - else: - tag = 'td' - align = flags['align'] - if not align: - return '<%s>%s</%s>\n' % (tag, content, tag) - return '<%s style="text-align:%s">%s</%s>\n' % ( - tag, align, content, tag - ) - - def double_emphasis(self, text): - """Rendering **strong** text. - - :param text: text content for emphasis. - """ - return '<strong>%s</strong>' % text - - def emphasis(self, text): - """Rendering *emphasis* text. - - :param text: text content for emphasis. - """ - return '<em>%s</em>' % text - - def codespan(self, text): - """Rendering inline `code` text. - - :param text: text content for inline code. - """ - text = escape(text.rstrip(), smart_amp=False) - return '<code>%s</code>' % text - - def linebreak(self): - """Rendering line break like ``<br>``.""" - if self.options.get('use_xhtml'): - return '<br />\n' - return '<br>\n' - - def strikethrough(self, text): - """Rendering ~~strikethrough~~ text. - - :param text: text content for strikethrough. - """ - return '<del>%s</del>' % text - - def text(self, text): - """Rendering unformatted text. - - :param text: text content. - """ - return escape(text) - - def autolink(self, link, is_email=False): - """Rendering a given link or email address. - - :param link: link content or email address. - :param is_email: whether this is an email or not. - """ - text = link = escape(link) - if is_email: - link = 'mailto:%s' % link - return '<a href="%s">%s</a>' % (link, text) - - def link(self, link, title, text): - """Rendering a given link with content and title. - - :param link: href link for ``<a>`` tag. - :param title: title content for `title` attribute. - :param text: text content for description. 
- """ - if link.startswith('javascript:'): - link = '' - if not title: - return '<a href="%s">%s</a>' % (link, text) - title = escape(title, quote=True) - return '<a href="%s" title="%s">%s</a>' % (link, title, text) - - def image(self, src, title, text): - """Rendering a image with title and text. - - :param src: source link of the image. - :param title: title text of the image. - :param text: alt text of the image. - """ - if src.startswith('javascript:'): - src = '' - text = escape(text, quote=True) - if title: - title = escape(title, quote=True) - html = '<img src="%s" alt="%s" title="%s"' % (src, text, title) - else: - html = '<img src="%s" alt="%s"' % (src, text) - if self.options.get('use_xhtml'): - return '%s />' % html - return '%s>' % html - - def inline_html(self, html): - """Rendering span level pure html content. - - :param html: text content of the html snippet. - """ - if self.options.get('escape'): - return escape(html) - return html - - def newline(self): - """Rendering newline element.""" - return '' - - def footnote_ref(self, key, index): - """Rendering the ref anchor of a footnote. - - :param key: identity key for the footnote. - :param index: the index count of current footnote. - """ - html = ( - '<sup class="footnote-ref" id="fnref-%s">' - '<a href="#fn-%s" rel="footnote">%d</a></sup>' - ) % (escape(key), escape(key), index) - return html - - def footnote_item(self, key, text): - """Rendering a footnote item. - - :param key: identity key for the footnote. - :param text: text content of the footnote. - """ - back = ( - '<a href="#fnref-%s" rev="footnote">↩</a>' - ) % escape(key) - text = text.rstrip() - if text.endswith('</p>'): - text = re.sub(r'<\/p>$', r'%s</p>' % back, text) - else: - text = '%s<p>%s</p>' % (text, back) - html = '<li id="fn-%s">%s</li>\n' % (escape(key), text) - return html - - def footnotes(self, text): - """Wrapper for all footnotes. - - :param text: contents of all footnotes. 
- """ - html = '<div class="footnotes">\n%s<ol>%s</ol>\n</div>\n' - return html % (self.hrule(), text) - - -class Markdown(object): - """The Markdown parser. - - :param renderer: An instance of ``Renderer``. - :param inline: An inline lexer class or instance. - :param block: A block lexer class or instance. - """ - def __init__(self, renderer=None, inline=None, block=None, **kwargs): - if not renderer: - renderer = Renderer(**kwargs) - - self.renderer = renderer - - if inline and inspect.isclass(inline): - inline = inline(renderer, **kwargs) - if block and inspect.isclass(block): - block = block(**kwargs) - - if inline: - self.inline = inline - else: - rules = InlineGrammar() - if kwargs.get('hard_wrap'): - rules.hard_wrap() - self.inline = InlineLexer(renderer, rules=rules) - - self.block = block or BlockLexer(BlockGrammar()) - self.options = kwargs - self.footnotes = [] - self.tokens = [] - - # detect if it should parse text in block html - self._parse_block_html = kwargs.get('parse_block_html') - - def __call__(self, text): - return self.parse(text) - - def render(self, text): - """Render the Markdown text. - - :param text: markdown formatted text content. 
- """ - return self.parse(text) - - def parse(self, text): - out = self.output(preprocessing(text)) - - keys = self.block.def_footnotes - - # reset block - self.block.def_links = {} - self.block.def_footnotes = {} - - # reset inline - self.inline.links = {} - self.inline.footnotes = {} - - if not self.footnotes: - return out - - footnotes = filter(lambda o: keys.get(o['key']), self.footnotes) - self.footnotes = sorted( - footnotes, key=lambda o: keys.get(o['key']), reverse=True - ) - - body = self.renderer.placeholder() - while self.footnotes: - note = self.footnotes.pop() - body += self.renderer.footnote_item( - note['key'], note['text'] - ) - - out += self.renderer.footnotes(body) - return out - - def pop(self): - if not self.tokens: - return None - self.token = self.tokens.pop() - return self.token - - def peek(self): - if self.tokens: - return self.tokens[-1] - return None # pragma: no cover - - def output(self, text, rules=None): - self.tokens = self.block(text, rules) - self.tokens.reverse() - - self.inline.setup(self.block.def_links, self.block.def_footnotes) - - out = self.renderer.placeholder() - while self.pop(): - out += self.tok() - return out - - def tok(self): - t = self.token['type'] - - # sepcial cases - if t.endswith('_start'): - t = t[:-6] - - return getattr(self, 'output_%s' % t)() - - def tok_text(self): - text = self.token['text'] - while self.peek()['type'] == 'text': - text += '\n' + self.pop()['text'] - return self.inline(text) - - def output_newline(self): - return self.renderer.newline() - - def output_hrule(self): - return self.renderer.hrule() - - def output_heading(self): - return self.renderer.header( - self.inline(self.token['text']), - self.token['level'], - self.token['text'], - ) - - def output_code(self): - return self.renderer.block_code( - self.token['text'], self.token['lang'] - ) - - def output_table(self): - aligns = self.token['align'] - aligns_length = len(aligns) - cell = self.renderer.placeholder() - - # header part - 
header = self.renderer.placeholder() - for i, value in enumerate(self.token['header']): - align = aligns[i] if i < aligns_length else None - flags = {'header': True, 'align': align} - cell += self.renderer.table_cell(self.inline(value), **flags) - - header += self.renderer.table_row(cell) - - # body part - body = self.renderer.placeholder() - for i, row in enumerate(self.token['cells']): - cell = self.renderer.placeholder() - for j, value in enumerate(row): - align = aligns[j] if j < aligns_length else None - flags = {'header': False, 'align': align} - cell += self.renderer.table_cell(self.inline(value), **flags) - body += self.renderer.table_row(cell) - - return self.renderer.table(header, body) - - def output_block_quote(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'block_quote_end': - body += self.tok() - return self.renderer.block_quote(body) - - def output_list(self): - ordered = self.token['ordered'] - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_end': - body += self.tok() - return self.renderer.list(body, ordered) - - def output_list_item(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_item_end': - if self.token['type'] == 'text': - body += self.tok_text() - else: - body += self.tok() - - return self.renderer.list_item(body) - - def output_loose_item(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_item_end': - body += self.tok() - return self.renderer.list_item(body) - - def output_footnote(self): - self.inline._in_footnote = True - body = self.renderer.placeholder() - key = self.token['key'] - while self.pop()['type'] != 'footnote_end': - body += self.tok() - self.footnotes.append({'key': key, 'text': body}) - self.inline._in_footnote = False - return self.renderer.placeholder() - - def output_close_html(self): - text = self.token['text'] - return self.renderer.block_html(text) - - def output_open_html(self): - text = self.token['text'] - 
tag = self.token['tag'] - if self._parse_block_html and tag not in _pre_tags: - text = self.inline(text, rules=self.inline.inline_html_rules) - extra = self.token.get('extra') or '' - html = '<%s%s>%s</%s>' % (tag, extra, text, tag) - return self.renderer.block_html(html) - - def output_paragraph(self): - return self.renderer.paragraph(self.inline(self.token['text'])) - - def output_text(self): - return self.renderer.paragraph(self.tok_text()) - - -def markdown(text, escape=True, **kwargs): - """Render markdown formatted text to html. - - :param text: markdown formatted text content. - :param escape: if set to False, all html tags will not be escaped. - :param use_xhtml: output with xhtml tags. - :param hard_wrap: if set to True, it will has GFM line breaks feature. - :param parse_block_html: parse text only in block level html. - :param parse_inline_html: parse text only in inline level html. - """ - return Markdown(escape=escape, **kwargs)(text) diff --git a/src/mistune/__init__.py b/src/mistune/__init__.py new file mode 100644 index 0000000..4de95c5 --- /dev/null +++ b/src/mistune/__init__.py @@ -0,0 +1,75 @@ +""" + mistune + ~~~~~~~ + + A fast yet powerful Python Markdown parser with renderers and + plugins, compatible with sane CommonMark rules. + + Documentation: https://mistune.lepture.com/ +""" + +from .markdown import Markdown +from .core import BlockState, InlineState, BaseRenderer +from .block_parser import BlockParser +from .inline_parser import InlineParser +from .renderers.html import HTMLRenderer +from .util import escape, escape_url, safe_entity, unikey +from .plugins import import_plugin + + +def create_markdown(escape=True, hard_wrap=False, renderer='html', plugins=None): + """Create a Markdown instance based on the given condition. + + :param escape: Boolean. If using html renderer, escape html. + :param hard_wrap: Boolean. Break every new line into ``<br>``. + :param renderer: renderer instance, default is HTMLRenderer. 
+ :param plugins: List of plugins. + + This method is used when you want to re-use a Markdown instance:: + + markdown = create_markdown( + escape=False, + hard_wrap=True, + ) + # re-use markdown function + markdown('.... your text ...') + """ + if renderer == 'html': + renderer = HTMLRenderer(escape=escape) + + inline = InlineParser(hard_wrap=hard_wrap) + if plugins is not None: + plugins = [import_plugin(n) for n in plugins] + return Markdown(renderer=renderer, inline=inline, plugins=plugins) + + +html = create_markdown( + escape=False, + plugins=['strikethrough', 'footnotes', 'table', 'speedup'] +) + + +__cached_parsers = {} + + +def markdown(text, escape=True, renderer='html', plugins=None): + key = (escape, renderer, plugins) + if key in __cached_parsers: + return __cached_parsers[key](text) + + md = create_markdown(escape=escape, renderer=renderer, plugins=plugins) + # improve the speed for markdown parser creation + __cached_parsers[key] = md + return md(text) + + +__all__ = [ + 'Markdown', 'HTMLRenderer', + 'BlockParser', 'BlockState', 'BaseRenderer', + 'InlineParser', 'InlineState', + 'escape', 'escape_url', 'safe_entity', 'unikey', + 'html', 'create_markdown', 'markdown', +] + +__version__ = '3.0.0rc4' +__homepage__ = 'https://mistune.lepture.com/' diff --git a/src/mistune/__main__.py b/src/mistune/__main__.py new file mode 100644 index 0000000..053a379 --- /dev/null +++ b/src/mistune/__main__.py @@ -0,0 +1,124 @@ +import sys +import argparse +from .renderers.rst import RSTRenderer +from .renderers.markdown import MarkdownRenderer +from . 
import ( + create_markdown, + __version__ as version +) + + +def _md(args): + if args.plugin: + plugins = args.plugin + else: + # default plugins + plugins = ['strikethrough', 'footnotes', 'table', 'speedup'] + + if args.renderer == 'rst': + renderer = RSTRenderer() + elif args.renderer == 'markdown': + renderer = MarkdownRenderer() + else: + renderer = args.renderer + return create_markdown( + escape=args.escape, + hard_wrap=args.hardwrap, + renderer=renderer, + plugins=plugins, + ) + + +def _output(text, args): + if args.output: + with open(args.output, 'w') as f: + f.write(text) + else: + print(text) + + +CMD_HELP = '''Mistune, a sane and fast python markdown parser. + +Here are some use cases of the command line tool: + + $ python -m mistune -m "Hi **Markdown**" + <p>Hi <strong>Markdown</strong></p> + + $ python -m mistune -f README.md + <p>... + + $ cat README.md | python -m mistune + <p>... +''' + + +def cli(): + parser = argparse.ArgumentParser( + prog='python -m mistune', + description=CMD_HELP, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + '-m', '--message', + help='the markdown message to convert', + ) + parser.add_argument( + '-f', '--file', + help='the markdown file to convert', + ) + parser.add_argument( + '-p', '--plugin', + metavar='NAME', + action='extend', + nargs='+', + help='specifiy a plugin to use', + ) + parser.add_argument( + '--escape', + action='store_true', + help='turn on escape option', + ) + parser.add_argument( + '--hardwrap', + action='store_true', + help='turn on hardwrap option', + ) + parser.add_argument( + '-o', '--output', + help='write the rendered result into file', + ) + parser.add_argument( + '-r', '--renderer', + default='html', + help='specify the output renderer', + ) + parser.add_argument('--version', action='version', version='mistune ' + version) + args = parser.parse_args() + + message = args.message + if not message and not args.file: + message = read_stdin() + + if message: + md 
= _md(args) + text = md(message) + _output(text, args) + elif args.file: + md = _md(args) + text = md.read(args.file)[0] + _output(text, args) + else: + print('You MUST specify a message or file') + return sys.exit(1) + + +def read_stdin(): + is_stdin_pipe = not sys.stdin.isatty() + if is_stdin_pipe: + return sys.stdin.read() + else: + return None + + +if __name__ == '__main__': + cli() diff --git a/src/mistune/block_parser.py b/src/mistune/block_parser.py new file mode 100644 index 0000000..1ed79ec --- /dev/null +++ b/src/mistune/block_parser.py @@ -0,0 +1,486 @@ +import re +from typing import Optional, List, Tuple +from .util import ( + unikey, + escape_url, + expand_tab, + expand_leading_tab, +) +from .core import Parser, BlockState +from .helpers import ( + LINK_LABEL, + HTML_TAGNAME, + HTML_ATTRIBUTES, + BLOCK_TAGS, + PRE_TAGS, + unescape_char, + parse_link_href, + parse_link_title, +) +from .list_parser import parse_list, LIST_PATTERN + +_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.M) +_AXT_HEADING_TRIM = re.compile(r'(\s+|^)#+\s*$') +_BLOCK_QUOTE_TRIM = re.compile(r'^ ?', flags=re.M) +_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.M) + +_LINE_BLANK_END = re.compile(r'\n[ \t]*\n$') +_BLANK_TO_LINE = re.compile(r'[ \t]*\n') + +_BLOCK_TAGS_PATTERN = '|'.join(BLOCK_TAGS) + '|' + '|'.join(PRE_TAGS) +_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r'[ \t]*>[ \t]*(?:\n|$)') +_CLOSE_TAG_END = re.compile(r'[ \t]*>[ \t]*(?:\n|$)') +_STRICT_BLOCK_QUOTE = re.compile(r'( {0,3}>[^\n]*(?:\n|$))+') + + +class BlockParser(Parser): + BLANK_LINE = re.compile(r'(^[ \t\v\f]*\n)+', re.M) + + RAW_HTML = ( + r'^ {0,3}(' + r'</?' + HTML_TAGNAME + r'|' + r'<!--|' # comment + r'<\?|' # script + r'<![A-Z]|' + r'<!\[CDATA\[)' + ) + + BLOCK_HTML = ( + r'^ {0,3}(?:' + r'(?:</?' + _BLOCK_TAGS_PATTERN + r'(?:[ \t]+|\n|$))' + r'|<!--' # comment + r'|<\?' 
# script + r'|<![A-Z]' + r'|<!\[CDATA\[)' + ) + + SPECIFICATION = { + 'blank_line': r'(^[ \t\v\f]*\n)+', + 'axt_heading': r'^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*|[ \t]+.*?)$', + 'setex_heading': r'^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$', + 'fenced_code': ( + r'^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})' + r'[ \t]*(?P<fenced_3>.*?)$' + ), + 'indent_code': ( + r'^(?: {4}| *\t)[^\n]+(?:\n+|$)' + r'((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*' + ), + 'thematic_break': r'^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$', + 'ref_link': r'^ {0,3}\[(?P<reflink_1>' + LINK_LABEL + r')\]:', + 'block_quote': r'^ {0,3}>(?P<quote_1>.*?)$', + 'list': LIST_PATTERN, + 'block_html': BLOCK_HTML, + 'raw_html': RAW_HTML, + } + + DEFAULT_RULES = ( + 'fenced_code', + 'indent_code', + 'axt_heading', + 'setex_heading', + 'thematic_break', + 'block_quote', + 'list', + 'ref_link', + 'raw_html', + 'blank_line', + ) + + def __init__( + self, + block_quote_rules: Optional[List[str]]=None, + list_rules: Optional[List[str]]=None, + max_nested_level: int=6 + ): + super(BlockParser, self).__init__() + + if block_quote_rules is None: + block_quote_rules = list(self.DEFAULT_RULES) + + if list_rules is None: + list_rules = list(self.DEFAULT_RULES) + + self.block_quote_rules = block_quote_rules + self.list_rules = list_rules + self.max_nested_level = max_nested_level + # register default parse methods + self._methods = { + name: getattr(self, 'parse_' + name) for name in self.SPECIFICATION + } + + def parse_blank_line(self, m: re.Match, state: BlockState) -> int: + """Parse token for blank lines.""" + state.append_token({'type': 'blank_line'}) + return m.end() + + def parse_thematic_break(self, m: re.Match, state: BlockState) -> int: + """Parse token for thematic break, e.g. 
``<hr>`` tag in HTML.""" + state.append_token({'type': 'thematic_break'}) + # $ does not count '\n' + return m.end() + 1 + + def parse_indent_code(self, m: re.Match, state: BlockState) -> int: + """Parse token for code block which is indented by 4 spaces.""" + # it is a part of the paragraph + end_pos = state.append_paragraph() + if end_pos: + return end_pos + + code = m.group(0) + code = expand_leading_tab(code) + code = _INDENT_CODE_TRIM.sub('', code) + code = code.strip('\n') + state.append_token({'type': 'block_code', 'raw': code, 'style': 'indent'}) + return m.end() + + def parse_fenced_code(self, m: re.Match, state: BlockState) -> Optional[int]: + """Parse token for fenced code block. A fenced code block is started with + 3 or more backtick(`) or tilde(~). + + An example of a fenced code block: + + .. code-block:: markdown + + ```python + def markdown(text): + return mistune.html(text) + ``` + """ + spaces = m.group('fenced_1') + marker = m.group('fenced_2') + info = m.group('fenced_3') + + c = marker[0] + if info and c == '`': + # CommonMark Example 145 + # Info strings for backtick code blocks cannot contain backticks + if info.find(c) != -1: + return + + _end = re.compile( + r'^ {0,3}' + c + '{' + str(len(marker)) + r',}[ \t]*(?:\n|$)', re.M) + cursor_start = m.end() + 1 + + m2 = _end.search(state.src, cursor_start) + if m2: + code = state.src[cursor_start:m2.start()] + end_pos = m2.end() + else: + code = state.src[cursor_start:] + end_pos = state.cursor_max + + if spaces and code: + _trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M) + code = _trim_pattern.sub('', code) + + token = {'type': 'block_code', 'raw': code, 'style': 'fenced', 'marker': marker} + if info: + info = unescape_char(info) + token['attrs'] = {'info': info.strip()} + + state.append_token(token) + return end_pos + + def parse_axt_heading(self, m: re.Match, state: BlockState) -> int: + """Parse token for AXT heading. 
An AXT heading is started with 1 to 6 + symbol of ``#``.""" + level = len(m.group('axt_1')) + text = m.group('axt_2').strip() + # remove last # + if text: + text = _AXT_HEADING_TRIM.sub('', text) + + token = {'type': 'heading', 'text': text, 'attrs': {'level': level}, 'style': 'axt'} + state.append_token(token) + return m.end() + 1 + + def parse_setex_heading(self, m: re.Match, state: BlockState) -> Optional[int]: + """Parse token for setex style heading. A setex heading syntax looks like: + + .. code-block:: markdown + + H1 title + ======== + """ + last_token = state.last_token() + if last_token and last_token['type'] == 'paragraph': + level = 1 if m.group('setext_1') == '=' else 2 + last_token['type'] = 'heading' + last_token['style'] = 'setext' + last_token['attrs'] = {'level': level} + return m.end() + 1 + + sc = self.compile_sc(['thematic_break', 'list']) + m = sc.match(state.src, state.cursor) + if m: + return self.parse_method(m, state) + + def parse_ref_link(self, m: re.Match, state: BlockState) -> Optional[int]: + """Parse link references and save the link information into ``state.env``. + + Here is an example of a link reference: + + .. 
code-block:: markdown + + a [link][example] + + [example]: https://example.com "Optional title" + + This method will save the link reference into ``state.env`` as:: + + state.env['ref_links']['example'] = { + 'url': 'https://example.com', + 'title': "Optional title", + } + """ + end_pos = state.append_paragraph() + if end_pos: + return end_pos + + label = m.group('reflink_1') + key = unikey(label) + if not key: + return + + href, href_pos = parse_link_href(state.src, m.end(), block=True) + if href is None: + return + + _blank = self.BLANK_LINE.search(state.src, href_pos) + if _blank: + max_pos = _blank.start() + else: + max_pos = state.cursor_max + + title, title_pos = parse_link_title(state.src, href_pos, max_pos) + if title_pos: + m = _BLANK_TO_LINE.match(state.src, title_pos) + if m: + title_pos = m.end() + else: + title_pos = None + title = None + + if title_pos is None: + m = _BLANK_TO_LINE.match(state.src, href_pos) + if m: + href_pos = m.end() + else: + href_pos = None + href = None + + end_pos = title_pos or href_pos + if not end_pos: + return + + if key not in state.env['ref_links']: + href = unescape_char(href) + data = {'url': escape_url(href), 'label': label} + if title: + data['title'] = title + state.env['ref_links'][key] = data + return end_pos + + def extract_block_quote(self, m: re.Match, state: BlockState) -> Tuple[str, int]: + """Extract text and cursor end position of a block quote.""" + + # cleanup at first to detect if it is code block + text = m.group('quote_1') + '\n' + text = expand_leading_tab(text, 3) + text = _BLOCK_QUOTE_TRIM.sub('', text) + + sc = self.compile_sc(['blank_line', 'indent_code', 'fenced_code']) + require_marker = bool(sc.match(text)) + + state.cursor = m.end() + 1 + + end_pos = None + if require_marker: + m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor) + if m: + quote = m.group(0) + quote = _BLOCK_QUOTE_LEADING.sub('', quote) + quote = expand_leading_tab(quote, 3) + quote = _BLOCK_QUOTE_TRIM.sub('', quote) + text 
+= quote + state.cursor = m.end() + else: + prev_blank_line = False + break_sc = self.compile_sc([ + 'blank_line', 'thematic_break', 'fenced_code', + 'list', 'block_html', + ]) + while state.cursor < state.cursor_max: + m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor) + if m: + quote = m.group(0) + quote = _BLOCK_QUOTE_LEADING.sub('', quote) + quote = expand_leading_tab(quote, 3) + quote = _BLOCK_QUOTE_TRIM.sub('', quote) + text += quote + state.cursor = m.end() + if not quote.strip(): + prev_blank_line = True + else: + prev_blank_line = bool(_LINE_BLANK_END.search(quote)) + continue + + if prev_blank_line: + # CommonMark Example 249 + # because of laziness, a blank line is needed between + # a block quote and a following paragraph + break + + m = break_sc.match(state.src, state.cursor) + if m: + end_pos = self.parse_method(m, state) + if end_pos: + break + + # lazy continuation line + pos = state.find_line_end() + line = state.get_text(pos) + line = expand_leading_tab(line, 3) + text += line + state.cursor = pos + + # according to CommonMark Example 6, the second tab should be + # treated as 4 spaces + return expand_tab(text), end_pos + + def parse_block_quote(self, m: re.Match, state: BlockState) -> int: + """Parse token for block quote. Here is an example of the syntax: + + .. 
code-block:: markdown + + > a block quote starts + > with right arrows + """ + text, end_pos = self.extract_block_quote(m, state) + # scan children state + child = state.child_state(text) + if state.depth() >= self.max_nested_level - 1: + rules = list(self.block_quote_rules) + rules.remove('block_quote') + else: + rules = self.block_quote_rules + + self.parse(child, rules) + token = {'type': 'block_quote', 'children': child.tokens} + if end_pos: + state.prepend_token(token) + return end_pos + state.append_token(token) + return state.cursor + + def parse_list(self, m: re.Match, state: BlockState) -> int: + """Parse tokens for ordered and unordered list.""" + return parse_list(self, m, state) + + def parse_block_html(self, m: re.Match, state: BlockState) -> Optional[int]: + return self.parse_raw_html(m, state) + + def parse_raw_html(self, m: re.Match, state: BlockState) -> Optional[int]: + marker = m.group(0).strip() + + # rule 2 + if marker == '<!--': + return _parse_html_to_end(state, '-->', m.end()) + + # rule 3 + if marker == '<?': + return _parse_html_to_end(state, '?>', m.end()) + + # rule 5 + if marker == '<![CDATA[': + return _parse_html_to_end(state, ']]>', m.end()) + + # rule 4 + if marker.startswith('<!'): + return _parse_html_to_end(state, '>', m.end()) + + close_tag = None + open_tag = None + if marker.startswith('</'): + close_tag = marker[2:].lower() + # rule 6 + if close_tag in BLOCK_TAGS: + return _parse_html_to_newline(state, self.BLANK_LINE) + else: + open_tag = marker[1:].lower() + # rule 1 + if open_tag in PRE_TAGS: + end_tag = '</' + open_tag + '>' + return _parse_html_to_end(state, end_tag, m.end()) + # rule 6 + if open_tag in BLOCK_TAGS: + return _parse_html_to_newline(state, self.BLANK_LINE) + + # Blocks of type 7 may not interrupt a paragraph. 
+ end_pos = state.append_paragraph() + if end_pos: + return end_pos + + # rule 7 + start_pos = m.end() + end_pos = state.find_line_end() + if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or \ + (close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)): + return _parse_html_to_newline(state, self.BLANK_LINE) + + def parse(self, state: BlockState, rules: Optional[List[str]]=None) -> None: + sc = self.compile_sc(rules) + + while state.cursor < state.cursor_max: + m = sc.search(state.src, state.cursor) + if not m: + break + + end_pos = m.start() + if end_pos > state.cursor: + text = state.get_text(end_pos) + state.add_paragraph(text) + state.cursor = end_pos + + end_pos = self.parse_method(m, state) + if end_pos: + state.cursor = end_pos + else: + end_pos = state.find_line_end() + text = state.get_text(end_pos) + state.add_paragraph(text) + state.cursor = end_pos + + if state.cursor < state.cursor_max: + text = state.src[state.cursor:] + state.add_paragraph(text) + state.cursor = state.cursor_max + + +def _parse_html_to_end(state, end_marker, start_pos): + marker_pos = state.src.find(end_marker, start_pos) + if marker_pos == -1: + text = state.src[state.cursor:] + end_pos = state.cursor_max + else: + text = state.get_text(marker_pos) + state.cursor = marker_pos + end_pos = state.find_line_end() + text += state.get_text(end_pos) + + state.append_token({'type': 'block_html', 'raw': text}) + return end_pos + + +def _parse_html_to_newline(state, newline): + m = newline.search(state.src, state.cursor) + if m: + end_pos = m.start() + text = state.get_text(end_pos) + else: + text = state.src[state.cursor:] + end_pos = state.cursor_max + + state.append_token({'type': 'block_html', 'raw': text}) + return end_pos diff --git a/src/mistune/core.py b/src/mistune/core.py new file mode 100644 index 0000000..71db4dd --- /dev/null +++ b/src/mistune/core.py @@ -0,0 +1,208 @@ +import re + +_LINE_END = re.compile(r'\n|$') + + +class BlockState: + """The 
state to save block parser's cursor and tokens.""" + def __init__(self, parent=None): + self.src = '' + self.tokens = [] + + # current cursor position + self.cursor = 0 + self.cursor_max = 0 + + # for list and block quote chain + self.list_tight = True + self.parent = parent + + # for saving def references + if parent: + self.env = parent.env + else: + self.env = {'ref_links': {}} + + def child_state(self, src): + child = self.__class__(self) + child.process(src) + return child + + def process(self, src): + self.src = src + self.cursor_max = len(src) + + def find_line_end(self): + m = _LINE_END.search(self.src, self.cursor) + return m.end() + + def get_text(self, end_pos): + return self.src[self.cursor:end_pos] + + def last_token(self): + if self.tokens: + return self.tokens[-1] + + def prepend_token(self, token): + """Insert token before the last token.""" + self.tokens.insert(len(self.tokens) - 1, token) + + def append_token(self, token): + """Add token to the end of token list.""" + self.tokens.append(token) + + def add_paragraph(self, text): + last_token = self.last_token() + if last_token and last_token['type'] == 'paragraph': + last_token['text'] += text + else: + self.tokens.append({'type': 'paragraph', 'text': text}) + + def append_paragraph(self): + last_token = self.last_token() + if last_token and last_token['type'] == 'paragraph': + pos = self.find_line_end() + last_token['text'] += self.get_text(pos) + return pos + + def depth(self): + d = 0 + parent = self.parent + while parent: + d += 1 + parent = parent.parent + return d + + +class InlineState: + """The state to save inline parser's tokens.""" + def __init__(self, env): + self.env = env + self.src = '' + self.tokens = [] + self.in_image = False + self.in_link = False + self.in_emphasis = False + self.in_strong = False + + def prepend_token(self, token): + """Insert token before the last token.""" + self.tokens.insert(len(self.tokens) - 1, token) + + def append_token(self, token): + """Add token to 
the end of token list.""" + self.tokens.append(token) + + def copy(self): + """Create a copy of current state.""" + state = self.__class__(self.env) + state.in_image = self.in_image + state.in_link = self.in_link + state.in_emphasis = self.in_emphasis + state.in_strong = self.in_strong + return state + + +class Parser: + sc_flag = re.M + state_cls = BlockState + + SPECIFICATION = {} + DEFAULT_RULES = [] + + def __init__(self): + self.specification = self.SPECIFICATION.copy() + self.rules = list(self.DEFAULT_RULES) + self._methods = {} + + self.__sc = {} + + def compile_sc(self, rules=None): + if rules is None: + key = '$' + rules = self.rules + else: + key = '|'.join(rules) + + sc = self.__sc.get(key) + if sc: + return sc + + regex = '|'.join(r'(?P<%s>%s)' % (k, self.specification[k]) for k in rules) + sc = re.compile(regex, self.sc_flag) + self.__sc[key] = sc + return sc + + def register(self, name, pattern, func, before=None): + """Register a new rule to parse the token. This method is usually used to + create a new plugin. 
+ + :param name: name of the new grammar + :param pattern: regex pattern in string + :param func: the parsing function + :param before: insert this rule before a built-in rule + """ + self._methods[name] = lambda m, state: func(self, m, state) + if pattern: + self.specification[name] = pattern + if name not in self.rules: + self.insert_rule(self.rules, name, before=before) + + def register_rule(self, name, pattern, func): + raise DeprecationWarning('This plugin is not compatible with mistune v3.') + + @staticmethod + def insert_rule(rules, name, before=None): + if before: + try: + index = rules.index(before) + rules.insert(index, name) + except ValueError: + rules.append(name) + else: + rules.append(name) + + def parse_method(self, m, state): + func = self._methods[m.lastgroup] + return func(m, state) + + +class BaseRenderer(object): + NAME = 'base' + + def __init__(self): + self.__methods = {} + + def register(self, name, method): + """Register a render method for the named token. For example:: + + def render_wiki(renderer, key, title): + return f'<a href="/wiki/{key}">{title}</a>' + + renderer.register('wiki', render_wiki) + """ + # bind self into renderer method + self.__methods[name] = lambda *arg, **kwargs: method(self, *arg, **kwargs) + + def _get_method(self, name): + try: + return object.__getattribute__(self, name) + except AttributeError: + method = self.__methods.get(name) + if not method: + raise AttributeError('No renderer "{!r}"'.format(name)) + return method + + def render_token(self, token, state): + func = self._get_method(token['type']) + return func(token, state) + + def iter_tokens(self, tokens, state): + for tok in tokens: + yield self.render_token(tok, state) + + def render_tokens(self, tokens, state): + return ''.join(self.iter_tokens(tokens, state)) + + def __call__(self, tokens, state): + return self.render_tokens(tokens, state) diff --git a/src/mistune/directives/__init__.py b/src/mistune/directives/__init__.py new file mode 100644 index 
0000000..660c4c8 --- /dev/null +++ b/src/mistune/directives/__init__.py @@ -0,0 +1,31 @@ +from ._base import DirectiveParser, BaseDirective, DirectivePlugin +from ._rst import RSTDirective +from ._fenced import FencedDirective +from .admonition import Admonition +from .toc import TableOfContents +from .include import Include +from .image import Image, Figure + + +class RstDirective(RSTDirective): # pragma: no cover + def __init__(self, plugins): + super(RstDirective, self).__init__(plugins) + import warnings + warnings.warn( + "'RstDirective' is deprecated, please use 'RSTDirective' instead.", + DeprecationWarning, + stacklevel=2, + ) + + +__all__ = [ + 'DirectiveParser', + 'BaseDirective', + 'DirectivePlugin', + 'RSTDirective', + 'FencedDirective', + 'Admonition', + 'TableOfContents', + 'Include', + 'Image', 'Figure', +] diff --git a/src/mistune/directives/_base.py b/src/mistune/directives/_base.py new file mode 100644 index 0000000..ad326c6 --- /dev/null +++ b/src/mistune/directives/_base.py @@ -0,0 +1,121 @@ +import re + + +class DirectiveParser: + name = 'directive' + + @staticmethod + def parse_type(m: re.Match): + raise NotImplementedError() + + @staticmethod + def parse_title(m: re.Match): + raise NotImplementedError() + + @staticmethod + def parse_content(m: re.Match): + raise NotImplementedError() + + @classmethod + def parse_tokens(cls, block, text, state): + if state.depth() >= block.max_nested_level - 1 and cls.name in block.rules: + rules = list(block.rules) + rules.remove(cls.name) + else: + rules = block.rules + child = state.child_state(text) + block.parse(child, rules) + return child.tokens + + @staticmethod + def parse_options(m: re.Match): + text = m.group('options') + if not text.strip(): + return [] + + options = [] + for line in re.split(r'\n+', text): + line = line.strip()[1:] + if not line: + continue + i = line.find(':') + k = line[:i] + v = line[i + 1:].strip() + options.append((k, v)) + return options + + +class BaseDirective: + parser = 
DirectiveParser + directive_pattern = None + + def __init__(self, plugins): + self._methods = {} + self.__plugins = plugins + + def register(self, name, fn): + self._methods[name] = fn + + def parse_method(self, block, m, state): + _type = self.parser.parse_type(m) + method = self._methods.get(_type) + if method: + try: + token = method(block, m, state) + except ValueError as e: + token = {'type': 'block_error', 'raw': str(e)} + else: + text = m.group(0) + token = { + 'type': 'block_error', + 'raw': text, + } + + if isinstance(token, list): + for tok in token: + state.append_token(tok) + else: + state.append_token(token) + return token + + def parse_directive(self, block, m, state): + raise NotImplementedError() + + def register_block_parser(self, md, before=None): + md.block.register( + self.parser.name, + self.directive_pattern, + self.parse_directive, + before=before, + ) + + def __call__(self, md): + for plugin in self.__plugins: + plugin.parser = self.parser + plugin(self, md) + + +class DirectivePlugin: + def __init__(self): + self.parser = None + + def parse_options(self, m: re.Match): + return self.parser.parse_options(m) + + def parse_type(self, m: re.Match): + return self.parser.parse_type(m) + + def parse_title(self, m: re.Match): + return self.parser.parse_title(m) + + def parse_content(self, m: re.Match): + return self.parser.parse_content(m) + + def parse_tokens(self, block, text, state): + return self.parser.parse_tokens(block, text, state) + + def parse(self, block, m, state): + raise NotImplementedError() + + def __call__(self, md): + raise NotImplementedError() diff --git a/src/mistune/directives/_fenced.py b/src/mistune/directives/_fenced.py new file mode 100644 index 0000000..818f130 --- /dev/null +++ b/src/mistune/directives/_fenced.py @@ -0,0 +1,142 @@ +import re +from ._base import DirectiveParser, BaseDirective + +__all__ = ['FencedDirective'] + + +_type_re = re.compile(r'^ *\{[a-zA-Z0-9_-]+\}') +_directive_re = re.compile( + 
r'\{(?P<type>[a-zA-Z0-9_-]+)\} *(?P<title>[^\n]*)(?:\n|$)' + r'(?P<options>(?:\:[a-zA-Z0-9_-]+\: *[^\n]*\n+)*)' + r'\n*(?P<text>(?:[^\n]*\n+)*)' +) + + +class FencedParser(DirectiveParser): + name = 'fenced_directive' + + @staticmethod + def parse_type(m: re.Match): + return m.group('type') + + @staticmethod + def parse_title(m: re.Match): + return m.group('title') + + @staticmethod + def parse_content(m: re.Match): + return m.group('text') + + +class FencedDirective(BaseDirective): + """A **fenced** style of directive looks like a fenced code block, it is + inspired by markdown-it-docutils. The syntax looks like: + + .. code-block:: text + + ```{directive-type} title + :option-key: option value + :option-key: option value + + content text here + ``` + + To use ``FencedDirective``, developers can add it into plugin list in + the :class:`Markdown` instance: + + .. code-block:: python + + import mistune + from mistune.directives import FencedDirective, Admonition + + md = mistune.create_markdown(plugins=[ + # ... + FencedDirective([Admonition()]), + ]) + + FencedDirective is using >= 3 backticks or curly-brackets for the fenced + syntax. Developers can change it to other characters, e.g. colon: + + .. code-block:: python + + directive = FencedDirective([Admonition()], ':') + + And then the directive syntax would look like: + + .. code-block:: text + + ::::{note} Nesting directives + You can nest directives by ensuring the start and end fence matching + the length. For instance, in this example, the admonition is started + with 4 colons, then it should end with 4 colons. + + You can nest another admonition with other length of colons except 4. + + :::{tip} Longer outermost fence + It would be better that you put longer markers for the outer fence, + and shorter markers for the inner fence. In this example, we put 4 + colons outsie, and 3 colons inside. 
+ ::: + :::: + + :param plugins: list of directive plugins + :param markers: characters to determine the fence, default is backtick + and curly-bracket + """ + parser = FencedParser + + def __init__(self, plugins, markers='`~'): + super(FencedDirective, self).__init__(plugins) + self.markers = markers + _marker_pattern = '|'.join(re.escape(c) for c in markers) + self.directive_pattern = ( + r'^(?P<fenced_directive_mark>(?:' + _marker_pattern + r'){3,})' + r'\{[a-zA-Z0-9_-]+\}' + ) + + def _process_directive(self, block, marker, start, state): + mlen = len(marker) + cursor_start = start + len(marker) + + _end_pattern = ( + r'^ {0,3}' + marker[0] + '{' + str(mlen) + r',}' + r'[ \t]*(?:\n|$)' + ) + _end_re = re.compile(_end_pattern, re.M) + + _end_m = _end_re.search(state.src, cursor_start) + if _end_m: + text = state.src[cursor_start:_end_m.start()] + end_pos = _end_m.end() + else: + text = state.src[cursor_start:] + end_pos = state.cursor_max + + m = _directive_re.match(text) + if not m: + return + + self.parse_method(block, m, state) + return end_pos + + def parse_directive(self, block, m, state): + marker = m.group('fenced_directive_mark') + return self._process_directive(block, marker, m.start(), state) + + def parse_fenced_code(self, block, m, state): + info = m.group('fenced_3') + if not info or not _type_re.match(info): + return block.parse_fenced_code(m, state) + + if state.depth() >= block.max_nested_level: + return block.parse_fenced_code(m, state) + + marker = m.group('fenced_2') + return self._process_directive(block, marker, m.start(), state) + + def __call__(self, md): + super(FencedDirective, self).__call__(md) + if self.markers == '`~': + md.block.register('fenced_code', None, self.parse_fenced_code) + else: + self.register_block_parser(md, 'fenced_code') diff --git a/src/mistune/directives/_rst.py b/src/mistune/directives/_rst.py new file mode 100644 index 0000000..6e054cf --- /dev/null +++ b/src/mistune/directives/_rst.py @@ -0,0 +1,73 @@ +import re 
+from ._base import DirectiveParser, BaseDirective + +__all__ = ['RSTDirective'] + + +_directive_re = re.compile( + r'\.\.( +)(?P<type>[a-zA-Z0-9_-]+)\:\: *(?P<title>[^\n]*)(?:\n|$)' + r'(?P<options>(?: \1 {0,3}\:[a-zA-Z0-9_-]+\: *[^\n]*\n+)*)' + r'\n*(?P<text>(?: \1 {0,3}[^\n]*\n+)*)' +) + + +class RSTParser(DirectiveParser): + name = 'rst_directive' + + @staticmethod + def parse_type(m: re.Match): + return m.group('type') + + @staticmethod + def parse_title(m: re.Match): + return m.group('title') + + @staticmethod + def parse_content(m: re.Match): + full_content = m.group(0) + text = m.group('text') + pretext = full_content[:-len(text)] + leading = len(m.group(1)) + 2 + return '\n'.join(line[leading:] for line in text.splitlines()) + '\n' + + +class RSTDirective(BaseDirective): + """A RST style of directive syntax is inspired by reStructuredText. + The syntax is very powerful that you can define a lot of custom + features on your own. The syntax looks like: + + .. code-block:: text + + .. directive-type:: directive value + :option-key: option value + :option-key: option value + + content text here + + To use ``RSTDirective``, developers can add it into plugin list in + the :class:`Markdown` instance: + + .. code-block:: python + + import mistune + from mistune.directives import RSTDirective, Admonition + + md = mistune.create_markdown(plugins=[ + # ... + RSTDirective([Admonition()]), + ]) + """ + parser = RSTParser + directive_pattern = r'^\.\. 
+[a-zA-Z0-9_-]+\:\:' + + def parse_directive(self, block, m, state): + m = _directive_re.match(state.src, state.cursor) + if not m: + return + + self.parse_method(block, m, state) + return m.end() + + def __call__(self, md): + super(RSTDirective, self).__call__(md) + self.register_block_parser(md) diff --git a/src/mistune/directives/admonition.py b/src/mistune/directives/admonition.py new file mode 100644 index 0000000..b380611 --- /dev/null +++ b/src/mistune/directives/admonition.py @@ -0,0 +1,61 @@ +from ._base import DirectivePlugin + + +class Admonition(DirectivePlugin): + SUPPORTED_NAMES = { + "attention", "caution", "danger", "error", "hint", + "important", "note", "tip", "warning", + } + + def parse(self, block, m, state): + name = self.parse_type(m) + attrs = {'name': name} + options = dict(self.parse_options(m)) + if 'class' in options: + attrs['class'] = options['class'] + + title = self.parse_title(m) + if not title: + title = name.capitalize() + + content = self.parse_content(m) + children = [ + { + 'type': 'admonition_title', + 'text': title, + }, + { + 'type': 'admonition_content', + 'children': self.parse_tokens(block, content, state), + } + ] + return { + 'type': 'admonition', + 'children': children, + 'attrs': attrs, + } + + def __call__(self, directive, md): + for name in self.SUPPORTED_NAMES: + directive.register(name, self.parse) + + if md.renderer.NAME == 'html': + md.renderer.register('admonition', render_admonition) + md.renderer.register('admonition_title', render_admonition_title) + md.renderer.register('admonition_content', render_admonition_content) + + +def render_admonition(self, text, name, **attrs): + html = '<section class="admonition ' + name + _cls = attrs.get('class') + if _cls: + html += ' ' + _cls + return html + '">\n' + text + '</section>\n' + + +def render_admonition_title(self, text): + return '<p class="admonition-title">' + text + '</p>\n' + + +def render_admonition_content(self, text): + return text diff --git 
a/src/mistune/directives/image.py b/src/mistune/directives/image.py new file mode 100644 index 0000000..5d9d40a --- /dev/null +++ b/src/mistune/directives/image.py @@ -0,0 +1,152 @@ +import re +from ._base import DirectivePlugin +from ..util import escape as escape_text, escape_url + +__all__ = ['Image', 'Figure'] + +_num_re = re.compile(r'^\d+(?:\.\d*)?') +_allowed_aligns = ["top", "middle", "bottom", "left", "center", "right"] + + +def _parse_attrs(options): + attrs = {} + if 'alt' in options: + attrs['alt'] = options['alt'] + + # validate align + align = options.get('align') + if align and align in _allowed_aligns: + attrs['align'] = align + + height = options.get('height') + width = options.get('width') + if height and _num_re.match(height): + attrs['height'] = height + if width and _num_re.match(width): + attrs['width'] = width + if 'target' in options: + attrs['target'] = escape_url(options['target']) + return attrs + + +class Image(DirectivePlugin): + NAME = 'image' + + def parse(self, block, m, state): + options = dict(self.parse_options(m)) + attrs = _parse_attrs(options) + attrs['src'] = self.parse_title(m) + return {'type': 'block_image', 'attrs': attrs} + + def __call__(self, directive, md): + directive.register(self.NAME, self.parse) + if md.renderer.NAME == 'html': + md.renderer.register('block_image', render_block_image) + + +def render_block_image(self, src: str, alt=None, width=None, height=None, **attrs): + img = '<img src="' + src + '"' + style = '' + if alt: + img += ' alt="' + escape_text(alt) + '"' + if width: + if width.isdigit(): + img += ' width="' + width + '"' + else: + style += 'width:' + width + ';' + if height: + if height.isdigit(): + img += ' height="' + height + '"' + else: + style += 'height:' + height + ';' + if style: + img += ' style="' + escape_text(style) + '"' + + img += ' />' + + _cls = 'block-image' + align = attrs.get('align') + if align: + _cls += ' align-' + align + + target = attrs.get('target') + if target: + href = 
escape_text(self.safe_url(target)) + outer = '<a class="' + _cls + '" href="' + href + '">' + return outer + img + '</a>\n' + else: + return '<div class="' + _cls + '">' + img + '</div>\n' + + +class Figure(DirectivePlugin): + NAME = 'figure' + + def parse_directive_content(self, block, m, state): + content = self.parse_content(m) + if not content: + return + + tokens = self.parse_tokens(block, content, state) + caption = tokens[0] + if caption['type'] == 'paragraph': + caption['type'] = 'figcaption' + children = [caption] + if len(tokens) > 1: + children.append({ + 'type': 'legend', + 'children': tokens[1:] + }) + return children + + def parse(self, block, m, state): + options = dict(self.parse_options(m)) + image_attrs = _parse_attrs(options) + image_attrs['src'] = self.parse_title(m) + + align = image_attrs.pop('align', None) + fig_attrs = {} + if align: + fig_attrs['align'] = align + for k in ['figwidth', 'figclass']: + if k in options: + fig_attrs[k] = options[k] + + children = [{'type': 'block_image', 'attrs': image_attrs}] + content = self.parse_directive_content(block, m, state) + if content: + children.extend(content) + return { + 'type': 'figure', + 'attrs': fig_attrs, + 'children': children, + } + + def __call__(self, directive, md): + directive.register(self.NAME, self.parse) + + if md.renderer.NAME == 'html': + md.renderer.register('figure', render_figure) + md.renderer.register('block_image', render_block_image) + md.renderer.register('figcaption', render_figcaption) + md.renderer.register('legend', render_legend) + + +def render_figure(self, text, align=None, figwidth=None, figclass=None): + _cls = 'figure' + if align: + _cls += ' align-' + align + if figclass: + _cls += ' ' + figclass + + html = '<figure class="' + _cls + '"' + if figwidth: + html += ' style="width:' + figwidth + '"' + return html + '>\n' + text + '</figure>\n' + + +def render_figcaption(self, text): + return '<figcaption>' + text + '</figcaption>\n' + + +def render_legend(self, 
text): + return '<div class="legend">\n' + text + '</div>\n' diff --git a/src/mistune/directives/include.py b/src/mistune/directives/include.py new file mode 100644 index 0000000..d2180ba --- /dev/null +++ b/src/mistune/directives/include.py @@ -0,0 +1,65 @@ +import os +from ._base import DirectivePlugin + + +class Include(DirectivePlugin): + def parse(self, block, m, state): + source_file = state.env.get('__file__') + if not source_file: + return {'type': 'block_error', 'raw': 'Missing source file'} + + encoding = 'utf-8' + options = self.parse_options(m) + if options: + attrs = dict(options) + if 'encoding' in attrs: + encoding = attrs['encoding'] + else: + attrs = {} + + relpath = self.parse_title(m) + dest = os.path.join(os.path.dirname(source_file), relpath) + dest = os.path.normpath(dest) + + if dest == source_file: + return { + 'type': 'block_error', + 'raw': 'Could not include self: ' + relpath, + } + + if not os.path.isfile(dest): + return { + 'type': 'block_error', + 'raw': 'Could not find file: ' + relpath, + } + + with open(dest, 'rb') as f: + content = f.read() + content = content.decode(encoding) + + ext = os.path.splitext(relpath)[1] + if ext in {'.md', '.markdown', '.mkd'}: + new_state = block.state_cls() + new_state.env['__file__'] = dest + new_state.process(content) + block.parse(new_state) + return new_state.tokens + + elif ext in {'.html', '.xhtml', '.htm'}: + return {'type': 'block_html', 'raw': content} + + attrs['filepath'] = dest + return { + 'type': 'include', + 'raw': content, + 'attrs': attrs, + } + + def __call__(self, directive, md): + directive.register('include', self.parse) + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('include', render_html_include) + + +def render_html_include(renderer, text, **attrs): + return '<pre class="directive-include">\n' + text + '</pre>\n' diff --git a/src/mistune/directives/toc.py b/src/mistune/directives/toc.py new file mode 100644 index 0000000..4084f43 --- /dev/null +++ 
b/src/mistune/directives/toc.py @@ -0,0 +1,105 @@ +""" + TOC directive + ~~~~~~~~~~~~~ + + The TOC directive syntax looks like:: + + .. toc:: Title + :min-level: 1 + :max-level: 3 + + "Title", "min-level", and "max-level" option can be empty. "min-level" + and "max-level" are integers >= 1 and <= 6, which define the allowed + heading levels writers want to include in the table of contents. +""" + +from ._base import DirectivePlugin +from ..toc import normalize_toc_item, render_toc_ul + + +class TableOfContents(DirectivePlugin): + def __init__(self, min_level=1, max_level=3): + self.min_level = min_level + self.max_level = max_level + + def generate_heading_id(self, token, index): + return 'toc_' + str(index + 1) + + def parse(self, block, m, state): + title = self.parse_title(m) + options = self.parse_options(m) + if options: + d_options = dict(options) + collapse = 'collapse' in d_options + min_level = _normalize_level(d_options, 'min-level', self.min_level) + max_level = _normalize_level(d_options, 'max-level', self.max_level) + if min_level < self.min_level: + raise ValueError(f'"min-level" option MUST be >= {self.min_level}') + if max_level > self.max_level: + raise ValueError(f'"max-level" option MUST be <= {self.max_level}') + if min_level > max_level: + raise ValueError('"min-level" option MUST be less than "max-level" option') + else: + collapse = False + min_level = self.min_level + max_level = self.max_level + + attrs = { + 'min_level': min_level, + 'max_level': max_level, + 'collapse': collapse, + } + return {'type': 'toc', 'text': title or '', 'attrs': attrs} + + def toc_hook(self, md, state): + sections = [] + headings = [] + + for tok in state.tokens: + if tok['type'] == 'toc': + sections.append(tok) + elif tok['type'] == 'heading': + headings.append(tok) + + if sections: + toc_items = [] + # adding ID for each heading + for i, tok in enumerate(headings): + tok['attrs']['id'] = self.generate_heading_id(tok, i) + toc_items.append(normalize_toc_item(md, 
tok)) + + for sec in sections: + _min = sec['attrs']['min_level'] + _max = sec['attrs']['max_level'] + toc = [item for item in toc_items if _min <= item[0] <= _max] + sec['attrs']['toc'] = toc + + def __call__(self, directive, md): + if md.renderer and md.renderer.NAME == 'html': + # only works with HTML renderer + directive.register('toc', self.parse) + md.before_render_hooks.append(self.toc_hook) + md.renderer.register('toc', render_html_toc) + + +def render_html_toc(renderer, title, collapse=False, **attrs): + if not title: + title = 'Table of Contents' + toc = attrs['toc'] + content = render_toc_ul(attrs['toc']) + + html = '<details class="toc"' + if not collapse: + html += ' open' + html += '>\n<summary>' + title + '</summary>\n' + return html + content + '</details>\n' + + +def _normalize_level(options, name, default): + level = options.get(name) + if not level: + return default + try: + return int(level) + except (ValueError, TypeError): + raise ValueError(f'"{name}" option MUST be integer') diff --git a/src/mistune/helpers.py b/src/mistune/helpers.py new file mode 100644 index 0000000..04c1df1 --- /dev/null +++ b/src/mistune/helpers.py @@ -0,0 +1,137 @@ +import re +import string +from .util import escape_url + +PREVENT_BACKSLASH = r'(?<!\\)(?:\\\\)*' +PUNCTUATION = r'[' + re.escape(string.punctuation) + r']' + +LINK_LABEL = r'(?:[^\\\[\]]|\\.){0,500}' + +LINK_BRACKET_START = re.compile(r'[ \t]*\n?[ \t]*<') +LINK_BRACKET_RE = re.compile(r'<([^<>\n\\\x00]*)>') +LINK_HREF_BLOCK_RE = re.compile(r'[ \t]*\n?[ \t]*([^\s]+)(?:\s|$)') +LINK_HREF_INLINE_RE = re.compile( + r'[ \t]*\n?[ \t]*([^ \t\n]*?)(?:[ \t\n]|' + r'(?:' + PREVENT_BACKSLASH + r'\)))' +) + +LINK_TITLE_RE = re.compile( + r'[ \t\n]+(' + r'"(?:\\' + PUNCTUATION + r'|[^"\x00])*"|' # "title" + r"'(?:\\" + PUNCTUATION + r"|[^'\x00])*'" # 'title' + r')' +) +PAREN_END_RE = re.compile(r'\s*\)') + +HTML_TAGNAME = r'[A-Za-z][A-Za-z0-9-]*' +HTML_ATTRIBUTES = ( + r'(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*' + 
r'(?:\s*=\s*(?:[^ !"\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*' +) + +BLOCK_TAGS = ( + 'address', 'article', 'aside', 'base', 'basefont', 'blockquote', + 'body', 'caption', 'center', 'col', 'colgroup', 'dd', 'details', + 'dialog', 'dir', 'div', 'dl', 'dt', 'fieldset', 'figcaption', + 'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3', + 'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html', 'iframe', + 'legend', 'li', 'link', 'main', 'menu', 'menuitem', 'meta', 'nav', + 'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'section', + 'source', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', + 'title', 'tr', 'track', 'ul' +) +PRE_TAGS = ('pre', 'script', 'style', 'textarea') + +_INLINE_LINK_LABEL_RE = re.compile(LINK_LABEL + r'\]') +_INLINE_SQUARE_BRACKET_RE = re.compile(PREVENT_BACKSLASH + r'[\[\]]') +_ESCAPE_CHAR_RE = re.compile(r'\\(' + PUNCTUATION + r')') + + +def unescape_char(text): + return _ESCAPE_CHAR_RE.sub(r'\1', text) + + +def parse_link_text(src, pos): + level = 1 + found = False + start_pos = pos + + while pos < len(src): + m = _INLINE_SQUARE_BRACKET_RE.search(src, pos) + if not m: + break + + pos = m.end() + marker = m.group(0) + if marker == ']': + level -= 1 + if level == 0: + found = True + break + else: + level += 1 + + if found: + text = src[start_pos:pos-1] + return text, pos + return None, None + + +def parse_link_label(src, start_pos): + m = _INLINE_LINK_LABEL_RE.match(src, start_pos) + if m: + label = m.group(0)[:-1] + return label, m.end() + return None, None + + +def parse_link_href(src, start_pos, block=False): + m = LINK_BRACKET_START.match(src, start_pos) + if m: + start_pos = m.end() - 1 + m = LINK_BRACKET_RE.match(src, start_pos) + if m: + return m.group(1), m.end() + return None, None + + if block: + m = LINK_HREF_BLOCK_RE.match(src, start_pos) + else: + m = LINK_HREF_INLINE_RE.match(src, start_pos) + + if not m: + return None, None + + end_pos = m.end() + href = m.group(1) + + if block and src[end_pos - 1] == 
href[-1]: + return href, end_pos + return href, end_pos - 1 + + +def parse_link_title(src, start_pos, max_pos): + m = LINK_TITLE_RE.match(src, start_pos, max_pos) + if m: + title = m.group(1)[1:-1] + title = unescape_char(title) + return title, m.end() + return None, None + + +def parse_link(src, pos): + href, href_pos = parse_link_href(src, pos) + if href is None: + return None, None + + title, title_pos = parse_link_title(src, href_pos, len(src)) + next_pos = title_pos or href_pos + m = PAREN_END_RE.match(src, next_pos) + if not m: + return None, None + + href = unescape_char(href) + attrs = {'url': escape_url(href)} + if title: + attrs['title'] = title + return attrs, m.end() diff --git a/src/mistune/inline_parser.py b/src/mistune/inline_parser.py new file mode 100644 index 0000000..0375a74 --- /dev/null +++ b/src/mistune/inline_parser.py @@ -0,0 +1,390 @@ +import re +from typing import Optional, List, Dict, Any +from .core import Parser, InlineState +from .util import ( + escape, + escape_url, + unikey, +) +from .helpers import ( + PREVENT_BACKSLASH, + PUNCTUATION, + HTML_TAGNAME, + HTML_ATTRIBUTES, + unescape_char, + parse_link, + parse_link_label, + parse_link_text, +) + +PAREN_END_RE = re.compile(r'\s*\)') + +AUTO_EMAIL = ( + r'''<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]''' + r'(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?' 
+ r'(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>' +) + +INLINE_HTML = ( + r'<' + HTML_TAGNAME + HTML_ATTRIBUTES + r'\s*/?>|' # open tag + r'</' + HTML_TAGNAME + r'\s*>|' # close tag + r'<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|' # comment + r'<\?[\s\S]+?\?>|' # script like <?php?> + r'<![A-Z][\s\S]+?>|' # doctype + r'<!\[CDATA[\s\S]+?\]\]>' # cdata +) + +EMPHASIS_END_RE = { + '*': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*(?!\*)'), + '_': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])_(?!_)\b'), + + '**': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*(?!\*)'), + '__': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])__(?!_)\b'), + + '***': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*\*(?!\*)'), + '___': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])___(?!_)\b'), +} + + +class InlineParser(Parser): + sc_flag = 0 + state_cls = InlineState + + #: linebreak leaves two spaces at the end of line + STD_LINEBREAK = r'(?:\\| {2,})\n\s*' + + #: every new line becomes <br> + HARD_LINEBREAK = r' *\n\s*' + + # we only need to find the start pattern of an inline token + SPECIFICATION = { + # e.g. \`, \$ + 'escape': r'(?:\\' + PUNCTUATION + ')+', + + # `code, ```code + 'codespan': r'`{1,}', + + # *w, **w, _w, __w + 'emphasis': r'\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])', + + # [link], ![img] + 'link': r'!?\[', + + # <https://example.com>. regex copied from commonmark.js + 'auto_link': r'<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>', + 'auto_email': AUTO_EMAIL, + + 'inline_html': INLINE_HTML, + + 'linebreak': STD_LINEBREAK, + 'softbreak': HARD_LINEBREAK, + + 'prec_auto_link': r'<[A-Za-z][A-Za-z\d.+-]{1,31}:', + 'prec_inline_html': r'</?' 
+ HTML_TAGNAME + r'|<!|<\?', + } + DEFAULT_RULES = ( + 'escape', + 'codespan', + 'emphasis', + 'link', + 'auto_link', + 'auto_email', + 'inline_html', + 'linebreak', + ) + + def __init__(self, hard_wrap=False): + super(InlineParser, self).__init__() + + self.hard_wrap = hard_wrap + # lazy add linebreak + if hard_wrap: + self.specification['linebreak'] = self.HARD_LINEBREAK + else: + self.rules.append('softbreak') + + self._methods = { + name: getattr(self, 'parse_' + name) for name in self.rules + } + + def parse_escape(self, m: re.Match, state: InlineState) -> int: + text = m.group(0) + text = unescape_char(text) + state.append_token({ + 'type': 'text', + 'raw': text, + }) + return m.end() + + def parse_link(self, m: re.Match, state: InlineState) -> Optional[int]: + pos = m.end() + + marker = m.group(0) + is_image = marker[0] == '!' + if is_image and state.in_image: + state.append_token({'type': 'text', 'raw': marker}) + return pos + elif not is_image and state.in_link: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + text = None + label, end_pos = parse_link_label(state.src, pos) + if label is None: + text, end_pos = parse_link_text(state.src, pos) + if text is None: + return + + if text is None: + text = label + + if end_pos >= len(state.src) and label is None: + return + + rules = ['codespan', 'prec_auto_link', 'prec_inline_html'] + prec_pos = self.precedence_scan(m, state, end_pos, rules) + if prec_pos: + return prec_pos + + if end_pos < len(state.src): + c = state.src[end_pos] + if c == '(': + # standard link [text](<url> "title") + attrs, pos2 = parse_link(state.src, end_pos + 1) + if pos2: + token = self.__parse_link_token(is_image, text, attrs, state) + state.append_token(token) + return pos2 + + elif c == '[': + # standard ref link [text][label] + label2, pos2 = parse_link_label(state.src, end_pos + 1) + if pos2: + end_pos = pos2 + if label2: + label = label2 + + if label is None: + return + + ref_links = state.env.get('ref_links') + 
if not ref_links: + return + + key = unikey(label) + env = ref_links.get(key) + if env: + attrs = {'url': env['url'], 'title': env.get('title')} + token = self.__parse_link_token(is_image, text, attrs, state) + token['ref'] = key + token['label'] = label + state.append_token(token) + return end_pos + + def __parse_link_token(self, is_image, text, attrs, state): + new_state = state.copy() + new_state.src = text + if is_image: + new_state.in_image = True + token = { + 'type': 'image', + 'children': self.render(new_state), + 'attrs': attrs, + } + else: + new_state.in_link = True + token = { + 'type': 'link', + 'children': self.render(new_state), + 'attrs': attrs, + } + return token + + def parse_auto_link(self, m: re.Match, state: InlineState) -> int: + text = m.group(0) + pos = m.end() + if state.in_link: + self.process_text(text, state) + return pos + + text = text[1:-1] + self._add_auto_link(text, text, state) + return pos + + def parse_auto_email(self, m: re.Match, state: InlineState) -> int: + text = m.group(0) + pos = m.end() + if state.in_link: + self.process_text(text, state) + return pos + + text = text[1:-1] + url = 'mailto:' + text + self._add_auto_link(url, text, state) + return pos + + def _add_auto_link(self, url, text, state): + state.append_token({ + 'type': 'link', + 'children': [{'type': 'text', 'raw': text}], + 'attrs': {'url': escape_url(url)}, + }) + + def parse_emphasis(self, m: re.Match, state: InlineState) -> int: + pos = m.end() + + marker = m.group(0) + mlen = len(marker) + if mlen == 1 and state.in_emphasis: + state.append_token({'type': 'text', 'raw': marker}) + return pos + elif mlen == 2 and state.in_strong: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + _end_re = EMPHASIS_END_RE[marker] + m1 = _end_re.search(state.src, pos) + if not m1: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + end_pos = m1.end() + text = state.src[pos:end_pos-mlen] + + prec_pos = self.precedence_scan(m, state, 
end_pos) + if prec_pos: + return prec_pos + + new_state = state.copy() + new_state.src = text + if mlen == 1: + new_state.in_emphasis = True + children = self.render(new_state) + state.append_token({'type': 'emphasis', 'children': children}) + elif mlen == 2: + new_state.in_strong = True + children = self.render(new_state) + state.append_token({'type': 'strong', 'children': children}) + else: + new_state.in_emphasis = True + new_state.in_strong = True + + children = [{ + 'type': 'strong', + 'children': self.render(new_state) + }] + state.append_token({ + 'type': 'emphasis', + 'children': children, + }) + return end_pos + + def parse_codespan(self, m: re.Match, state: InlineState) -> int: + marker = m.group(0) + # require same marker with same length at end + + pattern = re.compile(r'(.*?(?:[^`]))' + marker + r'(?!`)', re.S) + + pos = m.end() + m = pattern.match(state.src, pos) + if m: + end_pos = m.end() + code = m.group(1) + # Line endings are treated like spaces + code = code.replace('\n', ' ') + if len(code.strip()): + if code.startswith(' ') and code.endswith(' '): + code = code[1:-1] + state.append_token({'type': 'codespan', 'raw': escape(code)}) + return end_pos + else: + state.append_token({'type': 'text', 'raw': marker}) + return pos + + def parse_linebreak(self, m: re.Match, state: InlineState) -> int: + state.append_token({'type': 'linebreak'}) + return m.end() + + def parse_softbreak(self, m: re.Match, state: InlineState) -> int: + state.append_token({'type': 'softbreak'}) + return m.end() + + def parse_inline_html(self, m: re.Match, state: InlineState) -> int: + end_pos = m.end() + html = m.group(0) + state.append_token({'type': 'inline_html', 'raw': html}) + if html.startswith(('<a ', '<a>', '<A ', '<A>')): + state.in_link = True + elif html.startswith(('</a ', '</a>', '</A ', '</A>')): + state.in_link = False + return end_pos + + def process_text(self, text: str, state: InlineState): + state.append_token({'type': 'text', 'raw': text}) + + def 
parse(self, state: InlineState) -> List[Dict[str, Any]]: + pos = 0 + sc = self.compile_sc() + while pos < len(state.src): + m = sc.search(state.src, pos) + if not m: + break + + end_pos = m.start() + if end_pos > pos: + hole = state.src[pos:end_pos] + self.process_text(hole, state) + + new_pos = self.parse_method(m, state) + if not new_pos: + # move cursor 1 character forward + pos = end_pos + 1 + hole = state.src[end_pos:pos] + self.process_text(hole, state) + else: + pos = new_pos + + if pos == 0: + # special case, just pure text + self.process_text(state.src, state) + elif pos < len(state.src): + self.process_text(state.src[pos:], state) + return state.tokens + + def precedence_scan(self, m: re.Match, state: InlineState, end_pos: int, rules=None): + if rules is None: + rules = ['codespan', 'link', 'prec_auto_link', 'prec_inline_html'] + + mark_pos = m.end() + sc = self.compile_sc(rules) + m1 = sc.search(state.src, mark_pos, end_pos) + if not m1: + return + + rule_name = m1.lastgroup.replace('prec_', '') + sc = self.compile_sc([rule_name]) + m2 = sc.match(state.src, m1.start()) + if not m2: + return + + func = self._methods[rule_name] + new_state = state.copy() + new_state.src = state.src + m2_pos = func(m2, new_state) + if not m2_pos or m2_pos < end_pos: + return + + raw_text = state.src[m.start():m2.start()] + state.append_token({'type': 'text', 'raw': raw_text}) + for token in new_state.tokens: + state.append_token(token) + return m2_pos + + def render(self, state: InlineState): + self.parse(state) + return state.tokens + + def __call__(self, s, env): + state = self.state_cls(env) + state.src = s + return self.render(state) diff --git a/src/mistune/list_parser.py b/src/mistune/list_parser.py new file mode 100644 index 0000000..b5ff866 --- /dev/null +++ b/src/mistune/list_parser.py @@ -0,0 +1,250 @@ +import re +from .core import BlockState +from .util import ( + strip_end, + expand_tab, + expand_leading_tab, +) +# because list is complex, split list parser in a 
new file + +LIST_PATTERN = ( + r'^(?P<list_1> {0,3})' + r'(?P<list_2>[\*\+-]|\d{1,9}[.)])' + r'(?P<list_3>[ \t]*|[ \t].+)$' +) + +_LINE_HAS_TEXT = re.compile(r'( *)\S') + + +def parse_list(block, m: re.Match, state: BlockState) -> int: + """Parse tokens for ordered and unordered list.""" + text = m.group('list_3') + if not text.strip(): + # Example 285 + # an empty list item cannot interrupt a paragraph + end_pos = state.append_paragraph() + if end_pos: + return end_pos + + marker = m.group('list_2') + ordered = len(marker) > 1 + depth = state.depth() + token = { + 'type': 'list', + 'children': [], + 'tight': True, + 'bullet': marker[-1], + 'attrs': { + 'depth': depth, + 'ordered': ordered, + }, + } + if ordered: + start = int(marker[:-1]) + if start != 1: + # Example 304 + # we allow only lists starting with 1 to interrupt paragraphs + end_pos = state.append_paragraph() + if end_pos: + return end_pos + token['attrs']['start'] = start + + state.cursor = m.end() + 1 + groups = (m.group('list_1'), marker, text) + + if depth >= block.max_nested_level - 1: + rules = list(block.list_rules) + rules.remove('list') + else: + rules = block.list_rules + + bullet = _get_list_bullet(marker[-1]) + while groups: + groups = _parse_list_item(block, bullet, groups, token, state, rules) + + end_pos = token.pop('_end_pos', None) + _transform_tight_list(token) + if end_pos: + index = token.pop('_tok_index') + state.tokens.insert(index, token) + return end_pos + + state.append_token(token) + return state.cursor + + +def _transform_tight_list(token): + if token['tight']: + # reset tight list item + for list_item in token['children']: + for tok in list_item['children']: + if tok['type'] == 'paragraph': + tok['type'] = 'block_text' + elif tok['type'] == 'list': + _transform_tight_list(tok) + + +def _parse_list_item(block, bullet, groups, token, state, rules): + spaces, marker, text = groups + + leading_width = len(spaces) + len(marker) + text, continue_width = 
_compile_continue_width(text, leading_width) + item_pattern = _compile_list_item_pattern(bullet, leading_width) + pairs = [ + ('thematic_break', block.specification['thematic_break']), + ('fenced_code', block.specification['fenced_code']), + ('axt_heading', block.specification['axt_heading']), + ('block_quote', block.specification['block_quote']), + ('block_html', block.specification['block_html']), + ('list', block.specification['list']), + ] + if leading_width < 3: + _repl_w = str(leading_width) + pairs = [(n, p.replace('3', _repl_w, 1)) for n, p in pairs] + + pairs.insert(1, ('list_item', item_pattern)) + regex = '|'.join(r'(?P<%s>(?<=\n)%s)' % pair for pair in pairs) + sc = re.compile(regex, re.M) + + src = '' + next_group = None + prev_blank_line = False + pos = state.cursor + + continue_space = ' ' * continue_width + while pos < state.cursor_max: + pos = state.find_line_end() + line = state.get_text(pos) + if block.BLANK_LINE.match(line): + src += '\n' + prev_blank_line = True + state.cursor = pos + continue + + line = expand_leading_tab(line) + if line.startswith(continue_space): + if prev_blank_line and not text and not src.strip(): + # Example 280 + # A list item can begin with at most one blank line + break + + src += line + prev_blank_line = False + state.cursor = pos + continue + + m = sc.match(state.src, state.cursor) + if m: + tok_type = m.lastgroup + if tok_type == 'list_item': + if prev_blank_line: + token['tight'] = False + next_group = ( + m.group('listitem_1'), + m.group('listitem_2'), + m.group('listitem_3') + ) + state.cursor = m.end() + 1 + break + tok_index = len(state.tokens) + end_pos = block.parse_method(m, state) + if end_pos: + token['_tok_index'] = tok_index + token['_end_pos'] = end_pos + break + + if prev_blank_line and not line.startswith(continue_space): + # not a continue line, and previous line is blank + break + + src += line + state.cursor = pos + + text += _clean_list_item_text(src, continue_width) + child = 
state.child_state(strip_end(text)) + + block.parse(child, rules) + + if token['tight'] and _is_loose_list(child.tokens): + token['tight'] = False + + token['children'].append({ + 'type': 'list_item', + 'children': child.tokens, + }) + if next_group: + return next_group + + +def _get_list_bullet(c): + if c == '.': + bullet = r'\d{0,9}\.' + elif c == ')': + bullet = r'\d{0,9}\)' + elif c == '*': + bullet = r'\*' + elif c == '+': + bullet = r'\+' + else: + bullet = '-' + return bullet + + +def _compile_list_item_pattern(bullet, leading_width): + if leading_width > 3: + leading_width = 3 + return ( + r'^(?P<listitem_1> {0,' + str(leading_width) + '})' + r'(?P<listitem_2>' + bullet + ')' + r'(?P<listitem_3>[ \t]*|[ \t][^\n]+)$' + ) + + +def _compile_continue_width(text, leading_width): + text = expand_leading_tab(text, 3) + text = expand_tab(text) + + m2 = _LINE_HAS_TEXT.match(text) + if m2: + # indent code, startswith 5 spaces + if text.startswith(' '): + space_width = 1 + else: + space_width = len(m2.group(1)) + + text = text[space_width:] + '\n' + else: + space_width = 1 + text = '' + + continue_width = leading_width + space_width + return text, continue_width + + +def _clean_list_item_text(src, continue_width): + # according to Example 7, tab should be treated as 3 spaces + rv = [] + trim_space = ' ' * continue_width + lines = src.split('\n') + for line in lines: + if line.startswith(trim_space): + line = line.replace(trim_space, '', 1) + # according to CommonMark Example 5 + # tab should be treated as 4 spaces + line = expand_tab(line) + rv.append(line) + else: + rv.append(line) + + return '\n'.join(rv) + + +def _is_loose_list(tokens): + paragraph_count = 0 + for tok in tokens: + if tok['type'] == 'blank_line': + return True + if tok['type'] == 'paragraph': + paragraph_count += 1 + if paragraph_count > 1: + return True diff --git a/src/mistune/markdown.py b/src/mistune/markdown.py new file mode 100644 index 0000000..c814a59 --- /dev/null +++ 
b/src/mistune/markdown.py @@ -0,0 +1,104 @@ +from typing import Optional +from .core import BlockState +from .block_parser import BlockParser +from .inline_parser import InlineParser + + +class Markdown: + """Markdown instance to convert markdown text into HTML or other formats. + Here is an example with the HTMLRenderer:: + + from mistune import HTMLRenderer + + md = Markdown(renderer=HTMLRenderer(escape=False)) + md('hello **world**') + + :param renderer: a renderer to convert parsed tokens + :param block: block level syntax parser + :param inline: inline level syntax parser + :param plugins: mistune plugins to use + """ + def __init__(self, renderer=None, block=None, inline=None, plugins=None): + if block is None: + block = BlockParser() + + if inline is None: + inline = InlineParser() + + self.renderer = renderer + self.block = block + self.inline = inline + self.before_parse_hooks = [] + self.before_render_hooks = [] + self.after_render_hooks = [] + + if plugins: + for plugin in plugins: + plugin(self) + + def use(self, plugin): + plugin(self) + + def render_state(self, state: BlockState): + data = self._iter_render(state.tokens, state) + if self.renderer: + return self.renderer(data, state) + return list(data) + + def _iter_render(self, tokens, state): + for tok in tokens: + if 'children' in tok: + children = self._iter_render(tok['children'], state) + tok['children'] = list(children) + elif 'text' in tok: + text = tok.pop('text') + # process inline text + tok['children'] = self.inline(text.strip(), state.env) + yield tok + + def parse(self, s: str, state: Optional[BlockState]=None): + """Parse and convert the given markdown string. If renderer is None, + the returned **result** will be parsed markdown tokens. 
+ + :param s: markdown string + :param state: instance of BlockState + :returns: result, state + """ + if state is None: + state = self.block.state_cls() + + # normalize line separator + s = s.replace('\r\n', '\n') + s = s.replace('\r', '\n') + + state.process(s) + + for hook in self.before_parse_hooks: + hook(self, state) + + self.block.parse(state) + + for hook in self.before_render_hooks: + hook(self, state) + + result = self.render_state(state) + + for hook in self.after_render_hooks: + result = hook(self, result, state) + return result, state + + def read(self, filepath, encoding='utf-8', state=None): + if state is None: + state = self.block.state_cls() + + state.env['__file__'] = filepath + with open(filepath, 'rb') as f: + s = f.read() + + s = s.decode(encoding) + return self.parse(s, state) + + def __call__(self, s: str): + if s is None: + s = '\n' + return self.parse(s)[0] diff --git a/src/mistune/plugins/__init__.py b/src/mistune/plugins/__init__.py new file mode 100644 index 0000000..a79d727 --- /dev/null +++ b/src/mistune/plugins/__init__.py @@ -0,0 +1,38 @@ +from importlib import import_module + +_plugins = { + 'speedup': 'mistune.plugins.speedup.speedup', + 'strikethrough': 'mistune.plugins.formatting.strikethrough', + 'mark': 'mistune.plugins.formatting.mark', + 'insert': 'mistune.plugins.formatting.insert', + 'superscript': 'mistune.plugins.formatting.superscript', + 'subscript': 'mistune.plugins.formatting.subscript', + 'footnotes': 'mistune.plugins.footnotes.footnotes', + 'table': 'mistune.plugins.table.table', + 'url': 'mistune.plugins.url.url', + 'abbr': 'mistune.plugins.abbr.abbr', + 'def_list': 'mistune.plugins.def_list.def_list', + 'math': 'mistune.plugins.math.math', + 'ruby': 'mistune.plugins.ruby.ruby', + 'task_lists': 'mistune.plugins.task_lists.task_lists', + 'spoiler': 'mistune.plugins.spoiler.spoiler', +} +_cached_modules = {} + + +def import_plugin(name): + if name in _cached_modules: + return _cached_modules[name] + + if 
callable(name): + return name + + if name in _plugins: + module_path, func_name = _plugins[name].rsplit(".", 1) + else: + module_path, func_name = name.rsplit(".", 1) + + module = import_module(module_path) + plugin = getattr(module, func_name) + _cached_modules[name] = plugin + return plugin diff --git a/src/mistune/plugins/abbr.py b/src/mistune/plugins/abbr.py new file mode 100644 index 0000000..1b45790 --- /dev/null +++ b/src/mistune/plugins/abbr.py @@ -0,0 +1,103 @@ +import re +import types +from ..util import escape +from ..helpers import PREVENT_BACKSLASH + +__all__ = ['abbr'] + +# https://michelf.ca/projects/php-markdown/extra/#abbr +REF_ABBR = ( + r'^ {0,3}\*\[(?P<abbr_key>[^\]]+)'+ PREVENT_BACKSLASH + r'\]:' + r'(?P<abbr_text>(?:[ \t]*\n(?: {3,}|\t)[^\n]+)|(?:[^\n]*))$' +) + + +def parse_ref_abbr(block, m, state): + ref = state.env.get('ref_abbrs') + if not ref: + ref = {} + key = m.group('abbr_key') + text = m.group('abbr_text') + ref[key] = text.strip() + state.env['ref_abbrs'] = ref + # abbr definition can split paragraph + state.append_token({'type': 'blank_line'}) + return m.end() + 1 + + +def process_text(inline, text, state): + ref = state.env.get('ref_abbrs') + if not ref: + return state.append_token({'type': 'text', 'raw': text}) + + if state.tokens: + last = state.tokens[-1] + if last['type'] == 'text': + state.tokens.pop() + text = last['raw'] + text + + abbrs_re = state.env.get('abbrs_re') + if not abbrs_re: + abbrs_re = re.compile(r'|'.join(re.escape(k) for k in ref.keys())) + state.env['abbrs_re'] = abbrs_re + + pos = 0 + while pos < len(text): + m = abbrs_re.search(text, pos) + if not m: + break + + end_pos = m.start() + if end_pos > pos: + hole = text[pos:end_pos] + state.append_token({'type': 'text', 'raw': hole}) + + label = m.group(0) + state.append_token({ + 'type': 'abbr', + 'children': [{'type': 'text', 'raw': label}], + 'attrs': {'title': ref[label]} + }) + pos = m.end() + + if pos == 0: + # special case, just pure text + 
state.append_token({'type': 'text', 'raw': text}) + elif pos < len(text): + state.append_token({'type': 'text', 'raw': text[pos:]}) + + +def render_abbr(renderer, text, title): + if not title: + return '<abbr>' + text + '</abbr>' + return '<abbr title="' + escape(title) + '">' + text + '</abbr>' + + +def abbr(md): + """A mistune plugin to support abbreviations, spec defined at + https://michelf.ca/projects/php-markdown/extra/#abbr + + Here is an example: + + .. code-block:: text + + The HTML specification + is maintained by the W3C. + + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + + It will be converted into HTML: + + .. code-block:: html + + The <abbr title="Hyper Text Markup Language">HTML</abbr> specification + is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>. + + :param md: Markdown instance + """ + md.block.register('ref_abbr', REF_ABBR, parse_ref_abbr, before='paragraph') + # replace process_text + md.inline.process_text = types.MethodType(process_text, md.inline) + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('abbr', render_abbr) diff --git a/src/mistune/plugins/def_list.py b/src/mistune/plugins/def_list.py new file mode 100644 index 0000000..3675641 --- /dev/null +++ b/src/mistune/plugins/def_list.py @@ -0,0 +1,135 @@ +import re +from ..util import strip_end + +__all__ = ['def_list'] + +# https://michelf.ca/projects/php-markdown/extra/#def-list + +DEF_PATTERN = ( + r'^(?P<def_list_head>(?:[^\n]+\n)+?)' + r'\n?(?:' + r'\:[ \t]+.*\n' + r'(?:[^\n]+\n)*' # lazy continue line + r'(?:(?:[ \t]*\n)*[ \t]+[^\n]+\n)*' + r'(?:[ \t]*\n)*' + r')+' +) +DEF_RE = re.compile(DEF_PATTERN, re.M) +DD_START_RE = re.compile(r'^:[ \t]+', re.M) +TRIM_RE = re.compile(r'^ {0,4}', re.M) +HAS_BLANK_LINE_RE = re.compile(r'\n[ \t]*\n$') + + +def parse_def_list(block, m, state): + pos = m.end() + children = list(_parse_def_item(block, m)) + + m = DEF_RE.match(state.src, pos) + while m: + 
children.extend(list(_parse_def_item(block, m))) + pos = m.end() + m = DEF_RE.match(state.src, pos) + + state.append_token({ + 'type': 'def_list', + 'children': children, + }) + return pos + + +def _parse_def_item(block, m): + head = m.group('def_list_head') + for line in head.splitlines(): + yield { + 'type': 'def_list_head', + 'text': line, + } + + src = m.group(0) + end = len(head) + + m = DD_START_RE.search(src, end) + start = m.start() + prev_blank_line = src[end:start] == '\n' + while m: + m = DD_START_RE.search(src, start + 1) + if not m: + break + + end = m.start() + text = src[start:end].replace(':', ' ', 1) + children = _process_text(block, text, prev_blank_line) + prev_blank_line = bool(HAS_BLANK_LINE_RE.search(text)) + yield { + 'type': 'def_list_item', + 'children': children, + } + start = end + + text = src[start:].replace(':', ' ', 1) + children = _process_text(block, text, prev_blank_line) + yield { + 'type': 'def_list_item', + 'children': children, + } + + +def _process_text(block, text, loose): + text = TRIM_RE.sub('', text) + state = block.state_cls() + state.process(strip_end(text)) + # use default list rules + block.parse(state, block.list_rules) + tokens = state.tokens + if not loose and len(tokens) == 1 and tokens[0]['type'] == 'paragraph': + tokens[0]['type'] = 'block_text' + return tokens + + +def render_def_list(renderer, text): + return '<dl>\n' + text + '</dl>\n' + + +def render_def_list_head(renderer, text): + return '<dt>' + text + '</dt>\n' + + +def render_def_list_item(renderer, text): + return '<dd>' + text + '</dd>\n' + + +def def_list(md): + """A mistune plugin to support def list, spec defined at + https://michelf.ca/projects/php-markdown/extra/#def-list + + Here is an example: + + .. code-block:: text + + Apple + : Pomaceous fruit of plants of the genus Malus in + the family Rosaceae. + + Orange + : The fruit of an evergreen tree of the genus Citrus. + + It will be converted into HTML: + + .. 
code-block:: html + + <dl> + <dt>Apple</dt> + <dd>Pomaceous fruit of plants of the genus Malus in + the family Rosaceae.</dd> + + <dt>Orange</dt> + <dd>The fruit of an evergreen tree of the genus Citrus.</dd> + </dl> + + :param md: Markdown instance + """ + md.block.register('def_list', DEF_PATTERN, parse_def_list, before='paragraph') + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('def_list', render_def_list) + md.renderer.register('def_list_head', render_def_list_head) + md.renderer.register('def_list_item', render_def_list_item) diff --git a/src/mistune/plugins/footnotes.py b/src/mistune/plugins/footnotes.py new file mode 100644 index 0000000..2e10704 --- /dev/null +++ b/src/mistune/plugins/footnotes.py @@ -0,0 +1,153 @@ +import re +from ..core import BlockState +from ..util import unikey +from ..helpers import LINK_LABEL + +__all__ = ['footnotes'] + +_PARAGRAPH_SPLIT = re.compile(r'\n{2,}') +# https://michelf.ca/projects/php-markdown/extra/#footnotes +REF_FOOTNOTE = ( + r'^(?P<footnote_lead> {0,3})' + r'\[\^(?P<footnote_key>' + LINK_LABEL + r')]:[ \t]' + r'(?P<footnote_text>[^\n]*(?:\n+|$)' + r'(?:(?P=footnote_lead) {1,3}(?! 
)[^\n]*\n+)*' + r')' +) + +INLINE_FOOTNOTE = r'\[\^(?P<footnote_key>' + LINK_LABEL + r')\]' + + +def parse_inline_footnote(inline, m: re.Match, state): + key = unikey(m.group('footnote_key')) + ref = state.env.get('ref_footnotes') + if ref and key in ref: + notes = state.env.get('footnotes') + if not notes: + notes = [] + if key not in notes: + notes.append(key) + state.env['footnotes'] = notes + state.append_token({ + 'type': 'footnote_ref', + 'raw': key, + 'attrs': {'index': notes.index(key) + 1} + }) + else: + state.append_token({'type': 'text', 'raw': m.group(0)}) + return m.end() + + +def parse_ref_footnote(block, m: re.Match, state: BlockState): + ref = state.env.get('ref_footnotes') + if not ref: + ref = {} + + key = unikey(m.group('footnote_key')) + if key not in ref: + ref[key] = m.group('footnote_text') + state.env['ref_footnotes'] = ref + return m.end() + + +def parse_footnote_item(block, key: str, index: int, state: BlockState): + ref = state.env.get('ref_footnotes') + text = ref[key] + + lines = text.splitlines() + second_line = None + for second_line in lines[1:]: + if second_line: + break + + if second_line: + spaces = len(second_line) - len(second_line.lstrip()) + pattern = re.compile(r'^ {' + str(spaces) + r',}', flags=re.M) + text = pattern.sub('', text).strip() + items = _PARAGRAPH_SPLIT.split(text) + children = [{'type': 'paragraph', 'text': s} for s in items] + else: + text = text.strip() + children = [{'type': 'paragraph', 'text': text}] + return { + 'type': 'footnote_item', + 'children': children, + 'attrs': {'key': key, 'index': index} + } + + +def md_footnotes_hook(md, result: str, state: BlockState): + notes = state.env.get('footnotes') + if not notes: + return result + + children = [ + parse_footnote_item(md.block, k, i + 1, state) + for i, k in enumerate(notes) + ] + state = BlockState() + state.tokens = [{'type': 'footnotes', 'children': children}] + output = md.render_state(state) + return result + output + + +def 
render_footnote_ref(renderer, key: str, index: int): + i = str(index) + html = '<sup class="footnote-ref" id="fnref-' + i + '">' + return html + '<a href="#fn-' + i + '">' + i + '</a></sup>' + + +def render_footnotes(renderer, text: str): + return '<section class="footnotes">\n<ol>\n' + text + '</ol>\n</section>\n' + + +def render_footnote_item(renderer, text: str, key: str, index: int): + i = str(index) + back = '<a href="#fnref-' + i + '" class="footnote">↩</a>' + text = text.rstrip()[:-4] + back + '</p>' + return '<li id="fn-' + i + '">' + text + '</li>\n' + + +def footnotes(md): + """A mistune plugin to support footnotes, spec defined at + https://michelf.ca/projects/php-markdown/extra/#footnotes + + Here is an example: + + .. code-block:: text + + That's some text with a footnote.[^1] + + [^1]: And that's the footnote. + + It will be converted into HTML: + + .. code-block:: html + + <p>That's some text with a footnote.<sup class="footnote-ref" id="fnref-1"><a href="#fn-1">1</a></sup></p> + <section class="footnotes"> + <ol> + <li id="fn-1"><p>And that's the footnote.<a href="#fnref-1" class="footnote">↩</a></p></li> + </ol> + </section> + + :param md: Markdown instance + """ + md.inline.register( + 'footnote', + INLINE_FOOTNOTE, + parse_inline_footnote, + before='link', + ) + md.block.register( + 'ref_footnote', + REF_FOOTNOTE, + parse_ref_footnote, + before='ref_link', + ) + md.after_render_hooks.append(md_footnotes_hook) + + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('footnote_ref', render_footnote_ref) + md.renderer.register('footnote_item', render_footnote_item) + md.renderer.register('footnotes', render_footnotes) diff --git a/src/mistune/plugins/formatting.py b/src/mistune/plugins/formatting.py new file mode 100644 index 0000000..57e5def --- /dev/null +++ b/src/mistune/plugins/formatting.py @@ -0,0 +1,173 @@ +import re +from ..helpers import PREVENT_BACKSLASH + +__all__ = ["strikethrough", "mark", "insert", "superscript", 
"subscript"] + +_STRIKE_END = re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\~|[^\s~])~~(?!~)') +_MARK_END = re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\=|[^\s=])==(?!=)') +_INSERT_END = re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\^|[^\s^])\^\^(?!\^)') + +SUPERSCRIPT_PATTERN = r'\^(?:' + PREVENT_BACKSLASH + r'\\\^|\S|\\ )+?\^' +SUBSCRIPT_PATTERN = r'~(?:' + PREVENT_BACKSLASH + r'\\~|\S|\\ )+?~' + + +def parse_strikethrough(inline, m, state): + return _parse_to_end(inline, m, state, 'strikethrough', _STRIKE_END) + + +def render_strikethrough(renderer, text): + return '<del>' + text + '</del>' + + +def parse_mark(inline, m, state): + return _parse_to_end(inline, m, state, 'mark', _MARK_END) + + +def render_mark(renderer, text): + return '<mark>' + text + '</mark>' + + +def parse_insert(inline, m, state): + return _parse_to_end(inline, m, state, 'insert', _INSERT_END) + + +def render_insert(renderer, text): + return '<ins>' + text + '</ins>' + + +def parse_superscript(inline, m, state): + return _parse_script(inline, m, state, 'superscript') + + +def render_superscript(renderer, text): + return '<sup>' + text + '</sup>' + + +def parse_subscript(inline, m, state): + return _parse_script(inline, m, state, 'subscript') + + +def render_subscript(renderer, text): + return '<sub>' + text + '</sub>' + + +def _parse_to_end(inline, m, state, tok_type, end_pattern): + pos = m.end() + m1 = end_pattern.search(state.src, pos) + if not m1: + return + end_pos = m1.end() + text = state.src[pos:end_pos-2] + new_state = state.copy() + new_state.src = text + children = inline.render(new_state) + state.append_token({'type': tok_type, 'children': children}) + return end_pos + + +def _parse_script(inline, m, state, tok_type): + text = m.group(0) + new_state = state.copy() + new_state.src = text[1:-1].replace('\\ ', ' ') + children = inline.render(new_state) + state.append_token({ + 'type': tok_type, + 'children': children + }) + return m.end() + + +def strikethrough(md): + """A mistune 
plugin to support strikethrough. Spec defined by + GitHub flavored Markdown and commonly used by many parsers: + + .. code-block:: text + + ~~This was mistaken text~~ + + It will be converted into HTML: + + .. code-block:: html + + <del>This was mistaken text</del> + + :param md: Markdown instance + """ + md.inline.register( + 'strikethrough', + r'~~(?=[^\s~])', + parse_strikethrough, + before='link', + ) + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('strikethrough', render_strikethrough) + + +def mark(md): + """A mistune plugin to add ``<mark>`` tag. Spec defined at + https://facelessuser.github.io/pymdown-extensions/extensions/mark/: + + .. code-block:: text + + ==mark me== ==mark \\=\\= equal== + + :param md: Markdown instance + """ + md.inline.register( + 'mark', + r'==(?=[^\s=])', + parse_mark, + before='link', + ) + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('mark', render_mark) + + +def insert(md): + """A mistune plugin to add ``<ins>`` tag. Spec defined at + https://facelessuser.github.io/pymdown-extensions/extensions/caret/#insert: + + .. code-block:: text + + ^^insert me^^ + + :param md: Markdown instance + """ + md.inline.register( + 'insert', + r'\^\^(?=[^\s\^])', + parse_insert, + before='link', + ) + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('insert', render_insert) + + +def superscript(md): + """A mistune plugin to add ``<sup>`` tag. Spec defined at + https://pandoc.org/MANUAL.html#superscripts-and-subscripts: + + .. code-block:: text + + 2^10^ is 1024. + + :param md: Markdown instance + """ + md.inline.register('superscript', SUPERSCRIPT_PATTERN, parse_superscript, before='linebreak') + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('superscript', render_superscript) + + +def subscript(md): + """A mistune plugin to add ``<sub>`` tag. Spec defined at + https://pandoc.org/MANUAL.html#superscripts-and-subscripts: + + .. 
code-block:: text + + H~2~O is a liquid. + + :param md: Markdown instance + """ + md.inline.register('subscript', SUBSCRIPT_PATTERN, parse_subscript, before='linebreak') + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('subscript', render_subscript) diff --git a/src/mistune/plugins/math.py b/src/mistune/plugins/math.py new file mode 100644 index 0000000..805105e --- /dev/null +++ b/src/mistune/plugins/math.py @@ -0,0 +1,57 @@ +__all__ = ['math', 'math_in_quote', 'math_in_list'] + +BLOCK_MATH_PATTERN = r'^ {0,3}\$\$[ \t]*\n(?P<math_text>[\s\S]+?)\n\$\$[ \t]*$' +INLINE_MATH_PATTERN = r'\$(?!\s)(?P<math_text>.+?)(?!\s)\$' + + +def parse_block_math(block, m, state): + text = m.group('math_text') + state.append_token({'type': 'block_math', 'raw': text}) + return m.end() + 1 + + +def parse_inline_math(inline, m, state): + text = m.group('math_text') + state.append_token({'type': 'inline_math', 'raw': text}) + return m.end() + + +def render_block_math(renderer, text): + return '<div class="math">$$\n' + text + '\n$$</div>\n' + + +def render_inline_math(renderer, text): + return r'<span class="math">\(' + text + r'\)</span>' + + +def math(md): + """A mistune plugin to support math. The syntax is used + by many markdown extensions: + + .. 
code-block:: text + + Block math is surrounded by $$: + + $$ + f(a)=f(b) + $$ + + Inline math is surrounded by `$`, such as $f(a)=f(b)$ + + :param md: Markdown instance + """ + md.block.register('block_math', BLOCK_MATH_PATTERN, parse_block_math, before='list') + md.inline.register('inline_math', INLINE_MATH_PATTERN, parse_inline_math, before='link') + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('block_math', render_block_math) + md.renderer.register('inline_math', render_inline_math) + + +def math_in_quote(md): + """Enable block math plugin in block quote.""" + md.block.insert_rule(md.block.block_quote_rules, 'block_math', before='list') + + +def math_in_list(md): + """Enable block math plugin in list.""" + md.block.insert_rule(md.block.list_rules, 'block_math', before='list') diff --git a/src/mistune/plugins/ruby.py b/src/mistune/plugins/ruby.py new file mode 100644 index 0000000..eabc037 --- /dev/null +++ b/src/mistune/plugins/ruby.py @@ -0,0 +1,100 @@ +import re +from ..util import unikey +from ..helpers import parse_link, parse_link_label + + +RUBY_PATTERN = r'\[(?:\w+\(\w+\))+\]' +_ruby_re = re.compile(RUBY_PATTERN) + + +def parse_ruby(inline, m, state): + text = m.group(0)[1:-2] + items = text.split(')') + tokens = [] + for item in items: + rb, rt = item.split('(') + tokens.append({ + 'type': 'ruby', + 'raw': rb, + 'attrs': {'rt': rt} + }) + + end_pos = m.end() + + next_match = _ruby_re.match(state.src, end_pos) + if next_match: + for tok in tokens: + state.append_token(tok) + return parse_ruby(inline, next_match, state) + + # repeat link logic + if end_pos < len(state.src): + link_pos = _parse_ruby_link(inline, state, end_pos, tokens) + if link_pos: + return link_pos + + for tok in tokens: + state.append_token(tok) + return end_pos + + +def _parse_ruby_link(inline, state, pos, tokens): + c = state.src[pos] + if c == '(': + # standard link [text](<url> "title") + attrs, link_pos = parse_link(state.src, pos + 1) + if link_pos: + 
state.append_token({ + 'type': 'link', + 'children': tokens, + 'attrs': attrs, + }) + return link_pos + + elif c == '[': + # standard ref link [text][label] + label, link_pos = parse_link_label(state.src, pos + 1) + if label and link_pos: + ref_links = state.env['ref_links'] + key = unikey(label) + env = ref_links.get(key) + if env: + attrs = {'url': env['url'], 'title': env.get('title')} + state.append_token({ + 'type': 'link', + 'children': tokens, + 'attrs': attrs, + }) + else: + for tok in tokens: + state.append_token(tok) + state.append_token({ + 'type': 'text', + 'raw': '[' + label + ']', + }) + return link_pos + + +def render_ruby(renderer, text, rt): + return '<ruby><rb>' + text + '</rb><rt>' + rt + '</rt></ruby>' + + +def ruby(md): + """A mistune plugin to support ``<ruby>`` tag. The syntax is defined + at https://lepture.com/en/2022/markdown-ruby-markup: + + .. code-block:: text + + [漢字(ㄏㄢˋㄗˋ)] + [漢(ㄏㄢˋ)字(ㄗˋ)] + + [漢字(ㄏㄢˋㄗˋ)][link] + [漢字(ㄏㄢˋㄗˋ)](/url "title") + + [link]: /url "title" + + :param md: Markdown instance + """ + md.inline.register('ruby', RUBY_PATTERN, parse_ruby, before='link') + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('ruby', render_ruby) diff --git a/src/mistune/plugins/speedup.py b/src/mistune/plugins/speedup.py new file mode 100644 index 0000000..784022c --- /dev/null +++ b/src/mistune/plugins/speedup.py @@ -0,0 +1,44 @@ +import re +import string + +# because mismatch is too slow, add parsers for paragraph and text + +HARD_LINEBREAK_RE = re.compile(r' *\n\s*') +PARAGRAPH = ( + # start with none punctuation, not number, not whitespace + r'(?:^[^\s\d' + re.escape(string.punctuation) + r'][^\n]*\n)+' +) + +__all__ = ['speedup'] + + + +def parse_text(inline, m, state): + text = m.group(0) + text = HARD_LINEBREAK_RE.sub('\n', text) + inline.process_text(text, state) + return m.end() + + +def parse_paragraph(block, m, state): + text = m.group(0) + state.add_paragraph(text) + return m.end() + + +def speedup(md): + 
"""Increase the speed of parsing paragraph and inline text.""" + md.block.register('paragraph', PARAGRAPH, parse_paragraph) + + punc = r'\\><!\[_*`~\^\$=' + text_pattern = r'[\s\S]+?(?=[' + punc + r']|' + if 'url_link' in md.inline.rules: + text_pattern += 'https?:|' + + if md.inline.hard_wrap: + text_pattern += r' *\n|' + else: + text_pattern += r' {2,}\n|' + + text_pattern += r'$)' + md.inline.register('text', text_pattern, parse_text) diff --git a/src/mistune/plugins/spoiler.py b/src/mistune/plugins/spoiler.py new file mode 100644 index 0000000..2931d2b --- /dev/null +++ b/src/mistune/plugins/spoiler.py @@ -0,0 +1,80 @@ +import re + +__all__ = ['spoiler'] + +_BLOCK_SPOILER_START = re.compile(r'^ {0,3}! ?', re.M) +_BLOCK_SPOILER_MATCH = re.compile(r'^( {0,3}![^\n]*\n)+$') + +INLINE_SPOILER_PATTERN = r'>!\s*(?P<spoiler_text>.+?)\s*!<' + + +def parse_block_spoiler(block, m, state): + text, end_pos = block.extract_block_quote(m, state) + if not text.endswith('\n'): + # ensure it endswith \n to make sure + # _BLOCK_SPOILER_MATCH.match works + text += '\n' + + depth = state.depth() + if not depth and _BLOCK_SPOILER_MATCH.match(text): + text = _BLOCK_SPOILER_START.sub('', text) + tok_type = 'block_spoiler' + else: + tok_type = 'block_quote' + + # scan children state + child = state.child_state(text) + if state.depth() >= block.max_nested_level - 1: + rules = list(block.block_quote_rules) + rules.remove('block_quote') + else: + rules = block.block_quote_rules + + block.parse(child, rules) + token = {'type': tok_type, 'children': child.tokens} + if end_pos: + state.prepend_token(token) + return end_pos + state.append_token(token) + return state.cursor + + +def parse_inline_spoiler(inline, m, state): + text = m.group('spoiler_text') + new_state = state.copy() + new_state.src = text + children = inline.render(new_state) + state.append_token({'type': 'inline_spoiler', 'children': children}) + return m.end() + + +def render_block_spoiler(renderer, text): + return '<div 
class="spoiler">\n' + text + '</div>\n' + + +def render_inline_spoiler(renderer, text): + return '<span class="spoiler">' + text + '</span>' + + +def spoiler(md): + """A mistune plugin to support block and inline spoiler. The + syntax is inspired by stackexchange: + + .. code-block:: text + + Block level spoiler looks like block quote, but with `>!`: + + >! this is spoiler + >! + >! the content will be hidden + + Inline spoiler is surrounded by `>!` and `!<`, such as >! hide me !<. + + :param md: Markdown instance + """ + # reset block quote parser with block spoiler parser + md.block.register('block_quote', None, parse_block_spoiler) + md.inline.register('inline_spoiler', INLINE_SPOILER_PATTERN, parse_inline_spoiler) + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('block_spoiler', render_block_spoiler) + md.renderer.register('inline_spoiler', render_inline_spoiler) diff --git a/src/mistune/plugins/table.py b/src/mistune/plugins/table.py new file mode 100644 index 0000000..d3bc4c2 --- /dev/null +++ b/src/mistune/plugins/table.py @@ -0,0 +1,179 @@ +import re +from ..helpers import PREVENT_BACKSLASH + +# https://michelf.ca/projects/php-markdown/extra/#table + +__all__ = ['table', 'table_in_quote', 'table_in_list'] + + +TABLE_PATTERN = ( + r'^ {0,3}\|(?P<table_head>.+)\|[ \t]*\n' + r' {0,3}\|(?P<table_align> *[-:]+[-| :]*)\|[ \t]*\n' + r'(?P<table_body>(?: {0,3}\|.*\|[ \t]*(?:\n|$))*)\n*' +) +NP_TABLE_PATTERN = ( + r'^ {0,3}(?P<nptable_head>\S.*\|.*)\n' + r' {0,3}(?P<nptable_align>[-:]+ *\|[-| :]*)\n' + r'(?P<nptable_body>(?:.*\|.*(?:\n|$))*)\n*' +) + +TABLE_CELL = re.compile(r'^ {0,3}\|(.+)\|[ \t]*$') +CELL_SPLIT = re.compile(r' *' + PREVENT_BACKSLASH + r'\| *') +ALIGN_CENTER = re.compile(r'^ *:-+: *$') +ALIGN_LEFT = re.compile(r'^ *:-+ *$') +ALIGN_RIGHT = re.compile(r'^ *-+: *$') + + +def parse_table(block, m, state): + pos = m.end() + header = m.group('table_head') + align = m.group('table_align') + thead, aligns = _process_thead(header, 
align) + if not thead: + return + + rows = [] + body = m.group('table_body') + for text in body.splitlines(): + m = TABLE_CELL.match(text) + if not m: # pragma: no cover + return + row = _process_row(m.group(1), aligns) + if not row: + return + rows.append(row) + + children = [thead, {'type': 'table_body', 'children': rows}] + state.append_token({'type': 'table', 'children': children}) + return pos + + +def parse_nptable(block, m, state): + header = m.group('nptable_head') + align = m.group('nptable_align') + thead, aligns = _process_thead(header, align) + if not thead: + return + + rows = [] + body = m.group('nptable_body') + for text in body.splitlines(): + row = _process_row(text, aligns) + if not row: + return + rows.append(row) + + children = [thead, {'type': 'table_body', 'children': rows}] + state.append_token({'type': 'table', 'children': children}) + return m.end() + + +def _process_thead(header, align): + headers = CELL_SPLIT.split(header) + aligns = CELL_SPLIT.split(align) + if len(headers) != len(aligns): + return None, None + + for i, v in enumerate(aligns): + if ALIGN_CENTER.match(v): + aligns[i] = 'center' + elif ALIGN_LEFT.match(v): + aligns[i] = 'left' + elif ALIGN_RIGHT.match(v): + aligns[i] = 'right' + else: + aligns[i] = None + + children = [ + { + 'type': 'table_cell', + 'text': text.strip(), + 'attrs': {'align': aligns[i], 'head': True} + } + for i, text in enumerate(headers) + ] + thead = {'type': 'table_head', 'children': children} + return thead, aligns + + +def _process_row(text, aligns): + cells = CELL_SPLIT.split(text) + if len(cells) != len(aligns): + return None + + children = [ + { + 'type': 'table_cell', + 'text': text.strip(), + 'attrs': {'align': aligns[i], 'head': False} + } + for i, text in enumerate(cells) + ] + return {'type': 'table_row', 'children': children} + + +def render_table(renderer, text): + return '<table>\n' + text + '</table>\n' + + +def render_table_head(renderer, text): + return '<thead>\n<tr>\n' + text + 
'</tr>\n</thead>\n' + + +def render_table_body(renderer, text): + return '<tbody>\n' + text + '</tbody>\n' + + +def render_table_row(renderer, text): + return '<tr>\n' + text + '</tr>\n' + + +def render_table_cell(renderer, text, align=None, head=False): + if head: + tag = 'th' + else: + tag = 'td' + + html = ' <' + tag + if align: + html += ' style="text-align:' + align + '"' + + return html + '>' + text + '</' + tag + '>\n' + + +def table(md): + """A mistune plugin to support table, spec defined at + https://michelf.ca/projects/php-markdown/extra/#table + + Here is an example: + + .. code-block:: text + + First Header | Second Header + ------------- | ------------- + Content Cell | Content Cell + Content Cell | Content Cell + + :param md: Markdown instance + """ + md.block.register('table', TABLE_PATTERN, parse_table, before='paragraph') + md.block.register('nptable', NP_TABLE_PATTERN, parse_nptable, before='paragraph') + + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('table', render_table) + md.renderer.register('table_head', render_table_head) + md.renderer.register('table_body', render_table_body) + md.renderer.register('table_row', render_table_row) + md.renderer.register('table_cell', render_table_cell) + + +def table_in_quote(md): + """Enable table plugin in block quotes.""" + md.block.insert_rule(md.block.block_quote_rules, 'table', before='paragraph') + md.block.insert_rule(md.block.block_quote_rules, 'nptable', before='paragraph') + + +def table_in_list(md): + """Enable table plugin in list.""" + md.block.insert_rule(md.block.list_rules, 'table', before='paragraph') + md.block.insert_rule(md.block.list_rules, 'nptable', before='paragraph') diff --git a/src/mistune/plugins/task_lists.py b/src/mistune/plugins/task_lists.py new file mode 100644 index 0000000..8571c32 --- /dev/null +++ b/src/mistune/plugins/task_lists.py @@ -0,0 +1,67 @@ +import re + +__all__ = ['task_lists'] + + +TASK_LIST_ITEM = re.compile(r'^(\[[ xX]\])\s+') + + 
+def task_lists_hook(md, state): + return _rewrite_all_list_items(state.tokens) + + +def render_task_list_item(renderer, text, checked=False): + checkbox = ( + '<input class="task-list-item-checkbox" ' + 'type="checkbox" disabled' + ) + if checked: + checkbox += ' checked/>' + else: + checkbox += '/>' + + if text.startswith('<p>'): + text = text.replace('<p>', '<p>' + checkbox, 1) + else: + text = checkbox + text + + return '<li class="task-list-item">' + text + '</li>\n' + + +def task_lists(md): + """A mistune plugin to support task lists. Spec defined by + GitHub flavored Markdown and commonly used by many parsers: + + .. code-block:: text + + - [ ] unchecked task + - [x] checked task + + :param md: Markdown instance + """ + md.before_render_hooks.append(task_lists_hook) + if md.renderer and md.renderer.NAME == 'html': + md.renderer.register('task_list_item', render_task_list_item) + + +def _rewrite_all_list_items(tokens): + for tok in tokens: + if tok['type'] == 'list_item': + _rewrite_list_item(tok) + if 'children' in tok: + _rewrite_all_list_items(tok['children']) + return tokens + + +def _rewrite_list_item(tok): + children = tok['children'] + if children: + first_child = children[0] + text = first_child.get('text', '') + m = TASK_LIST_ITEM.match(text) + if m: + mark = m.group(1) + first_child['text'] = text[m.end():] + + tok['type'] = 'task_list_item' + tok['attrs'] = {'checked': mark != '[ ]'} diff --git a/src/mistune/plugins/url.py b/src/mistune/plugins/url.py new file mode 100644 index 0000000..d6f2251 --- /dev/null +++ b/src/mistune/plugins/url.py @@ -0,0 +1,23 @@ +from ..util import escape_url + +__all__ = ['url'] + +URL_LINK_PATTERN = r'''https?:\/\/[^\s<]+[^<.,:;"')\]\s]''' + + +def parse_url_link(inline, m, state): + text = m.group(0) + pos = m.end() + if state.in_link: + inline.process_text(text, state) + return pos + state.append_token({ + 'type': 'link', + 'children': [{'type': 'text', 'raw': text}], + 'attrs': {'url': escape_url(text)}, + }) + 
return pos + + +def url(md): + md.inline.register('url_link', URL_LINK_PATTERN, parse_url_link) diff --git a/src/mistune/renderers/__init__.py b/src/mistune/renderers/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/mistune/renderers/__init__.py diff --git a/src/mistune/renderers/_list.py b/src/mistune/renderers/_list.py new file mode 100644 index 0000000..0a18639 --- /dev/null +++ b/src/mistune/renderers/_list.py @@ -0,0 +1,60 @@ +from ..util import strip_end + + +def render_list(renderer, token, state) -> str: + attrs = token['attrs'] + if attrs['ordered']: + children = _render_ordered_list(renderer, token, state) + else: + children = _render_unordered_list(renderer, token, state) + + text = ''.join(children) + parent = token.get('parent') + if parent: + if parent['tight']: + return text + return text + '\n' + return strip_end(text) + '\n' + + +def _render_list_item(renderer, parent, item, state): + leading = parent['leading'] + text = '' + for tok in item['children']: + if tok['type'] == 'list': + tok['parent'] = parent + elif tok['type'] == 'blank_line': + continue + text += renderer.render_token(tok, state) + + lines = text.splitlines() + text = lines[0] + '\n' + prefix = ' ' * len(leading) + for line in lines[1:]: + if line: + text += prefix + line + '\n' + else: + text += '\n' + return leading + text + + +def _render_ordered_list(renderer, token, state): + attrs = token['attrs'] + start = attrs.get('start', 1) + for item in token['children']: + leading = str(start) + token['bullet'] + ' ' + parent = { + 'leading': leading, + 'tight': token['tight'], + } + yield _render_list_item(renderer, parent, item, state) + start += 1 + + +def _render_unordered_list(renderer, token, state): + parent = { + 'leading': token['bullet'] + ' ', + 'tight': token['tight'], + } + for item in token['children']: + yield _render_list_item(renderer, parent, item, state) diff --git a/src/mistune/renderers/html.py b/src/mistune/renderers/html.py new file 
mode 100644 index 0000000..c458a4a --- /dev/null +++ b/src/mistune/renderers/html.py @@ -0,0 +1,151 @@ +from ..core import BaseRenderer +from ..util import escape as escape_text, striptags, safe_entity + + +class HTMLRenderer(BaseRenderer): + """A renderer for converting Markdown to HTML.""" + NAME = 'html' + HARMFUL_PROTOCOLS = ( + 'javascript:', + 'vbscript:', + 'file:', + 'data:', + ) + GOOD_DATA_PROTOCOLS = ( + 'data:image/gif;', + 'data:image/png;', + 'data:image/jpeg;', + 'data:image/webp;', + ) + + def __init__(self, escape=True, allow_harmful_protocols=None): + super(HTMLRenderer, self).__init__() + self._allow_harmful_protocols = allow_harmful_protocols + self._escape = escape + + def render_token(self, token, state): + # backward compitable with v2 + func = self._get_method(token['type']) + attrs = token.get('attrs') + + if 'raw' in token: + text = token['raw'] + elif 'children' in token: + text = self.render_tokens(token['children'], state) + else: + if attrs: + return func(**attrs) + else: + return func() + if attrs: + return func(text, **attrs) + else: + return func(text) + + def safe_url(self, url: str) -> str: + """Ensure the given URL is safe. This method is used for rendering + links, images, and etc. 
+ """ + if self._allow_harmful_protocols is True: + return url + + _url = url.lower() + if self._allow_harmful_protocols and \ + _url.startswith(tuple(self._allow_harmful_protocols)): + return url + + if _url.startswith(self.HARMFUL_PROTOCOLS) and \ + not _url.startswith(self.GOOD_DATA_PROTOCOLS): + return '#harmful-link' + return url + + def text(self, text: str) -> str: + if self._escape: + return escape_text(text) + return safe_entity(text) + + def emphasis(self, text: str) -> str: + return '<em>' + text + '</em>' + + def strong(self, text: str) -> str: + return '<strong>' + text + '</strong>' + + def link(self, text: str, url: str, title=None) -> str: + s = '<a href="' + self.safe_url(url) + '"' + if title: + s += ' title="' + safe_entity(title) + '"' + return s + '>' + text + '</a>' + + def image(self, text: str, url: str, title=None) -> str: + src = self.safe_url(url) + alt = escape_text(striptags(text)) + s = '<img src="' + src + '" alt="' + alt + '"' + if title: + s += ' title="' + safe_entity(title) + '"' + return s + ' />' + + def codespan(self, text: str) -> str: + return '<code>' + text + '</code>' + + def linebreak(self) -> str: + return '<br />\n' + + def softbreak(self) -> str: + return '\n' + + def inline_html(self, html: str) -> str: + if self._escape: + return escape_text(html) + return html + + def paragraph(self, text: str) -> str: + return '<p>' + text + '</p>\n' + + def heading(self, text: str, level: int, **attrs) -> str: + tag = 'h' + str(level) + html = '<' + tag + _id = attrs.get('id') + if _id: + html += ' id="' + _id + '"' + return html + '>' + text + '</' + tag + '>\n' + + def blank_line(self) -> str: + return '' + + def thematic_break(self) -> str: + return '<hr />\n' + + def block_text(self, text: str) -> str: + return text + + def block_code(self, code: str, info=None) -> str: + html = '<pre><code' + if info is not None: + info = safe_entity(info.strip()) + if info: + lang = info.split(None, 1)[0] + html += ' class="language-' + lang 
+ '"' + return html + '>' + escape_text(code) + '</code></pre>\n' + + def block_quote(self, text: str) -> str: + return '<blockquote>\n' + text + '</blockquote>\n' + + def block_html(self, html: str) -> str: + if self._escape: + return '<p>' + escape_text(html) + '</p>\n' + return html + '\n' + + def block_error(self, text: str) -> str: + return '<div class="error"><pre>' + text + '</pre></div>\n' + + def list(self, text: str, ordered: bool, **attrs) -> str: + if ordered: + html = '<ol' + start = attrs.get('start') + if start is not None: + html += ' start="' + str(start) + '"' + return html + '>\n' + text + '</ol>\n' + return '<ul>\n' + text + '</ul>\n' + + def list_item(self, text: str) -> str: + return '<li>' + text + '</li>\n' diff --git a/src/mistune/renderers/markdown.py b/src/mistune/renderers/markdown.py new file mode 100644 index 0000000..78334bc --- /dev/null +++ b/src/mistune/renderers/markdown.py @@ -0,0 +1,146 @@ +import re +from typing import Dict, Any +from textwrap import indent +from ._list import render_list +from ..core import BaseRenderer, BlockState +from ..util import strip_end + +fenced_re = re.compile(r'^(?:`|~)+', re.M) + + +class MarkdownRenderer(BaseRenderer): + """A renderer to re-format Markdown text.""" + NAME = 'markdown' + + def __call__(self, tokens, state: BlockState): + out = self.render_tokens(tokens, state) + # special handle for line breaks + out += '\n\n'.join(self.render_referrences(state)) + '\n' + return strip_end(out) + + def render_referrences(self, state: BlockState): + ref_links = state.env['ref_links'] + for key in ref_links: + attrs = ref_links[key] + text = '[' + attrs['label'] + ']: ' + attrs['url'] + title = attrs.get('title') + if title: + text += ' "' + title + '"' + yield text + + def render_children(self, token, state: BlockState): + children = token['children'] + return self.render_tokens(children, state) + + def text(self, token: Dict[str, Any], state: BlockState) -> str: + return token['raw'] + + def 
emphasis(self, token: Dict[str, Any], state: BlockState) -> str: + return '*' + self.render_children(token, state) + '*' + + def strong(self, token: Dict[str, Any], state: BlockState) -> str: + return '**' + self.render_children(token, state) + '**' + + def link(self, token: Dict[str, Any], state: BlockState) -> str: + label = token.get('label') + text = self.render_children(token, state) + out = '[' + text + ']' + if label: + return out + '[' + label + ']' + + attrs = token['attrs'] + url = attrs['url'] + title = attrs.get('title') + if text == url and not title: + return '<' + text + '>' + elif 'mailto:' + text == url and not title: + return '<' + text + '>' + + out += '(' + if '(' in url or ')' in url: + out += '<' + url + '>' + else: + out += url + if title: + out += ' "' + title + '"' + return out + ')' + + def image(self, token: Dict[str, Any], state: BlockState) -> str: + return '!' + self.link(token, state) + + def codespan(self, token: Dict[str, Any], state: BlockState) -> str: + return '`' + token['raw'] + '`' + + def linebreak(self, token: Dict[str, Any], state: BlockState) -> str: + return ' \n' + + def softbreak(self, token: Dict[str, Any], state: BlockState) -> str: + return '\n' + + def blank_line(self, token: Dict[str, Any], state: BlockState) -> str: + return '' + + def inline_html(self, token: Dict[str, Any], state: BlockState) -> str: + return token['raw'] + + def paragraph(self, token: Dict[str, Any], state: BlockState) -> str: + text = self.render_children(token, state) + return text + '\n\n' + + def heading(self, token: Dict[str, Any], state: BlockState) -> str: + level = token['attrs']['level'] + marker = '#' * level + text = self.render_children(token, state) + return marker + ' ' + text + '\n\n' + + def thematic_break(self, token: Dict[str, Any], state: BlockState) -> str: + return '***\n\n' + + def block_text(self, token: Dict[str, Any], state: BlockState) -> str: + return self.render_children(token, state) + '\n' + + def block_code(self, 
token: Dict[str, Any], state: BlockState) -> str: + attrs = token.get('attrs', {}) + info = attrs.get('info', '') + code = token['raw'] + if code and code[-1] != '\n': + code += '\n' + + marker = token.get('marker') + if not marker: + marker = _get_fenced_marker(code) + return marker + info + '\n' + code + marker + '\n\n' + + def block_quote(self, token: Dict[str, Any], state: BlockState) -> str: + text = indent(self.render_children(token, state), '> ') + return text + '\n\n' + + def block_html(self, token: Dict[str, Any], state: BlockState) -> str: + return token['raw'] + '\n\n' + + def block_error(self, token: Dict[str, Any], state: BlockState) -> str: + return '' + + def list(self, token: Dict[str, Any], state: BlockState) -> str: + return render_list(self, token, state) + + +def _get_fenced_marker(code): + found = fenced_re.findall(code) + if not found: + return '```' + + ticks = [] # ` + waves = [] # ~ + for s in found: + if s[0] == '`': + ticks.append(len(s)) + else: + waves.append(len(s)) + + if not ticks: + return '```' + + if not waves: + return '~~~' + return '`' * (max(ticks) + 1) diff --git a/src/mistune/renderers/rst.py b/src/mistune/renderers/rst.py new file mode 100644 index 0000000..fa12c21 --- /dev/null +++ b/src/mistune/renderers/rst.py @@ -0,0 +1,147 @@ +from typing import Dict, Any +from textwrap import indent +from ._list import render_list +from ..core import BaseRenderer, BlockState +from ..util import strip_end + + +class RSTRenderer(BaseRenderer): + """A renderer for converting Markdown to ReST.""" + NAME = 'rst' + + #: marker symbols for heading + HEADING_MARKERS = { + 1: '=', + 2: '-', + 3: '~', + 4: '^', + 5: '"', + 6: "'", + } + INLINE_IMAGE_PREFIX = 'img-' + + def iter_tokens(self, tokens, state): + prev = None + for tok in tokens: + # ignore blank line + if tok['type'] == 'blank_line': + continue + tok['prev'] = prev + prev = tok + yield self.render_token(tok, state) + + def __call__(self, tokens, state: BlockState): + 
state.env['inline_images'] = [] + out = self.render_tokens(tokens, state) + # special handle for line breaks + out += '\n\n'.join(self.render_referrences(state)) + '\n' + return strip_end(out) + + def render_referrences(self, state: BlockState): + images = state.env['inline_images'] + for index, token in enumerate(images): + attrs = token['attrs'] + alt = self.render_children(token, state) + ident = self.INLINE_IMAGE_PREFIX + str(index) + yield '.. |' + ident + '| image:: ' + attrs['url'] + '\n :alt: ' + alt + + def render_children(self, token, state: BlockState): + children = token['children'] + return self.render_tokens(children, state) + + def text(self, token: Dict[str, Any], state: BlockState) -> str: + text = token['raw'] + return text.replace('|', r'\|') + + def emphasis(self, token: Dict[str, Any], state: BlockState) -> str: + return '*' + self.render_children(token, state) + '*' + + def strong(self, token: Dict[str, Any], state: BlockState) -> str: + return '**' + self.render_children(token, state) + '**' + + def link(self, token: Dict[str, Any], state: BlockState) -> str: + attrs = token['attrs'] + text = self.render_children(token, state) + return '`' + text + ' <' + attrs['url'] + '>`__' + + def image(self, token: Dict[str, Any], state: BlockState) -> str: + refs: list = state.env['inline_images'] + index = len(refs) + refs.append(token) + return '|' + self.INLINE_IMAGE_PREFIX + str(index) + '|' + + def codespan(self, token: Dict[str, Any], state: BlockState) -> str: + return '``' + token['raw'] + '``' + + def linebreak(self, token: Dict[str, Any], state: BlockState) -> str: + return '<linebreak>' + + def softbreak(self, token: Dict[str, Any], state: BlockState) -> str: + return ' ' + + def inline_html(self, token: Dict[str, Any], state: BlockState) -> str: + # rst does not support inline html + return '' + + def paragraph(self, token: Dict[str, Any], state: BlockState) -> str: + children = token['children'] + if len(children) == 1 and 
children[0]['type'] == 'image': + image = children[0] + attrs = image['attrs'] + title = attrs.get('title') + alt = self.render_children(image, state) + text = '.. figure:: ' + attrs['url'] + if title: + text += '\n :alt: ' + title + text += '\n\n' + indent(alt, ' ') + else: + text = self.render_tokens(children, state) + lines = text.split('<linebreak>') + if len(lines) > 1: + text = '\n'.join('| ' + line for line in lines) + return text + '\n\n' + + def heading(self, token: Dict[str, Any], state: BlockState) -> str: + attrs = token['attrs'] + text = self.render_children(token, state) + marker = self.HEADING_MARKERS[attrs['level']] + return text + '\n' + marker * len(text) + '\n\n' + + def thematic_break(self, token: Dict[str, Any], state: BlockState) -> str: + return '--------------\n\n' + + def block_text(self, token: Dict[str, Any], state: BlockState) -> str: + return self.render_children(token, state) + '\n' + + def block_code(self, token: Dict[str, Any], state: BlockState) -> str: + attrs = token.get('attrs', {}) + info = attrs.get('info') + code = indent(token['raw'], ' ') + if info: + lang = info.split()[0] + return '.. code:: ' + lang + '\n\n' + code + '\n' + else: + return '::\n\n' + code + '\n\n' + + def block_quote(self, token: Dict[str, Any], state: BlockState) -> str: + text = indent(self.render_children(token, state), ' ') + prev = token['prev'] + ignore_blocks = ( + 'paragraph', + 'thematic_break', + 'linebreak', + 'heading', + ) + if prev and prev['type'] not in ignore_blocks: + text = '..\n\n' + text + return text + + def block_html(self, token: Dict[str, Any], state: BlockState) -> str: + raw = token['raw'] + return '.. 
raw:: html\n\n' + indent(raw, ' ') + '\n\n' + + def block_error(self, token: Dict[str, Any], state: BlockState) -> str: + return '' + + def list(self, token: Dict[str, Any], state: BlockState) -> str: + return render_list(self, token, state) diff --git a/src/mistune/toc.py b/src/mistune/toc.py new file mode 100644 index 0000000..c908b0c --- /dev/null +++ b/src/mistune/toc.py @@ -0,0 +1,111 @@ +from .util import striptags + + +def add_toc_hook(md, min_level=1, max_level=3, heading_id=None): + """Add a hook to save toc items into ``state.env``. This is + usually helpful for doc generator:: + + import mistune + from mistune.toc import add_toc_hook, render_toc_ul + + md = mistune.create_markdown(...) + add_toc_hook(md, level, heading_id) + + html, state = md.parse(text) + toc_items = state.env['toc_items'] + toc_html = render_toc_ul(toc_items) + + :param md: Markdown instance + :param min_level: min heading level + :param max_level: max heading level + :param heading_id: a function to generate heading_id + """ + if heading_id is None: + def heading_id(token, index): + return 'toc_' + str(index + 1) + + def toc_hook(md, state): + headings = [] + + for tok in state.tokens: + if tok['type'] == 'heading': + level = tok['attrs']['level'] + if min_level <= level <= max_level: + headings.append(tok) + + toc_items = [] + for i, tok in enumerate(headings): + tok['attrs']['id'] = heading_id(tok, i) + toc_items.append(normalize_toc_item(md, tok)) + + # save items into state + state.env['toc_items'] = toc_items + + md.before_render_hooks.append(toc_hook) + + +def normalize_toc_item(md, token): + text = token['text'] + tokens = md.inline(text, {}) + html = md.renderer(tokens, {}) + text = striptags(html) + attrs = token['attrs'] + return attrs['level'], attrs['id'], text + + +def render_toc_ul(toc): + """Render a <ul> table of content HTML. 
The param "toc" should + be formatted into this structure:: + + [ + (level, id, text), + ] + + For example:: + + [ + (1, 'toc-intro', 'Introduction'), + (2, 'toc-install', 'Install'), + (2, 'toc-upgrade', 'Upgrade'), + (1, 'toc-license', 'License'), + ] + """ + if not toc: + return '' + + s = '<ul>\n' + levels = [] + for level, k, text in toc: + item = '<a href="#{}">{}</a>'.format(k, text) + if not levels: + s += '<li>' + item + levels.append(level) + elif level == levels[-1]: + s += '</li>\n<li>' + item + elif level > levels[-1]: + s += '\n<ul>\n<li>' + item + levels.append(level) + else: + levels.pop() + while levels: + last_level = levels.pop() + if level == last_level: + s += '</li>\n</ul>\n</li>\n<li>' + item + levels.append(level) + break + elif level > last_level: + s += '</li>\n<li>' + item + levels.append(last_level) + levels.append(level) + break + else: + s += '</li>\n</ul>\n' + else: + levels.append(level) + s += '</li>\n<li>' + item + + while len(levels) > 1: + s += '</li>\n</ul>\n' + levels.pop() + + return s + '</li>\n</ul>\n' diff --git a/src/mistune/util.py b/src/mistune/util.py new file mode 100644 index 0000000..e2337b4 --- /dev/null +++ b/src/mistune/util.py @@ -0,0 +1,81 @@ +import re +from urllib.parse import quote +from html import _replace_charref + + +_expand_tab_re = re.compile(r'^( {0,3})\t', flags=re.M) + + +def expand_leading_tab(text, width=4): + def repl(m): + s = m.group(1) + return s + ' ' * (width - len(s)) + return _expand_tab_re.sub(repl, text) + + +def expand_tab(text, space=' '): + repl = r'\1' + space + return _expand_tab_re.sub(repl, text) + + +def escape(s, quote=True): + """Escape characters of ``&<>``. 
If quote=True, ``"`` will be + converted to ``&quot;``.""" + s = s.replace("&", "&amp;") + s = s.replace("<", "&lt;") + s = s.replace(">", "&gt;") + if quote: + s = s.replace('"', "&quot;") + return s + + +def escape_url(link): + """Escape URL for safety.""" + safe = ( + ':/?#@' # gen-delims - '[]' (rfc3986) + '!$&()*+,;=' # sub-delims - "'" (rfc3986) + '%' # leave already-encoded octets alone + ) + return escape(quote(unescape(link), safe=safe)) + + +def safe_entity(s): + """Escape characters for safety.""" + return escape(unescape(s)) + + +def unikey(s): + """Generate a unique key for links and footnotes.""" + key = ' '.join(s.split()).strip() + return key.lower().upper() + + +_charref_re = re.compile( + r'&(#[0-9]{1,7};' + r'|#[xX][0-9a-fA-F]+;' + r'|[^\t\n\f <&#;]{1,32};)' +) + + +def unescape(s): + """ + Copy from `html.unescape`, but `_charref` is different. CommonMark + does not accept entity references without a trailing semicolon + """ + if '&' not in s: + return s + return _charref_re.sub(_replace_charref, s) + + +_striptags_re = re.compile(r'(<!--.*?-->|<[^>]*>)') + + +def striptags(s): + return _striptags_re.sub('', s) + + +_strip_end_re = re.compile(r'\n\s+$') + + +def strip_end(src): + return _strip_end_re.sub('\n', src) |