This commit is contained in:
Waylon Walker 2022-03-31 20:20:07 -05:00
commit 38355d2442
No known key found for this signature in database
GPG key ID: 66E2BF2B4190EFE4
9083 changed files with 1225834 additions and 0 deletions

View file

@ -0,0 +1,8 @@
# flake8: noqa
from __future__ import unicode_literals, absolute_import
from commonmark.main import commonmark
from commonmark.dump import dumpAST, dumpJSON
from commonmark.blocks import Parser
from commonmark.render.html import HtmlRenderer
from commonmark.render.rst import ReStructuredTextRenderer

View file

@ -0,0 +1,908 @@
from __future__ import absolute_import, unicode_literals
import re
from commonmark import common
from commonmark.common import unescape_string
from commonmark.inlines import InlineParser
from commonmark.node import Node
# Indent (in columns) at/after which a line becomes an indented code block.
CODE_INDENT = 4
# Openers for the 7 HTML block types of the CommonMark spec, indexed by type.
reHtmlBlockOpen = [
    re.compile(r'.'),  # dummy for 0
    re.compile(r'^<(?:script|pre|style)(?:\s|>|$)', re.IGNORECASE),
    re.compile(r'^<!--'),
    re.compile(r'^<[?]'),
    re.compile(r'^<![A-Z]'),
    re.compile(r'^<!\[CDATA\['),
    re.compile(
        r'^<[/]?(?:address|article|aside|base|basefont|blockquote|body|'
        r'caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|'
        r'fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|'
        r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|'
        r'nav|noframes|ol|optgroup|option|p|param|section|source|title|'
        r'summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)'
        r'(?:\s|[/]?[>]|$)',
        re.IGNORECASE),
    re.compile(
        '^(?:' + common.OPENTAG + '|' + common.CLOSETAG + ')\\s*$',
        re.IGNORECASE),
]
# Closers for HTML block types 1-5 (types 6 and 7 end on a blank line).
reHtmlBlockClose = [
    re.compile(r'.'),  # dummy for 0
    re.compile(r'<\/(?:script|pre|style)>', re.IGNORECASE),
    re.compile(r'-->'),
    re.compile(r'\?>'),
    re.compile(r'>'),
    re.compile(r'\]\]>'),
]
# Three or more *, _ or - (with optional interior spaces/tabs) on a line.
reThematicBreak = re.compile(
    r'^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$')
# Fast pre-filter: any character that could begin a new block structure.
reMaybeSpecial = re.compile(r'^[#`~*+_=<>0-9-]')
reNonSpace = re.compile(r'[^ \t\f\v\r\n]')
reBulletListMarker = re.compile(r'^[*+-]')
reOrderedListMarker = re.compile(r'^(\d{1,9})([.)])')
reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)')
reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}')
reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)')
reSetextHeadingLine = re.compile(r'^(?:=+|-+)[ \t]*$')
reLineEnding = re.compile(r'\r\n|\n|\r')
def is_blank(s):
    """Return True when *s* consists entirely of whitespace characters."""
    return reNonSpace.search(s) is None
def is_space_or_tab(s):
    """Return True when *s* is exactly a single space or a tab."""
    return s == ' ' or s == '\t'
def peek(ln, pos):
    """Return the character of *ln* at index *pos*, or None if *pos* is
    at or past the end of the string."""
    return ln[pos] if pos < len(ln) else None
def ends_with_blank_line(block):
    """Return True if *block* ends with a blank line, descending into the
    last children of lists and list items as needed.

    Sets ``last_line_checked`` on visited nodes so the same subtree is not
    re-walked by later calls.
    """
    node = block
    while node:
        if node.last_line_blank:
            return True
        if node.t in ('list', 'item') and not node.last_line_checked:
            # Descend into the final child of an unchecked list/item.
            node.last_line_checked = True
            node = node.last_child
            continue
        node.last_line_checked = True
        return False
    return False
def parse_list_marker(parser, container):
    """ Parse a list marker and return data on the marker (type,
    start, delimiter, bullet character, padding) or None."""
    rest = parser.current_line[parser.next_nonspace:]
    data = {
        'type': None,
        'tight': True,  # lists are tight by default
        'bullet_char': None,
        'start': None,
        'delimiter': None,
        'padding': None,
        'marker_offset': parser.indent,
    }
    # An indent of 4+ columns is indented code, never a list marker.
    if parser.indent >= 4:
        return None
    m = re.search(reBulletListMarker, rest)
    m2 = re.search(reOrderedListMarker, rest)
    if m:
        data['type'] = 'bullet'
        data['bullet_char'] = m.group()[0]
    elif m2 and (container.t != 'paragraph' or m2.group(1) == '1'):
        # An ordered marker may only interrupt a paragraph if it starts at 1.
        m = m2
        data['type'] = 'ordered'
        data['start'] = int(m.group(1))
        data['delimiter'] = m.group(2)
    else:
        return None
    # make sure we have spaces after
    nextc = peek(parser.current_line, parser.next_nonspace + len(m.group()))
    if not (nextc is None or nextc == '\t' or nextc == ' '):
        return None
    # if it interrupts paragraph, make sure first line isn't blank
    if container.t == 'paragraph' and \
            not re.search(
                reNonSpace,
                parser.current_line[parser.next_nonspace + len(m.group()):]):
        return None
    # we've got a match! advance offset and calculate padding
    parser.advance_next_nonspace()  # to start of marker
    parser.advance_offset(len(m.group()), True)  # to end of marker
    spaces_start_col = parser.column
    spaces_start_offset = parser.offset
    # Consume up to 4 columns of spaces/tabs after the marker.
    while True:
        parser.advance_offset(1, True)
        nextc = peek(parser.current_line, parser.offset)
        if parser.column - spaces_start_col < 5 and \
                is_space_or_tab(nextc):
            pass
        else:
            break
    blank_item = peek(parser.current_line, parser.offset) is None
    spaces_after_marker = parser.column - spaces_start_col
    if spaces_after_marker >= 5 or \
            spaces_after_marker < 1 or \
            blank_item:
        # 5+ spaces, no space at all, or nothing after the marker:
        # content starts one column past the marker; rewind the extra spaces.
        data['padding'] = len(m.group()) + 1
        parser.column = spaces_start_col
        parser.offset = spaces_start_offset
        if is_space_or_tab(peek(parser.current_line, parser.offset)):
            parser.advance_offset(1, True)
    else:
        data['padding'] = len(m.group()) + spaces_after_marker
    return data
def lists_match(list_data, item_data):
    """Return True when two list markers are compatible — same type,
    delimiter and bullet character — so their items may be agglomerated
    into a single list."""
    for key in ('type', 'delimiter', 'bullet_char'):
        if list_data.get(key) != item_data.get(key):
            return False
    return True
class Block(object):
    """Base class for block-level parsing strategies.

    Subclasses override ``continue_``, ``finalize`` and ``can_contain``;
    the base versions are no-ops that return None.
    """
    accepts_lines = None

    @staticmethod
    def continue_(parser=None, container=None):
        # No-op in the base class.
        return None

    @staticmethod
    def finalize(parser=None, block=None):
        # No-op in the base class.
        return None

    @staticmethod
    def can_contain(t):
        # No-op in the base class.
        return None
class Document(Block):
    """Root block: always stays open and contains any block except a
    bare list item (items live inside lists)."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        # The document matches every line.
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        return None

    @staticmethod
    def can_contain(t):
        return t != 'item'
class List(Block):
    # Container block; text lines attach to items, never to the list itself.
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        # A list stays open as long as its items do.
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Decide tight vs. loose: a blank line between items (or between
        an item's children) makes the whole list loose."""
        item = block.first_child
        while item:
            # check for non-final list item ending with blank line:
            if ends_with_blank_line(item) and item.nxt:
                block.list_data['tight'] = False
                break
            # recurse into children of list item, to see if there are
            # spaces between any of them:
            subitem = item.first_child
            while subitem:
                if ends_with_blank_line(subitem) and \
                        (item.nxt or subitem.nxt):
                    block.list_data['tight'] = False
                    break
                subitem = subitem.nxt
            item = item.nxt

    @staticmethod
    def can_contain(t):
        # Lists contain only list items.
        return t == 'item'
class BlockQuote(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Match a '>' marker (plus one optional following space/tab) on
        the current line; return 0 on match, 1 to close the quote."""
        ln = parser.current_line
        if not parser.indented and peek(ln, parser.next_nonspace) == '>':
            parser.advance_next_nonspace()
            parser.advance_offset(1, False)
            # consume a single optional space/tab after the '>'
            if is_space_or_tab(peek(ln, parser.offset)):
                parser.advance_offset(1, True)
        else:
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        # A block quote may hold any block except a bare list item.
        return t != 'item'
class Item(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Match a continuation line of this list item; return 0 on match,
        1 when the item should be closed."""
        if parser.blank:
            if container.first_child is None:
                # Blank line after empty list item
                return 1
            else:
                parser.advance_next_nonspace()
        elif parser.indent >= (container.list_data['marker_offset'] +
                               container.list_data['padding']):
            # Indented past the marker + padding: still inside the item.
            parser.advance_offset(
                container.list_data['marker_offset'] +
                container.list_data['padding'], True)
        else:
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        # Items hold blocks, but never other bare items.
        return t != 'item'
class Heading(Block):
    """ATX or setext heading; always a single line, always a leaf."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        # A heading can never contain more than one line, so fail to match.
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        return None

    @staticmethod
    def can_contain(t):
        return False
class ThematicBreak(Block):
    """Horizontal rule; single line, no children."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        # A thematic break can never contain more than one line.
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        return None

    @staticmethod
    def can_contain(t):
        return False
class CodeBlock(Block):
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        """Match a continuation line of a fenced or indented code block.
        Returns 0 on match, 1 on failure, 2 when a closing fence ends the
        block (the line is fully consumed)."""
        ln = parser.current_line
        indent = parser.indent
        if container.is_fenced:
            # A closing fence must be indented at most 3 columns, use the
            # same fence character, and be at least as long as the opener.
            match = indent <= 3 and \
                len(ln) >= parser.next_nonspace + 1 and \
                ln[parser.next_nonspace] == container.fence_char and \
                re.search(reClosingCodeFence, ln[parser.next_nonspace:])
            if match and len(match.group()) >= container.fence_length:
                # closing fence - we're at end of line, so we can return
                parser.finalize(container, parser.line_number)
                return 2
            else:
                # skip optional spaces of fence offset
                i = container.fence_offset
                while i > 0 and is_space_or_tab(peek(ln, parser.offset)):
                    parser.advance_offset(1, True)
                    i -= 1
        else:
            # indented
            if indent >= CODE_INDENT:
                parser.advance_offset(CODE_INDENT, True)
            elif parser.blank:
                parser.advance_next_nonspace()
            else:
                return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """For fenced code, split the first line off as the info string;
        for indented code, trim trailing blank lines."""
        if block.is_fenced:
            # first line becomes info string
            content = block.string_content
            newline_pos = content.index('\n')
            first_line = content[0:newline_pos]
            rest = content[newline_pos + 1:]
            block.info = unescape_string(first_line.strip())
            block.literal = rest
        else:
            # indented
            block.literal = re.sub(r'(\n *)+$', '\n', block.string_content)
        block.string_content = None

    @staticmethod
    def can_contain(t):
        return False
class HtmlBlock(Block):
    """Raw HTML block (CommonMark HTML block types 1-7)."""
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        # Types 6 and 7 are terminated by a blank line; the other types
        # end on an explicit close condition handled by the parser.
        if parser.blank and container.html_block_type in (6, 7):
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        # Drop trailing blank lines from the accumulated raw HTML.
        block.literal = re.sub(r'(\n *)+$', '', block.string_content)
        # allow GC
        block.string_content = None

    @staticmethod
    def can_contain(t):
        return False
class Paragraph(Block):
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        # A paragraph continues until a blank line.
        return 1 if parser.blank else 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Strip leading link reference definitions from the paragraph's
        content; unlink the paragraph entirely if only definitions remain."""
        has_reference_defs = False
        # try parsing the beginning as link reference definitions:
        while peek(block.string_content, 0) == '[':
            pos = parser.inline_parser.parseReference(
                block.string_content, parser.refmap)
            if not pos:
                break
            block.string_content = block.string_content[pos:]
            has_reference_defs = True
        if has_reference_defs and is_blank(block.string_content):
            block.unlink()

    @staticmethod
    def can_contain(t):
        return False
class BlockStarts(object):
    """Block start functions.

    Return values:
    0 = no match
    1 = matched container, keep going
    2 = matched leaf, no more block starts
    """
    # Tried in order for each line; earlier entries take precedence.
    METHODS = [
        'block_quote',
        'atx_heading',
        'fenced_code_block',
        'html_block',
        'setext_heading',
        'thematic_break',
        'list_item',
        'indented_code_block',
    ]

    @staticmethod
    def block_quote(parser, container=None):
        # A non-code-indented '>' opens a block quote container.
        if not parser.indented and \
                peek(parser.current_line, parser.next_nonspace) == '>':
            parser.advance_next_nonspace()
            parser.advance_offset(1, False)
            # optional following space
            if is_space_or_tab(peek(parser.current_line, parser.offset)):
                parser.advance_offset(1, True)
            parser.close_unmatched_blocks()
            parser.add_child('block_quote', parser.next_nonspace)
            return 1
        return 0

    @staticmethod
    def atx_heading(parser, container=None):
        # 1-6 '#' characters followed by space/tab or end of line.
        if not parser.indented:
            m = re.search(reATXHeadingMarker,
                          parser.current_line[parser.next_nonspace:])
            if m:
                parser.advance_next_nonspace()
                parser.advance_offset(len(m.group()), False)
                parser.close_unmatched_blocks()
                container = parser.add_child('heading', parser.next_nonspace)
                # number of #s
                container.level = len(m.group().strip())
                # remove trailing ###s:
                container.string_content = re.sub(
                    r'[ \t]+#+[ \t]*$', '', re.sub(
                        r'^[ \t]*#+[ \t]*$',
                        '',
                        parser.current_line[parser.offset:]))
                parser.advance_offset(
                    len(parser.current_line) - parser.offset, False)
                return 2
        return 0

    @staticmethod
    def fenced_code_block(parser, container=None):
        # A run of 3+ backticks (no further backticks on the line) or tildes.
        if not parser.indented:
            m = re.search(
                reCodeFence,
                parser.current_line[parser.next_nonspace:])
            if m:
                fence_length = len(m.group())
                parser.close_unmatched_blocks()
                container = parser.add_child(
                    'code_block', parser.next_nonspace)
                container.is_fenced = True
                container.fence_length = fence_length
                container.fence_char = m.group()[0]
                container.fence_offset = parser.indent
                parser.advance_next_nonspace()
                parser.advance_offset(fence_length, False)
                return 2
        return 0

    @staticmethod
    def html_block(parser, container=None):
        # Try each HTML block type's opener; type 7 cannot interrupt a
        # paragraph.
        if not parser.indented and \
                peek(parser.current_line, parser.next_nonspace) == '<':
            s = parser.current_line[parser.next_nonspace:]
            for block_type in range(1, 8):
                if re.search(reHtmlBlockOpen[block_type], s) and \
                        (block_type < 7 or container.t != 'paragraph'):
                    parser.close_unmatched_blocks()
                    # We don't adjust parser.offset;
                    # spaces are part of the HTML block:
                    b = parser.add_child('html_block', parser.offset)
                    b.html_block_type = block_type
                    return 2
        return 0

    @staticmethod
    def setext_heading(parser, container=None):
        # An '='/'-' underline turns the preceding paragraph into a heading.
        if not parser.indented and container.t == 'paragraph':
            m = re.search(
                reSetextHeadingLine,
                parser.current_line[parser.next_nonspace:])
            if m:
                parser.close_unmatched_blocks()
                # resolve reference link definitions
                while peek(container.string_content, 0) == '[':
                    pos = parser.inline_parser.parseReference(
                        container.string_content, parser.refmap)
                    if not pos:
                        break
                    container.string_content = container.string_content[pos:]
                if container.string_content:
                    heading = Node('heading', container.sourcepos)
                    heading.level = 1 if m.group()[0] == '=' else 2
                    heading.string_content = container.string_content
                    container.insert_after(heading)
                    container.unlink()
                    parser.tip = heading
                    parser.advance_offset(
                        len(parser.current_line) - parser.offset, False)
                    return 2
                else:
                    # Paragraph was only reference definitions: no heading.
                    return 0
        return 0

    @staticmethod
    def thematic_break(parser, container=None):
        if not parser.indented and re.search(
                reThematicBreak, parser.current_line[parser.next_nonspace:]):
            parser.close_unmatched_blocks()
            parser.add_child('thematic_break', parser.next_nonspace)
            parser.advance_offset(
                len(parser.current_line) - parser.offset, False)
            return 2
        return 0

    @staticmethod
    def list_item(parser, container=None):
        # Indented markers are allowed only when already inside a list.
        if (not parser.indented or container.t == 'list'):
            data = parse_list_marker(parser, container)
            if data:
                parser.close_unmatched_blocks()
                # add the list if needed
                if parser.tip.t != 'list' or \
                        not lists_match(container.list_data, data):
                    container = parser.add_child('list', parser.next_nonspace)
                    container.list_data = data
                # add the list item
                container = parser.add_child('item', parser.next_nonspace)
                container.list_data = data
                return 1
        return 0

    @staticmethod
    def indented_code_block(parser, container=None):
        # 4+ columns of indent, not continuing a paragraph, not blank.
        if parser.indented and \
                parser.tip.t != 'paragraph' and \
                not parser.blank:
            # indented code
            parser.advance_offset(CODE_INDENT, True)
            parser.close_unmatched_blocks()
            parser.add_child('code_block', parser.offset)
            return 2
        return 0
class Parser(object):
    """CommonMark block parser: turns Markdown text into a document AST.

    Lines are fed through ``incorporate_line``, which maintains a tree of
    open blocks; ``parse`` drives the whole process, finalizes the tree
    and parses inline content.
    """

    def __init__(self, options=None):
        """Create a parser.

        :param options: optional dict of options, forwarded to the inline
            parser. A None sentinel is used instead of a mutable default
            (``options={}``) so the dict is never shared across instances.
        """
        if options is None:
            options = {}
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.block_starts = BlockStarts()
        self.tip = self.doc
        self.oldtip = self.doc
        self.current_line = ''
        self.line_number = 0
        self.offset = 0
        self.column = 0
        self.next_nonspace = 0
        self.next_nonspace_column = 0
        self.indent = 0
        self.indented = False
        self.blank = False
        self.partially_consumed_tab = False
        self.all_closed = True
        self.last_matched_container = self.doc
        self.refmap = {}
        self.last_line_length = 0
        self.inline_parser = InlineParser(options)
        self.options = options

    def add_line(self):
        """ Add a line to the block at the tip. We assume the tip
        can accept lines -- that check should be done before calling this."""
        if self.partially_consumed_tab:
            # Skip over tab
            self.offset += 1
            # Add space characters for the remainder of the tab stop
            chars_to_tab = 4 - (self.column % 4)
            self.tip.string_content += (' ' * chars_to_tab)
        self.tip.string_content += (self.current_line[self.offset:] + '\n')

    def add_child(self, tag, offset):
        """ Add block of type tag as a child of the tip. If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children."""
        while not self.blocks[self.tip.t].can_contain(tag):
            self.finalize(self.tip, self.line_number - 1)
        column_number = offset + 1
        new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
        new_block.string_content = ''
        self.tip.append_child(new_block)
        self.tip = new_block
        return new_block

    def close_unmatched_blocks(self):
        """Finalize and close any unmatched blocks."""
        if not self.all_closed:
            while self.oldtip != self.last_matched_container:
                parent = self.oldtip.parent
                self.finalize(self.oldtip, self.line_number - 1)
                self.oldtip = parent
            self.all_closed = True

    def find_next_nonspace(self):
        """Scan forward from ``offset`` past spaces and tabs, updating
        ``next_nonspace``, ``next_nonspace_column``, ``blank``, ``indent``
        and ``indented`` (tabs expand to 4-column stops)."""
        current_line = self.current_line
        i = self.offset
        cols = self.column
        try:
            c = current_line[i]
        except IndexError:
            c = ''
        while c != '':
            if c == ' ':
                i += 1
                cols += 1
            elif c == '\t':
                i += 1
                cols += (4 - (cols % 4))
            else:
                break
            try:
                c = current_line[i]
            except IndexError:
                c = ''
        self.blank = (c == '\n' or c == '\r' or c == '')
        self.next_nonspace = i
        self.next_nonspace_column = cols
        self.indent = self.next_nonspace_column - self.column
        self.indented = self.indent >= CODE_INDENT

    def advance_next_nonspace(self):
        """Jump ``offset``/``column`` to the next non-space position."""
        self.offset = self.next_nonspace
        self.column = self.next_nonspace_column
        self.partially_consumed_tab = False

    def advance_offset(self, count, columns):
        """Advance by *count* characters, or *count* columns when *columns*
        is True, expanding tabs and tracking partially consumed tabs."""
        current_line = self.current_line
        try:
            c = current_line[self.offset]
        except IndexError:
            c = None
        while count > 0 and c is not None:
            if c == '\t':
                chars_to_tab = 4 - (self.column % 4)
                if columns:
                    self.partially_consumed_tab = chars_to_tab > count
                    chars_to_advance = min(count, chars_to_tab)
                    self.column += chars_to_advance
                    self.offset += 0 if self.partially_consumed_tab else 1
                    count -= chars_to_advance
                else:
                    self.partially_consumed_tab = False
                    self.column += chars_to_tab
                    self.offset += 1
                    count -= 1
            else:
                self.partially_consumed_tab = False
                self.offset += 1
                # assume ascii; block starts are ascii
                self.column += 1
                count -= 1
            try:
                c = current_line[self.offset]
            except IndexError:
                c = None

    def incorporate_line(self, ln):
        """Analyze a line of text and update the document appropriately.

        We parse markdown text by calling this on each line of input,
        then finalizing the document.
        """
        all_matched = True
        container = self.doc
        self.oldtip = self.tip
        self.offset = 0
        self.column = 0
        self.blank = False
        self.partially_consumed_tab = False
        self.line_number += 1
        # replace NUL characters for security
        if re.search(r'\u0000', ln) is not None:
            ln = re.sub(r'\0', '\uFFFD', ln)
        self.current_line = ln
        # For each containing block, try to parse the associated line start.
        # Bail out on failure: container will point to the last matching
        # block. Set all_matched to false if not all containers match.
        while True:
            last_child = container.last_child
            if not (last_child and last_child.is_open):
                break
            container = last_child
            self.find_next_nonspace()
            rv = self.blocks[container.t].continue_(self, container)
            if rv == 0:
                # we've matched, keep going
                pass
            elif rv == 1:
                # we've failed to match a block
                all_matched = False
            elif rv == 2:
                # we've hit end of line for fenced code close and can return
                self.last_line_length = len(ln)
                return
            else:
                raise ValueError(
                    'continue_ returned illegal value, must be 0, 1, or 2')
            if not all_matched:
                # back up to last matching block
                container = container.parent
                break
        self.all_closed = (container == self.oldtip)
        self.last_matched_container = container
        matched_leaf = container.t != 'paragraph' and \
            self.blocks[container.t].accepts_lines
        starts = self.block_starts
        starts_len = len(starts.METHODS)
        # Unless last matched container is a code block, try new container
        # starts, adding children to the last matched container:
        while not matched_leaf:
            self.find_next_nonspace()
            # this is a little performance optimization:
            if not self.indented and \
                    not re.search(reMaybeSpecial, ln[self.next_nonspace:]):
                self.advance_next_nonspace()
                break
            i = 0
            while i < starts_len:
                res = getattr(starts, starts.METHODS[i])(self, container)
                if res == 1:
                    container = self.tip
                    break
                elif res == 2:
                    container = self.tip
                    matched_leaf = True
                    break
                else:
                    i += 1
            if i == starts_len:
                # nothing matched
                self.advance_next_nonspace()
                break
        # What remains at the offset is a text line. Add the text to the
        # appropriate container.
        if not self.all_closed and not self.blank and \
                self.tip.t == 'paragraph':
            # lazy paragraph continuation
            self.add_line()
        else:
            # not a lazy continuation
            # finalize any blocks not matched
            self.close_unmatched_blocks()
            if self.blank and container.last_child:
                container.last_child.last_line_blank = True
            t = container.t
            # Block quote lines are never blank as they start with >
            # and we don't count blanks in fenced code for purposes of
            # tight/loose lists or breaking out of lists. We also
            # don't set last_line_blank on an empty list item, or if we
            # just closed a fenced block.
            last_line_blank = self.blank and \
                not (t == 'block_quote' or
                     (t == 'code_block' and container.is_fenced) or
                     (t == 'item' and
                      not container.first_child and
                      container.sourcepos[0][0] == self.line_number))
            # propagate last_line_blank up through parents:
            cont = container
            while cont:
                cont.last_line_blank = last_line_blank
                cont = cont.parent
            if self.blocks[t].accepts_lines:
                self.add_line()
                # if HtmlBlock, check for end condition
                if t == 'html_block' and \
                        container.html_block_type >= 1 and \
                        container.html_block_type <= 5 and \
                        re.search(
                            reHtmlBlockClose[container.html_block_type],
                            self.current_line[self.offset:]):
                    self.finalize(container, self.line_number)
            elif self.offset < len(ln) and not self.blank:
                # create a paragraph container for one line
                container = self.add_child('paragraph', self.offset)
                self.advance_next_nonspace()
                self.add_line()
        self.last_line_length = len(ln)

    def finalize(self, block, line_number):
        """ Finalize a block. Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions. Reset the tip to the
        parent of the closed block."""
        above = block.parent
        block.is_open = False
        block.sourcepos[1] = [line_number, self.last_line_length]
        self.blocks[block.t].finalize(self, block)
        self.tip = above

    def process_inlines(self, block):
        """
        Walk through a block & children recursively, parsing string content
        into inline content where appropriate.
        """
        walker = block.walker()
        self.inline_parser.refmap = self.refmap
        self.inline_parser.options = self.options
        event = walker.nxt()
        while event is not None:
            node = event['node']
            t = node.t
            # Inline parsing happens on exit so children exist first.
            if not event['entering'] and (t == 'paragraph' or t == 'heading'):
                self.inline_parser.parse(node)
            event = walker.nxt()

    def parse(self, my_input):
        """ The main parsing function. Returns a parsed document AST."""
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.tip = self.doc
        self.refmap = {}
        self.line_number = 0
        self.last_line_length = 0
        self.offset = 0
        self.column = 0
        self.last_matched_container = self.doc
        self.current_line = ''
        lines = re.split(reLineEnding, my_input)
        length = len(lines)
        if len(my_input) > 0 and my_input[-1] == '\n':
            # ignore last blank line created by final newline
            length -= 1
        for i in range(length):
            self.incorporate_line(lines[i])
        while self.tip:
            self.finalize(self.tip, length)
        self.process_inlines(self.doc)
        return self.doc
# Map each Block subclass to a snake_case tag, e.g. HtmlBlock -> 'html_block',
# so the parser can dispatch on Node.t.
CAMEL_RE = re.compile("(.)([A-Z](?:[a-z]+|(?<=[a-z0-9].)))")
Parser.blocks = dict(
    (CAMEL_RE.sub(r'\1_\2', cls.__name__).lower(), cls)
    for cls in Block.__subclasses__())

View file

@ -0,0 +1,53 @@
#!/usr/bin/env python
from __future__ import unicode_literals
import argparse
import sys
import commonmark
def main():
    """CLI entry point: read Markdown and write HTML (default), a
    formatted AST dump (-a), or a JSON AST (-aj)."""
    arg_parser = argparse.ArgumentParser(
        description="Process Markdown according to "
                    "the CommonMark specification.")
    if sys.version_info < (3, 0):
        # Python 2: force UTF-8 so stdin/stdout round-trip cleanly.
        reload(sys)  # noqa
        sys.setdefaultencoding('utf-8')
    arg_parser.add_argument(
        'infile',
        nargs="?",
        type=argparse.FileType('r'),
        default=sys.stdin,
        help="Input Markdown file to parse, defaults to STDIN")
    arg_parser.add_argument(
        '-o',
        nargs="?",
        type=argparse.FileType('w'),
        default=sys.stdout,
        help="Output HTML/JSON file, defaults to STDOUT")
    arg_parser.add_argument(
        '-a', action="store_true", help="Print formatted AST")
    arg_parser.add_argument('-aj', action="store_true", help="Output JSON AST")
    args = arg_parser.parse_args()
    # Previously the argparse object's name ``parser`` was rebound to the
    # Markdown parser; keep the two under distinct names.
    md_parser = commonmark.Parser()
    data = args.infile.read()
    ast = md_parser.parse(data)
    # Use a single if/elif/else and plain returns instead of bare exit()
    # calls, so main() can also be invoked programmatically.
    if args.a:
        # print ast
        commonmark.dumpAST(ast)
    elif args.aj:
        args.o.write(commonmark.dumpJSON(ast))
    else:
        renderer = commonmark.HtmlRenderer()
        args.o.write(renderer.render(ast))
# Run the CLI when executed as a script.
if __name__ == '__main__':
    main()

View file

@ -0,0 +1,113 @@
from __future__ import absolute_import, unicode_literals
import re
import sys
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
if sys.version_info >= (3, 0):
if sys.version_info >= (3, 4):
import html
HTMLunescape = html.unescape
else:
from .entitytrans import _unescape
HTMLunescape = _unescape
else:
from commonmark import entitytrans
HTMLunescape = entitytrans._unescape
# Regex fragments for HTML constructs, composed per the CommonMark grammar.
ENTITY = '&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});'
TAGNAME = '[A-Za-z][A-Za-z0-9-]*'
ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'
UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"
SINGLEQUOTEDVALUE = "'[^']*'"
DOUBLEQUOTEDVALUE = '"[^"]*"'
ATTRIBUTEVALUE = "(?:" + UNQUOTEDVALUE + "|" + SINGLEQUOTEDVALUE + \
    "|" + DOUBLEQUOTEDVALUE + ")"
ATTRIBUTEVALUESPEC = "(?:" + "\\s*=" + "\\s*" + ATTRIBUTEVALUE + ")"
ATTRIBUTE = "(?:" + "\\s+" + ATTRIBUTENAME + ATTRIBUTEVALUESPEC + "?)"
OPENTAG = "<" + TAGNAME + ATTRIBUTE + "*" + "\\s*/?>"
CLOSETAG = "</" + TAGNAME + "\\s*[>]"
HTMLCOMMENT = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->'
PROCESSINGINSTRUCTION = "[<][?].*?[?][>]"
DECLARATION = "<![A-Z]+" + "\\s+[^>]*>"
CDATA = '<!\\[CDATA\\[[\\s\\S]*?\\]\\]>'
# Any complete HTML construct: tag, comment, PI, declaration or CDATA.
HTMLTAG = "(?:" + OPENTAG + "|" + CLOSETAG + "|" + HTMLCOMMENT + "|" + \
    PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")"
reHtmlTag = re.compile('^' + HTMLTAG, re.IGNORECASE)
# Quick test: does a string contain anything that may need unescaping?
reBackslashOrAmp = re.compile(r'[\\&]')
# ASCII punctuation characters that may be backslash-escaped in Markdown.
ESCAPABLE = '[!"#$%&\'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]'
reEntityOrEscapedChar = re.compile(
    '\\\\' + ESCAPABLE + '|' + ENTITY, re.IGNORECASE)
# Characters that must be entity-escaped in XML/HTML output.
XMLSPECIAL = '[&<>"]'
reXmlSpecial = re.compile(XMLSPECIAL)
def unescape_char(s):
    """Resolve one backslash escape (``\\x`` -> ``x``) or HTML entity to
    its literal character."""
    return s[1] if s[0] == '\\' else HTMLunescape(s)
def unescape_string(s):
    """Replace entities and backslash escapes in *s* with the literal
    characters they denote."""
    if not re.search(reBackslashOrAmp, s):
        # Fast path: nothing that could need unescaping.
        return s
    return re.sub(
        reEntityOrEscapedChar,
        lambda m: unescape_char(m.group()),
        s)
def normalize_uri(uri):
    """Percent-encode *uri*, leaving common URI delimiters intact."""
    safe_chars = str('/@:+?=&()%#*,')
    try:
        return quote(uri.encode('utf-8'), safe=safe_chars)
    except UnicodeDecodeError:
        # Python 2 also throws a UnicodeDecodeError, complaining about
        # the width of the "safe" string. Quote without it (overly
        # aggressive) and then undo the delimiters manually, in the same
        # order as before.
        quoted = quote(uri.encode('utf-8'))
        for code, char in (
                ('%40', '@'), ('%3A', ':'), ('%2B', '+'), ('%3F', '?'),
                ('%3D', '='), ('%26', '&'), ('%28', '('), ('%29', ')'),
                ('%25', '%'), ('%23', '#'), ('%2A', '*'), ('%2C', ',')):
            quoted = re.sub(code, char, quoted)
        return quoted
# Entity replacements for the four XML-special characters.
UNSAFE_MAP = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
}
def replace_unsafe_char(s):
    """Map an XML-special character to its entity; pass others through."""
    return UNSAFE_MAP[s] if s in UNSAFE_MAP else s
def escape_xml(s):
    """Entity-escape the XML-special characters (&, <, >, ") in *s*.

    A None input yields the empty string.
    """
    if s is None:
        return ''
    if not re.search(reXmlSpecial, s):
        # Fast path: nothing to escape.
        return s
    return re.sub(
        reXmlSpecial,
        lambda m: replace_unsafe_char(m.group()),
        s)

View file

@ -0,0 +1,108 @@
from __future__ import absolute_import, unicode_literals
from builtins import str
import json
from commonmark.node import is_container
def prepare(obj, topnode=False):
    """Walk the complete AST, only returning needed data.

    This removes circular references and allows us to output
    JSON.
    """
    a = []
    for subnode, entered in obj.walker():
        rep = {
            'type': subnode.t,
        }
        # Emit optional fields only when they are set (truthy).
        if subnode.literal:
            rep['literal'] = subnode.literal
        if subnode.string_content:
            rep['string_content'] = subnode.string_content
        if subnode.title:
            rep['title'] = subnode.title
        if subnode.info:
            rep['info'] = subnode.info
        if subnode.destination:
            rep['destination'] = subnode.destination
        if subnode.list_data:
            rep['list_data'] = subnode.list_data
        if is_container(subnode):
            rep['children'] = []
        if entered and len(a) > 0:
            # NOTE(review): assumes the previous entry a[-1] is a container
            # (has a 'children' key) whenever a node is entered — verify
            # against the walker's event order.
            if a[-1]['children']:
                a[-1]['children'].append(rep)
            else:
                a[-1]['children'] = [rep]
        else:
            a.append(rep)
    return a
def dumpJSON(obj):
    """Serialize the AST rooted at *obj* as pretty-printed, key-sorted
    JSON. Destructive of the block (see ``prepare``)."""
    return json.dumps(prepare(obj), indent=4, sort_keys=True)
def dumpAST(obj, ind=0, topnode=False):
    """Print out a block/entire AST."""
    # Indent children with tabs and an arrow marker.
    indChar = ("\t" * ind) + "-> " if ind else ""
    print(indChar + "[" + obj.t + "]")
    if not obj.title == "":
        print("\t" + indChar + "Title: " + (obj.title or ''))
    if not obj.info == "":
        print("\t" + indChar + "Info: " + (obj.info or ''))
    if not obj.destination == "":
        print("\t" + indChar + "Destination: " + (obj.destination or ''))
    if obj.is_open:
        print("\t" + indChar + "Open: " + str(obj.is_open))
    if obj.last_line_blank:
        print(
            "\t" + indChar + "Last line blank: " + str(obj.last_line_blank))
    if obj.sourcepos:
        print("\t" + indChar + "Sourcepos: " + str(obj.sourcepos))
    if not obj.string_content == "":
        print("\t" + indChar + "String content: " + (obj.string_content or ''))
    # NOTE(review): Info is printed a second time here, duplicating the
    # earlier line — likely unintentional; confirm before removing.
    if not obj.info == "":
        print("\t" + indChar + "Info: " + (obj.info or ''))
    if not obj.literal == "":
        print("\t" + indChar + "Literal: " + (obj.literal or ''))
    if obj.list_data.get('type'):
        print("\t" + indChar + "List Data: ")
        print("\t\t" + indChar + "[type] = " + obj.list_data.get('type'))
        if obj.list_data.get('bullet_char'):
            print(
                "\t\t" + indChar + "[bullet_char] = " +
                obj.list_data['bullet_char'])
        if obj.list_data.get('start'):
            print(
                "\t\t" + indChar + "[start] = " +
                str(obj.list_data.get('start')))
        if obj.list_data.get('delimiter'):
            print(
                "\t\t" + indChar + "[delimiter] = " +
                obj.list_data.get('delimiter'))
        if obj.list_data.get('padding'):
            print(
                "\t\t" + indChar + "[padding] = " +
                str(obj.list_data.get('padding')))
        if obj.list_data.get('marker_offset'):
            print(
                "\t\t" + indChar + "[marker_offset] = " +
                str(obj.list_data.get('marker_offset')))
    if obj.walker:
        print("\t" + indChar + "Children:")
        walker = obj.walker()
        nxt = walker.nxt()
        # topnode=True on recursion prints each child subtree exactly once.
        while nxt is not None and topnode is False:
            dumpAST(nxt['node'], ind + 2, topnode=True)
            nxt = walker.nxt()

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,882 @@
from __future__ import absolute_import, unicode_literals, division
import re
import sys
from commonmark import common
from commonmark.common import normalize_uri, unescape_string
from commonmark.node import Node
from commonmark.normalize_reference import normalize_reference
if sys.version_info >= (3, 0):
if sys.version_info >= (3, 4):
import html
HTMLunescape = html.unescape
else:
from .entitytrans import _unescape
HTMLunescape = _unescape
else:
from commonmark import entitytrans
HTMLunescape = entitytrans._unescape
# Some regexps used in inline parser:
# A backslash followed by any escapable punctuation character.
ESCAPED_CHAR = '\\\\' + common.ESCAPABLE
# Unicode punctuation (ASCII plus general-category P*), spelled out as
# literal ranges so it works on Python 2 narrow builds; the trailing
# alternatives match surrogate pairs for punctuation outside the BMP.
rePunctuation = re.compile(
    r'[!"#$%&\'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB'
    r'\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3'
    r'\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F'
    r'\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E'
    r'\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12'
    r'\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB'
    r'\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736'
    r'\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-'
    r'\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F'
    r'\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E'
    r'\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5'
    r'\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC'
    r'\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011'
    r'\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673'
    r'\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E'
    r'\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0'
    r'\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63'
    r'\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B'
    r'\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-'
    r'\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58'
    r'\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D'
    r'\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD'
    r'\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7'
    r'\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F'
    r'\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]'
)
# Link title delimited by double quotes, single quotes, or parentheses.
reLinkTitle = re.compile(
    '^(?:"(' + ESCAPED_CHAR + '|[^"\\x00])*"' +
    '|' +
    '\'(' + ESCAPED_CHAR + '|[^\'\\x00])*\'' +
    '|' +
    '\\((' + ESCAPED_CHAR + '|[^()\\x00])*\\))')
# Pointy-bracket form of a link destination: <...> with no raw newlines.
reLinkDestinationBraces = re.compile(r'^(?:<(?:[^<>\n\\\x00]|\\.)*>)')
reEscapable = re.compile('^' + common.ESCAPABLE)
reEntityHere = re.compile('^' + common.ENTITY, re.IGNORECASE)
reTicks = re.compile(r'`+')
reTicksHere = re.compile(r'^`+')
reEllipses = re.compile(r'\.\.\.')
reDash = re.compile(r'--+')
reEmailAutolink = re.compile(
    r"^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]"
    r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
    r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>")
reAutolink = re.compile(
    r'^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>',
    re.IGNORECASE)
# Spaces with at most one embedded newline.
reSpnl = re.compile(r'^ *(?:\n *)?')
# NOTE(review): the doubled '^^' is redundant (anchors are zero-width)
# but harmless; kept byte-identical here.
reWhitespaceChar = re.compile(r'^^[ \t\n\x0b\x0c\x0d]')
reWhitespace = re.compile(r'[ \t\n\x0b\x0c\x0d]+')
reUnicodeWhitespaceChar = re.compile(r'^\s')
reFinalSpace = re.compile(r' *$')
reInitialSpace = re.compile(r'^ *')
reSpaceAtEndOfLine = re.compile(r'^ *(?:\n|$)')
# Link label: up to 1000 characters between square brackets.
reLinkLabel = re.compile(r'^\[(?:[^\\\[\]]|\\.){0,1000}\]')
# Matches a string of non-special characters.
reMain = re.compile(r'^[^\n`\[\]\\!<&*_\'"]+', re.MULTILINE)
def text(s):
    """Build and return a new 'text' Node whose literal is *s*."""
    literal_node = Node('text', None)
    literal_node.literal = s
    return literal_node
def smart_dashes(chars):
    """Render a run of hyphens as smart em/en dashes.

    A run divisible by 3 becomes all em dashes, a run divisible by 2
    all en dashes; otherwise em dashes are used with one or two
    trailing en dashes covering the remainder.
    """
    n = len(chars)
    if n % 3 == 0:
        # Whole run fits em dashes exactly.
        ems, ens = n // 3, 0
    elif n % 2 == 0:
        # Whole run fits en dashes exactly.
        ems, ens = 0, n // 2
    elif n % 3 == 2:
        # Two hyphens left over: one en dash after the em dashes.
        ems, ens = (n - 2) // 3, 1
    else:
        # Four hyphens left over: two en dashes after the em dashes.
        ems, ens = (n - 4) // 3, 2
    return '\u2014' * ems + '\u2013' * ens
class InlineParser(object):
    """INLINE PARSER

    These are methods of an InlineParser class, defined below.
    An InlineParser keeps track of a subject (a string to be
    parsed) and a position in that subject.
    """

    def __init__(self, options=None):
        """Create an inline parser.

        options: optional dict of parser options (e.g. 'smart').
        """
        self.subject = ''
        self.brackets = None
        self.pos = 0
        self.refmap = {}
        # Delimiter stack for emphasis/quotes; also (re)set by
        # parseInlines, but initialized here so a fresh parser is
        # always in a consistent state.
        self.delimiters = None
        # Avoid the shared-mutable-default pitfall: every instance
        # gets its own dict unless the caller supplies one.
        self.options = {} if options is None else options

    def match(self, regexString):
        """
        If regexString matches at current position in the subject, advance
        position in subject and return the match; otherwise return None.
        """
        match = re.search(regexString, self.subject[self.pos:])
        if match is None:
            return None
        else:
            self.pos += match.end()
            return match.group()

    def peek(self):
        """ Returns the character at the current subject position, or None if
        there are no more characters."""
        if self.pos < len(self.subject):
            return self.subject[self.pos]
        else:
            return None

    def spnl(self):
        """ Parse zero or more space characters, including at
        most one newline."""
        self.match(reSpnl)
        return True

    # All of the parsers below try to match something at the current position
    # in the subject.  If they succeed in matching anything, they
    # push an inline matched, advancing the subject.

    def parseBackticks(self, block):
        """ Attempt to parse backticks, adding either a backtick code span or a
        literal sequence of backticks to the 'inlines' list."""
        ticks = self.match(reTicksHere)
        if ticks is None:
            return False
        after_open_ticks = self.pos
        matched = self.match(reTicks)
        while matched is not None:
            if matched == ticks:
                node = Node('code', None)
                contents = self.subject[after_open_ticks:self.pos-len(ticks)] \
                    .replace('\n', ' ')
                # Per spec: strip one leading and trailing space when the
                # content is padded on both sides and is not all spaces.
                if contents.lstrip(' ') and contents[0] == contents[-1] == ' ':
                    node.literal = contents[1:-1]
                else:
                    node.literal = contents
                block.append_child(node)
                return True
            matched = self.match(reTicks)
        # If we got here, we didn't match a closing backtick sequence.
        self.pos = after_open_ticks
        block.append_child(text(ticks))
        return True

    def parseBackslash(self, block):
        """
        Parse a backslash-escaped special character, adding either the
        escaped  character, a hard line break (if the backslash is followed
        by a newline), or a literal backslash to the block's children.
        Assumes current character is a backslash.
        """
        subj = self.subject
        self.pos += 1

        try:
            subjchar = subj[self.pos]
        except IndexError:
            subjchar = None

        if self.peek() == '\n':
            self.pos += 1
            node = Node('linebreak', None)
            block.append_child(node)
        elif subjchar and re.search(reEscapable, subjchar):
            block.append_child(text(subjchar))
            self.pos += 1
        else:
            block.append_child(text('\\'))

        return True

    def parseAutolink(self, block):
        """Attempt to parse an autolink (URL or email in pointy brackets)."""
        m = self.match(reEmailAutolink)

        if m:
            # email
            dest = m[1:-1]
            node = Node('link', None)
            node.destination = normalize_uri('mailto:' + dest)
            node.title = ''
            node.append_child(text(dest))
            block.append_child(node)
            return True
        else:
            m = self.match(reAutolink)
            if m:
                # link
                dest = m[1:-1]
                node = Node('link', None)
                node.destination = normalize_uri(dest)
                node.title = ''
                node.append_child(text(dest))
                block.append_child(node)
                return True

        return False

    def parseHtmlTag(self, block):
        """Attempt to parse a raw HTML tag."""
        m = self.match(common.reHtmlTag)
        if m is None:
            return False
        else:
            node = Node('html_inline', None)
            node.literal = m
            block.append_child(node)
            return True

    def scanDelims(self, c):
        """
        Scan a sequence of characters == c, and return information about
        the number of delimiters and whether they are positioned such that
        they can open and/or close emphasis or strong emphasis.  A utility
        function for strong/emph parsing.
        """
        numdelims = 0
        startpos = self.pos

        if c == "'" or c == '"':
            numdelims += 1
            self.pos += 1
        else:
            while (self.peek() == c):
                numdelims += 1
                self.pos += 1

        if numdelims == 0:
            return None

        c_before = '\n' if startpos == 0 else self.subject[startpos - 1]

        c_after = self.peek()
        if c_after is None:
            c_after = '\n'

        # Python 2 doesn't recognize '\xa0' as whitespace
        after_is_whitespace = re.search(reUnicodeWhitespaceChar, c_after) or \
            c_after == '\xa0'
        after_is_punctuation = re.search(rePunctuation, c_after)
        before_is_whitespace = re.search(
            reUnicodeWhitespaceChar, c_before) or \
            c_before == '\xa0'
        before_is_punctuation = re.search(rePunctuation, c_before)

        left_flanking = not after_is_whitespace and \
            (not after_is_punctuation or
             before_is_whitespace or
             before_is_punctuation)
        right_flanking = not before_is_whitespace and \
            (not before_is_punctuation or
             after_is_whitespace or
             after_is_punctuation)
        if c == '_':
            can_open = left_flanking and \
                (not right_flanking or before_is_punctuation)
            can_close = right_flanking and \
                (not left_flanking or after_is_punctuation)
        elif c == "'" or c == '"':
            can_open = left_flanking and not right_flanking
            can_close = right_flanking
        else:
            can_open = left_flanking
            can_close = right_flanking

        self.pos = startpos
        return {
            'numdelims': numdelims,
            'can_open': can_open,
            'can_close': can_close,
        }

    def handleDelim(self, cc, block):
        """Handle a delimiter marker for emphasis or a quote."""
        res = self.scanDelims(cc)
        if not res:
            return False
        numdelims = res.get('numdelims')
        startpos = self.pos

        self.pos += numdelims
        if cc == "'":
            contents = '\u2019'
        elif cc == '"':
            contents = '\u201C'
        else:
            contents = self.subject[startpos:self.pos]
        node = text(contents)
        block.append_child(node)

        # Add entry to stack for this opener
        self.delimiters = {
            'cc': cc,
            'numdelims': numdelims,
            'origdelims': numdelims,
            'node': node,
            'previous': self.delimiters,
            'next': None,
            'can_open': res.get('can_open'),
            'can_close': res.get('can_close'),
        }
        if self.delimiters['previous'] is not None:
            self.delimiters['previous']['next'] = self.delimiters
        return True

    def removeDelimiter(self, delim):
        """Unlink *delim* from the doubly-linked delimiter stack."""
        if delim.get('previous') is not None:
            delim['previous']['next'] = delim.get('next')
        if delim.get('next') is None:
            # Top of stack
            self.delimiters = delim.get('previous')
        else:
            delim['next']['previous'] = delim.get('previous')

    @staticmethod
    def removeDelimitersBetween(bottom, top):
        """Splice out every delimiter strictly between bottom and top."""
        if bottom.get('next') != top:
            bottom['next'] = top
            top['previous'] = bottom

    def processEmphasis(self, stack_bottom):
        """Match up emphasis/strong/quote delimiters above *stack_bottom*,
        turning matched pairs into emph/strong nodes and smart quotes."""
        openers_bottom = {
            '_': stack_bottom,
            '*': stack_bottom,
            "'": stack_bottom,
            '"': stack_bottom,
        }
        odd_match = False
        use_delims = 0

        # Find first closer above stack_bottom
        closer = self.delimiters
        while closer is not None and closer.get('previous') != stack_bottom:
            closer = closer.get('previous')

        # Move forward, looking for closers, and handling each
        while closer is not None:
            if not closer.get('can_close'):
                closer = closer.get('next')
            else:
                # found emphasis closer. now look back for first
                # matching opener:
                opener = closer.get('previous')
                opener_found = False
                closercc = closer.get('cc')
                while (opener is not None and opener != stack_bottom and
                       opener != openers_bottom[closercc]):
                    # "Rule of 3": delimiters that could both open and
                    # close cannot pair when their combined length is a
                    # multiple of 3 (unless each run is itself one).
                    odd_match = (closer.get('can_open') or
                                 opener.get('can_close')) and \
                        closer['origdelims'] % 3 != 0 and \
                        (opener['origdelims'] +
                         closer['origdelims']) % 3 == 0
                    if opener.get('cc') == closercc and \
                       opener.get('can_open') and \
                       not odd_match:
                        opener_found = True
                        break
                    opener = opener.get('previous')
                old_closer = closer

                if closercc == '*' or closercc == '_':
                    if not opener_found:
                        closer = closer.get('next')
                    else:
                        # Calculate actual number of delimiters used from
                        # closer
                        use_delims = 2 if (
                            closer['numdelims'] >= 2 and
                            opener['numdelims'] >= 2) else 1
                        opener_inl = opener.get('node')
                        closer_inl = closer.get('node')

                        # Remove used delimiters from stack elts and inlines
                        opener['numdelims'] -= use_delims
                        closer['numdelims'] -= use_delims

                        opener_inl.literal = opener_inl.literal[
                            :len(opener_inl.literal) - use_delims]
                        closer_inl.literal = closer_inl.literal[
                            :len(closer_inl.literal) - use_delims]

                        # Build contents for new Emph element
                        if use_delims == 1:
                            emph = Node('emph', None)
                        else:
                            emph = Node('strong', None)

                        tmp = opener_inl.nxt
                        while tmp and tmp != closer_inl:
                            nxt = tmp.nxt
                            tmp.unlink()
                            emph.append_child(tmp)
                            tmp = nxt

                        opener_inl.insert_after(emph)

                        # Remove elts between opener and closer in delimiters
                        # stack
                        self.removeDelimitersBetween(opener, closer)

                        # If opener has 0 delims, remove it and the inline
                        if opener['numdelims'] == 0:
                            opener_inl.unlink()
                            self.removeDelimiter(opener)

                        if closer['numdelims'] == 0:
                            closer_inl.unlink()
                            tempstack = closer['next']
                            self.removeDelimiter(closer)
                            closer = tempstack
                elif closercc == "'":
                    closer['node'].literal = '\u2019'
                    if opener_found:
                        opener['node'].literal = '\u2018'
                    closer = closer['next']
                elif closercc == '"':
                    closer['node'].literal = '\u201D'
                    if opener_found:
                        opener['node'].literal = '\u201C'
                    closer = closer['next']

                if not opener_found and not odd_match:
                    # Set lower bound for future searches for openers:
                    # We don't do this with odd_match because a **
                    # that doesn't match an earlier * might turn into
                    # an opener, and the * might be matched by something
                    # else.
                    openers_bottom[closercc] = old_closer['previous']
                    if not old_closer['can_open']:
                        # We can remove a closer that can't be an opener,
                        # once we've seen there's no matching opener:
                        self.removeDelimiter(old_closer)

        # Remove all delimiters
        while self.delimiters is not None and self.delimiters != stack_bottom:
            self.removeDelimiter(self.delimiters)

    def parseLinkTitle(self):
        """
        Attempt to parse link title (sans quotes), returning the string
        or None if no match.
        """
        title = self.match(reLinkTitle)
        if title is None:
            return None
        else:
            # chop off quotes from title and unescape:
            return unescape_string(title[1:-1])

    def parseLinkDestination(self):
        """
        Attempt to parse link destination, returning the string or
        None if no match.
        """
        res = self.match(reLinkDestinationBraces)
        if res is None:
            if self.peek() == '<':
                return None
            # TODO handrolled parser; res should be None or the string
            savepos = self.pos
            openparens = 0
            while True:
                c = self.peek()
                if c is None:
                    break
                if c == '\\' and re.search(
                        reEscapable, self.subject[self.pos+1:self.pos+2]):
                    self.pos += 1
                    if self.peek() is not None:
                        self.pos += 1
                elif c == '(':
                    self.pos += 1
                    openparens += 1
                elif c == ')':
                    if openparens < 1:
                        break
                    else:
                        self.pos += 1
                        openparens -= 1
                elif re.search(reWhitespaceChar, c):
                    break
                else:
                    self.pos += 1
            if self.pos == savepos and c != ')':
                return None
            res = self.subject[savepos:self.pos]
            return normalize_uri(unescape_string(res))
        else:
            # chop off surrounding <..>:
            return normalize_uri(unescape_string(res[1:-1]))

    def parseLinkLabel(self):
        """
        Attempt to parse a link label, returning number of
        characters parsed.
        """
        # Note:  our regex will allow something of form [..\];
        # we disallow it here rather than using lookahead in the regex:
        m = self.match(reLinkLabel)
        if m is None or len(m) > 1001:
            return 0
        else:
            return len(m)

    def parseOpenBracket(self, block):
        """
        Add open bracket to delimiter stack and add a text node to
        block's children.
        """
        startpos = self.pos
        self.pos += 1

        node = text('[')
        block.append_child(node)

        # Add entry to stack for this opener
        self.addBracket(node, startpos, False)
        return True

    def parseBang(self, block):
        """
        If next character is [, and ! delimiter to delimiter stack and
        add a text node to block's children.  Otherwise just add a text
        node.
        """
        startpos = self.pos
        self.pos += 1
        if self.peek() == '[':
            self.pos += 1

            node = text('![')
            block.append_child(node)

            # Add entry to stack for this opener
            self.addBracket(node, startpos + 1, True)
        else:
            block.append_child(text('!'))

        return True

    def parseCloseBracket(self, block):
        """
        Try to match close bracket against an opening in the delimiter
        stack.  Add either a link or image, or a plain [ character,
        to block's children.  If there is a matching delimiter,
        remove it from the delimiter stack.
        """
        title = None
        matched = False
        self.pos += 1
        startpos = self.pos

        # get last [ or ![
        opener = self.brackets

        if opener is None:
            # no matched opener, just return a literal
            block.append_child(text(']'))
            return True

        if not opener.get('active'):
            # no matched opener, just return a literal
            block.append_child(text(']'))
            # take opener off brackets stack
            self.removeBracket()
            return True

        # If we got here, opener is a potential opener
        is_image = opener.get('image')

        # Check to see if we have a link/image

        savepos = self.pos

        # Inline link?
        if self.peek() == '(':
            self.pos += 1
            self.spnl()
            dest = self.parseLinkDestination()
            if dest is not None and self.spnl():
                # make sure there's a space before the title
                if re.search(reWhitespaceChar, self.subject[self.pos-1]):
                    title = self.parseLinkTitle()
                if self.spnl() and self.peek() == ')':
                    self.pos += 1
                    matched = True
            else:
                self.pos = savepos

        if not matched:
            # Next, see if there's a link label
            beforelabel = self.pos
            n = self.parseLinkLabel()
            if n > 2:
                reflabel = self.subject[beforelabel:beforelabel + n]
            elif not opener.get('bracket_after'):
                # Empty or missing second label means to use the first
                # label as the reference.  The reference must not
                # contain a bracket. If we know there's a bracket, we
                # don't even bother checking it.
                reflabel = self.subject[opener.get('index'):startpos]
            if n == 0:
                # If shortcut reference link, rewind before spaces we skipped.
                self.pos = savepos

            if reflabel:
                # lookup rawlabel in refmap
                link = self.refmap.get(normalize_reference(reflabel))
                if link:
                    dest = link['destination']
                    title = link['title']
                    matched = True

        if matched:
            node = Node('image' if is_image else 'link', None)

            node.destination = dest
            node.title = title or ''
            tmp = opener.get('node').nxt
            while tmp:
                nxt = tmp.nxt
                tmp.unlink()
                node.append_child(tmp)
                tmp = nxt
            block.append_child(node)
            self.processEmphasis(opener.get('previousDelimiter'))
            self.removeBracket()
            opener.get('node').unlink()

            # We remove this bracket and processEmphasis will remove
            # later delimiters.
            # Now, for a link, we also deactivate earlier link openers.
            # (no links in links)
            if not is_image:
                opener = self.brackets
                while opener is not None:
                    if not opener.get('image'):
                        # deactivate this opener
                        opener['active'] = False
                    opener = opener.get('previous')

            return True
        else:
            # no match
            # remove this opener from stack
            self.removeBracket()
            self.pos = startpos
            block.append_child(text(']'))
            return True

    def addBracket(self, node, index, image):
        """Push a '[' or '![' opener onto the bracket stack."""
        if self.brackets is not None:
            # BUG FIX: this key was written as 'bracketAfter' while
            # parseCloseBracket reads 'bracket_after', so the flag was
            # never observed.  Use the key the reader expects.
            self.brackets['bracket_after'] = True
        self.brackets = {
            'node': node,
            'previous': self.brackets,
            'previousDelimiter': self.delimiters,
            'index': index,
            'image': image,
            'active': True,
        }

    def removeBracket(self):
        """Pop the most recent opener off the bracket stack."""
        self.brackets = self.brackets.get('previous')

    def parseEntity(self, block):
        """Attempt to parse an entity."""
        m = self.match(reEntityHere)
        if m:
            block.append_child(text(HTMLunescape(m)))
            return True
        else:
            return False

    def parseString(self, block):
        """
        Parse a run of ordinary characters, or a single character with
        a special meaning in markdown, as a plain string.
        """
        m = self.match(reMain)
        if m:
            if self.options.get('smart'):
                s = re.sub(reEllipses, '\u2026', m)
                s = re.sub(reDash, lambda x: smart_dashes(x.group()), s)
                block.append_child(text(s))
            else:
                block.append_child(text(m))
            return True
        else:
            return False

    def parseNewline(self, block):
        """
        Parse a newline.  If it was preceded by two spaces, return a hard
        line break; otherwise a soft line break.
        """
        # assume we're at a \n
        self.pos += 1
        lastc = block.last_child
        if lastc and lastc.t == 'text' and lastc.literal[-1] == ' ':
            linebreak = len(lastc.literal) >= 2 and lastc.literal[-2] == ' '
            lastc.literal = re.sub(reFinalSpace, '', lastc.literal)
            if linebreak:
                node = Node('linebreak', None)
            else:
                node = Node('softbreak', None)
            block.append_child(node)
        else:
            block.append_child(Node('softbreak', None))

        # gobble leading spaces in next line
        self.match(reInitialSpace)
        return True

    def parseReference(self, s, refmap):
        """Attempt to parse a link reference, modifying refmap."""
        self.subject = s
        self.pos = 0
        startpos = self.pos

        # label:
        match_chars = self.parseLinkLabel()
        if match_chars == 0 or match_chars == 2:
            return 0
        else:
            rawlabel = self.subject[:match_chars]

        # colon:
        if (self.peek() == ':'):
            self.pos += 1
        else:
            self.pos = startpos
            return 0

        # link url
        self.spnl()

        dest = self.parseLinkDestination()
        if dest is None:
            self.pos = startpos
            return 0

        beforetitle = self.pos
        self.spnl()
        title = None
        if self.pos != beforetitle:
            title = self.parseLinkTitle()
        if title is None:
            title = ''
            # rewind before spaces
            self.pos = beforetitle

        # make sure we're at line end:
        at_line_end = True
        if self.match(reSpaceAtEndOfLine) is None:
            if title == '':
                at_line_end = False
            else:
                # the potential title we found is not at the line end,
                # but it could still be a legal link reference if we
                # discard the title
                # BUG FIX: this was 'title == ""' (a no-op comparison);
                # the comment shows an assignment was intended.
                title = ''
                # rewind before spaces
                self.pos = beforetitle
                # and instead check if the link URL is at the line end
                at_line_end = self.match(reSpaceAtEndOfLine) is not None

        if not at_line_end:
            self.pos = startpos
            return 0

        normlabel = normalize_reference(rawlabel)
        if normlabel == '':
            # label must contain non-whitespace characters
            self.pos = startpos
            return 0

        if not refmap.get(normlabel):
            refmap[normlabel] = {
                'destination': dest,
                'title': title
            }
        return (self.pos - startpos)

    def parseInline(self, block):
        """
        Parse the next inline element in subject, advancing subject
        position.

        On success, add the result to block's children and return True.
        On failure, return False.
        """
        res = False
        c = self.peek()
        if c is None:
            return False
        if c == '\n':
            res = self.parseNewline(block)
        elif c == '\\':
            res = self.parseBackslash(block)
        elif c == '`':
            res = self.parseBackticks(block)
        elif c == '*' or c == '_':
            res = self.handleDelim(c, block)
        elif c == "'" or c == '"':
            res = self.options.get('smart') and self.handleDelim(c, block)
        elif c == '[':
            res = self.parseOpenBracket(block)
        elif c == '!':
            res = self.parseBang(block)
        elif c == ']':
            res = self.parseCloseBracket(block)
        elif c == '<':
            res = self.parseAutolink(block) or self.parseHtmlTag(block)
        elif c == '&':
            res = self.parseEntity(block)
        else:
            res = self.parseString(block)

        if not res:
            self.pos += 1
            block.append_child(text(c))

        return True

    def parseInlines(self, block):
        """
        Parse string content in block into inline children,
        using refmap to resolve references.
        """
        self.subject = block.string_content.strip()
        self.pos = 0
        self.delimiters = None
        self.brackets = None
        while (self.parseInline(block)):
            pass
        # allow raw string to be garbage collected
        block.string_content = None
        self.processEmphasis(None)

    parse = parseInlines

View file

@ -0,0 +1,41 @@
# 2014 - Bibek Kafle & Roland Shoemaker
# 2015-2017 - Nikolas Nyby
# Port of @jgm's commonmark.js implementation of the CommonMark spec.
# Basic usage:
#
# import commonmark
# parser = commonmark.Parser()
# renderer = commonmark.HtmlRenderer()
# print(renderer.render(parser.parse('Hello *world*')))
from __future__ import absolute_import, unicode_literals
from commonmark.blocks import Parser
from commonmark.dump import dumpAST, dumpJSON
from commonmark.render.html import HtmlRenderer
from commonmark.render.rst import ReStructuredTextRenderer
def commonmark(text, format="html"):
    """Render CommonMark into HTML, JSON, AST or reStructuredText.

    Optional keyword arguments:

    * format: 'html' (default), 'json', 'ast' or 'rst'

    >>> commonmark("*hello!*")
    '<p><em>hello!</em></p>\\n'
    """
    # Validate the format before doing any parsing work.
    # BUG FIX: the old message omitted 'rst', which is accepted below.
    if format not in ["html", "json", "ast", "rst"]:
        raise ValueError("format must be 'html', 'json', 'ast' or 'rst'")
    parser = Parser()
    ast = parser.parse(text)
    if format == "html":
        renderer = HtmlRenderer()
        return renderer.render(ast)
    if format == "json":
        return dumpJSON(ast)
    if format == "ast":
        return dumpAST(ast)
    if format == "rst":
        renderer = ReStructuredTextRenderer()
        return renderer.render(ast)

View file

@ -0,0 +1,179 @@
from __future__ import unicode_literals
import re
# Node types that may contain child nodes; consulted by is_container()
# (and hence by NodeWalker when deciding whether to descend).
reContainer = re.compile(
    r'(document|block_quote|list|item|paragraph|'
    r'heading|emph|strong|link|image|'
    r'custom_inline|custom_block)')
def is_container(node):
    """Return True if *node*'s type allows child nodes."""
    return reContainer.search(node.t) is not None
class NodeWalker(object):
    """Depth-first iterator over a Node subtree rooted at *root*.

    Yields ``(node, entering)`` pairs.  Container nodes are yielded
    twice: once with entering=True before their children and once with
    entering=False afterwards; leaf nodes are yielded once (entering=True).
    """
    def __init__(self, root):
        # Traversal state: the node to yield next, and whether we are
        # on the way "down" into it (entering) or back "up" out of it.
        self.current = root
        self.root = root
        self.entering = True

    def __next__(self):
        cur = self.current
        entering = self.entering

        if cur is None:
            raise StopIteration

        container = is_container(cur)

        if entering and container:
            if cur.first_child:
                # Descend into the first child.
                self.current = cur.first_child
                self.entering = True
            else:
                # stay on node but exit
                self.entering = False
        elif cur == self.root:
            # Exiting the root ends the traversal.
            self.current = None
        elif cur.nxt is None:
            # No next sibling: move back up to the parent (exiting it).
            self.current = cur.parent
            self.entering = False
        else:
            # Advance to the next sibling.
            self.current = cur.nxt
            self.entering = True

        return cur, entering

    next = __next__  # Python 2 iterator protocol alias

    def __iter__(self):
        return self

    def nxt(self):
        """ for backwards compatibility """
        try:
            cur, entering = next(self)
            return {
                'entering': entering,
                'node': cur,
            }
        except StopIteration:
            return None

    def resume_at(self, node, entering):
        # Reposition the walker so the next call yields *node*.
        self.current = node
        self.entering = (entering is True)
class Node(object):
    """A node of the CommonMark AST.

    Nodes form a tree via parent/first_child/last_child pointers and a
    doubly-linked sibling list via prv/nxt.
    """
    def __init__(self, node_type, sourcepos):
        self.t = node_type            # node type, e.g. 'paragraph', 'text'
        self.parent = None            # tree links
        self.first_child = None
        self.last_child = None
        self.prv = None               # previous sibling
        self.nxt = None               # next sibling
        self.sourcepos = sourcepos    # [[startline, startcol], [endline, endcol]] or None
        self.last_line_blank = False  # block-parsing state
        self.last_line_checked = False
        self.is_open = True
        self.string_content = ''      # raw text accumulated during block parsing
        self.literal = None           # literal content (text/code/html nodes)
        self.list_data = {}           # list metadata (type, bullet_char, start, ...)
        self.info = None              # fenced code info string
        self.destination = None       # link/image destination
        self.title = None             # link/image title
        self.is_fenced = False        # fenced code block properties
        self.fence_char = None
        self.fence_length = 0
        self.fence_offset = None
        self.level = None             # heading level
        self.on_enter = None          # custom block/inline literals
        self.on_exit = None

    def __repr__(self):
        return "Node {} [{}]".format(self.t, self.literal)

    def pretty(self):
        """Debug helper: pretty-print this node's attribute dict."""
        from pprint import pprint
        pprint(self.__dict__)

    def normalize(self):
        """Merge runs of adjacent 'text' children throughout the subtree."""
        prev = None
        for curr, _ in self.walker():
            if prev is None:
                prev = curr
                continue
            if prev.t == 'text' and curr.t == 'text':
                prev.literal += curr.literal
                curr.unlink()
            else:
                prev = curr

    def is_container(self):
        return is_container(self)

    def append_child(self, child):
        """Detach *child* from wherever it is and append it as last child."""
        child.unlink()
        child.parent = self
        if self.last_child:
            self.last_child.nxt = child
            child.prv = self.last_child
            self.last_child = child
        else:
            self.first_child = child
            self.last_child = child

    def prepend_child(self, child):
        """Detach *child* and insert it as this node's first child."""
        child.unlink()
        child.parent = self
        if self.first_child:
            self.first_child.prv = child
            child.nxt = self.first_child
            self.first_child = child
        else:
            self.first_child = child
            self.last_child = child

    def unlink(self):
        """Remove this node from its parent and sibling list."""
        if self.prv:
            self.prv.nxt = self.nxt
        elif self.parent:
            self.parent.first_child = self.nxt
        if self.nxt:
            self.nxt.prv = self.prv
        elif self.parent:
            self.parent.last_child = self.prv
        self.parent = None
        self.nxt = None
        self.prv = None

    def insert_after(self, sibling):
        """Insert *sibling* immediately after this node.

        NOTE(review): assumes self.parent is not None when self is the
        last sibling — inserting after a detached root would raise;
        callers appear to use this only on attached nodes. TODO confirm.
        """
        sibling.unlink()
        sibling.nxt = self.nxt
        if sibling.nxt:
            sibling.nxt.prv = sibling
        sibling.prv = self
        self.nxt = sibling
        sibling.parent = self.parent
        if not sibling.nxt:
            sibling.parent.last_child = sibling

    def insert_before(self, sibling):
        """Insert *sibling* immediately before this node (see insert_after
        note about the parent assumption)."""
        sibling.unlink()
        sibling.prv = self.prv
        if sibling.prv:
            sibling.prv.nxt = sibling
        sibling.nxt = self
        self.prv = sibling
        sibling.parent = self.parent
        if not sibling.prv:
            sibling.parent.first_child = sibling

    def walker(self):
        """Return a NodeWalker iterating over this subtree."""
        return NodeWalker(self)

View file

@ -0,0 +1,165 @@
"""Case-folding and whitespace normalization"""
# Unicode Case Folding table has been derived from the following work:
#
# CaseFolding-12.0.0.txt
# Date: 2019-01-22, 08:18:22 GMT
# (c) 2019 Unicode(R) Inc.
# Unicode and the Unicode Logo are registered trademarks
# of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
import re
import sys
from builtins import str, chr
__all__ = ["normalize_reference"]
if sys.version_info < (3,) and sys.maxunicode <= 0xffff:
    # shim for Python 2.x UCS2 build
    _unichr = chr

    def chr(cdp):
        # Encode supplementary-plane code points as a UTF-16 surrogate
        # pair, since a narrow build cannot represent them directly.
        if 0x10000 <= cdp < 0x110000:
            cdp -= 0x10000
            return (_unichr(0xd800 | (cdp >> 10)) +
                    _unichr(0xdc00 | (cdp & 0x3ff)))
        return _unichr(cdp)
def _parse_table(tbl):
    """Decode the compact case-folding table into {codepoint: folded_str}.

    *tbl* is a ';'-separated list of entries.  Each entry's first field
    is a ':'-separated, base-36, delta-encoded run descriptor
    (gap[:streak]:delta[:stride]); any remaining ','-separated fields
    are base-36 code points of a suffix appended to every folded value
    in the run.
    """
    xlat = {}
    cur_i, cur_j = -1, 0
    for entry in tbl.split(';'):
        arr = entry.split(',')
        # Run descriptor (base 36); empty fields decode to 0.
        info = [int(x, 36) if x else 0 for x in arr[0].split(':')]
        # Suffix code points (base 36).
        arr = [int(x, 36) for x in arr[1:]]
        assert not any(x in xlat for x in arr)
        sfx = ''.join(map(chr, arr))
        streak, stride = 0, 1
        if len(info) == 2:
            fdt, delta = info
        elif len(info) == 3:
            fdt, streak, delta = info
        else:
            fdt, streak, delta, stride = info
        assert streak >= 0 and stride >= 1
        # Both the source code point and the fold offset are
        # delta-encoded relative to the previous entry.
        cur_i += fdt + 1
        cur_j -= delta
        assert cur_j != 0
        i = cur_i
        last = cur_i + streak
        while i <= last:
            # uniqueness and idempotency
            assert i not in xlat and i + cur_j not in xlat
            assert i not in arr
            xlat[i] = chr(i + cur_j) + sfx
            i += stride
    return xlat
# Full case-folding map {codepoint: folded string}, decoded from the
# compact representation below (see _parse_table for the encoding).
XLAT = _parse_table(
    # ===== Start of Unicode Case Folding table =====
    '1t:p:-w;37:-kn;a:m:kn;n:6:;6:3w,37;w:1a:-31:2;1b:5k,lj;1:4:-5k:2;6:e::'
    '2;f:-aa,32;:18:aa:2;19:3e;:4:-3e:2;5:7h;1:-da;:2:5t:2;3:-5p;:5p;1:1:-5'
    'o;1:5o;2:-26;:-3f;:-1;:5m;1:-5o;:-2;1:-4;:2;:5s;3:-5u;:-2;1:-1;:4:5x:2'
    ';5:-61;:61;1:-61;2:61;1:-61;:61;1:1:-60;1:2:60:2;3:-62;:4:62:4;b:-1;:1'
    ';1:-1;:1;1:-1;:g:1:2;i:g::2;h:av,lo;:-aw;:2:1:2;3:2q;:-15;:12:-1l:2;13'
    ':3n;1:g:-3n:2;n:-8bu;:8bu;1:4k;:-8gb;2:8br;1:5g;:-7c;:-2;:8:1y:2;72:-3'
    '7;16:2:37:2;5:;8:-37;6:26;1:2:1;3:-r;1:1:1;1:m,lk,ld;:g:9;h:8:;c:b,lk,'
    'ld;h:k;c:-7;:12;:-5;3:-a;:7;1:m:-n:2;n:1j;:-6;2:c;:4;1:-1t;1:8;:-8;2:2'
    ':3n;2:f:-5u;f:v:1c;27:w:v:2;15:1g::2;1h:-e;:c:e:2;e:2m::2;2o:11:-1b;2d'
    ':2a,136;26w:11:-5mq;12:6::6;mo:5:5m0;1on:4sm;:-1;:-9;:1:-2;1:1;:-7;:-o'
    ';:-vzb;7:16:tj7;18:2:;8y:44:-2bl:2;45:5yn,mp;:-b,lk;:-2,lm;:-1,lm;:p,j'
    'i;:-5xb;2:5wx,37;1:2m:-5yk:2;2v:7:9;f:5:;f:7:;f:7:;f:5:;7:5fn,lv;1:2,l'
    'v,lc;1:2,lv,ld;1:2,lv,n6;2:6:-5ft:2;e:7:;n:7:3c,qh;7:7:8,qh;7:7:-o,qh;'
    '7:7:8,qh;7:7:-1k,qh;7:7:8,qh;9:-6,qh;:5hc,qh;:6,qh;1:-3,n6;:1,n6,qh;:1'
    ':-5j2;1:1:1u;1:5hd,qh;1:-6;3:-5h3,qh;:5ha,qh;:a,qh;1:-7,n6;:1,n6,qh;:3'
    ':-5h6;3:5hb,qh;5:4,lk,lc;:1,lk,ld;2:3,n6;:1,lk,n6;:1:-5jq;1:1:2k;7:5h5'
    ',lk,lc;:1,lk,ld;:5,lv;1:-2,n6;:1,lk,n6;:1:-5ju;1:1:2w;1:-2x;5:33,qh;:5'
    'h0,qh;:-4,qh;1:7,n6;:1,n6,qh;:1:-5gu;1:1:-2;1:5h1,qh;89:8a;3:o2;:-3d;6'
    ':-6ea;19:f:c;y:f;mq:p:-p;1ft:1a:-m;2n:1b;1:8ag;:-5ch;:5c1;2:4:-8a0:2;5'
    ':8bh;:-v;:y;:-1;1:3:-8bj:3;b:1:8cg;1:2q:-8cg:2;2y:2::2;6:nym::nym;nyn:'
    '16::2;1p:q::2;4h:c::2;f:1o::2;1y:2::2;3:r9h;:8:-r9h:2;c:;1:wmh;2:2:-wm'
    'h:2;5:i::2;j:wn9;:b;:-4;:-a;:3;1:-1e;:o;:-l;:-xbp;:a:pr:2;d:;1:1d;:wlv'
    ';:-5cb;q1:27:2oo;fpr:jii,2u;:1,2x;:1,30;:1,2u,2x;:1,2u,30;:-c,38;:1,38'
    ';c:-z8,12u;:1,12d;:1,12j;:-9,12u;:b,12l;sp:p:-1cjn;ym:13:-8;4v:z:;1jj:'
    '1e:-o;2e7:v:w;gwv:v:;o8v:x:-2'
    # ===== End of Unicode Case Folding table =====
)
def _check_native(tbl):
"""
Determine if Python's own native implementation
subsumes the supplied case folding table
"""
try:
for i in tbl:
stv = chr(i)
if stv.casefold() == stv:
return False
except AttributeError:
return False
return True
# Hoist version check out of function for performance
SPACE_RE = re.compile(r'[ \t\r\n]+')

if _check_native(XLAT):
    # Native str.casefold covers the whole table: use it directly.
    def normalize_reference(string):
        """
        Normalize reference label: collapse internal whitespace
        to single space, remove leading/trailing whitespace, case fold.
        """
        return SPACE_RE.sub(' ', string[1:-1].strip()).casefold()
elif sys.version_info >= (3,) or sys.maxunicode > 0xffff:
    # Wide build without (full) casefold: fold via the XLAT table.
    def normalize_reference(string):
        """
        Normalize reference label: collapse internal whitespace
        to single space, remove leading/trailing whitespace, case fold.
        """
        return SPACE_RE.sub(' ', string[1:-1].strip()).translate(XLAT)
else:
    # Narrow (UCS2) build: supplementary-plane code points appear as
    # surrogate pairs, which str.translate cannot fold, so build a regex
    # that matches them (plus whitespace runs) and substitutes directly.
    def _get_smp_regex():
        # Character class ranges over the low-surrogate halves, grouped
        # by shared high surrogate.
        xls = sorted(x - 0x10000 for x in XLAT if x >= 0x10000)
        xls.append(-1)
        # Unpack the four metacharacters '-', '[', '|', ']' from a string.
        fmt, (dsh, opn, pip, cse) = str('\\u%04x'), str('-[|]')
        rga, srk, erk = [str(r'[ \t\r\n]+')], 0, -2
        for k in xls:
            # True when k falls under a different high surrogate than erk.
            new_hir = (erk ^ k) >> 10 != 0
            if new_hir or erk + 1 != k:
                if erk >= 0 and srk != erk:
                    if srk + 1 != erk:
                        rga.append(dsh)
                    rga.append(fmt % (0xdc00 + (erk & 0x3ff)))
                if new_hir:
                    if erk >= 0:
                        rga.append(cse)
                    if k < 0:
                        break
                    rga.append(pip)
                    rga.append(fmt % (0xd800 + (k >> 10)))
                    rga.append(opn)
                srk = k
                rga.append(fmt % (0xdc00 + (srk & 0x3ff)))
            erk = k
        return re.compile(str().join(rga))

    def _subst_handler(matchobj):
        # Whitespace runs collapse to one space; surrogate pairs are
        # decoded and folded through XLAT.
        src = matchobj.group(0)
        hiv = ord(src[0])
        if hiv < 0xd800:
            return ' '
        return XLAT[0x10000 + ((hiv & 0x3ff) << 10) | (ord(src[1]) & 0x3ff)]

    SMP_RE = _get_smp_regex()

    def normalize_reference(string):
        """
        Normalize reference label: collapse internal whitespace
        to single space, remove leading/trailing whitespace, case fold.
        """
        return SMP_RE.sub(_subst_handler, string[1:-1].strip()).translate(XLAT)

View file

@ -0,0 +1,228 @@
from __future__ import unicode_literals
import re
from builtins import str
from commonmark.common import escape_xml
from commonmark.render.renderer import Renderer
# NOTE(review): the '^' binds only to the first alternative, so
# 'vbscript:'/'file:'/'data:' match anywhere in the URL, not just as a
# protocol prefix.  This matches the upstream commonmark.js regex and
# only over-flags (fails safe), so it is kept as-is.
reUnsafeProtocol = re.compile(
    r'^javascript:|vbscript:|file:|data:', re.IGNORECASE)
# data: URLs carrying these image types are allowed despite the above.
reSafeDataProtocol = re.compile(
    r'^data:image\/(?:png|gif|jpeg|webp)', re.IGNORECASE)
def potentially_unsafe(url):
return re.search(reUnsafeProtocol, url) and \
(not re.search(reSafeDataProtocol, url))
class HtmlRenderer(Renderer):
    """Render a CommonMark AST to HTML.

    Recognized options:
        softbreak: text emitted for soft line breaks (default ``'\\n'``;
            use ``'<br />'`` for hard breaks, ``' '`` to ignore wrapping).
        safe: when truthy, raw HTML is omitted and potentially unsafe
            URLs are dropped from href/src attributes.
        sourcepos: when truthy, block tags get data-sourcepos attributes.
    """

    def __init__(self, options=None):
        # Fix: the old signature used a mutable default (options={})
        # shared by every instance, and also mutated the caller's dict.
        # Copy instead so construction never leaks state.
        options = dict(options) if options else {}
        # by default, soft breaks are rendered as newlines in HTML
        # set to "<br />" to make them hard breaks
        # set to " " if you want to ignore line wrapping in source
        options['softbreak'] = options.get('softbreak') or '\n'
        self.disable_tags = 0
        self.last_out = '\n'
        self.options = options

    def escape(self, text):
        """Escape XML/HTML special characters in *text*."""
        return escape_xml(text)

    def tag(self, name, attrs=None, selfclosing=None):
        """Helper function to produce an HTML tag."""
        # While inside an image's alt text, all tag output is suppressed.
        if self.disable_tags > 0:
            return
        self.buf += '<' + name
        if attrs and len(attrs) > 0:
            for attrib in attrs:
                self.buf += ' ' + attrib[0] + '="' + attrib[1] + '"'
        if selfclosing:
            self.buf += ' /'
        self.buf += '>'
        self.last_out = '>'

    # Node methods #

    def text(self, node, entering=None):
        """Emit escaped literal text."""
        self.out(node.literal)

    def softbreak(self, node=None, entering=None):
        """Emit the configured soft-break string."""
        self.lit(self.options['softbreak'])

    def linebreak(self, node=None, entering=None):
        """Emit a hard line break."""
        self.tag('br', [], True)
        self.cr()

    def link(self, node, entering):
        """Open/close an <a>; unsafe hrefs are dropped in safe mode."""
        attrs = self.attrs(node)
        if entering:
            if not (self.options.get('safe') and
                    potentially_unsafe(node.destination)):
                attrs.append(['href', self.escape(node.destination)])
            if node.title:
                attrs.append(['title', self.escape(node.title)])
            self.tag('a', attrs)
        else:
            self.tag('/a')

    def image(self, node, entering):
        """Emit an <img>; the node's children become the alt text, so
        tag output is disabled while inside the image node."""
        if entering:
            if self.disable_tags == 0:
                if self.options.get('safe') and \
                        potentially_unsafe(node.destination):
                    self.lit('<img src="" alt="')
                else:
                    self.lit('<img src="' +
                             self.escape(node.destination) +
                             '" alt="')
            self.disable_tags += 1
        else:
            self.disable_tags -= 1
            if self.disable_tags == 0:
                if node.title:
                    self.lit('" title="' + self.escape(node.title))
                self.lit('" />')

    def emph(self, node, entering):
        """Emphasis -> <em>."""
        self.tag('em' if entering else '/em')

    def strong(self, node, entering):
        """Strong emphasis -> <strong>."""
        self.tag('strong' if entering else '/strong')

    def paragraph(self, node, entering):
        """Paragraph -> <p>, except inside tight lists (no tag at all)."""
        grandparent = node.parent.parent
        attrs = self.attrs(node)
        if grandparent is not None and grandparent.t == 'list':
            if grandparent.list_data['tight']:
                return
        if entering:
            self.cr()
            self.tag('p', attrs)
        else:
            self.tag('/p')
            self.cr()

    def heading(self, node, entering):
        """Heading -> <h1>..<h6> according to node.level."""
        tagname = 'h' + str(node.level)
        attrs = self.attrs(node)
        if entering:
            self.cr()
            self.tag(tagname, attrs)
        else:
            self.tag('/' + tagname)
            self.cr()

    def code(self, node, entering):
        """Inline code -> <code>."""
        self.tag('code')
        self.out(node.literal)
        self.tag('/code')

    def code_block(self, node, entering):
        """Code block -> <pre><code>, with a language- class from the
        first word of the info string, if any."""
        info_words = node.info.split() if node.info else []
        attrs = self.attrs(node)
        if len(info_words) > 0 and len(info_words[0]) > 0:
            attrs.append(['class', 'language-' +
                          self.escape(info_words[0])])
        self.cr()
        self.tag('pre')
        self.tag('code', attrs)
        self.out(node.literal)
        self.tag('/code')
        self.tag('/pre')
        self.cr()

    def thematic_break(self, node, entering):
        """Thematic break -> <hr />."""
        attrs = self.attrs(node)
        self.cr()
        self.tag('hr', attrs, True)
        self.cr()

    def block_quote(self, node, entering):
        """Block quote -> <blockquote>."""
        attrs = self.attrs(node)
        if entering:
            self.cr()
            self.tag('blockquote', attrs)
            self.cr()
        else:
            self.cr()
            self.tag('/blockquote')
            self.cr()

    def list(self, node, entering):
        """List -> <ul>/<ol>, with a start attribute when not 1."""
        tagname = 'ul' if node.list_data['type'] == 'bullet' else 'ol'
        attrs = self.attrs(node)
        if entering:
            start = node.list_data['start']
            if start is not None and start != 1:
                attrs.append(['start', str(start)])
            self.cr()
            self.tag(tagname, attrs)
            self.cr()
        else:
            self.cr()
            self.tag('/' + tagname)
            self.cr()

    def item(self, node, entering):
        """List item -> <li>."""
        attrs = self.attrs(node)
        if entering:
            self.tag('li', attrs)
        else:
            self.tag('/li')
            self.cr()

    def html_inline(self, node, entering):
        """Raw inline HTML; replaced by a comment in safe mode."""
        if self.options.get('safe'):
            self.lit('<!-- raw HTML omitted -->')
        else:
            self.lit(node.literal)

    def html_block(self, node, entering):
        """Raw HTML block; replaced by a comment in safe mode."""
        self.cr()
        if self.options.get('safe'):
            self.lit('<!-- raw HTML omitted -->')
        else:
            self.lit(node.literal)
        self.cr()

    def custom_inline(self, node, entering):
        """Custom inline node: emit its on_enter/on_exit literals."""
        if entering and node.on_enter:
            self.lit(node.on_enter)
        elif (not entering) and node.on_exit:
            self.lit(node.on_exit)

    def custom_block(self, node, entering):
        """Custom block node: emit its on_enter/on_exit literals."""
        self.cr()
        if entering and node.on_enter:
            self.lit(node.on_enter)
        elif (not entering) and node.on_exit:
            self.lit(node.on_exit)
        self.cr()

    # Helper methods #

    def out(self, s):
        """Append *s* to the buffer, XML-escaped."""
        self.lit(self.escape(s))

    def attrs(self, node):
        """Return base attributes for *node*: data-sourcepos if enabled."""
        att = []
        if self.options.get('sourcepos'):
            pos = node.sourcepos
            if pos:
                att.append(['data-sourcepos', str(pos[0][0]) + ':' +
                            str(pos[0][1]) + '-' + str(pos[1][0]) + ':' +
                            str(pos[1][1])])
        return att

View file

@ -0,0 +1,43 @@
from __future__ import unicode_literals
class Renderer(object):
    """Base renderer: walks an AST and dispatches each walker event to
    the method named after the node type, accumulating output in
    ``self.buf``."""

    def render(self, ast):
        """Walks the AST and calls member methods for each Node type.

        @param ast {Node} The root of the abstract syntax tree.
        """
        self.buf = ''
        self.last_out = '\n'
        walker = ast.walker()
        event = walker.nxt()
        while event is not None:
            node = event['node']
            handler = getattr(self, node.t, None)
            if handler is not None:
                handler(node, event['entering'])
            event = walker.nxt()
        return self.buf

    def lit(self, s):
        """Concatenate a literal string to the buffer.

        @param str {String} The string to concatenate.
        """
        self.buf += s
        self.last_out = s

    def cr(self):
        """Emit a newline unless the buffer already ends with one."""
        if self.last_out != '\n':
            self.lit('\n')

    def out(self, s):
        """Concatenate a string to the buffer possibly escaping the content.

        Concrete renderer implementations should override this method.

        @param str {String} The string to concatenate.
        """
        self.lit(s)

View file

@ -0,0 +1,159 @@
from __future__ import unicode_literals
from commonmark.render.renderer import Renderer
class ReStructuredTextRenderer(Renderer):
    """
    Render reStructuredText from Markdown

    Example:

    .. code:: python

        import commonmark

        parser = commonmark.Parser()
        ast = parser.parse('Hello `inline code` example')

        renderer = commonmark.ReStructuredTextRenderer()
        rst = renderer.render(ast)
        print(rst) # Hello ``inline code`` example
    """
    def __init__(self, indent_char=' '):
        # Character repeated indent_length times at each fresh line.
        self.indent_char = indent_char
        # Current indentation depth (grows inside images/block quotes).
        self.indent_length = 0

    def lit(self, s):
        """Append *s*, prefixing the current indentation when starting a
        new line (bare newlines are never indented)."""
        if s == '\n':
            indent = ''  # Avoid whitespace if we're just adding a newline
        elif self.last_out != '\n':
            indent = ''  # Don't indent if we're in the middle of a line
        else:
            indent = self.indent_char * self.indent_length

        return super(ReStructuredTextRenderer, self).lit(indent + s)

    def cr(self):
        self.lit('\n')

    def indent_lines(self, literal, indent_length=4):
        """Return *literal* with each line prefixed by *indent_length*
        indent characters."""
        indent = self.indent_char * indent_length
        new_lines = []

        for line in literal.splitlines():
            new_lines.append(indent + line)

        return '\n'.join(new_lines)

    # Nodes

    def document(self, node, entering):
        pass

    def softbreak(self, node, entering):
        self.cr()

    def linebreak(self, node, entering):
        self.cr()
        self.cr()

    def text(self, node, entering):
        self.out(node.literal)

    def emph(self, node, entering):
        self.out('*')

    def strong(self, node, entering):
        self.out('**')

    def paragraph(self, node, entering):
        # Paragraphs inside list items get no surrounding blank line.
        if node.parent.t == 'item':
            pass
        else:
            self.cr()

    def link(self, node, entering):
        if entering:
            self.out('`')
        else:
            self.out(' <%s>`_' % node.destination)

    def image(self, node, entering):
        directive = '.. image:: ' + node.destination

        if entering:
            self.out(directive)
            self.cr()
            self.indent_length += 4
            self.out(':alt: ')
        else:
            self.indent_length -= 4

    def code(self, node, entering):
        self.out('``')
        self.out(node.literal)
        self.out('``')

    def code_block(self, node, entering):
        directive = '.. code::'
        language_name = None

        info_words = node.info.split() if node.info else []
        if len(info_words) > 0 and len(info_words[0]) > 0:
            language_name = info_words[0]

        if language_name:
            directive += ' ' + language_name

        self.cr()
        self.out(directive)
        self.cr()
        self.cr()
        self.out(self.indent_lines(node.literal))
        self.cr()

    def list(self, node, entering):
        if entering:
            self.cr()

    def item(self, node, entering):
        tagname = '*' if node.list_data['type'] == 'bullet' else '#.'

        if entering:
            self.out(tagname + ' ')
        else:
            self.cr()

    def block_quote(self, node, entering):
        if entering:
            self.indent_length += 4
        else:
            self.indent_length -= 4

    def heading(self, node, entering):
        heading_chars = [
            '#',
            '*',
            '=',
            '-',
            '^',
            '"'
        ]

        try:
            heading_char = heading_chars[node.level-1]
        except IndexError:
            # Default to the last level if we're in too deep
            heading_char = heading_chars[-1]

        # Fix: guard against empty headings ('#') where first_child is
        # None, and headings whose first child is not a text node
        # (e.g. '# *emph*') where literal is None; both previously
        # crashed on len(). Fall back to a 1-character banner.
        first = node.first_child
        literal = first.literal if first is not None else None
        heading_length = len(literal) if literal else 1
        banner = heading_char * heading_length

        if entering:
            self.cr()
        else:
            self.cr()
            self.out(banner)
            self.cr()

View file

@ -0,0 +1,172 @@
import unittest
import commonmark
class TestCommonmark(unittest.TestCase):
    """Round-trip tests for ReStructuredTextRenderer: each case parses
    Markdown and compares the rendered RST with an expected literal."""

    def setUp(self):
        # Fresh parser/renderer pair for every test.
        self.parser = commonmark.Parser()
        self.renderer = commonmark.ReStructuredTextRenderer()

    def render_rst(self, test_str):
        """Parse *test_str* as Markdown and render it to RST."""
        ast = self.parser.parse(test_str)
        rst = self.renderer.render(ast)
        return rst

    def assertEqualRender(self, src_markdown, expected_rst):
        """Assert rendering *src_markdown* yields *expected_rst*."""
        rendered_rst = self.render_rst(src_markdown)
        self.assertEqual(rendered_rst, expected_rst)

    def test_strong(self):
        src_markdown = 'Hello **Strong**'
        expected_rst = '\nHello **Strong**\n'
        self.assertEqualRender(src_markdown, expected_rst)

    def test_emphasis(self):
        src_markdown = 'Hello *Emphasis*'
        expected_rst = '\nHello *Emphasis*\n'
        self.assertEqualRender(src_markdown, expected_rst)

    def test_paragraph(self):
        src_markdown = 'Hello paragraph'
        expected_rst = '\nHello paragraph\n'
        self.assertEqualRender(src_markdown, expected_rst)

    def test_link(self):
        src_markdown = '[Link](http://example.com)'
        expected_rst = '\n`Link <http://example.com>`_\n'
        self.assertEqualRender(src_markdown, expected_rst)

    def test_image(self):
        src_markdown = '![Image](http://placekitten.com/100/100)'
        expected_rst = """
.. image:: http://placekitten.com/100/100
    :alt: Image
"""
        self.assertEqualRender(src_markdown, expected_rst)

    def test_code(self):
        src_markdown = 'Test `inline code` with backticks'
        expected_rst = '\nTest ``inline code`` with backticks\n'
        self.assertEqualRender(src_markdown, expected_rst)

    def test_code_block(self):
        src_markdown = """
```python
# code block
print '3 backticks or'
print 'indent 4 spaces'
```
"""
        expected_rst = """
.. code:: python
    # code block
    print '3 backticks or'
    print 'indent 4 spaces'
"""
        self.assertEqualRender(src_markdown, expected_rst)

    def test_unordered_list(self):
        src_markdown = """
This is a list:
* List item
* List item
* List item
"""
        expected_rst = """
This is a list:
* List item
* List item
* List item
"""
        self.assertEqualRender(src_markdown, expected_rst)

    def test_ordered_list(self):
        src_markdown = """
This is a ordered list:
1. One
2. Two
3. Three
"""
        expected_rst = """
This is a ordered list:
#. One
#. Two
#. Three
"""
        self.assertEqualRender(src_markdown, expected_rst)

    def test_block_quote(self):
        src_markdown = """
Before the blockquote:
> The blockquote
After the blockquote
"""
        expected_rst = """
Before the blockquote:
    The blockquote
After the blockquote
"""
        self.assertEqualRender(src_markdown, expected_rst)

    def test_heading(self):
        # Levels 1-6 map onto the renderer's heading_chars banner set.
        src_markdown = '''
# Heading 1
## Heading 2
### Heading 3
#### Heading 4
##### Heading 5
###### Heading 6
'''
        expected_rst = '''
Heading 1
#########
Heading 2
*********
Heading 3
=========
Heading 4
---------
Heading 5
^^^^^^^^^
Heading 6
"""""""""
'''
        self.assertEqualRender(src_markdown, expected_rst)

    def test_multiple_paragraphs(self):
        src_markdown = '''
Start of first paragraph that
continues on a new line
This is the second paragraph
'''
        expected_rst = '''
Start of first paragraph that
continues on a new line
This is the second paragraph
'''
        self.assertEqualRender(src_markdown, expected_rst)
# Allow running this test module directly with `python <module>.py`.
if __name__ == '__main__':
    unittest.main()

View file

@ -0,0 +1,242 @@
#!/usr/bin/env python
# coding: utf-8
from __future__ import division, print_function, unicode_literals
import re
import timeit
import codecs
import argparse
import sys
from builtins import str
from commonmark.render.html import HtmlRenderer
from commonmark.main import Parser, dumpAST
class colors(object):
    # ANSI terminal escape sequences used to colorize test output.
    HEADER = '\033[95m'   # bright magenta: section headers
    OKBLUE = '\033[94m'   # blue: prompts / passed-test dividers
    OKGREEN = '\033[92m'  # green: pass marks
    WARNING = '\033[93m'  # yellow: failed-test dividers
    FAIL = '\033[91m'     # red: fail marks
    ENDC = '\033[0m'      # reset all attributes
def trace_calls(frame, event, arg):
    """sys.settrace hook: print caller/callee info for every call into
    CommonMark.py, skipping dunder methods, write() and dumpAST()."""
    code = frame.f_code
    name = code.co_name
    if name == "write":
        # Returning None stops tracing inside write() frames.
        return
    lineno = frame.f_lineno
    fname = code.co_filename
    if (event == "call" and not re.match("__", name) and
            re.search("CommonMark.py", fname) and
            name != "dumpAST"):
        caller = frame.f_back
        print("-> " + caller.f_code.co_name +
              " at " + str(caller.f_lineno) +
              " called " + name + " at " + str(lineno) +
              " in " + fname)
    # Keep tracing nested calls with this same hook.
    return trace_calls
def main():
    """Run the CommonMark specification tests from spec.txt.

    Flags: -t select tests, -p/-f/-np control verbosity, -i interactive
    rendering mode, -d call tracing, -s per-category statistics.
    Exits with status 1 if any test fails.
    """
    parser = argparse.ArgumentParser(
        description="script to run the CommonMark specification tests " +
        "against the CommonMark.py parser")
    parser.add_argument(
        '-t',
        help="Single test to run or comma separated list " +
        "of tests (-t 10 or -t 10,11,12,13)")
    parser.add_argument(
        '-p',
        action="store_true",
        help="Print passed test information")
    parser.add_argument(
        '-f',
        action="store_true",
        help="Print failed tests (during -np...)")
    parser.add_argument(
        '-i',
        action="store_true",
        help="Interactive Markdown input mode")
    parser.add_argument(
        '-d',
        action="store_true",
        help="Debug, trace calls")
    parser.add_argument(
        '-np',
        action="store_true",
        help="Only print section header, tick, or cross")
    parser.add_argument(
        '-s',
        action="store_true",
        help="Print percent of tests passed by category")
    args = parser.parse_args()

    if args.d:
        sys.settrace(trace_calls)

    renderer = HtmlRenderer()
    parser = Parser()

    # Read the whole spec file into one string.
    f = codecs.open("spec.txt", encoding="utf-8")
    datalist = []
    for line in f:
        datalist.append(line)
    data = "".join(datalist)

    passed = 0
    failed = 0
    catStats = {}
    examples = []
    example_number = 0
    current_section = ""

    # Visible stand-ins used when printing whitespace in diffs.
    tabChar = '\u2192'
    spaceChar = '\u2423'
    nbspChar = '\u00A0'

    def showSpaces(t):
        # Make tabs, spaces and NBSPs visible in expected/actual output.
        t = re.sub("\\t", tabChar, t)
        t = re.sub(" ", spaceChar, t)
        t = re.sub(nbspChar, spaceChar, t)
        return t

    # Normalize line endings and drop everything past the END TESTS marker.
    t = re.sub("\r\n", "\n", data)
    tests = re.sub(
        re.compile("^<!-- END TESTS -->(.|[\n])*", flags=re.M), '', t)
    # Each match is either a 32-backtick example (markdown, html) or a
    # section heading (third group).
    testMatch = re.findall(
        re.compile(
            r'^`{32} example\n'
            r'([\s\S]*?)^\.\n([\s\S]*?)'
            r'^`{32}$'
            r'|^#{1,6} *(.*)$',
            re.M),
        tests)

    for match in testMatch:
        if not match[2] == "":
            current_section = match[2]
        else:
            example_number += 1
            examples.append({
                'markdown': match[0],
                'html': match[1],
                'section': current_section,
                'number': example_number})

    current_section = ""

    startTime = timeit.default_timer()

    if args.i:
        # Interactive mode: read Markdown until 'end', render, repeat.
        print(
            colors.OKGREEN +
            "(To end input of Markdown block enter 'end' on " +
            "it's own line, to quit enter 'quit')" +
            colors.ENDC)
        while True:
            s = ""
            while True:
                if sys.version_info >= (3, 0):
                    inp = input(colors.OKBLUE + 'Markdown: ' + colors.ENDC)
                else:
                    inp = raw_input(colors.OKBLUE + 'Markdown: ' + colors.ENDC)  # noqa
                if not inp == "end" and inp != "quit":
                    s += inp + "\n"
                elif inp == "end":
                    s = s[:-1]
                    break
                elif inp == "quit":
                    print(colors.HEADER+"bye!"+colors.ENDC)
                    exit(0)
            ast = parser.parse(s)
            html = renderer.render(ast)
            print(colors.WARNING+"="*10+"AST====="+colors.ENDC)
            dumpAST(ast)
            print(colors.WARNING+"="*10+"HTML===="+colors.ENDC)
            print(html)

    # some tests?
    if args.t:
        tests = args.t.split(",")
        choice_examples = []
        for t in tests:
            if not t == "" and len(examples) > int(t):
                choice_examples.append(examples[int(t)-1])
        examples = choice_examples

    # all tests
    for i, example in enumerate(examples):  # [0,examples[0]]
        if not example['section'] == "" and \
                not current_section == example['section']:
            print('\n' + colors.HEADER + '[' + example['section'] + ']' +
                  colors.ENDC + ' ', end='')
            current_section = example['section']
            # [passed, failed, total] per section.
            catStats.update({current_section: [0, 0, 0]})

        catStats[current_section][2] += 1
        if args.d:
            print(colors.HEADER+"[Parsing]"+colors.ENDC)
        ast = parser.parse(re.sub(tabChar, "\t", example['markdown']))
        if args.d:
            print(colors.HEADER+"[Rendering]"+colors.ENDC)
        actual = renderer.render(ast)
        if re.sub('\t', tabChar, actual) == example['html']:
            passed += 1
            catStats[current_section][0] += 1
            if not args.f:
                print(colors.OKGREEN + '' + colors.ENDC, end='')
            if args.d:
                dumpAST(ast)
            if args.p or args.d and not args.np:
                print(
                    colors.OKBLUE +
                    "=== markdown ===============\n" +
                    colors.ENDC + showSpaces(example['markdown']) +
                    colors.OKBLUE +
                    "\n=== expected ===============\n" +
                    colors.ENDC + showSpaces(example['html']) +
                    colors.OKBLUE +
                    "\n=== got ====================\n" +
                    colors.ENDC + showSpaces(actual))
        else:
            failed += 1
            catStats[current_section][1] += 1
            if args.t:
                print("Test #" + str(args.t.split(",")[i]), end='')
            else:
                print("Test #" + str(i+1), end='')
            print(' ' + colors.FAIL + "" + colors.ENDC)
            if args.d:
                dumpAST(ast)
            if not args.np or args.f:
                print(
                    colors.WARNING +
                    "=== markdown ===============\n" +
                    colors.ENDC + showSpaces(example['markdown']) +
                    colors.WARNING +
                    "\n=== expected ===============\n" +
                    colors.ENDC + showSpaces(example['html']) +
                    colors.WARNING +
                    "\n=== got ====================\n" +
                    colors.ENDC + showSpaces(actual))

    print('\n' + str(passed) + ' tests passed, ' + str(failed) + ' failed')

    endTime = timeit.default_timer()
    runTime = endTime - startTime

    if args.s:
        # Per-category pass percentage.
        for i in catStats.keys():
            per = catStats[i][0]/catStats[i][2]
            print(colors.HEADER + "[" + i + "]" + colors.ENDC +
                  "\t" + str(per*100) + "% Passed")

    print("runtime: " + str(runTime) + "s")

    if (failed > 0):
        sys.exit(1)
# Script entry point.
if __name__ == '__main__':
    main()

View file

@ -0,0 +1,157 @@
from __future__ import unicode_literals
import unittest
try:
    from hypothesis import given, example
except ImportError:
    # Mock out hypothesis stuff for python 2.6
    # NOTE(review): this stub decorator discards the decorated function,
    # so hypothesis-based tests silently become no-ops when hypothesis
    # is not installed.
    def given(a):
        def func(b):
            return
        return func
    example = given
try:
    from hypothesis.strategies import text
except ImportError:
    def text():
        # Stub strategy; only ever passed to the stub `given` above.
        pass
import commonmark
from commonmark.blocks import Parser
from commonmark.render.html import HtmlRenderer
from commonmark.inlines import InlineParser
from commonmark.node import NodeWalker, Node
class TestCommonmark(unittest.TestCase):
    """Unit and regression tests for the core commonmark API."""

    def test_output(self):
        s = commonmark.commonmark('*hello!*')
        self.assertEqual(s, '<p><em>hello!</em></p>\n')

    def test_unicode(self):
        # Non-ASCII text must survive HTML blocks, lists, headings and
        # fenced code.
        s = commonmark.commonmark('<div>\u2020</div>\n')
        self.assertEqual(s, '<div>\u2020</div>\n',
                         'Unicode works in an HTML block.')
        commonmark.commonmark('* unicode: \u2020')
        commonmark.commonmark('# unicode: \u2020')
        commonmark.commonmark('```\n# unicode: \u2020\n```')

    def test_null_string_bug(self):
        # Regression test: blockquote containing an empty line.
        s = commonmark.commonmark('> sometext\n>\n\n')
        self.assertEqual(
            s,
            '<blockquote>\n<pre><code>sometext\n</code></pre>'
            '\n</blockquote>\n')

    def test_normalize_contracts_text_nodes(self):
        md = '_a'
        ast = Parser().parse(md)

        def assert_text_literals(text_literals):
            # Walk document -> paragraph -> given text nodes -> paragraph.
            walker = ast.walker()
            document, _ = walker.next()
            self.assertEqual(document.t, 'document')
            paragraph, _ = walker.next()
            self.assertEqual(paragraph.t, 'paragraph')
            for literal in text_literals:
                text, _ = walker.next()
                self.assertEqual(text.t, 'text')
                self.assertEqual(text.literal, literal)
            paragraph, _ = walker.next()
            self.assertEqual(paragraph.t, 'paragraph')

        assert_text_literals(['_', 'a'])
        ast.normalize()
        # assert text nodes are contracted
        assert_text_literals(['_a'])
        ast.normalize()
        # assert normalize() doesn't alter a normalized ast
        assert_text_literals(['_a'])

    def test_dumpAST_orderedlist(self):
        # dumpAST must not crash on an ordered-list node.
        md = '1.'
        ast = Parser().parse(md)
        commonmark.dumpAST(ast)

    @given(text())
    def test_random_text(self, s):
        # Fuzz: any text input must render without raising.
        commonmark.commonmark(s)

    def test_smart_dashes(self):
        md = 'a - b -- c --- d ---- e ----- f'
        EM = '\u2014'
        EN = '\u2013'
        expected_html = (
            '<p>'
            + 'a - '
            + 'b ' + EN + ' '
            + 'c ' + EM + ' '
            + 'd ' + EN + EN + ' '
            + 'e ' + EM + EN + ' '
            + 'f</p>\n')
        parser = commonmark.Parser(options=dict(smart=True))
        ast = parser.parse(md)
        renderer = commonmark.HtmlRenderer()
        html = renderer.render(ast)
        self.assertEqual(html, expected_html)

    def test_regex_vulnerability_link_label(self):
        # Guards against catastrophic backtracking on pathological labels.
        i = 200
        while i <= 2000:
            s = commonmark.commonmark('[' + ('\\' * i) + '\n')
            self.assertEqual(s, '<p>' + '[' + ('\\' * (i // 2)) + '</p>\n',
                             '[\\\\... %d deep' % (i,))
            i *= 10

    def test_regex_vulnerability_link_destination(self):
        # Same guard for deeply nested link destinations.
        i = 200
        while i <= 2000:
            s = commonmark.commonmark(('[](' * i) + '\n')
            self.assertEqual(s, '<p>' + ('[](' * i) + '</p>\n',
                             '[]( %d deep' % (i,))
            i *= 10
class TestHtmlRenderer(unittest.TestCase):
    """Smoke test: HtmlRenderer constructs with default options."""

    def test_init(self):
        HtmlRenderer()
class TestInlineParser(unittest.TestCase):
    """Smoke test: InlineParser constructs with defaults."""

    def test_init(self):
        InlineParser()
class TestNode(unittest.TestCase):
    """Smoke test: Node constructs from a type and sourcepos pair."""

    def test_doc_node(self):
        Node('document', [[1, 1], [0, 0]])
class TestNodeWalker(unittest.TestCase):
    """Smoke tests for NodeWalker construction and iteration."""

    def test_node_walker(self):
        node = Node('document', [[1, 1], [0, 0]])
        NodeWalker(node)

    def test_node_walker_iter(self):
        # Iterating a walker over a single node must terminate cleanly.
        node = Node('document', [[1, 1], [0, 0]])
        for subnode, entered in node.walker():
            pass
class TestParser(unittest.TestCase):
    """Fuzz/smoke tests for Parser.parse."""

    def setUp(self):
        self.parser = Parser()

    @given(text())
    @example('')
    @example('* unicode: \u2020')
    def test_text(self, s):
        # Any text (including empty and non-ASCII) must parse cleanly.
        self.parser.parse(s)
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()