This commit is contained in:
Waylon Walker 2022-03-31 20:20:07 -05:00
commit 38355d2442
No known key found for this signature in database
GPG key ID: 66E2BF2B4190EFE4
9083 changed files with 1225834 additions and 0 deletions

View file

@ -0,0 +1,251 @@
# Grammar for 2to3. This grammar supports Python 2.x and 3.x.
# NOTE WELL: You should also follow all the steps listed at
# https://devguide.python.org/grammar/
# Start symbols for the grammar:
# file_input is a module or sequence of commands read from an input file;
# single_input is a single interactive statement;
# eval_input is the input for the eval() and input() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
file_input: (NEWLINE | stmt)* ENDMARKER
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' namedexpr_test NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: ASYNC funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
# The following definition for typedarglist is equivalent to this set of rules:
#
# arguments = argument (',' argument)*
# argument = tfpdef ['=' test]
# kwargs = '**' tname [',']
# args = '*' [tname]
# kwonly_kwargs = (',' argument)* [',' [kwargs]]
# args_kwonly_kwargs = args kwonly_kwargs | kwargs
# poskeyword_args_kwonly_kwargs = arguments [',' [args_kwonly_kwargs]]
# typedargslist_no_posonly = poskeyword_args_kwonly_kwargs | args_kwonly_kwargs
# typedarglist = arguments ',' '/' [',' [typedargslist_no_posonly]])|(typedargslist_no_posonly)"
#
# It needs to be fully expanded to allow our LL(1) parser to work on it.
typedargslist: tfpdef ['=' test] (',' tfpdef ['=' test])* ',' '/' [
',' [((tfpdef ['=' test] ',')* ('*' [tname] (',' tname ['=' test])*
[',' ['**' tname [',']]] | '**' tname [','])
| tfpdef ['=' test] (',' tfpdef ['=' test])* [','])]
] | ((tfpdef ['=' test] ',')* ('*' [tname] (',' tname ['=' test])*
[',' ['**' tname [',']]] | '**' tname [','])
| tfpdef ['=' test] (',' tfpdef ['=' test])* [','])
tname: NAME [':' test]
tfpdef: tname | '(' tfplist ')'
tfplist: tfpdef (',' tfpdef)* [',']
# The following definition for varargslist is equivalent to this set of rules:
#
# arguments = argument (',' argument )*
# argument = vfpdef ['=' test]
# kwargs = '**' vname [',']
# args = '*' [vname]
# kwonly_kwargs = (',' argument )* [',' [kwargs]]
# args_kwonly_kwargs = args kwonly_kwargs | kwargs
# poskeyword_args_kwonly_kwargs = arguments [',' [args_kwonly_kwargs]]
# vararglist_no_posonly = poskeyword_args_kwonly_kwargs | args_kwonly_kwargs
# varargslist = arguments ',' '/' [','[(vararglist_no_posonly)]] | (vararglist_no_posonly)
#
# It needs to be fully expanded to allow our LL(1) parser to work on it.
varargslist: vfpdef ['=' test ](',' vfpdef ['=' test])* ',' '/' [',' [
((vfpdef ['=' test] ',')* ('*' [vname] (',' vname ['=' test])*
[',' ['**' vname [',']]] | '**' vname [','])
| vfpdef ['=' test] (',' vfpdef ['=' test])* [','])
]] | ((vfpdef ['=' test] ',')*
('*' [vname] (',' vname ['=' test])* [',' ['**' vname [',']]]| '**' vname [','])
| vfpdef ['=' test] (',' vfpdef ['=' test])* [','])
vname: NAME
vfpdef: vname | '(' vfplist ')'
vfplist: vfpdef (',' vfpdef)* [',']
stmt: simple_stmt | compound_stmt
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | exec_stmt | assert_stmt)
expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
annassign: ':' test ['=' (yield_expr|testlist_star_expr)]
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal and annotated assignments, additional restrictions enforced by the interpreter
print_stmt: 'print' ( [ test (',' test)* [','] ] |
'>>' test [ (',' test)+ [','] ] )
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist_star_expr]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test | ',' test [',' test]]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
import_from: ('from' ('.'* dotted_name | '.'+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: ('global' | 'nonlocal') NAME (',' NAME)*
exec_stmt: 'exec' expr ['in' test [',' test]]
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt | match_stmt
async_stmt: ASYNC (funcdef | with_stmt | for_stmt)
if_stmt: 'if' namedexpr_test ':' suite ('elif' namedexpr_test ':' suite)* ['else' ':' suite]
while_stmt: 'while' namedexpr_test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist_star_expr ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' asexpr_test (',' asexpr_test)* ':' suite
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test [(',' | 'as') test]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
# Backward compatibility cruft to support:
# [ x for x in lambda: True, lambda: False if x() ]
# even while also allowing:
# lambda x: 5 if x else 2
# (But not a mix of the two)
testlist_safe: old_test [(',' old_test)+ [',']]
old_test: or_test | old_lambdef
old_lambdef: 'lambda' [varargslist] ':' old_test
namedexpr_test: asexpr_test [':=' asexpr_test]
# This is actually not a real rule, though since the parser is very
# limited in terms of the strategy about match/case rules, we are inserting
# a virtual case (<expr> as <expr>) as a valid expression. Unless a better
# approach is thought, the only side effect of this seem to be just allowing
# more stuff to be parser (which would fail on the ast).
asexpr_test: test ['as' test]
test: or_test ['if' or_test 'else' test] | lambdef
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: [AWAIT] atom trailer* ['**' factor]
atom: ('(' [yield_expr|testlist_gexp] ')' |
'[' [listmaker] ']' |
'{' [dictsetmaker] '}' |
'`' testlist1 '`' |
NAME | NUMBER | STRING+ | '.' '.' '.')
listmaker: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] )
testlist_gexp: (namedexpr_test|star_expr) ( old_comp_for | (',' (namedexpr_test|star_expr))* [','] )
lambdef: 'lambda' [varargslist] ':' test
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test [':=' test] | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictsetmaker: ( ((test ':' asexpr_test | '**' expr)
(comp_for | (',' (test ':' asexpr_test | '**' expr))* [','])) |
((test [':=' test] | star_expr)
(comp_for | (',' (test [':=' test] | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test ':=' test |
test 'as' test |
test '=' asexpr_test |
'**' test |
'*' test )
comp_iter: comp_for | comp_if
comp_for: [ASYNC] 'for' exprlist 'in' or_test [comp_iter]
comp_if: 'if' old_test [comp_iter]
# As noted above, testlist_safe extends the syntax allowed in list
# comprehensions and generators. We can't use it indiscriminately in all
# derivations using a comp_for-like pattern because the testlist_safe derivation
# contains comma which clashes with trailing comma in arglist.
#
# This was an issue because the parser would not follow the correct derivation
# when parsing syntactically valid Python code. Since testlist_safe was created
# specifically to handle list comprehensions and generator expressions enclosed
# with parentheses, it's safe to only use it in those. That avoids the issue; we
# can parse code like set(x for x in [],).
#
# The syntax supported by this set of rules is not a valid Python 3 syntax,
# hence the prefix "old".
#
# See https://bugs.python.org/issue27494
old_comp_iter: old_comp_for | old_comp_if
old_comp_for: [ASYNC] 'for' exprlist 'in' testlist_safe [old_comp_iter]
old_comp_if: 'if' old_test [old_comp_iter]
testlist1: test (',' test)*
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist_star_expr
# 3.10 match statement definition
# PS: normally the grammar is much much more restricted, but
# at this moment for not trying to bother much with encoding the
# exact same DSL in a LL(1) parser, we will just accept an expression
# and let the ast.parse() step of the safe mode to reject invalid
# grammar.
# The reason why it is more restricted is that, patterns are some
# sort of a DSL (more advanced than our LHS on assignments, but
# still in a very limited python subset). They are not really
# expressions, but who cares. If we can parse them, that is enough
# to reformat them.
match_stmt: "match" subject_expr ':' NEWLINE INDENT case_block+ DEDENT
# This is more permissive than the actual version. For example it
# accepts `match *something:`, even though single-item starred expressions
# are forbidden.
subject_expr: (namedexpr_test|star_expr) (',' (namedexpr_test|star_expr))* [',']
# cases
case_block: "case" patterns [guard] ':' suite
guard: 'if' namedexpr_test
patterns: pattern (',' pattern)* [',']
pattern: (expr|star_expr) ['as' expr]

View file

@ -0,0 +1,28 @@
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# A grammar to describe tree matching patterns.
# Not shown here:
# - 'TOKEN' stands for any token (leaf node)
# - 'any' stands for any node (leaf or interior)
# With 'any' we can still specify the sub-structure.
# The start symbol is 'Matcher'.
Matcher: Alternatives ENDMARKER
Alternatives: Alternative ('|' Alternative)*
Alternative: (Unit | NegatedUnit)+
Unit: [NAME '='] ( STRING [Repeater]
| NAME [Details] [Repeater]
| '(' Alternatives ')' [Repeater]
| '[' Alternatives ']'
)
NegatedUnit: 'not' (STRING | NAME [Details] | '(' Alternatives ')')
Repeater: '*' | '+' | '{' NUMBER [',' NUMBER] '}'
Details: '<' Alternatives '>'

View file

@ -0,0 +1 @@
# empty

View file

@ -0,0 +1,4 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""The pgen2 package."""

View file

@ -0,0 +1,256 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# mypy: ignore-errors
"""Convert graminit.[ch] spit out by pgen to Python code.
Pgen is the Python parser generator. It is useful to quickly create a
parser from a grammar file in Python's grammar notation. But I don't
want my parsers to be written in C (yet), so I'm translating the
parsing tables to Python data structures and writing a Python parse
engine.
Note that the token numbers are constants determined by the standard
Python tokenizer. The standard token module defines these numbers and
their names (the names are not used much). The token numbers are
hardcoded into the Python tokenizer and into pgen. A Python
implementation of the Python tokenizer is also available, in the
standard tokenize module.
On the other hand, symbol numbers (representing the grammar's
non-terminals) are assigned by pgen based on the actual grammar
input.
Note: this module is pretty much obsolete; the pgen module generates
equivalent grammar tables directly from the Grammar.txt input file
without having to invoke the Python pgen C program.
"""
# Python imports
import re
# Local imports
from pgen2 import grammar, token
class Converter(grammar.Grammar):
"""Grammar subclass that reads classic pgen output files.
The run() method reads the tables as produced by the pgen parser
generator, typically contained in two C files, graminit.h and
graminit.c. The other methods are for internal use only.
See the base class for more documentation.
"""
def run(self, graminit_h, graminit_c):
"""Load the grammar tables from the text files written by pgen."""
self.parse_graminit_h(graminit_h)
self.parse_graminit_c(graminit_c)
self.finish_off()
def parse_graminit_h(self, filename):
"""Parse the .h file written by pgen. (Internal)
This file is a sequence of #define statements defining the
nonterminals of the grammar as numbers. We build two tables
mapping the numbers to names and back.
"""
try:
f = open(filename)
except OSError as err:
print("Can't open %s: %s" % (filename, err))
return False
self.symbol2number = {}
self.number2symbol = {}
lineno = 0
for line in f:
lineno += 1
mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
if not mo and line.strip():
print("%s(%s): can't parse %s" % (filename, lineno, line.strip()))
else:
symbol, number = mo.groups()
number = int(number)
assert symbol not in self.symbol2number
assert number not in self.number2symbol
self.symbol2number[symbol] = number
self.number2symbol[number] = symbol
return True
def parse_graminit_c(self, filename):
"""Parse the .c file written by pgen. (Internal)
The file looks as follows. The first two lines are always this:
#include "pgenheaders.h"
#include "grammar.h"
After that come four blocks:
1) one or more state definitions
2) a table defining dfas
3) a table defining labels
4) a struct defining the grammar
A state definition has the following form:
- one or more arc arrays, each of the form:
static arc arcs_<n>_<m>[<k>] = {
{<i>, <j>},
...
};
- followed by a state array, of the form:
static state states_<s>[<t>] = {
{<k>, arcs_<n>_<m>},
...
};
"""
try:
f = open(filename)
except OSError as err:
print("Can't open %s: %s" % (filename, err))
return False
# The code below essentially uses f's iterator-ness!
lineno = 0
# Expect the two #include lines
lineno, line = lineno + 1, next(f)
assert line == '#include "pgenheaders.h"\n', (lineno, line)
lineno, line = lineno + 1, next(f)
assert line == '#include "grammar.h"\n', (lineno, line)
# Parse the state definitions
lineno, line = lineno + 1, next(f)
allarcs = {}
states = []
while line.startswith("static arc "):
while line.startswith("static arc "):
mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", line)
assert mo, (lineno, line)
n, m, k = list(map(int, mo.groups()))
arcs = []
for _ in range(k):
lineno, line = lineno + 1, next(f)
mo = re.match(r"\s+{(\d+), (\d+)},$", line)
assert mo, (lineno, line)
i, j = list(map(int, mo.groups()))
arcs.append((i, j))
lineno, line = lineno + 1, next(f)
assert line == "};\n", (lineno, line)
allarcs[(n, m)] = arcs
lineno, line = lineno + 1, next(f)
mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
assert mo, (lineno, line)
s, t = list(map(int, mo.groups()))
assert s == len(states), (lineno, line)
state = []
for _ in range(t):
lineno, line = lineno + 1, next(f)
mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
assert mo, (lineno, line)
k, n, m = list(map(int, mo.groups()))
arcs = allarcs[n, m]
assert k == len(arcs), (lineno, line)
state.append(arcs)
states.append(state)
lineno, line = lineno + 1, next(f)
assert line == "};\n", (lineno, line)
lineno, line = lineno + 1, next(f)
self.states = states
# Parse the dfas
dfas = {}
mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
assert mo, (lineno, line)
ndfas = int(mo.group(1))
for i in range(ndfas):
lineno, line = lineno + 1, next(f)
mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', line)
assert mo, (lineno, line)
symbol = mo.group(2)
number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
assert self.symbol2number[symbol] == number, (lineno, line)
assert self.number2symbol[number] == symbol, (lineno, line)
assert x == 0, (lineno, line)
state = states[z]
assert y == len(state), (lineno, line)
lineno, line = lineno + 1, next(f)
mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
assert mo, (lineno, line)
first = {}
rawbitset = eval(mo.group(1))
for i, c in enumerate(rawbitset):
byte = ord(c)
for j in range(8):
if byte & (1 << j):
first[i * 8 + j] = 1
dfas[number] = (state, first)
lineno, line = lineno + 1, next(f)
assert line == "};\n", (lineno, line)
self.dfas = dfas
# Parse the labels
labels = []
lineno, line = lineno + 1, next(f)
mo = re.match(r"static label labels\[(\d+)\] = {$", line)
assert mo, (lineno, line)
nlabels = int(mo.group(1))
for i in range(nlabels):
lineno, line = lineno + 1, next(f)
mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
assert mo, (lineno, line)
x, y = mo.groups()
x = int(x)
if y == "0":
y = None
else:
y = eval(y)
labels.append((x, y))
lineno, line = lineno + 1, next(f)
assert line == "};\n", (lineno, line)
self.labels = labels
# Parse the grammar struct
lineno, line = lineno + 1, next(f)
assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
lineno, line = lineno + 1, next(f)
mo = re.match(r"\s+(\d+),$", line)
assert mo, (lineno, line)
ndfas = int(mo.group(1))
assert ndfas == len(self.dfas)
lineno, line = lineno + 1, next(f)
assert line == "\tdfas,\n", (lineno, line)
lineno, line = lineno + 1, next(f)
mo = re.match(r"\s+{(\d+), labels},$", line)
assert mo, (lineno, line)
nlabels = int(mo.group(1))
assert nlabels == len(self.labels), (lineno, line)
lineno, line = lineno + 1, next(f)
mo = re.match(r"\s+(\d+)$", line)
assert mo, (lineno, line)
start = int(mo.group(1))
assert start in self.number2symbol, (lineno, line)
self.start = start
lineno, line = lineno + 1, next(f)
assert line == "};\n", (lineno, line)
try:
lineno, line = lineno + 1, next(f)
except StopIteration:
pass
else:
assert 0, (lineno, line)
def finish_off(self):
"""Create additional useful structures. (Internal)."""
self.keywords = {} # map from keyword strings to arc labels
self.tokens = {} # map from numeric token values to arc labels
for ilabel, (type, value) in enumerate(self.labels):
if type == token.NAME and value is not None:
self.keywords[value] = ilabel
elif value is None:
self.tokens[type] = ilabel

View file

@ -0,0 +1,327 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Parser driver.
This provides a high-level interface to parse a file into a syntax tree.
"""
__author__ = "Guido van Rossum <guido@python.org>"
__all__ = ["Driver", "load_grammar"]
# Python imports
import io
import os
import logging
import pkgutil
import sys
from typing import (
Any,
cast,
IO,
Iterable,
List,
Optional,
Text,
Iterator,
Tuple,
TypeVar,
Generic,
Union,
)
from contextlib import contextmanager
from dataclasses import dataclass, field
# Pgen imports
from . import grammar, parse, token, tokenize, pgen
from logging import Logger
from blib2to3.pytree import NL
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.tokenize import GoodTokenInfo
Path = Union[str, "os.PathLike[str]"]
@dataclass
class ReleaseRange:
start: int
end: Optional[int] = None
tokens: List[Any] = field(default_factory=list)
def lock(self) -> None:
total_eaten = len(self.tokens)
self.end = self.start + total_eaten
class TokenProxy:
def __init__(self, generator: Any) -> None:
self._tokens = generator
self._counter = 0
self._release_ranges: List[ReleaseRange] = []
@contextmanager
def release(self) -> Iterator["TokenProxy"]:
release_range = ReleaseRange(self._counter)
self._release_ranges.append(release_range)
try:
yield self
finally:
# Lock the last release range to the final position that
# has been eaten.
release_range.lock()
def eat(self, point: int) -> Any:
eaten_tokens = self._release_ranges[-1].tokens
if point < len(eaten_tokens):
return eaten_tokens[point]
else:
while point >= len(eaten_tokens):
token = next(self._tokens)
eaten_tokens.append(token)
return token
def __iter__(self) -> "TokenProxy":
return self
def __next__(self) -> Any:
# If the current position is already compromised (looked up)
# return the eaten token, if not just go further on the given
# token producer.
for release_range in self._release_ranges:
assert release_range.end is not None
start, end = release_range.start, release_range.end
if start <= self._counter < end:
token = release_range.tokens[self._counter - start]
break
else:
token = next(self._tokens)
self._counter += 1
return token
def can_advance(self, to: int) -> bool:
# Try to eat, fail if it can't. The eat operation is cached
# so there wont be any additional cost of eating here
try:
self.eat(to)
except StopIteration:
return False
else:
return True
class Driver(object):
def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None:
self.grammar = grammar
if logger is None:
logger = logging.getLogger(__name__)
self.logger = logger
def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL:
"""Parse a series of tokens and return the syntax tree."""
# XXX Move the prefix computation into a wrapper around tokenize.
proxy = TokenProxy(tokens)
p = parse.Parser(self.grammar)
p.setup(proxy=proxy)
lineno = 1
column = 0
indent_columns: List[int] = []
type = value = start = end = line_text = None
prefix = ""
for quintuple in proxy:
type, value, start, end, line_text = quintuple
if start != (lineno, column):
assert (lineno, column) <= start, ((lineno, column), start)
s_lineno, s_column = start
if lineno < s_lineno:
prefix += "\n" * (s_lineno - lineno)
lineno = s_lineno
column = 0
if column < s_column:
prefix += line_text[column:s_column]
column = s_column
if type in (tokenize.COMMENT, tokenize.NL):
prefix += value
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
continue
if type == token.OP:
type = grammar.opmap[value]
if debug:
assert type is not None
self.logger.debug(
"%s %r (prefix=%r)", token.tok_name[type], value, prefix
)
if type == token.INDENT:
indent_columns.append(len(value))
_prefix = prefix + value
prefix = ""
value = ""
elif type == token.DEDENT:
_indent_col = indent_columns.pop()
prefix, _prefix = self._partially_consume_prefix(prefix, _indent_col)
if p.addtoken(cast(int, type), value, (prefix, start)):
if debug:
self.logger.debug("Stop.")
break
prefix = ""
if type in {token.INDENT, token.DEDENT}:
prefix = _prefix
lineno, column = end
if value.endswith("\n"):
lineno += 1
column = 0
else:
# We never broke out -- EOF is too soon (how can this happen???)
assert start is not None
raise parse.ParseError("incomplete input", type, value, (prefix, start))
assert p.rootnode is not None
return p.rootnode
def parse_stream_raw(self, stream: IO[Text], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
return self.parse_tokens(tokens, debug)
def parse_stream(self, stream: IO[Text], debug: bool = False) -> NL:
"""Parse a stream and return the syntax tree."""
return self.parse_stream_raw(stream, debug)
def parse_file(
self, filename: Path, encoding: Optional[Text] = None, debug: bool = False
) -> NL:
"""Parse a file and return the syntax tree."""
with io.open(filename, "r", encoding=encoding) as stream:
return self.parse_stream(stream, debug)
def parse_string(self, text: Text, debug: bool = False) -> NL:
"""Parse a string and return the syntax tree."""
tokens = tokenize.generate_tokens(
io.StringIO(text).readline, grammar=self.grammar
)
return self.parse_tokens(tokens, debug)
def _partially_consume_prefix(self, prefix: Text, column: int) -> Tuple[Text, Text]:
lines: List[str] = []
current_line = ""
current_column = 0
wait_for_nl = False
for char in prefix:
current_line += char
if wait_for_nl:
if char == "\n":
if current_line.strip() and current_column < column:
res = "".join(lines)
return res, prefix[len(res) :]
lines.append(current_line)
current_line = ""
current_column = 0
wait_for_nl = False
elif char in " \t":
current_column += 1
elif char == "\n":
# unexpected empty line
current_column = 0
else:
# indent is finished
wait_for_nl = True
return "".join(lines), current_line
def _generate_pickle_name(gt: Path, cache_dir: Optional[Path] = None) -> Text:
head, tail = os.path.splitext(gt)
if tail == ".txt":
tail = ""
name = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
if cache_dir:
return os.path.join(cache_dir, os.path.basename(name))
else:
return name
def load_grammar(
gt: Text = "Grammar.txt",
gp: Optional[Text] = None,
save: bool = True,
force: bool = False,
logger: Optional[Logger] = None,
) -> Grammar:
"""Load the grammar (maybe from a pickle)."""
if logger is None:
logger = logging.getLogger(__name__)
gp = _generate_pickle_name(gt) if gp is None else gp
if force or not _newer(gp, gt):
logger.info("Generating grammar tables from %s", gt)
g: grammar.Grammar = pgen.generate_grammar(gt)
if save:
logger.info("Writing grammar tables to %s", gp)
try:
g.dump(gp)
except OSError as e:
logger.info("Writing failed: %s", e)
else:
g = grammar.Grammar()
g.load(gp)
return g
def _newer(a: Text, b: Text) -> bool:
"""Inquire whether file a was written since file b."""
if not os.path.exists(a):
return False
if not os.path.exists(b):
return True
return os.path.getmtime(a) >= os.path.getmtime(b)
def load_packaged_grammar(
package: str, grammar_source: Text, cache_dir: Optional[Path] = None
) -> grammar.Grammar:
"""Normally, loads a pickled grammar by doing
pkgutil.get_data(package, pickled_grammar)
where *pickled_grammar* is computed from *grammar_source* by adding the
Python version and using a ``.pickle`` extension.
However, if *grammar_source* is an extant file, load_grammar(grammar_source)
is called instead. This facilitates using a packaged grammar file when needed
but preserves load_grammar's automatic regeneration behavior when possible.
"""
if os.path.isfile(grammar_source):
gp = _generate_pickle_name(grammar_source, cache_dir) if cache_dir else None
return load_grammar(grammar_source, gp=gp)
pickled_name = _generate_pickle_name(os.path.basename(grammar_source), cache_dir)
data = pkgutil.get_data(package, pickled_name)
assert data is not None
g = grammar.Grammar()
g.loads(data)
return g
def main(*args: Text) -> bool:
"""Main program, when run as a script: produce grammar pickle files.
Calls load_grammar for each argument, a path to a grammar text file.
"""
if not args:
args = tuple(sys.argv[1:])
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
for gt in args:
load_grammar(gt, save=True, force=True)
return True
if __name__ == "__main__":
sys.exit(int(not main()))

View file

@ -0,0 +1,227 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""This module defines the data structures used to represent a grammar.
These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.
There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.
"""
# Python imports
import os
import pickle
import tempfile
from typing import Any, Dict, List, Optional, Text, Tuple, TypeVar, Union
# Local imports
from . import token
_P = TypeVar("_P", bound="Grammar")
Label = Tuple[int, Optional[Text]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]
Path = Union[str, "os.PathLike[str]"]
class Grammar(object):
"""Pgen parsing tables conversion class.
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parse.py. The parsing engine
accesses the instance variables directly. The class here does not
provide initialization of the tables; several subclasses exist to
do this (see the conv and pgen modules).
The load() method reads the tables from a pickle file, which is
much faster than the other ways offered by subclasses. The pickle
file is written by calling dump() (after loading the grammar
tables using a subclass). The report() method prints a readable
representation of the tables to stdout, for debugging.
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
states, each state is a list of arcs, and each
arc is a (i, j) pair where i is a label and j is
a state number. The DFA number is the index into
this list. (This name is slightly confusing.)
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
keywords -- a dict mapping keyword strings to arc labels.
tokens -- a dict mapping token numbers to arc labels.
"""
def __init__(self) -> None:
self.symbol2number: Dict[str, int] = {}
self.number2symbol: Dict[int, str] = {}
self.states: List[DFA] = []
self.dfas: Dict[int, DFAS] = {}
self.labels: List[Label] = [(0, "EMPTY")]
self.keywords: Dict[str, int] = {}
self.soft_keywords: Dict[str, int] = {}
self.tokens: Dict[int, int] = {}
self.symbol2label: Dict[str, int] = {}
self.version: Tuple[int, int] = (0, 0)
self.start = 256
# Python 3.7+ parses async as a keyword, not an identifier
self.async_keywords = False
def dump(self, filename: Path) -> None:
"""Dump the grammar tables to a pickle file."""
# mypyc generates objects that don't have a __dict__, but they
# do have __getstate__ methods that will return an equivalent
# dictionary
if hasattr(self, "__dict__"):
d = self.__dict__
else:
d = self.__getstate__() # type: ignore
with tempfile.NamedTemporaryFile(
dir=os.path.dirname(filename), delete=False
) as f:
pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
os.replace(f.name, filename)
def _update(self, attrs: Dict[str, Any]) -> None:
for k, v in attrs.items():
setattr(self, k, v)
def load(self, filename: Path) -> None:
"""Load the grammar tables from a pickle file."""
with open(filename, "rb") as f:
d = pickle.load(f)
self._update(d)
def loads(self, pkl: bytes) -> None:
"""Load the grammar tables from a pickle bytes object."""
self._update(pickle.loads(pkl))
def copy(self: _P) -> _P:
"""
Copy the grammar.
"""
new = self.__class__()
for dict_attr in (
"symbol2number",
"number2symbol",
"dfas",
"keywords",
"soft_keywords",
"tokens",
"symbol2label",
):
setattr(new, dict_attr, getattr(self, dict_attr).copy())
new.labels = self.labels[:]
new.states = self.states[:]
new.start = self.start
new.version = self.version
new.async_keywords = self.async_keywords
return new
def report(self) -> None:
"""Dump the grammar tables to standard output, for debugging."""
from pprint import pprint
print("s2n")
pprint(self.symbol2number)
print("n2s")
pprint(self.number2symbol)
print("states")
pprint(self.states)
print("dfas")
pprint(self.dfas)
print("labels")
pprint(self.labels)
print("start", self.start)
# Map from operator to number (since tokenize doesn't do this)
opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
:= COLONEQUAL
"""
opmap = {}
for line in opmap_raw.splitlines():
if line:
op, name = line.split()
opmap[op] = getattr(token, name)

View file

@ -0,0 +1,68 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Safely evaluate Python string literals without using eval()."""
import re
from typing import Dict, Match, Text
simple_escapes: Dict[Text, Text] = {
"a": "\a",
"b": "\b",
"f": "\f",
"n": "\n",
"r": "\r",
"t": "\t",
"v": "\v",
"'": "'",
'"': '"',
"\\": "\\",
}
def escape(m: Match[Text]) -> Text:
all, tail = m.group(0, 1)
assert all.startswith("\\")
esc = simple_escapes.get(tail)
if esc is not None:
return esc
if tail.startswith("x"):
hexes = tail[1:]
if len(hexes) < 2:
raise ValueError("invalid hex string escape ('\\%s')" % tail)
try:
i = int(hexes, 16)
except ValueError:
raise ValueError("invalid hex string escape ('\\%s')" % tail) from None
else:
try:
i = int(tail, 8)
except ValueError:
raise ValueError("invalid octal string escape ('\\%s')" % tail) from None
return chr(i)
def evalString(s: Text) -> Text:
assert s.startswith("'") or s.startswith('"'), repr(s[:1])
q = s[0]
if s[:3] == q * 3:
q = q * 3
assert s.endswith(q), repr(s[-len(q) :])
assert len(s) >= 2 * len(q)
s = s[len(q) : -len(q)]
return re.sub(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3})", escape, s)
def test() -> None:
for i in range(256):
c = chr(i)
s = repr(c)
e = evalString(s)
if e != c:
print(i, c, s, e)
if __name__ == "__main__":
test()

View file

@ -0,0 +1,393 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Parser engine for the grammar tables generated by pgen.
The grammar table must be loaded first.
See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""
import copy
from contextlib import contextmanager
# Local imports
from . import grammar, token, tokenize
from typing import (
cast,
Any,
Optional,
Text,
Union,
Tuple,
Dict,
List,
Iterator,
Callable,
Set,
TYPE_CHECKING,
)
from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import convert, NL, Context, RawNode, Leaf, Node
if TYPE_CHECKING:
from blib2to3.driver import TokenProxy
Results = Dict[Text, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]
def lam_sub(grammar: Grammar, node: RawNode) -> NL:
assert node[3] is not None
return Node(type=node[0], children=node[3], context=node[2])
# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)
def stack_copy(
stack: List[Tuple[DFAS, int, RawNode]]
) -> List[Tuple[DFAS, int, RawNode]]:
"""Nodeless stack copy."""
return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]
class Recorder:
def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None:
self.parser = parser
self._ilabels = ilabels
self.context = context # not really matter
self._dead_ilabels: Set[int] = set()
self._start_point = self.parser.stack
self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}
@property
def ilabels(self) -> Set[int]:
return self._dead_ilabels.symmetric_difference(self._ilabels)
@contextmanager
def switch_to(self, ilabel: int) -> Iterator[None]:
with self.backtrack():
self.parser.stack = self._points[ilabel]
try:
yield
except ParseError:
self._dead_ilabels.add(ilabel)
finally:
self.parser.stack = self._start_point
@contextmanager
def backtrack(self) -> Iterator[None]:
"""
Use the node-level invariant ones for basic parsing operations (push/pop/shift).
These still will operate on the stack; but they won't create any new nodes, or
modify the contents of any other existing nodes.
This saves us a ton of time when we are backtracking, since we
want to restore to the initial state as quick as possible, which
can only be done by having as little mutatations as possible.
"""
is_backtracking = self.parser.is_backtracking
try:
self.parser.is_backtracking = True
yield
finally:
self.parser.is_backtracking = is_backtracking
def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None:
func: Callable[..., Any]
if raw:
func = self.parser._addtoken
else:
func = self.parser.addtoken
for ilabel in self.ilabels:
with self.switch_to(ilabel):
args = [tok_type, tok_val, self.context]
if raw:
args.insert(0, ilabel)
func(*args)
def determine_route(self, value: Text = None, force: bool = False) -> Optional[int]:
alive_ilabels = self.ilabels
if len(alive_ilabels) == 0:
*_, most_successful_ilabel = self._dead_ilabels
raise ParseError("bad input", most_successful_ilabel, value, self.context)
ilabel, *rest = alive_ilabels
if force or not rest:
return ilabel
else:
return None
class ParseError(Exception):
"""Exception to signal the parser is stuck."""
def __init__(
self, msg: Text, type: Optional[int], value: Optional[Text], context: Context
) -> None:
Exception.__init__(
self, "%s: type=%r, value=%r, context=%r" % (msg, type, value, context)
)
self.msg = msg
self.type = type
self.value = value
self.context = context
class Parser(object):
"""Parser engine.
The proper usage sequence is:
p = Parser(grammar, [converter]) # create instance
p.setup([start]) # prepare for parsing
<for each input token>:
if p.addtoken(...): # parse a token; may raise ParseError
break
root = p.rootnode # root of abstract syntax tree
A Parser instance may be reused by calling setup() repeatedly.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See driver.py for how to get input tokens by tokenizing a file or
string.
Parsing is complete when addtoken() returns True; the root of the
abstract syntax tree can then be retrieved from the rootnode
instance variable. When a syntax error occurs, addtoken() raises
the ParseError exception. There is no error recovery; the parser
cannot be used after a syntax error was reported (but it can be
reinitialized by calling setup()).
"""
def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
"""Constructor.
The grammar argument is a grammar.Grammar instance; see the
grammar module for more information.
The parser is not ready yet for parsing; you must call the
setup() method to get it started.
The optional convert argument is a function mapping concrete
syntax tree nodes to abstract syntax tree nodes. If not
given, no conversion is done and the syntax tree produced is
the concrete syntax tree. If given, it must be a function of
two arguments, the first being the grammar (a grammar.Grammar
instance), and the second being the concrete syntax tree node
to be converted. The syntax tree is converted from the bottom
up.
**post-note: the convert argument is ignored since for Black's
usage, convert will always be blib2to3.pytree.convert. Allowing
this to be dynamic hurts mypyc's ability to use early binding.
These docs are left for historical and informational value.
A concrete syntax tree node is a (type, value, context, nodes)
tuple, where type is the node type (a token or symbol number),
value is None for symbols and a string for tokens, context is
None or an opaque value used for error reporting (typically a
(lineno, offset) pair), and nodes is a list of children for
symbols, and None for tokens.
An abstract syntax tree node may be anything; this is entirely
up to the converter function.
"""
self.grammar = grammar
# See note in docstring above. TL;DR this is ignored.
self.convert = convert or lam_sub
self.is_backtracking = False
def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
"""Prepare for parsing.
This *must* be called before starting to parse.
The optional argument is an alternative start symbol; it
defaults to the grammar's start symbol.
You can use a Parser instance to parse any number of programs;
each time you call setup() the parser is reset to an initial
state determined by the (implicit or explicit) start symbol.
"""
if start is None:
start = self.grammar.start
# Each stack entry is a tuple: (dfa, state, node).
# A node is a tuple: (type, value, context, children),
# where children is a list of nodes or None, and context may be None.
newnode: RawNode = (start, None, None, [])
stackentry = (self.grammar.dfas[start], 0, newnode)
self.stack: List[Tuple[DFAS, int, RawNode]] = [stackentry]
self.rootnode: Optional[NL] = None
self.used_names: Set[str] = set()
self.proxy = proxy
def addtoken(self, type: int, value: Text, context: Context) -> bool:
"""Add a token; return True iff this is the end of the program."""
# Map from token to label
ilabels = self.classify(type, value, context)
assert len(ilabels) >= 1
# If we have only one state to advance, we'll directly
# take it as is.
if len(ilabels) == 1:
[ilabel] = ilabels
return self._addtoken(ilabel, type, value, context)
# If there are multiple states which we can advance (only
# happen under soft-keywords), then we will try all of them
# in parallel and as soon as one state can reach further than
# the rest, we'll choose that one. This is a pretty hacky
# and hopefully temporary algorithm.
#
# For a more detailed explanation, check out this post:
# https://tree.science/what-the-backtracking.html
with self.proxy.release() as proxy:
counter, force = 0, False
recorder = Recorder(self, ilabels, context)
recorder.add_token(type, value, raw=True)
next_token_value = value
while recorder.determine_route(next_token_value) is None:
if not proxy.can_advance(counter):
force = True
break
next_token_type, next_token_value, *_ = proxy.eat(counter)
if next_token_type in (tokenize.COMMENT, tokenize.NL):
counter += 1
continue
if next_token_type == tokenize.OP:
next_token_type = grammar.opmap[next_token_value]
recorder.add_token(next_token_type, next_token_value)
counter += 1
ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
assert ilabel is not None
return self._addtoken(ilabel, type, value, context)
def _addtoken(self, ilabel: int, type: int, value: Text, context: Context) -> bool:
# Loop until the token is shifted; may raise exceptions
while True:
dfa, state, node = self.stack[-1]
states, first = dfa
arcs = states[state]
# Look for a state with this label
for i, newstate in arcs:
t = self.grammar.labels[i][0]
if t >= 256:
# See if it's a symbol and if we're in its first set
itsdfa = self.grammar.dfas[t]
itsstates, itsfirst = itsdfa
if ilabel in itsfirst:
# Push a symbol
self.push(t, itsdfa, newstate, context)
break # To continue the outer while loop
elif ilabel == i:
# Look it up in the list of labels
# Shift a token; we're done with it
self.shift(type, value, newstate, context)
# Pop while we are in an accept-only state
state = newstate
while states[state] == [(0, state)]:
self.pop()
if not self.stack:
# Done parsing!
return True
dfa, state, node = self.stack[-1]
states, first = dfa
# Done with this token
return False
else:
if (0, state) in arcs:
# An accepting state, pop it and try something else
self.pop()
if not self.stack:
# Done parsing, but another token is input
raise ParseError("too much input", type, value, context)
else:
# No success finding a transition
raise ParseError("bad input", type, value, context)
def classify(self, type: int, value: Text, context: Context) -> List[int]:
"""Turn a token into a label. (Internal)
Depending on whether the value is a soft-keyword or not,
this function may return multiple labels to choose from."""
if type == token.NAME:
# Keep a listing of all used names
self.used_names.add(value)
# Check for reserved words
if value in self.grammar.keywords:
return [self.grammar.keywords[value]]
elif value in self.grammar.soft_keywords:
assert type in self.grammar.tokens
return [
self.grammar.soft_keywords[value],
self.grammar.tokens[type],
]
ilabel = self.grammar.tokens.get(type)
if ilabel is None:
raise ParseError("bad token", type, value, context)
return [ilabel]
def shift(self, type: int, value: Text, newstate: int, context: Context) -> None:
"""Shift a token. (Internal)"""
if self.is_backtracking:
dfa, state, _ = self.stack[-1]
self.stack[-1] = (dfa, newstate, DUMMY_NODE)
else:
dfa, state, node = self.stack[-1]
rawnode: RawNode = (type, value, context, None)
newnode = convert(self.grammar, rawnode)
assert node[-1] is not None
node[-1].append(newnode)
self.stack[-1] = (dfa, newstate, node)
def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
"""Push a nonterminal. (Internal)"""
if self.is_backtracking:
dfa, state, _ = self.stack[-1]
self.stack[-1] = (dfa, newstate, DUMMY_NODE)
self.stack.append((newdfa, 0, DUMMY_NODE))
else:
dfa, state, node = self.stack[-1]
newnode: RawNode = (type, None, context, [])
self.stack[-1] = (dfa, newstate, node)
self.stack.append((newdfa, 0, newnode))
def pop(self) -> None:
"""Pop a nonterminal. (Internal)"""
if self.is_backtracking:
self.stack.pop()
else:
popdfa, popstate, popnode = self.stack.pop()
newnode = convert(self.grammar, popnode)
if self.stack:
dfa, state, node = self.stack[-1]
assert node[-1] is not None
node[-1].append(newnode)
else:
self.rootnode = newnode
self.rootnode.used_names = self.used_names

View file

@ -0,0 +1,433 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Pgen imports
from . import grammar, token, tokenize
from typing import (
Any,
Dict,
IO,
Iterator,
List,
Optional,
Text,
Tuple,
Union,
Sequence,
NoReturn,
)
from blib2to3.pgen2 import grammar
from blib2to3.pgen2.tokenize import GoodTokenInfo
import os
Path = Union[str, "os.PathLike[str]"]
class PgenGrammar(grammar.Grammar):
pass
class ParserGenerator(object):
filename: Path
stream: IO[Text]
generator: Iterator[GoodTokenInfo]
first: Dict[Text, Optional[Dict[Text, int]]]
def __init__(self, filename: Path, stream: Optional[IO[Text]] = None) -> None:
close_stream = None
if stream is None:
stream = open(filename)
close_stream = stream.close
self.filename = filename
self.stream = stream
self.generator = tokenize.generate_tokens(stream.readline)
self.gettoken() # Initialize lookahead
self.dfas, self.startsymbol = self.parse()
if close_stream is not None:
close_stream()
self.first = {} # map from symbol name to set of tokens
self.addfirstsets()
def make_grammar(self) -> PgenGrammar:
c = PgenGrammar()
names = list(self.dfas.keys())
names.sort()
names.remove(self.startsymbol)
names.insert(0, self.startsymbol)
for name in names:
i = 256 + len(c.symbol2number)
c.symbol2number[name] = i
c.number2symbol[i] = name
for name in names:
dfa = self.dfas[name]
states = []
for state in dfa:
arcs = []
for label, next in sorted(state.arcs.items()):
arcs.append((self.make_label(c, label), dfa.index(next)))
if state.isfinal:
arcs.append((0, dfa.index(state)))
states.append(arcs)
c.states.append(states)
c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
c.start = c.symbol2number[self.startsymbol]
return c
def make_first(self, c: PgenGrammar, name: Text) -> Dict[int, int]:
rawfirst = self.first[name]
assert rawfirst is not None
first = {}
for label in sorted(rawfirst):
ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... !=
first[ilabel] = 1
return first
def make_label(self, c: PgenGrammar, label: Text) -> int:
# XXX Maybe this should be a method on a subclass of converter?
ilabel = len(c.labels)
if label[0].isalpha():
# Either a symbol name or a named token
if label in c.symbol2number:
# A symbol name (a non-terminal)
if label in c.symbol2label:
return c.symbol2label[label]
else:
c.labels.append((c.symbol2number[label], None))
c.symbol2label[label] = ilabel
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
itoken = getattr(token, label, None)
assert isinstance(itoken, int), label
assert itoken in token.tok_name, label
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
else:
# Either a keyword or an operator
assert label[0] in ('"', "'"), label
value = eval(label)
if value[0].isalpha():
if label[0] == '"':
keywords = c.soft_keywords
else:
keywords = c.keywords
# A keyword
if value in keywords:
return keywords[value]
else:
c.labels.append((token.NAME, value))
keywords[value] = ilabel
return ilabel
else:
# An operator (any non-numeric token)
itoken = grammar.opmap[value] # Fails if unknown token
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
def addfirstsets(self) -> None:
names = list(self.dfas.keys())
names.sort()
for name in names:
if name not in self.first:
self.calcfirst(name)
# print name, self.first[name].keys()
def calcfirst(self, name: Text) -> None:
dfa = self.dfas[name]
self.first[name] = None # dummy to detect left recursion
state = dfa[0]
totalset: Dict[str, int] = {}
overlapcheck = {}
for label, next in state.arcs.items():
if label in self.dfas:
if label in self.first:
fset = self.first[label]
if fset is None:
raise ValueError("recursion for rule %r" % name)
else:
self.calcfirst(label)
fset = self.first[label]
assert fset is not None
totalset.update(fset)
overlapcheck[label] = fset
else:
totalset[label] = 1
overlapcheck[label] = {label: 1}
inverse: Dict[str, str] = {}
for label, itsfirst in overlapcheck.items():
for symbol in itsfirst:
if symbol in inverse:
raise ValueError(
"rule %s is ambiguous; %s is in the first sets of %s as well"
" as %s" % (name, symbol, label, inverse[symbol])
)
inverse[symbol] = label
self.first[name] = totalset
def parse(self) -> Tuple[Dict[Text, List["DFAState"]], Text]:
dfas = {}
startsymbol: Optional[str] = None
# MSTART: (NEWLINE | RULE)* ENDMARKER
while self.type != token.ENDMARKER:
while self.type == token.NEWLINE:
self.gettoken()
# RULE: NAME ':' RHS NEWLINE
name = self.expect(token.NAME)
self.expect(token.OP, ":")
a, z = self.parse_rhs()
self.expect(token.NEWLINE)
# self.dump_nfa(name, a, z)
dfa = self.make_dfa(a, z)
# self.dump_dfa(name, dfa)
oldlen = len(dfa)
self.simplify_dfa(dfa)
newlen = len(dfa)
dfas[name] = dfa
# print name, oldlen, newlen
if startsymbol is None:
startsymbol = name
assert startsymbol is not None
return dfas, startsymbol
def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
# To turn an NFA into a DFA, we define the states of the DFA
# to correspond to *sets* of states of the NFA. Then do some
# state reduction. Let's represent sets as dicts with 1 for
# values.
assert isinstance(start, NFAState)
assert isinstance(finish, NFAState)
def closure(state: NFAState) -> Dict[NFAState, int]:
base: Dict[NFAState, int] = {}
addclosure(state, base)
return base
def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
assert isinstance(state, NFAState)
if state in base:
return
base[state] = 1
for label, next in state.arcs:
if label is None:
addclosure(next, base)
states = [DFAState(closure(start), finish)]
for state in states: # NB states grows while we're iterating
arcs: Dict[str, Dict[NFAState, int]] = {}
for nfastate in state.nfaset:
for label, next in nfastate.arcs:
if label is not None:
addclosure(next, arcs.setdefault(label, {}))
for label, nfaset in sorted(arcs.items()):
for st in states:
if st.nfaset == nfaset:
break
else:
st = DFAState(nfaset, finish)
states.append(st)
state.addarc(st, label)
return states # List of DFAState instances; first one is start
def dump_nfa(self, name: Text, start: "NFAState", finish: "NFAState") -> None:
print("Dump of NFA for", name)
todo = [start]
for i, state in enumerate(todo):
print(" State", i, state is finish and "(final)" or "")
for label, next in state.arcs:
if next in todo:
j = todo.index(next)
else:
j = len(todo)
todo.append(next)
if label is None:
print(" -> %d" % j)
else:
print(" %s -> %d" % (label, j))
def dump_dfa(self, name: Text, dfa: Sequence["DFAState"]) -> None:
print("Dump of DFA for", name)
for i, state in enumerate(dfa):
print(" State", i, state.isfinal and "(final)" or "")
for label, next in sorted(state.arcs.items()):
print(" %s -> %d" % (label, dfa.index(next)))
def simplify_dfa(self, dfa: List["DFAState"]) -> None:
# This is not theoretically optimal, but works well enough.
# Algorithm: repeatedly look for two states that have the same
# set of arcs (same labels pointing to the same nodes) and
# unify them, until things stop changing.
# dfa is a list of DFAState instances
changes = True
while changes:
changes = False
for i, state_i in enumerate(dfa):
for j in range(i + 1, len(dfa)):
state_j = dfa[j]
if state_i == state_j:
# print " unify", i, j
del dfa[j]
for state in dfa:
state.unifystate(state_j, state_i)
changes = True
break
def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
# RHS: ALT ('|' ALT)*
a, z = self.parse_alt()
if self.value != "|":
return a, z
else:
aa = NFAState()
zz = NFAState()
aa.addarc(a)
z.addarc(zz)
while self.value == "|":
self.gettoken()
a, z = self.parse_alt()
aa.addarc(a)
z.addarc(zz)
return aa, zz
def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
# ALT: ITEM+
a, b = self.parse_item()
while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
c, d = self.parse_item()
b.addarc(c)
b = d
return a, b
def parse_item(self) -> Tuple["NFAState", "NFAState"]:
# ITEM: '[' RHS ']' | ATOM ['+' | '*']
if self.value == "[":
self.gettoken()
a, z = self.parse_rhs()
self.expect(token.OP, "]")
a.addarc(z)
return a, z
else:
a, z = self.parse_atom()
value = self.value
if value not in ("+", "*"):
return a, z
self.gettoken()
z.addarc(a)
if value == "+":
return a, z
else:
return a, a
def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
# ATOM: '(' RHS ')' | NAME | STRING
if self.value == "(":
self.gettoken()
a, z = self.parse_rhs()
self.expect(token.OP, ")")
return a, z
elif self.type in (token.NAME, token.STRING):
a = NFAState()
z = NFAState()
a.addarc(z, self.value)
self.gettoken()
return a, z
else:
self.raise_error(
"expected (...) or NAME or STRING, got %s/%s", self.type, self.value
)
assert False
def expect(self, type: int, value: Optional[Any] = None) -> Text:
if self.type != type or (value is not None and self.value != value):
self.raise_error(
"expected %s/%s, got %s/%s", type, value, self.type, self.value
)
value = self.value
self.gettoken()
return value
def gettoken(self) -> None:
tup = next(self.generator)
while tup[0] in (tokenize.COMMENT, tokenize.NL):
tup = next(self.generator)
self.type, self.value, self.begin, self.end, self.line = tup
# print token.tok_name[self.type], repr(self.value)
def raise_error(self, msg: str, *args: Any) -> NoReturn:
if args:
try:
msg = msg % args
except:
msg = " ".join([msg] + list(map(str, args)))
raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))
class NFAState(object):
arcs: List[Tuple[Optional[Text], "NFAState"]]
def __init__(self) -> None:
self.arcs = [] # list of (label, NFAState) pairs
def addarc(self, next: "NFAState", label: Optional[Text] = None) -> None:
assert label is None or isinstance(label, str)
assert isinstance(next, NFAState)
self.arcs.append((label, next))
class DFAState(object):
nfaset: Dict[NFAState, Any]
isfinal: bool
arcs: Dict[Text, "DFAState"]
def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
assert isinstance(nfaset, dict)
assert isinstance(next(iter(nfaset)), NFAState)
assert isinstance(final, NFAState)
self.nfaset = nfaset
self.isfinal = final in nfaset
self.arcs = {} # map from label to DFAState
def addarc(self, next: "DFAState", label: Text) -> None:
assert isinstance(label, str)
assert label not in self.arcs
assert isinstance(next, DFAState)
self.arcs[label] = next
def unifystate(self, old: "DFAState", new: "DFAState") -> None:
for label, next in self.arcs.items():
if next is old:
self.arcs[label] = new
def __eq__(self, other: Any) -> bool:
# Equality test -- ignore the nfaset instance variable
assert isinstance(other, DFAState)
if self.isfinal != other.isfinal:
return False
# Can't just return self.arcs == other.arcs, because that
# would invoke this method recursively, with cycles...
if len(self.arcs) != len(other.arcs):
return False
for label, next in self.arcs.items():
if next is not other.arcs.get(label):
return False
return True
__hash__: Any = None # For Py3 compatibility.
def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
p = ParserGenerator(filename)
return p.make_grammar()

View file

@ -0,0 +1,94 @@
"""Token constants (from "token.h")."""
import sys
from typing import Dict
if sys.version_info < (3, 8):
from typing_extensions import Final
else:
from typing import Final
# Taken from Python (r53757) and modified to include some tokens
# originally monkeypatched in by pgen2.tokenize
# --start constants--
ENDMARKER: Final = 0
NAME: Final = 1
NUMBER: Final = 2
STRING: Final = 3
NEWLINE: Final = 4
INDENT: Final = 5
DEDENT: Final = 6
LPAR: Final = 7
RPAR: Final = 8
LSQB: Final = 9
RSQB: Final = 10
COLON: Final = 11
COMMA: Final = 12
SEMI: Final = 13
PLUS: Final = 14
MINUS: Final = 15
STAR: Final = 16
SLASH: Final = 17
VBAR: Final = 18
AMPER: Final = 19
LESS: Final = 20
GREATER: Final = 21
EQUAL: Final = 22
DOT: Final = 23
PERCENT: Final = 24
BACKQUOTE: Final = 25
LBRACE: Final = 26
RBRACE: Final = 27
EQEQUAL: Final = 28
NOTEQUAL: Final = 29
LESSEQUAL: Final = 30
GREATEREQUAL: Final = 31
TILDE: Final = 32
CIRCUMFLEX: Final = 33
LEFTSHIFT: Final = 34
RIGHTSHIFT: Final = 35
DOUBLESTAR: Final = 36
PLUSEQUAL: Final = 37
MINEQUAL: Final = 38
STAREQUAL: Final = 39
SLASHEQUAL: Final = 40
PERCENTEQUAL: Final = 41
AMPEREQUAL: Final = 42
VBAREQUAL: Final = 43
CIRCUMFLEXEQUAL: Final = 44
LEFTSHIFTEQUAL: Final = 45
RIGHTSHIFTEQUAL: Final = 46
DOUBLESTAREQUAL: Final = 47
DOUBLESLASH: Final = 48
DOUBLESLASHEQUAL: Final = 49
AT: Final = 50
ATEQUAL: Final = 51
OP: Final = 52
COMMENT: Final = 53
NL: Final = 54
RARROW: Final = 55
AWAIT: Final = 56
ASYNC: Final = 57
ERRORTOKEN: Final = 58
COLONEQUAL: Final = 59
N_TOKENS: Final = 60
NT_OFFSET: Final = 256
# --end constants--
tok_name: Final[Dict[int, str]] = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
tok_name[_value] = _name
def ISTERMINAL(x: int) -> bool:
return x < NT_OFFSET
def ISNONTERMINAL(x: int) -> bool:
return x >= NT_OFFSET
def ISEOF(x: int) -> bool:
return x == ENDMARKER

View file

@ -0,0 +1,688 @@
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
# mypy: allow-untyped-defs, allow-untyped-calls
"""Tokenization help for Python programs.
generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:
the token type (see token.py)
the token (a string)
the starting (row, column) indices of the token (a 2-tuple of ints)
the ending (row, column) indices of the token (a 2-tuple of ints)
the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators
Older entry points
tokenize_loop(readline, tokeneater)
tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
import sys
from typing import (
Callable,
Iterable,
Iterator,
List,
Optional,
Text,
Tuple,
Pattern,
Union,
cast,
)
if sys.version_info >= (3, 8):
from typing import Final
else:
from typing_extensions import Final
from blib2to3.pgen2.token import *
from blib2to3.pgen2.grammar import Grammar
__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *
from . import token
__all__ = [x for x in dir(token) if x[0] != "_"] + [
"tokenize",
"generate_tokens",
"untokenize",
]
del token
def group(*choices):
return "(" + "|".join(choices) + ")"
def any(*choices):
return group(*choices) + "*"
def maybe(*choices):
return group(*choices) + "?"
def _combinations(*l):
return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = ( # this is invalid but it's fine because Name comes after Number in all groups
r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
_litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
r"\*\*=?",
r">>=?",
r"<<=?",
r"<>",
r"!=",
r"//=?",
r"->",
r"[+\-*/%&@|^=<>:]=?",
r"~",
)
Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)
# First (or only) line of ' or " string.
ContStr = group(
_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
_litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
_strprefixes = (
_combinations("r", "R", "f", "F")
| _combinations("r", "R", "b", "B")
| {"u", "U", "ur", "uR", "Ur", "UR"}
)
endprogs: Final = {
"'": re.compile(Single),
'"': re.compile(Double),
"'''": single3prog,
'"""': double3prog,
**{f"{prefix}'''": single3prog for prefix in _strprefixes},
**{f'{prefix}"""': double3prog for prefix in _strprefixes},
**{prefix: None for prefix in _strprefixes},
}
triple_quoted: Final = (
{"'''", '"""'}
| {f"{prefix}'''" for prefix in _strprefixes}
| {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
{"'", '"'}
| {f"{prefix}'" for prefix in _strprefixes}
| {f'{prefix}"' for prefix in _strprefixes}
)
tabsize = 8
class TokenError(Exception):
pass
class StopTokenizing(Exception):
pass
def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
(srow, scol) = xxx_todo_changeme
(erow, ecol) = xxx_todo_changeme1
print(
"%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
)
Coord = Tuple[int, int]
TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
"""
The tokenize() function accepts two parameters: one representing the
input stream, and one providing an output mechanism for tokenize().
The first parameter, readline, must be a callable object which provides
the same interface as the readline() method of built-in file objects.
Each call to the function should return one line of input as a string.
The second parameter, tokeneater, must also be a callable object. It is
called once for each token, with five arguments, corresponding to the
tuples generated by generate_tokens().
"""
try:
tokenize_loop(readline, tokeneater)
except StopTokenizing:
pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
for token_info in generate_tokens(readline):
tokeneater(*token_info)
GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
class Untokenizer:
tokens: List[Text]
prev_row: int
prev_col: int
def __init__(self) -> None:
self.tokens = []
self.prev_row = 1
self.prev_col = 0
def add_whitespace(self, start: Coord) -> None:
row, col = start
assert row <= self.prev_row
col_offset = col - self.prev_col
if col_offset:
self.tokens.append(" " * col_offset)
def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
for t in iterable:
if len(t) == 2:
self.compat(cast(Tuple[int, str], t), iterable)
break
tok_type, token, start, end, line = cast(
Tuple[int, Text, Coord, Coord, Text], t
)
self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
if tok_type in (NEWLINE, NL):
self.prev_row += 1
self.prev_col = 0
return "".join(self.tokens)
def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
startline = False
indents = []
toks_append = self.tokens.append
toknum, tokval = token
if toknum in (NAME, NUMBER):
tokval += " "
if toknum in (NEWLINE, NL):
startline = True
for tok in iterable:
toknum, tokval = tok[:2]
if toknum in (NAME, NUMBER, ASYNC, AWAIT):
tokval += " "
if toknum == INDENT:
indents.append(tokval)
continue
elif toknum == DEDENT:
indents.pop()
continue
elif toknum in (NEWLINE, NL):
startline = True
elif startline and indents:
toks_append(indents[-1])
startline = False
toks_append(tokval)
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
def _get_normal_name(orig_enc: str) -> str:
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
if enc == "utf-8" or enc.startswith("utf-8-"):
return "utf-8"
if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
("latin-1-", "iso-8859-1-", "iso-latin-1-")
):
return "iso-8859-1"
return orig_enc
def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
"""
The detect_encoding() function is used to detect the encoding that should
be used to decode a Python source file. It requires one argument, readline,
in the same way as the tokenize() generator.
It will call readline a maximum of twice, and return the encoding used
(as a string) and a list of any lines (left as bytes) it has read
in.
It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present, but
disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
charset, raise a SyntaxError. Note that if a utf-8 bom is found,
'utf-8-sig' is returned.
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
bom_found = False
encoding = None
default = "utf-8"
def read_or_stop() -> bytes:
try:
return readline()
except StopIteration:
return bytes()
def find_cookie(line: bytes) -> Optional[str]:
try:
line_string = line.decode("ascii")
except UnicodeDecodeError:
return None
match = cookie_re.match(line_string)
if not match:
return None
encoding = _get_normal_name(match.group(1))
try:
codec = lookup(encoding)
except LookupError:
# This behaviour mimics the Python interpreter
raise SyntaxError("unknown encoding: " + encoding)
if bom_found:
if codec.name != "utf-8":
# This behaviour mimics the Python interpreter
raise SyntaxError("encoding problem: utf-8")
encoding += "-sig"
return encoding
first = read_or_stop()
if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
default = "utf-8-sig"
if not first:
return default, []
encoding = find_cookie(first)
if encoding:
return encoding, [first]
if not blank_re.match(first):
return default, [first]
second = read_or_stop()
if not second:
return default, [first]
encoding = find_cookie(second)
if encoding:
return encoding, [first, second]
return default, [first, second]
def untokenize(iterable: Iterable[TokenInfo]) -> Text:
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
with at least two elements, a token number and token value. If
only two tokens are passed, the resulting output is poor.
Round-trip invariant for full input:
Untokenized source will match input source exactly
Round-trip invariant for limited input:
# Output text will tokenize the back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
readline = iter(newcode.splitlines(1)).next
t2 = [tok[:2] for tokin generate_tokens(readline)]
assert t1 == t2
"""
ut = Untokenizer()
return ut.untokenize(iterable)
def generate_tokens(
readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
readline() method of built-in file objects. Each call to the function
should return one line of input as a string. Alternately, readline
can be a callable function terminating with StopIteration:
readline = open(myfile).next # Example of alternate readline
The generator produces 5-tuples with these members: the token type; the
token string; a 2-tuple (srow, scol) of ints specifying the row and
column where the token begins in the source; a 2-tuple (erow, ecol) of
ints specifying the row and column where the token ends in the source;
and the line on which the token was found. The line passed is the
logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
numchars: Final = "0123456789"
contstr, needcont = "", 0
contline: Optional[str] = None
indents = [0]
# If we know we're parsing 3.7+, we can unconditionally parse `async` and
# `await` as keywords.
async_keywords = False if grammar is None else grammar.async_keywords
# 'stashed' and 'async_*' are used for async/await parsing
stashed: Optional[GoodTokenInfo] = None
async_def = False
async_def_indent = 0
async_def_nl = False
strstart: Tuple[int, int]
endprog: Pattern[str]
while 1: # loop over lines in stream
try:
line = readline()
except StopIteration:
line = ""
lnum += 1
pos, max = 0, len(line)
if contstr: # continued string
assert contline is not None
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
yield (
STRING,
contstr + line[:end],
strstart,
(lnum, end),
contline + line,
)
contstr, needcont = "", 0
contline = None
elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
yield (
ERRORTOKEN,
contstr + line,
strstart,
(lnum, len(line)),
contline,
)
contstr = ""
contline = None
continue
else:
contstr = contstr + line
contline = contline + line
continue
elif parenlev == 0 and not continued: # new statement
if not line:
break
column = 0
while pos < max: # measure leading whitespace
if line[pos] == " ":
column += 1
elif line[pos] == "\t":
column = (column // tabsize + 1) * tabsize
elif line[pos] == "\f":
column = 0
else:
break
pos += 1
if pos == max:
break
if stashed:
yield stashed
stashed = None
if line[pos] in "\r\n": # skip blank lines
yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
continue
if line[pos] == "#": # skip comments
comment_token = line[pos:].rstrip("\r\n")
nl_pos = pos + len(comment_token)
yield (
COMMENT,
comment_token,
(lnum, pos),
(lnum, nl_pos),
line,
)
yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
continue
if column > indents[-1]: # count indents
indents.append(column)
yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
while column < indents[-1]: # count dedents
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line),
)
indents = indents[:-1]
if async_def and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
if async_def and async_def_nl and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0
while pos < max:
pseudomatch = pseudoprog.match(line, pos)
if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
token, initial = line[start:end], line[start]
if initial in numchars or (
initial == "." and token != "."
): # ordinary number
yield (NUMBER, token, spos, epos, line)
elif initial in "\r\n":
newline = NEWLINE
if parenlev > 0:
newline = NL
elif async_def:
async_def_nl = True
if stashed:
yield stashed
stashed = None
yield (newline, token, spos, epos, line)
elif initial == "#":
assert not token.endswith("\n")
if stashed:
yield stashed
stashed = None
yield (COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
if stashed:
yield stashed
stashed = None
yield (STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break
elif (
initial in single_quoted
or token[:2] in single_quoted
or token[:3] in single_quoted
):
if token[-1] == "\n": # continued string
strstart = (lnum, start)
endprog = (
endprogs[initial]
or endprogs[token[1]]
or endprogs[token[2]]
)
contstr, needcont = line[start:], 1
contline = line
break
else: # ordinary string
if stashed:
yield stashed
stashed = None
yield (STRING, token, spos, epos, line)
elif initial.isidentifier(): # ordinary name
if token in ("async", "await"):
if async_keywords or async_def:
yield (
ASYNC if token == "async" else AWAIT,
token,
spos,
epos,
line,
)
continue
tok = (NAME, token, spos, epos, line)
if token == "async" and not stashed:
stashed = tok
continue
if token in ("def", "for"):
if stashed and stashed[0] == NAME and stashed[1] == "async":
if token == "def":
async_def = True
async_def_indent = indents[-1]
yield (
ASYNC,
stashed[1],
stashed[2],
stashed[3],
stashed[4],
)
stashed = None
if stashed:
yield stashed
stashed = None
yield tok
elif initial == "\\": # continued stmt
# This yield is new; needed for better idempotency:
if stashed:
yield stashed
stashed = None
yield (NL, token, spos, (lnum, pos), line)
continued = 1
else:
if initial in "([{":
parenlev += 1
elif initial in ")]}":
parenlev -= 1
if stashed:
yield stashed
stashed = None
yield (OP, token, spos, epos, line)
else:
yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
pos += 1
if stashed:
yield stashed
stashed = None
for indent in indents[1:]: # pop remaining indent levels
yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
if __name__ == "__main__": # testing
import sys
if len(sys.argv) > 1:
tokenize(open(sys.argv[1]).readline)
else:
tokenize(sys.stdin.readline)

View file

@ -0,0 +1,217 @@
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Export the Python grammar and symbols."""
# Python imports
import os
from typing import Union
# Local imports
from .pgen2 import token
from .pgen2 import driver
from .pgen2.grammar import Grammar
# Moved into initialize because mypyc can't handle __file__ (XXX bug)
# # The grammar file
# _GRAMMAR_FILE = os.path.join(os.path.dirname(__file__), "Grammar.txt")
# _PATTERN_GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
# "PatternGrammar.txt")
class Symbols(object):
def __init__(self, grammar: Grammar) -> None:
"""Initializer.
Creates an attribute for each grammar symbol (nonterminal),
whose value is the symbol's type (an int >= 256).
"""
for name, symbol in grammar.symbol2number.items():
setattr(self, name, symbol)
class _python_symbols(Symbols):
and_expr: int
and_test: int
annassign: int
arglist: int
argument: int
arith_expr: int
asexpr_test: int
assert_stmt: int
async_funcdef: int
async_stmt: int
atom: int
augassign: int
break_stmt: int
case_block: int
classdef: int
comp_for: int
comp_if: int
comp_iter: int
comp_op: int
comparison: int
compound_stmt: int
continue_stmt: int
decorated: int
decorator: int
decorators: int
del_stmt: int
dictsetmaker: int
dotted_as_name: int
dotted_as_names: int
dotted_name: int
encoding_decl: int
eval_input: int
except_clause: int
exec_stmt: int
expr: int
expr_stmt: int
exprlist: int
factor: int
file_input: int
flow_stmt: int
for_stmt: int
funcdef: int
global_stmt: int
guard: int
if_stmt: int
import_as_name: int
import_as_names: int
import_from: int
import_name: int
import_stmt: int
lambdef: int
listmaker: int
match_stmt: int
namedexpr_test: int
not_test: int
old_comp_for: int
old_comp_if: int
old_comp_iter: int
old_lambdef: int
old_test: int
or_test: int
parameters: int
pass_stmt: int
pattern: int
patterns: int
power: int
print_stmt: int
raise_stmt: int
return_stmt: int
shift_expr: int
simple_stmt: int
single_input: int
sliceop: int
small_stmt: int
subject_expr: int
star_expr: int
stmt: int
subscript: int
subscriptlist: int
suite: int
term: int
test: int
testlist: int
testlist1: int
testlist_gexp: int
testlist_safe: int
testlist_star_expr: int
tfpdef: int
tfplist: int
tname: int
trailer: int
try_stmt: int
typedargslist: int
varargslist: int
vfpdef: int
vfplist: int
vname: int
while_stmt: int
with_stmt: int
xor_expr: int
yield_arg: int
yield_expr: int
yield_stmt: int
class _pattern_symbols(Symbols):
Alternative: int
Alternatives: int
Details: int
Matcher: int
NegatedUnit: int
Repeater: int
Unit: int
python_grammar: Grammar
python_grammar_no_print_statement: Grammar
python_grammar_no_print_statement_no_exec_statement: Grammar
python_grammar_no_print_statement_no_exec_statement_async_keywords: Grammar
python_grammar_no_exec_statement: Grammar
pattern_grammar: Grammar
python_grammar_soft_keywords: Grammar
python_symbols: _python_symbols
pattern_symbols: _pattern_symbols
def initialize(cache_dir: Union[str, "os.PathLike[str]", None] = None) -> None:
global python_grammar
global python_grammar_no_print_statement
global python_grammar_no_print_statement_no_exec_statement
global python_grammar_no_print_statement_no_exec_statement_async_keywords
global python_grammar_soft_keywords
global python_symbols
global pattern_grammar
global pattern_symbols
# The grammar file
_GRAMMAR_FILE = os.path.join(os.path.dirname(__file__), "Grammar.txt")
_PATTERN_GRAMMAR_FILE = os.path.join(
os.path.dirname(__file__), "PatternGrammar.txt"
)
# Python 2
python_grammar = driver.load_packaged_grammar("blib2to3", _GRAMMAR_FILE, cache_dir)
python_grammar.version = (2, 0)
soft_keywords = python_grammar.soft_keywords.copy()
python_grammar.soft_keywords.clear()
python_symbols = _python_symbols(python_grammar)
# Python 2 + from __future__ import print_function
python_grammar_no_print_statement = python_grammar.copy()
del python_grammar_no_print_statement.keywords["print"]
# Python 3.0-3.6
python_grammar_no_print_statement_no_exec_statement = python_grammar.copy()
del python_grammar_no_print_statement_no_exec_statement.keywords["print"]
del python_grammar_no_print_statement_no_exec_statement.keywords["exec"]
python_grammar_no_print_statement_no_exec_statement.version = (3, 0)
# Python 3.7+
python_grammar_no_print_statement_no_exec_statement_async_keywords = (
python_grammar_no_print_statement_no_exec_statement.copy()
)
python_grammar_no_print_statement_no_exec_statement_async_keywords.async_keywords = (
True
)
python_grammar_no_print_statement_no_exec_statement_async_keywords.version = (3, 7)
# Python 3.10+
python_grammar_soft_keywords = (
python_grammar_no_print_statement_no_exec_statement_async_keywords.copy()
)
python_grammar_soft_keywords.soft_keywords = soft_keywords
python_grammar_soft_keywords.version = (3, 10)
pattern_grammar = driver.load_packaged_grammar(
"blib2to3", _PATTERN_GRAMMAR_FILE, cache_dir
)
pattern_symbols = _pattern_symbols(pattern_grammar)

View file

@ -0,0 +1,984 @@
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""
Python parse tree definitions.
This is a very concrete parse tree; we need to keep every token and
even the comments and whitespace between tokens.
There's also a pattern matching implementation here.
"""
# mypy: allow-untyped-defs
from typing import (
Any,
Dict,
Iterator,
List,
Optional,
Text,
Tuple,
TypeVar,
Union,
Set,
Iterable,
)
from blib2to3.pgen2.grammar import Grammar
__author__ = "Guido van Rossum <guido@python.org>"
import sys
from io import StringIO
HUGE: int = 0x7FFFFFFF # maximum repeat count, default max
_type_reprs: Dict[int, Union[Text, int]] = {}
def type_repr(type_num: int) -> Union[Text, int]:
global _type_reprs
if not _type_reprs:
from .pygram import python_symbols
# printing tokens is possible but not as useful
# from .pgen2 import token // token.__dict__.items():
for name in dir(python_symbols):
val = getattr(python_symbols, name)
if type(val) == int:
_type_reprs[val] = name
return _type_reprs.setdefault(type_num, type_num)
_P = TypeVar("_P", bound="Base")
NL = Union["Node", "Leaf"]
Context = Tuple[Text, Tuple[int, int]]
RawNode = Tuple[int, Optional[Text], Optional[Context], Optional[List[NL]]]
class Base(object):
"""
Abstract base class for Node and Leaf.
This provides some default functionality and boilerplate using the
template pattern.
A node may be a subnode of at most one parent.
"""
# Default values for instance variables
type: int # int: token number (< 256) or symbol number (>= 256)
parent: Optional["Node"] = None # Parent node pointer, or None
children: List[NL] # List of subnodes
was_changed: bool = False
was_checked: bool = False
def __new__(cls, *args, **kwds):
"""Constructor that prevents Base from being instantiated."""
assert cls is not Base, "Cannot instantiate Base"
return object.__new__(cls)
def __eq__(self, other: Any) -> bool:
"""
Compare two nodes for equality.
This calls the method _eq().
"""
if self.__class__ is not other.__class__:
return NotImplemented
return self._eq(other)
@property
def prefix(self) -> Text:
raise NotImplementedError
def _eq(self: _P, other: _P) -> bool:
"""
Compare two nodes for equality.
This is called by __eq__ and __ne__. It is only called if the two nodes
have the same type. This must be implemented by the concrete subclass.
Nodes should be considered equal if they have the same structure,
ignoring the prefix string and other context information.
"""
raise NotImplementedError
def __deepcopy__(self: _P, memo: Any) -> _P:
return self.clone()
def clone(self: _P) -> _P:
"""
Return a cloned (deep) copy of self.
This must be implemented by the concrete subclass.
"""
raise NotImplementedError
def post_order(self) -> Iterator[NL]:
"""
Return a post-order iterator for the tree.
This must be implemented by the concrete subclass.
"""
raise NotImplementedError
def pre_order(self) -> Iterator[NL]:
"""
Return a pre-order iterator for the tree.
This must be implemented by the concrete subclass.
"""
raise NotImplementedError
def replace(self, new: Union[NL, List[NL]]) -> None:
"""Replace this node with a new one in the parent."""
assert self.parent is not None, str(self)
assert new is not None
if not isinstance(new, list):
new = [new]
l_children = []
found = False
for ch in self.parent.children:
if ch is self:
assert not found, (self.parent.children, self, new)
if new is not None:
l_children.extend(new)
found = True
else:
l_children.append(ch)
assert found, (self.children, self, new)
self.parent.children = l_children
self.parent.changed()
self.parent.invalidate_sibling_maps()
for x in new:
x.parent = self.parent
self.parent = None
def get_lineno(self) -> Optional[int]:
"""Return the line number which generated the invocant node."""
node = self
while not isinstance(node, Leaf):
if not node.children:
return None
node = node.children[0]
return node.lineno
def changed(self) -> None:
if self.was_changed:
return
if self.parent:
self.parent.changed()
self.was_changed = True
def remove(self) -> Optional[int]:
"""
Remove the node from the tree. Returns the position of the node in its
parent's children before it was removed.
"""
if self.parent:
for i, node in enumerate(self.parent.children):
if node is self:
del self.parent.children[i]
self.parent.changed()
self.parent.invalidate_sibling_maps()
self.parent = None
return i
return None
@property
def next_sibling(self) -> Optional[NL]:
"""
The node immediately following the invocant in their parent's children
list. If the invocant does not have a next sibling, it is None
"""
if self.parent is None:
return None
if self.parent.next_sibling_map is None:
self.parent.update_sibling_maps()
assert self.parent.next_sibling_map is not None
return self.parent.next_sibling_map[id(self)]
@property
def prev_sibling(self) -> Optional[NL]:
"""
The node immediately preceding the invocant in their parent's children
list. If the invocant does not have a previous sibling, it is None.
"""
if self.parent is None:
return None
if self.parent.prev_sibling_map is None:
self.parent.update_sibling_maps()
assert self.parent.prev_sibling_map is not None
return self.parent.prev_sibling_map[id(self)]
def leaves(self) -> Iterator["Leaf"]:
for child in self.children:
yield from child.leaves()
def depth(self) -> int:
if self.parent is None:
return 0
return 1 + self.parent.depth()
def get_suffix(self) -> Text:
"""
Return the string immediately following the invocant node. This is
effectively equivalent to node.next_sibling.prefix
"""
next_sib = self.next_sibling
if next_sib is None:
return ""
prefix = next_sib.prefix
return prefix
class Node(Base):
"""Concrete implementation for interior nodes."""
fixers_applied: Optional[List[Any]]
used_names: Optional[Set[Text]]
def __init__(
self,
type: int,
children: List[NL],
context: Optional[Any] = None,
prefix: Optional[Text] = None,
fixers_applied: Optional[List[Any]] = None,
) -> None:
"""
Initializer.
Takes a type constant (a symbol number >= 256), a sequence of
child nodes, and an optional context keyword argument.
As a side effect, the parent pointers of the children are updated.
"""
assert type >= 256, type
self.type = type
self.children = list(children)
for ch in self.children:
assert ch.parent is None, repr(ch)
ch.parent = self
self.invalidate_sibling_maps()
if prefix is not None:
self.prefix = prefix
if fixers_applied:
self.fixers_applied = fixers_applied[:]
else:
self.fixers_applied = None
def __repr__(self) -> Text:
"""Return a canonical string representation."""
assert self.type is not None
return "%s(%s, %r)" % (
self.__class__.__name__,
type_repr(self.type),
self.children,
)
def __str__(self) -> Text:
"""
Return a pretty string representation.
This reproduces the input source exactly.
"""
return "".join(map(str, self.children))
def _eq(self, other) -> bool:
"""Compare two nodes for equality."""
return (self.type, self.children) == (other.type, other.children)
def clone(self) -> "Node":
assert self.type is not None
"""Return a cloned (deep) copy of self."""
return Node(
self.type,
[ch.clone() for ch in self.children],
fixers_applied=self.fixers_applied,
)
def post_order(self) -> Iterator[NL]:
"""Return a post-order iterator for the tree."""
for child in self.children:
yield from child.post_order()
yield self
def pre_order(self) -> Iterator[NL]:
"""Return a pre-order iterator for the tree."""
yield self
for child in self.children:
yield from child.pre_order()
@property
def prefix(self) -> Text:
"""
The whitespace and comments preceding this node in the input.
"""
if not self.children:
return ""
return self.children[0].prefix
@prefix.setter
def prefix(self, prefix) -> None:
if self.children:
self.children[0].prefix = prefix
def set_child(self, i: int, child: NL) -> None:
"""
Equivalent to 'node.children[i] = child'. This method also sets the
child's parent attribute appropriately.
"""
child.parent = self
self.children[i].parent = None
self.children[i] = child
self.changed()
self.invalidate_sibling_maps()
def insert_child(self, i: int, child: NL) -> None:
"""
Equivalent to 'node.children.insert(i, child)'. This method also sets
the child's parent attribute appropriately.
"""
child.parent = self
self.children.insert(i, child)
self.changed()
self.invalidate_sibling_maps()
def append_child(self, child: NL) -> None:
"""
Equivalent to 'node.children.append(child)'. This method also sets the
child's parent attribute appropriately.
"""
child.parent = self
self.children.append(child)
self.changed()
self.invalidate_sibling_maps()
def invalidate_sibling_maps(self) -> None:
self.prev_sibling_map: Optional[Dict[int, Optional[NL]]] = None
self.next_sibling_map: Optional[Dict[int, Optional[NL]]] = None
def update_sibling_maps(self) -> None:
_prev: Dict[int, Optional[NL]] = {}
_next: Dict[int, Optional[NL]] = {}
self.prev_sibling_map = _prev
self.next_sibling_map = _next
previous: Optional[NL] = None
for current in self.children:
_prev[id(current)] = previous
_next[id(previous)] = current
previous = current
_next[id(current)] = None
class Leaf(Base):
"""Concrete implementation for leaf nodes."""
# Default values for instance variables
value: Text
fixers_applied: List[Any]
bracket_depth: int
# Changed later in brackets.py
opening_bracket: Optional["Leaf"] = None
used_names: Optional[Set[Text]]
_prefix = "" # Whitespace and comments preceding this token in the input
lineno: int = 0 # Line where this token starts in the input
column: int = 0 # Column where this token starts in the input
def __init__(
self,
type: int,
value: Text,
context: Optional[Context] = None,
prefix: Optional[Text] = None,
fixers_applied: List[Any] = [],
opening_bracket: Optional["Leaf"] = None,
) -> None:
"""
Initializer.
Takes a type constant (a token number < 256), a string value, and an
optional context keyword argument.
"""
assert 0 <= type < 256, type
if context is not None:
self._prefix, (self.lineno, self.column) = context
self.type = type
self.value = value
if prefix is not None:
self._prefix = prefix
self.fixers_applied: Optional[List[Any]] = fixers_applied[:]
self.children = []
self.opening_bracket = opening_bracket
def __repr__(self) -> str:
"""Return a canonical string representation."""
from .pgen2.token import tok_name
assert self.type is not None
return "%s(%s, %r)" % (
self.__class__.__name__,
tok_name.get(self.type, self.type),
self.value,
)
def __str__(self) -> Text:
"""
Return a pretty string representation.
This reproduces the input source exactly.
"""
return self._prefix + str(self.value)
def _eq(self, other) -> bool:
"""Compare two nodes for equality."""
return (self.type, self.value) == (other.type, other.value)
def clone(self) -> "Leaf":
assert self.type is not None
"""Return a cloned (deep) copy of self."""
return Leaf(
self.type,
self.value,
(self.prefix, (self.lineno, self.column)),
fixers_applied=self.fixers_applied,
opening_bracket=self.opening_bracket,
)
def leaves(self) -> Iterator["Leaf"]:
yield self
def post_order(self) -> Iterator["Leaf"]:
"""Return a post-order iterator for the tree."""
yield self
def pre_order(self) -> Iterator["Leaf"]:
"""Return a pre-order iterator for the tree."""
yield self
@property
def prefix(self) -> Text:
"""
The whitespace and comments preceding this token in the input.
"""
return self._prefix
@prefix.setter
def prefix(self, prefix) -> None:
self.changed()
self._prefix = prefix
def convert(gr: Grammar, raw_node: RawNode) -> NL:
"""
Convert raw node information to a Node or Leaf instance.
This is passed to the parser driver which calls it whenever a reduction of a
grammar rule produces a new complete node, so that the tree is build
strictly bottom-up.
"""
type, value, context, children = raw_node
if children or type in gr.number2symbol:
# If there's exactly one child, return that child instead of
# creating a new node.
assert children is not None
if len(children) == 1:
return children[0]
return Node(type, children, context=context)
else:
return Leaf(type, value or "", context=context)
_Results = Dict[Text, NL]
class BasePattern(object):
"""
A pattern is a tree matching pattern.
It looks for a specific node type (token or symbol), and
optionally for a specific content.
This is an abstract base class. There are three concrete
subclasses:
- LeafPattern matches a single leaf node;
- NodePattern matches a single node (usually non-leaf);
- WildcardPattern matches a sequence of nodes of variable length.
"""
# Defaults for instance variables
type: Optional[int]
type = None # Node type (token if < 256, symbol if >= 256)
content: Any = None # Optional content matching pattern
name: Optional[Text] = None # Optional name used to store match in results dict
def __new__(cls, *args, **kwds):
"""Constructor that prevents BasePattern from being instantiated."""
assert cls is not BasePattern, "Cannot instantiate BasePattern"
return object.__new__(cls)
def __repr__(self) -> Text:
assert self.type is not None
args = [type_repr(self.type), self.content, self.name]
while args and args[-1] is None:
del args[-1]
return "%s(%s)" % (self.__class__.__name__, ", ".join(map(repr, args)))
def _submatch(self, node, results=None) -> bool:
raise NotImplementedError
def optimize(self) -> "BasePattern":
"""
A subclass can define this as a hook for optimizations.
Returns either self or another node with the same effect.
"""
return self
def match(self, node: NL, results: Optional[_Results] = None) -> bool:
"""
Does this pattern exactly match a node?
Returns True if it matches, False if not.
If results is not None, it must be a dict which will be
updated with the nodes matching named subpatterns.
Default implementation for non-wildcard patterns.
"""
if self.type is not None and node.type != self.type:
return False
if self.content is not None:
r: Optional[_Results] = None
if results is not None:
r = {}
if not self._submatch(node, r):
return False
if r:
assert results is not None
results.update(r)
if results is not None and self.name:
results[self.name] = node
return True
def match_seq(self, nodes: List[NL], results: Optional[_Results] = None) -> bool:
"""
Does this pattern exactly match a sequence of nodes?
Default implementation for non-wildcard patterns.
"""
if len(nodes) != 1:
return False
return self.match(nodes[0], results)
def generate_matches(self, nodes: List[NL]) -> Iterator[Tuple[int, _Results]]:
"""
Generator yielding all matches for this pattern.
Default implementation for non-wildcard patterns.
"""
r: _Results = {}
if nodes and self.match(nodes[0], r):
yield 1, r
class LeafPattern(BasePattern):
def __init__(
self,
type: Optional[int] = None,
content: Optional[Text] = None,
name: Optional[Text] = None,
) -> None:
"""
Initializer. Takes optional type, content, and name.
The type, if given must be a token type (< 256). If not given,
this matches any *leaf* node; the content may still be required.
The content, if given, must be a string.
If a name is given, the matching node is stored in the results
dict under that key.
"""
if type is not None:
assert 0 <= type < 256, type
if content is not None:
assert isinstance(content, str), repr(content)
self.type = type
self.content = content
self.name = name
def match(self, node: NL, results=None):
"""Override match() to insist on a leaf node."""
if not isinstance(node, Leaf):
return False
return BasePattern.match(self, node, results)
def _submatch(self, node, results=None):
"""
Match the pattern's content to the node's children.
This assumes the node type matches and self.content is not None.
Returns True if it matches, False if not.
If results is not None, it must be a dict which will be
updated with the nodes matching named subpatterns.
When returning False, the results dict may still be updated.
"""
return self.content == node.value
class NodePattern(BasePattern):
wildcards: bool = False
def __init__(
self,
type: Optional[int] = None,
content: Optional[Iterable[Text]] = None,
name: Optional[Text] = None,
) -> None:
"""
Initializer. Takes optional type, content, and name.
The type, if given, must be a symbol type (>= 256). If the
type is None this matches *any* single node (leaf or not),
except if content is not None, in which it only matches
non-leaf nodes that also match the content pattern.
The content, if not None, must be a sequence of Patterns that
must match the node's children exactly. If the content is
given, the type must not be None.
If a name is given, the matching node is stored in the results
dict under that key.
"""
if type is not None:
assert type >= 256, type
if content is not None:
assert not isinstance(content, str), repr(content)
newcontent = list(content)
for i, item in enumerate(newcontent):
assert isinstance(item, BasePattern), (i, item)
# I don't even think this code is used anywhere, but it does cause
# unreachable errors from mypy. This function's signature does look
# odd though *shrug*.
if isinstance(item, WildcardPattern): # type: ignore[unreachable]
self.wildcards = True # type: ignore[unreachable]
self.type = type
self.content = newcontent
self.name = name
def _submatch(self, node, results=None) -> bool:
"""
Match the pattern's content to the node's children.
This assumes the node type matches and self.content is not None.
Returns True if it matches, False if not.
If results is not None, it must be a dict which will be
updated with the nodes matching named subpatterns.
When returning False, the results dict may still be updated.
"""
if self.wildcards:
for c, r in generate_matches(self.content, node.children):
if c == len(node.children):
if results is not None:
results.update(r)
return True
return False
if len(self.content) != len(node.children):
return False
for subpattern, child in zip(self.content, node.children):
if not subpattern.match(child, results):
return False
return True
class WildcardPattern(BasePattern):
"""
A wildcard pattern can match zero or more nodes.
This has all the flexibility needed to implement patterns like:
.* .+ .? .{m,n}
(a b c | d e | f)
(...)* (...)+ (...)? (...){m,n}
except it always uses non-greedy matching.
"""
min: int
max: int
def __init__(
self,
content: Optional[Text] = None,
min: int = 0,
max: int = HUGE,
name: Optional[Text] = None,
) -> None:
"""
Initializer.
Args:
content: optional sequence of subsequences of patterns;
if absent, matches one node;
if present, each subsequence is an alternative [*]
min: optional minimum number of times to match, default 0
max: optional maximum number of times to match, default HUGE
name: optional name assigned to this match
[*] Thus, if content is [[a, b, c], [d, e], [f, g, h]] this is
equivalent to (a b c | d e | f g h); if content is None,
this is equivalent to '.' in regular expression terms.
The min and max parameters work as follows:
min=0, max=maxint: .*
min=1, max=maxint: .+
min=0, max=1: .?
min=1, max=1: .
If content is not None, replace the dot with the parenthesized
list of alternatives, e.g. (a b c | d e | f g h)*
"""
assert 0 <= min <= max <= HUGE, (min, max)
if content is not None:
f = lambda s: tuple(s)
wrapped_content = tuple(map(f, content)) # Protect against alterations
# Check sanity of alternatives
assert len(wrapped_content), repr(
wrapped_content
) # Can't have zero alternatives
for alt in wrapped_content:
assert len(alt), repr(alt) # Can have empty alternatives
self.content = wrapped_content
self.min = min
self.max = max
self.name = name
def optimize(self) -> Any:
"""Optimize certain stacked wildcard patterns."""
subpattern = None
if (
self.content is not None
and len(self.content) == 1
and len(self.content[0]) == 1
):
subpattern = self.content[0][0]
if self.min == 1 and self.max == 1:
if self.content is None:
return NodePattern(name=self.name)
if subpattern is not None and self.name == subpattern.name:
return subpattern.optimize()
if (
self.min <= 1
and isinstance(subpattern, WildcardPattern)
and subpattern.min <= 1
and self.name == subpattern.name
):
return WildcardPattern(
subpattern.content,
self.min * subpattern.min,
self.max * subpattern.max,
subpattern.name,
)
return self
def match(self, node, results=None) -> bool:
"""Does this pattern exactly match a node?"""
return self.match_seq([node], results)
def match_seq(self, nodes, results=None) -> bool:
"""Does this pattern exactly match a sequence of nodes?"""
for c, r in self.generate_matches(nodes):
if c == len(nodes):
if results is not None:
results.update(r)
if self.name:
results[self.name] = list(nodes)
return True
return False
def generate_matches(self, nodes) -> Iterator[Tuple[int, _Results]]:
"""
Generator yielding matches for a sequence of nodes.
Args:
nodes: sequence of nodes
Yields:
(count, results) tuples where:
count: the match comprises nodes[:count];
results: dict containing named submatches.
"""
if self.content is None:
# Shortcut for special case (see __init__.__doc__)
for count in range(self.min, 1 + min(len(nodes), self.max)):
r = {}
if self.name:
r[self.name] = nodes[:count]
yield count, r
elif self.name == "bare_name":
yield self._bare_name_matches(nodes)
else:
# The reason for this is that hitting the recursion limit usually
# results in some ugly messages about how RuntimeErrors are being
# ignored. We only have to do this on CPython, though, because other
# implementations don't have this nasty bug in the first place.
if hasattr(sys, "getrefcount"):
save_stderr = sys.stderr
sys.stderr = StringIO()
try:
for count, r in self._recursive_matches(nodes, 0):
if self.name:
r[self.name] = nodes[:count]
yield count, r
except RuntimeError:
# We fall back to the iterative pattern matching scheme if the recursive
# scheme hits the recursion limit.
for count, r in self._iterative_matches(nodes):
if self.name:
r[self.name] = nodes[:count]
yield count, r
finally:
if hasattr(sys, "getrefcount"):
sys.stderr = save_stderr
def _iterative_matches(self, nodes) -> Iterator[Tuple[int, _Results]]:
"""Helper to iteratively yield the matches."""
nodelen = len(nodes)
if 0 >= self.min:
yield 0, {}
results = []
# generate matches that use just one alt from self.content
for alt in self.content:
for c, r in generate_matches(alt, nodes):
yield c, r
results.append((c, r))
# for each match, iterate down the nodes
while results:
new_results = []
for c0, r0 in results:
# stop if the entire set of nodes has been matched
if c0 < nodelen and c0 <= self.max:
for alt in self.content:
for c1, r1 in generate_matches(alt, nodes[c0:]):
if c1 > 0:
r = {}
r.update(r0)
r.update(r1)
yield c0 + c1, r
new_results.append((c0 + c1, r))
results = new_results
def _bare_name_matches(self, nodes) -> Tuple[int, _Results]:
"""Special optimized matcher for bare_name."""
count = 0
r = {} # type: _Results
done = False
max = len(nodes)
while not done and count < max:
done = True
for leaf in self.content:
if leaf[0].match(nodes[count], r):
count += 1
done = False
break
assert self.name is not None
r[self.name] = nodes[:count]
return count, r
def _recursive_matches(self, nodes, count) -> Iterator[Tuple[int, _Results]]:
"""Helper to recursively yield the matches."""
assert self.content is not None
if count >= self.min:
yield 0, {}
if count < self.max:
for alt in self.content:
for c0, r0 in generate_matches(alt, nodes):
for c1, r1 in self._recursive_matches(nodes[c0:], count + 1):
r = {}
r.update(r0)
r.update(r1)
yield c0 + c1, r
class NegatedPattern(BasePattern):
def __init__(self, content: Optional[Any] = None) -> None:
"""
Initializer.
The argument is either a pattern or None. If it is None, this
only matches an empty sequence (effectively '$' in regex
lingo). If it is not None, this matches whenever the argument
pattern doesn't have any matches.
"""
if content is not None:
assert isinstance(content, BasePattern), repr(content)
self.content = content
def match(self, node, results=None) -> bool:
# We never match a node in its entirety
return False
def match_seq(self, nodes, results=None) -> bool:
# We only match an empty sequence of nodes in its entirety
return len(nodes) == 0
def generate_matches(self, nodes) -> Iterator[Tuple[int, _Results]]:
if self.content is None:
# Return a match if there is an empty sequence
if len(nodes) == 0:
yield 0, {}
else:
# Return a match if the argument pattern has no matches
for c, r in self.content.generate_matches(nodes):
return
yield 0, {}
def generate_matches(
patterns: List[BasePattern], nodes: List[NL]
) -> Iterator[Tuple[int, _Results]]:
"""
Generator yielding matches for a sequence of patterns and nodes.
Args:
patterns: a sequence of patterns
nodes: a sequence of nodes
Yields:
(count, results) tuples where:
count: the entire sequence of patterns matches nodes[:count];
results: dict containing named submatches.
"""
if not patterns:
yield 0, {}
else:
p, rest = patterns[0], patterns[1:]
for c0, r0 in p.generate_matches(nodes):
if not rest:
yield c0, r0
else:
for c1, r1 in generate_matches(rest, nodes[c0:]):
r = {}
r.update(r0)
r.update(r1)
yield c0 + c1, r