# pyflyby/_parse.py. # Copyright (C) 2011, 2012, 2013, 2014, 2015, 2018 Karl Chen. # License: MIT http://opensource.org/licenses/MIT from __future__ import (absolute_import, division, print_function, with_statement) import ast from collections import namedtuple from doctest import DocTestParser from functools import total_ordering from itertools import groupby import re import sys from textwrap import dedent import types import six from six import PY2, PY3, text_type as unicode from six.moves import range from pyflyby._file import FilePos, FileText, Filename from pyflyby._flags import CompilerFlags from pyflyby._log import logger from pyflyby._util import cached_attribute, cmp if PY3: from ast import Bytes else: Bytes = ast.Str if sys.version_info >= (3, 8): from ast import TypeIgnore else: # TypeIgnore does not exist on Python 3.7 and before. # thus we define a dummy TypeIgnore just to simplify remaining code. class TypeIgnore: pass def _is_comment_or_blank(line): """ Returns whether a line of python code contains only a comment is blank. >>> _is_comment_or_blank("foo\\n") False >>> _is_comment_or_blank(" # blah\\n") True """ return re.sub("#.*", "", line).rstrip() == "" def _ast_str_literal_value(node): if isinstance(node, (ast.Str, Bytes)): return node.s if isinstance(node, ast.Expr) and isinstance(node.value, (ast.Str, Bytes)): return node.value.s else: return None def _flatten_ast_nodes(arg): if arg is None: pass elif isinstance(arg, ast.AST): yield arg elif isinstance(arg, str): #FunctionDef type_comments yield arg elif isinstance(arg, (tuple, list, types.GeneratorType)): for x in arg: for y in _flatten_ast_nodes(x): yield y else: raise TypeError( "_flatten_ast_nodes: unexpected %s" % (type(arg).__name__,)) def _iter_child_nodes_in_order(node): """ Yield all direct child nodes of ``node``, that is, all fields that are nodes and all items of fields that are lists of nodes. ``_iter_child_nodes_in_order`` yields nodes in the same order that they appear in the source. ``ast.iter_child_nodes`` does the same thing, but not in source order. e.g. for ``Dict`` s, it yields all key nodes before all value nodes. """ return _flatten_ast_nodes(_iter_child_nodes_in_order_internal_1(node)) def _iter_child_nodes_in_order_internal_1(node): if isinstance(node, str): # this happen for type comments which are not ast nodes but str # they do not have children. We yield nothing. yield [] return if not isinstance(node, ast.AST): raise TypeError if isinstance(node, ast.Dict): assert node._fields == ("keys", "values") yield list(zip(node.keys, node.values)) elif isinstance(node, ast.FunctionDef): if six.PY2: assert node._fields == ('name', 'args', 'body', 'decorator_list'), node._fields yield node.decorator_list, node.args, node.body elif sys.version_info >= (3, 8): assert node._fields == ( "name", "args", "body", "decorator_list", "returns", "type_comment", ), node._fields res = ( node.type_comment, node.decorator_list, node.args, node.returns, node.body, ) yield res else: assert node._fields == ('name', 'args', 'body', 'decorator_list', 'returns'), node._fields yield node.decorator_list, node.args, node.returns, node.body # node.name is a string, not an AST node elif isinstance(node, ast.arguments): if six.PY2: assert node._fields == ('args', 'vararg', 'kwarg', 'defaults'), node._fields elif sys.version_info >= (3, 8): assert node._fields == ('posonlyargs', 'args', 'vararg', 'kwonlyargs', 'kw_defaults', 'kwarg', 'defaults'), node._fields else: assert node._fields == ('args', 'vararg', 'kwonlyargs', 'kw_defaults', 'kwarg', 'defaults'), node._fields defaults = node.defaults or () num_no_default = len(node.args)-len(defaults) yield node.args[:num_no_default] yield list(zip(node.args[num_no_default:], defaults)) # node.varags and node.kwarg are strings, not AST nodes. elif isinstance(node, ast.IfExp): assert node._fields == ('test', 'body', 'orelse') yield node.body, node.test, node.orelse elif isinstance(node, ast.Call): # call arguments order are lost by ast, re-order them yield node.func args = sorted([(k.value.lineno, k.value.col_offset, k) for k in node.keywords]+ [(k.lineno,k.col_offset, k) for k in node.args]) yield [a[2] for a in args] elif isinstance(node, ast.ClassDef): if six.PY2: assert node._fields == ('name', 'bases', 'body', 'decorator_list') else: assert node._fields == ('name', 'bases', 'keywords', 'body', 'decorator_list') yield node.decorator_list, node.bases, node.body # node.name is a string, not an AST node elif sys.version_info >= (3, 7) and isinstance(node, ast.FormattedValue): assert node._fields == ('value', 'conversion', 'format_spec') yield node.value, else: # Default behavior. yield ast.iter_child_nodes(node) def _walk_ast_nodes_in_order(node): """ Recursively yield all child nodes of ``node``, in the same order that the node appears in the source. ``ast.walk`` does the same thing, but yields nodes in an arbitrary order. """ # The implementation is basically the same as ``ast.walk``, but: # 1. Use a stack instead of a deque. (I.e., depth-first search instead # of breadth-first search.) # 2. Use _iter_child_nodes_in_order instead of ``ast.iter_child_nodes``. todo = [node] while todo: node = todo.pop() yield node todo.extend(reversed(list(_iter_child_nodes_in_order(node)))) def _flags_to_try(source, flags, auto_flags, mode): """ Flags to try for ``auto_flags``. If ``auto_flags`` is False, then only yield ``flags``. If ``auto_flags`` is True, then yield ``flags`` and ``flags ^ print_function``. """ flags = CompilerFlags(flags) if sys.version_info >= (3, 8): if re.search(r"# *type:", source): flags = flags | CompilerFlags('type_comments') yield flags return if not auto_flags: yield flags return if PY3: yield flags return if mode == "eval": if re.search(r"\bprint\b", source): flags = flags | CompilerFlags("print_function") yield flags return yield flags if re.search(r"\bprint\b", source): yield flags ^ CompilerFlags("print_function") def _parse_ast_nodes(text, flags, auto_flags, mode): """ Parse a block of lines into an AST. Also annotate ``input_flags``, ``source_flags``, and ``flags`` on the resulting ast node. :type text: ``FileText`` :type flags: ``CompilerFlags`` :type auto_flags: ``bool`` :param auto_flags: Whether to guess different flags if ``text`` can't be parsed with ``flags``. :param mode: Compilation mode: "exec", "single", or "eval". :rtype: ``ast.Module`` """ text = FileText(text) filename = str(text.filename) if text.filename else "" source = text.joined source = dedent(source) if PY2 and isinstance(source, unicode): source = source.encode('utf-8') if not source.endswith("\n"): # Ensure that the last line ends with a newline (``ast`` barfs # otherwise). source += "\n" exp = None for flags in _flags_to_try(source, flags, auto_flags, mode): cflags = ast.PyCF_ONLY_AST | int(flags) try: result = compile( source, filename, mode, flags=cflags, dont_inherit=1) except SyntaxError as e: exp = e pass else: # Attach flags to the result. result.input_flags = flags result.source_flags = CompilerFlags.from_ast(result) result.flags = result.input_flags | result.source_flags result.text = text return result raise exp # SyntaxError def _test_parse_string_literal(text, flags): r""" Attempt to parse ``text``. If it parses cleanly to a single string literal, return its value. Otherwise return ``None``. >>> _test_parse_string_literal(r'"foo\n" r"\nbar"', None) 'foo\n\\nbar' """ text = FileText(text) if PY2: try: text.joined.encode('ascii') except UnicodeError: text = FileText(u'# encoding: utf-8\n' + unicode(text), filename=text.filename) try: module_node = _parse_ast_nodes(text, flags, False, "eval") except SyntaxError: return None body = module_node.body if not isinstance(body, (ast.Str, Bytes)): return None return body.s AstNodeContext = namedtuple("AstNodeContext", "parent field index") def _annotate_ast_nodes(ast_node): """ Annotate AST with: - startpos and endpos - [disabled for now: context as `AstNodeContext` ] :type ast_node: ``ast.AST`` :param ast_node: AST node returned by `_parse_ast_nodes` :return: ``None`` """ text = ast_node.text flags = ast_node.flags startpos = text.startpos _annotate_ast_startpos(ast_node, None, startpos, text, flags) # Not used for now: # ast_node.context = AstNodeContext(None, None, None) # _annotate_ast_context(ast_node) def _annotate_ast_startpos(ast_node, parent_ast_node, minpos, text, flags): r""" Annotate ``ast_node``. Set ``ast_node.startpos`` to the starting position of the node within ``text``. For "typical" nodes, i.e. those other than multiline strings, this is simply FilePos(ast_node.lineno, ast_node.col_offset+1), but taking ``text.startpos`` into account. For multiline string nodes, this function works by trying to parse all possible subranges of lines until finding the range that is syntactically valid and matches ``value``. The candidate range is text[min_start_lineno:lineno+text.startpos.lineno+1]. This function is unfortunately necessary because of a flaw in the output produced by the Python built-in parser. For some crazy reason, the ``ast_node.lineno`` attribute represents something different for multiline string literals versus all other statements. For multiline string literal nodes and statements that are just a string expression (or more generally, nodes where the first descendant leaf node is a multiline string literal), the compiler attaches the ending line number as the value of the ``lineno`` attribute. For all other than AST nodes, the compiler attaches the starting line number as the value of the ``lineno`` attribute. This means e.g. the statement "'''foo\nbar'''" has a lineno value of 2, but the statement "x='''foo\nbar'''" has a lineno value of 1. :type ast_node: ``ast.AST`` :type minpos: ``FilePos`` :param minpos: Earliest position to check, in the number space of ``text``. :type text: ``FileText`` :param text: Source text that was used to parse the AST, whose ``startpos`` should be used in interpreting ``ast_node.lineno`` (which always starts at 1 for the subset that was parsed). :type flags: ``CompilerFlags`` :param flags: Compiler flags to use when re-compiling code. :return: ``True`` if this node is a multiline string literal or the first child is such a node (recursively); ``False`` otherwise. :raise ValueError: Could not find the starting line number. """ assert isinstance(ast_node, (ast.AST, str, TypeIgnore)), ast_node # joined strings and children do not carry a column offset on pre-3.8 # this prevent reformatting. # set the column offset to the parent value before 3.8 if (3, 7) < sys.version_info < (3, 8): if ( isinstance(ast_node, (getattr(ast, "JoinedStr", None), ast.FormattedValue)) or isinstance( parent_ast_node, (getattr(ast, "JoinedStr", None), ast.FormattedValue) ) ) and ast_node.col_offset == -1: ast_node.col_offset = parent_ast_node.col_offset # First, traverse child nodes. If the first child node (recursively) is a # multiline string, then we need to transfer its information to this node. # Walk all nodes/fields of the AST. We implement this as a custom # depth-first search instead of using ast.walk() or ast.NodeVisitor # so that we can easily keep track of the preceding node's lineno. child_minpos = minpos is_first_child = True leftstr_node = None for child_node in _iter_child_nodes_in_order(ast_node): leftstr = _annotate_ast_startpos(child_node, ast_node, child_minpos, text, flags) if is_first_child and leftstr: leftstr_node = child_node if hasattr(child_node, 'lineno') and not isinstance(child_node, TypeIgnore): if child_node.startpos < child_minpos: raise AssertionError( "Got out-of-order AST node(s):\n" " parent minpos=%s\n" % minpos + " node: %s\n" % ast.dump(ast_node) + " fields: %s\n" % (" ".join(ast_node._fields)) + " children:\n" + ''.join( " %s %9s: %s\n" % ( ("==>" if cn is child_node else " "), getattr(cn, 'startpos', ""), ast.dump(cn)) for cn in _iter_child_nodes_in_order(ast_node)) + "\n" "This indicates a bug in pyflyby._\n" "\n" "pyflyby developer: Check if there's a bug or missing ast node handler in " "pyflyby._parse._iter_child_nodes_in_order() - " "probably the handler for ast.%s." % type(ast_node).__name__) child_minpos = child_node.startpos is_first_child = False # If the node has no lineno at all, then skip it. This should only happen # for nodes we don't care about, e.g. ``ast.Module`` or ``ast.alias``. if not hasattr(ast_node, 'lineno') or isinstance(ast_node, TypeIgnore): return False # If col_offset is set then the lineno should be correct also. if ast_node.col_offset >= 0: # In Python 3.8+, FunctionDef.lineno is the line with the def. To # account for decorators, we need the lineno of the first decorator if (sys.version_info >= (3, 8) and isinstance(ast_node, (ast.FunctionDef, ast.ClassDef)) and ast_node.decorator_list): delta = (ast_node.decorator_list[0].lineno-1, # The col_offset doesn't include the @ ast_node.decorator_list[0].col_offset - 1) else: delta = (ast_node.lineno-1, ast_node.col_offset) # Not a multiline string literal. (I.e., it could be a non-string or # a single-line string.) # Easy. startpos = text.startpos + delta # Special case for 'with' statements. Consider the code: # with X: pass # ^0 ^5 # In python2.6, col_offset is 0. # In python2.7, col_offset is 5. # This is because python2.7 allows for multiple clauses: # with X, Y: pass # Since 'Y's col_offset isn't the beginning of the line, the authors # of Python presumably changed 'X's col_offset to also not be the # beginning of the line. If they had made the With ast node support # multiple clauses, they wouldn't have needed to do that, but then # that would introduce an API change in the AST. So it's # understandable that they did that. # Since we use startpos for breaking lines, we need to set startpos to # the beginning of the line. # In Python 3, the col_offset for the with is 0 again. if (isinstance(ast_node, ast.With) and not isinstance(parent_ast_node, ast.With) and sys.version_info[:2] == (2,7)): assert ast_node.col_offset >= 5 if startpos.lineno == text.startpos.lineno: linestart = text.startpos.colno else: linestart = 1 line = text[(startpos.lineno,linestart):startpos] m = re.search(r"\bwith\s+$", str(line)) assert m lk = len(m.group()) # length of 'with ' including spaces startpos = FilePos(startpos.lineno, startpos.colno - lk) assert str(text[startpos:(startpos+(0,4))]) == "with" ast_node.startpos = startpos if sys.version_info <= (3, 8): ast_node.startpos = max(startpos, minpos) return False assert ast_node.col_offset == -1 if leftstr_node: # This is an ast node where the leftmost deepest leaf is a # multiline string. The bug that multiline strings have broken # lineno/col_offset infects ancestors up the tree. # # If the leftmost leaf is a multi-line string, then ``lineno`` # contains the ending line number, and col_offset is -1: # >>> ast.parse("""'''foo\nbar'''+blah""").body[0].lineno # 2 # But if the leftmost leaf is not a multi-line string, then # ``lineno`` contains the starting line number: # >>> ast.parse("""'''foobar'''+blah""").body[0].lineno # 1 # >>> ast.parse("""blah+'''foo\nbar'''+blah""").body[0].lineno # 1 # # To fix that, we copy start_lineno and start_colno from the Str # node once we've corrected the values. assert not isinstance(ast_node, (ast.Str, Bytes)) assert leftstr_node.lineno == ast_node.lineno assert leftstr_node.col_offset == -1 ast_node.startpos = leftstr_node.startpos return True # It should now be the case that we are looking at a multi-line string # literal. if sys.version_info >= (3, 7) and isinstance(ast_node, ast.FormattedValue): ast_node.startpos = ast_node.value.startpos ast_node.endpos = ast_node.value.startpos return True if not isinstance(ast_node, (ast.Str, Bytes)): raise ValueError( "got a non-string col_offset=-1: %s" % (ast.dump(ast_node))) # The ``lineno`` attribute gives the ending line number of the multiline # string ... unless it's multiple multiline strings that are concatenated # by adjacency, in which case it's merely the end of the first one of # them. At least we know that the start lineno is definitely not later # than the ``lineno`` attribute. first_end_lineno = text.startpos.lineno + ast_node.lineno - 1 # Compute possible start positions. # The starting line number of this string could be anywhere between the # end of the previous expression and ``first_end_lineno``. startpos_candidates = [] assert minpos.lineno <= first_end_lineno for start_lineno in range(minpos.lineno, first_end_lineno + 1): start_line = text[start_lineno] start_line_colno = (text.startpos.colno if start_lineno==text.startpos.lineno else 1) startpos_candidates.extend([ (_m.group()[-1], FilePos(start_lineno, _m.start()+start_line_colno)) for _m in re.finditer("[bBrRuU]*[\"\']", start_line)]) target_str = ast_node.s if isinstance(target_str, bytes) and sys.version_info[:2] == (3, 7): target_str = target_str.decode() # Loop over possible end_linenos. The first one we've identified is the # by far most likely one, but in theory it could be anywhere later in the # file. This could be because of a dastardly concatenated string like # this: # """ # L1 # two # L2 # """ """ # L3 # four # L4 # five # L5 # six # L6 # """ # L7 # There are two substrings on L1:L3 and L3:L7. The parser gives us a # single concatenated string, but sets lineno to 3 instead of 7. We don't # have much to go on to figure out that the real end_lineno is 7. If we # don't find the string ending on L3, then search forward looking for the # real end of the string. Yuck! # # This is now complicated by fstrings that do interpolate variable on 3.7 fixed on 3.8+) # where we'll try to guess based on prefix f_string_candidate_prefixes = [] for end_lineno in range(first_end_lineno, text.endpos.lineno+1): # Compute possible end positions. We're given the line we're ending # on, but not the column position. Note that the ending line could # contain more than just the string we're looking for -- including # possibly other strings or comments. end_line = text[end_lineno] end_line_startcol = ( text.startpos.colno if end_lineno==text.startpos.lineno else 1) endpos_candidates = [ (_m.group(), FilePos(end_lineno,_m.start()+end_line_startcol+1)) for _m in re.finditer("[\"\']", end_line)] if not endpos_candidates: # We found no endpos_candidates. This should not happen for # first_end_lineno because there should be _some_ string that ends # there. if end_lineno == first_end_lineno: raise AssertionError( "No quote char found on line with supposed string") continue # Filter and sort the possible startpos candidates given this endpos # candidate. It's possible for the starting quotechar and ending # quotechar to be different in case of adjacent string concatenation, # e.g. "foo"'''bar'''. That said, it's an unlikely case, so # deprioritize checking them. likely_candidates = [] unlikely_candidates = [] for end_quotechar, endpos in reversed(endpos_candidates): for start_quotechar, startpos in startpos_candidates: if not startpos < endpos: continue if start_quotechar == end_quotechar: candidate_list = likely_candidates else: candidate_list = unlikely_candidates candidate_list.append((startpos,endpos)) # Loop over sorted candidates. matched_prefix = set() for (startpos, endpos) in likely_candidates + unlikely_candidates: # Try to parse the given range and see if it matches the target # string literal. subtext = text[startpos:endpos] candidate_str = _test_parse_string_literal(subtext, flags) if candidate_str is None: continue if isinstance(candidate_str, bytes) and sys.version_info[:2] == (3, 7): candidate_str = candidate_str.decode() maybe_fstring = False try: if (3, 7) <= sys.version_info <= (3, 8): potential_start = text.lines[startpos.lineno - 1] maybe_fstring = ("f'" in potential_start) or ( 'f"' in potential_start ) except IndexError: pass if target_str == candidate_str and target_str: # Success! ast_node.startpos = startpos ast_node.endpos = endpos # This node is a multiline string; and, it's a leaf, so by # definition it is the leftmost node. return True # all done elif candidate_str and target_str.startswith(candidate_str): matched_prefix.add(startpos) elif maybe_fstring: candidate_prefix = candidate_str.split("{")[0] if candidate_prefix and target_str.startswith(candidate_prefix): f_string_candidate_prefixes.append((startpos, endpos)) # We didn't find a string given the current end_lineno candidate. # Only continue checking the startpos candidates that so far produced # prefixes of the string we're looking for. if not matched_prefix: break startpos_candidates = [ (sq, sp) for (sq, sp) in startpos_candidates if sp in matched_prefix ] if (3, 7) <= sys.version_info <= (3, 8): if len(f_string_candidate_prefixes) == 1: # we did not find the string but there is one fstring candidate starting it ast_node.startpos, ast_node.endpos = f_string_candidate_prefixes[0] return True elif isinstance(parent_ast_node, ast.JoinedStr): self_pos = parent_ast_node.values.index(ast_node) ast_node.startpos = parent_ast_node.values[self_pos - 1].startpos ast_node.endpos = parent_ast_node.values[self_pos - 1].endpos return True raise ValueError("Couldn't find exact position of %s" % (ast.dump(ast_node))) def _annotate_ast_context(ast_node): """ Recursively annotate ``context`` on ast nodes, setting ``context`` to a `AstNodeContext` named tuple with values ``(parent, field, index)``. Each ast_node satisfies ``parent.[] is ast_node``. For non-list fields, the index part is ``None``. """ assert isinstance(ast_node, ast.AST) for field_name, field_value in ast.iter_fields(ast_node): if isinstance(field_value, ast.AST): child_node = field_value child_node.context = AstNodeContext(ast_node, field_name, None) _annotate_ast_context(child_node) elif isinstance(field_value, list): for i, item in enumerate(field_value): if isinstance(item, ast.AST): child_node = item child_node.context = AstNodeContext(ast_node, field_name, i) _annotate_ast_context(child_node) def _split_code_lines(ast_nodes, text): """ Split the given ``ast_nodes`` and corresponding ``text`` by code/noncode statement. Yield tuples of (nodes, subtext). ``nodes`` is a list of ``ast.AST`` nodes, length 0 or 1; ``subtext`` is a `FileText` sliced from ``text``. FileText(...))} for code lines and ``(None, FileText(...))`` for non-code lines (comments and blanks). :type ast_nodes: sequence of ``ast.AST`` nodes :type text: `FileText` """ if not ast_nodes: yield ([], text) return assert text.startpos <= ast_nodes[0].startpos assert ast_nodes[-1].startpos < text.endpos if text.startpos != ast_nodes[0].startpos: # Starting noncode lines. yield ([], text[text.startpos:ast_nodes[0].startpos]) end_sentinel = _DummyAst_Node() end_sentinel.startpos = text.endpos for node, next_node in zip(ast_nodes, ast_nodes[1:] + [end_sentinel]): startpos = node.startpos next_startpos = next_node.startpos assert startpos < next_startpos # We have the start position of this node. Figure out the end # position, excluding noncode lines (standalone comments and blank # lines). if hasattr(node, 'endpos'): # We have an endpos for the node because this was a multi-line # string. Start with the node endpos. endpos = node.endpos assert startpos < endpos <= next_startpos # enpos points to the character *after* the ending quote, so we # know that this is never at the beginning of the line. assert endpos.colno != 1 # Advance past whitespace an inline comment, if any. Do NOT # advance past other code that could be on the same line, nor past # blank lines and comments on subsequent lines. line = text[endpos : min(text.endpos, FilePos(endpos.lineno+1,1))] if _is_comment_or_blank(line): endpos = FilePos(endpos.lineno+1, 1) else: endpos = next_startpos assert endpos <= text.endpos # We don't have an endpos yet; what we do have is the next node's # startpos (or the position at the end of the text). Start there # and work backward. if endpos.colno != 1: if endpos == text.endpos: # There could be a comment on the last line and no # trailing newline. # TODO: do this in a more principled way. if _is_comment_or_blank(text[endpos.lineno]): assert startpos.lineno < endpos.lineno if not text[endpos.lineno-1].endswith("\\"): endpos = FilePos(endpos.lineno,1) else: # We're not at end of file, yet the next node starts in # the middle of the line. This should only happen with if # we're not looking at a comment. [The first character in # the line could still be "#" if we're inside a multiline # string that's the last child of the parent node. # Therefore we don't assert 'not # _is_comment_or_blank(...)'.] pass if endpos.colno == 1: while (endpos.lineno-1 > startpos.lineno and _is_comment_or_blank(text[endpos.lineno-1]) and (not text[endpos.lineno-2].endswith("\\") or _is_comment_or_blank(text[endpos.lineno-2]))): endpos = FilePos(endpos.lineno-1, 1) assert startpos < endpos <= next_startpos yield ([node], text[startpos:endpos]) if endpos != next_startpos: yield ([], text[endpos:next_startpos]) def _ast_node_is_in_docstring_position(ast_node): """ Given a ``Str`` AST node, return whether its position within the AST makes it eligible as a docstring. The main way a ``Str`` can be a docstring is if it is a standalone string at the beginning of a ``Module``, ``FunctionDef``, or ``ClassDef``. We also support variable docstrings per Epydoc: - If a variable assignment statement is immediately followed by a bare string literal, then that assignment is treated as a docstring for that variable. :type ast_node: ``ast.Str`` :param ast_node: AST node that has been annotated by ``_annotate_ast_nodes``. :rtype: ``bool`` :return: Whether this string ast node is in docstring position. """ if not isinstance(ast_node, (ast.Str, Bytes)): raise TypeError expr_node = ast_node.context.parent if not isinstance(expr_node, ast.Expr): return False assert ast_node.context.field == 'value' assert ast_node.context.index is None expr_ctx = expr_node.context if expr_ctx.field != 'body': return False parent_node = expr_ctx.parent if not isinstance(parent_node, (ast.FunctionDef, ast.ClassDef, ast.Module)): return False if expr_ctx.index == 0: return True prev_sibling_node = parent_node.body[expr_ctx.index-1] if isinstance(prev_sibling_node, ast.Assign): return True return False def infer_compile_mode(arg): """ Infer the mode needed to compile ``arg``. :type arg: ``ast.AST`` :rtype: ``str`` """ # Infer mode from ast object. if isinstance(arg, ast.Module): mode = "exec" elif isinstance(arg, ast.Expression): mode = "eval" elif isinstance(arg, ast.Interactive): mode = "single" else: raise TypeError( "Expected Module/Expression/Interactive ast node; got %s" % (type(arg).__name__)) return mode class _DummyAst_Node(object): pass class PythonStatement(object): r""" Representation of a top-level Python statement or consecutive comments/blank lines. >>> PythonStatement('print("x",\n file=None)\n', flags='print_function') #doctest: +SKIP PythonStatement('print("x",\n file=None)\n', flags=0x10000) Implemented as a wrapper around a `PythonBlock` containing at most one top-level AST node. """ def __new__(cls, arg, filename=None, startpos=None, flags=None): if isinstance(arg, cls): if filename is startpos is flags is None: return arg arg = arg.block # Fall through if isinstance(arg, (PythonBlock, FileText, str, six.text_type)): block = PythonBlock(arg, filename=filename, startpos=startpos, flags=flags) statements = block.statements if len(statements) != 1: raise ValueError( "Code contains %d statements instead of exactly 1: %r" % (len(statements), block)) statement, = statements assert isinstance(statement, cls) return statement raise TypeError("PythonStatement: unexpected %s" % (type(arg).__name__,)) @classmethod def _construct_from_block(cls, block): # Only to be used by PythonBlock. assert isinstance(block, PythonBlock) self = object.__new__(cls) self.block = block return self @property def text(self): """ :rtype: `FileText` """ return self.block.text @property def filename(self): """ :rtype: `Filename` """ return self.text.filename @property def startpos(self): """ :rtype: `FilePos` """ return self.text.startpos @property def flags(self): """ :rtype: `CompilerFlags` """ return self.block.flags @property def ast_node(self): """ A single AST node representing this statement, or ``None`` if this object only represents comments/blanks. :rtype: ``ast.AST`` or ``NoneType`` """ ast_nodes = self.block.ast_node.body if len(ast_nodes) == 0: return None if len(ast_nodes) == 1: return ast_nodes[0] raise AssertionError("More than one AST node in block") @property def is_comment_or_blank(self): return self.ast_node is None @property def is_comment_or_blank_or_string_literal(self): return (self.is_comment_or_blank or _ast_str_literal_value(self.ast_node) is not None) @property def is_import(self): return isinstance(self.ast_node, (ast.Import, ast.ImportFrom)) @property def is_single_assign(self): n = self.ast_node return isinstance(n, ast.Assign) and len(n.targets) == 1 def get_assignment_literal_value(self): """ If the statement is an assignment, return the name and literal value. >>> PythonStatement('foo = {1: {2: 3}}').get_assignment_literal_value() ('foo', {1: {2: 3}}) :return: (target, literal_value) """ if not self.is_single_assign: raise ValueError( "Statement is not an assignment to a single name: %s" % self) n = self.ast_node target_name = n.targets[0].id literal_value = ast.literal_eval(n.value) return (target_name, literal_value) def __repr__(self): r = repr(self.block) assert r.startswith("PythonBlock(") r = "PythonStatement(" + r[12:] return r def __eq__(self, other): if self is other: return True if not isinstance(other, PythonStatement): return NotImplemented return self.block == other.block def __ne__(self, other): return not (self == other) # The rest are defined by total_ordering def __lt__(self, other): if not isinstance(other, PythonStatement): return NotImplemented return self.block < other.block def __cmp__(self, other): if self is other: return 0 if not isinstance(other, PythonStatement): return NotImplemented return cmp(self.block, other.block) def __hash__(self): return hash(self.block) @total_ordering class PythonBlock(object): r""" Representation of a sequence of consecutive top-level `PythonStatement` (s). >>> source_code = '# 1\nprint(2)\n# 3\n# 4\nprint(5)\nx=[6,\n 7]\n# 8\n' >>> codeblock = PythonBlock(source_code) >>> for stmt in PythonBlock(codeblock).statements: ... print(stmt) PythonStatement('# 1\n') PythonStatement('print(2)\n', startpos=(2,1)) PythonStatement('# 3\n# 4\n', startpos=(3,1)) PythonStatement('print(5)\n', startpos=(5,1)) PythonStatement('x=[6,\n 7]\n', startpos=(6,1)) PythonStatement('# 8\n', startpos=(8,1)) A ``PythonBlock`` has a ``flags`` attribute that gives the compiler_flags associated with the __future__ features using which the code should be parsed. """ def __new__(cls, arg, filename=None, startpos=None, flags=None, auto_flags=None): if isinstance(arg, PythonStatement): arg = arg.block # Fall through if isinstance(arg, cls): if filename is startpos is flags is None: return arg flags = CompilerFlags(flags, arg.flags) arg = arg.text # Fall through if isinstance(arg, (FileText, Filename, str, six.text_type)): return cls.from_text( arg, filename=filename, startpos=startpos, flags=flags, auto_flags=auto_flags) raise TypeError("%s: unexpected %s" % (cls.__name__, type(arg).__name__,)) @classmethod def from_filename(cls, filename): return cls.from_text(Filename(filename)) @classmethod def from_text(cls, text, filename=None, startpos=None, flags=None, auto_flags=False): """ :type text: `FileText` or convertible :type filename: ``Filename`` :param filename: Filename, if not already given by ``text``. :type startpos: ``FilePos`` :param startpos: Starting position, if not already given by ``text``. :type flags: ``CompilerFlags`` :param flags: Input compiler flags. :param auto_flags: Whether to try other flags if ``flags`` fails. :rtype: `PythonBlock` """ text = FileText(text, filename=filename, startpos=startpos) self = object.__new__(cls) self.text = text self._input_flags = CompilerFlags(flags) self._auto_flags = auto_flags return self @classmethod def __construct_from_annotated_ast(cls, annotated_ast_nodes, text, flags): # Constructor for internal use by _split_by_statement() or # concatenate(). ast_node = ast.Module(annotated_ast_nodes) ast_node.text = text ast_node.flags = flags if not hasattr(ast_node, "source_flags"): ast_node.source_flags = CompilerFlags.from_ast(annotated_ast_nodes) self = object.__new__(cls) self._ast_node_or_parse_exception = ast_node self.ast_node = ast_node self.annotated_ast_node = ast_node self.text = text self.flags = self._input_flags = flags self._auto_flags = False return self @classmethod def concatenate(cls, blocks, assume_contiguous=False): """ Concatenate a bunch of blocks into one block. :type blocks: sequence of `PythonBlock` s and/or `PythonStatement` s :param assume_contiguous: Whether to assume, without checking, that the input blocks were originally all contiguous. This must be set to True to indicate the caller understands the assumption; False is not implemented. """ if not assume_contiguous: raise NotImplementedError blocks = [PythonBlock(b) for b in blocks] if len(blocks) == 1: return blocks[0] assert blocks text = FileText.concatenate([b.text for b in blocks]) # The contiguous assumption is important here because ``ast_node`` # contains line information that would otherwise be wrong. ast_nodes = [n for b in blocks for n in b.annotated_ast_node.body] flags = blocks[0].flags return cls.__construct_from_annotated_ast(ast_nodes, text, flags) @property def filename(self): return self.text.filename @property def startpos(self): return self.text.startpos @property def endpos(self): return self.text.endpos @cached_attribute def _ast_node_or_parse_exception(self): """ Attempt to parse this block of code into an abstract syntax tree. Cached (including exception case). :return: Either ast_node or exception. """ # This attribute may also be set by __construct_from_annotated_ast(), # in which case this code does not run. try: return _parse_ast_nodes( self.text, self._input_flags, self._auto_flags, "exec") except Exception as e: # Add the filename to the exception message to be nicer. if self.text.filename: try: e = type(e)("While parsing %s: %s" % (self.text.filename, e)) except TypeError: # Exception takes more than one argument pass # Cache the exception to avoid re-attempting while debugging. return e @cached_attribute def parsable(self): """ Whether the contents of this ``PythonBlock`` are parsable as Python code, using the given flags. :rtype: ``bool`` """ return isinstance(self._ast_node_or_parse_exception, ast.AST) @cached_attribute def parsable_as_expression(self): """ Whether the contents of this ``PythonBlock`` are parsable as a single Python expression, using the given flags. :rtype: ``bool`` """ return self.parsable and self.expression_ast_node is not None @cached_attribute def ast_node(self): """ Parse this block of code into an abstract syntax tree. The returned object type is the kind of AST as returned by the ``compile`` built-in (rather than as returned by the older, deprecated ``compiler`` module). The code is parsed using mode="exec". The result is a ``ast.Module`` node, even if this block represents only a subset of the entire file. :rtype: ``ast.Module`` """ r = self._ast_node_or_parse_exception if isinstance(r, ast.AST): return r else: raise r @cached_attribute def annotated_ast_node(self): """ Return ``self.ast_node``, annotated in place with positions. All nodes are annotated with ``startpos``. All top-level nodes are annotated with ``endpos``. :rtype: ``ast.Module`` """ result = self.ast_node _annotate_ast_nodes(result) return result @cached_attribute def expression_ast_node(self): """ Return an ``ast.Expression`` if ``self.ast_node`` can be converted into one. I.e., return parse(self.text, mode="eval"), if possible. Otherwise, return ``None``. :rtype: ``ast.Expression`` """ node = self.ast_node if len(node.body) == 1 and isinstance(node.body[0], ast.Expr): return ast.Expression(node.body[0].value) else: return None def parse(self, mode=None): """ Parse the source text into an AST. :param mode: Compilation mode: "exec", "single", or "eval". "exec", "single", and "eval" work as the built-in ``compile`` function do. If ``None``, then default to "eval" if the input is a string with a single expression, else "exec". :rtype: ``ast.AST`` """ if mode == "exec": return self.ast_node elif mode == "eval": if self.expression_ast_node: return self.expression_ast_node else: raise SyntaxError elif mode == None: if self.expression_ast_node: return self.expression_ast_node else: return self.ast_node elif mode == "exec": raise NotImplementedError else: raise ValueError("parse(): invalid mode=%r" % (mode,)) def compile(self, mode=None): """ Parse into AST and compile AST into code. :rtype: ``CodeType`` """ ast_node = self.parse(mode=mode) mode = infer_compile_mode(ast_node) filename = str(self.filename or "") return compile(ast_node, filename, mode) @cached_attribute def statements(self): r""" Partition of this ``PythonBlock`` into individual ``PythonStatement`` s. Each one contains at most 1 top-level ast node. A ``PythonStatement`` can contain no ast node to represent comments. >>> code = "# multiline\n# comment\n'''multiline\nstring'''\nblah\n" >>> print(PythonBlock(code).statements) # doctest:+NORMALIZE_WHITESPACE (PythonStatement('# multiline\n# comment\n'), PythonStatement("'''multiline\nstring'''\n", startpos=(3,1)), PythonStatement('blah\n', startpos=(5,1))) :rtype: ``tuple`` of `PythonStatement` s """ node = self.annotated_ast_node nodes_subtexts = list(_split_code_lines(node.body, self.text)) if nodes_subtexts == [(self.ast_node.body, self.text)]: # This block is either all comments/blanks or a single statement # with no surrounding whitespace/comment lines. Return self. return (PythonStatement._construct_from_block(self),) cls = type(self) statement_blocks = [ cls.__construct_from_annotated_ast(subnodes, subtext, self.flags) for subnodes, subtext in nodes_subtexts] # Convert to statements. statements = [] for b in statement_blocks: statement = PythonStatement._construct_from_block(b) statements.append(statement) # Optimization: set the new sub-block's ``statements`` attribute # since we already know it contains exactly one statement, itself. assert 'statements' not in b.__dict__ b.statements = (statement,) return tuple(statements) @cached_attribute def source_flags(self): """ If the AST contains __future__ imports, then the compiler_flags associated with them. Otherwise, 0. The difference between ``source_flags`` and ``flags`` is that ``flags`` may be set by the caller (e.g. based on an earlier __future__ import) and include automatically guessed flags, whereas ``source_flags`` is only nonzero if this code itself contains __future__ imports. :rtype: `CompilerFlags` """ return self.ast_node.source_flags @cached_attribute def flags(self): """ The compiler flags for this code block, including both the input flags (possibly automatically guessed), and the flags from "__future__" imports in the source code text. :rtype: `CompilerFlags` """ return self.ast_node.flags def groupby(self, predicate): """ Partition this block of code into smaller blocks of code which consecutively have the same ``predicate``. :param predicate: Function that takes a `PythonStatement` and returns a value. :return: Generator that yields (group, `PythonBlock` s). """ cls = type(self) for pred, stmts in groupby(self.statements, predicate): blocks = [s.block for s in stmts] yield pred, cls.concatenate(blocks, assume_contiguous=True) def string_literals(self): r""" Yield all string literals anywhere in this block. The string literals have ``startpos`` attributes attached. >>> block = PythonBlock("'a' + ('b' + \n'c')") >>> [(f.s, f.startpos) for f in block.string_literals()] [('a', FilePos(1,1)), ('b', FilePos(1,8)), ('c', FilePos(2,1))] :return: Iterable of ``ast.Str`` or ``ast.Bytes`` nodes """ for node in _walk_ast_nodes_in_order(self.annotated_ast_node): if isinstance(node, (ast.Str, Bytes)): assert hasattr(node, 'startpos') yield node def _get_docstring_nodes(self): """ Yield docstring AST nodes. We consider the following to be docstrings:: - First literal string of function definitions, class definitions, and modules (the python standard) - Literal strings after assignments, per Epydoc :rtype: Generator of ``ast.Str`` nodes """ # This is similar to ``ast.get_docstring``, but: # - This function is recursive # - This function yields the node object, rather than the string # - This function yields multiple docstrings (even per ast node) # - This function doesn't raise TypeError on other AST types # - This function doesn't cleandoc # A previous implementation did # [n for n in self.string_literals() # if _ast_node_is_in_docstring_position(n)] # However, the method we now use is more straightforward, and doesn't # require first annotating each node with context information. docstring_containers = (ast.FunctionDef, ast.ClassDef, ast.Module) for node in _walk_ast_nodes_in_order(self.annotated_ast_node): if not isinstance(node, docstring_containers): continue if not node.body: continue # If the first body item is a literal string, then yield the node. if (isinstance(node.body[0], ast.Expr) and isinstance(node.body[0].value, ast.Str)): yield node.body[0].value for i in range(1, len(node.body)-1): # If a body item is an assignment and the next one is a # literal string, then yield the node for the literal string. n1, n2 = node.body[i], node.body[i+1] if (isinstance(n1, ast.Assign) and isinstance(n2, ast.Expr) and isinstance(n2.value, ast.Str)): yield n2.value def get_doctests(self): r""" Return doctests in this code. >>> PythonBlock("x\n'''\n >>> foo(bar\n ... + baz)\n'''\n").get_doctests() [PythonBlock('foo(bar\n + baz)\n', startpos=(3,2))] :rtype: ``list`` of `PythonStatement` s """ parser = IgnoreOptionsDocTestParser() doctest_blocks = [] filename = self.filename flags = self.flags for ast_node in self._get_docstring_nodes(): try: examples = parser.get_examples(ast_node.s) except Exception: blob = ast_node.s if len(blob) > 60: blob = blob[:60] + '...' # TODO: let caller decide how to handle logger.warning("Can't parse docstring; ignoring: %r", blob) continue for example in examples: lineno = ast_node.startpos.lineno + example.lineno colno = ast_node.startpos.colno + example.indent # dubious text = FileText(example.source, filename=filename, startpos=(lineno,colno)) try: block = PythonBlock(text, flags=flags) block.ast_node # make sure we can parse except Exception: blob = text.joined if len(blob) > 60: blob = blob[:60] + '...' logger.warning("Can't parse doctest; ignoring: %r", blob) continue doctest_blocks.append(block) return doctest_blocks def __repr__(self): r = "%s(%r" % (type(self).__name__, self.text.joined) if self.filename: r += ", filename=%r" % (str(self.filename),) if self.startpos != FilePos(): r += ", startpos=%s" % (self.startpos,) if self.flags != self.source_flags: r += ", flags=%s" % (self.flags,) r += ")" return r def __str__(self): return str(self.text) def __text__(self): return self.text def __eq__(self, other): if self is other: return True if not isinstance(other, PythonBlock): return NotImplemented return self.text == other.text and self.flags == other.flags def __ne__(self, other): return not (self == other) # The rest are defined by total_ordering def __lt__(self, other): if not isinstance(other, PythonBlock): return NotImplemented return (self.text, self.flags) < (other.text, other.flags) def __cmp__(self, other): if self is other: return 0 if not isinstance(other, PythonBlock): return NotImplemented return cmp(self.text, other.text) or cmp(self.flags, other.flags) def __hash__(self): h = hash((self.text, self.flags)) self.__hash__ = lambda: h return h class IgnoreOptionsDocTestParser(DocTestParser): def _find_options(self, source, name, lineno): # Ignore doctest options. We don't use them, and we don't want to # error on unknown options, which is what the default DocTestParser # does. return {}