"""
|
|
The ``Parser`` tries to convert the available Python code in an easy to read
|
|
format, something like an abstract syntax tree. The classes who represent this
|
|
tree, are sitting in the :mod:`jedi.parser.tree` module.
|
|
|
|
The Python module ``tokenize`` is a very important part in the ``Parser``,
|
|
because it splits the code into different words (tokens). Sometimes it looks a
|
|
bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
|
|
module for this? Well, ``ast`` does a very good job understanding proper Python
|
|
code, but fails to work as soon as there's a single line of broken code.
|
|
|
|
There's one important optimization that needs to be known: Statements are not
|
|
being parsed completely. ``Statement`` is just a representation of the tokens
|
|
within the statement. This lowers memory usage and cpu time and reduces the
|
|
complexity of the ``Parser`` (there's another parser sitting inside
|
|
``Statement``, which produces ``Array`` and ``Call``).
|
|
"""
import os
import re

from jedi.parser import tree as pt
from jedi.parser import tokenize
from jedi.parser import token
from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
                               STRING, OP, ERRORTOKEN)
from jedi.parser.pgen2.pgen import generate_grammar
from jedi.parser.pgen2.parse import PgenParser

OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or'
# Not used yet. In the future I intend to add something like KeywordStatement
STATEMENT_KEYWORDS = 'assert', 'del', 'global', 'nonlocal', 'raise', \
    'return', 'yield', 'pass', 'continue', 'break'


_loaded_grammars = {}


def load_grammar(file='grammar3.4'):
    # For now we only support two different Python syntax versions: The latest
    # Python 3 and Python 2. This may change.
    if file.startswith('grammar3'):
        file = 'grammar3.4'
    else:
        file = 'grammar2.7'

    global _loaded_grammars
    path = os.path.join(os.path.dirname(__file__), file) + '.txt'
    try:
        return _loaded_grammars[path]
    except KeyError:
        return _loaded_grammars.setdefault(path, generate_grammar(path))
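# For example (illustrative): ``load_grammar('grammar3.3')`` and
# ``load_grammar('grammar3.4')`` both fall back to the ``grammar3.4.txt`` file
# next to this module, so they share a single cached grammar object in
# ``_loaded_grammars``.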


class ErrorStatement(object):
    def __init__(self, stack, next_token, position_modifier, next_start_pos):
        self.stack = stack
        self._position_modifier = position_modifier
        self.next_token = next_token
        self._next_start_pos = next_start_pos

    @property
    def next_start_pos(self):
        s = self._next_start_pos
        return s[0] + self._position_modifier.line, s[1]

    @property
    def first_pos(self):
        first_type, nodes = self.stack[0]
        return nodes[0].start_pos

    @property
    def first_type(self):
        first_type, nodes = self.stack[0]
        return first_type


class ParserSyntaxError(object):
    def __init__(self, message, position):
        self.message = message
        self.position = position


class Parser(object):
    """
    This class is used to parse a Python file and divide it into a class
    structure of different scopes.

    :param grammar: The grammar object of pgen2, loaded by ``load_grammar``.
    :param source: The source code to parse. Must be unicode.
    :param module_path: The path of the module in the file system, may be None.
    :type module_path: str
    :param tokenizer: A custom token generator. If None, ``source`` is
        tokenized with :func:`jedi.parser.tokenize.source_tokens`.
    """
    def __init__(self, grammar, source, module_path=None, tokenizer=None):
        self._ast_mapping = {
            'expr_stmt': pt.ExprStmt,
            'classdef': pt.Class,
            'funcdef': pt.Function,
            'file_input': pt.Module,
            'import_name': pt.ImportName,
            'import_from': pt.ImportFrom,
            'break_stmt': pt.KeywordStatement,
            'continue_stmt': pt.KeywordStatement,
            'return_stmt': pt.ReturnStmt,
            'raise_stmt': pt.KeywordStatement,
            'yield_expr': pt.YieldExpr,
            'del_stmt': pt.KeywordStatement,
            'pass_stmt': pt.KeywordStatement,
            'global_stmt': pt.GlobalStmt,
            'nonlocal_stmt': pt.KeywordStatement,
            'assert_stmt': pt.AssertStmt,
            'if_stmt': pt.IfStmt,
            'with_stmt': pt.WithStmt,
            'for_stmt': pt.ForStmt,
            'while_stmt': pt.WhileStmt,
            'try_stmt': pt.TryStmt,
            'comp_for': pt.CompFor,
            'decorator': pt.Decorator,
            'lambdef': pt.Lambda,
            'old_lambdef': pt.Lambda,
            'lambdef_nocond': pt.Lambda,
        }

        self.syntax_errors = []

        self._global_names = []
        self._omit_dedent_list = []
        self._indent_counter = 0
        self._last_failed_start_pos = (0, 0)

        # TODO do print and absolute import detection here.
        #try:
        #    del python_grammar_no_print_statement.keywords["print"]
        #except KeyError:
        #    pass  # Doesn't exist in the Python 3 grammar.

        #if self.options["print_function"]:
        #    python_grammar = pygram.python_grammar_no_print_statement
        #else:
        self._used_names = {}
        self._scope_names_stack = [{}]
        self._error_statement_stacks = []

        added_newline = False
        # The Python grammar needs a newline at the end of each statement.
        if not source.endswith('\n'):
            source += '\n'
            added_newline = True

        # For the fast parser.
        self.position_modifier = pt.PositionModifier()
        p = PgenParser(grammar, self.convert_node, self.convert_leaf,
                       self.error_recovery)
        tokenizer = tokenizer or tokenize.source_tokens(source)
        self.module = p.parse(self._tokenize(tokenizer))
        if self.module.type != 'file_input':
            # If there's only one statement, we get back a non-module. That's
            # not what we want, we want a module, so we add it here:
            self.module = self.convert_node(grammar,
                                            grammar.symbol2number['file_input'],
                                            [self.module])

        if added_newline:
            self.remove_last_newline()
        self.module.used_names = self._used_names
        self.module.path = module_path
        self.module.global_names = self._global_names
        self.module.error_statement_stacks = self._error_statement_stacks

    def convert_node(self, grammar, type, children):
        """
        Convert raw node information to a Node instance.

        This is passed to the parser driver, which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that the
        tree is built strictly bottom-up.
        """
        symbol = grammar.number2symbol[type]
        try:
            new_node = self._ast_mapping[symbol](children)
        except KeyError:
            new_node = pt.Node(symbol, children)

        # We need to check raw_node always, because the same node can be
        # returned by convert multiple times.
        if symbol == 'global_stmt':
            self._global_names += new_node.get_global_names()
        elif isinstance(new_node, pt.Lambda):
            new_node.names_dict = self._scope_names_stack.pop()
        elif isinstance(new_node, (pt.ClassOrFunc, pt.Module)) \
                and symbol in ('funcdef', 'classdef', 'file_input'):
            # scope_names_stack handling
            scope_names = self._scope_names_stack.pop()
            if isinstance(new_node, pt.ClassOrFunc):
                n = new_node.name
                scope_names[n.value].remove(n)
                # Set the func name of the current node
                arr = self._scope_names_stack[-1].setdefault(n.value, [])
                arr.append(n)
            new_node.names_dict = scope_names
        elif isinstance(new_node, pt.CompFor):
            # The name definitions of comprehensions shouldn't be part of the
            # current scope. They are part of the comprehension scope.
            for n in new_node.get_defined_names():
                self._scope_names_stack[-1][n.value].remove(n)
        return new_node

    def convert_leaf(self, grammar, type, value, prefix, start_pos):
        #print('leaf', value, pytree.type_repr(type))
        if type == tokenize.NAME:
            if value in grammar.keywords:
                if value in ('def', 'class', 'lambda'):
                    self._scope_names_stack.append({})

                return pt.Keyword(self.position_modifier, value, start_pos, prefix)
            else:
                name = pt.Name(self.position_modifier, value, start_pos, prefix)
                # Keep a listing of all used names
                arr = self._used_names.setdefault(name.value, [])
                arr.append(name)
                arr = self._scope_names_stack[-1].setdefault(name.value, [])
                arr.append(name)
                return name
        elif type == STRING:
            return pt.String(self.position_modifier, value, start_pos, prefix)
        elif type == NUMBER:
            return pt.Number(self.position_modifier, value, start_pos, prefix)
        elif type in (NEWLINE, ENDMARKER):
            return pt.Whitespace(self.position_modifier, value, start_pos, prefix)
        else:
            return pt.Operator(self.position_modifier, value, start_pos, prefix)

    def error_recovery(self, grammar, stack, typ, value, start_pos, prefix,
                       add_token_callback):
        """
        This parser is written in a dynamic way, meaning that it allows using
        different grammars (even non-Python ones). However, error recovery is
        written purely for Python.
        """
        def current_suite(stack):
            # For now just discard everything that is not a suite or
            # file_input, if we detect an error.
            for index, (dfa, state, (typ, nodes)) in reversed(list(enumerate(stack))):
                # `suite` can sometimes be only simple_stmt, not stmt.
                symbol = grammar.number2symbol[typ]
                if symbol == 'file_input':
                    break
                elif symbol == 'suite' and len(nodes) > 1:
                    # suites without an indent in them get discarded.
                    break
                elif symbol == 'simple_stmt' and len(nodes) > 1:
                    # simple_stmt can just be turned into a Node, if there are
                    # enough statements. Ignore the rest after that.
                    break
            return index, symbol, nodes

        index, symbol, nodes = current_suite(stack)
        if symbol == 'simple_stmt':
            index -= 2
            (_, _, (typ, suite_nodes)) = stack[index]
            symbol = grammar.number2symbol[typ]
            suite_nodes.append(pt.Node(symbol, list(nodes)))
            # Remove the nodes from their old position.
            nodes[:] = []
            nodes = suite_nodes

        #print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
        self._stack_removal(grammar, stack, index + 1, value, start_pos)
        if typ == INDENT:
            # For every deleted INDENT we have to delete a DEDENT as well.
            # Otherwise the parser will get into trouble and DEDENT too early.
            self._omit_dedent_list.append(self._indent_counter)

        if value in ('import', 'from', 'class', 'def', 'try', 'while', 'return'):
            # Those can always be new statements.
            add_token_callback(typ, value, prefix, start_pos)
        elif typ == DEDENT and symbol == 'suite':
            # Close the current suite, with DEDENT.
            # Note that this may cause some suites to not contain any
            # statements at all. This is contrary to valid Python syntax. We
            # keep incomplete suites in Jedi to be able to complete param names
            # or `with ... as foo` names. If we want to use this parser for
            # syntax checks, we have to check in a separate pass whether suites
            # contain statements or not. However, a second check is necessary
            # anyway (compile.c does that for Python), because Python's grammar
            # doesn't stop you from defining `continue` in a module, etc.
            add_token_callback(typ, value, prefix, start_pos)

    def _stack_removal(self, grammar, stack, start_index, value, start_pos):
        def clear_names(children):
            for c in children:
                try:
                    clear_names(c.children)
                except AttributeError:
                    if isinstance(c, pt.Name):
                        try:
                            self._scope_names_stack[-1][c.value].remove(c)
                            self._used_names[c.value].remove(c)
                        except ValueError:
                            pass  # This may happen with CompFor.

        for dfa, state, node in stack[start_index:]:
            clear_names(children=node[1])

        failed_stack = []
        found = False
        for dfa, state, (typ, nodes) in stack[start_index:]:
            if nodes:
                found = True
            if found:
                symbol = grammar.number2symbol[typ]
                failed_stack.append((symbol, nodes))
                if nodes and nodes[0] in ('def', 'class', 'lambda'):
                    self._scope_names_stack.pop()
        if failed_stack:
            err = ErrorStatement(failed_stack, value, self.position_modifier, start_pos)
            self._error_statement_stacks.append(err)

        self._last_failed_start_pos = start_pos

        stack[start_index:] = []

    def _tokenize(self, tokenizer):
        for typ, value, start_pos, prefix in tokenizer:
            #print(tokenize.tok_name[typ], repr(value), start_pos, repr(prefix))
            if typ == DEDENT:
                # We need to count indents, because if we just omit any DEDENT,
                # we might omit them in the wrong place.
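                # Illustrative example: when error recovery drops the INDENT
                # that opened suite number 3 (``_omit_dedent_list == [3]``),
                # the DEDENT that closes that same suite arrives while
                # ``_indent_counter == 3`` and is swallowed below, keeping the
                # token stream balanced.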
                o = self._omit_dedent_list
                if o and o[-1] == self._indent_counter:
                    o.pop()
                    continue

                self._indent_counter -= 1
            elif typ == INDENT:
                self._indent_counter += 1
            elif typ == ERRORTOKEN:
                self._add_syntax_error('Strange token', start_pos)
                continue

            if typ == OP:
                typ = token.opmap[value]
            yield typ, value, prefix, start_pos

    def _add_syntax_error(self, message, position):
        self.syntax_errors.append(ParserSyntaxError(message, position))

    def __repr__(self):
        return "<%s: %s>" % (type(self).__name__, self.module)

    def remove_last_newline(self):
        """
        Everything here works with ``_start_pos``, because working with
        ``start_pos`` would mean checking the ``position_modifier`` as well
        (it is already accounted for in the ``start_pos`` property).
        """
        endmarker = self.module.children[-1]
        # The newline is either part of the endmarker's prefix, or the
        # previous leaf is a newline token.
        if endmarker.prefix.endswith('\n'):
            endmarker.prefix = endmarker.prefix[:-1]
            last_line = re.sub('.*\n', '', endmarker.prefix)
            endmarker._start_pos = endmarker._start_pos[0] - 1, len(last_line)
        else:
            try:
                newline = endmarker.get_previous()
            except IndexError:
                return  # This means that the parser is empty.
            while True:
                if newline.value == '':
                    # Must be a DEDENT, just continue.
                    try:
                        newline = newline.get_previous()
                    except IndexError:
                        # If there's a statement that failed to be parsed, there
                        # will be no previous leaf. So just ignore it.
                        break
                elif newline.value != '\n':
                    # This may happen if error correction strikes and removes
                    # a whole statement including '\n'.
                    break
                else:
                    newline.value = ''
                    if self._last_failed_start_pos > newline._start_pos:
                        # It may be that there was a syntax error in a
                        # function. In that case error correction removes the
                        # right newline, so we use the previously assigned
                        # _last_failed_start_pos variable to account for that.
                        endmarker._start_pos = self._last_failed_start_pos
                    else:
                        endmarker._start_pos = newline._start_pos
                    break