# cadquery-freecad-module/CadQuery/Libs/jedi/parser/__init__.py
"""
The ``Parser`` tries to convert the available Python code in an easy to read
format, something like an abstract syntax tree. The classes who represent this
tree, are sitting in the :mod:`jedi.parser.tree` module.
The Python module ``tokenize`` is a very important part in the ``Parser``,
because it splits the code into different words (tokens). Sometimes it looks a
bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
module for this? Well, ``ast`` does a very good job understanding proper Python
code, but fails to work as soon as there's a single line of broken code.
There's one important optimization that needs to be known: Statements are not
being parsed completely. ``Statement`` is just a representation of the tokens
within the statement. This lowers memory usage and cpu time and reduces the
complexity of the ``Parser`` (there's another parser sitting inside
``Statement``, which produces ``Array`` and ``Call``).
"""
import os
import re

from jedi.parser import tree as pt
from jedi.parser import tokenize
from jedi.parser import token
from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
                               STRING, OP, ERRORTOKEN)
from jedi.parser.pgen2.pgen import generate_grammar
from jedi.parser.pgen2.parse import PgenParser

OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or'
# Not used yet. In the future I intend to add something like KeywordStatement
STATEMENT_KEYWORDS = 'assert', 'del', 'global', 'nonlocal', 'raise', \
    'return', 'yield', 'pass', 'continue', 'break'

_loaded_grammars = {}


def load_grammar(file='grammar3.4'):
# For now we only support two different Python syntax versions: The latest
# Python 3 and Python 2. This may change.
if file.startswith('grammar3'):
file = 'grammar3.4'
else:
file = 'grammar2.7'
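    # Note: every ``grammar3*`` request collapses onto grammar3.4 and anything
    # else onto grammar2.7, so e.g. load_grammar('grammar3.3') returns the same
    # cached grammar object as load_grammar('grammar3.4').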
global _loaded_grammars
path = os.path.join(os.path.dirname(__file__), file) + '.txt'
try:
return _loaded_grammars[path]
except KeyError:
return _loaded_grammars.setdefault(path, generate_grammar(path))


class ErrorStatement(object):
    def __init__(self, stack, next_token, position_modifier, next_start_pos):
        self.stack = stack
        self._position_modifier = position_modifier
        self.next_token = next_token
        self._next_start_pos = next_start_pos

    @property
    def next_start_pos(self):
        s = self._next_start_pos
        return s[0] + self._position_modifier.line, s[1]

    @property
    def first_pos(self):
        first_type, nodes = self.stack[0]
        return nodes[0].start_pos

    @property
    def first_type(self):
        first_type, nodes = self.stack[0]
        return first_type


class ParserSyntaxError(object):
    def __init__(self, message, position):
        self.message = message
        self.position = position


class Parser(object):
"""
This class is used to parse a Python file, it then divides them into a
class structure of different scopes.
:param grammar: The grammar object of pgen2. Loaded by load_grammar.
:param source: The codebase for the parser. Must be unicode.
:param module_path: The path of the module in the file system, may be None.
:type module_path: str
:param top_module: Use this module as a parent instead of `self.module`.
"""
def __init__(self, grammar, source, module_path=None, tokenizer=None):
self._ast_mapping = {
'expr_stmt': pt.ExprStmt,
'classdef': pt.Class,
'funcdef': pt.Function,
'file_input': pt.Module,
'import_name': pt.ImportName,
'import_from': pt.ImportFrom,
'break_stmt': pt.KeywordStatement,
'continue_stmt': pt.KeywordStatement,
'return_stmt': pt.ReturnStmt,
'raise_stmt': pt.KeywordStatement,
'yield_expr': pt.YieldExpr,
'del_stmt': pt.KeywordStatement,
'pass_stmt': pt.KeywordStatement,
'global_stmt': pt.GlobalStmt,
'nonlocal_stmt': pt.KeywordStatement,
'assert_stmt': pt.AssertStmt,
'if_stmt': pt.IfStmt,
'with_stmt': pt.WithStmt,
'for_stmt': pt.ForStmt,
'while_stmt': pt.WhileStmt,
'try_stmt': pt.TryStmt,
'comp_for': pt.CompFor,
'decorator': pt.Decorator,
'lambdef': pt.Lambda,
'old_lambdef': pt.Lambda,
'lambdef_nocond': pt.Lambda,
}
self.syntax_errors = []
self._global_names = []
self._omit_dedent_list = []
self._indent_counter = 0
self._last_failed_start_pos = (0, 0)
# TODO do print absolute import detection here.
#try:
# del python_grammar_no_print_statement.keywords["print"]
#except KeyError:
# pass # Doesn't exist in the Python 3 grammar.
#if self.options["print_function"]:
# python_grammar = pygram.python_grammar_no_print_statement
#else:
self._used_names = {}
self._scope_names_stack = [{}]
self._error_statement_stacks = []
added_newline = False
# The Python grammar needs a newline at the end of each statement.
if not source.endswith('\n'):
source += '\n'
added_newline = True
# For the fast parser.
self.position_modifier = pt.PositionModifier()
p = PgenParser(grammar, self.convert_node, self.convert_leaf,
self.error_recovery)
tokenizer = tokenizer or tokenize.source_tokens(source)
self.module = p.parse(self._tokenize(tokenizer))
if self.module.type != 'file_input':
# If there's only one statement, we get back a non-module. That's
# not what we want, we want a module, so we add it here:
self.module = self.convert_node(grammar,
grammar.symbol2number['file_input'],
[self.module])
if added_newline:
self.remove_last_newline()
self.module.used_names = self._used_names
self.module.path = module_path
self.module.global_names = self._global_names
        self.module.error_statement_stacks = self._error_statement_stacks

    def convert_node(self, grammar, type, children):
"""
Convert raw node information to a Node instance.
This is passed to the parser driver which calls it whenever a reduction of a
grammar rule produces a new complete node, so that the tree is build
strictly bottom-up.
"""
symbol = grammar.number2symbol[type]
try:
new_node = self._ast_mapping[symbol](children)
except KeyError:
new_node = pt.Node(symbol, children)
# We need to check raw_node always, because the same node can be
# returned by convert multiple times.
if symbol == 'global_stmt':
self._global_names += new_node.get_global_names()
elif isinstance(new_node, pt.Lambda):
new_node.names_dict = self._scope_names_stack.pop()
elif isinstance(new_node, (pt.ClassOrFunc, pt.Module)) \
and symbol in ('funcdef', 'classdef', 'file_input'):
# scope_name_stack handling
scope_names = self._scope_names_stack.pop()
if isinstance(new_node, pt.ClassOrFunc):
n = new_node.name
scope_names[n.value].remove(n)
# Set the func name of the current node
arr = self._scope_names_stack[-1].setdefault(n.value, [])
arr.append(n)
new_node.names_dict = scope_names
elif isinstance(new_node, pt.CompFor):
            # The name definitions of comprehensions shouldn't be part of the
# current scope. They are part of the comprehension scope.
for n in new_node.get_defined_names():
self._scope_names_stack[-1][n.value].remove(n)
        return new_node

    def convert_leaf(self, grammar, type, value, prefix, start_pos):
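        # Turn a single token into the matching leaf class: keywords that open
        # a new scope (``def``, ``class``, ``lambda``) push a fresh names dict,
        # plain names are recorded in ``_used_names`` and the current scope,
        # everything else becomes a String/Number/Whitespace/Operator leaf.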
#print('leaf', value, pytree.type_repr(type))
if type == tokenize.NAME:
if value in grammar.keywords:
if value in ('def', 'class', 'lambda'):
self._scope_names_stack.append({})
return pt.Keyword(self.position_modifier, value, start_pos, prefix)
else:
name = pt.Name(self.position_modifier, value, start_pos, prefix)
# Keep a listing of all used names
arr = self._used_names.setdefault(name.value, [])
arr.append(name)
arr = self._scope_names_stack[-1].setdefault(name.value, [])
arr.append(name)
return name
elif type == STRING:
return pt.String(self.position_modifier, value, start_pos, prefix)
elif type == NUMBER:
return pt.Number(self.position_modifier, value, start_pos, prefix)
elif type in (NEWLINE, ENDMARKER):
return pt.Whitespace(self.position_modifier, value, start_pos, prefix)
else:
            return pt.Operator(self.position_modifier, value, start_pos, prefix)

    def error_recovery(self, grammar, stack, typ, value, start_pos, prefix,
                       add_token_callback):
"""
This parser is written in a dynamic way, meaning that this parser
allows using different grammars (even non-Python). However, error
recovery is purely written for Python.
"""
def current_suite(stack):
# For now just discard everything that is not a suite or
# file_input, if we detect an error.
for index, (dfa, state, (typ, nodes)) in reversed(list(enumerate(stack))):
# `suite` can sometimes be only simple_stmt, not stmt.
symbol = grammar.number2symbol[typ]
if symbol == 'file_input':
break
elif symbol == 'suite' and len(nodes) > 1:
# suites without an indent in them get discarded.
break
elif symbol == 'simple_stmt' and len(nodes) > 1:
# simple_stmt can just be turned into a Node, if there are
# enough statements. Ignore the rest after that.
break
return index, symbol, nodes
index, symbol, nodes = current_suite(stack)
if symbol == 'simple_stmt':
index -= 2
(_, _, (typ, suite_nodes)) = stack[index]
symbol = grammar.number2symbol[typ]
suite_nodes.append(pt.Node(symbol, list(nodes)))
            # Remove the nodes from the failed simple_stmt and continue in the
            # enclosing suite instead.
nodes[:] = []
nodes = suite_nodes
#print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
self._stack_removal(grammar, stack, index + 1, value, start_pos)
if typ == INDENT:
# For every deleted INDENT we have to delete a DEDENT as well.
# Otherwise the parser will get into trouble and DEDENT too early.
self._omit_dedent_list.append(self._indent_counter)
if value in ('import', 'from', 'class', 'def', 'try', 'while', 'return'):
# Those can always be new statements.
add_token_callback(typ, value, prefix, start_pos)
elif typ == DEDENT and symbol == 'suite':
# Close the current suite, with DEDENT.
# Note that this may cause some suites to not contain any
# statements at all. This is contrary to valid Python syntax. We
# keep incomplete suites in Jedi to be able to complete param names
# or `with ... as foo` names. If we want to use this parser for
# syntax checks, we have to check in a separate turn if suites
# contain statements or not. However, a second check is necessary
# anyway (compile.c does that for Python), because Python's grammar
# doesn't stop you from defining `continue` in a module, etc.
            add_token_callback(typ, value, prefix, start_pos)

    def _stack_removal(self, grammar, stack, start_index, value, start_pos):
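        # Drop the failed part of the pgen stack: unregister the names that the
        # removed nodes defined, pop scope dicts of unfinished def/class/lambda
        # nodes and remember the removed productions as an ErrorStatement.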
def clear_names(children):
for c in children:
try:
clear_names(c.children)
except AttributeError:
if isinstance(c, pt.Name):
try:
self._scope_names_stack[-1][c.value].remove(c)
self._used_names[c.value].remove(c)
except ValueError:
pass # This may happen with CompFor.
for dfa, state, node in stack[start_index:]:
clear_names(children=node[1])
failed_stack = []
found = False
for dfa, state, (typ, nodes) in stack[start_index:]:
if nodes:
found = True
if found:
symbol = grammar.number2symbol[typ]
failed_stack.append((symbol, nodes))
if nodes and nodes[0] in ('def', 'class', 'lambda'):
self._scope_names_stack.pop()
if failed_stack:
err = ErrorStatement(failed_stack, value, self.position_modifier, start_pos)
self._error_statement_stacks.append(err)
self._last_failed_start_pos = start_pos
        stack[start_index:] = []

    def _tokenize(self, tokenizer):
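        # Thin wrapper around the token stream: keeps the INDENT/DEDENT counter
        # balanced (so error recovery can swallow matching DEDENTs), reports
        # ERRORTOKENs as syntax errors instead of passing them on, and maps
        # generic OP tokens to their specific operator type for pgen.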
for typ, value, start_pos, prefix in tokenizer:
#print(tokenize.tok_name[typ], repr(value), start_pos, repr(prefix))
if typ == DEDENT:
# We need to count indents, because if we just omit any DEDENT,
# we might omit them in the wrong place.
o = self._omit_dedent_list
if o and o[-1] == self._indent_counter:
o.pop()
continue
self._indent_counter -= 1
elif typ == INDENT:
self._indent_counter += 1
elif typ == ERRORTOKEN:
self._add_syntax_error('Strange token', start_pos)
continue
if typ == OP:
typ = token.opmap[value]
            yield typ, value, prefix, start_pos

    def _add_syntax_error(self, message, position):
        self.syntax_errors.append(ParserSyntaxError(message, position))

    def __repr__(self):
return "<%s: %s>" % (type(self).__name__, self.module)
def remove_last_newline(self):
"""
In all of this we need to work with _start_pos, because if we worked
with start_pos, we would need to check the position_modifier as well
(which is accounted for in the start_pos property).
"""
endmarker = self.module.children[-1]
# The newline is either in the endmarker as a prefix or the previous
# leaf as a newline token.
if endmarker.prefix.endswith('\n'):
endmarker.prefix = endmarker.prefix[:-1]
last_line = re.sub('.*\n', '', endmarker.prefix)
endmarker._start_pos = endmarker._start_pos[0] - 1, len(last_line)
else:
try:
newline = endmarker.get_previous()
except IndexError:
return # This means that the parser is empty.
while True:
if newline.value == '':
# Must be a DEDENT, just continue.
try:
newline = newline.get_previous()
except IndexError:
# If there's a statement that fails to be parsed, there
# will be no previous leaf. So just ignore it.
break
elif newline.value != '\n':
# This may happen if error correction strikes and removes
# a whole statement including '\n'.
break
else:
newline.value = ''
if self._last_failed_start_pos > newline._start_pos:
# It may be the case that there was a syntax error in a
# function. In that case error correction removes the
# right newline. So we use the previously assigned
# _last_failed_start_pos variable to account for that.
endmarker._start_pos = self._last_failed_start_pos
else:
endmarker._start_pos = newline._start_pos
break