"""
|
|
The ``Parser`` tries to convert the available Python code in an easy to read
|
|
format, something like an abstract syntax tree. The classes who represent this
|
|
tree, are sitting in the :mod:`jedi.parser.tree` module.
|
|
|
|
The Python module ``tokenize`` is a very important part in the ``Parser``,
|
|
because it splits the code into different words (tokens). Sometimes it looks a
|
|
bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
|
|
module for this? Well, ``ast`` does a very good job understanding proper Python
|
|
code, but fails to work as soon as there's a single line of broken code.
|
|
|
|
There's one important optimization that needs to be known: Statements are not
|
|
being parsed completely. ``Statement`` is just a representation of the tokens
|
|
within the statement. This lowers memory usage and cpu time and reduces the
|
|
complexity of the ``Parser`` (there's another parser sitting inside
|
|
``Statement``, which produces ``Array`` and ``Call``).
|
|
"""
import os
import re

from jedi.parser import tree as pt
from jedi.parser import tokenize
from jedi.parser import token
from jedi.parser.token import (DEDENT, INDENT, ENDMARKER, NEWLINE, NUMBER,
                               STRING, OP, ERRORTOKEN)
from jedi.parser.pgen2.pgen import generate_grammar
from jedi.parser.pgen2.parse import PgenParser

OPERATOR_KEYWORDS = 'and', 'for', 'if', 'else', 'in', 'is', 'lambda', 'not', 'or'
# Not used yet. In the future I intend to add something like KeywordStatement
STATEMENT_KEYWORDS = 'assert', 'del', 'global', 'nonlocal', 'raise', \
    'return', 'yield', 'pass', 'continue', 'break'


_loaded_grammars = {}


def load_grammar(file='grammar3.4'):
    # For now we only support two different Python syntax versions: The latest
    # Python 3 and Python 2. This may change.
    if file.startswith('grammar3'):
        file = 'grammar3.4'
    else:
        file = 'grammar2.7'

    global _loaded_grammars
    path = os.path.join(os.path.dirname(__file__), file) + '.txt'
    try:
        return _loaded_grammars[path]
    except KeyError:
        return _loaded_grammars.setdefault(path, generate_grammar(path))
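# For example (illustrative): ``load_grammar('grammar3.3')`` and
# ``load_grammar('grammar3.4')`` both fall back to the ``grammar3.4.txt`` file
# next to this module, so they share a single cached grammar object in
# ``_loaded_grammars``.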


class ErrorStatement(object):
    def __init__(self, stack, next_token, position_modifier, next_start_pos):
        self.stack = stack
        self._position_modifier = position_modifier
        self.next_token = next_token
        self._next_start_pos = next_start_pos

    @property
    def next_start_pos(self):
        s = self._next_start_pos
        return s[0] + self._position_modifier.line, s[1]

    @property
    def first_pos(self):
        first_type, nodes = self.stack[0]
        return nodes[0].start_pos

    @property
    def first_type(self):
        first_type, nodes = self.stack[0]
        return first_type


class ParserSyntaxError(object):
    def __init__(self, message, position):
        self.message = message
        self.position = position


class Parser(object):
    """
    This class is used to parse a Python file and divide it into a class
    structure of different scopes.

    :param grammar: The grammar object of pgen2, loaded by ``load_grammar``.
    :param source: The source code to parse. Must be unicode.
    :param module_path: The path of the module in the file system, may be None.
    :type module_path: str
    :param tokenizer: A custom token generator. If None, ``source`` is
        tokenized with :func:`jedi.parser.tokenize.source_tokens`.
    """
    def __init__(self, grammar, source, module_path=None, tokenizer=None):
        self._ast_mapping = {
            'expr_stmt': pt.ExprStmt,
            'classdef': pt.Class,
            'funcdef': pt.Function,
            'file_input': pt.Module,
            'import_name': pt.ImportName,
            'import_from': pt.ImportFrom,
            'break_stmt': pt.KeywordStatement,
            'continue_stmt': pt.KeywordStatement,
            'return_stmt': pt.ReturnStmt,
            'raise_stmt': pt.KeywordStatement,
            'yield_expr': pt.YieldExpr,
            'del_stmt': pt.KeywordStatement,
            'pass_stmt': pt.KeywordStatement,
            'global_stmt': pt.GlobalStmt,
            'nonlocal_stmt': pt.KeywordStatement,
            'assert_stmt': pt.AssertStmt,
            'if_stmt': pt.IfStmt,
            'with_stmt': pt.WithStmt,
            'for_stmt': pt.ForStmt,
            'while_stmt': pt.WhileStmt,
            'try_stmt': pt.TryStmt,
            'comp_for': pt.CompFor,
            'decorator': pt.Decorator,
            'lambdef': pt.Lambda,
            'old_lambdef': pt.Lambda,
            'lambdef_nocond': pt.Lambda,
        }

        self.syntax_errors = []

        self._global_names = []
        self._omit_dedent_list = []
        self._indent_counter = 0
        self._last_failed_start_pos = (0, 0)

        # TODO do print and absolute import detection here.
        #try:
        #    del python_grammar_no_print_statement.keywords["print"]
        #except KeyError:
        #    pass  # Doesn't exist in the Python 3 grammar.

        #if self.options["print_function"]:
        #    python_grammar = pygram.python_grammar_no_print_statement
        #else:
        self._used_names = {}
        self._scope_names_stack = [{}]
        self._error_statement_stacks = []

        added_newline = False
        # The Python grammar needs a newline at the end of each statement.
        if not source.endswith('\n'):
            source += '\n'
            added_newline = True

        # For the fast parser.
        self.position_modifier = pt.PositionModifier()
        p = PgenParser(grammar, self.convert_node, self.convert_leaf,
                       self.error_recovery)
        tokenizer = tokenizer or tokenize.source_tokens(source)
        self.module = p.parse(self._tokenize(tokenizer))
        if self.module.type != 'file_input':
            # If there's only one statement, we get back a non-module. That's
            # not what we want, we want a module, so we add it here:
            self.module = self.convert_node(grammar,
                                            grammar.symbol2number['file_input'],
                                            [self.module])

        if added_newline:
            self.remove_last_newline()
        self.module.used_names = self._used_names
        self.module.path = module_path
        self.module.global_names = self._global_names
        self.module.error_statement_stacks = self._error_statement_stacks

    def convert_node(self, grammar, type, children):
        """
        Convert raw node information to a Node instance.

        This is passed to the parser driver, which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that the
        tree is built strictly bottom-up.
        """
        symbol = grammar.number2symbol[type]
        try:
            new_node = self._ast_mapping[symbol](children)
        except KeyError:
            new_node = pt.Node(symbol, children)

        # We need to check raw_node always, because the same node can be
        # returned by convert multiple times.
        if symbol == 'global_stmt':
            self._global_names += new_node.get_global_names()
        elif isinstance(new_node, pt.Lambda):
            new_node.names_dict = self._scope_names_stack.pop()
        elif isinstance(new_node, (pt.ClassOrFunc, pt.Module)) \
                and symbol in ('funcdef', 'classdef', 'file_input'):
            # scope_names_stack handling
            scope_names = self._scope_names_stack.pop()
            if isinstance(new_node, pt.ClassOrFunc):
                n = new_node.name
                scope_names[n.value].remove(n)
                # Set the func name of the current node
                arr = self._scope_names_stack[-1].setdefault(n.value, [])
                arr.append(n)
            new_node.names_dict = scope_names
        elif isinstance(new_node, pt.CompFor):
            # The name definitions of comprehensions shouldn't be part of the
            # current scope. They are part of the comprehension scope.
            for n in new_node.get_defined_names():
                self._scope_names_stack[-1][n.value].remove(n)
        return new_node

    def convert_leaf(self, grammar, type, value, prefix, start_pos):
        #print('leaf', value, pytree.type_repr(type))
        if type == tokenize.NAME:
            if value in grammar.keywords:
                if value in ('def', 'class', 'lambda'):
                    self._scope_names_stack.append({})

                return pt.Keyword(self.position_modifier, value, start_pos, prefix)
            else:
                name = pt.Name(self.position_modifier, value, start_pos, prefix)
                # Keep a listing of all used names
                arr = self._used_names.setdefault(name.value, [])
                arr.append(name)
                arr = self._scope_names_stack[-1].setdefault(name.value, [])
                arr.append(name)
                return name
        elif type == STRING:
            return pt.String(self.position_modifier, value, start_pos, prefix)
        elif type == NUMBER:
            return pt.Number(self.position_modifier, value, start_pos, prefix)
        elif type in (NEWLINE, ENDMARKER):
            return pt.Whitespace(self.position_modifier, value, start_pos, prefix)
        else:
            return pt.Operator(self.position_modifier, value, start_pos, prefix)

    def error_recovery(self, grammar, stack, typ, value, start_pos, prefix,
                       add_token_callback):
        """
        This parser is written in a dynamic way, meaning that it allows using
        different grammars (even non-Python ones). However, error recovery is
        written purely for Python.
        """
        def current_suite(stack):
            # For now just discard everything that is not a suite or
            # file_input, if we detect an error.
            for index, (dfa, state, (typ, nodes)) in reversed(list(enumerate(stack))):
                # `suite` can sometimes be only simple_stmt, not stmt.
                symbol = grammar.number2symbol[typ]
                if symbol == 'file_input':
                    break
                elif symbol == 'suite' and len(nodes) > 1:
                    # suites without an indent in them get discarded.
                    break
                elif symbol == 'simple_stmt' and len(nodes) > 1:
                    # simple_stmt can just be turned into a Node, if there are
                    # enough statements. Ignore the rest after that.
                    break
            return index, symbol, nodes

        index, symbol, nodes = current_suite(stack)
        if symbol == 'simple_stmt':
            index -= 2
            (_, _, (typ, suite_nodes)) = stack[index]
            symbol = grammar.number2symbol[typ]
            suite_nodes.append(pt.Node(symbol, list(nodes)))
            # Remove the nodes from their old position.
            nodes[:] = []
            nodes = suite_nodes

        #print('err', token.tok_name[typ], repr(value), start_pos, len(stack), index)
        self._stack_removal(grammar, stack, index + 1, value, start_pos)
        if typ == INDENT:
            # For every deleted INDENT we have to delete a DEDENT as well.
            # Otherwise the parser will get into trouble and DEDENT too early.
            self._omit_dedent_list.append(self._indent_counter)

        if value in ('import', 'from', 'class', 'def', 'try', 'while', 'return'):
            # Those can always be new statements.
            add_token_callback(typ, value, prefix, start_pos)
        elif typ == DEDENT and symbol == 'suite':
            # Close the current suite, with DEDENT.
            # Note that this may cause some suites to not contain any
            # statements at all. This is contrary to valid Python syntax. We
            # keep incomplete suites in Jedi to be able to complete param names
            # or `with ... as foo` names. If we want to use this parser for
            # syntax checks, we have to check in a separate pass whether suites
            # contain statements or not. However, a second check is necessary
            # anyway (compile.c does that for Python), because Python's grammar
            # doesn't stop you from defining `continue` in a module, etc.
            add_token_callback(typ, value, prefix, start_pos)

    def _stack_removal(self, grammar, stack, start_index, value, start_pos):
        def clear_names(children):
            for c in children:
                try:
                    clear_names(c.children)
                except AttributeError:
                    if isinstance(c, pt.Name):
                        try:
                            self._scope_names_stack[-1][c.value].remove(c)
                            self._used_names[c.value].remove(c)
                        except ValueError:
                            pass  # This may happen with CompFor.

        for dfa, state, node in stack[start_index:]:
            clear_names(children=node[1])

        failed_stack = []
        found = False
        for dfa, state, (typ, nodes) in stack[start_index:]:
            if nodes:
                found = True
            if found:
                symbol = grammar.number2symbol[typ]
                failed_stack.append((symbol, nodes))
                if nodes and nodes[0] in ('def', 'class', 'lambda'):
                    self._scope_names_stack.pop()
        if failed_stack:
            err = ErrorStatement(failed_stack, value, self.position_modifier, start_pos)
            self._error_statement_stacks.append(err)

        self._last_failed_start_pos = start_pos

        stack[start_index:] = []

    def _tokenize(self, tokenizer):
        for typ, value, start_pos, prefix in tokenizer:
            #print(tokenize.tok_name[typ], repr(value), start_pos, repr(prefix))
            if typ == DEDENT:
                # We need to count indents, because if we just omit any DEDENT,
                # we might omit them in the wrong place.
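                # Illustrative example: when error recovery drops the INDENT
                # that opened suite number 3 (``_omit_dedent_list == [3]``),
                # the DEDENT that closes that same suite arrives while
                # ``_indent_counter == 3`` and is swallowed below, keeping the
                # token stream balanced.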
                o = self._omit_dedent_list
                if o and o[-1] == self._indent_counter:
                    o.pop()
                    continue

                self._indent_counter -= 1
            elif typ == INDENT:
                self._indent_counter += 1
            elif typ == ERRORTOKEN:
                self._add_syntax_error('Strange token', start_pos)
                continue

            if typ == OP:
                typ = token.opmap[value]
            yield typ, value, prefix, start_pos

    def _add_syntax_error(self, message, position):
        self.syntax_errors.append(ParserSyntaxError(message, position))

    def __repr__(self):
        return "<%s: %s>" % (type(self).__name__, self.module)

    def remove_last_newline(self):
        """
        Everything here works with ``_start_pos``, because working with
        ``start_pos`` would mean checking the ``position_modifier`` as well
        (it is already accounted for in the ``start_pos`` property).
        """
        endmarker = self.module.children[-1]
        # The newline is either part of the endmarker's prefix, or the
        # previous leaf is a newline token.
        if endmarker.prefix.endswith('\n'):
            endmarker.prefix = endmarker.prefix[:-1]
            last_line = re.sub('.*\n', '', endmarker.prefix)
            endmarker._start_pos = endmarker._start_pos[0] - 1, len(last_line)
        else:
            try:
                newline = endmarker.get_previous()
            except IndexError:
                return  # This means that the parser is empty.
            while True:
                if newline.value == '':
                    # Must be a DEDENT, just continue.
                    try:
                        newline = newline.get_previous()
                    except IndexError:
                        # If there's a statement that failed to be parsed, there
                        # will be no previous leaf. So just ignore it.
                        break
                elif newline.value != '\n':
                    # This may happen if error correction strikes and removes
                    # a whole statement including '\n'.
                    break
                else:
                    newline.value = ''
                    if self._last_failed_start_pos > newline._start_pos:
                        # It may be that there was a syntax error in a
                        # function. In that case error correction removes the
                        # right newline, so we use the previously assigned
                        # _last_failed_start_pos variable to account for that.
                        endmarker._start_pos = self._last_failed_start_pos
                    else:
                        endmarker._start_pos = newline._start_pos
                    break