KaTeX/src/Lexer.js
/**
* The Lexer class handles tokenizing the input in various ways. Since our
* parser expects us to be able to backtrack, the lexer allows lexing from any
* given starting point.
*
 * Its main exposed function is the `lex` function, which lexes the single
 * token starting at the lexer's current position and advances past it.
*/
const matchAt = require("match-at");
const ParseError = require("./ParseError");
// The main lexer class
function Lexer(input) {
    this.input = input;
    this.pos = 0;
}
/**
* The resulting token returned from `lex`.
*
* It consists of the token text plus some position information.
* The position information is essentially a range in an input string,
* but instead of referencing the bare input string, we refer to the lexer.
* That way it is possible to attach extra metadata to the input string,
* like for example a file name or similar.
*
* The position information (all three parameters) is optional,
* so it is OK to construct synthetic tokens if appropriate.
* Not providing available position information may lead to
* degraded error reporting, though.
*
* @param {string} text the text of this token
* @param {number=} start the start offset, zero-based inclusive
* @param {number=} end the end offset, zero-based exclusive
* @param {Lexer=} lexer the lexer which in turn holds the input string
*/
function Token(text, start, end, lexer) {
    this.text = text;
    this.start = start;
    this.end = end;
    this.lexer = lexer;
}
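
// Illustrative only: a token may carry full position information, or be
// constructed synthetically when none is available. `someLexer` below is a
// placeholder for whichever Lexer produced the input.
//
//     new Token("x", 3, 4, someLexer); // positioned: offset 3 (inclusive) to 4 (exclusive)
//     new Token("x");                  // synthetic: no position, degraded error reporting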
/**
* Given a pair of tokens (this and endToken), compute a “Token” encompassing
* the whole input range enclosed by these two.
*
* @param {Token} endToken last token of the range, inclusive
* @param {string} text the text of the newly constructed token
*/
Token.prototype.range = function(endToken, text) {
    if (endToken.lexer !== this.lexer) {
        return new Token(text); // sorry, no position information available
    }
    return new Token(text, this.start, endToken.end, this.lexer);
};
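
// Usage sketch (illustrative, not executed by this module): combining the
// positions of the first and last token of a span. The input "a+b" is a
// made-up example.
//
//     const lexer = new Lexer("a+b");
//     const first = lexer.lex();             // Token "a", start 0, end 1
//     lexer.lex();                           // Token "+"
//     const last = lexer.lex();              // Token "b", start 2, end 3
//     const span = first.range(last, "a+b"); // start 0, end 3, same lexer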
/* The following tokenRegex
* - matches typical whitespace (but not NBSP etc.) using its first group
* - does not match any control character \x00-\x1f except whitespace
* - does not match a bare backslash
* - matches any ASCII character except those just mentioned
* - does not match the BMP private use area \uE000-\uF8FF
* - does not match bare surrogate code units
* - matches any BMP character except for those just described
* - matches any valid Unicode surrogate pair
* - matches a backslash followed by one or more letters
* - matches a backslash followed by any BMP character, including newline
* Just because the Lexer matches something doesn't mean it's valid input:
* If there is no matching function or symbol definition, the Parser will
* still reject the input.
*/
const tokenRegex = new RegExp(
    "([ \r\n\t]+)|" +                                 // whitespace
    "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +           // function name
    ")"
);
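
// Sample inputs and the alternative they hit (illustrative, not exhaustive):
//   " \t\n"          -> group 1: a run of whitespace
//   "x"              -> group 2: a single BMP codepoint
//   "\uD835\uDC65"   -> group 2: a surrogate pair (U+1D465, mathematical italic x)
//   "\\frac"         -> group 2: backslash followed by letters (a control word)
//   "\\%"            -> group 2: backslash followed by a single non-letter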
/**
* This function lexes a single token.
*/
Lexer.prototype.lex = function() {
    const input = this.input;
    const pos = this.pos;
    if (pos === input.length) {
        return new Token("EOF", pos, pos, this);
    }
    const match = matchAt(tokenRegex, input, pos);
    if (match === null) {
        throw new ParseError(
            "Unexpected character: '" + input[pos] + "'",
            new Token(input[pos], pos, pos + 1, this));
    }
    // Group 2 holds the token text; a whitespace run (group 1) collapses to
    // a single space token.
    const text = match[2] || " ";
    const start = this.pos;
    this.pos += match[0].length;
    const end = this.pos;
    return new Token(text, start, end, this);
};
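
// Sketch of a caller driving the lexer to the end of input (in KaTeX the
// Parser owns this loop; the input below is a made-up example):
//
//     const lexer = new Lexer("\\frac 1 2");
//     let token;
//     do {
//         token = lexer.lex();
//         // yields "\frac", " ", "1", " ", "2", then "EOF"
//     } while (token.text !== "EOF");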
module.exports = Lexer;