KaTeX/src/Lexer.js
/**
* The Lexer class handles tokenizing the input in various ways. Since our
* parser expects us to be able to backtrack, the lexer allows lexing from any
* given starting point.
*
 * Its main exposed function is the `lex` function, which lexes the single
 * token starting at the lexer's current position and advances past it.
*/
const matchAt = require("match-at");
const ParseError = require("./ParseError");
// The main lexer class
function Lexer(input) {
    this.input = input;
    this.pos = 0;
}
/**
* The resulting token returned from `lex`.
*
* It consists of the token text plus some position information.
* The position information is essentially a range in an input string,
* but instead of referencing the bare input string, we refer to the lexer.
* That way it is possible to attach extra metadata to the input string,
* like for example a file name or similar.
*
* The position information (all three parameters) is optional,
* so it is OK to construct synthetic tokens if appropriate.
* Not providing available position information may lead to
* degraded error reporting, though.
*
* @param {string} text the text of this token
* @param {number=} start the start offset, zero-based inclusive
* @param {number=} end the end offset, zero-based exclusive
* @param {Lexer=} lexer the lexer which in turn holds the input string
*/
function Token(text, start, end, lexer) {
    this.text = text;
    this.start = start;
    this.end = end;
    this.lexer = lexer;
}
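
// Illustrative only: a token may carry full position information, or be
// constructed synthetically when none is available. `someLexer` below is a
// placeholder for whichever Lexer produced the input.
//
//     new Token("x", 3, 4, someLexer); // positioned: offset 3 (inclusive) to 4 (exclusive)
//     new Token("x");                  // synthetic: no position, degraded error reporting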
/**
* Given a pair of tokens (this and endToken), compute a “Token” encompassing
* the whole input range enclosed by these two.
*
* @param {Token} endToken last token of the range, inclusive
* @param {string} text the text of the newly constructed token
*/
Token.prototype.range = function(endToken, text) {
    if (endToken.lexer !== this.lexer) {
        return new Token(text); // sorry, no position information available
    }
    return new Token(text, this.start, endToken.end, this.lexer);
};
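
// Usage sketch (illustrative, not executed by this module): combining the
// positions of the first and last token of a span. The input "a+b" is a
// made-up example.
//
//     const lexer = new Lexer("a+b");
//     const first = lexer.lex();             // Token "a", start 0, end 1
//     lexer.lex();                           // Token "+"
//     const last = lexer.lex();              // Token "b", start 2, end 3
//     const span = first.range(last, "a+b"); // start 0, end 3, same lexer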
/* The following tokenRegex
* - matches typical whitespace (but not NBSP etc.) using its first group
* - does not match any control character \x00-\x1f except whitespace
* - does not match a bare backslash
* - matches any ASCII character except those just mentioned
* - does not match the BMP private use area \uE000-\uF8FF
* - does not match bare surrogate code units
* - matches any BMP character except for those just described
* - matches any valid Unicode surrogate pair
* - matches a backslash followed by one or more letters
* - matches a backslash followed by any BMP character, including newline
* Just because the Lexer matches something doesn't mean it's valid input:
* If there is no matching function or symbol definition, the Parser will
* still reject the input.
*/
const tokenRegex = new RegExp(
    "([ \r\n\t]+)|" +                                 // whitespace
    "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +           // function name
    ")"
);
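
// Sample inputs and the alternative they hit (illustrative, not exhaustive):
//   " \t\n"          -> group 1: a run of whitespace
//   "x"              -> group 2: a single BMP codepoint
//   "\uD835\uDC65"   -> group 2: a surrogate pair (U+1D465, mathematical italic x)
//   "\\frac"         -> group 2: backslash followed by letters (a control word)
//   "\\%"            -> group 2: backslash followed by a single non-letter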
/**
* This function lexes a single token.
*/
Lexer.prototype.lex = function() {
    const input = this.input;
    const pos = this.pos;
    if (pos === input.length) {
        return new Token("EOF", pos, pos, this);
    }
    const match = matchAt(tokenRegex, input, pos);
    if (match === null) {
        throw new ParseError(
            "Unexpected character: '" + input[pos] + "'",
            new Token(input[pos], pos, pos + 1, this));
    }
    // Group 2 holds the token text; a whitespace run (group 1) collapses to
    // a single space token.
    const text = match[2] || " ";
    const start = this.pos;
    this.pos += match[0].length;
    const end = this.pos;
    return new Token(text, start, end, this);
};
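
// Sketch of a caller driving the lexer to the end of input (in KaTeX the
// Parser owns this loop; the input below is a made-up example):
//
//     const lexer = new Lexer("\\frac 1 2");
//     let token;
//     do {
//         token = lexer.lex();
//         // yields "\frac", " ", "1", " ", "2", then "EOF"
//     } while (token.text !== "EOF");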
module.exports = Lexer;