/**
 * The Lexer class handles tokenizing the input in various ways. Since our
 * parser expects us to be able to backtrack, the lexer allows lexing from any
 * given starting point (callers may save and restore `pos`).
 *
 * Its main exposed function is `lex`, which lexes the single token beginning
 * at the lexer's current position and advances `pos` past it.
 */

const matchAt = require("match-at");
const ParseError = require("./ParseError");
/**
 * The main lexer class: tracks the input string together with the current
 * lexing position within it.
 *
 * @param {string} input the string to be tokenized
 */
function Lexer(input) {
    this.pos = 0;
    this.input = input;
}
/**
 * A single token as produced by `Lexer.prototype.lex`.
 *
 * Carries the token text together with optional position information.
 * Positions are expressed as a half-open range into an input string, but
 * instead of storing the raw string we store the owning lexer, which makes
 * it possible to attach extra metadata (such as a file name) to the input.
 *
 * All three position parameters may be omitted, so it is OK to construct
 * synthetic tokens when appropriate. Omitting position information that is
 * actually available degrades error reporting, though.
 *
 * @param {string} text the text of this token
 * @param {number=} start the start offset, zero-based inclusive
 * @param {number=} end the end offset, zero-based exclusive
 * @param {Lexer=} lexer the lexer which in turn holds the input string
 */
function Token(text, start, end, lexer) {
    this.lexer = lexer;
    this.end = end;
    this.start = start;
    this.text = text;
}
/**
 * Given a pair of tokens (this and endToken), compute a synthetic Token
 * spanning the whole input range enclosed by the two, carrying the given
 * replacement text.
 *
 * @param {Token} endToken last token of the range, inclusive
 * @param {string} text the text of the newly constructed token
 */
Token.prototype.range = function(endToken, text) {
    // Position info only makes sense when both tokens come from the same
    // lexer (i.e. refer to the same underlying input string).
    return endToken.lexer === this.lexer
        ? new Token(text, this.start, endToken.end, this.lexer)
        : new Token(text); // sorry, no position information available
};
/* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
 * - does not match the BMP private use area \uE000-\uF8FF
 * - does not match bare surrogate code units
 * - matches any BMP character except for those just described
 * - matches any valid Unicode surrogate pair
 * - matches a backslash followed by one or more letters
 * - matches a backslash followed by any BMP character, including newline
 * Just because the Lexer matches something doesn't mean it's valid input:
 * If there is no matching function or symbol definition, the Parser will
 * still reject the input.
 *
 * Group 1 captures whitespace (which `lex` collapses to a single space);
 * group 2 captures the text of a real token. The regex is deliberately not
 * anchored: `lex` applies it at the current position via match-at.
 */
const tokenRegex = new RegExp(
    "([ \r\n\t]+)|" + // whitespace
    "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" + // function name
    ")"
);
/**
 * Lex the single token beginning at the lexer's current position, advance
 * past it, and return it.
 *
 * Returns a synthetic "EOF" token at the end of the input; throws a
 * ParseError on a character that tokenRegex does not match.
 */
Lexer.prototype.lex = function() {
    const start = this.pos;
    if (start === this.input.length) {
        return new Token("EOF", start, start, this);
    }
    const match = matchAt(tokenRegex, this.input, start);
    if (match === null) {
        const offending = this.input[start];
        throw new ParseError(
            "Unexpected character: '" + offending + "'",
            new Token(offending, start, start + 1, this));
    }
    this.pos = start + match[0].length;
    // Group 2 is the token text; whitespace (group 1) collapses to " ".
    return new Token(match[2] || " ", start, this.pos, this);
};
module.exports = Lexer;