Rewrote lexer, avoiding some mode-specific distinctions
There are two main motivations for this commit. One is Unicode input, which requires Unicode characters to get past the lexer; see the discussion in #261. The second is preparation for #266, where we'd deal with one token of lookahead but might, in some cases, be lexing that token in an unknown mode. The unit test shipped with this commit addresses the latter concern, since it checks that a math-mode-only token may immediately follow a text-mode content group. In this new implementation, all the various things that could get matched have been collected into a single regular expression. The hope is that this will be beneficial for performance and keep the code simpler. The code was written with Unicode input in mind, including non-BMP code points. The role of the lexer as a gatekeeper, keeping out invalid TeX syntax, has been abandoned. That role is still fulfilled by the symbols and functions tables, though, since any input which is neither a symbol nor a command is still considered invalid input, even though it lexes successfully.
This commit is contained in:
parent
95e2f1c8d7
commit
d423bec089
110
src/Lexer.js
110
src/Lexer.js
|
@ -27,86 +27,54 @@ function Token(text, data, position) {
|
||||||
this.position = position;
|
this.position = position;
|
||||||
}
|
}
|
||||||
|
|
||||||
// "normal" types of tokens. These are tokens which can be matched by a simple
|
/* The following tokenRegex
|
||||||
// regex
|
* - matches typical whitespace (but not NBSP etc.) using its first group
|
||||||
var mathNormals = [
|
* - matches symbol combinations which result in a single output character
|
||||||
/[/|@.""`0-9a-zA-Z]/, // ords
|
* - does not match any control character \x00-\x1f except whitespace
|
||||||
/[*+-]/, // bins
|
* - does not match a bare backslash
|
||||||
/[=<>:]/, // rels
|
* - matches any ASCII character except those just mentioned
|
||||||
/[,;]/, // punctuation
|
* - does not match the BMP private use area \uE000-\uF8FF
|
||||||
/['\^_{}]/, // misc
|
* - does not match bare surrogate code units
|
||||||
/[(\[]/, // opens
|
* - matches any BMP character except for those just described
|
||||||
/[)\]?!]/, // closes
|
* - matches any valid Unicode surrogate pair
|
||||||
/~/, // spacing
|
* - matches a backslash followed by one or more letters
|
||||||
/&/, // horizontal alignment
|
* - matches a backslash followed by any BMP character, including newline
|
||||||
/\\\\/ // line break
|
* Just because the Lexer matches something doesn't mean it's valid input:
|
||||||
];
|
* If there is no matching function or symbol definition, the Parser will
|
||||||
|
* still reject the input.
|
||||||
|
*/
|
||||||
|
var tokenRegex = new RegExp(
|
||||||
|
"([ \r\n\t]+)|(" + // whitespace
|
||||||
|
"---?" + // special combinations
|
||||||
|
"|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
|
||||||
|
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
|
||||||
|
"|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" + // function name
|
||||||
|
")"
|
||||||
|
);
|
||||||
|
|
||||||
// These are "normal" tokens like above, but should instead be parsed in text
|
|
||||||
// mode.
|
|
||||||
var textNormals = [
|
|
||||||
/[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords
|
|
||||||
/[{}]/, // grouping
|
|
||||||
/~/, // spacing
|
|
||||||
/&/, // horizontal alignment
|
|
||||||
/\\\\/ // line break
|
|
||||||
];
|
|
||||||
|
|
||||||
// Regexes for matching whitespace
|
|
||||||
var whitespaceRegex = /\s*/;
|
var whitespaceRegex = /\s*/;
|
||||||
var whitespaceConcatRegex = / +|\\ +/;
|
|
||||||
|
|
||||||
// This regex matches any other TeX function, which is a backslash followed by a
|
|
||||||
// word or a single symbol
|
|
||||||
var anyFunc = /\\(?:[a-zA-Z]+|.)/;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function lexes a single normal token. It takes a position, a list of
|
* This function lexes a single normal token. It takes a position and
|
||||||
* "normal" tokens to try, and whether it should completely ignore whitespace or
|
* whether it should completely ignore whitespace or not.
|
||||||
* not.
|
|
||||||
*/
|
*/
|
||||||
Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
|
Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
|
||||||
var input = this._input;
|
var input = this._input;
|
||||||
var whitespace;
|
if (pos == input.length) {
|
||||||
|
|
||||||
if (ignoreWhitespace) {
|
|
||||||
// Get rid of whitespace.
|
|
||||||
whitespace = matchAt(whitespaceRegex, input, pos)[0];
|
|
||||||
pos += whitespace.length;
|
|
||||||
} else {
|
|
||||||
// Do the funky concatenation of whitespace that happens in text mode.
|
|
||||||
whitespace = matchAt(whitespaceConcatRegex, input, pos);
|
|
||||||
if (whitespace !== null) {
|
|
||||||
return new Token(" ", null, pos + whitespace[0].length);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If there's no more input to parse, return an EOF token
|
|
||||||
if (pos === input.length) {
|
|
||||||
return new Token("EOF", null, pos);
|
return new Token("EOF", null, pos);
|
||||||
}
|
}
|
||||||
|
var match = matchAt(tokenRegex, input, pos);
|
||||||
var match;
|
if (match === null) {
|
||||||
if ((match = matchAt(anyFunc, input, pos))) {
|
|
||||||
// If we match a function token, return it
|
|
||||||
return new Token(match[0], null, pos + match[0].length);
|
|
||||||
} else {
|
|
||||||
// Otherwise, we look through the normal token regexes and see if it's
|
|
||||||
// one of them.
|
|
||||||
for (var i = 0; i < normals.length; i++) {
|
|
||||||
var normal = normals[i];
|
|
||||||
|
|
||||||
if ((match = matchAt(normal, input, pos))) {
|
|
||||||
// If it is, return it
|
|
||||||
return new Token(
|
|
||||||
match[0], null, pos + match[0].length);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw new ParseError(
|
throw new ParseError(
|
||||||
"Unexpected character: '" + input[pos] + "'",
|
"Unexpected character: '" + input[pos] + "'",
|
||||||
this, pos);
|
this, pos);
|
||||||
|
} else if (match[2]) { // matched non-whitespace
|
||||||
|
return new Token(match[2], null, pos + match[2].length);
|
||||||
|
} else if (ignoreWhitespace) {
|
||||||
|
return this._innerLex(pos + match[1].length, true);
|
||||||
|
} else { // concatenate whitespace to a single space
|
||||||
|
return new Token(" ", null, pos + match[1].length);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// A regex to match a CSS color (like #ffffff or BlueViolet)
|
// A regex to match a CSS color (like #ffffff or BlueViolet)
|
||||||
|
@ -179,9 +147,9 @@ Lexer.prototype._innerLexWhitespace = function(pos) {
|
||||||
*/
|
*/
|
||||||
Lexer.prototype.lex = function(pos, mode) {
|
Lexer.prototype.lex = function(pos, mode) {
|
||||||
if (mode === "math") {
|
if (mode === "math") {
|
||||||
return this._innerLex(pos, mathNormals, true);
|
return this._innerLex(pos, true);
|
||||||
} else if (mode === "text") {
|
} else if (mode === "text") {
|
||||||
return this._innerLex(pos, textNormals, false);
|
return this._innerLex(pos, false);
|
||||||
} else if (mode === "color") {
|
} else if (mode === "color") {
|
||||||
return this._innerLexColor(pos);
|
return this._innerLexColor(pos);
|
||||||
} else if (mode === "size") {
|
} else if (mode === "size") {
|
||||||
|
|
|
@ -665,6 +665,7 @@ describe("A text parser", function() {
|
||||||
var leadingSpaceTextExpression = "\\text {moo}";
|
var leadingSpaceTextExpression = "\\text {moo}";
|
||||||
var badTextExpression = "\\text{a b%}";
|
var badTextExpression = "\\text{a b%}";
|
||||||
var badFunctionExpression = "\\text{\\sqrt{x}}";
|
var badFunctionExpression = "\\text{\\sqrt{x}}";
|
||||||
|
var mathTokenAfterText = "\\text{sin}^2";
|
||||||
|
|
||||||
it("should not fail", function() {
|
it("should not fail", function() {
|
||||||
expect(textExpression).toParse();
|
expect(textExpression).toParse();
|
||||||
|
@ -710,6 +711,10 @@ describe("A text parser", function() {
|
||||||
expect(group[3].type).toMatch("spacing");
|
expect(group[3].type).toMatch("spacing");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should accept math mode tokens after its argument", function() {
|
||||||
|
expect(mathTokenAfterText).toParse();
|
||||||
|
});
|
||||||
|
|
||||||
it("should ignore a space before the text group", function() {
|
it("should ignore a space before the text group", function() {
|
||||||
var parse = getParsed(leadingSpaceTextExpression)[0];
|
var parse = getParsed(leadingSpaceTextExpression)[0];
|
||||||
// [m, o, o]
|
// [m, o, o]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user