Rewrote lexer, avoiding some mode-specific distinctions

There are two main motivations for this commit.  One is Unicode input, which
requires Unicode characters to get past the lexer.  See discussion in #261.
The second is in preparation for #266, where we'd deal with one token of
look-ahead but might be lexing that token in an unknown mode in some cases.
The unit test shipped with this commit addresses the latter concern, since
it checks that a math-mode-only token may immediately follow a text-mode
content group.

In this new implementation, all the various things that could get matched
have been collected into a single regular expression.  The hope is that
this will be beneficial for performance and keep the code simpler.
The code was written with Unicode input in mind, including non-BMP codepoints.

The role of the lexer as a gatekeeper, keeping out invalid TeX syntax, has
been abandoned.  That role is still fulfilled by the symbols and functions
tables, though, since any input which is neither a symbol nor a command is
still considered invalid input, even though it lexes successfully.
This commit is contained in:
Martin von Gagern 2015-07-07 14:15:58 +02:00
parent 95e2f1c8d7
commit d423bec089
2 changed files with 45 additions and 72 deletions

View File

@ -27,86 +27,54 @@ function Token(text, data, position) {
this.position = position; this.position = position;
} }
// "normal" types of tokens. These are tokens which can be matched by a simple /* The following tokenRegex
// regex * - matches typical whitespace (but not NBSP etc.) using its first group
var mathNormals = [ * - matches symbol combinations which result in a single output character
/[/|@.""`0-9a-zA-Z]/, // ords * - does not match any control character \x00-\x1f except whitespace
/[*+-]/, // bins * - does not match a bare backslash
/[=<>:]/, // rels * - matches any ASCII character except those just mentioned
/[,;]/, // punctuation * - does not match the BMP private use area \uE000-\uF8FF
/['\^_{}]/, // misc * - does not match bare surrogate code units
/[(\[]/, // opens * - matches any BMP character except for those just described
/[)\]?!]/, // closes * - matches any valid Unicode surrogate pair
/~/, // spacing * - matches a backslash followed by one or more letters
/&/, // horizontal alignment * - matches a backslash followed by any BMP character, including newline
/\\\\/ // line break * Just because the Lexer matches something doesn't mean it's valid input:
]; * If there is no matching function or symbol definition, the Parser will
* still reject the input.
*/
var tokenRegex = new RegExp(
"([ \r\n\t]+)|(" + // whitespace
"---?" + // special combinations
"|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
"|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" + // function name
")"
);
// These are "normal" tokens like above, but should instead be parsed in text
// mode.
var textNormals = [
/[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords
/[{}]/, // grouping
/~/, // spacing
/&/, // horizontal alignment
/\\\\/ // line break
];
// Regexes for matching whitespace
var whitespaceRegex = /\s*/; var whitespaceRegex = /\s*/;
var whitespaceConcatRegex = / +|\\ +/;
// This regex matches any other TeX function, which is a backslash followed by a
// word or a single symbol
var anyFunc = /\\(?:[a-zA-Z]+|.)/;
/** /**
* This function lexes a single normal token. It takes a position, a list of * This function lexes a single normal token. It takes a position and
* "normal" tokens to try, and whether it should completely ignore whitespace or * whether it should completely ignore whitespace or not.
* not.
*/ */
Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) { Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
var input = this._input; var input = this._input;
var whitespace; if (pos == input.length) {
if (ignoreWhitespace) {
// Get rid of whitespace.
whitespace = matchAt(whitespaceRegex, input, pos)[0];
pos += whitespace.length;
} else {
// Do the funky concatenation of whitespace that happens in text mode.
whitespace = matchAt(whitespaceConcatRegex, input, pos);
if (whitespace !== null) {
return new Token(" ", null, pos + whitespace[0].length);
}
}
// If there's no more input to parse, return an EOF token
if (pos === input.length) {
return new Token("EOF", null, pos); return new Token("EOF", null, pos);
} }
var match = matchAt(tokenRegex, input, pos);
var match; if (match === null) {
if ((match = matchAt(anyFunc, input, pos))) { throw new ParseError(
// If we match a function token, return it
return new Token(match[0], null, pos + match[0].length);
} else {
// Otherwise, we look through the normal token regexes and see if it's
// one of them.
for (var i = 0; i < normals.length; i++) {
var normal = normals[i];
if ((match = matchAt(normal, input, pos))) {
// If it is, return it
return new Token(
match[0], null, pos + match[0].length);
}
}
}
throw new ParseError(
"Unexpected character: '" + input[pos] + "'", "Unexpected character: '" + input[pos] + "'",
this, pos); this, pos);
} else if (match[2]) { // matched non-whitespace
return new Token(match[2], null, pos + match[2].length);
} else if (ignoreWhitespace) {
return this._innerLex(pos + match[1].length, true);
} else { // concatenate whitespace to a single space
return new Token(" ", null, pos + match[1].length);
}
}; };
// A regex to match a CSS color (like #ffffff or BlueViolet) // A regex to match a CSS color (like #ffffff or BlueViolet)
@ -179,9 +147,9 @@ Lexer.prototype._innerLexWhitespace = function(pos) {
*/ */
Lexer.prototype.lex = function(pos, mode) { Lexer.prototype.lex = function(pos, mode) {
if (mode === "math") { if (mode === "math") {
return this._innerLex(pos, mathNormals, true); return this._innerLex(pos, true);
} else if (mode === "text") { } else if (mode === "text") {
return this._innerLex(pos, textNormals, false); return this._innerLex(pos, false);
} else if (mode === "color") { } else if (mode === "color") {
return this._innerLexColor(pos); return this._innerLexColor(pos);
} else if (mode === "size") { } else if (mode === "size") {

View File

@ -665,6 +665,7 @@ describe("A text parser", function() {
var leadingSpaceTextExpression = "\\text {moo}"; var leadingSpaceTextExpression = "\\text {moo}";
var badTextExpression = "\\text{a b%}"; var badTextExpression = "\\text{a b%}";
var badFunctionExpression = "\\text{\\sqrt{x}}"; var badFunctionExpression = "\\text{\\sqrt{x}}";
var mathTokenAfterText = "\\text{sin}^2";
it("should not fail", function() { it("should not fail", function() {
expect(textExpression).toParse(); expect(textExpression).toParse();
@ -710,6 +711,10 @@ describe("A text parser", function() {
expect(group[3].type).toMatch("spacing"); expect(group[3].type).toMatch("spacing");
}); });
it("should accept math mode tokens after its argument", function() {
expect(mathTokenAfterText).toParse();
});
it("should ignore a space before the text group", function() { it("should ignore a space before the text group", function() {
var parse = getParsed(leadingSpaceTextExpression)[0]; var parse = getParsed(leadingSpaceTextExpression)[0];
// [m, o, o] // [m, o, o]