Rewrote lexer, avoiding some mode-specific distinctions

There are two main motivations for this commit. One is Unicode input, which requires Unicode characters to get past the lexer. See the discussion in #261. The second is preparation for #266, where we'd deal with one token of look-ahead but might, in some cases, be lexing that token in an unknown mode. The unit test shipped with this commit addresses the latter concern, since it checks that a math-mode-only token may immediately follow some text-mode content group. In this new implementation, all the various things that could get matched have been collected into a single regular expression. The hope is that this will be beneficial for performance and keep the code simpler. The code was written with Unicode input in mind, including non-BMP codepoints. The role of the lexer as a gatekeeper, keeping out invalid TeX syntax, has been abandoned. That role is still fulfilled by the symbols and functions tables, though, since any input which is neither a symbol nor a command is still considered invalid input, even though it lexes successfully.
2015-07-07 14:15:58 +02:00 · 2015-07-07 14:15:58 +02:00 · d423bec089
commit d423bec089
parent 95e2f1c8d7
2 changed files with 45 additions and 72 deletions
--- a/src/Lexer.js
+++ b/src/Lexer.js
@ -27,86 +27,54 @@ function Token(text, data, position) {
    this.position = position;
 }
-// "normal" types of tokens. These are tokens which can be matched by a simple
+/* The following tokenRegex
-// regex
+ * - matches typical whitespace (but not NBSP etc.) using its first group
-var mathNormals = [
+ * - matches symbol combinations which result in a single output character
-    /[/|@.""`0-9a-zA-Z]/, // ords
+ * - does not match any control character \x00-\x1f except whitespace
-    /[*+-]/, // bins
+ * - does not match a bare backslash
-    /[=<>:]/, // rels
+ * - matches any ASCII character except those just mentioned
-    /[,;]/, // punctuation
+ * - does not match the BMP private use area \uE000-\uF8FF
-    /['\^_{}]/, // misc
+ * - does not match bare surrogate code units
-    /[(\[]/, // opens
+ * - matches any BMP character except for those just described
-    /[)\]?!]/, // closes
+ * - matches any valid Unicode surrogate pair
-    /~/, // spacing
+ * - matches a backslash followed by one or more letters
-    /&/, // horizontal alignment
+ * - matches a backslash followed by any BMP character, including newline
-    /\\\\/ // line break
+ * Just because the Lexer matches something doesn't mean it's valid input:
-];
+ * If there is no matching function or symbol definition, the Parser will
 * still reject the input.
 */
 var tokenRegex = new RegExp(
    "([ \r\n\t]+)|(" +                                // whitespace
    "---?" +                                          // special combinations
    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +           // function name
    ")"
 );
 // These are "normal" tokens like above, but should instead be parsed in text
 // mode.
 var textNormals = [
    /[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords
    /[{}]/, // grouping
    /~/, // spacing
    /&/, // horizontal alignment
    /\\\\/ // line break
 ];
 // Regexes for matching whitespace
 var whitespaceRegex = /\s*/;
 var whitespaceConcatRegex = / +|\\  +/;
 // This regex matches any other TeX function, which is a backslash followed by a
 // word or a single symbol
 var anyFunc = /\\(?:[a-zA-Z]+|.)/;
 /**
- * This function lexes a single normal token. It takes a position, a list of
+ * This function lexes a single normal token. It takes a position and
- * "normal" tokens to try, and whether it should completely ignore whitespace or
+ * whether it should completely ignore whitespace or not.
 * not.
 */
-Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
+Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
    var input = this._input;
-    var whitespace;
+    if (pos == input.length) {
    if (ignoreWhitespace) {
        // Get rid of whitespace.
        whitespace = matchAt(whitespaceRegex, input, pos)[0];
        pos += whitespace.length;
    } else {
        // Do the funky concatenation of whitespace that happens in text mode.
        whitespace = matchAt(whitespaceConcatRegex, input, pos);
        if (whitespace !== null) {
            return new Token(" ", null, pos + whitespace[0].length);
        }
    }
    // If there's no more input to parse, return an EOF token
    if (pos === input.length) {
        return new Token("EOF", null, pos);
    }
-
+    var match = matchAt(tokenRegex, input, pos);
-    var match;
+    if (match === null) {
-    if ((match = matchAt(anyFunc, input, pos))) {
+        throw new ParseError(
        // If we match a function token, return it
        return new Token(match[0], null, pos + match[0].length);
    } else {
        // Otherwise, we look through the normal token regexes and see if it's
        // one of them.
        for (var i = 0; i < normals.length; i++) {
            var normal = normals[i];
            if ((match = matchAt(normal, input, pos))) {
                // If it is, return it
                return new Token(
                    match[0], null, pos + match[0].length);
            }
        }
    }
    throw new ParseError(
            "Unexpected character: '" + input[pos] + "'",
            this, pos);
    } else if (match[2]) { // matched non-whitespace
        return new Token(match[2], null, pos + match[2].length);
    } else if (ignoreWhitespace) {
        return this._innerLex(pos + match[1].length, true);
    } else { // concatenate whitespace to a single space
        return new Token(" ", null, pos + match[1].length);
    }
 };
 // A regex to match a CSS color (like #ffffff or BlueViolet)
@ -179,9 +147,9 @@ Lexer.prototype._innerLexWhitespace = function(pos) {
 */
 Lexer.prototype.lex = function(pos, mode) {
    if (mode === "math") {
-        return this._innerLex(pos, mathNormals, true);
+        return this._innerLex(pos, true);
    } else if (mode === "text") {
-        return this._innerLex(pos, textNormals, false);
+        return this._innerLex(pos, false);
    } else if (mode === "color") {
        return this._innerLexColor(pos);
    } else if (mode === "size") {
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@ -665,6 +665,7 @@ describe("A text parser", function() {
    var leadingSpaceTextExpression = "\\text {moo}";
    var badTextExpression = "\\text{a b%}";
    var badFunctionExpression = "\\text{\\sqrt{x}}";
    var mathTokenAfterText = "\\text{sin}^2";
    it("should not fail", function() {
        expect(textExpression).toParse();
@ -710,6 +711,10 @@ describe("A text parser", function() {
        expect(group[3].type).toMatch("spacing");
    });
    it("should accept math mode tokens after its argument", function() {
        expect(mathTokenAfterText).toParse();
    });
    it("should ignore a space before the text group", function() {
        var parse = getParsed(leadingSpaceTextExpression)[0];
        // [m, o, o]