scribble-math/src/Lexer.js
Ben Alpert 0f6530096b Don't slice in lexer
Summary: Theoretically this allocates way less. In practice it seems to be exactly the same speed.

Test Plan: make test

Reviewers: emily

Reviewed By: emily

Differential Revision: https://phabricator.khanacademy.org/D16621
2015-04-06 10:39:39 -07:00

191 lines
5.4 KiB
JavaScript

/**
* The Lexer class handles tokenizing the input in various ways. Since our
* parser expects us to be able to backtrack, the lexer allows lexing from any
* given starting point.
*
* Its main exposed function is the `lex` function, which takes a position to
* lex from and a type of token to lex. It defers to the appropriate `_innerLex`
* function.
*
* The various `_innerLex` functions perform the actual lexing of different
* kinds.
*/
var matchAt = require("match-at");
var ParseError = require("./ParseError");
/**
 * The main lexer class. The constructor only stores the input on the
 * instance; the lexing methods on the prototype read it back as `this._input`
 * and match against it starting from whatever position they are given.
 *
 * @param {string} input The full source text to tokenize.
 */
function Lexer(input) {
this._input = input;
}
/**
 * The resulting token returned from `lex`.
 *
 * @param {string} text The text of the token. The lexing methods pass the
 *     matched source text here, or a fixed string such as "EOF" or " ".
 * @param {?Object} data Extra information attached to the token. The lexing
 *     methods pass null, except `_innerLexSize`, which passes an object with
 *     `number` and `unit` properties.
 * @param {number} position An index into the input. The lexing methods pass
 *     the index just past the text they consumed.
 */
function Token(text, data, position) {
this.text = text;
this.data = data;
this.position = position;
}
// "normal" types of tokens. These are tokens which can be matched by a simple
// regex
// Each entry below is a one-character class. `lex` hands this list to
// `_innerLex` for "math" mode, which tries the entries in array order and uses
// the first one that matches. The trailing comments name the category each
// class is meant to cover. (The doubled `"` in the first class is redundant
// but harmless.)
var mathNormals = [
/[/|@.""`0-9a-zA-Z]/, // ords
/[*+-]/, // bins
/[=<>:]/, // rels
/[,;]/, // punctuation
/['\^_{}]/, // misc
/[(\[]/, // opens
/[)\]?!]/, // closes
/~/ // spacing
];
// These are "normal" tokens like above, but should instead be parsed in text
// mode. `lex` hands this list to `_innerLex` for "text" mode, which tries the
// entries in array order and uses the first one that matches.
var textNormals = [
    // ords. The hyphen is escaped so that it is matched literally: written
    // unescaped between `)` and `=` it formed the character range `)-=`,
    // which also admitted `<` even though `<` is not listed here.
    /[a-zA-Z0-9`!@*()\-=+\[\]'";:?\/.,]/,
    /[{}]/, // grouping
    /~/ // spacing
];
// Regexes for matching whitespace
// `whitespaceRegex` matches a run of zero or more whitespace characters, so it
// can match the empty string. The lexing methods index its match result with
// `[0]` without a null check, which relies on that.
var whitespaceRegex = /\s*/;
// `whitespaceConcatRegex` matches one or more space characters, or a backslash
// followed by one or more space characters. It does not match tabs or
// newlines. `_innerLex` turns a match into a single " " token when it is not
// ignoring whitespace.
var whitespaceConcatRegex = / +|\\ +/;
// This regex matches any other TeX function, which is a backslash followed by a
// word or a single symbol. The word alternative (one or more ASCII letters) is
// tried first, so `\frac` is matched whole rather than as `\f`. The
// single-symbol alternative is `.`, which does not match line terminators, so
// a backslash directly before a newline (or at the very end of the input) is
// not matched by this regex.
var anyFunc = /\\(?:[a-zA-Z]+|.)/;
/**
 * Lexes one "normal" token beginning at `pos`.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @param {Array.<RegExp>} normals Regexes for the simple tokens to accept,
 *     tried in array order after the function-token regex.
 * @param {boolean} ignoreWhitespace When truthy, any whitespace at `pos` is
 *     skipped before lexing. Otherwise a run of spaces (or a backslash
 *     followed by spaces) at `pos` is itself returned as a single " " token.
 * @return {Token} The token found; `position` is the index just past the
 *     consumed text. At the end of the input an "EOF" token is returned.
 * @throws {ParseError} If nothing matches at the position reached.
 */
Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
    var source = this._input;
    var start = pos;

    if (ignoreWhitespace) {
        // Step over whatever whitespace sits at the starting position.
        start += matchAt(whitespaceRegex, source, start)[0].length;
    } else {
        // Whitespace is significant here: a whole run of spaces collapses
        // into one " " token.
        var spaces = matchAt(whitespaceConcatRegex, source, start);
        if (spaces !== null) {
            return new Token(" ", null, start + spaces[0].length);
        }
    }

    // Nothing left to read.
    if (start === source.length) {
        return new Token("EOF", null, start);
    }

    // A function token (backslash + word or symbol) takes priority; only if
    // that fails are the normal regexes consulted, first match wins.
    var found = matchAt(anyFunc, source, start);
    for (var idx = 0; !found && idx < normals.length; idx++) {
        found = matchAt(normals[idx], source, start);
    }
    if (found) {
        return new Token(found[0], null, start + found[0].length);
    }

    throw new ParseError(
        "Unexpected character: '" + source[start] + "'",
        this, start);
};
// A regex to match a CSS color (like #ffffff or BlueViolet). The check is
// purely syntactic and case-insensitive: "#" followed by one or more letters
// or digits, or else a run of one or more letters. It does not verify that the
// text names a real color.
var cssColor = /#[a-z0-9]+|[a-z]+/i;
/**
 * Lexes a CSS color at `pos`, after skipping any leading whitespace.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the matched color, whose data is
 *     null, and whose position is the index just past the color.
 * @throws {ParseError} "Invalid color" if the text after the whitespace does
 *     not look like a color.
 */
Lexer.prototype._innerLexColor = function(pos) {
    var source = this._input;

    // Leading whitespace is not part of the color.
    var start = pos + matchAt(whitespaceRegex, source, pos)[0].length;

    var color = matchAt(cssColor, source, start);
    if (!color) {
        throw new ParseError("Invalid color", this, start);
    }
    return new Token(color[0], null, start + color[0].length);
};
// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex"
// Capture groups: (1) an optional "-" sign, (2) the number, written either as
// digits with an optional fractional part or as a leading "." followed by
// digits, and (3) a unit of exactly two lowercase letters. Whitespace is
// allowed between the sign and the number and between the number and the unit.
var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
/**
 * Lexes a dimension (a number followed by a unit) at `pos`, after skipping
 * any leading whitespace.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the whole matched dimension, whose
 *     data is `{number, unit}` (the signed numeric value and the unit
 *     string), and whose position is the index just past the dimension.
 * @throws {ParseError} "Invalid size" if no dimension is found, or
 *     "Invalid unit: '..'" if the unit is neither "em" nor "ex".
 */
Lexer.prototype._innerLexSize = function(pos) {
    var source = this._input;

    // Leading whitespace is not part of the dimension.
    var start = pos + matchAt(whitespaceRegex, source, pos)[0].length;

    var parts = matchAt(sizeRegex, source, start);
    if (!parts) {
        throw new ParseError("Invalid size", this, start);
    }

    var sign = parts[1];
    var magnitude = parts[2];
    var unit = parts[3];

    // We only currently handle "em" and "ex" units
    if (!(unit === "em" || unit === "ex")) {
        throw new ParseError("Invalid unit: '" + unit + "'", this, start);
    }

    var data = {
        // Sign and digits are captured separately (whitespace may sit
        // between them), so glue them together before converting.
        number: Number(sign + magnitude),
        unit: unit
    };
    return new Token(parts[0], data, start + parts[0].length);
};
/**
 * This function lexes a string of whitespace.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @return {Token} A token whose text is the entire run of whitespace found at
 *     `pos` (the empty string when there is none), whose data is null, and
 *     whose position is the index just past that run.
 */
Lexer.prototype._innerLexWhitespace = function(pos) {
    var input = this._input;

    // `whitespace` is already the matched string (element 0 of the match
    // result), so it is used as the token text directly. Indexing it again
    // would yield only its first character, or undefined for an empty run.
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];

    return new Token(whitespace, null, pos + whitespace.length);
};
/**
 * Lexes a single token starting at `pos`, using the rules for `mode`. This is
 * a pure dispatcher: each recognised mode forwards to one of the `_innerLex`
 * functions.
 *
 * @param {number} pos Index in the input at which to start lexing.
 * @param {string} mode One of "math", "text", "color", "size" or
 *     "whitespace".
 * @return {Token|undefined} Whatever the selected `_innerLex` function
 *     returns. For any other `mode` nothing is lexed and undefined is
 *     returned.
 */
Lexer.prototype.lex = function(pos, mode) {
    switch (mode) {
        case "math":
            // Math mode skips whitespace between tokens.
            return this._innerLex(pos, mathNormals, true);
        case "text":
            // Text mode keeps whitespace as tokens.
            return this._innerLex(pos, textNormals, false);
        case "color":
            return this._innerLexColor(pos);
        case "size":
            return this._innerLexSize(pos);
        case "whitespace":
            return this._innerLexWhitespace(pos);
        default:
            // Unrecognised modes produce no token.
            return undefined;
    }
};
module.exports = Lexer;