/**
 * The Lexer class handles tokenizing the input in various ways. Since our
 * parser expects us to be able to backtrack, the lexer allows lexing from any
 * given starting point.
 *
 * Its main exposed function is the `lex` function, which takes a position to
 * lex from and a type of token to lex. It defers to the appropriate `_innerLex`
 * function.
 *
 * The various `_innerLex` functions perform the actual lexing of different
 * kinds.
 */

var matchAt = require("match-at");

var ParseError = require("./ParseError");

/**
 * The main lexer class.
 *
 * @param {string} input The text to tokenize. It is only stored here; every
 *     lexing method takes an explicit start position, so one Lexer can be
 *     asked to lex from any offset, any number of times.
 */
function Lexer(input) {
    this._input = input;
}

/**
 * The resulting token returned from `lex`.
 *
 * @param {string} text The text of the token ("EOF" at end of input, " " for
 *     collapsed text-mode whitespace, otherwise the matched source text).
 * @param {?Object} data Extra parsed data. Only size tokens carry any
 *     (`{number, unit}`); every other token built in this file passes `null`.
 * @param {number} position Offset in the input just past the token, i.e.
 *     where the next call to `lex` should start.
 */
function Token(text, data, position) {
    this.text = text;
    this.data = data;
    this.position = position;
}

// "normal" types of tokens. These are tokens which can be matched by a simple
// regex
var mathNormals = [
    /[/|@.""`0-9a-zA-Z]/, // ords
    /[*+-]/, // bins
    /[=<>:]/, // rels
    /[,;]/, // punctuation
    /['\^_{}]/, // misc
    /[(\[]/, // opens
    /[)\]?!]/, // closes
    /~/, // spacing
    /&/, // horizontal alignment
    /\\\\/ // line break
];

// These are "normal" tokens like above, but should instead be parsed in text
// mode.
//
// NOTE(review): inside the ords class, `)-=` is a character *range* (U+0029
// through U+003D), not three literal characters. Every character in that
// range except `<` is also listed explicitly, so the practical effect is that
// `<` is accepted as a text-mode ord. The regex is left untouched because
// narrowing it would change which inputs lex successfully - confirm intent
// before changing.
var textNormals = [
    /[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords
    /[{}]/, // grouping
    /~/, // spacing
    /&/, // horizontal alignment
    /\\\\/ // line break
];

// Regexes for matching whitespace
var whitespaceRegex = /\s*/;
var whitespaceConcatRegex = / +|\\ +/;

// This regex matches any other TeX function, which is a backslash followed by a
// word or a single symbol
var anyFunc = /\\(?:[a-zA-Z]+|.)/;

/**
 * Returns the (possibly empty) run of whitespace that starts at `pos`.
 *
 * `whitespaceRegex` can match the empty string, so the callers in this file
 * index the match result without a null check; this helper does the same.
 *
 * @param {string} input The text being lexed.
 * @param {number} pos Offset at which to look for whitespace.
 * @return {string} The whitespace found at `pos` ("" when there is none).
 */
function leadingWhitespace(input, pos) {
    return matchAt(whitespaceRegex, input, pos)[0];
}

/**
 * This function lexes a single normal token. It takes a position, a list of
 * "normal" tokens to try, and whether it should completely ignore whitespace or
 * not.
 *
 * @param {number} pos Offset in the input to start lexing from.
 * @param {Array.<RegExp>} normals Single-token regexes to try, in order, when
 *     the input at `pos` is not a backslash function.
 * @param {boolean} ignoreWhitespace If true, leading whitespace is skipped
 *     before lexing. If false, a run of spaces (or a backslash followed by
 *     spaces) at `pos` is returned as a single " " token.
 * @return {Token} The lexed token; its text is "EOF" at end of input.
 * @throws {ParseError} If nothing matches at the (whitespace-adjusted)
 *     position.
 */
Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
    var input = this._input;

    if (ignoreWhitespace) {
        // Get rid of whitespace.
        pos += leadingWhitespace(input, pos).length;
    } else {
        // Do the funky concatenation of whitespace that happens in text mode.
        var spaces = matchAt(whitespaceConcatRegex, input, pos);
        if (spaces !== null) {
            return new Token(" ", null, pos + spaces[0].length);
        }
    }

    // If there's no more input to parse, return an EOF token
    if (pos === input.length) {
        return new Token("EOF", null, pos);
    }

    // A backslash function takes priority over the "normal" tokens.
    var match = matchAt(anyFunc, input, pos);
    if (match) {
        return new Token(match[0], null, pos + match[0].length);
    }

    // Otherwise, look through the normal token regexes in order and return
    // the first one that matches.
    for (var i = 0; i < normals.length; i++) {
        match = matchAt(normals[i], input, pos);
        if (match) {
            return new Token(match[0], null, pos + match[0].length);
        }
    }

    throw new ParseError(
        "Unexpected character: '" + input[pos] + "'", this, pos);
};

// A regex to match a CSS color (like #ffffff or BlueViolet)
var cssColor = /#[a-z0-9]+|[a-z]+/i;

/**
 * This function lexes a CSS color. Leading whitespace is ignored.
 *
 * @param {number} pos Offset in the input to start lexing from.
 * @return {Token} A token whose text is the matched color string.
 * @throws {ParseError} If the input after any whitespace does not look like a
 *     color.
 */
Lexer.prototype._innerLexColor = function(pos) {
    var input = this._input;

    // Ignore whitespace
    pos += leadingWhitespace(input, pos).length;

    var match = matchAt(cssColor, input, pos);
    if (!match) {
        throw new ParseError("Invalid color", this, pos);
    }

    // If we look like a color, return a color
    return new Token(match[0], null, pos + match[0].length);
};

// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex"
var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;

/**
 * This function lexes a dimension. Leading whitespace is ignored.
 *
 * @param {number} pos Offset in the input to start lexing from.
 * @return {Token} A token whose text is the matched dimension and whose data
 *     is `{number: <signed numeric value>, unit: <"em" or "ex">}`.
 * @throws {ParseError} If the input is not a dimension, or if its unit is
 *     anything other than "em" or "ex".
 */
Lexer.prototype._innerLexSize = function(pos) {
    var input = this._input;

    // Ignore whitespace
    pos += leadingWhitespace(input, pos).length;

    var match = matchAt(sizeRegex, input, pos);
    if (!match) {
        throw new ParseError("Invalid size", this, pos);
    }

    var unit = match[3];
    // We only currently handle "em" and "ex" units
    if (unit !== "em" && unit !== "ex") {
        throw new ParseError("Invalid unit: '" + unit + "'", this, pos);
    }

    return new Token(match[0], {
        // match[1] is the optional "-" sign, match[2] the digits; joining
        // them drops any whitespace that sat between sign and number.
        number: +(match[1] + match[2]),
        unit: unit
    }, pos + match[0].length);
};

/**
 * This function lexes a string of whitespace.
 *
 * @param {number} pos Offset in the input to start lexing from.
 * @return {Token} A token whose text is the whole whitespace run found at
 *     `pos` ("" if there is none) and whose position is just past it.
 */
Lexer.prototype._innerLexWhitespace = function(pos) {
    var whitespace = leadingWhitespace(this._input, pos);

    // `whitespace` is already the matched string, so it is used directly as
    // the token text (indexing it again would keep only its first character,
    // or yield `undefined` for an empty run).
    return new Token(whitespace, null, pos + whitespace.length);
};

/**
 * This function lexes a single token starting at `pos` and of the given mode.
 * Based on the mode, we defer to one of the `_innerLex` functions.
 *
 * @param {number} pos Offset in the input to start lexing from.
 * @param {string} mode One of "math", "text", "color", "size" or
 *     "whitespace".
 * @return {Token|undefined} The lexed token. Any other mode falls through
 *     every branch and yields `undefined`.
 * @throws {ParseError} Propagated from the `_innerLex` function that handles
 *     the mode.
 */
Lexer.prototype.lex = function(pos, mode) {
    if (mode === "math") {
        return this._innerLex(pos, mathNormals, true);
    } else if (mode === "text") {
        return this._innerLex(pos, textNormals, false);
    } else if (mode === "color") {
        return this._innerLexColor(pos);
    } else if (mode === "size") {
        return this._innerLexSize(pos);
    } else if (mode === "whitespace") {
        return this._innerLexWhitespace(pos);
    }
};

module.exports = Lexer;