KaTeX/src/Lexer.js
Martin von Gagern 2f7a54877a Implement environments, for arrays and matrices in particular
This commit introduces environments, and implements the parser
infrastructure to handle them, even including arguments after the
“\begin{name}” construct.  It also offers a way to turn array-like data
structures, i.e. delimited by “&” and “\\”, into nested arrays of groups.
Environments are essentially functions which call back to the parser to
parse their body.  It is their responsibility to stop at the next “\end”,
while the parser takes care of verifying that the names match between
“\begin” and “\end”.  The environment has to return a ParseResult, to
provide the position that goes with the resulting node.
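
Roughly, that contract looks like the following sketch (the function and
helper names here are placeholders, not the actual implementation):

    // Hypothetical environment handler: it parses its own body, stopping at
    // the next "\end", while the parser later checks that \end{name} matches
    // the opening \begin{name}.
    function arrayHandler(parser, pos, args) {
        var body = parseRowsAndCells(parser, pos);  // placeholder helper
        var node = buildArrayNode(body.rows, args); // placeholder helper
        // Return a ParseResult so the caller gets both the node and the
        // position just past the environment body.
        return new ParseResult(node, body.position);
    }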

One application of this is the “array” environment.  So far, it supports
column alignment, but no column separators, and no multi-column shorthands
using “*{…}”.  Building on the same infrastructure, there are “matrix”,
“pmatrix”, “bmatrix”, “vmatrix” and “Vmatrix” environments.  Internally
these are just “\left..\right” wrapped around an array with no margins at
its ends.  Spacing for arrays and matrices was derived from the LaTeX
sources, and comments indicate the appropriate references.
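
For instance, an input like

    \begin{pmatrix} a & b \\ c & d \end{pmatrix}

is treated roughly as if it had been written as

    \left( \begin{array}{cc} a & b \\ c & d \end{array} \right)

except that the array carries no margins at its ends (the “{cc}” column
specification is only an assumption about the default alignment).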

Now we have hard-wired breaks in parseExpression, to always break on “}”,
“\end”, “\right”, “&”, “\\” and “\cr”.  This means that these symbols are
never PART of an expression, at least not without some nesting.  They may
follow AFTER an expression, and the caller of parseExpression should be
expecting them.  The implicit groups for sizing or styling don't care what
ended the expression, which is all right for them.  We still have support
for breakOnToken, but now it is only used for “]” since that MAY be used to
terminate an optional argument, but otherwise it's an ordinary symbol.
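
A sketch of what this looks like from the caller's side (the method names and
signatures below are placeholders, not the real parser code):

    // parseExpression stops *before* one of the hard-wired break tokens
    // ("}", "\end", "\right", "&", "\\", "\cr"); the caller then inspects
    // the next token and reacts to whichever terminator it finds.
    var body = this.parseExpression(pos, mode);     // placeholder signature
    var next = this.lexer.lex(body.position, mode);
    if (next.text === "\\end") {
        // read "\end{name}" and check it against the opening "\begin{name}"
    } else if (next.text === "&" || next.text === "\\\\") {
        // inside an array-like environment: start the next cell or row
    }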
2015-06-18 22:24:40 +02:00

/**
 * The Lexer class handles tokenizing the input in various ways. Since our
 * parser expects us to be able to backtrack, the lexer allows lexing from any
 * given starting point.
 *
 * Its main exposed function is the `lex` function, which takes a position to
 * lex from and a type of token to lex. It defers to the appropriate `_innerLex`
 * function.
 *
 * The various `_innerLex` functions perform the actual lexing of different
 * kinds.
*/
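// Illustrative usage, worked out from the code below (the input strings and
// the resulting positions are examples, not part of the original file; a
// Token's position field points just past the lexed token):
//
//     var lexer = new Lexer("x^2 \\frac");
//     lexer.lex(0, "math");  // -> Token("x", null, 1)
//     lexer.lex(3, "math");  // skips the space -> Token("\frac", null, 9)
//
//     new Lexer("#ff0000").lex(0, "color"); // -> Token("#ff0000", null, 7)
//     new Lexer("1.5em").lex(0, "size");    // -> Token("1.5em", {number: 1.5, unit: "em"}, 5)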
var matchAt = require("match-at");
var ParseError = require("./ParseError");
// The main lexer class
function Lexer(input) {
    this._input = input;
}
// The resulting token returned from `lex`.
function Token(text, data, position) {
    this.text = text;
    this.data = data;
    this.position = position;
}
// "normal" types of tokens. These are tokens which can be matched by a simple
// regex
var mathNormals = [
    /[/|@.""`0-9a-zA-Z]/, // ords
    /[*+-]/, // bins
    /[=<>:]/, // rels
    /[,;]/, // punctuation
    /['\^_{}]/, // misc
    /[(\[]/, // opens
    /[)\]?!]/, // closes
    /~/, // spacing
    /&/, // horizontal alignment
    /\\\\/ // line break
];
// These are "normal" tokens like above, but should instead be parsed in text
// mode.
var textNormals = [
    /[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords
    /[{}]/, // grouping
    /~/, // spacing
    /&/, // horizontal alignment
    /\\\\/ // line break
];
// Regexes for matching whitespace
var whitespaceRegex = /\s*/;
var whitespaceConcatRegex = / +|\\ +/;
// This regex matches any other TeX function, which is a backslash followed by a
// word or a single symbol
var anyFunc = /\\(?:[a-zA-Z]+|.)/;
/**
 * This function lexes a single normal token. It takes a position, a list of
 * "normal" tokens to try, and whether it should completely ignore whitespace or
 * not.
*/
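// For example (illustrative): _innerLex(0, mathNormals, true) on the input
// "  x" skips the two leading spaces and returns Token("x", null, 3).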
Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
    var input = this._input;
    var whitespace;

    if (ignoreWhitespace) {
        // Get rid of whitespace.
        whitespace = matchAt(whitespaceRegex, input, pos)[0];
        pos += whitespace.length;
    } else {
        // Do the funky concatenation of whitespace that happens in text mode.
        whitespace = matchAt(whitespaceConcatRegex, input, pos);
        if (whitespace !== null) {
            return new Token(" ", null, pos + whitespace[0].length);
        }
    }

    // If there's no more input to parse, return an EOF token
    if (pos === input.length) {
        return new Token("EOF", null, pos);
    }

    var match;
    if ((match = matchAt(anyFunc, input, pos))) {
        // If we match a function token, return it
        return new Token(match[0], null, pos + match[0].length);
    } else {
        // Otherwise, we look through the normal token regexes and see if it's
        // one of them.
        for (var i = 0; i < normals.length; i++) {
            var normal = normals[i];
            if ((match = matchAt(normal, input, pos))) {
                // If it is, return it
                return new Token(
                    match[0], null, pos + match[0].length);
            }
        }
    }

    throw new ParseError(
        "Unexpected character: '" + input[pos] + "'",
        this, pos);
};
// A regex to match a CSS color (like #ffffff or BlueViolet)
var cssColor = /#[a-z0-9]+|[a-z]+/i;
/**
 * This function lexes a CSS color.
*/
Lexer.prototype._innerLexColor = function(pos) {
    var input = this._input;

    // Ignore whitespace
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;

    var match;
    if ((match = matchAt(cssColor, input, pos))) {
        // If we look like a color, return a color
        return new Token(match[0], null, pos + match[0].length);
    } else {
        throw new ParseError("Invalid color", this, pos);
    }
};
// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex"
var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
/**
 * This function lexes a dimension.
*/
Lexer.prototype._innerLexSize = function(pos) {
    var input = this._input;

    // Ignore whitespace
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;

    var match;
    if ((match = matchAt(sizeRegex, input, pos))) {
        var unit = match[3];
        // We only currently handle "em" and "ex" units
        if (unit !== "em" && unit !== "ex") {
            throw new ParseError("Invalid unit: '" + unit + "'", this, pos);
        }
        return new Token(match[0], {
            number: +(match[1] + match[2]),
            unit: unit
        }, pos + match[0].length);
    }

    throw new ParseError("Invalid size", this, pos);
};
/**
 * This function lexes a string of whitespace.
*/
Lexer.prototype._innerLexWhitespace = function(pos) {
    var input = this._input;

    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;

    return new Token(whitespace[0], null, pos);
};
/**
 * This function lexes a single token starting at `pos` and of the given mode.
 * Based on the mode, we defer to one of the `_innerLex` functions.
*/
Lexer.prototype.lex = function(pos, mode) {
    if (mode === "math") {
        return this._innerLex(pos, mathNormals, true);
    } else if (mode === "text") {
        return this._innerLex(pos, textNormals, false);
    } else if (mode === "color") {
        return this._innerLexColor(pos);
    } else if (mode === "size") {
        return this._innerLexSize(pos);
    } else if (mode === "whitespace") {
        return this._innerLexWhitespace(pos);
    }
};
module.exports = Lexer;