
This commit introduces environments, and implements the parser infrastructure to handle them, even including arguments after the “\begin{name}” construct. It also offers a way to turn array-like data structures, i.e. delimited by “&” and “\\”, into nested arrays of groups. Environments are essentially functions which call back to the parser to parse their body. It is their responsibility to stop at the next “\end”, while the parser takes care of verifying that the names match between “\begin” and “\end”. The environment has to return a ParseResult, to provide the position that goes with the resulting node. One application of this is the “array” environment. So far, it supports column alignment, but no column separators, and no multi-column shorthands using “*{…}”. Building on the same infrastructure, there are “matrix”, “pmatrix”, “bmatrix”, “vmatrix” and “Vmatrix” environments. Internally these are just “\left..\right” wrapped around an array with no margins at its ends. Spacing for arrays and matrices was derived from the LaTeX sources, and comments indicate the appropriate references. Now we have hard-wired breaks in parseExpression, to always break on “}”, “\end”, “\right”, “&”, “\\” and “\cr”. This means that these symbols are never PART of an expression, at least not without some nesting. They may follow AFTER an expression, and the caller of parseExpression should be expecting them. The implicit groups for sizing or styling don't care what ended the expression, which is all right for them. We still have support for breakOnToken, but now it is only used for “]” since that MAY be used to terminate an optional argument, but otherwise it's an ordinary symbol.
195 lines
5.5 KiB
JavaScript
195 lines
5.5 KiB
JavaScript
/**
|
|
* The Lexer class handles tokenizing the input in various ways. Since our
|
|
* parser expects us to be able to backtrack, the lexer allows lexing from any
|
|
* given starting point.
|
|
*
|
|
* Its main exposed function is the `lex` function, which takes a position to
|
|
* lex from and a type of token to lex. It defers to the appropriate `_innerLex`
|
|
* function.
|
|
*
|
|
* The various `_innerLex` functions perform the actual lexing of different
|
|
* kinds.
|
|
*/
|
|
|
|
var matchAt = require("match-at");
|
|
|
|
var ParseError = require("./ParseError");
|
|
|
|
/**
 * The main Lexer class. It simply stores the input string so that the
 * various lexing functions can (re)start from any position in it.
 *
 * @param {string} input - the full expression to be tokenized
 */
function Lexer(input) {
    // All lexing reads from this string by position; it is never mutated.
    this._input = input;
}
|
|
|
|
/**
 * The token type returned from `lex`.
 *
 * @param {string} text - the text that was matched for this token
 * @param {?Object} data - extra parsed payload (e.g. a size), or null
 * @param {number} position - the input position just past this token
 */
function Token(text, data, position) {
    this.text = text;
    this.data = data;
    this.position = position;
}
|
|
|
|
// "normal" types of tokens. These are tokens which can be matched by a simple
// regex. Each regex matches a single character at the current position
// (except the final one, which matches the two-character "\\");
// `_innerLex` tries them in this order and returns the first match.
var mathNormals = [
    /[/|@.""`0-9a-zA-Z]/, // ords
    /[*+-]/, // bins
    /[=<>:]/, // rels
    /[,;]/, // punctuation
    /['\^_{}]/, // misc
    /[(\[]/, // opens
    /[)\]?!]/, // closes
    /~/, // spacing
    /&/, // horizontal alignment
    /\\\\/ // line break
];
|
|
|
|
// These are "normal" tokens like above, but should instead be parsed in text
// mode. Note that most punctuation lexes as an "ord" here, unlike in math
// mode; `_innerLex` tries these in order and returns the first match.
var textNormals = [
    /[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords
    /[{}]/, // grouping
    /~/, // spacing
    /&/, // horizontal alignment
    /\\\\/ // line break
];
|
|
|
|
// Regexes for matching whitespace. `whitespaceRegex` matches a (possibly
// empty) run of whitespace; `whitespaceConcatRegex` matches the text-mode
// forms (plain spaces, or an escaped space "\ " followed by spaces) that
// `_innerLex` collapses into a single " " token.
var whitespaceRegex = /\s*/;
var whitespaceConcatRegex = / +|\\ +/;

// This regex matches any other TeX function, which is a backslash followed by a
// word or a single symbol
var anyFunc = /\\(?:[a-zA-Z]+|.)/;
|
|
|
|
/**
 * Lexes a single "normal" token starting at `pos`.
 *
 * `normals` is the list of single-token regexes to try (in order), and
 * `ignoreWhitespace` chooses between math-mode behavior (skip whitespace
 * entirely) and text-mode behavior (collapse runs of whitespace into a
 * single space token).
 *
 * Throws a ParseError if no function token and no normal token matches.
 */
Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) {
    var input = this._input;

    if (ignoreWhitespace) {
        // Math mode: silently step over any run of whitespace.
        pos += matchAt(whitespaceRegex, input, pos)[0].length;
    } else {
        // Text mode: the funky whitespace concatenation — a run of spaces
        // (or "\ " plus spaces) becomes one single-space token.
        var ws = matchAt(whitespaceConcatRegex, input, pos);
        if (ws !== null) {
            return new Token(" ", null, pos + ws[0].length);
        }
    }

    // If there's no more input to parse, return an EOF token.
    if (pos === input.length) {
        return new Token("EOF", null, pos);
    }

    // A backslash starts a function token; try that first.
    var funcMatch = matchAt(anyFunc, input, pos);
    if (funcMatch) {
        return new Token(funcMatch[0], null, pos + funcMatch[0].length);
    }

    // Otherwise, try each of the "normal" token regexes in turn.
    for (var i = 0; i < normals.length; i++) {
        var m = matchAt(normals[i], input, pos);
        if (m) {
            return new Token(m[0], null, pos + m[0].length);
        }
    }

    // Nothing matched the character at `pos`.
    throw new ParseError(
        "Unexpected character: '" + input[pos] + "'",
        this, pos);
};
|
|
|
|
// A regex to match a CSS color (like #ffffff or BlueViolet);
// case-insensitive, and the hex form accepts any number of hex digits.
var cssColor = /#[a-z0-9]+|[a-z]+/i;
|
|
|
|
/**
 * Lexes a CSS color (e.g. "#ff00aa" or "BlueViolet") starting at `pos`,
 * after skipping any leading whitespace.
 *
 * Throws a ParseError when the input at `pos` does not look like a color.
 */
Lexer.prototype._innerLexColor = function(pos) {
    var input = this._input;

    // Skip leading whitespace.
    pos += matchAt(whitespaceRegex, input, pos)[0].length;

    var match = matchAt(cssColor, input, pos);
    if (match === null) {
        throw new ParseError("Invalid color", this, pos);
    }

    // We look like a color, so return a color token.
    return new Token(match[0], null, pos + match[0].length);
};
|
|
|
|
// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex". Capture groups: (1) an optional minus
// sign, (2) the decimal magnitude, (3) the two-letter unit.
var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
|
|
|
|
/**
 * Lexes a dimension (e.g. "1.2em" or "-.4 ex") starting at `pos`, after
 * skipping any leading whitespace.
 *
 * Returns a Token whose `data` holds the numeric value and unit.
 * Throws a ParseError for malformed sizes or for units other than
 * "em"/"ex", which are the only ones currently handled.
 */
Lexer.prototype._innerLexSize = function(pos) {
    var input = this._input;

    // Skip leading whitespace.
    pos += matchAt(whitespaceRegex, input, pos)[0].length;

    var match = matchAt(sizeRegex, input, pos);
    if (match === null) {
        throw new ParseError("Invalid size", this, pos);
    }

    var unit = match[3];
    // We only currently handle "em" and "ex" units.
    if (unit !== "em" && unit !== "ex") {
        throw new ParseError("Invalid unit: '" + unit + "'", this, pos);
    }

    var data = {
        // Combine the optional sign with the magnitude and convert.
        number: +(match[1] + match[2]),
        unit: unit
    };
    return new Token(match[0], data, pos + match[0].length);
};
|
|
|
|
/**
|
|
* This function lexes a string of whitespace.
|
|
*/
|
|
Lexer.prototype._innerLexWhitespace = function(pos) {
|
|
var input = this._input;
|
|
|
|
var whitespace = matchAt(whitespaceRegex, input, pos)[0];
|
|
pos += whitespace.length;
|
|
|
|
return new Token(whitespace[0], null, pos);
|
|
};
|
|
|
|
/**
 * Lexes a single token starting at `pos` in the given mode, dispatching
 * to the matching `_innerLex*` helper. Recognized modes are "math",
 * "text", "color", "size" and "whitespace"; any other mode yields
 * undefined.
 */
Lexer.prototype.lex = function(pos, mode) {
    switch (mode) {
        case "math":
            // Math mode skips whitespace entirely.
            return this._innerLex(pos, mathNormals, true);
        case "text":
            // Text mode collapses whitespace runs into single spaces.
            return this._innerLex(pos, textNormals, false);
        case "color":
            return this._innerLexColor(pos);
        case "size":
            return this._innerLexSize(pos);
        case "whitespace":
            return this._innerLexWhitespace(pos);
    }
};
|
|
|
|
module.exports = Lexer;
|