KaTeX/src/Parser.js
Martin von Gagern 2f7a54877a Implement environments, for arrays and matrices in particular
This commit introduces environments, and implements the parser
infrastructure to handle them, even including arguments after the
“\begin{name}” construct.  It also offers a way to turn array-like data
structures, i.e. delimited by “&” and “\\”, into nested arrays of groups.
Environments are essentially functions which call back to the parser to
parse their body.  It is their responsibility to stop at the next “\end”,
while the parser takes care of verifying that the names match between
“\begin” and “\end”.  The environment has to return a ParseResult, to
provide the position that goes with the resulting node.

One application of this is the “array” environment.  So far, it supports
column alignment, but no column separators, and no multi-column shorthands
using “*{…}”.  Building on the same infrastructure, there are “matrix”,
“pmatrix”, “bmatrix”, “vmatrix” and “Vmatrix” environments.  Internally
these are just “\left..\right” wrapped around an array with no margins at
its ends.  Spacing for arrays and matrices was derived from the LaTeX
sources, and comments indicate the appropriate references.

Now we have hard-wired breaks in parseExpression, to always break on “}”,
“\end”, “\right”, “&”, “\\” and “\cr”.  This means that these symbols are
never PART of an expression, at least not without some nesting.  They may
follow AFTER an expression, and the caller of parseExpression should be
expecting them.  The implicit groups for sizing or styling don't care what
ended the expression, which is all right for them.  We still have support
for breakOnToken, but now it is only used for “]” since that MAY be used to
terminate an optional argument, but otherwise it's an ordinary symbol.
2015-06-18 22:24:40 +02:00

653 lines
22 KiB
JavaScript

var functions = require("./functions");
var environments = require("./environments");
var Lexer = require("./Lexer");
var symbols = require("./symbols");
var utils = require("./utils");
var parseData = require("./parseData");
var ParseError = require("./ParseError");
/**
* This file contains the parser used to parse out a TeX expression from the
* input. Since TeX isn't context-free, standard parsers don't work particularly
* well.
*
* The strategy of this parser is as such:
*
* The main functions (the `.parse...` ones) take a position in the current
* parse string to parse tokens from. The lexer (found in Lexer.js, stored at
* this.lexer) also supports pulling out tokens at arbitrary places. When
* individual tokens are needed at a position, the lexer is called to pull out a
* token, which is then used.
*
* The main functions also take a mode that the parser is currently in
* (currently "math" or "text"), which denotes whether the current environment
* is a math-y one or a text-y one (e.g. inside \text). Currently, this serves
* to limit the functions which can be used in text mode.
*
* The main functions then return an object which contains the useful data that
* was parsed at its given point, and a new position at the end of the parsed
* data. The main functions can call each other and continue the parsing by
* using the returned position as a new starting point.
*
* There are also extra `.handle...` functions, which pull out some reused
* functionality into self-contained functions.
*
* The earlier functions return `ParseResult`s, which contain a ParseNode and a
* new position.
*
* The later functions (which are called deeper in the parse) sometimes return
* ParseFuncOrArgument, which contain a ParseResult as well as some data about
* whether the parsed object is a function which is missing some arguments, or a
* standalone object which can be used as an argument to another function.
*/
/**
 * Parser constructor.
 *
 * @param input     handed unchanged to the Lexer constructor
 * @param settings  kept as-is on the instance for use while parsing
 */
function Parser(input, settings) {
    // All tokens are pulled from this lexer, which wraps the input
    var lexer = new Lexer(input);
    this.lexer = lexer;
    // Remember the caller's settings
    this.settings = settings;
}
// Local aliases for the ParseNode and ParseResult members of parseData, so
// the rest of this file can refer to them by their short names.
var ParseNode = parseData.ParseNode;
var ParseResult = parseData.ParseResult;
/**
 * Wraps either a bare function (whose arguments have not been parsed yet) or
 * something that can itself serve as an argument to a function.
 *
 * @param result      should be a ParseResult
 * @param isFunction  whether the wrapped thing is a function, i.e. something
 *                    defined in functions.js
 */
function ParseFuncOrArgument(result, isFunction) {
    // The wrapped parse result
    this.result = result;
    // Flag telling functions apart from standalone arguments
    this.isFunction = isFunction;
}
/**
 * Verifies that a lexed token has the expected text.
 *
 * @param result  token-like object; its `text` is compared and its `position`
 *                is reported on failure
 * @param text    the text the token must have
 * @throws {ParseError} when `result.text` differs from `text`
 */
Parser.prototype.expect = function(result, text) {
    if (result.text === text) {
        // Matches: nothing to report
        return;
    }
    var message = "Expected '" + text + "', got '" + result.text + "'";
    throw new ParseError(message, this.lexer, result.position);
};
/**
 * Entry point: parses the whole input held by this parser's lexer.
 *
 * The `input` parameter is not read here; parsing always begins at position 0
 * in math mode.
 *
 * @return {?Array.<ParseNode>}
 */
Parser.prototype.parse = function(input) {
    // Only the parsed nodes are handed back, not the surrounding result object
    return this.parseInput(0, "math").result;
};
/**
 * Parses a complete input: one expression that must be followed by EOF.
 *
 * @return the result of parseExpression, unchanged
 * @throws whatever `expect` throws when the token after the expression is
 *         not "EOF"
 */
Parser.prototype.parseInput = function(pos, mode) {
    // Infix nodes do not end a top-level expression
    var parsed = this.parseExpression(pos, mode, false);
    // Anything left over after the expression is an error
    this.expect(parsed.peek, "EOF");
    return parsed;
};
var endOfExpression = ["}", "\\end", "\\right", "&", "\\\\", "\\cr"];
/**
* Parses an "expression", which is a list of atoms.
*
* @param {boolean} breakOnInfix Should the parsing stop when we hit infix
* nodes? This happens when functions have higher precendence
* than infix nodes in implicit parses.
*
* @param {?string} breakOnToken The token that the expression should end with,
* or `null` if something else should end the expression.
*
* @return {ParseResult}
*/
Parser.prototype.parseExpression = function(pos, mode, breakOnInfix, breakOnToken) {
var body = [];
var lex = null;
// Keep adding atoms to the body until we can't parse any more atoms (either
// we reached the end, a }, or a \right)
while (true) {
lex = this.lexer.lex(pos, mode);
if (endOfExpression.indexOf(lex.text) !== -1) {
break;
}
if (breakOnToken && lex.text === breakOnToken) {
break;
}
var atom = this.parseAtom(pos, mode);
if (!atom) {
break;
}
if (breakOnInfix && atom.result.type === "infix") {
break;
}
body.push(atom.result);
pos = atom.position;
}
var res = new ParseResult(this.handleInfixNodes(body, mode), pos);
res.peek = lex;
return res;
};
/**
 * Replaces an infix operator (such as \over) in a list of nodes with a call to
 * the function named by the operator's `replaceWith` (such as \frac).
 *
 * A group may hold at most one infix operator; with more than one the
 * expression is ambiguous, which the author can resolve by adding {}.
 *
 * @param {Array} body  nodes of one group
 * @param mode          mode recorded on every node created here
 * @returns {Array} `body` itself when it has no infix node, otherwise a
 *          one-element array holding the replacement node
 * @throws {ParseError} when `body` contains more than one infix node
 */
Parser.prototype.handleInfixNodes = function (body, mode) {
    var infixAt = -1;
    var replacementName;
    var replacement;
    for (var idx = 0; idx < body.length; idx++) {
        if (body[idx].type !== "infix") {
            continue;
        }
        if (infixAt >= 0) {
            throw new ParseError("only one infix operator per group",
                this.lexer, -1);
        }
        infixAt = idx;
        replacementName = body[idx].value.replaceWith;
        replacement = functions.funcs[replacementName];
    }

    if (infixAt < 0) {
        // Nothing to rewrite
        return body;
    }

    // A side that is already exactly one ordgroup is used directly; anything
    // else gets wrapped in a fresh ordgroup.
    var asGroup = function(nodes) {
        var alreadyGrouped =
            nodes.length === 1 && nodes[0].type === "ordgroup";
        return alreadyGrouped ? nodes[0] : new ParseNode("ordgroup", nodes, mode);
    };

    var before = asGroup(body.slice(0, infixAt));
    var after = asGroup(body.slice(infixAt + 1));
    var value = replacement.handler(replacementName, before, after);
    return [new ParseNode(value.type, value, mode)];
};
// How tightly ^ and _ bind; compared against a function's own greediness
var SUPSUB_GREEDINESS = 1;
/**
 * Parses the group that follows a ^ or _ and reports problems with readable
 * error messages.
 *
 * @param pos     position just after the ^ or _
 * @param mode    current mode
 * @param symbol  the script character, used in the "missing group" message
 * @param name    "superscript" or "subscript", used in the "bare function"
 *                message
 * @return the parsed group's result, or the result of parseFunction when the
 *         group is a function greedier than a script
 * @throws {ParseError} when no group follows, or when the group is a function
 *         that is not greedier than a script
 */
Parser.prototype.handleSupSubscript = function(pos, mode, symbol, name) {
    var group = this.parseGroup(pos, mode);
    if (!group) {
        throw new ParseError(
            "Expected group after '" + symbol + "'", this.lexer, pos);
    }
    if (!group.isFunction) {
        // An ordinary group can be used as the script directly
        return group.result;
    }
    // The script is a function: whether it may grab its own arguments here
    // depends on how its greediness compares with that of ^ and _
    var funcName = group.result.result;
    if (functions.funcs[funcName].greediness > SUPSUB_GREEDINESS) {
        return this.parseFunction(pos, mode);
    }
    throw new ParseError(
        "Got function '" + funcName + "' with no arguments as " + name,
        this.lexer, pos);
};
/**
 * Parses a group with optional super/subscripts.
 *
 * In text mode no scripts are recognised and the implicit group is returned
 * as-is. In other modes any run of `^`, `_` and `'` following the base is
 * collected; a run of primes becomes an ordgroup of \prime nodes used as the
 * superscript.
 *
 * @param pos   position to start parsing at
 * @param mode  current mode
 * @return {?ParseResult} a "supsub" node when at least one script was found,
 *         otherwise the base (which is falsy when nothing could be parsed)
 * @throws {ParseError} "Double superscript" / "Double subscript" when the
 *         same kind of script is given twice, including a prime that follows
 *         an explicit superscript
 */
Parser.prototype.parseAtom = function(pos, mode) {
    // The body of an atom is an implicit group, so that things like
    // \left(x\right)^2 work correctly.
    var base = this.parseImplicitGroup(pos, mode);
    // In text mode, we don't have superscripts or subscripts
    if (mode === "text") {
        return base;
    }
    // Handle an empty base: scripts then start right at `pos`
    var currPos;
    if (!base) {
        currPos = pos;
        // Normalise a missing base to undefined
        base = undefined;
    } else {
        currPos = base.position;
    }
    var superscript;
    var subscript;
    var result;
    while (true) {
        // Lex the token that may start a script
        var lex = this.lexer.lex(currPos, mode);
        if (lex.text === "^") {
            // We got a superscript start
            if (superscript) {
                throw new ParseError(
                    "Double superscript", this.lexer, currPos);
            }
            result = this.handleSupSubscript(
                lex.position, mode, lex.text, "superscript");
            currPos = result.position;
            superscript = result.result;
        } else if (lex.text === "_") {
            // We got a subscript start
            if (subscript) {
                throw new ParseError(
                    "Double subscript", this.lexer, currPos);
            }
            result = this.handleSupSubscript(
                lex.position, mode, lex.text, "subscript");
            currPos = result.position;
            subscript = result.result;
        } else if (lex.text === "'") {
            // We got a prime. Primes occupy the superscript slot, so one
            // following an explicit superscript must be rejected instead of
            // silently replacing what was already parsed.
            if (superscript) {
                throw new ParseError(
                    "Double superscript", this.lexer, currPos);
            }
            var prime = new ParseNode("textord", "\\prime", mode);
            // Many primes can be grouped together, so we handle this here
            var primes = [prime];
            currPos = lex.position;
            // Keep lexing tokens until we get something that's not a prime
            while ((lex = this.lexer.lex(currPos, mode)).text === "'") {
                // For each one, add another prime to the list
                primes.push(prime);
                currPos = lex.position;
            }
            // Put them into an ordgroup as the superscript
            superscript = new ParseNode("ordgroup", primes, mode);
        } else {
            // If it wasn't ^, _, or ', stop parsing super/subscripts
            break;
        }
    }
    if (superscript || subscript) {
        // If we got either a superscript or subscript, create a supsub
        return new ParseResult(
            new ParseNode("supsub", {
                base: base && base.result,
                sup: superscript,
                sub: subscript
            }, mode),
            currPos);
    } else {
        // Otherwise return the original body
        return base;
    }
};
// A list of the size-changing functions, for use in parseImplicitGroup
var sizeFuncs = [
    "\\tiny", "\\scriptsize", "\\footnotesize", "\\small", "\\normalsize",
    "\\large", "\\Large", "\\LARGE", "\\huge", "\\Huge"
];
// A list of the style-changing functions, for use in parseImplicitGroup
var styleFuncs = [
    "\\displaystyle", "\\textstyle", "\\scriptstyle", "\\scriptscriptstyle"
];
/**
 * Parses an implicit group, which is a group that starts at the end of a
 * specified function and ends right before a higher explicit group ends, or
 * at EOL. It is used for functions that appear to affect the current style,
 * like \Large or \textrm, where instead of keeping a style we just pretend
 * that there is an implicit grouping after it until the end of the group. E.g.
 *   small text {\Large large text} small text again
 * It is also used for \left and \right to get the correct grouping, and for
 * \begin and \end, where the environment's handler parses the body.
 *
 * Anything that is not \left, \begin, a sizing function or a styling function
 * is handed to parseFunction.
 *
 * @param pos   position to start parsing at
 * @param mode  current mode
 * @return {?ParseResult}
 * @throws {ParseError} for an unknown environment name, or when the name in
 *         \end does not match the one in \begin
 */
Parser.prototype.parseImplicitGroup = function(pos, mode) {
    var start = this.parseSymbol(pos, mode);
    if (!start || !start.result) {
        // If we didn't get anything we handle, fall back to parseFunction
        return this.parseFunction(pos, mode);
    }
    var func = start.result.result;
    var body;
    if (func === "\\left") {
        // If we see a left:
        // Parse the entire left function (including the delimiter)
        var left = this.parseFunction(pos, mode);
        // Parse out the implicit body
        body = this.parseExpression(left.position, mode, false);
        // Check the next token
        this.expect(body.peek, "\\right");
        var right = this.parseFunction(body.position, mode);
        return new ParseResult(
            new ParseNode("leftright", {
                body: body.result,
                left: left.result.value.value,
                right: right.result.value.value
            }, mode),
            right.position);
    } else if (func === "\\begin") {
        // begin...end is similar to left...right
        var begin = this.parseFunction(pos, mode);
        var envName = begin.result.value.name;
        if (!environments.hasOwnProperty(envName)) {
            throw new ParseError(
                "No such environment: " + envName,
                this.lexer, begin.result.value.namepos);
        }
        // Look up the environment. Its handler receives the position after
        // the arguments, the mode, the environment name, and then whatever
        // parseArguments pushed onto the list.
        var env = environments[envName];
        var args = [null, mode, envName];
        var newPos = this.parseArguments(
            begin.position, mode, "\\begin{" + envName + "}", env, args);
        args[0] = newPos;
        var result = env.handler.apply(this, args);
        var endLex = this.lexer.lex(result.position, mode);
        this.expect(endLex, "\\end");
        var end = this.parseFunction(result.position, mode);
        if (end.result.value.name !== envName) {
            // `end` is a ParseResult, so the name position has to be read
            // from the parsed node's value, just as for `begin` above.
            throw new ParseError(
                "Mismatch: \\begin{" + envName + "} matched " +
                "by \\end{" + end.result.value.name + "}",
                this.lexer, end.result.value.namepos);
        }
        // The environment as a whole ends after the \end{...}
        result.position = end.position;
        return result;
    } else if (utils.contains(sizeFuncs, func)) {
        // If we see a sizing function, parse out the implicit body
        body = this.parseExpression(start.result.position, mode, false);
        return new ParseResult(
            new ParseNode("sizing", {
                // Figure out what size to use based on the list of functions
                // above
                size: "size" + (utils.indexOf(sizeFuncs, func) + 1),
                value: body.result
            }, mode),
            body.position);
    } else if (utils.contains(styleFuncs, func)) {
        // If we see a styling function, parse out the implicit body
        body = this.parseExpression(start.result.position, mode, true);
        return new ParseResult(
            new ParseNode("styling", {
                // Figure out what style to use by pulling out the style from
                // the function name: drop the leading backslash and the
                // trailing "style"
                style: func.slice(1, func.length - 5),
                value: body.result
            }, mode),
            body.position);
    } else {
        // Defer to parseFunction if it's not a function we handle
        return this.parseFunction(pos, mode);
    }
};
/**
 * Parses an entire function, including its base and all of its arguments.
 *
 * When the group at `pos` is not a function, that group's result is returned
 * unchanged. When it is a function, its arguments are parsed and the
 * function's handler is applied (with the parser as `this`) to the function
 * name followed by everything parseArguments collected.
 *
 * @param pos   position to start parsing at
 * @param mode  current mode
 * @return {?ParseResult} `null` when no group could be parsed at `pos`
 * @throws {ParseError} when a function that is not allowed in text mode is
 *         used in text mode
 */
Parser.prototype.parseFunction = function(pos, mode) {
    var baseGroup = this.parseGroup(pos, mode);
    if (!baseGroup) {
        return null;
    }
    if (!baseGroup.isFunction) {
        return baseGroup.result;
    }

    var func = baseGroup.result.result;
    var funcData = functions.funcs[func];
    if (mode === "text" && !funcData.allowedInText) {
        // baseGroup is a ParseFuncOrArgument, which carries no position of
        // its own; the position lives on the wrapped ParseResult.
        throw new ParseError(
            "Can't use function '" + func + "' in text mode",
            this.lexer, baseGroup.result.position);
    }

    // The handler's first argument is always the function name
    var args = [func];
    var newPos = this.parseArguments(
        baseGroup.result.position, mode, func, funcData, args);
    var result = funcData.handler.apply(this, args);
    return new ParseResult(
        new ParseNode(result.type, result, mode),
        newPos);
};
/**
 * Parses the arguments of a function or environment.
 *
 * Optional arguments come first (indices below `numOptionalArgs`); a missing
 * optional argument is recorded as `null`. After all arguments, an array of
 * positions (the start position followed by the position after each argument)
 * is pushed onto `args` as well. When there are no arguments at all, `args`
 * is left untouched.
 *
 * @param pos position to start parsing at
 * @param mode current mode
 * @param {string} func "\name" or "\begin{name}"
 * @param {{numArgs:number,numOptionalArgs:number|undefined}} funcData
 *        a missing `numOptionalArgs` is treated as 0
 * @param {Array} args list of arguments to which new ones will be pushed
 * @return the position after all arguments have been parsed
 * @throws {ParseError} when a required argument is missing, or when an
 *         argument is a function that is not greedier than `func`
 */
Parser.prototype.parseArguments = function(pos, mode, func, funcData, args) {
    // numOptionalArgs may legitimately be undefined; without a default the
    // sum below would be NaN and no argument would ever be parsed.
    var numOptionalArgs = funcData.numOptionalArgs || 0;
    var totalArgs = funcData.numArgs + numOptionalArgs;
    if (totalArgs === 0) {
        return pos;
    }
    var newPos = pos;
    var baseGreediness = funcData.greediness;
    var positions = [newPos];
    for (var i = 0; i < totalArgs; i++) {
        var argType = funcData.argTypes && funcData.argTypes[i];
        var arg;
        if (i < numOptionalArgs) {
            if (argType) {
                arg = this.parseSpecialGroup(newPos, argType, mode, true);
            } else {
                arg = this.parseOptionalGroup(newPos, mode);
            }
            if (!arg) {
                // A missing optional argument consumes nothing
                args.push(null);
                positions.push(newPos);
                continue;
            }
        } else {
            if (argType) {
                arg = this.parseSpecialGroup(newPos, argType, mode);
            } else {
                arg = this.parseGroup(newPos, mode);
            }
            if (!arg) {
                throw new ParseError(
                    "Expected group after '" + func + "'",
                    this.lexer, newPos);
            }
        }
        var argNode;
        if (arg.isFunction) {
            // A function used as an argument may only take its own arguments
            // when it is greedier than the function being parsed
            var argGreediness =
                functions.funcs[arg.result.result].greediness;
            if (argGreediness > baseGreediness) {
                argNode = this.parseFunction(newPos, mode);
            } else {
                throw new ParseError(
                    "Got function '" + arg.result.result + "' as " +
                    "argument to '" + func + "'",
                    this.lexer, arg.result.position - 1);
            }
        } else {
            argNode = arg.result;
        }
        args.push(argNode.result);
        positions.push(argNode.position);
        newPos = argNode.position;
    }
    args.push(positions);
    return newPos;
};
/**
 * Parses a group whose mode may differ from the surrounding one.
 *
 * @param pos        position to start parsing at
 * @param mode       mode for the group itself; "original" means "same as
 *                   outerMode"
 * @param outerMode  mode used for the delimiters around the group
 * @param optional   truthy when the group is an optional `[...]` argument
 *
 * @return {?ParseFuncOrArgument}
 */
Parser.prototype.parseSpecialGroup = function(pos, mode, outerMode, optional) {
    // Resolve `original` argTypes to the surrounding mode
    var innerMode = (mode === "original") ? outerMode : mode;

    if (innerMode === "color" || innerMode === "size") {
        // Colors and sizes need their delimiters and consist of exactly one
        // token lexed in the special mode
        var opener = this.lexer.lex(pos, outerMode);
        if (optional && opener.text !== "[") {
            // An absent optional argument is reported as null
            return null;
        }
        this.expect(opener, optional ? "[" : "{");
        var inner = this.lexer.lex(opener.position, innerMode);
        var data = (innerMode === "color") ? inner.text : inner.data;
        var closer = this.lexer.lex(inner.position, outerMode);
        this.expect(closer, optional ? "]" : "}");
        var node = new ParseNode(innerMode, data, outerMode);
        return new ParseFuncOrArgument(
            new ParseResult(node, closer.position),
            false);
    }

    var startPos = pos;
    if (innerMode === "text") {
        // Text groups skip over the whitespace in front of them
        startPos = this.lexer.lex(pos, "whitespace").position;
    }
    return optional
        ? this.parseOptionalGroup(startPos, innerMode)
        : this.parseGroup(startPos, innerMode);
};
/**
 * Parses a group: either an expression wrapped in braces (like "{x+y}"),
 * which becomes an "ordgroup" node, or otherwise whatever parseSymbol makes
 * of the token at `pos` (like "x").
 *
 * @return {?ParseFuncOrArgument}
 */
Parser.prototype.parseGroup = function(pos, mode) {
    var first = this.lexer.lex(pos, mode);
    if (first.text !== "{") {
        // No brace: the group is a single nucleus
        return this.parseSymbol(pos, mode);
    }
    // Braced group: parse what is inside ...
    var inner = this.parseExpression(first.position, mode, false);
    // ... and insist on the closing brace right after it
    var closer = this.lexer.lex(inner.position, mode);
    this.expect(closer, "}");
    var node = new ParseNode("ordgroup", inner.result, mode);
    return new ParseFuncOrArgument(
        new ParseResult(node, closer.position),
        false);
};
/**
 * Parses an optional group, i.e. an expression in square brackets (like
 * "[x+y]"). The contents become an "ordgroup" node.
 *
 * @return {?ParseFuncOrArgument} `null` when the token at `pos` is not "["
 */
Parser.prototype.parseOptionalGroup = function(pos, mode) {
    var first = this.lexer.lex(pos, mode);
    if (first.text !== "[") {
        // No opening bracket means no optional group here
        return null;
    }
    // Inside the brackets a "]" ends the expression
    var inner = this.parseExpression(first.position, mode, false, "]");
    // The closing bracket must follow immediately
    var closer = this.lexer.lex(inner.position, mode);
    this.expect(closer, "]");
    var node = new ParseNode("ordgroup", inner.result, mode);
    return new ParseFuncOrArgument(
        new ParseResult(node, closer.position),
        false);
};
/**
 * Parses a single symbol at `pos`. Two kinds of token are recognised: names
 * of functions registered in `functions.funcs`, and symbols listed for the
 * current mode in the symbols table.
 *
 * @return {?ParseFuncOrArgument} for a function, the wrapped result is the
 *         function's name and `isFunction` is true; for a symbol, it is a
 *         ParseNode whose type is the symbol's group; `null` when the token
 *         is neither
 */
Parser.prototype.parseSymbol = function(pos, mode) {
    var token = this.lexer.lex(pos, mode);
    var text = token.text;

    if (functions.funcs[text]) {
        // A known function: hand back just its name, flagged as a function,
        // so the caller can decide how to parse its arguments
        return new ParseFuncOrArgument(
            new ParseResult(text, token.position),
            true);
    }

    if (!symbols[mode][text]) {
        // Neither a function nor a symbol of this mode
        return null;
    }

    // A plain symbol: its node type comes from the symbols table
    var node = new ParseNode(symbols[mode][text].group, text, mode);
    return new ParseFuncOrArgument(
        new ParseResult(node, token.position),
        false);
};
// Make the ParseNode constructor reachable from every parser instance (as
// `this.ParseNode`) through the prototype.
Parser.prototype.ParseNode = ParseNode;
// The Parser constructor is the value exported by this module.
module.exports = Parser;