Add some more symbols (#502)

This adds support for the following input sequences: -- --- ` ' `` '' \degree \pounds \maltese resulting in – — ‘ ’ “ ” ° £ ✠ symbols already present in our fonts. As part of this modification, the recognition of multiple dashes was moved from the lexer to the parser. This is necessary since in math mode a sequence of hyphens is just a sequence of minus signs, just as a pair of apostrophes in math mode is a double prime, not a right double quotation mark. To make this easier, parseGroup and parseOptionalGroup have been merged.
2016-07-25 04:56:31 +02:00 · 2016-07-25 04:56:31 +02:00 · 4a9c2acbf7
commit 4a9c2acbf7
parent befe1c1af7
10 changed files with 73 additions and 44 deletions
--- a/src/Lexer.js
+++ b/src/Lexer.js
@ -63,7 +63,6 @@ Token.prototype.range = function(endToken, text) {

 /* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
- * - matches symbol combinations which result in a single output character
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
@ -78,9 +77,8 @@ Token.prototype.range = function(endToken, text) {
 * still reject the input.
 */
 var tokenRegex = new RegExp(
-    "([ \r\n\t]+)|(" +                                // whitespace
-    "---?" +                                          // special combinations
-    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
+    "([ \r\n\t]+)|" +                                 // whitespace
+    "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +           // function name
    ")"
--- a/src/Parser.js
+++ b/src/Parser.js
@ -545,7 +545,7 @@ Parser.prototype.parseArguments = function(func, funcData) {
            if (argType) {
                arg = this.parseGroupOfType(argType, true);
            } else {
-                arg = this.parseOptionalGroup();
+                arg = this.parseGroup(true);
            }
            if (!arg) {
                args.push(null);
@ -623,12 +623,7 @@ Parser.prototype.parseGroupOfType = function(innerMode, optional) {
    }
    // By the time we get here, innerMode is one of "text" or "math".
    // We switch the mode of the parser, recurse, then restore the old mode.
-    var res;
-    if (optional) {
-        res = this.parseOptionalGroup();
-    } else {
-        res = this.parseGroup();
-    }
+    var res = this.parseGroup(optional);
    this.switchMode(outerMode);
    return res;
 };
@ -638,7 +633,7 @@ Parser.prototype.parseGroupOfType = function(innerMode, optional) {
 * brace-enclosed tokens plus some position information.
 *
 * @param {string} modeName  Used to describe the mode in error messages
- * @param {boolean} optional  Whether the group is optional or required
+ * @param {boolean=} optional  Whether the group is optional or required
 */
 Parser.prototype.parseStringGroup = function(modeName, optional) {
    if (optional && this.nextToken.text !== "[") {
@ -707,53 +702,71 @@ Parser.prototype.parseSizeGroup = function(optional) {
 };

 /**
- * Parses a group, which is either a single nucleus (like "x") or an expression
- * in braces (like "{x+y}")
+ * If the argument is false or absent, this parses an ordinary group,
+ * which is either a single nucleus (like "x") or an expression
+ * in braces (like "{x+y}").
+ * If the argument is true, it parses either a bracket-delimited expression
+ * (like "[x+y]") or returns null to indicate the absence of a
+ * bracket-enclosed group.
 *
+ * @param {boolean=} optional  Whether the group is optional or required
 * @return {?ParseFuncOrArgument}
 */
-Parser.prototype.parseGroup = function() {
+Parser.prototype.parseGroup = function(optional) {
    var firstToken = this.nextToken;
    // Try to parse an open brace
-    if (this.nextToken.text === "{") {
+    if (this.nextToken.text === (optional ? "[" : "{")) {
        // If we get a brace, parse an expression
        this.consume();
-        var expression = this.parseExpression(false);
+        var expression = this.parseExpression(false, optional ? "]" : null);
        var lastToken = this.nextToken;
        // Make sure we get a close brace
-        this.expect("}");
+        this.expect(optional ? "]" : "}");
+        if (this.mode === "text") {
+            this.formLigatures(expression);
+        }
        return new ParseFuncOrArgument(
            new ParseNode("ordgroup", expression, this.mode,
                          firstToken, lastToken),
            false);
    } else {
-        // Otherwise, just return a nucleus
-        return this.parseSymbol();
+        // Otherwise, just return a nucleus, or nothing for an optional group
+        return optional ? null : this.parseSymbol();
    }
 };

 /**
- * Parses a group, which is an expression in brackets (like "[x+y]")
+ * Form ligature-like combinations of characters for text mode.
+ * This includes inputs like "--", "---", "``" and "''".
+ * Each such run of adjacent single-character nodes is replaced by a
+ * single textord node whose value holds all characters of the run.
+ * The representation is still ASCII source.
 *
- * @return {?ParseFuncOrArgument}
+ * @param {Array.<ParseNode>} group  the nodes of this group,
+ *                                   list will be modified in place
 */
-Parser.prototype.parseOptionalGroup = function() {
-    var firstToken = this.nextToken;
-    // Try to parse an open bracket
-    if (this.nextToken.text === "[") {
-        // If we get a brace, parse an expression
-        this.consume();
-        var expression = this.parseExpression(false, "]");
-        var lastToken = this.nextToken;
-        // Make sure we get a close bracket
-        this.expect("]");
-        return new ParseFuncOrArgument(
-            new ParseNode("ordgroup", expression, this.mode,
-                          firstToken, lastToken),
-            false);
-    } else {
-        // Otherwise, return null,
-        return null;
+Parser.prototype.formLigatures = function(group) {
+    var i;
+    var n = group.length - 1;
+    for (i = 0; i < n; ++i) {
+        var a = group[i];
+        var v = a.value;
+        if (v === "-" && group[i + 1].value === "-") {
+            if (i + 1 < n && group[i + 2].value === "-") {
+                group.splice(i, 3, new ParseNode(
+                    "textord", "---", "text", a, group[i + 2]));
+                n -= 2;
+            } else {
+                group.splice(i, 2, new ParseNode(
+                    "textord", "--", "text", a, group[i + 1]));
+                n -= 1;
+            }
+        }
+        if ((v === "'" || v === "`") && group[i + 1].value === v) {
+            group.splice(i, 2, new ParseNode(
+                "textord", v + v, "text", a, group[i + 1]));
+            n -= 1;
+        }
    }
 };

--- a/src/buildCommon.js
+++ b/src/buildCommon.js
@ -23,9 +23,11 @@ var greekCapitals = [
    "\\Omega",
 ];

-var dotlessLetters = [
+// The following have to be loaded from Main-Italic font, using class mainit
+var mainitLetters = [
    "\u0131",   // dotless i, \imath
    "\u0237",   // dotless j, \jmath
+    "\u00a3",   // \pounds
 ];

 /**
@ -101,7 +103,7 @@ var mathit = function(value, mode, color, classes) {
    if (/[0-9]/.test(value.charAt(0)) ||
            // glyphs for \imath and \jmath do not exist in Math-Italic so we
            // need to use Main-Italic instead
-            utils.contains(dotlessLetters, value) ||
+            utils.contains(mainitLetters, value) ||
            utils.contains(greekCapitals, value)) {
        return makeSymbol(
            value, "Main-Italic", mode, color, classes.concat(["mainit"]));
@ -126,7 +128,7 @@ var makeOrd = function(group, options, type) {

    var font = options.font;
    if (font) {
-        if (font === "mathit" || utils.contains(dotlessLetters, value)) {
+        if (font === "mathit" || utils.contains(mainitLetters, value)) {
            return mathit(value, mode, color, classes);
        } else {
            var fontName = fontMap[font].fontName;
--- a/src/symbols.js
+++ b/src/symbols.js
@ -589,6 +589,18 @@ defineSymbol(math, main, accent, "\u02d9", "\\dot");
 defineSymbol(math, main, mathord, "\u0131", "\\imath");
 defineSymbol(math, main, mathord, "\u0237", "\\jmath");

+defineSymbol(text, main, textord, "\u2013", "--");
+defineSymbol(text, main, textord, "\u2014", "---");
+defineSymbol(text, main, textord, "\u2018", "`");
+defineSymbol(text, main, textord, "\u2019", "'");
+defineSymbol(text, main, textord, "\u201c", "``");
+defineSymbol(text, main, textord, "\u201d", "''");
+defineSymbol(math, main, textord, "\u00b0", "\\degree");
+defineSymbol(text, main, textord, "\u00b0", "\\degree");
+defineSymbol(math, main, mathord, "\u00a3", "\\pounds");
+defineSymbol(math, ams, textord, "\u2720", "\\maltese");
+defineSymbol(text, ams, textord, "\u2720", "\\maltese");
+
 defineSymbol(text, main, spacing, "\u00a0", "\\ ");
 defineSymbol(text, main, spacing, "\u00a0", " ");
 defineSymbol(text, main, spacing, "\u00a0", "~");
@ -605,7 +617,7 @@ for (i = 0; i < mathTextSymbols.length; i++) {
 }

 // All of these are textords in text mode
-var textSymbols = "0123456789`!@*()-=+[]'\";:?/.,";
+var textSymbols = "0123456789!@*()-=+[]\";:?/.,";
 for (i = 0; i < textSymbols.length; i++) {
    ch = textSymbols.charAt(i);
    defineSymbol(text, main, textord, ch, ch);
--- a/test/screenshotter/images/DashesAndQuotes-chrome.png
+++ b/test/screenshotter/images/DashesAndQuotes-chrome.png
--- a/test/screenshotter/images/DashesAndQuotes-firefox.png
+++ b/test/screenshotter/images/DashesAndQuotes-firefox.png
--- a/test/screenshotter/images/Symbols1-chrome.png
+++ b/test/screenshotter/images/Symbols1-chrome.png
--- a/test/screenshotter/images/Symbols1-firefox.png
+++ b/test/screenshotter/images/Symbols1-firefox.png
--- a/test/screenshotter/ss_data.yaml
+++ b/test/screenshotter/ss_data.yaml
@ -36,6 +36,7 @@ Cases: |
 Colors:
    tex: \blue{a}\color{#0f0}{b}\color{red}{c}
    nolatex: different syntax and different scope
+DashesAndQuotes: \text{``a'' b---c -- d----`e'-{-}-f}--``x''
 DeepFontSizing:
    tex: |
        a^{\big| x^{\big(}}_{\Big\uparrow} +
@ -109,6 +110,9 @@ SupSubHorizSpacing: |
 SupSubLeftAlignReset: |
    \omega^8_{888} \quad \frac{1}{\hat{\omega}^{8}_{888}} \quad \displaystyle\sum_{\omega^{8}_{888}}
 SupSubOffsets: \displaystyle \int_{2+3}x f^{2+3}+3\lim_{2+3+4+5}f
+Symbols1: |
+    \maltese\degree\pounds\$
+    \text{\maltese\degree}
 Text: \frac{a}{b}\text{c~ {ab} \ e}+fg
 UnsupportedCmds:
    tex: \err\,\frac\fracerr3\,2^\superr_\suberr\,\sqrt\sqrterr
--- a/test/screenshotter/test.tex
+++ b/test/screenshotter/test.tex
@ -1,6 +1,6 @@
 \documentclass[10pt]{article}

-\usepackage{amsmath,amssymb}
+\usepackage{amsmath,amssymb,textcomp,gensymb}
 \usepackage[mathscr]{eucal}
 \usepackage{eufrak}
 \usepackage[papersize={133pt,100pt},margin=0.5pt]{geometry}