Add support for Latin-1, Cyrillic, and CJK characters inside \text{} (#508)

Summary: This diff provides support for Latin-1, Cyrillic, and CJK characters inside \text{} groups. For Latin-1 and Cyrillic characters we use glyph metrics from a glyph from Basic Latin that has roughly the same bounding box. We use the metrics for a capital 'M' to approximate the full-width CJK characters. Half-width characters are not supported yet.

Test Plan:
- make test
- make screenshots

Reviewers: emily
2016-08-01 17:51:40 -07:00 · 2016-08-01 17:51:40 -07:00 · ec62ec39d8
commit ec62ec39d8
parent 92bbbffbc8
15 changed files with 348 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,3 +9,4 @@ diff.png
 /test/symgroups.aux
 /test/symgroups.log
 /test/symgroups.pdf
+/test/screenshotter/unicode-fonts
--- a/.travis.yml
+++ b/.travis.yml
@ -10,4 +10,5 @@ before_script:
  - docker images --no-trunc
 script:
  - npm test
+  - git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts
  - dockers/Screenshotter/screenshotter.sh --verify
--- a/Makefile
+++ b/Makefile
@ -39,6 +39,12 @@ build/fonts:
 		cp static/fonts/$$font* $@; \
 	done

+test/screenshotter/unicode-fonts:
+	git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts
+	cd test/screenshotter/unicode-fonts && \
+	git checkout 99fa66a2da643218754c8236b9f9151cac71ba7c && \
+	cd ../../../
+
 contrib: build/contrib

 .PHONY: build/contrib
@ -90,5 +96,5 @@ extended_metrics:
 clean:
 	rm -rf build/*

-screenshots:
+screenshots: test/screenshotter/unicode-fonts
 	dockers/Screenshotter/screenshotter.sh
--- a/server.js
+++ b/server.js
@ -81,6 +81,8 @@ app.use(express["static"](path.join(__dirname, "static")));
 app.use(express["static"](path.join(__dirname, "build")));
 app.use("/test", express["static"](path.join(__dirname, "test")));
 app.use("/contrib", express["static"](path.join(__dirname, "contrib")));
+// app.use("/unicode-fonts",
+//     express["static"](path.join(__dirname, "static", "unicode-fonts")));

 app.use(function(err, req, res, next) {
    console.error(err.stack);
--- a/src/Parser.js
+++ b/src/Parser.js
@ -4,6 +4,7 @@ var environments = require("./environments");
 var MacroExpander = require("./MacroExpander");
 var symbols = require("./symbols");
 var utils = require("./utils");
+var cjkRegex = require("./unicodeRegexes").cjkRegex;

 var parseData = require("./parseData");
 var ParseError = require("./ParseError");
@ -794,6 +795,11 @@ Parser.prototype.parseSymbol = function() {
            new ParseNode(symbols[this.mode][nucleus.text].group,
                          nucleus.text, this.mode, nucleus),
            false, nucleus);
+    } else if (this.mode === "text" && cjkRegex.test(nucleus.text)) {
+        this.consume();
+        return new ParseFuncOrArgument(
+            new ParseNode("textord", nucleus.text, this.mode, nucleus),
+            false, nucleus);
    } else {
        return null;
    }
--- a/src/domTree.js
+++ b/src/domTree.js
@ -7,7 +7,7 @@
 *
 * Similar functions for working with MathML nodes exist in mathMLTree.js.
 */
-
+var unicodeRegexes = require("./unicodeRegexes");
 var utils = require("./utils");

 /**
@ -169,6 +169,14 @@ documentFragment.prototype.toMarkup = function() {
    return markup;
 };

+var iCombinations = {
+    'î': '\u0131\u0302',
+    'ï': '\u0131\u0308',
+    'í': '\u0131\u0301',
+    // 'ī': '\u0131\u0304', // enable when we add Extended Latin
+    'ì': '\u0131\u0300',
+};
+
 /**
 * A symbol node contains information about a single symbol. It either renders
 * to a single text node, or a span with a single text node in it, depending on
@ -183,6 +191,25 @@ function symbolNode(value, height, depth, italic, skew, classes, style) {
    this.classes = classes || [];
    this.style = style || {};
    this.maxFontSize = 0;
+
+    // Mark CJK characters with specific classes so that we can specify which
+    // fonts to use.  This allows us to render these characters with a serif
+    // font in situations where the browser would either default to a sans serif
+    // or render a placeholder character.
+    if (unicodeRegexes.cjkRegex.test(value)) {
+        // I couldn't find any fonts that contained Hangul as well as all of
+        // the other characters we wanted to test, therefore it gets its own
+        // CSS class.
+        if (unicodeRegexes.hangulRegex.test(value)) {
+            this.classes.push('hangul_fallback');
+        } else {
+            this.classes.push('cjk_fallback');
+        }
+    }
+
+    if (/[îïíì]/.test(this.value)) {    // add ī when we add Extended Latin
+        this.value = iCombinations[this.value];
+    }
 }

 /**
--- a/src/fontMetrics.js
+++ b/src/fontMetrics.js
@ -1,6 +1,7 @@
 /* eslint no-unused-vars:0 */

 var Style = require("./Style");
+var cjkRegex = require("./unicodeRegexes").cjkRegex;

 /**
 * This file contains metrics regarding fonts and individual symbols. The sigma
@ -121,6 +122,145 @@ var metrics = {
 // This map is generated via `make metrics`. It should not be changed manually.
 var metricMap = require("./fontMetricsData");

+// These are very rough approximations.  We default to Times New Roman which
+// should have Latin-1 and Cyrillic characters, but may not depending on the
+// operating system.  The metrics do not account for extra height from the
+// accents.  In the case of Cyrillic characters which have both ascenders and
+// descenders we prefer approximations with ascenders, primarily to prevent
+// the fraction bar or root line from intersecting the glyph.
+// TODO(kevinb) allow union of multiple glyph metrics for better accuracy.
+var extraCharacterMap = {
+    // Latin-1
+    'À': 'A',
+    'Á': 'A',
+    'Â': 'A',
+    'Ã': 'A',
+    'Ä': 'A',
+    'Å': 'A',
+    'Æ': 'A',
+    'Ç': 'C',
+    'È': 'E',
+    'É': 'E',
+    'Ê': 'E',
+    'Ë': 'E',
+    'Ì': 'I',
+    'Í': 'I',
+    'Î': 'I',
+    'Ï': 'I',
+    'Ð': 'D',
+    'Ñ': 'N',
+    'Ò': 'O',
+    'Ó': 'O',
+    'Ô': 'O',
+    'Õ': 'O',
+    'Ö': 'O',
+    'Ø': 'O',
+    'Ù': 'U',
+    'Ú': 'U',
+    'Û': 'U',
+    'Ü': 'U',
+    'Ý': 'Y',
+    'Þ': 'o',
+    'ß': 'B',
+    'à': 'a',
+    'á': 'a',
+    'â': 'a',
+    'ã': 'a',
+    'ä': 'a',
+    'å': 'a',
+    'æ': 'a',
+    'ç': 'c',
+    'è': 'e',
+    'é': 'e',
+    'ê': 'e',
+    'ë': 'e',
+    'ì': 'i',
+    'í': 'i',
+    'î': 'i',
+    'ï': 'i',
+    'ð': 'd',
+    'ñ': 'n',
+    'ò': 'o',
+    'ó': 'o',
+    'ô': 'o',
+    'õ': 'o',
+    'ö': 'o',
+    'ø': 'o',
+    'ù': 'u',
+    'ú': 'u',
+    'û': 'u',
+    'ü': 'u',
+    'ý': 'y',
+    'þ': 'o',
+    'ÿ': 'y',
+
+    // Cyrillic
+    'А': 'A',
+    'Б': 'B',
+    'В': 'B',
+    'Г': 'F',
+    'Д': 'A',
+    'Е': 'E',
+    'Ж': 'K',
+    'З': '3',
+    'И': 'N',
+    'Й': 'N',
+    'К': 'K',
+    'Л': 'N',
+    'М': 'M',
+    'Н': 'H',
+    'О': 'O',
+    'П': 'N',
+    'Р': 'P',
+    'С': 'C',
+    'Т': 'T',
+    'У': 'y',
+    'Ф': 'O',
+    'Х': 'X',
+    'Ц': 'U',
+    'Ч': 'h',
+    'Ш': 'W',
+    'Щ': 'W',
+    'Ъ': 'B',
+    'Ы': 'X',
+    'Ь': 'B',
+    'Э': '3',
+    'Ю': 'X',
+    'Я': 'R',
+    'а': 'a',
+    'б': 'b',
+    'в': 'a',
+    'г': 'r',
+    'д': 'y',
+    'е': 'e',
+    'ж': 'm',
+    'з': 'e',
+    'и': 'n',
+    'й': 'n',
+    'к': 'n',
+    'л': 'n',
+    'м': 'm',
+    'н': 'n',
+    'о': 'o',
+    'п': 'n',
+    'р': 'p',
+    'с': 'c',
+    'т': 'o',
+    'у': 'y',
+    'ф': 'b',
+    'х': 'x',
+    'ц': 'n',
+    'ч': 'n',
+    'ш': 'w',
+    'щ': 'w',
+    'ъ': 'a',
+    'ы': 'm',
+    'ь': 'a',
+    'э': 'e',
+    'ю': 'm',
+    'я': 'r',
+};
+
 /**
 * This function is a convenience function for looking up information in the
 * metricMap table. It takes a character as a string, and a style.
@ -129,7 +269,13 @@ var metricMap = require("./fontMetricsData");
 * built using `Make extended_metrics`.
 */
 var getCharacterMetrics = function(character, style) {
-    var metrics = metricMap[style][character.charCodeAt(0)];
+    var ch = character.charCodeAt(0);
+    if (character[0] in extraCharacterMap) {
+        ch = extraCharacterMap[character[0]].charCodeAt(0);
+    } else if (cjkRegex.test(character[0])) {
+        ch = 'M'.charCodeAt(0);
+    }
+    var metrics = metricMap[style][ch];
    if (metrics) {
        return {
            depth: metrics[0],
--- a/src/symbols.js
+++ b/src/symbols.js
@ -630,3 +630,25 @@ for (i = 0; i < letters.length; i++) {
    defineSymbol(math, main, mathord, ch, ch);
    defineSymbol(text, main, textord, ch, ch);
 }
+
+// Latin-1 letters
+for (i = 0x00C0; i <= 0x00D6; i++) {
+    ch = String.fromCharCode(i);
+    defineSymbol(text, main, textord, ch, ch);
+}
+
+for (i = 0x00D8; i <= 0x00F6; i++) {
+    ch = String.fromCharCode(i);
+    defineSymbol(text, main, textord, ch, ch);
+}
+
+for (i = 0x00F8; i <= 0x00FF; i++) {
+    ch = String.fromCharCode(i);
+    defineSymbol(text, main, textord, ch, ch);
+}
+
+// Cyrillic
+for (i = 0x0410; i <= 0x044F; i++) {
+    ch = String.fromCharCode(i);
+    defineSymbol(text, main, textord, ch, ch);
+}
--- a/src/unicodeRegexes.js
+++ b/src/unicodeRegexes.js
@ -0,0 +1,15 @@
+var hangulRegex = /[\uAC00-\uD7AF]/;
+
+// This regex combines
+// - Hiragana: [\u3040-\u309F]
+// - Katakana: [\u30A0-\u30FF]
+// - CJK ideograms: [\u4E00-\u9FAF]
+// - Hangul syllables: [\uAC00-\uD7AF]
+// Notably missing are halfwidth Katakana and Romaji glyphs.
+var cjkRegex =
+    /[\u3040-\u309F]|[\u30A0-\u30FF]|[\u4E00-\u9FAF]|[\uAC00-\uD7AF]/;
+
+module.exports = {
+    cjkRegex: cjkRegex,
+    hangulRegex: hangulRegex,
+};
--- a/static/katex.less
+++ b/static/katex.less
@ -15,7 +15,7 @@
 }

 .katex {
-    font: normal 1.21em KaTeX_Main;
+    font: normal 1.21em KaTeX_Main, Times New Roman, serif;
    line-height: 1.2;
    white-space: nowrap;

--- a/test/screenshotter/images/Unicode-chrome.png
+++ b/test/screenshotter/images/Unicode-chrome.png
--- a/test/screenshotter/images/Unicode-firefox.png
+++ b/test/screenshotter/images/Unicode-firefox.png
--- a/test/screenshotter/ss_data.yaml
+++ b/test/screenshotter/ss_data.yaml
@ -114,6 +114,7 @@ Symbols1: |
    \maltese\degree\pounds\$
    \text{\maltese\degree}
 Text: \frac{a}{b}\text{c~ {ab} \ e}+fg
+Unicode: \begin{matrix}\text{ÀàÇçÉéÏïÖöÛû} \\ \text{БГДЖЗЙЛФЦШЫЮЯ} \\ \text{여보세요} \\ \text{私はバナナです} \end{matrix}
 UnsupportedCmds:
    tex: \err\,\frac\fracerr3\,2^\superr_\suberr\,\sqrt\sqrterr
    noThrow: 1
--- a/test/screenshotter/test.html
+++ b/test/screenshotter/test.html
@ -11,6 +11,20 @@
      body {
        font-family: "DejaVu Serif",serif;
      }
+      @font-face {
+        font-family: "Mincho";
+        src: url("unicode-fonts/mincho/font_1_honokamin.ttf") format("truetype");
+      }
+      @font-face {
+        font-family: "Batang";
+        src: url("unicode-fonts/batang/batang.ttf") format("truetype");
+      }
+      .katex .cjk_fallback {
+          font-family: "Mincho",serif;
+      }
+      .katex .hangul_fallback {
+          font-family: "Batang",serif;
+      }
    </style>
  </head>
  <body>
--- a/test/unicode-spec.js
+++ b/test/unicode-spec.js
@ -0,0 +1,103 @@
+/* eslint max-len:0 */
+/* global beforeEach: false */
+/* global jasmine: false */
+/* global expect: false */
+/* global it: false */
+/* global describe: false */
+var ParseError = require("../src/ParseError");
+var parseTree = require("../src/parseTree");
+var Settings = require("../src/Settings");
+
+var defaultSettings = new Settings({});
+
+var parseAndSetResult = function(expr, result, settings) {
+    try {
+        return parseTree(expr, settings || defaultSettings);
+    } catch (e) {
+        result.pass = false;
+        if (e instanceof ParseError) {
+            result.message = "'" + expr + "' failed " +
+                "parsing with error: " + e.message;
+        } else {
+            result.message = "'" + expr + "' failed " +
+                "parsing with unknown error: " + e.message;
+        }
+    }
+};
+
+describe("unicode", function() {
+    beforeEach(function() {
+        jasmine.addMatchers({
+
+            toParse: function() {
+                return {
+                    compare: function(actual, settings) {
+                        var usedSettings = settings ? settings : defaultSettings;
+
+                        var result = {
+                            pass: true,
+                            message: "'" + actual + "' succeeded parsing",
+                        };
+                        parseAndSetResult(actual, result, usedSettings);
+                        return result;
+                    },
+                };
+            },
+
+            toNotParse: function() {
+                return {
+                    compare: function(actual, settings) {
+                        var usedSettings = settings ? settings : defaultSettings;
+
+                        var result = {
+                            pass: false,
+                            message: "Expected '" + actual + "' to fail " +
+                                "parsing, but it succeeded",
+                        };
+
+                        try {
+                            parseTree(actual, usedSettings);
+                        } catch (e) {
+                            if (e instanceof ParseError) {
+                                result.pass = true;
+                                result.message = "'" + actual + "' correctly " +
+                                    "didn't parse with error: " + e.message;
+                            } else {
+                                result.message = "'" + actual + "' failed " +
+                                    "parsing with unknown error: " + e.message;
+                            }
+                        }
+
+                        return result;
+                    },
+                };
+            },
+        });
+    });
+
+    it("should parse Latin-1 inside \\text{}", function() {
+        expect('\\text{ÀàÇçÉéÏïÖöÛû}').toParse();
+    });
+
+    it("should not parse Latin-1 outside \\text{}", function() {
+        expect('ÀàÇçÉéÏïÖöÛû').toNotParse();
+    });
+
+    it("should parse Cyrillic inside \\text{}", function() {
+        expect('\\text{БГДЖЗЙЛФЦШЫЮЯ}').toParse();
+    });
+
+    it("should not parse Cyrillic outside \\text{}", function() {
+        expect('БГДЖЗЙЛФЦШЫЮЯ').toNotParse();
+    });
+
+    it("should parse CJK inside \\text{}", function() {
+        expect('\\text{私はバナナです}').toParse();
+        expect('\\text{여보세요}').toParse();
+    });
+
+    it("should not parse CJK outside \\text{}", function() {
+        expect('私はバナナです。').toNotParse();
+        expect('여보세요').toNotParse();
+    });
+});