diff --git a/.gitignore b/.gitignore index 058d8cf4e..ddaaa9335 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ diff.png /test/symgroups.aux /test/symgroups.log /test/symgroups.pdf +/test/screenshotter/unicode-fonts diff --git a/.travis.yml b/.travis.yml index 4228096c7..5b6e28802 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,4 +10,5 @@ before_script: - docker images --no-trunc script: - npm test + - git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts - dockers/Screenshotter/screenshotter.sh --verify diff --git a/Makefile b/Makefile index 546cbaa15..03d3d9596 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,12 @@ build/fonts: cp static/fonts/$$font* $@; \ done +test/screenshotter/unicode-fonts: + git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts + cd test/screenshotter/unicode-fonts && \ + git checkout 99fa66a2da643218754c8236b9f9151cac71ba7c && \ + cd ../../../ + contrib: build/contrib .PHONY: build/contrib @@ -90,5 +96,5 @@ extended_metrics: clean: rm -rf build/* -screenshots: +screenshots: test/screenshotter/unicode-fonts dockers/Screenshotter/screenshotter.sh diff --git a/server.js b/server.js index a6213449c..08eacf1fa 100644 --- a/server.js +++ b/server.js @@ -81,6 +81,8 @@ app.use(express["static"](path.join(__dirname, "static"))); app.use(express["static"](path.join(__dirname, "build"))); app.use("/test", express["static"](path.join(__dirname, "test"))); app.use("/contrib", express["static"](path.join(__dirname, "contrib"))); +// app.use("/unicode-fonts", +// express["static"](path.join(__dirname, "static", "unicode-fonts"))); app.use(function(err, req, res, next) { console.error(err.stack); diff --git a/src/Parser.js b/src/Parser.js index 4bfde913b..97bbbf3fd 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -4,6 +4,7 @@ var environments = require("./environments"); var MacroExpander = require("./MacroExpander"); var symbols = require("./symbols"); var utils = 
require("./utils"); +var cjkRegex = require("./unicodeRegexes").cjkRegex; var parseData = require("./parseData"); var ParseError = require("./ParseError"); @@ -794,6 +795,11 @@ Parser.prototype.parseSymbol = function() { new ParseNode(symbols[this.mode][nucleus.text].group, nucleus.text, this.mode, nucleus), false, nucleus); + } else if (this.mode === "text" && cjkRegex.test(nucleus.text)) { + this.consume(); + return new ParseFuncOrArgument( + new ParseNode("textord", nucleus.text, this.mode, nucleus), + false, nucleus); } else { return null; } diff --git a/src/domTree.js b/src/domTree.js index e0d8e925a..46b515ea1 100644 --- a/src/domTree.js +++ b/src/domTree.js @@ -7,7 +7,7 @@ * * Similar functions for working with MathML nodes exist in mathMLTree.js. */ - +var unicodeRegexes = require("./unicodeRegexes"); var utils = require("./utils"); /** @@ -169,6 +169,14 @@ documentFragment.prototype.toMarkup = function() { return markup; }; +var iCombinations = { + 'î': '\u0131\u0302', + 'ï': '\u0131\u0308', + 'í': '\u0131\u0301', + // 'ī': '\u0131\u0304', // enable when we add Extended Latin + 'ì': '\u0131\u0300', +}; + /** * A symbol node contains information about a single symbol. It either renders * to a single text node, or a span with a single text node in it, depending on @@ -183,6 +191,25 @@ function symbolNode(value, height, depth, italic, skew, classes, style) { this.classes = classes || []; this.style = style || {}; this.maxFontSize = 0; + + // Mark CJK characters with specific classes so that we can specify which + // fonts to use. This allows us to render these characters with a serif + // font in situations where the browser would either default to a sans serif + // or render a placeholder character. + if (unicodeRegexes.cjkRegex.test(value)) { + // I couldn't find any fonts that contained Hangul as well as all of + // the other characters we wanted to test, therefore it gets its own + // CSS class.
+ if (unicodeRegexes.hangulRegex.test(value)) { + this.classes.push('hangul_fallback'); + } else { + this.classes.push('cjk_fallback'); + } + } + + if (/[îïíì]/.test(this.value)) { // add ī when we add Extended Latin + this.value = iCombinations[this.value]; + } } /** diff --git a/src/fontMetrics.js b/src/fontMetrics.js index db9e44bfa..e4e440a78 100644 --- a/src/fontMetrics.js +++ b/src/fontMetrics.js @@ -1,6 +1,7 @@ /* eslint no-unused-vars:0 */ var Style = require("./Style"); +var cjkRegex = require("./unicodeRegexes").cjkRegex; /** * This file contains metrics regarding fonts and individual symbols. The sigma @@ -121,6 +122,145 @@ var metrics = { // This map is generated via `make metrics`. It should not be changed manually. var metricMap = require("./fontMetricsData"); +// These are very rough approximations. We default to Times New Roman which +// should have Latin-1 and Cyrillic characters, but may not depending on the +// operating system. The metrics do not account for extra height from the +// accents. In the case of Cyrillic characters which have both ascenders and +// descenders we prefer approximations with ascenders, primarily to prevent +// the fraction bar or root line from intersecting the glyph. +// TODO(kevinb) allow union of multiple glyph metrics for better accuracy. 
+var extraCharacterMap = { + // Latin-1 + 'À': 'A', + 'Á': 'A', + 'Â': 'A', + 'Ã': 'A', + 'Ä': 'A', + 'Å': 'A', + 'Æ': 'A', + 'Ç': 'C', + 'È': 'E', + 'É': 'E', + 'Ê': 'E', + 'Ë': 'E', + 'Ì': 'I', + 'Í': 'I', + 'Î': 'I', + 'Ï': 'I', + 'Ð': 'D', + 'Ñ': 'N', + 'Ò': 'O', + 'Ó': 'O', + 'Ô': 'O', + 'Õ': 'O', + 'Ö': 'O', + 'Ø': 'O', + 'Ù': 'U', + 'Ú': 'U', + 'Û': 'U', + 'Ü': 'U', + 'Ý': 'Y', + 'Þ': 'o', + 'ß': 'B', + 'à': 'a', + 'á': 'a', + 'â': 'a', + 'ã': 'a', + 'ä': 'a', + 'å': 'a', + 'æ': 'a', + 'ç': 'c', + 'è': 'e', + 'é': 'e', + 'ê': 'e', + 'ë': 'e', + 'ì': 'i', + 'í': 'i', + 'î': 'i', + 'ï': 'i', + 'ð': 'd', + 'ñ': 'n', + 'ò': 'o', + 'ó': 'o', + 'ô': 'o', + 'õ': 'o', + 'ö': 'o', + 'ø': 'o', + 'ù': 'u', + 'ú': 'u', + 'û': 'u', + 'ü': 'u', + 'ý': 'y', + 'þ': 'o', + 'ÿ': 'y', + + // Cyrillic + 'А': 'A', + 'Б': 'B', + 'В': 'B', + 'Г': 'F', + 'Д': 'A', + 'Е': 'E', + 'Ж': 'K', + 'З': '3', + 'И': 'N', + 'Й': 'N', + 'К': 'K', + 'Л': 'N', + 'М': 'M', + 'Н': 'H', + 'О': 'O', + 'П': 'N', + 'Р': 'P', + 'С': 'C', + 'Т': 'T', + 'У': 'y', + 'Ф': 'O', + 'Х': 'X', + 'Ц': 'U', + 'Ч': 'h', + 'Ш': 'W', + 'Щ': 'W', + 'Ъ': 'B', + 'Ы': 'X', + 'Ь': 'B', + 'Э': '3', + 'Ю': 'X', + 'Я': 'R', + 'а': 'a', + 'б': 'b', + 'в': 'a', + 'г': 'r', + 'д': 'y', + 'е': 'e', + 'ж': 'm', + 'з': 'e', + 'и': 'n', + 'й': 'n', + 'к': 'n', + 'л': 'n', + 'м': 'm', + 'н': 'n', + 'о': 'o', + 'п': 'n', + 'р': 'p', + 'с': 'c', + 'т': 'o', + 'у': 'y', + 'ф': 'b', + 'х': 'x', + 'ц': 'n', + 'ч': 'n', + 'ш': 'w', + 'щ': 'w', + 'ъ': 'a', + 'ы': 'm', + 'ь': 'a', + 'э': 'e', + 'ю': 'm', + 'я': 'r', +}; + /** * This function is a convenience function for looking up information in the * metricMap table. It takes a character as a string, and a style. @@ -129,7 +269,13 @@ var metricMap = require("./fontMetricsData"); * built using `Make extended_metrics`. 
*/ var getCharacterMetrics = function(character, style) { - var metrics = metricMap[style][character.charCodeAt(0)]; + var ch = character.charCodeAt(0); + if (character[0] in extraCharacterMap) { + ch = extraCharacterMap[character[0]].charCodeAt(0); + } else if (cjkRegex.test(character[0])) { + ch = 'M'.charCodeAt(0); + } + var metrics = metricMap[style][ch]; if (metrics) { return { depth: metrics[0], diff --git a/src/symbols.js b/src/symbols.js index aef7c8748..55301a4e4 100644 --- a/src/symbols.js +++ b/src/symbols.js @@ -630,3 +630,25 @@ for (i = 0; i < letters.length; i++) { defineSymbol(math, main, mathord, ch, ch); defineSymbol(text, main, textord, ch, ch); } + +// Latin-1 letters +for (i = 0x00C0; i <= 0x00D6; i++) { + ch = String.fromCharCode(i); + defineSymbol(text, main, textord, ch, ch); +} + +for (i = 0x00D8; i <= 0x00F6; i++) { + ch = String.fromCharCode(i); + defineSymbol(text, main, textord, ch, ch); +} + +for (i = 0x00F8; i <= 0x00FF; i++) { + ch = String.fromCharCode(i); + defineSymbol(text, main, textord, ch, ch); +} + +// Cyrillic +for (i = 0x0410; i <= 0x044F; i++) { + ch = String.fromCharCode(i); + defineSymbol(text, main, textord, ch, ch); +} diff --git a/src/unicodeRegexes.js b/src/unicodeRegexes.js new file mode 100644 index 000000000..a05d7cd5f --- /dev/null +++ b/src/unicodeRegexes.js @@ -0,0 +1,15 @@ +var hangulRegex = /[\uAC00-\uD7AF]/; + +// This regex combines +// - Hiragana: [\u3040-\u309F] +// - Katakana: [\u30A0-\u30FF] +// - CJK ideograms: [\u4E00-\u9FAF] +// - Hangul syllables: [\uAC00-\uD7AF] +// Notably missing are halfwidth Katakana and Romaji glyphs.
+var cjkRegex = + /[\u3040-\u309F]|[\u30A0-\u30FF]|[\u4E00-\u9FAF]|[\uAC00-\uD7AF]/; + +module.exports = { + cjkRegex: cjkRegex, + hangulRegex: hangulRegex, +}; diff --git a/static/katex.less b/static/katex.less index b7c8f84d9..8b11048ab 100644 --- a/static/katex.less +++ b/static/katex.less @@ -15,7 +15,7 @@ } .katex { - font: normal 1.21em KaTeX_Main; + font: normal 1.21em KaTeX_Main, Times New Roman, serif; line-height: 1.2; white-space: nowrap; diff --git a/test/screenshotter/images/Unicode-chrome.png b/test/screenshotter/images/Unicode-chrome.png new file mode 100644 index 000000000..6d7472f4a Binary files /dev/null and b/test/screenshotter/images/Unicode-chrome.png differ diff --git a/test/screenshotter/images/Unicode-firefox.png b/test/screenshotter/images/Unicode-firefox.png new file mode 100644 index 000000000..bc0fc1bc0 Binary files /dev/null and b/test/screenshotter/images/Unicode-firefox.png differ diff --git a/test/screenshotter/ss_data.yaml b/test/screenshotter/ss_data.yaml index 4f2ce6d6c..59bc5bd64 100644 --- a/test/screenshotter/ss_data.yaml +++ b/test/screenshotter/ss_data.yaml @@ -114,6 +114,7 @@ Symbols1: | \maltese\degree\pounds\$ \text{\maltese\degree} Text: \frac{a}{b}\text{c~ {ab} \ e}+fg +Unicode: \begin{matrix}\text{ÀàÇçÉéÏïÖöÛû} \\ \text{БГДЖЗЙЛФЦШЫЮЯ} \\ \text{여보세요} \\ \text{私はバナナです} \end{matrix} UnsupportedCmds: tex: \err\,\frac\fracerr3\,2^\superr_\suberr\,\sqrt\sqrterr noThrow: 1 diff --git a/test/screenshotter/test.html b/test/screenshotter/test.html index 4da2a8b8f..3694ba98b 100644 --- a/test/screenshotter/test.html +++ b/test/screenshotter/test.html @@ -11,6 +11,20 @@ body { font-family: "DejaVu Serif",serif; } + @font-face { + font-family: "Mincho"; + src: url("unicode-fonts/mincho/font_1_honokamin.ttf") format("truetype"); + } + @font-face { + font-family: "Batang"; + src: url("unicode-fonts/batang/batang.ttf") format("truetype"); + } + .katex .cjk_fallback { + font-family: "Mincho",serif; + } + .katex .hangul_fallback { + 
font-family: "Batang",serif; + } diff --git a/test/unicode-spec.js b/test/unicode-spec.js new file mode 100644 index 000000000..a78a83aaa --- /dev/null +++ b/test/unicode-spec.js @@ -0,0 +1,103 @@ +/* eslint max-len:0 */ +/* global beforeEach: false */ +/* global jasmine: false */ +/* global expect: false */ +/* global it: false */ +/* global describe: false */ +var ParseError = require("../src/ParseError"); +var parseTree = require("../src/parseTree"); +var Settings = require("../src/Settings"); + +var defaultSettings = new Settings({}); + +var parseAndSetResult = function(expr, result, settings) { + try { + return parseTree(expr, settings || defaultSettings); + } catch (e) { + result.pass = false; + if (e instanceof ParseError) { + result.message = "'" + expr + "' failed " + + "parsing with error: " + e.message; + } else { + result.message = "'" + expr + "' failed " + + "parsing with unknown error: " + e.message; + } + } +}; + +describe("unicode", function() { + beforeEach(function() { + jasmine.addMatchers({ + + toParse: function() { + return { + compare: function(actual, settings) { + var usedSettings = settings ? settings : defaultSettings; + + var result = { + pass: true, + message: "'" + actual + "' succeeded parsing", + }; + parseAndSetResult(actual, result, usedSettings); + return result; + }, + }; + }, + + toNotParse: function() { + return { + compare: function(actual, settings) { + var usedSettings = settings ? 
settings : defaultSettings; + + var result = { + pass: false, + message: "Expected '" + actual + "' to fail " + + "parsing, but it succeeded", + }; + + try { + parseTree(actual, usedSettings); + } catch (e) { + if (e instanceof ParseError) { + result.pass = true; + result.message = "'" + actual + "' correctly " + + "didn't parse with error: " + e.message; + } else { + result.message = "'" + actual + "' failed " + + "parsing with unknown error: " + e.message; + } + } + + return result; + }, + }; + }, + }); + }); + + it("should parse Latin-1 inside \\text{}", function() { + expect('\\text{ÀàÇçÉéÏïÖöÛû}').toParse(); + }); + + it("should not parse Latin-1 outside \\text{}", function() { + expect('ÀàÇçÉéÏïÖöÛû').toNotParse(); + }); + + it("should parse Cyrillic inside \\text{}", function() { + expect('\\text{БГДЖЗЙЛФЦШЫЮЯ}').toParse(); + }); + + it("should not parse Cyrillic outside \\text{}", function() { + expect('БГДЖЗЙЛФЦШЫЮЯ').toNotParse(); + }); + + it("should parse CJK inside \\text{}", function() { + expect('\\text{私はバナナです}').toParse(); + expect('\\text{여보세요}').toParse(); + }); + + it("should not parse CJK outside \\text{}", function() { + expect('私はバナナです。').toNotParse(); + expect('여보세요').toNotParse(); + }); +});