Add support for Latin-1, Cyrillic, and CJK characters inside \text{} (#508)
Summary: This diff adds support for Latin-1, Cyrillic, and CJK characters inside \text{} groups. For Latin-1 and Cyrillic characters we borrow the metrics of a Basic Latin glyph that has roughly the same bounding box. For full-width CJK characters we approximate the metrics with those of a capital 'M'. Half-width characters are not supported yet. Test Plan: - make test - make screenshots Reviewers: emily
This commit is contained in:
parent
92bbbffbc8
commit
ec62ec39d8
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,3 +9,4 @@ diff.png
|
|||
/test/symgroups.aux
|
||||
/test/symgroups.log
|
||||
/test/symgroups.pdf
|
||||
/test/screenshotter/unicode-fonts
|
||||
|
|
|
@ -10,4 +10,5 @@ before_script:
|
|||
- docker images --no-trunc
|
||||
script:
|
||||
- npm test
|
||||
- git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts
|
||||
- dockers/Screenshotter/screenshotter.sh --verify
|
||||
|
|
8
Makefile
8
Makefile
|
@ -39,6 +39,12 @@ build/fonts:
|
|||
cp static/fonts/$$font* $@; \
|
||||
done
|
||||
|
||||
test/screenshotter/unicode-fonts:
|
||||
git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts
|
||||
cd test/screenshotter/unicode-fonts && \
|
||||
git checkout 99fa66a2da643218754c8236b9f9151cac71ba7c && \
|
||||
cd ../../../
|
||||
|
||||
contrib: build/contrib
|
||||
|
||||
.PHONY: build/contrib
|
||||
|
@ -90,5 +96,5 @@ extended_metrics:
|
|||
clean:
|
||||
rm -rf build/*
|
||||
|
||||
screenshots:
|
||||
screenshots: test/screenshotter/unicode-fonts
|
||||
dockers/Screenshotter/screenshotter.sh
|
||||
|
|
|
@ -81,6 +81,8 @@ app.use(express["static"](path.join(__dirname, "static")));
|
|||
app.use(express["static"](path.join(__dirname, "build")));
|
||||
app.use("/test", express["static"](path.join(__dirname, "test")));
|
||||
app.use("/contrib", express["static"](path.join(__dirname, "contrib")));
|
||||
// app.use("/unicode-fonts",
|
||||
// express["static"](path.join(__dirname, "static", "unicode-fonts")));
|
||||
|
||||
app.use(function(err, req, res, next) {
|
||||
console.error(err.stack);
|
||||
|
|
|
@ -4,6 +4,7 @@ var environments = require("./environments");
|
|||
var MacroExpander = require("./MacroExpander");
|
||||
var symbols = require("./symbols");
|
||||
var utils = require("./utils");
|
||||
var cjkRegex = require("./unicodeRegexes").cjkRegex;
|
||||
|
||||
var parseData = require("./parseData");
|
||||
var ParseError = require("./ParseError");
|
||||
|
@ -794,6 +795,11 @@ Parser.prototype.parseSymbol = function() {
|
|||
new ParseNode(symbols[this.mode][nucleus.text].group,
|
||||
nucleus.text, this.mode, nucleus),
|
||||
false, nucleus);
|
||||
} else if (this.mode === "text" && cjkRegex.test(nucleus.text)) {
|
||||
this.consume();
|
||||
return new ParseFuncOrArgument(
|
||||
new ParseNode("textord", nucleus.text, this.mode, nucleus),
|
||||
false, nucleus);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
*
|
||||
* Similar functions for working with MathML nodes exist in mathMLTree.js.
|
||||
*/
|
||||
|
||||
var unicodeRegexes = require("./unicodeRegexes");
|
||||
var utils = require("./utils");
|
||||
|
||||
/**
|
||||
|
@ -169,6 +169,14 @@ documentFragment.prototype.toMarkup = function() {
|
|||
return markup;
|
||||
};
|
||||
|
||||
var iCombinations = {
|
||||
'î': '\u0131\u0302',
|
||||
'ï': '\u0131\u0308',
|
||||
'í': '\u0131\u0301',
|
||||
// 'ī': '\u0131\u0304', // enable when we add Extended Latin
|
||||
'ì': '\u0131\u0300',
|
||||
};
|
||||
|
||||
/**
|
||||
* A symbol node contains information about a single symbol. It either renders
|
||||
* to a single text node, or a span with a single text node in it, depending on
|
||||
|
@ -183,6 +191,25 @@ function symbolNode(value, height, depth, italic, skew, classes, style) {
|
|||
this.classes = classes || [];
|
||||
this.style = style || {};
|
||||
this.maxFontSize = 0;
|
||||
|
||||
// Mark CJK characters with specific classes so that we can specify which
|
||||
// fonts to use. This allows us to render these characters with a serif
|
||||
// font in situations where the browser would either default to a sans serif
|
||||
// or render a placeholder character.
|
||||
if (unicodeRegexes.cjkRegex.test(value)) {
|
||||
// I couldn't find any fonts that contained Hangul as well as all of
|
||||
// the other characters we wanted to test; therefore Hangul gets its own
|
||||
// CSS class.
|
||||
if (unicodeRegexes.hangulRegex.test(value)) {
|
||||
this.classes.push('hangul_fallback');
|
||||
} else {
|
||||
this.classes.push('cjk_fallback');
|
||||
}
|
||||
}
|
||||
|
||||
if (/[îïíì]/.test(this.value)) { // add ī when we add Extended Latin
|
||||
this.value = iCombinations[this.value];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
/* eslint no-unused-vars:0 */
|
||||
|
||||
var Style = require("./Style");
|
||||
var cjkRegex = require("./unicodeRegexes").cjkRegex;
|
||||
|
||||
/**
|
||||
* This file contains metrics regarding fonts and individual symbols. The sigma
|
||||
|
@ -121,6 +122,145 @@ var metrics = {
|
|||
// This map is generated via `make metrics`. It should not be changed manually.
|
||||
var metricMap = require("./fontMetricsData");
|
||||
|
||||
// These are very rough approximations. We default to Times New Roman which
|
||||
// should have Latin-1 and Cyrillic characters, but may not depending on the
|
||||
// operating system. The metrics do not account for extra height from the
|
||||
// accents. In the case of Cyrillic characters which have both ascenders and
|
||||
// descenders we prefer approximations with ascenders, primarily to prevent
|
||||
// the fraction bar or root line from intersecting the glyph.
|
||||
// TODO(kevinb) allow union of multiple glyph metrics for better accuracy.
|
||||
var extraCharacterMap = {
|
||||
// Latin-1
|
||||
'À': 'A',
|
||||
'Á': 'A',
|
||||
'Â': 'A',
|
||||
'Ã': 'A',
|
||||
'Ä': 'A',
|
||||
'Å': 'A',
|
||||
'Æ': 'A',
|
||||
'Ç': 'C',
|
||||
'È': 'E',
|
||||
'É': 'E',
|
||||
'Ê': 'E',
|
||||
'Ë': 'E',
|
||||
'Ì': 'I',
|
||||
'Í': 'I',
|
||||
'Î': 'I',
|
||||
'Ï': 'I',
|
||||
'Ð': 'D',
|
||||
'Ñ': 'N',
|
||||
'Ò': 'O',
|
||||
'Ó': 'O',
|
||||
'Ô': 'O',
|
||||
'Õ': 'O',
|
||||
'Ö': 'O',
|
||||
'Ø': 'O',
|
||||
'Ù': 'U',
|
||||
'Ú': 'U',
|
||||
'Û': 'U',
|
||||
'Ü': 'U',
|
||||
'Ý': 'Y',
|
||||
'Þ': 'o',
|
||||
'ß': 'B',
|
||||
'à': 'a',
|
||||
'á': 'a',
|
||||
'â': 'a',
|
||||
'ã': 'a',
|
||||
'ä': 'a',
|
||||
'å': 'a',
|
||||
'æ': 'a',
|
||||
'ç': 'c',
|
||||
'è': 'e',
|
||||
'é': 'e',
|
||||
'ê': 'e',
|
||||
'ë': 'e',
|
||||
'ì': 'i',
|
||||
'í': 'i',
|
||||
'î': 'i',
|
||||
'ï': 'i',
|
||||
'ð': 'd',
|
||||
'ñ': 'n',
|
||||
'ò': 'o',
|
||||
'ó': 'o',
|
||||
'ô': 'o',
|
||||
'õ': 'o',
|
||||
'ö': 'o',
|
||||
'ø': 'o',
|
||||
'ù': 'u',
|
||||
'ú': 'u',
|
||||
'û': 'u',
|
||||
'ü': 'u',
|
||||
'ý': 'y',
|
||||
'þ': 'o',
|
||||
'ÿ': 'y',
|
||||
|
||||
// Cyrillic
|
||||
'А': 'A',
|
||||
'Б': 'B',
|
||||
'В': 'B',
|
||||
'Г': 'F',
|
||||
'Д': 'A',
|
||||
'Е': 'E',
|
||||
'Ж': 'K',
|
||||
'З': '3',
|
||||
'И': 'N',
|
||||
'Й': 'N',
|
||||
'К': 'K',
|
||||
'Л': 'N',
|
||||
'М': 'M',
|
||||
'Н': 'H',
|
||||
'О': 'O',
|
||||
'П': 'N',
|
||||
'Р': 'P',
|
||||
'С': 'C',
|
||||
'Т': 'T',
|
||||
'У': 'y',
|
||||
'Ф': 'O',
|
||||
'Х': 'X',
|
||||
'Ц': 'U',
|
||||
'Ч': 'h',
|
||||
'Ш': 'W',
|
||||
'Щ': 'W',
|
||||
'Ъ': 'B',
|
||||
'Ы': 'X',
|
||||
'Ь': 'B',
|
||||
'Э': '3',
|
||||
'Ю': 'X',
|
||||
'Я': 'R',
|
||||
'а': 'a',
|
||||
'б': 'b',
|
||||
'в': 'a',
|
||||
'г': 'r',
|
||||
'д': 'y',
|
||||
'е': 'e',
|
||||
'ж': 'm',
|
||||
'з': 'e',
|
||||
'и': 'n',
|
||||
'й': 'n',
|
||||
'к': 'n',
|
||||
'л': 'n',
|
||||
'м': 'm',
|
||||
'н': 'n',
|
||||
'о': 'o',
|
||||
'п': 'n',
|
||||
'р': 'p',
|
||||
'с': 'c',
|
||||
'т': 'o',
|
||||
'у': 'y',
|
||||
'ф': 'b',
|
||||
'х': 'x',
|
||||
'ц': 'n',
|
||||
'ч': 'n',
|
||||
'ш': 'w',
|
||||
'щ': 'w',
|
||||
'ъ': 'a',
|
||||
'ы': 'm',
|
||||
'ь': 'a',
|
||||
'э': 'e',
|
||||
'ю': 'm',
|
||||
'я': 'r',
|
||||
};
|
||||
|
||||
/**
|
||||
* This function is a convenience function for looking up information in the
|
||||
* metricMap table. It takes a character as a string, and a style.
|
||||
|
@ -129,7 +269,13 @@ var metricMap = require("./fontMetricsData");
|
|||
* built using `make extended_metrics`.
|
||||
*/
|
||||
var getCharacterMetrics = function(character, style) {
|
||||
var metrics = metricMap[style][character.charCodeAt(0)];
|
||||
var ch = character.charCodeAt(0);
|
||||
if (character[0] in extraCharacterMap) {
|
||||
ch = extraCharacterMap[character[0]].charCodeAt(0);
|
||||
} else if (cjkRegex.test(character[0])) {
|
||||
ch = 'M'.charCodeAt(0);
|
||||
}
|
||||
var metrics = metricMap[style][ch];
|
||||
if (metrics) {
|
||||
return {
|
||||
depth: metrics[0],
|
||||
|
|
|
@ -630,3 +630,25 @@ for (i = 0; i < letters.length; i++) {
|
|||
defineSymbol(math, main, mathord, ch, ch);
|
||||
defineSymbol(text, main, textord, ch, ch);
|
||||
}
|
||||
|
||||
// Latin-1 letters
|
||||
for (i = 0x00C0; i <= 0x00D6; i++) {
|
||||
ch = String.fromCharCode(i);
|
||||
defineSymbol(text, main, textord, ch, ch);
|
||||
}
|
||||
|
||||
for (i = 0x00D8; i <= 0x00F6; i++) {
|
||||
ch = String.fromCharCode(i);
|
||||
defineSymbol(text, main, textord, ch, ch);
|
||||
}
|
||||
|
||||
for (i = 0x00F8; i <= 0x00FF; i++) {
|
||||
ch = String.fromCharCode(i);
|
||||
defineSymbol(text, main, textord, ch, ch);
|
||||
}
|
||||
|
||||
// Cyrillic
|
||||
for (i = 0x0410; i <= 0x044F; i++) {
|
||||
ch = String.fromCharCode(i);
|
||||
defineSymbol(text, main, textord, ch, ch);
|
||||
}
|
||||
|
|
15
src/unicodeRegexes.js
Normal file
15
src/unicodeRegexes.js
Normal file
|
@ -0,0 +1,15 @@
|
|||
var hangulRegex = /[\uAC00-\uD7AF]/;
|
||||
|
||||
// This regex combines
|
||||
// - Hiragana: [\u3040-\u309F]
|
||||
// - Katakana: [\u30A0-\u30FF]
|
||||
// - CJK ideograms: [\u4E00-\u9FAF]
|
||||
// - Hangul syllables: [\uAC00-\uD7AF]
|
||||
// Notably missing are halfwidth Katakana and Romaji glyphs.
|
||||
var cjkRegex =
|
||||
/[\u3040-\u309F]|[\u30A0-\u30FF]|[\u4E00-\u9FAF]|[\uAC00-\uD7AF]/;
|
||||
|
||||
module.exports = {
|
||||
cjkRegex: cjkRegex,
|
||||
hangulRegex: hangulRegex,
|
||||
};
|
|
@ -15,7 +15,7 @@
|
|||
}
|
||||
|
||||
.katex {
|
||||
font: normal 1.21em KaTeX_Main;
|
||||
font: normal 1.21em KaTeX_Main, Times New Roman, serif;
|
||||
line-height: 1.2;
|
||||
white-space: nowrap;
|
||||
|
||||
|
|
BIN
test/screenshotter/images/Unicode-chrome.png
Normal file
BIN
test/screenshotter/images/Unicode-chrome.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 25 KiB |
BIN
test/screenshotter/images/Unicode-firefox.png
Normal file
BIN
test/screenshotter/images/Unicode-firefox.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 24 KiB |
|
@ -114,6 +114,7 @@ Symbols1: |
|
|||
\maltese\degree\pounds\$
|
||||
\text{\maltese\degree}
|
||||
Text: \frac{a}{b}\text{c~ {ab} \ e}+fg
|
||||
Unicode: \begin{matrix}\text{ÀàÇçÉéÏïÖöÛû} \\ \text{БГДЖЗЙЛФЦШЫЮЯ} \\ \text{여보세요} \\ \text{私はバナナです} \end{matrix}
|
||||
UnsupportedCmds:
|
||||
tex: \err\,\frac\fracerr3\,2^\superr_\suberr\,\sqrt\sqrterr
|
||||
noThrow: 1
|
||||
|
|
|
@ -11,6 +11,20 @@
|
|||
body {
|
||||
font-family: "DejaVu Serif",serif;
|
||||
}
|
||||
@font-face {
|
||||
font-family: "Mincho";
|
||||
src: url("unicode-fonts/mincho/font_1_honokamin.ttf") format("truetype");
|
||||
}
|
||||
@font-face {
|
||||
font-family: "Batang";
|
||||
src: url("unicode-fonts/batang/batang.ttf") format("truetype");
|
||||
}
|
||||
.katex .cjk_fallback {
|
||||
font-family: "Mincho",serif;
|
||||
}
|
||||
.katex .hangul_fallback {
|
||||
font-family: "Batang",serif;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
|
103
test/unicode-spec.js
Normal file
103
test/unicode-spec.js
Normal file
|
@ -0,0 +1,103 @@
|
|||
/* eslint max-len:0 */
|
||||
/* global beforeEach: false */
|
||||
/* global jasmine: false */
|
||||
/* global expect: false */
|
||||
/* global it: false */
|
||||
/* global describe: false */
|
||||
var ParseError = require("../src/ParseError");
|
||||
var parseTree = require("../src/parseTree");
|
||||
var Settings = require("../src/Settings");
|
||||
|
||||
var defaultSettings = new Settings({});
|
||||
|
||||
var parseAndSetResult = function(expr, result, settings) {
|
||||
try {
|
||||
return parseTree(expr, settings || defaultSettings);
|
||||
} catch (e) {
|
||||
result.pass = false;
|
||||
if (e instanceof ParseError) {
|
||||
result.message = "'" + expr + "' failed " +
|
||||
"parsing with error: " + e.message;
|
||||
} else {
|
||||
result.message = "'" + expr + "' failed " +
|
||||
"parsing with unknown error: " + e.message;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
describe("unicode", function() {
|
||||
beforeEach(function() {
|
||||
jasmine.addMatchers({
|
||||
|
||||
toParse: function() {
|
||||
return {
|
||||
compare: function(actual, settings) {
|
||||
var usedSettings = settings ? settings : defaultSettings;
|
||||
|
||||
var result = {
|
||||
pass: true,
|
||||
message: "'" + actual + "' succeeded parsing",
|
||||
};
|
||||
parseAndSetResult(actual, result, usedSettings);
|
||||
return result;
|
||||
},
|
||||
};
|
||||
},
|
||||
|
||||
toNotParse: function() {
|
||||
return {
|
||||
compare: function(actual, settings) {
|
||||
var usedSettings = settings ? settings : defaultSettings;
|
||||
|
||||
var result = {
|
||||
pass: false,
|
||||
message: "Expected '" + actual + "' to fail " +
|
||||
"parsing, but it succeeded",
|
||||
};
|
||||
|
||||
try {
|
||||
parseTree(actual, usedSettings);
|
||||
} catch (e) {
|
||||
if (e instanceof ParseError) {
|
||||
result.pass = true;
|
||||
result.message = "'" + actual + "' correctly " +
|
||||
"didn't parse with error: " + e.message;
|
||||
} else {
|
||||
result.message = "'" + actual + "' failed " +
|
||||
"parsing with unknown error: " + e.message;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
},
|
||||
};
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("should parse Latin-1 inside \\text{}", function() {
|
||||
expect('\\text{ÀàÇçÉéÏïÖöÛû}').toParse();
|
||||
});
|
||||
|
||||
it("should not parse Latin-1 outside \\text{}", function() {
|
||||
expect('ÀàÇçÉéÏïÖöÛû').toNotParse();
|
||||
});
|
||||
|
||||
it("should parse Cyrillic inside \\text{}", function() {
|
||||
expect('\\text{БГДЖЗЙЛФЦШЫЮЯ}').toParse();
|
||||
});
|
||||
|
||||
it("should not parse Cyrillic outside \\text{}", function() {
|
||||
expect('БГДЖЗЙЛФЦШЫЮЯ').toNotParse();
|
||||
});
|
||||
|
||||
it("should parse CJK inside \\text{}", function() {
|
||||
expect('\\text{私はバナナです}').toParse();
|
||||
expect('\\text{여보세요}').toParse();
|
||||
});
|
||||
|
||||
it("should not parse CJK outside \\text{}", function() {
|
||||
expect('私はバナナです。').toNotParse();
|
||||
expect('여보세요').toNotParse();
|
||||
});
|
||||
});
|
Loading…
Reference in New Issue
Block a user