commit ec62ec39d82b6b42ee2ec9ad866a79c56b8e2990
parent 92bbbffbc800cf3d8a586d01505a546052b3a0b1
Author: Kevin Barabash <kevinb7@gmail.com>
Date: Mon, 1 Aug 2016 17:51:40 -0700
Add support for Latin-1, Cyrillic, and CJK characters inside \text{} (#508)
Summary:
This diff provides support for Latin-1, Cyrillic, and CJK characters
inside \text{} groups. For Latin-1 and Cyrillic characters we use the
metrics of a Basic Latin glyph that has roughly the same bounding box.
We use the metrics of a capital 'M' to approximate the full-width CJK
characters. Half-width characters are not supported yet.
Test Plan:
- make test
- make screenshots
Reviewers: emily
Diffstat:
15 files changed, 348 insertions(+), 4 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,4 @@ diff.png
/test/symgroups.aux
/test/symgroups.log
/test/symgroups.pdf
+/test/screenshotter/unicode-fonts
diff --git a/.travis.yml b/.travis.yml
@@ -10,4 +10,5 @@ before_script:
- docker images --no-trunc
script:
- npm test
+ - git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts
- dockers/Screenshotter/screenshotter.sh --verify
diff --git a/Makefile b/Makefile
@@ -39,6 +39,12 @@ build/fonts:
cp static/fonts/$$font* $@; \
done
+test/screenshotter/unicode-fonts:
+ git clone https://github.com/Khan/KaTeX-test-fonts test/screenshotter/unicode-fonts
+ cd test/screenshotter/unicode-fonts && \
+ git checkout 99fa66a2da643218754c8236b9f9151cac71ba7c && \
+ cd ../../../
+
contrib: build/contrib
.PHONY: build/contrib
@@ -90,5 +96,5 @@ extended_metrics:
clean:
rm -rf build/*
-screenshots:
+screenshots: test/screenshotter/unicode-fonts
dockers/Screenshotter/screenshotter.sh
diff --git a/server.js b/server.js
@@ -81,6 +81,8 @@ app.use(express["static"](path.join(__dirname, "static")));
app.use(express["static"](path.join(__dirname, "build")));
app.use("/test", express["static"](path.join(__dirname, "test")));
app.use("/contrib", express["static"](path.join(__dirname, "contrib")));
+// app.use("/unicode-fonts",
+// express["static"](path.join(__dirname, "static", "unicode-fonts")));
app.use(function(err, req, res, next) {
console.error(err.stack);
diff --git a/src/Parser.js b/src/Parser.js
@@ -4,6 +4,7 @@ var environments = require("./environments");
var MacroExpander = require("./MacroExpander");
var symbols = require("./symbols");
var utils = require("./utils");
+var cjkRegex = require("./unicodeRegexes").cjkRegex;
var parseData = require("./parseData");
var ParseError = require("./ParseError");
@@ -794,6 +795,11 @@ Parser.prototype.parseSymbol = function() {
new ParseNode(symbols[this.mode][nucleus.text].group,
nucleus.text, this.mode, nucleus),
false, nucleus);
+ } else if (this.mode === "text" && cjkRegex.test(nucleus.text)) {
+ this.consume();
+ return new ParseFuncOrArgument(
+ new ParseNode("textord", nucleus.text, this.mode, nucleus),
+ false, nucleus);
} else {
return null;
}
diff --git a/src/domTree.js b/src/domTree.js
@@ -7,7 +7,7 @@
*
* Similar functions for working with MathML nodes exist in mathMLTree.js.
*/
-
+var unicodeRegexes = require("./unicodeRegexes");
var utils = require("./utils");
/**
@@ -169,6 +169,14 @@ documentFragment.prototype.toMarkup = function() {
return markup;
};
+var iCombinations = { // accented "i" -> dotless i (U+0131) + combining mark
+ 'î': '\u0131\u0302', // U+0302 combining circumflex accent
+ 'ï': '\u0131\u0308', // U+0308 combining diaeresis
+ 'í': '\u0131\u0301', // U+0301 combining acute accent
+ // 'ī': '\u0131\u0304', // enable when we add Extended Latin
+ 'ì': '\u0131\u0300', // U+0300 combining grave accent
+};
+
/**
* A symbol node contains information about a single symbol. It either renders
* to a single text node, or a span with a single text node in it, depending on
@@ -183,6 +191,25 @@ function symbolNode(value, height, depth, italic, skew, classes, style) {
this.classes = classes || [];
this.style = style || {};
this.maxFontSize = 0;
+
+ // Mark CJK characters with specific classes so that we can specify which
+ // fonts to use. This allows us to render these characters with a serif
+ // font in situations where the browser would either default to a sans serif
+ // or render a placeholder character.
+ if (unicodeRegexes.cjkRegex.test(value)) {
+ // I couldn't find any fonts that contained Hangul as well as all of
+ // the other characters we wanted to test, therefore it gets its own
+ // CSS class.
+ if (unicodeRegexes.hangulRegex.test(value)) {
+ this.classes.push('hangul_fallback');
+ } else {
+ this.classes.push('cjk_fallback');
+ }
+ }
+
+ if (/[îïíì]/.test(this.value)) { // add ī when we add Extended Latin
+ this.value = iCombinations[this.value];
+ }
}
/**
diff --git a/src/fontMetrics.js b/src/fontMetrics.js
@@ -1,6 +1,7 @@
/* eslint no-unused-vars:0 */
var Style = require("./Style");
+var cjkRegex = require("./unicodeRegexes").cjkRegex;
/**
* This file contains metrics regarding fonts and individual symbols. The sigma
@@ -121,6 +122,145 @@ var metrics = {
// This map is generated via `make metrics`. It should not be changed manually.
var metricMap = require("./fontMetricsData");
+// Maps each Latin-1/Cyrillic character to a Basic Latin glyph whose metrics
+// stand in for it. These are very rough approximations: they ignore the extra
+// height of accents, and the fallback font (Times New Roman) may lack these
+// characters on some operating systems. For Cyrillic characters with both
+// ascenders and descenders we prefer stand-ins with ascenders, primarily to
+// keep the fraction bar or root line from intersecting the glyph.
+// TODO(kevinb) allow union of multiple glyph metrics for better accuracy.
+var extraCharacterMap = {
+ // Latin-1
+ 'À': 'A',
+ 'Á': 'A',
+ 'Â': 'A',
+ 'Ã': 'A',
+ 'Ä': 'A',
+ 'Å': 'A',
+ 'Æ': 'A',
+ 'Ç': 'C',
+ 'È': 'E',
+ 'É': 'E',
+ 'Ê': 'E',
+ 'Ë': 'E',
+ 'Ì': 'I',
+ 'Í': 'I',
+ 'Î': 'I',
+ 'Ï': 'I',
+ 'Ð': 'D',
+ 'Ñ': 'N',
+ 'Ò': 'O',
+ 'Ó': 'O',
+ 'Ô': 'O',
+ 'Õ': 'O',
+ 'Ö': 'O',
+ 'Ø': 'O',
+ 'Ù': 'U',
+ 'Ú': 'U',
+ 'Û': 'U',
+ 'Ü': 'U',
+ 'Ý': 'Y',
+ 'Þ': 'o',
+ 'ß': 'B',
+ 'à': 'a',
+ 'á': 'a',
+ 'â': 'a',
+ 'ã': 'a',
+ 'ä': 'a',
+ 'å': 'a',
+ 'æ': 'a',
+ 'ç': 'c',
+ 'è': 'e',
+ 'é': 'e',
+ 'ê': 'e',
+ 'ë': 'e',
+ 'ì': 'i',
+ 'í': 'i',
+ 'î': 'i',
+ 'ï': 'i',
+ 'ð': 'd',
+ 'ñ': 'n',
+ 'ò': 'o',
+ 'ó': 'o',
+ 'ô': 'o',
+ 'õ': 'o',
+ 'ö': 'o',
+ 'ø': 'o',
+ 'ù': 'u',
+ 'ú': 'u',
+ 'û': 'u',
+ 'ü': 'u',
+ 'ý': 'y',
+ 'þ': 'o',
+ 'ÿ': 'y',
+
+ // Cyrillic
+ 'А': 'A',
+ 'Б': 'B',
+ 'В': 'B',
+ 'Г': 'F',
+ 'Д': 'A',
+ 'Е': 'E',
+ 'Ж': 'K',
+ 'З': '3',
+ 'И': 'N',
+ 'Й': 'N',
+ 'К': 'K',
+ 'Л': 'N',
+ 'М': 'M',
+ 'Н': 'H',
+ 'О': 'O',
+ 'П': 'N',
+ 'Р': 'P',
+ 'С': 'C',
+ 'Т': 'T',
+ 'У': 'y',
+ 'Ф': 'O',
+ 'Х': 'X',
+ 'Ц': 'U',
+ 'Ч': 'h',
+ 'Ш': 'W',
+ 'Щ': 'W',
+ 'Ъ': 'B',
+ 'Ы': 'X',
+ 'Ь': 'B',
+ 'Э': '3',
+ 'Ю': 'X',
+ 'Я': 'R',
+ 'а': 'a',
+ 'б': 'b',
+ 'в': 'a',
+ 'г': 'r',
+ 'д': 'y',
+ 'е': 'e',
+ 'ж': 'm',
+ 'з': 'e',
+ 'и': 'n',
+ 'й': 'n',
+ 'к': 'n',
+ 'л': 'n',
+ 'м': 'm',
+ 'н': 'n',
+ 'о': 'o',
+ 'п': 'n',
+ 'р': 'p',
+ 'с': 'c',
+ 'т': 'o',
+ 'у': 'y',
+ 'ф': 'b',
+ 'х': 'x',
+ 'ц': 'n',
+ 'ч': 'n',
+ 'ш': 'w',
+ 'щ': 'w',
+ 'ъ': 'a',
+ 'ы': 'm',
+ 'ь': 'a',
+ 'э': 'e',
+ 'ю': 'm',
+ 'я': 'r',
+};
+
/**
* This function is a convenience function for looking up information in the
* metricMap table. It takes a character as a string, and a style.
@@ -129,7 +269,13 @@ var metricMap = require("./fontMetricsData");
* built using `Make extended_metrics`.
*/
var getCharacterMetrics = function(character, style) {
- var metrics = metricMap[style][character.charCodeAt(0)];
+ var ch = character.charCodeAt(0);
+ if (character[0] in extraCharacterMap) {
+ ch = extraCharacterMap[character[0]].charCodeAt(0);
+ } else if (cjkRegex.test(character[0])) {
+ ch = 'M'.charCodeAt(0);
+ }
+ var metrics = metricMap[style][ch];
if (metrics) {
return {
depth: metrics[0],
diff --git a/src/symbols.js b/src/symbols.js
@@ -630,3 +630,25 @@ for (i = 0; i < letters.length; i++) {
defineSymbol(math, main, mathord, ch, ch);
defineSymbol(text, main, textord, ch, ch);
}
+
+// Latin-1 letters, defined for text only; skips U+00D7 (×) and U+00F7 (÷)
+for (i = 0x00C0; i <= 0x00D6; i++) { // À-Ö
+ ch = String.fromCharCode(i);
+ defineSymbol(text, main, textord, ch, ch);
+}
+
+for (i = 0x00D8; i <= 0x00F6; i++) { // Ø-ö
+ ch = String.fromCharCode(i);
+ defineSymbol(text, main, textord, ch, ch);
+}
+
+for (i = 0x00F8; i <= 0x00FF; i++) { // ø-ÿ
+ ch = String.fromCharCode(i);
+ defineSymbol(text, main, textord, ch, ch);
+}
+
+// Cyrillic А-я, defined for text only; Ё (U+0401) and ё (U+0451) are outside
+for (i = 0x0410; i <= 0x044F; i++) {
+ ch = String.fromCharCode(i);
+ defineSymbol(text, main, textord, ch, ch);
+}
diff --git a/src/unicodeRegexes.js b/src/unicodeRegexes.js
@@ -0,0 +1,15 @@
+var hangulRegex = /[\uAC00-\uD7AF]/; // Hangul syllables only
+
+// Unanchored: matches any string containing at least one character from
+// - Hiragana: [\u3040-\u309F]
+// - Katakana: [\u30A0-\u30FF]
+// - CJK ideograms: [\u4E00-\u9FAF]
+// - Hangul syllables: [\uAC00-\uD7AF]
+// Notably missing are halfwidth Katakana and Romaji glyphs.
+var cjkRegex =
+ /[\u3040-\u309F]|[\u30A0-\u30FF]|[\u4E00-\u9FAF]|[\uAC00-\uD7AF]/;
+
+module.exports = {
+ cjkRegex: cjkRegex,
+ hangulRegex: hangulRegex,
+};
diff --git a/static/katex.less b/static/katex.less
@@ -15,7 +15,7 @@
}
.katex {
- font: normal 1.21em KaTeX_Main;
+ font: normal 1.21em KaTeX_Main, Times New Roman, serif;
line-height: 1.2;
white-space: nowrap;
diff --git a/test/screenshotter/images/Unicode-chrome.png b/test/screenshotter/images/Unicode-chrome.png
Binary files differ.
diff --git a/test/screenshotter/images/Unicode-firefox.png b/test/screenshotter/images/Unicode-firefox.png
Binary files differ.
diff --git a/test/screenshotter/ss_data.yaml b/test/screenshotter/ss_data.yaml
@@ -114,6 +114,7 @@ Symbols1: |
\maltese\degree\pounds\$
\text{\maltese\degree}
Text: \frac{a}{b}\text{c~ {ab} \ e}+fg
+Unicode: \begin{matrix}\text{ÀàÇçÉéÏïÖöÛû} \\ \text{БГДЖЗЙЛФЦШЫЮЯ} \\ \text{여보세요} \\ \text{私はバナナです} \end{matrix}
UnsupportedCmds:
tex: \err\,\frac\fracerr3\,2^\superr_\suberr\,\sqrt\sqrterr
noThrow: 1
diff --git a/test/screenshotter/test.html b/test/screenshotter/test.html
@@ -11,6 +11,20 @@
body {
font-family: "DejaVu Serif",serif;
}
+ @font-face {
+ font-family: "Mincho";
+ src: url("unicode-fonts/mincho/font_1_honokamin.ttf") format("truetype");
+ }
+ @font-face {
+ font-family: "Batang";
+ src: url("unicode-fonts/batang/batang.ttf") format("truetype");
+ }
+ .katex .cjk_fallback {
+ font-family: "Mincho",serif;
+ }
+ .katex .hangul_fallback {
+ font-family: "Batang",serif;
+ }
</style>
</head>
<body>
diff --git a/test/unicode-spec.js b/test/unicode-spec.js
@@ -0,0 +1,103 @@
+/* eslint max-len:0 */
+/* global beforeEach: false */
+/* global jasmine: false */
+/* global expect: false */
+/* global it: false */
+/* global describe: false */
+var ParseError = require("../src/ParseError");
+var parseTree = require("../src/parseTree");
+var Settings = require("../src/Settings");
+
+var defaultSettings = new Settings({});
+
+var parseAndSetResult = function(expr, result, settings) { // parse expr; record failure in result
+ try {
+ return parseTree(expr, settings || defaultSettings); // parse tree on success
+ } catch (e) { // nothing is rethrown, so the function returns undefined here
+ result.pass = false;
+ if (e instanceof ParseError) {
+ result.message = "'" + expr + "' failed " +
+ "parsing with error: " + e.message;
+ } else { // any other exception type is reported as "unknown"
+ result.message = "'" + expr + "' failed " +
+ "parsing with unknown error: " + e.message;
+ }
+ }
+};
+
+describe("unicode", function() { // parse-only checks for Latin-1, Cyrillic and CJK input
+ beforeEach(function() {
+ jasmine.addMatchers({
+
+ toParse: function() { // passes when parseTree() does not throw
+ return {
+ compare: function(actual, settings) {
+ var usedSettings = settings ? settings : defaultSettings;
+
+ var result = {
+ pass: true,
+ message: "'" + actual + "' succeeded parsing",
+ };
+ parseAndSetResult(actual, result, usedSettings); // sets pass=false on any throw
+ return result;
+ },
+ };
+ },
+
+ toNotParse: function() { // passes only when parseTree() throws a ParseError
+ return {
+ compare: function(actual, settings) {
+ var usedSettings = settings ? settings : defaultSettings;
+
+ var result = {
+ pass: false, // stays false unless a ParseError is caught below
+ message: "Expected '" + actual + "' to fail " +
+ "parsing, but it succeeded",
+ };
+
+ try {
+ parseTree(actual, usedSettings);
+ } catch (e) {
+ if (e instanceof ParseError) {
+ result.pass = true;
+ result.message = "'" + actual + "' correctly " +
+ "didn't parse with error: " + e.message;
+ } else { // non-ParseError: pass stays false, only the message changes
+ result.message = "'" + actual + "' failed " +
+ "parsing with unknown error: " + e.message;
+ }
+ }
+
+ return result;
+ },
+ };
+ },
+ });
+ });
+
+ it("should parse Latin-1 inside \\text{}", function() {
+ expect('\\text{ÀàÇçÉéÏïÖöÛû}').toParse();
+ });
+
+ it("should not parse Latin-1 outside \\text{}", function() {
+ expect('ÀàÇçÉéÏïÖöÛû').toNotParse();
+ });
+
+ it("should parse Cyrillic inside \\text{}", function() {
+ expect('\\text{БГДЖЗЙЛФЦШЫЮЯ}').toParse();
+ });
+
+ it("should not parse Cyrillic outside \\text{}", function() {
+ expect('БГДЖЗЙЛФЦШЫЮЯ').toNotParse();
+ });
+
+ it("should parse CJK inside \\text{}", function() {
+ expect('\\text{私はバナナです}').toParse();
+ expect('\\text{여보세요}').toParse();
+ });
+
+ it("should not parse CJK outside \\text{}", function() {
+ expect('私はバナナです。').toNotParse(); // note: "。" (U+3002) is outside cjkRegex's ranges
+ expect('여보세요').toNotParse();
+ });
+});