KaTeX/metrics/metric_parse.py
Emily Eisenberg 0a3a2271f4 Add ability to calculate heights/depths
Summary:
This diff does a couple different things:

 - There is now a metrics/ folder, which contains the property files describing
   the metrics of the fonts, as well as a script for reading and printing the
   metrics in JavaScript.
 - Fractions and superscripts/subscripts are now rendered in slightly different
   ways (notably, no use of inline-table). This allows for much more precise
   positioning of the superscripts, subscripts, numerators, and denominators,
   while still having an appropriate baseline. Also, there is no longer a
   sup/sub/supsub distinction, there are only supsubs with null sup/sub.
 - Using the new font metrics and by implementing the formulas found in The TeX
   Book, Appendix G, the heights and depths of all of the sub-expressions in a
   formula are now calculated. These are currently used to:
    - Correctly position superscripts, subscripts, numerators, and denominators
    - Adjust the height and depth of the overall expression so it takes up the
      appropriate space
 - Because we have to add attributes (height and depth) to every node, I
   have changed the way DOM nodes are assembled. Now, instead of assembling the
   DOM elements inline (which is a problem because we need to track
   height/depth, and we shouldn't (and can't in IE 8) attach raw attributes to DOM
   nodes), we assemble a pseudo-DOM structure with the extra information, and
   then actually assemble it at the very end.

The main page also now has an updated expression to show off and test the new
and improved parsing.

Test Plan:
View the main page, make sure that the expression renders. Make sure
that the tests pass. Make sure that expressions have the correct calculated
height (this is most easily tested by viewing them on the main page and making
sure that the top of the expression lines up with the bottom of the input box).

Reviewers: alpert

Reviewed By: alpert

Differential Revision: http://phabricator.khanacademy.org/D3442
2013-08-13 18:16:43 -07:00

329 lines
9.3 KiB
Python
Executable File

#!/usr/bin/env python
import re
import json
import os
# Token patterns for the parenthesised list syntax. Each one also swallows any
# whitespace in front of the token it recognises.
expr_start = re.compile(r"\s*\(")
# An atom: a run of characters that are neither whitespace nor ")". Because
# "(" is not excluded, an atom such as "a(b" is read as a single token.
expr_value = re.compile(r"\s*([^\s\)]+)")
expr_end = re.compile(r"\s*\)")
# Only (optional) whitespace remains before the end of the input.
empty = re.compile(r"\s*$")


def inner_parse_expr(expr):
    """Parse a parenthesised expression into nested lists of string atoms.

    The input is scanned once, left to right. Every "(" opens a new sub-list,
    every atom is appended to the innermost open list, and every ")" closes
    the innermost open list.

    Scanning stops when either

    * only whitespace is left; any lists still open are closed implicitly, or
    * a ")" is met while no sub-list is open.

    The scan tracks a position in ``expr`` instead of re-slicing the string
    after each token, and uses an explicit stack instead of recursion, so
    the cost is linear in the input size and deep nesting cannot exhaust the
    interpreter's recursion limit.

    Args:
        expr: The text to parse.

    Returns:
        A ``(rest, items)`` tuple. ``rest`` is the unconsumed tail of
        ``expr`` (trailing whitespace is left in place) and ``items`` is the
        list of atoms and sub-lists that were read.

    Raises:
        ValueError: If none of the token patterns matches. Kept as a
            safeguard; the four patterns appear to cover every input.
    """
    pos = 0
    current = []
    # Lists whose "(" has been seen but whose ")" has not, outermost first.
    parents = []
    while True:
        if empty.match(expr, pos):
            # Out of input: fold any still-open lists into their parents.
            while parents:
                outer = parents.pop()
                outer.append(current)
                current = outer
            return expr[pos:], current

        match = expr_start.match(expr, pos)
        if match:
            parents.append(current)
            current = []
            pos = match.end()
            continue

        match = expr_value.match(expr, pos)
        if match:
            current.append(match.group(1))
            pos = match.end()
            continue

        match = expr_end.match(expr, pos)
        if match:
            pos = match.end()
            if not parents:
                # A ")" with nothing open ends the parse; the caller gets
                # whatever follows it as the remainder.
                return expr[pos:], current
            outer = parents.pop()
            outer.append(current)
            current = outer
            continue

        raise ValueError("Invalid input")
def parse_expr(expr):
    """Parse ``expr`` and return only the resulting nested list.

    Any text that ``inner_parse_expr`` reports as left over is discarded.
    """
    return inner_parse_expr(expr)[1]
def read_pl(pl_file):
    """Read the file at ``pl_file`` and return its contents parsed by
    ``parse_expr``."""
    with open(pl_file) as source:
        text = source.read()
    return parse_expr(text)
class Metric:
    """Height and depth recorded for a single character.

    ``height`` and ``depth`` are stored as the given values converted to
    float and divided by 1000 (presumably turning thousandths of a unit into
    whole units -- confirm against the input files).
    """

    def __init__(self, char, height, depth):
        self.char = char
        # float() accepts numbers as well as numeric strings.
        self.height = float(height) / 1000
        self.depth = float(depth) / 1000

    def __repr__(self):
        # Shown as "Char <char> (<height>+<depth>)", three decimals each.
        template = "Char {0} ({1:.3f}+{2:.3f})"
        return template.format(self.char, self.height, self.depth)
# Different styles of text, found on page 13 of The TeXbook
ROMAN = "roman"
SLANTED = "slanted"
ITALIC = "italic"
TYPEWRITER = "typewriter"
BOLD = "bold"

# For each input file, maps a character slot to (unicode character, style).
#
# The keys are written with the digits of the slot's octal code, because
# that's how all the charts for the numbers are written. They are ordinary
# decimal literals, though: the slot with octal code 013 is stored under the
# int 13, not under 0o13.
#
# Every (character, style) pair appears only once across the whole table, so
# that no slot's metrics overwrite another's when they are later grouped by
# style and character.
file_map = {
    "cmmi10.pl": {
        13: (u"\u03b1", ITALIC),  # \alpha
        14: (u"\u03b2", ITALIC),  # \beta
        15: (u"\u03b3", ITALIC),  # \gamma
        16: (u"\u03b4", ITALIC),  # \delta
        # Slot 17 is the lunate \epsilon (U+03F5). It used to share U+03B5
        # with slot 42, which made the two entries collide.
        17: (u"\u03f5", ITALIC),  # \epsilon
        20: (u"\u03b6", ITALIC),  # \zeta
        21: (u"\u03b7", ITALIC),  # \eta
        22: (u"\u03b8", ITALIC),  # \theta
        23: (u"\u03b9", ITALIC),  # \iota
        24: (u"\u03ba", ITALIC),  # \kappa
        25: (u"\u03bb", ITALIC),  # \lambda
        26: (u"\u03bc", ITALIC),  # \mu
        27: (u"\u03bd", ITALIC),  # \nu
        30: (u"\u03be", ITALIC),  # \xi
        31: (u"\u03c0", ITALIC),  # \pi
        32: (u"\u03c1", ITALIC),  # \rho
        33: (u"\u03c3", ITALIC),  # \sigma
        34: (u"\u03c4", ITALIC),  # \tau
        35: (u"\u03c5", ITALIC),  # \upsilon
        36: (u"\u03d5", ITALIC),  # \phi
        37: (u"\u03c7", ITALIC),  # \chi
        40: (u"\u03c8", ITALIC),  # \psi
        41: (u"\u03c9", ITALIC),  # \omega
        42: (u"\u03b5", ITALIC),  # \varepsilon
        43: (u"\u03d1", ITALIC),  # \vartheta
        44: (u"\u03d6", ITALIC),  # \varpi
        45: (u"\u03f1", ITALIC),  # \varrho
        46: (u"\u03c2", ITALIC),  # \varsigma
        47: (u"\u03c6", ITALIC),  # \varphi
        72: (u".", ROMAN),
        73: (u",", ROMAN),
        74: (u"<", ROMAN),
        75: (u"/", ROMAN),
        76: (u">", ROMAN),
        101: (u"A", ITALIC),
        102: (u"B", ITALIC),
        103: (u"C", ITALIC),
        104: (u"D", ITALIC),
        105: (u"E", ITALIC),
        106: (u"F", ITALIC),
        107: (u"G", ITALIC),
        110: (u"H", ITALIC),
        111: (u"I", ITALIC),
        112: (u"J", ITALIC),
        113: (u"K", ITALIC),
        114: (u"L", ITALIC),
        115: (u"M", ITALIC),
        116: (u"N", ITALIC),
        117: (u"O", ITALIC),
        120: (u"P", ITALIC),
        121: (u"Q", ITALIC),
        122: (u"R", ITALIC),
        123: (u"S", ITALIC),
        124: (u"T", ITALIC),
        125: (u"U", ITALIC),
        126: (u"V", ITALIC),
        127: (u"W", ITALIC),
        130: (u"X", ITALIC),
        131: (u"Y", ITALIC),
        132: (u"Z", ITALIC),
        141: (u"a", ITALIC),
        142: (u"b", ITALIC),
        143: (u"c", ITALIC),
        144: (u"d", ITALIC),
        145: (u"e", ITALIC),
        146: (u"f", ITALIC),
        147: (u"g", ITALIC),
        150: (u"h", ITALIC),
        151: (u"i", ITALIC),
        152: (u"j", ITALIC),
        153: (u"k", ITALIC),
        154: (u"l", ITALIC),
        155: (u"m", ITALIC),
        156: (u"n", ITALIC),
        157: (u"o", ITALIC),
        160: (u"p", ITALIC),
        161: (u"q", ITALIC),
        162: (u"r", ITALIC),
        163: (u"s", ITALIC),
        164: (u"t", ITALIC),
        165: (u"u", ITALIC),
        166: (u"v", ITALIC),
        167: (u"w", ITALIC),
        170: (u"x", ITALIC),
        171: (u"y", ITALIC),
        172: (u"z", ITALIC),
    },
    "cmr10.pl": {
        0: (u"\u0393", ROMAN),  # \Gamma
        1: (u"\u0394", ROMAN),  # \Delta
        2: (u"\u0398", ROMAN),  # \Theta
        3: (u"\u039b", ROMAN),  # \Lambda
        4: (u"\u039e", ROMAN),  # \Xi
        5: (u"\u03a0", ROMAN),  # \Pi
        6: (u"\u03a3", ROMAN),  # \Sigma
        7: (u"\u03a5", ROMAN),  # \Upsilon
        10: (u"\u03a6", ROMAN),  # \Phi
        11: (u"\u03a8", ROMAN),  # \Psi
        12: (u"\u03a9", ROMAN),  # \Omega
        41: (u"!", ROMAN),
        42: (u"\"", ROMAN),
        44: (u"$", ROMAN),
        50: (u"(", ROMAN),
        51: (u")", ROMAN),
        53: (u"+", ROMAN),
        60: (u"0", ROMAN),
        61: (u"1", ROMAN),
        62: (u"2", ROMAN),
        63: (u"3", ROMAN),
        64: (u"4", ROMAN),
        65: (u"5", ROMAN),
        66: (u"6", ROMAN),
        67: (u"7", ROMAN),
        70: (u"8", ROMAN),
        71: (u"9", ROMAN),
        72: (u":", ROMAN),
        73: (u";", ROMAN),
        75: (u"=", ROMAN),
        77: (u"?", ROMAN),
        100: (u"@", ROMAN),
        101: (u"A", ROMAN),
        102: (u"B", ROMAN),
        103: (u"C", ROMAN),
        104: (u"D", ROMAN),
        105: (u"E", ROMAN),
        106: (u"F", ROMAN),
        107: (u"G", ROMAN),
        110: (u"H", ROMAN),
        111: (u"I", ROMAN),
        112: (u"J", ROMAN),
        113: (u"K", ROMAN),
        114: (u"L", ROMAN),
        115: (u"M", ROMAN),
        116: (u"N", ROMAN),
        117: (u"O", ROMAN),
        120: (u"P", ROMAN),
        121: (u"Q", ROMAN),
        122: (u"R", ROMAN),
        123: (u"S", ROMAN),
        124: (u"T", ROMAN),
        125: (u"U", ROMAN),
        126: (u"V", ROMAN),
        127: (u"W", ROMAN),
        130: (u"X", ROMAN),
        131: (u"Y", ROMAN),
        132: (u"Z", ROMAN),
        133: (u"[", ROMAN),
        135: (u"]", ROMAN),
        140: (u"`", ROMAN),
        141: (u"a", ROMAN),
        142: (u"b", ROMAN),
        143: (u"c", ROMAN),
        144: (u"d", ROMAN),
        145: (u"e", ROMAN),
        146: (u"f", ROMAN),
        147: (u"g", ROMAN),
        150: (u"h", ROMAN),
        151: (u"i", ROMAN),
        152: (u"j", ROMAN),
        153: (u"k", ROMAN),
        154: (u"l", ROMAN),
        155: (u"m", ROMAN),
        156: (u"n", ROMAN),
        157: (u"o", ROMAN),
        160: (u"p", ROMAN),
        161: (u"q", ROMAN),
        162: (u"r", ROMAN),
        163: (u"s", ROMAN),
        164: (u"t", ROMAN),
        165: (u"u", ROMAN),
        166: (u"v", ROMAN),
        167: (u"w", ROMAN),
        170: (u"x", ROMAN),
        171: (u"y", ROMAN),
        172: (u"z", ROMAN),
    },
    "cmsy10.pl": {
        0: (u"\u2212", ROMAN),  # -
        1: (u"\u22c5", ROMAN),  # \cdot
        2: (u"\u00d7", ROMAN),  # \times
        3: (u"\u2217", ROMAN),  # *
        4: (u"\u00f7", ROMAN),  # \div
        6: (u"\u00b1", ROMAN),  # \pm
        16: (u"\u2218", ROMAN),  # \circ
        24: (u"\u2264", ROMAN),  # \leq
        25: (u"\u2265", ROMAN),  # \geq
        40: (u"\u2190", ROMAN),  # \leftarrow
        41: (u"\u2192", ROMAN),  # \rightarrow
        60: (u"\u2032", ROMAN),  # \prime
        61: (u"\u221e", ROMAN),  # \infty
        152: (u"|", ROMAN),  # |
    }
}
def read_metrics(pl_file, metrics):
    """Collect height/depth metrics for the mapped characters of one file.

    Parses ``pl_file`` with ``read_pl`` and, for every ``CHARACTER`` entry
    whose slot appears in ``file_map[pl_file]``, appends a ``Metric`` to the
    list stored in ``metrics`` under that character's style.

    Args:
        pl_file: File name, used both as the path to read and as the key into
            ``file_map``.
        metrics: Dict mapping a style name to a list of ``Metric`` objects.
            It is updated in place; nothing is returned.

    Raises:
        KeyError: If ``pl_file`` is not a key of ``file_map``, or a mapped
            style has no list in ``metrics``.
        ValueError: If a slot, height or depth value is not an integer
            literal.
    """
    pl = read_pl(pl_file)
    metrics_to_read = file_map[pl_file]

    for elem in pl:
        if elem[0] != "CHARACTER":
            continue

        if elem[1] == "C":
            # "C" entries name the character itself. file_map keys are
            # decimal ints whose digits spell the octal slot number, so
            # write the code point in octal and read those digits back as a
            # decimal number. format(..., "o") yields bare digits on every
            # Python version; oct() adds a "0o" prefix on Python 3, which
            # int() rejects.
            char = int(format(ord(elem[2]), "o"))
        elif elem[1] == "O":
            # "O" entries already carry the octal digits.
            char = int(elem[2])
        else:
            continue

        if char not in metrics_to_read:
            continue

        map_char, map_style = metrics_to_read[char]

        # A property that is absent from the entry counts as zero.
        char_height = 0
        char_depth = 0
        for metric in elem[3:]:
            # Only these two properties are used; anything else is skipped.
            if metric[0] == "CHARHT":
                char_height = int(metric[2])
            elif metric[0] == "CHARDP":
                char_depth = int(metric[2])

        metrics[map_style].append(
            Metric(map_char, char_height, char_depth))
def print_metrics(metrics):
    """Print the collected metrics as a JavaScript variable declaration.

    Writes ``var metricMap = <json>;`` to standard output. The JSON object
    has one entry per style, each holding a ``"height"`` and a ``"depth"``
    object keyed by character. If a style lists the same character more than
    once, the last occurrence wins.

    Args:
        metrics: Dict mapping a style name to an iterable of objects that
            have ``char``, ``height`` and ``depth`` attributes.
    """
    # items() and the single-argument print(...) call behave the same on
    # Python 2 and Python 3, unlike iteritems() and the print statement.
    metric_map = {
        style: {
            "height": {
                metric.char: metric.height for metric in metric_list
            },
            "depth": {
                metric.char: metric.depth for metric in metric_list
            },
        } for style, metric_list in metrics.items()
    }

    print("var metricMap = {0};".format(json.dumps(metric_map, indent=4)))
if __name__ == "__main__":
    # Work from the directory that holds this script, so the bare file names
    # used as file_map keys can be opened directly.
    here = os.path.dirname(os.path.realpath(__file__))
    os.chdir(here)

    # One (initially empty) list of Metric objects per style.
    metrics = {
        style: [] for style in (
            "roman", "italic", "slanted", "typewriter", "bold")
    }

    for metric_file in file_map:
        read_metrics(metric_file, metrics)

    # U+00A0 is added by hand as a roman character with zero height and depth.
    nbsp = Metric(u'\u00a0', 0, 0)
    metrics["roman"].append(nbsp)

    print_metrics(metrics)