Skip to content

Commit d8fc35e

Browse files
ronkokedemaine
andauthored
feat: Support Unicode (sub|super)script characters (#3633)
* feat: Support Unicode (sub|super)script characters * Acquire tokens via repeated fetch() * Match more Unicode (sub|super)script characters * Update docs with new characters * Add Greek characters to RegEx * Pick up review comments Co-authored-by: Erik Demaine <[email protected]>
1 parent c31256f commit d8fc35e

File tree

4 files changed

+139
-0
lines changed

4 files changed

+139
-0
lines changed

docs/supported.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,9 @@ $\allowbreak α β γ δ ϵ ζ η θ ι κ λ μ ν ξ o π \allowbreak ρ σ τ
190190

191191
Direct Input: $∂ ∇ ℑ Ⅎ ℵ ℶ ℷ ℸ ⅁ ℏ ð − ∗$
192192
ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖÙÚÛÜÝÞßàáâãäåçèéêëìíîïðñòóôöùúûüýþÿ
193+
₊₋₌₍₎₀₁₂₃₄₅₆₇₈₉ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓᵦᵧᵨᵩᵪ⁺⁻⁼⁽⁾⁰¹²³⁴⁵⁶⁷⁸⁹ᵃᵇᶜᵈᵉᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘʷˣʸᶻᵛᵝᵞᵟᵠᵡ
194+
195+
Math-mode Unicode (sub|super)script characters will render as if you had written regular characters in a subscript or superscript. For instance, `A²⁺³` will render the same as `A^{2+3}`.
193196

194197
</div>
195198
<div class="katex-cards" id="math-alpha">

src/Parser.js

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import ParseError from "./ParseError";
99
import {combiningDiacriticalMarksEndRegex} from "./Lexer";
1010
import Settings from "./Settings";
1111
import SourceLocation from "./SourceLocation";
12+
import {uSubsAndSups, unicodeSubRegEx} from "./unicodeSupOrSub";
1213
import {Token} from "./Token";
1314

1415
// Pre-evaluate both modules as unicodeSymbols require String.normalize()
@@ -399,6 +400,29 @@ export default class Parser {
399400
}
400401
// Put everything into an ordgroup as the superscript
401402
superscript = {type: "ordgroup", mode: this.mode, body: primes};
403+
} else if (uSubsAndSups[lex.text]) {
404+
// A Unicode subscript or superscript character.
405+
// We treat these similarly to the unicode-math package.
406+
// So we render a string of Unicode (sub|super)scripts the
407+
// same as a (sub|super)script of regular characters.
408+
let str = uSubsAndSups[lex.text];
409+
const isSub = unicodeSubRegEx.test(lex.text);
410+
this.consume();
411+
// Continue fetching tokens to fill out the string.
412+
while (true) {
413+
const token = this.fetch().text;
414+
if (!(uSubsAndSups[token])) { break; }
415+
if (unicodeSubRegEx.test(token) !== isSub) { break; }
416+
this.consume();
417+
str += uSubsAndSups[token];
418+
}
419+
// Now create a (sub|super)script.
420+
const body = (new Parser(str, this.settings)).parse();
421+
if (isSub) {
422+
subscript = {type: "ordgroup", mode: "math", body};
423+
} else {
424+
superscript = {type: "ordgroup", mode: "math", body};
425+
}
402426
} else {
403427
// If it wasn't ^, _, or ', stop parsing super/subscripts
404428
break;

src/unicodeSupOrSub.js

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Helpers for Parser.js handling of Unicode (sub|super)script characters.
2+
3+
/**
 * Matches a string that begins with a Unicode *subscript* character, i.e. one
 * of the subscript keys of `uSubsAndSups`. Callers use it to tell a subscript
 * apart from a superscript once the character is known to be in that table.
 *
 * The class is spelled with \u escapes (rather than literal glyphs) so that it
 * survives tools that strip or re-encode non-ASCII characters:
 *   \u2080-\u208E  subscript digits 0-9 and + - = ( )
 *   \u2090-\u2093  subscript a e o x
 *   \u2095-\u209C  subscript h k l m n p s t
 *   \u1D62-\u1D6A  subscript i r u v and Greek beta, gamma, rho, phi, chi
 *   \u2C7C         subscript j
 */
export const unicodeSubRegEx =
    /^[\u2080-\u208E\u2090-\u2093\u2095-\u209C\u1D62-\u1D6A\u2C7C]/;
4+
5+
/**
 * Maps each supported Unicode subscript or superscript character to the
 * regular character it stands for (e.g. '₂' -> '2', '²' -> '2').
 *
 * The table is built on a null-prototype object so that a lookup such as
 * `uSubsAndSups[text]` only ever sees the entries listed here; names inherited
 * from Object.prototype ("constructor", "toString", ...) yield `undefined`
 * instead of a truthy value. The result is frozen so it cannot be mutated.
 */
export const uSubsAndSups = Object.freeze(Object.assign(Object.create(null), {
    // Subscripts: operators and parentheses
    '₊': '+',
    '₋': '-',
    '₌': '=',
    '₍': '(',
    '₎': ')',
    // Subscripts: digits
    '₀': '0',
    '₁': '1',
    '₂': '2',
    '₃': '3',
    '₄': '4',
    '₅': '5',
    '₆': '6',
    '₇': '7',
    '₈': '8',
    '₉': '9',
    // Subscripts: Latin letters
    '\u2090': 'a',
    '\u2091': 'e',
    '\u2095': 'h',
    '\u1D62': 'i',
    '\u2C7C': 'j',
    '\u2096': 'k',
    '\u2097': 'l',
    '\u2098': 'm',
    '\u2099': 'n',
    '\u2092': 'o',
    '\u209A': 'p',
    '\u1D63': 'r',
    '\u209B': 's',
    '\u209C': 't',
    '\u1D64': 'u',
    '\u1D65': 'v',
    '\u2093': 'x',
    // Subscripts: Greek letters
    '\u1D66': 'β',
    '\u1D67': 'γ',
    '\u1D68': 'ρ',
    '\u1D69': '\u03d5',
    '\u1D6A': 'χ',
    // Superscripts: operators and parentheses
    '⁺': '+',
    '⁻': '-',
    '⁼': '=',
    '⁽': '(',
    '⁾': ')',
    // Superscripts: digits
    '⁰': '0',
    '¹': '1',
    '²': '2',
    '³': '3',
    '⁴': '4',
    '⁵': '5',
    '⁶': '6',
    '⁷': '7',
    '⁸': '8',
    '⁹': '9',
    // Superscripts: uppercase Latin letters
    '\u1D2C': 'A',
    '\u1D2E': 'B',
    '\u1D30': 'D',
    '\u1D31': 'E',
    '\u1D33': 'G',
    '\u1D34': 'H',
    '\u1D35': 'I',
    '\u1D36': 'J',
    '\u1D37': 'K',
    '\u1D38': 'L',
    '\u1D39': 'M',
    '\u1D3A': 'N',
    '\u1D3C': 'O',
    '\u1D3E': 'P',
    '\u1D3F': 'R',
    '\u1D40': 'T',
    '\u1D41': 'U',
    '\u2C7D': 'V',
    '\u1D42': 'W',
    // Superscripts: lowercase Latin letters
    '\u1D43': 'a',
    '\u1D47': 'b',
    '\u1D9C': 'c',
    '\u1D48': 'd',
    '\u1D49': 'e',
    '\u1DA0': 'f',
    '\u1D4D': 'g',
    '\u02B0': 'h',
    '\u2071': 'i',
    '\u02B2': 'j',
    '\u1D4F': 'k',
    '\u02E1': 'l',
    '\u1D50': 'm',
    '\u207F': 'n',
    '\u1D52': 'o',
    '\u1D56': 'p',
    '\u02B3': 'r',
    '\u02E2': 's',
    '\u1D57': 't',
    '\u1D58': 'u',
    '\u1D5B': 'v',
    '\u02B7': 'w',
    '\u02E3': 'x',
    '\u02B8': 'y',
    '\u1DBB': 'z',
    // Superscripts: Greek letters
    '\u1D5D': 'β',
    '\u1D5E': 'γ',
    '\u1D5F': 'δ',
    '\u1D60': '\u03d5',
    '\u1D61': 'χ',
    '\u1DBF': 'θ',
}));

test/katex-spec.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,10 @@ describe("A subscript and superscript parser", function() {
275275
expect`x_{x^x}`.toParse();
276276
expect`x_{x_x}`.toParse();
277277
});
278+
279+
it("should work with Unicode (sub|super)script characters", function() {
280+
expect`A² + B²⁺³ + ¹²C + E₂³ + F₂₊₃`.toParseLike("A^{2} + B^{2+3} + ^{12}C + E_{2}^{3} + F_{2+3}");
281+
});
278282
});
279283

280284
describe("A subscript and superscript tree-builder", function() {

0 commit comments

Comments
 (0)