Skip to content

Commit 3da4d51

Browse files
BridgeAR authored and codebytere committed
util: improve unicode support
The array grouping function relies on the width of the characters. It was not calculated correctly so far, since it used the string length instead. This improves the unicode output by calculating the mono-spaced font width (other fonts might differ). PR-URL: #31319 Reviewed-By: James M Snell <[email protected]> Reviewed-By: Steven R Loomis <[email protected]> Reviewed-By: Rich Trott <[email protected]> Reviewed-By: Minwoo Jung <[email protected]>
1 parent 1bcf2f9 commit 3da4d51

File tree

11 files changed

+211
-192
lines changed

11 files changed

+211
-192
lines changed

lib/internal/cli_table.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const {
66
ObjectPrototypeHasOwnProperty,
77
} = primordials;
88

9-
const { getStringWidth } = require('internal/readline/utils');
9+
const { getStringWidth } = require('internal/util/inspect');
1010

1111
// The use of Unicode characters below is the only non-comment use of non-ASCII
1212
// Unicode characters in Node.js built-in modules. If they are ever removed or

lib/internal/readline/utils.js

Lines changed: 0 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,13 @@
11
'use strict';
22

33
const {
4-
RegExp,
54
Symbol,
65
} = primordials;
76

8-
// Regex used for ansi escape code splitting
9-
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
10-
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
11-
// Matches all ansi escape code sequences in a string
12-
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
13-
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
14-
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
15-
const ansi = new RegExp(ansiPattern, 'g');
16-
177
const kUTF16SurrogateThreshold = 0x10000; // 2 ** 16
188
const kEscape = '\x1b';
199
const kSubstringSearch = Symbol('kSubstringSearch');
2010

21-
let getStringWidth;
22-
2311
function CSI(strings, ...args) {
2412
let ret = `${kEscape}[`;
2513
for (let n = 0; n < strings.length; n++) {
@@ -59,109 +47,6 @@ function charLengthAt(str, i) {
5947
return str.codePointAt(i) >= kUTF16SurrogateThreshold ? 2 : 1;
6048
}
6149

62-
if (internalBinding('config').hasIntl) {
63-
const icu = internalBinding('icu');
64-
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
65-
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
66-
// TODO(BridgeAR): Expose the options to the user. That is probably the
67-
// best thing possible at the moment, since it's difficult to know what
68-
// the receiving end supports.
69-
getStringWidth = function getStringWidth(str) {
70-
let width = 0;
71-
str = stripVTControlCharacters(str);
72-
for (let i = 0; i < str.length; i++) {
73-
// Try to avoid calling into C++ by first handling the ASCII portion of
74-
// the string. If it is fully ASCII, we skip the C++ part.
75-
const code = str.charCodeAt(i);
76-
if (code >= 127) {
77-
width += icu.getStringWidth(str.slice(i));
78-
break;
79-
}
80-
width += code >= 32 ? 1 : 0;
81-
}
82-
return width;
83-
};
84-
} else {
85-
/**
86-
* Returns the number of columns required to display the given string.
87-
*/
88-
getStringWidth = function getStringWidth(str) {
89-
let width = 0;
90-
91-
str = stripVTControlCharacters(str);
92-
93-
for (const char of str) {
94-
const code = char.codePointAt(0);
95-
if (isFullWidthCodePoint(code)) {
96-
width += 2;
97-
} else if (!isZeroWidthCodePoint(code)) {
98-
width++;
99-
}
100-
}
101-
102-
return width;
103-
};
104-
105-
/**
106-
* Returns true if the character represented by a given
107-
* Unicode code point is full-width. Otherwise returns false.
108-
*/
109-
const isFullWidthCodePoint = (code) => {
110-
// Code points are partially derived from:
111-
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
112-
return code >= 0x1100 && (
113-
code <= 0x115f || // Hangul Jamo
114-
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
115-
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
116-
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
117-
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
118-
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
119-
(code >= 0x3250 && code <= 0x4dbf) ||
120-
// CJK Unified Ideographs .. Yi Radicals
121-
(code >= 0x4e00 && code <= 0xa4c6) ||
122-
// Hangul Jamo Extended-A
123-
(code >= 0xa960 && code <= 0xa97c) ||
124-
// Hangul Syllables
125-
(code >= 0xac00 && code <= 0xd7a3) ||
126-
// CJK Compatibility Ideographs
127-
(code >= 0xf900 && code <= 0xfaff) ||
128-
// Vertical Forms
129-
(code >= 0xfe10 && code <= 0xfe19) ||
130-
// CJK Compatibility Forms .. Small Form Variants
131-
(code >= 0xfe30 && code <= 0xfe6b) ||
132-
// Halfwidth and Fullwidth Forms
133-
(code >= 0xff01 && code <= 0xff60) ||
134-
(code >= 0xffe0 && code <= 0xffe6) ||
135-
// Kana Supplement
136-
(code >= 0x1b000 && code <= 0x1b001) ||
137-
// Enclosed Ideographic Supplement
138-
(code >= 0x1f200 && code <= 0x1f251) ||
139-
// Miscellaneous Symbols and Pictographs .. Emoticons
140-
(code >= 0x1f300 && code <= 0x1f64f) ||
141-
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
142-
(code >= 0x20000 && code <= 0x3fffd)
143-
);
144-
};
145-
146-
const isZeroWidthCodePoint = (code) => {
147-
return code <= 0x1F || // C0 control codes
148-
(code > 0x7F && code <= 0x9F) || // C1 control codes
149-
(code >= 0x0300 && code <= 0x036F) || // Combining Diacritical Marks
150-
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
151-
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
152-
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
153-
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
154-
};
155-
}
156-
157-
/**
158-
* Tries to remove all VT control characters. Use to estimate displayed
159-
* string width. May be buggy due to not running a real state machine
160-
*/
161-
function stripVTControlCharacters(str) {
162-
return str.replace(ansi, '');
163-
}
164-
16550
/*
16651
Some patterns seen in terminal key escape codes, derived from combos seen
16752
at http://www.midnight-commander.org/browser/lib/tty/key.c
@@ -477,8 +362,6 @@ module.exports = {
477362
charLengthLeft,
478363
commonPrefix,
479364
emitKeys,
480-
getStringWidth,
481365
kSubstringSearch,
482-
stripVTControlCharacters,
483366
CSI
484367
};

lib/internal/repl/utils.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@ const {
3232

3333
const {
3434
commonPrefix,
35-
getStringWidth,
3635
kSubstringSearch,
3736
} = require('internal/readline/utils');
3837

39-
const { inspect } = require('util');
38+
const {
39+
getStringWidth,
40+
inspect,
41+
} = require('internal/util/inspect');
4042

4143
const debug = require('internal/util/debuglog').debuglog('repl');
4244

lib/internal/util/inspect.js

Lines changed: 122 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,17 @@ const meta = [
190190
'', '', '', '', '', '', '', '\\\\'
191191
];
192192

193+
// Regex used for ansi escape code splitting
194+
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
195+
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
196+
// Matches all ansi escape code sequences in a string
197+
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
198+
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
199+
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
200+
const ansi = new RegExp(ansiPattern, 'g');
201+
202+
let getStringWidth;
203+
193204
function getUserOptions(ctx) {
194205
return {
195206
stylize: ctx.stylize,
@@ -1149,7 +1160,7 @@ function groupArrayElements(ctx, output, value) {
11491160
// entries length of all output entries. We have to remove colors first,
11501161
// otherwise the length would not be calculated properly.
11511162
for (; i < outputLength; i++) {
1152-
const len = ctx.colors ? removeColors(output[i]).length : output[i].length;
1163+
const len = getStringWidth(output[i], ctx.colors);
11531164
dataLen[i] = len;
11541165
totalLength += len + separatorSpace;
11551166
if (maxLength < len)
@@ -1192,8 +1203,6 @@ function groupArrayElements(ctx, output, value) {
11921203
if (columns <= 1) {
11931204
return output;
11941205
}
1195-
// TODO(BridgeAR): Add unicode support. Use the readline getStringWidth
1196-
// function.
11971206
const tmp = [];
11981207
const maxLineLength = [];
11991208
for (let i = 0; i < columns; i++) {
@@ -1560,11 +1569,8 @@ function formatProperty(ctx, value, recurseTimes, key, type, desc) {
15601569
const diff = (ctx.compact !== true || type !== kObjectType) ? 2 : 3;
15611570
ctx.indentationLvl += diff;
15621571
str = formatValue(ctx, desc.value, recurseTimes);
1563-
if (diff === 3) {
1564-
const len = ctx.colors ? removeColors(str).length : str.length;
1565-
if (ctx.breakLength < len) {
1566-
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
1567-
}
1572+
if (diff === 3 && ctx.breakLength < getStringWidth(str, ctx.colors)) {
1573+
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
15681574
}
15691575
ctx.indentationLvl -= diff;
15701576
} else if (desc.get !== undefined) {
@@ -1884,9 +1890,116 @@ function formatWithOptionsInternal(inspectOptions, ...args) {
18841890
return str;
18851891
}
18861892

1893+
if (internalBinding('config').hasIntl) {
1894+
const icu = internalBinding('icu');
1895+
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
1896+
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
1897+
// TODO(BridgeAR): Expose the options to the user. That is probably the
1898+
// best thing possible at the moment, since it's difficult to know what
1899+
// the receiving end supports.
1900+
getStringWidth = function getStringWidth(str, removeControlChars = true) {
1901+
let width = 0;
1902+
if (removeControlChars)
1903+
str = stripVTControlCharacters(str);
1904+
for (let i = 0; i < str.length; i++) {
1905+
// Try to avoid calling into C++ by first handling the ASCII portion of
1906+
// the string. If it is fully ASCII, we skip the C++ part.
1907+
const code = str.charCodeAt(i);
1908+
if (code >= 127) {
1909+
width += icu.getStringWidth(str.slice(i));
1910+
break;
1911+
}
1912+
width += code >= 32 ? 1 : 0;
1913+
}
1914+
return width;
1915+
};
1916+
} else {
1917+
/**
1918+
* Returns the number of columns required to display the given string.
1919+
*/
1920+
getStringWidth = function getStringWidth(str, removeControlChars = true) {
1921+
let width = 0;
1922+
1923+
if (removeControlChars)
1924+
str = stripVTControlCharacters(str);
1925+
1926+
for (const char of str) {
1927+
const code = char.codePointAt(0);
1928+
if (isFullWidthCodePoint(code)) {
1929+
width += 2;
1930+
} else if (!isZeroWidthCodePoint(code)) {
1931+
width++;
1932+
}
1933+
}
1934+
1935+
return width;
1936+
};
1937+
1938+
/**
1939+
* Returns true if the character represented by a given
1940+
* Unicode code point is full-width. Otherwise returns false.
1941+
*/
1942+
const isFullWidthCodePoint = (code) => {
1943+
// Code points are partially derived from:
1944+
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
1945+
return code >= 0x1100 && (
1946+
code <= 0x115f || // Hangul Jamo
1947+
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
1948+
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
1949+
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
1950+
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
1951+
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
1952+
(code >= 0x3250 && code <= 0x4dbf) ||
1953+
// CJK Unified Ideographs .. Yi Radicals
1954+
(code >= 0x4e00 && code <= 0xa4c6) ||
1955+
// Hangul Jamo Extended-A
1956+
(code >= 0xa960 && code <= 0xa97c) ||
1957+
// Hangul Syllables
1958+
(code >= 0xac00 && code <= 0xd7a3) ||
1959+
// CJK Compatibility Ideographs
1960+
(code >= 0xf900 && code <= 0xfaff) ||
1961+
// Vertical Forms
1962+
(code >= 0xfe10 && code <= 0xfe19) ||
1963+
// CJK Compatibility Forms .. Small Form Variants
1964+
(code >= 0xfe30 && code <= 0xfe6b) ||
1965+
// Halfwidth and Fullwidth Forms
1966+
(code >= 0xff01 && code <= 0xff60) ||
1967+
(code >= 0xffe0 && code <= 0xffe6) ||
1968+
// Kana Supplement
1969+
(code >= 0x1b000 && code <= 0x1b001) ||
1970+
// Enclosed Ideographic Supplement
1971+
(code >= 0x1f200 && code <= 0x1f251) ||
1972+
// Miscellaneous Symbols and Pictographs 0x1f300 - 0x1f5ff
1973+
// Emoticons 0x1f600 - 0x1f64f
1974+
(code >= 0x1f300 && code <= 0x1f64f) ||
1975+
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
1976+
(code >= 0x20000 && code <= 0x3fffd)
1977+
);
1978+
};
1979+
1980+
const isZeroWidthCodePoint = (code) => {
1981+
return code <= 0x1F || // C0 control codes
1982+
(code > 0x7F && code <= 0x9F) || // C1 control codes
1983+
(code >= 0x300 && code <= 0x36F) || // Combining Diacritical Marks
1984+
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
1985+
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
1986+
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
1987+
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
1988+
};
1989+
}
1990+
1991+
/**
1992+
* Remove all VT control characters. Use to estimate displayed string width.
1993+
*/
1994+
function stripVTControlCharacters(str) {
1995+
return str.replace(ansi, '');
1996+
}
1997+
18871998
module.exports = {
18881999
inspect,
18892000
format,
18902001
formatWithOptions,
1891-
inspectDefaultOptions
2002+
getStringWidth,
2003+
inspectDefaultOptions,
2004+
stripVTControlCharacters
18922005
};

lib/readline.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,17 +46,19 @@ const {
4646
ERR_INVALID_OPT_VALUE
4747
} = require('internal/errors').codes;
4848
const { validateString } = require('internal/validators');
49-
const { inspect } = require('internal/util/inspect');
49+
const {
50+
inspect,
51+
getStringWidth,
52+
stripVTControlCharacters,
53+
} = require('internal/util/inspect');
5054
const EventEmitter = require('events');
5155
const {
5256
charLengthAt,
5357
charLengthLeft,
5458
commonPrefix,
5559
CSI,
5660
emitKeys,
57-
getStringWidth,
5861
kSubstringSearch,
59-
stripVTControlCharacters
6062
} = require('internal/readline/utils');
6163

6264
const { clearTimeout, setTimeout } = require('timers');

0 commit comments

Comments
 (0)