Skip to content

Commit 7f34dda

Browse files
BridgeAR, targos
authored and committed
util: improve unicode support
The array grouping function relies on the width of the characters. Until now that width was not calculated correctly, because the string length was used instead. This improves the Unicode output by calculating the width as rendered in a monospaced font (other fonts might differ). PR-URL: nodejs#31319 Reviewed-By: James M Snell <[email protected]> Reviewed-By: Steven R Loomis <[email protected]> Reviewed-By: Rich Trott <[email protected]> Reviewed-By: Minwoo Jung <[email protected]>
1 parent 644b3c8 commit 7f34dda

File tree

11 files changed

+211
-192
lines changed

11 files changed

+211
-192
lines changed

lib/internal/cli_table.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const {
66
ObjectPrototypeHasOwnProperty,
77
} = primordials;
88

9-
const { getStringWidth } = require('internal/readline/utils');
9+
const { getStringWidth } = require('internal/util/inspect');
1010

1111
// The use of Unicode characters below is the only non-comment use of non-ASCII
1212
// Unicode characters in Node.js built-in modules. If they are ever removed or

lib/internal/readline/utils.js

Lines changed: 0 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,13 @@
11
'use strict';
22

33
const {
4-
RegExp,
54
Symbol,
65
} = primordials;
76

8-
// Regex used for ansi escape code splitting
9-
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
10-
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
11-
// Matches all ansi escape code sequences in a string
12-
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
13-
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
14-
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
15-
const ansi = new RegExp(ansiPattern, 'g');
16-
177
const kUTF16SurrogateThreshold = 0x10000; // 2 ** 16
188
const kEscape = '\x1b';
199
const kSubstringSearch = Symbol('kSubstringSearch');
2010

21-
let getStringWidth;
22-
2311
function CSI(strings, ...args) {
2412
let ret = `${kEscape}[`;
2513
for (let n = 0; n < strings.length; n++) {
@@ -59,109 +47,6 @@ function charLengthAt(str, i) {
5947
return str.codePointAt(i) >= kUTF16SurrogateThreshold ? 2 : 1;
6048
}
6149

62-
if (internalBinding('config').hasIntl) {
63-
const icu = internalBinding('icu');
64-
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
65-
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
66-
// TODO(BridgeAR): Expose the options to the user. That is probably the
67-
// best thing possible at the moment, since it's difficult to know what
68-
// the receiving end supports.
69-
getStringWidth = function getStringWidth(str) {
70-
let width = 0;
71-
str = stripVTControlCharacters(str);
72-
for (let i = 0; i < str.length; i++) {
73-
// Try to avoid calling into C++ by first handling the ASCII portion of
74-
// the string. If it is fully ASCII, we skip the C++ part.
75-
const code = str.charCodeAt(i);
76-
if (code >= 127) {
77-
width += icu.getStringWidth(str.slice(i));
78-
break;
79-
}
80-
width += code >= 32 ? 1 : 0;
81-
}
82-
return width;
83-
};
84-
} else {
85-
/**
86-
* Returns the number of columns required to display the given string.
87-
*/
88-
getStringWidth = function getStringWidth(str) {
89-
let width = 0;
90-
91-
str = stripVTControlCharacters(str);
92-
93-
for (const char of str) {
94-
const code = char.codePointAt(0);
95-
if (isFullWidthCodePoint(code)) {
96-
width += 2;
97-
} else if (!isZeroWidthCodePoint(code)) {
98-
width++;
99-
}
100-
}
101-
102-
return width;
103-
};
104-
105-
/**
106-
* Returns true if the character represented by a given
107-
* Unicode code point is full-width. Otherwise returns false.
108-
*/
109-
const isFullWidthCodePoint = (code) => {
110-
// Code points are partially derived from:
111-
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
112-
return code >= 0x1100 && (
113-
code <= 0x115f || // Hangul Jamo
114-
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
115-
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
116-
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
117-
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
118-
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
119-
(code >= 0x3250 && code <= 0x4dbf) ||
120-
// CJK Unified Ideographs .. Yi Radicals
121-
(code >= 0x4e00 && code <= 0xa4c6) ||
122-
// Hangul Jamo Extended-A
123-
(code >= 0xa960 && code <= 0xa97c) ||
124-
// Hangul Syllables
125-
(code >= 0xac00 && code <= 0xd7a3) ||
126-
// CJK Compatibility Ideographs
127-
(code >= 0xf900 && code <= 0xfaff) ||
128-
// Vertical Forms
129-
(code >= 0xfe10 && code <= 0xfe19) ||
130-
// CJK Compatibility Forms .. Small Form Variants
131-
(code >= 0xfe30 && code <= 0xfe6b) ||
132-
// Halfwidth and Fullwidth Forms
133-
(code >= 0xff01 && code <= 0xff60) ||
134-
(code >= 0xffe0 && code <= 0xffe6) ||
135-
// Kana Supplement
136-
(code >= 0x1b000 && code <= 0x1b001) ||
137-
// Enclosed Ideographic Supplement
138-
(code >= 0x1f200 && code <= 0x1f251) ||
139-
// Miscellaneous Symbols and Pictographs .. Emoticons
140-
(code >= 0x1f300 && code <= 0x1f64f) ||
141-
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
142-
(code >= 0x20000 && code <= 0x3fffd)
143-
);
144-
};
145-
146-
const isZeroWidthCodePoint = (code) => {
147-
return code <= 0x1F || // C0 control codes
148-
(code > 0x7F && code <= 0x9F) || // C1 control codes
149-
(code >= 0x0300 && code <= 0x036F) || // Combining Diacritical Marks
150-
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
151-
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
152-
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
153-
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
154-
};
155-
}
156-
157-
/**
158-
* Tries to remove all VT control characters. Use to estimate displayed
159-
* string width. May be buggy due to not running a real state machine
160-
*/
161-
function stripVTControlCharacters(str) {
162-
return str.replace(ansi, '');
163-
}
164-
16550
/*
16651
Some patterns seen in terminal key escape codes, derived from combos seen
16752
at http://www.midnight-commander.org/browser/lib/tty/key.c
@@ -477,8 +362,6 @@ module.exports = {
477362
charLengthLeft,
478363
commonPrefix,
479364
emitKeys,
480-
getStringWidth,
481365
kSubstringSearch,
482-
stripVTControlCharacters,
483366
CSI
484367
};

lib/internal/repl/utils.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@ const {
3232

3333
const {
3434
commonPrefix,
35-
getStringWidth,
3635
kSubstringSearch,
3736
} = require('internal/readline/utils');
3837

39-
const { inspect } = require('util');
38+
const {
39+
getStringWidth,
40+
inspect,
41+
} = require('internal/util/inspect');
4042

4143
const debug = require('internal/util/debuglog').debuglog('repl');
4244

lib/internal/util/inspect.js

Lines changed: 122 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,17 @@ const meta = [
193193
'', '', '', '', '', '', '', '\\\\'
194194
];
195195

196+
// Regex used for ansi escape code splitting
197+
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
198+
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
199+
// Matches all ansi escape code sequences in a string
200+
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
201+
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
202+
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
203+
const ansi = new RegExp(ansiPattern, 'g');
204+
205+
let getStringWidth;
206+
196207
function getUserOptions(ctx) {
197208
return {
198209
stylize: ctx.stylize,
@@ -1163,7 +1174,7 @@ function groupArrayElements(ctx, output, value) {
11631174
// entries length of all output entries. We have to remove colors first,
11641175
// otherwise the length would not be calculated properly.
11651176
for (; i < outputLength; i++) {
1166-
const len = ctx.colors ? removeColors(output[i]).length : output[i].length;
1177+
const len = getStringWidth(output[i], ctx.colors);
11671178
dataLen[i] = len;
11681179
totalLength += len + separatorSpace;
11691180
if (maxLength < len)
@@ -1206,8 +1217,6 @@ function groupArrayElements(ctx, output, value) {
12061217
if (columns <= 1) {
12071218
return output;
12081219
}
1209-
// TODO(BridgeAR): Add unicode support. Use the readline getStringWidth
1210-
// function.
12111220
const tmp = [];
12121221
const maxLineLength = [];
12131222
for (let i = 0; i < columns; i++) {
@@ -1582,11 +1591,8 @@ function formatProperty(ctx, value, recurseTimes, key, type, desc) {
15821591
const diff = (ctx.compact !== true || type !== kObjectType) ? 2 : 3;
15831592
ctx.indentationLvl += diff;
15841593
str = formatValue(ctx, desc.value, recurseTimes);
1585-
if (diff === 3) {
1586-
const len = ctx.colors ? removeColors(str).length : str.length;
1587-
if (ctx.breakLength < len) {
1588-
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
1589-
}
1594+
if (diff === 3 && ctx.breakLength < getStringWidth(str, ctx.colors)) {
1595+
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
15901596
}
15911597
ctx.indentationLvl -= diff;
15921598
} else if (desc.get !== undefined) {
@@ -1898,9 +1904,116 @@ function formatWithOptions(inspectOptions, ...args) {
18981904
return str;
18991905
}
19001906

1907+
if (internalBinding('config').hasIntl) {
1908+
const icu = internalBinding('icu');
1909+
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
1910+
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
1911+
// TODO(BridgeAR): Expose the options to the user. That is probably the
1912+
// best thing possible at the moment, since it's difficult to know what
1913+
// the receiving end supports.
1914+
getStringWidth = function getStringWidth(str, removeControlChars = true) {
1915+
let width = 0;
1916+
if (removeControlChars)
1917+
str = stripVTControlCharacters(str);
1918+
for (let i = 0; i < str.length; i++) {
1919+
// Try to avoid calling into C++ by first handling the ASCII portion of
1920+
// the string. If it is fully ASCII, we skip the C++ part.
1921+
const code = str.charCodeAt(i);
1922+
if (code >= 127) {
1923+
width += icu.getStringWidth(str.slice(i));
1924+
break;
1925+
}
1926+
width += code >= 32 ? 1 : 0;
1927+
}
1928+
return width;
1929+
};
1930+
} else {
1931+
/**
1932+
* Returns the number of columns required to display the given string.
1933+
*/
1934+
getStringWidth = function getStringWidth(str, removeControlChars = true) {
1935+
let width = 0;
1936+
1937+
if (removeControlChars)
1938+
str = stripVTControlCharacters(str);
1939+
1940+
for (const char of str) {
1941+
const code = char.codePointAt(0);
1942+
if (isFullWidthCodePoint(code)) {
1943+
width += 2;
1944+
} else if (!isZeroWidthCodePoint(code)) {
1945+
width++;
1946+
}
1947+
}
1948+
1949+
return width;
1950+
};
1951+
1952+
/**
1953+
* Returns true if the character represented by a given
1954+
* Unicode code point is full-width. Otherwise returns false.
1955+
*/
1956+
const isFullWidthCodePoint = (code) => {
1957+
// Code points are partially derived from:
1958+
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
1959+
return code >= 0x1100 && (
1960+
code <= 0x115f || // Hangul Jamo
1961+
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
1962+
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
1963+
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
1964+
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
1965+
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
1966+
(code >= 0x3250 && code <= 0x4dbf) ||
1967+
// CJK Unified Ideographs .. Yi Radicals
1968+
(code >= 0x4e00 && code <= 0xa4c6) ||
1969+
// Hangul Jamo Extended-A
1970+
(code >= 0xa960 && code <= 0xa97c) ||
1971+
// Hangul Syllables
1972+
(code >= 0xac00 && code <= 0xd7a3) ||
1973+
// CJK Compatibility Ideographs
1974+
(code >= 0xf900 && code <= 0xfaff) ||
1975+
// Vertical Forms
1976+
(code >= 0xfe10 && code <= 0xfe19) ||
1977+
// CJK Compatibility Forms .. Small Form Variants
1978+
(code >= 0xfe30 && code <= 0xfe6b) ||
1979+
// Halfwidth and Fullwidth Forms
1980+
(code >= 0xff01 && code <= 0xff60) ||
1981+
(code >= 0xffe0 && code <= 0xffe6) ||
1982+
// Kana Supplement
1983+
(code >= 0x1b000 && code <= 0x1b001) ||
1984+
// Enclosed Ideographic Supplement
1985+
(code >= 0x1f200 && code <= 0x1f251) ||
1986+
// Miscellaneous Symbols and Pictographs 0x1f300 - 0x1f5ff
1987+
// Emoticons 0x1f600 - 0x1f64f
1988+
(code >= 0x1f300 && code <= 0x1f64f) ||
1989+
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
1990+
(code >= 0x20000 && code <= 0x3fffd)
1991+
);
1992+
};
1993+
1994+
const isZeroWidthCodePoint = (code) => {
1995+
return code <= 0x1F || // C0 control codes
1996+
(code > 0x7F && code <= 0x9F) || // C1 control codes
1997+
(code >= 0x300 && code <= 0x36F) || // Combining Diacritical Marks
1998+
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
1999+
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
2000+
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
2001+
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
2002+
};
2003+
}
2004+
2005+
/**
2006+
* Remove all VT control characters. Use to estimate displayed string width.
2007+
*/
2008+
function stripVTControlCharacters(str) {
2009+
return str.replace(ansi, '');
2010+
}
2011+
19012012
module.exports = {
19022013
inspect,
19032014
format,
19042015
formatWithOptions,
1905-
inspectDefaultOptions
2016+
getStringWidth,
2017+
inspectDefaultOptions,
2018+
stripVTControlCharacters
19062019
};

lib/readline.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,17 +46,19 @@ const {
4646
ERR_INVALID_OPT_VALUE
4747
} = require('internal/errors').codes;
4848
const { validateString } = require('internal/validators');
49-
const { inspect } = require('internal/util/inspect');
49+
const {
50+
inspect,
51+
getStringWidth,
52+
stripVTControlCharacters,
53+
} = require('internal/util/inspect');
5054
const EventEmitter = require('events');
5155
const {
5256
charLengthAt,
5357
charLengthLeft,
5458
commonPrefix,
5559
CSI,
5660
emitKeys,
57-
getStringWidth,
5861
kSubstringSearch,
59-
stripVTControlCharacters
6062
} = require('internal/readline/utils');
6163

6264
const { clearTimeout, setTimeout } = require('timers');

0 commit comments

Comments
 (0)