Skip to content

Commit

Permalink
icu: add icu module
Browse files Browse the repository at this point in the history
* Moves transcode and normalize to buffer module statics
* Moves getCharacterProperty and getColumnWidth to util
  • Loading branch information
jasnell committed Sep 27, 2016
1 parent 6b443d1 commit 3d261d3
Show file tree
Hide file tree
Showing 13 changed files with 1,048 additions and 99 deletions.
21 changes: 21 additions & 0 deletions doc/api/buffer.md
Original file line number Diff line number Diff line change
Expand Up @@ -2304,6 +2304,27 @@ added: v3.0.0
On 32-bit architectures, this value is `(2^30)-1` (~1GB).
On 64-bit architectures, this value is `(2^31)-1` (~2GB).

## buffer.normalize(buf, form[, encoding])

* `buf` {Buffer} A `Buffer` instance
* `form` {String} A Unicode normalization form (one of: `'NFC'`, `'NFD'`,
`'NFKC'`, or `'NFKD'`)
* `encoding` {String} The source character encoding of the `buf`. Defaults to
`'utf8'`

Performs Unicode normalization on `buf` and returns a new `Buffer` instance
containing the UTF-8 encoded result. Throws if `form` does not specify a
valid normalization form or if the normalization cannot be successfully applied.

## buffer.transcode(buf, from_enc, to_enc)

* `buf` {Buffer} A `Buffer` instance
* `from_enc` {string} The current encoding
* `to_enc` {string} The target encoding

Re-encodes the given `Buffer` from one character encoding to another. Returns
a new `Buffer` instance.

## Class: SlowBuffer
<!-- YAML
deprecated: v6.0.0
Expand Down
18 changes: 18 additions & 0 deletions doc/api/util.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ module developers as well. It can be accessed using:
const util = require('util');
```

## util.constants

Constants for use with `util.getCharacterProperty()`.

## util.debuglog(section)
<!-- YAML
added: v0.11.3
Expand Down Expand Up @@ -133,6 +137,20 @@ Each argument is converted to a string using `util.inspect()`.
util.format(1, 2, 3); // '1 2 3'
```

## util.getCharacterProperty(codepoint, property)

* `codepoint` {number} A Unicode codepoint value
* `property` {number} A Unicode character property identifier (one of the `util.constants.*` values)

Returns a specific Unicode codepoint property for the given codepoint value.

## util.getColumnWidth(cp)

* `cp` {number | String} A Unicode codepoint value or a String

Returns the number of terminal columns to be used to display the given Unicode
codepoint or string.

## util.inherits(constructor, superConstructor)
<!-- YAML
added: v0.3.0
Expand Down
3 changes: 3 additions & 0 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const binding = process.binding('buffer');
const { isArrayBuffer, isSharedArrayBuffer } = process.binding('util');
const bindingObj = {};
const internalUtil = require('internal/util');
const internalBuffer = require('internal/buffer');

class FastBuffer extends Uint8Array {
constructor(arg1, arg2, arg3) {
Expand All @@ -19,6 +20,8 @@ exports.Buffer = Buffer;
exports.SlowBuffer = SlowBuffer;
exports.INSPECT_MAX_BYTES = 50;
exports.kMaxLength = binding.kMaxLength;
// Static helpers re-exported from internal/buffer.
// NOTE(review): internal/buffer only assigns `transcode` and `normalize`
// inside its `hasIntl` branch, so presumably both are `undefined` on builds
// without Intl support — confirm and document.
// NOTE(review): internal/buffer does `require('buffer').Buffer` at load time
// while this file requires internal/buffer near its top; confirm the
// circular load order gives that module a defined `Buffer`.
exports.transcode = internalBuffer.transcode;
exports.normalize = internalBuffer.normalize;

// Message text listing the value types accepted as a first argument.
const kFromErrorMsg = 'First argument must be a string, Buffer, '.concat(
  'ArrayBuffer, Array, or array-like object.'
);
Expand Down
96 changes: 96 additions & 0 deletions lib/internal/buffer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
'use strict';

const Buffer = require('buffer').Buffer;
const normalizeEncoding = require('internal/util').normalizeEncoding;

if (process.binding('config').hasIntl) {

const icu = process.binding('icu');

// UCS-2 / UTF-16LE input is consumed two bytes at a time, so a source with an
// odd byte length is rejected before it is handed to ICU.
const checkUcs2Length = (source) => {
  if (source.length % 2 !== 0)
    throw new TypeError('Invalid UCS2 Buffer');
};

// Table of the supported transcoding conversions. The outer key is the
// source encoding (from_enc) and the inner key is the target encoding
// (to_enc). Each entry is the function that performs that conversion: it
// receives the source Buffer and returns the converted result.
const conversions = {
  ascii: {
    // These two targets are handled as a plain byte-for-byte copy.
    latin1: (source) => Buffer.from(source),
    utf8: (source) => Buffer.from(source),
    utf16le: (source) => icu.convertToUcs2('us-ascii', source)
  },
  latin1: {
    ascii: (source) => icu.convert('us-ascii', 'iso8859-1', source),
    utf8: (source) => icu.convert('utf-8', 'iso8859-1', source),
    utf16le: (source) => icu.convertToUcs2('iso8859-1', source)
  },
  utf8: {
    ascii: (source) => icu.convert('us-ascii', 'utf-8', source),
    latin1: (source) => icu.convert('iso-8859-1', 'utf-8', source),
    // The binding function is used directly as the table entry.
    utf16le: icu.convertToUcs2FromUtf8,
  },
  utf16le: {
    ascii: (source) => {
      checkUcs2Length(source);
      return icu.convertFromUcs2('us-ascii', source);
    },
    latin1: (source) => {
      checkUcs2Length(source);
      return icu.convertFromUcs2('iso-8859-1', source);
    },
    utf8: (source) => {
      checkUcs2Length(source);
      return icu.convertToUtf8FromUcs2(source);
    }
  }
};

// Transcodes the Buffer from one encoding to another, returning a new
// Buffer instance.
exports.transcode = function transcode(source, from_enc, to_enc) {
if (!source || !(source.buffer instanceof ArrayBuffer))
throw new TypeError('"source" argument must be a Buffer');
if (source.length === 0) return Buffer.alloc(0);

from_enc = normalizeEncoding(from_enc) || from_enc;
to_enc = normalizeEncoding(to_enc) || to_enc;

if (from_enc === to_enc)
return Buffer.from(source);

const cnv_from = conversions[from_enc];

if (cnv_from) {
const cnv_to = cnv_from[to_enc];
if (cnv_to)
return cnv_to(source);
}
throw new TypeError(`Unsupported conversion: ${from_enc} to ${to_enc}`);
};

// Perform Unicode Normalization on the Buffer.
exports.normalize = function normalize(buf, form, encoding) {
if (!buf || !(buf.buffer instanceof ArrayBuffer))
throw new TypeError('First argument must be a Buffer');
encoding = normalizeEncoding(encoding);
if (encoding === 'ascii')
encoding == 'us-ascii';
return icu.normalize(buf, encoding, String(form).toUpperCase());
};

}
172 changes: 99 additions & 73 deletions lib/internal/readline.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,102 +2,128 @@

// Regexes used for ansi escape code splitting
// eslint-disable-next-line no-control-regex
const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/;
const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [
'(\\d+)(?:;(\\d+))?([~^$])',
'(?:M([@ #!a`])(.)(.))', // mouse
'(?:1;)?(\\d+)?([a-zA-Z])'
].join('|') + ')');
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
const ansi =
/[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;


module.exports = {
exports = module.exports = {
emitKeys,
getStringWidth,
isFullWidthCodePoint,
stripVTControlCharacters
};


/**
* Returns the number of columns required to display the given string.
*/
function getStringWidth(str) {
let width = 0;

str = stripVTControlCharacters(str);

for (let i = 0; i < str.length; i++) {
const code = str.codePointAt(i);
if (process.binding('config').hasIntl) {
const util = require('util');
exports.getStringWidth = function getStringWidth(str) {
return util.getColumnWidth(stripVTControlCharacters(str));
};

exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
// Defined here largely for legacy support reasons. Updated to
// use character properties rather than fixed ranges.
const eaw =
util.getCharacterProperty(code,
util.constants.UCHAR_EAST_ASIAN_WIDTH);
const emoji =
util.getCharacterProperty(code,
util.constants.UCHAR_EMOJI_PRESENTATION) &&
!util.getCharacterProperty(code,
util.constants.UCHAR_EMOJI_MODIFIER);
return eaw === util.constants.U_EA_FULLWIDTH ||
eaw === util.constants.U_EA_WIDE ||
emoji;
};

} else {
// These old implementations are used as fallbacks only when Node.js
// is compiled without ICU. The getStringWidth implementation here is
// about 30% slower than the ICU based implementation and does not
// work properly for emoji and newer unicode characters. The new impl
// uses ICU's built in character properties data to provide more accurate
// results.
/**
* Returns the number of columns required to display the given string.
*/
function getStringWidth(str) {
let width = 0;

str = stripVTControlCharacters(str);

for (let i = 0; i < str.length; i++) {
const code = str.codePointAt(i);

if (code >= 0x10000) { // surrogates
i++;
}

if (code >= 0x10000) { // surrogates
i++;
if (isFullWidthCodePoint(code)) {
width += 2;
} else {
width++;
}
}

if (isFullWidthCodePoint(code)) {
width += 2;
} else {
width++;
}
return width;
}

return width;
}

/**
* Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false.
*/
function isFullWidthCodePoint(code) {
if (isNaN(code)) {
return false;
}

/**
* Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false.
*/
function isFullWidthCodePoint(code) {
if (isNaN(code)) {
return false;
}
// Code points are derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (code >= 0x1100 && (
code <= 0x115f || // Hangul Jamo
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
0x3250 <= code && code <= 0x4dbf ||
// CJK Unified Ideographs .. Yi Radicals
0x4e00 <= code && code <= 0xa4c6 ||
// Hangul Jamo Extended-A
0xa960 <= code && code <= 0xa97c ||
// Hangul Syllables
0xac00 <= code && code <= 0xd7a3 ||
// CJK Compatibility Ideographs
0xf900 <= code && code <= 0xfaff ||
// Vertical Forms
0xfe10 <= code && code <= 0xfe19 ||
// CJK Compatibility Forms .. Small Form Variants
0xfe30 <= code && code <= 0xfe6b ||
// Halfwidth and Fullwidth Forms
0xff01 <= code && code <= 0xff60 ||
0xffe0 <= code && code <= 0xffe6 ||
// Kana Supplement
0x1b000 <= code && code <= 0x1b001 ||
// Enclosed Ideographic Supplement
0x1f200 <= code && code <= 0x1f251 ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
0x20000 <= code && code <= 0x3fffd)) {
return true;
}

// Code points are derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (code >= 0x1100 && (
code <= 0x115f || // Hangul Jamo
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
0x3250 <= code && code <= 0x4dbf ||
// CJK Unified Ideographs .. Yi Radicals
0x4e00 <= code && code <= 0xa4c6 ||
// Hangul Jamo Extended-A
0xa960 <= code && code <= 0xa97c ||
// Hangul Syllables
0xac00 <= code && code <= 0xd7a3 ||
// CJK Compatibility Ideographs
0xf900 <= code && code <= 0xfaff ||
// Vertical Forms
0xfe10 <= code && code <= 0xfe19 ||
// CJK Compatibility Forms .. Small Form Variants
0xfe30 <= code && code <= 0xfe6b ||
// Halfwidth and Fullwidth Forms
0xff01 <= code && code <= 0xff60 ||
0xffe0 <= code && code <= 0xffe6 ||
// Kana Supplement
0x1b000 <= code && code <= 0x1b001 ||
// Enclosed Ideographic Supplement
0x1f200 <= code && code <= 0x1f251 ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
0x20000 <= code && code <= 0x3fffd)) {
return true;
return false;
}

return false;
exports.isFullWidthCodePoint = isFullWidthCodePoint;
exports.getStringWidth = getStringWidth;
}


/**
 * Tries to remove all VT control characters. Use to estimate displayed
 * string width. May be buggy due to not running a real state machine
 *
 * NOTE(review): this listing shows diff lines without +/- markers.
 * Presumably the two statements using `functionKeyCodeReAnywhere` and
 * `metaKeyCodeReAnywhere` are the removed body and the single `ansi`
 * replacement is the added one — confirm against the actual file. As
 * written here, the final `return` is unreachable.
 */
function stripVTControlCharacters(str) {
str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), '');
return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
return str.replace(ansi, '');
}


Expand Down
12 changes: 12 additions & 0 deletions lib/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -1053,3 +1053,15 @@ exports._exceptionWithHostPort = function(err,
}
return ex;
};

// The ICU-backed helpers are only attached to the module when the `config`
// binding reports Intl support.
if (process.binding('config').hasIntl) {
  const { getCharacterProperty, getColumnWidth } = process.binding('icu');

  // Expose the ICU constants as `constants`: enumerable, not configurable,
  // and (no `writable` flag is given) read-only.
  Object.defineProperty(exports, 'constants', {
    value: process.binding('constants').icu,
    enumerable: true,
    configurable: false
  });

  exports.getCharacterProperty = getCharacterProperty;
  exports.getColumnWidth = getColumnWidth;
}
1 change: 1 addition & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
'lib/v8.js',
'lib/vm.js',
'lib/zlib.js',
'lib/internal/buffer.js',
'lib/internal/child_process.js',
'lib/internal/cluster.js',
'lib/internal/freelist.js',
Expand Down
Loading

0 comments on commit 3d261d3

Please sign in to comment.