From 6c0ec8df5cbad271173024464d34d38d0a587920 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Mon, 29 Jan 2024 13:57:22 -0500 Subject: [PATCH 1/9] feat(NODE-5861): optimize parsing basic latin strings --- src/utils/node_byte_utils.ts | 27 +++++++++++++++++++++++++++ src/utils/web_byte_utils.ts | 27 +++++++++++++++++++++++++++ test/node/byte_utils.test.ts | 27 +++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index 214b1e39..1621811b 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -126,6 +126,33 @@ export const nodeJsByteUtils = { }, toUTF8(buffer: Uint8Array, start: number, end: number): string { + if (buffer.length === 0) { + return ''; + } + + const stringByteLength = end - start; + if (stringByteLength === 0) { + return ''; + } + + if (stringByteLength < 200) { + let basicLatin = true; + const latinBytes = []; + for (let i = start; i < end; i++) { + const byte = buffer[i]; + if (byte > 127) { + basicLatin = false; + break; + } + latinBytes.push(byte); + } + + if (basicLatin) { + // eslint-disable-next-line prefer-spread + return String.fromCharCode.apply(String, latinBytes); + } + } + return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); }, diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index cf93e43a..28dda05e 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -173,6 +173,33 @@ export const webByteUtils = { }, toUTF8(uint8array: Uint8Array, start: number, end: number): string { + if (uint8array.length === 0) { + return ''; + } + + const stringByteLength = end - start; + if (stringByteLength === 0) { + return ''; + } + + if (stringByteLength < 200) { + let basicLatin = true; + const latinBytes = []; + for (let i = start; i < end; i++) { + const byte = uint8array[i]; + if (byte > 127) { + basicLatin = false; + break; + } + latinBytes.push(byte); + } + + if (basicLatin) { + // eslint-disable-next-line prefer-spread + return String.fromCharCode.apply(String, latinBytes); + } + } + return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end)); }, diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index 81fba0c4..f2cac490 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -400,7 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ { name: 'should create utf8 string from buffer input', - inputs: [Buffer.from('abc\u{1f913}', 'utf8')], + inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7], expectation({ output, error }) { expect(error).to.be.null; expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8').toString('utf8')); @@ -408,7 +408,7 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ }, { name: 'should return empty string for empty buffer input', - inputs: [Buffer.alloc(0)], + inputs: [Buffer.alloc(0), 0, 1], expectation({ output, error }) { expect(error).to.be.null; expect(output).to.be.a('string').with.lengthOf(0); @@ -596,6 +596,29 @@ describe('ByteUtils', () => { }); }); + describe('toUTF8 basic latin optimization', () => { + afterEach(() => { + sinon.restore(); + }); + + context('Given a basic latin string', () => { + it('should not invoke Buffer.toString', () => { + const buffer = Buffer.from('abcdef', 'utf8'); + const spy = sinon.spy(buffer, 'toString'); + nodeJsByteUtils.toUTF8(buffer, 0, 6); + expect(spy).to.not.have.been.called; + }); + + it('should not invoke TextDecoder.decode', () => { + const utf8Bytes = Buffer.from('abcdef', 'utf8'); + const buffer = new Uint8Array(utf8Bytes.buffer, utf8Bytes.byteOffset, utf8Bytes.byteLength); + const spy = sinon.spy(TextDecoder.prototype, 'decode'); + webByteUtils.toUTF8(buffer, 0, 6); + expect(spy).to.not.have.been.called; + }); + }); + }); + describe('randomBytes fallback case when crypto is not present', () => { describe('web', function () { let bsonWithNoCryptoCtx; From e353a30c2e5b5459c74b5e56b1c15b1519c143d1 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Mon, 29 Jan 2024 14:49:45 -0500 Subject: [PATCH 2/9] feat(NODE-5861): move UTF8 validation below basic latin parsing --- rollup.config.mjs | 2 +- src/binary.ts | 4 ++-- src/error.ts | 6 +++--- src/parser/deserializer.ts | 41 +++++++++--------------------------- src/utils/byte_utils.ts | 4 ++-- src/utils/node_byte_utils.ts | 17 +++++++++++++-- src/utils/web_byte_utils.ts | 11 ++++++++-- test/node/byte_utils.test.ts | 24 +++++++++++++++++---- tsconfig.json | 1 + 9 files changed, 63 insertions(+), 47 deletions(-) diff --git a/rollup.config.mjs b/rollup.config.mjs index b44e4cd7..17c402d8 100644 --- a/rollup.config.mjs +++ b/rollup.config.mjs @@ -13,7 +13,7 @@ const tsConfig = { module: 'esnext', moduleResolution: 'node', removeComments: true, - lib: ['es2021'], + lib: ['es2021', 'ES2022.Error'], importHelpers: false, noEmitHelpers: false, noEmitOnError: true, diff --git a/src/binary.ts b/src/binary.ts index b08067af..6c4d54fc 100644 --- a/src/binary.ts +++ b/src/binary.ts @@ -191,8 +191,8 @@ export class Binary extends BSONValue { if (encoding === 'hex') return ByteUtils.toHex(this.buffer); if (encoding === 'base64') return ByteUtils.toBase64(this.buffer); if (encoding === 'utf8' || encoding === 'utf-8') - return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength); - return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength); + return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false); + return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false); } /** @internal */ diff --git a/src/error.ts b/src/error.ts index 63e63517..5ae2495c 100644 --- a/src/error.ts +++ b/src/error.ts @@ -4,7 +4,7 @@ import { BSON_MAJOR_VERSION } from './constants'; * @public * @category Error * - * `BSONError` objects are thrown when BSON ecounters an error. + * `BSONError` objects are thrown when BSON encounters an error. * * This is the parent class for all the other errors thrown by this library. */ @@ -23,8 +23,8 @@ export class BSONError extends Error { return 'BSONError'; } - constructor(message: string) { - super(message); + constructor(message: string, options?: ErrorOptions) { + super(message, options); } /** diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts index abb61046..28183387 100644 --- a/src/parser/deserializer.ts +++ b/src/parser/deserializer.ts @@ -236,7 +236,7 @@ function deserializeObject( if (i >= buffer.byteLength) throw new BSONError('Bad BSON Document: illegal CString'); // Represents the key - const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i); + const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i, false); // shouldValidateKey is true if the key should be validated, false otherwise let shouldValidateKey = true; @@ -266,7 +266,7 @@ function deserializeObject( ) { throw new BSONError('bad string length in bson'); } - value = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey); + value = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey); index = index + stringSize; } else if (elementType === constants.BSON_DATA_OID) { const oid = ByteUtils.allocate(12); @@ -476,7 +476,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const source = ByteUtils.toUTF8(buffer, index, i); + const source = ByteUtils.toUTF8(buffer, index, i, false); // Create the regexp index = i + 1; @@ -489,7 +489,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const regExpOptions = ByteUtils.toUTF8(buffer, index, i); + const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false); index = i + 1; // For each option add the corresponding one for javascript @@ -521,7 +521,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const source = ByteUtils.toUTF8(buffer, index, i); + const source = ByteUtils.toUTF8(buffer, index, i, false); index = i + 1; // Get the start search index @@ -533,7 +533,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const regExpOptions = ByteUtils.toUTF8(buffer, index, i); + const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false); index = i + 1; // Set the object @@ -551,7 +551,7 @@ function deserializeObject( ) { throw new BSONError('bad string length in bson'); } - const symbol = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey); + const symbol = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey); value = promoteValues ? symbol : new BSONSymbol(symbol); index = index + stringSize; } else if (elementType === constants.BSON_DATA_TIMESTAMP) { @@ -587,7 +587,7 @@ function deserializeObject( ) { throw new BSONError('bad string length in bson'); } - const functionString = getValidatedString( + const functionString = ByteUtils.toUTF8( buffer, index, index + stringSize - 1, @@ -626,7 +626,7 @@ function deserializeObject( } // Javascript function - const functionString = getValidatedString( + const functionString = ByteUtils.toUTF8( buffer, index, index + stringSize - 1, @@ -678,7 +678,7 @@ function deserializeObject( throw new BSONError('Invalid UTF-8 string in BSON document'); } } - const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1); + const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false); // Update parse index position index = index + stringSize; @@ -728,24 +728,3 @@ function deserializeObject( return object; } - -function getValidatedString( - buffer: Uint8Array, - start: number, - end: number, - shouldValidateUtf8: boolean -) { - const value = ByteUtils.toUTF8(buffer, start, end); - // if utf8 validation is on, do the check - if (shouldValidateUtf8) { - for (let i = 0; i < value.length; i++) { - if (value.charCodeAt(i) === 0xfffd) { - if (!validateUtf8(buffer, start, end)) { - throw new BSONError('Invalid UTF-8 string in BSON document'); - } - break; - } - } - } - return value; -} diff --git a/src/utils/byte_utils.ts b/src/utils/byte_utils.ts index 5f8d05ef..d611c906 100644 --- a/src/utils/byte_utils.ts +++ b/src/utils/byte_utils.ts @@ -25,8 +25,8 @@ export type ByteUtils = { toHex: (buffer: Uint8Array) => string; /** Create a Uint8Array containing utf8 code units from a string */ fromUTF8: (text: string) => Uint8Array; - /** Create a string from utf8 code units */ - toUTF8: (buffer: Uint8Array, start: number, end: number) => string; + /** Create a string from utf8 code units, fatal=true will throw an error if UTF-8 bytes are invalid, fatal=false will insert replacement characters */ + toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string; /** Get the utf8 code unit count from a string if it were to be transformed to utf8 */ utf8ByteLength: (input: string) => number; /** Encode UTF8 bytes generated from `source` string into `destination` at byteOffset. Returns the number of bytes encoded. */ diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index 1621811b..961be476 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -1,4 +1,5 @@ import { BSONError } from '../error'; +import { validateUtf8 } from '../validate_utf8'; type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary'; type NodeJsBuffer = ArrayBufferView & @@ -125,7 +126,7 @@ export const nodeJsByteUtils = { return Buffer.from(text, 'utf8'); }, - toUTF8(buffer: Uint8Array, start: number, end: number): string { + toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string { if (buffer.length === 0) { return ''; } @@ -153,7 +154,19 @@ export const nodeJsByteUtils = { } } - return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); + const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); + if (fatal) { + // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation + for (let i = 0; i < string.length; i++) { + if (string.charCodeAt(i) === 0xfffd) { + if (!validateUtf8(buffer, start, end)) { + throw new BSONError('Invalid UTF-8 string in BSON document'); + } + break; + } + } + } + return string; }, utf8ByteLength(input: string): number { diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index 28dda05e..5dee3732 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -172,7 +172,7 @@ export const webByteUtils = { return new TextEncoder().encode(text); }, - toUTF8(uint8array: Uint8Array, start: number, end: number): string { + toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string { if (uint8array.length === 0) { return ''; } @@ -200,7 +200,14 @@ export const webByteUtils = { } } - return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end)); + if (fatal) { + try { + return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); + } catch (cause) { + throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); + } + } + return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); }, utf8ByteLength(input: string): number { diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index f2cac490..49156739 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils'; import * as sinon from 'sinon'; import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson'; import * as crypto from 'node:crypto'; +import { BSONError } from '../register-bson'; type ByteUtilTest = { name: string; @@ -400,7 +401,7 @@ const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ { name: 'should create utf8 string from buffer input', - inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7], + inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7, false], expectation({ output, error }) { expect(error).to.be.null; expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8').toString('utf8')); @@ -408,11 +409,26 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ }, { name: 'should return empty string for empty buffer input', - inputs: [Buffer.alloc(0), 0, 1], + inputs: [Buffer.alloc(0), 0, 1, false], expectation({ output, error }) { expect(error).to.be.null; expect(output).to.be.a('string').with.lengthOf(0); } + }, + { + name: 'should throw an error if fatal is set and string is invalid', + inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true], + expectation({ error }) { + expect(error).to.be.instanceOf(BSONError); + } + }, + { + name: 'should insert replacement character fatal is false and string is invalid', + inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false], + expectation({ error, output }) { + expect(error).to.not.exist; + expect(output).to.equal('abc\uFFFD'); + } } ]; const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [ @@ -605,7 +621,7 @@ describe('ByteUtils', () => { it('should not invoke Buffer.toString', () => { const buffer = Buffer.from('abcdef', 'utf8'); const spy = sinon.spy(buffer, 'toString'); - nodeJsByteUtils.toUTF8(buffer, 0, 6); + nodeJsByteUtils.toUTF8(buffer, 0, 6, false); expect(spy).to.not.have.been.called; }); @@ -613,7 +629,7 @@ describe('ByteUtils', () => { const utf8Bytes = Buffer.from('abcdef', 'utf8'); const buffer = new Uint8Array(utf8Bytes.buffer, utf8Bytes.byteOffset, utf8Bytes.byteLength); const spy = sinon.spy(TextDecoder.prototype, 'decode'); - webByteUtils.toUTF8(buffer, 0, 6); + webByteUtils.toUTF8(buffer, 0, 6, false); expect(spy).to.not.have.been.called; }); }); diff --git a/tsconfig.json b/tsconfig.json index b439bf5b..d11664fe 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -10,6 +10,7 @@ "skipLibCheck": true, "lib": [ "es2021", + "ES2022.Error" ], "outDir": "lib", "importHelpers": false, From 7e3bcd0c02c902b4fc66233b785519ecd3390a18 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Mon, 29 Jan 2024 14:59:38 -0500 Subject: [PATCH 3/9] fix: error ctor and assertion for error --- src/error.ts | 2 +- test/node/byte_utils.test.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/error.ts b/src/error.ts index 5ae2495c..3f2711e9 100644 --- a/src/error.ts +++ b/src/error.ts @@ -23,7 +23,7 @@ export class BSONError extends Error { return 'BSONError'; } - constructor(message: string, options?: ErrorOptions) { + constructor(message: string, options?: { cause?: unknown }) { super(message, options); } diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index 49156739..31ba4242 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -419,7 +419,7 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ name: 'should throw an error if fatal is set and string is invalid', inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true], expectation({ error }) { - expect(error).to.be.instanceOf(BSONError); + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); } }, { From d956aeaafbbaca224a012b9151b28e037d145843 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Mon, 29 Jan 2024 15:01:27 -0500 Subject: [PATCH 4/9] fix: lint --- test/node/byte_utils.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index 31ba4242..9526329a 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -8,7 +8,6 @@ import { webByteUtils } from '../../src/utils/web_byte_utils'; import * as sinon from 'sinon'; import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson'; import * as crypto from 'node:crypto'; -import { BSONError } from '../register-bson'; type ByteUtilTest = { name: string; From 9ef267c1ba1ba619c1d6bff710d8134d1dcffd86 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Tue, 30 Jan 2024 16:18:16 -0500 Subject: [PATCH 5/9] feat: lower stringByteLength cap to 20 --- src/utils/latin.ts | 66 ++++++++++++++++++++++++++++++++++++ src/utils/node_byte_utils.ts | 29 +++------------- src/utils/web_byte_utils.ts | 29 +++------------- 3 files changed, 74 insertions(+), 50 deletions(-) create mode 100644 src/utils/latin.ts diff --git a/src/utils/latin.ts b/src/utils/latin.ts new file mode 100644 index 00000000..3dcf8045 --- /dev/null +++ b/src/utils/latin.ts @@ -0,0 +1,66 @@ +/** + * This function is an optimization for small basic latin strings. + * + * @remarks + * ### Important characteristics: + * - If the uint8array or distance between start and end is 0 this function returns an empty string + * - If the byteLength of the string is 1, 2, or 3 we invoke String.fromCharCode and manually offset into the buffer + * - If the byteLength of the string is less than or equal to 20 an array of bytes is built and `String.fromCharCode.apply` is called with the result + * - If any byte exceeds 128 this function returns null + * + * @param uint8array - A sequence of bytes that may contain basic latin characters + * @param start - The start index from which to search the uint8array + * @param end - The index to stop searching the uint8array + * @returns string if all bytes are within the basic latin range, otherwise null + */ +export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null { + if (uint8array.length === 0) { + return ''; + } + + const stringByteLength = end - start; + if (stringByteLength === 0) { + return ''; + } + + if (stringByteLength === 1 && uint8array[start] < 128) { + return String.fromCharCode(uint8array[start]); + } + + if (stringByteLength === 2 && uint8array[start] < 128 && uint8array[start + 1] < 128) { + return String.fromCharCode(uint8array[start]) + String.fromCharCode(uint8array[start + 1]); + } + + if ( + stringByteLength === 3 && + uint8array[start] < 128 && + uint8array[start + 1] < 128 && + uint8array[start + 2] < 128 + ) { + return ( + String.fromCharCode(uint8array[start]) + + String.fromCharCode(uint8array[start + 1]) + + String.fromCharCode(uint8array[start + 2]) + ); + } + + if (stringByteLength <= 20) { + let basicLatin = true; + const latinBytes = []; + for (let i = start; i < end; i++) { + const byte = uint8array[i]; + if (byte > 127) { + basicLatin = false; + break; + } + latinBytes.push(byte); + } + + if (basicLatin) { + // eslint-disable-next-line prefer-spread + return String.fromCharCode.apply(String, latinBytes); + } + } + + return null; +} diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index 961be476..85811e14 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -1,5 +1,6 @@ import { BSONError } from '../error'; import { validateUtf8 } from '../validate_utf8'; +import { tryLatin } from './latin'; type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary'; type NodeJsBuffer = ArrayBufferView & @@ -127,31 +128,9 @@ export const nodeJsByteUtils = { }, toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string { - if (buffer.length === 0) { - return ''; - } - - const stringByteLength = end - start; - if (stringByteLength === 0) { - return ''; - } - - if (stringByteLength < 200) { - let basicLatin = true; - const latinBytes = []; - for (let i = start; i < end; i++) { - const byte = buffer[i]; - if (byte > 127) { - basicLatin = false; - break; - } - latinBytes.push(byte); - } - - if (basicLatin) { - // eslint-disable-next-line prefer-spread - return String.fromCharCode.apply(String, latinBytes); - } + const basicLatin = end - start <= 20 ? tryLatin(buffer, start, end) : null; + if (basicLatin != null) { + return basicLatin; } const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index 5dee3732..9e38efd4 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -1,4 +1,5 @@ import { BSONError } from '../error'; +import { tryLatin } from './latin'; type TextDecoder = { readonly encoding: string; @@ -173,31 +174,9 @@ export const webByteUtils = { }, toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string { - if (uint8array.length === 0) { - return ''; - } - - const stringByteLength = end - start; - if (stringByteLength === 0) { - return ''; - } - - if (stringByteLength < 200) { - let basicLatin = true; - const latinBytes = []; - for (let i = start; i < end; i++) { - const byte = uint8array[i]; - if (byte > 127) { - basicLatin = false; - break; - } - latinBytes.push(byte); - } - - if (basicLatin) { - // eslint-disable-next-line prefer-spread - return String.fromCharCode.apply(String, latinBytes); - } + const basicLatin = end - start <= 20 ? tryLatin(uint8array, start, end) : null; + if (basicLatin != null) { + return basicLatin; } if (fatal) { From 5837166077770450240f435976de270ded89d690 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Tue, 30 Jan 2024 16:21:55 -0500 Subject: [PATCH 6/9] test: release new file --- src/utils/latin.ts | 2 +- test/node/release.test.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/utils/latin.ts b/src/utils/latin.ts index 3dcf8045..016eb4d8 100644 --- a/src/utils/latin.ts +++ b/src/utils/latin.ts @@ -1,6 +1,6 @@ /** * This function is an optimization for small basic latin strings. - * + * @internal * @remarks * ### Important characteristics: * - If the uint8array or distance between start and end is 0 this function returns an empty string diff --git a/test/node/release.test.ts b/test/node/release.test.ts index b34f0517..c2a20cf4 100644 --- a/test/node/release.test.ts +++ b/test/node/release.test.ts @@ -46,6 +46,7 @@ const REQUIRED_FILES = [ 'src/utils/byte_utils.ts', 'src/utils/node_byte_utils.ts', 'src/utils/web_byte_utils.ts', + 'src/utils/latin.ts', 'src/validate_utf8.ts', 'vendor/base64/base64.js', 'vendor/base64/package.json', From 49d96aff9b0ba8ec293739997c1ee594b6b617d1 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Wed, 31 Jan 2024 09:19:41 -0500 Subject: [PATCH 7/9] test: add unit tests --- src/utils/latin.ts | 3 +- test/node/utils/latin.test.ts | 118 ++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 test/node/utils/latin.test.ts diff --git a/src/utils/latin.ts b/src/utils/latin.ts index 016eb4d8..76f9ad26 100644 --- a/src/utils/latin.ts +++ b/src/utils/latin.ts @@ -1,3 +1,5 @@ +/* eslint-disable prefer-spread */ + /** * This function is an optimization for small basic latin strings. * @internal @@ -57,7 +59,6 @@ export function tryLatin(uint8array: Uint8Array, start: number, end: number): st } if (basicLatin) { - // eslint-disable-next-line prefer-spread return String.fromCharCode.apply(String, latinBytes); } } diff --git a/test/node/utils/latin.test.ts b/test/node/utils/latin.test.ts new file mode 100644 index 00000000..96e0e6fa --- /dev/null +++ b/test/node/utils/latin.test.ts @@ -0,0 +1,118 @@ +import { expect } from 'chai'; +import { tryLatin } from '../../../src/utils/latin'; +import * as sinon from 'sinon'; + +describe('tryLatin()', () => { + context('when given a buffer of length 0', () => { + it('returns an empty string', () => { + expect(tryLatin(new Uint8Array(), 0, 10)).to.equal(''); + }); + }); + + context('when the distance between end and start is 0', () => { + it('returns an empty string', () => { + expect(tryLatin(new Uint8Array([1, 2, 3]), 0, 0)).to.equal(''); + }); + }); + + let pushSpy; + let fromCharCodeSpy; + + beforeEach(() => { + pushSpy = sinon.spy(Array.prototype, 'push'); + fromCharCodeSpy = sinon.spy(String, 'fromCharCode'); + }); + + afterEach(() => { + sinon.restore(); + }); + + context('when there is 1 byte', () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array([128]), 0, 1)).be.null; + }); + }); + + it('calls fromCharCode once', () => { + tryLatin(new Uint8Array([95]), 0, 1); + expect(fromCharCodeSpy).to.have.been.calledOnce; + }); + + it('never calls array.push', () => { + tryLatin(new Uint8Array([95]), 0, 1); + expect(pushSpy).to.have.not.been.called; + }); + }); + + context('when there is 2 bytes', () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array([0, 128]), 0, 2)).be.null; + expect(tryLatin(new Uint8Array([128, 0]), 0, 2)).be.null; + expect(tryLatin(new Uint8Array([128, 128]), 0, 2)).be.null; + }); + }); + + it('calls fromCharCode twice', () => { + tryLatin(new Uint8Array([95, 105]), 0, 2); + expect(fromCharCodeSpy).to.have.been.calledTwice; + }); + + it('never calls array.push', () => { + tryLatin(new Uint8Array([95, 105]), 0, 2); + expect(pushSpy).to.have.not.been.called; + }); + }); + + context('when there is 3 bytes', () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array([0, 0, 128]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([0, 128, 0]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 0, 0]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 128, 128]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 128, 0]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 0, 128]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([0, 128, 128]), 0, 3)).be.null; + }); + }); + + it('calls fromCharCode thrice', () => { + tryLatin(new Uint8Array([95, 105, 100]), 0, 3); + expect(fromCharCodeSpy).to.have.been.calledThrice; + }); + + it('never calls array.push', () => { + tryLatin(new Uint8Array([95, 105, 100]), 0, 3); + expect(pushSpy).to.have.not.been.called; + }); + }); + + for (let stringLength = 4; stringLength <= 20; stringLength++) { + context(`when there is ${stringLength} bytes`, () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array(stringLength).fill(128), 0, stringLength)).be.null; + }); + }); + + it('calls fromCharCode once', () => { + tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength); + expect(fromCharCodeSpy).to.have.been.calledOnce; + }); + + it(`calls array.push ${stringLength}`, () => { + tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength); + expect(pushSpy).to.have.callCount(stringLength); + }); + }); + } + + context('when there is >21 bytes', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array(21).fill(95), 0, 21)).be.null; + expect(tryLatin(new Uint8Array(201).fill(95), 0, 201)).be.null; + }); + }); +}); From 3e609bf2cfbe67adf0ee5f38f7c6018809e495e8 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Wed, 31 Jan 2024 14:22:27 -0500 Subject: [PATCH 8/9] fix: remove apply --- src/utils/latin.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/utils/latin.ts b/src/utils/latin.ts index 76f9ad26..f50805dd 100644 --- a/src/utils/latin.ts +++ b/src/utils/latin.ts @@ -1,5 +1,3 @@ -/* eslint-disable prefer-spread */ - /** * This function is an optimization for small basic latin strings. * @internal @@ -59,7 +57,7 @@ export function tryLatin(uint8array: Uint8Array, start: number, end: number): st } if (basicLatin) { - return String.fromCharCode.apply(String, latinBytes); + return String.fromCharCode(...latinBytes); } } From 267cc51ef0a984b4e7ea1f3514894f87a1c836a0 Mon Sep 17 00:00:00 2001 From: Neal Beeken Date: Wed, 31 Jan 2024 14:54:14 -0500 Subject: [PATCH 9/9] fix: move len 20 check up and remove basicLatin boolean --- src/utils/latin.ts | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/utils/latin.ts b/src/utils/latin.ts index f50805dd..860497db 100644 --- a/src/utils/latin.ts +++ b/src/utils/latin.ts @@ -23,6 +23,10 @@ export function tryLatin(uint8array: Uint8Array, start: number, end: number): st return ''; } + if (stringByteLength > 20) { + return null; + } + if (stringByteLength === 1 && uint8array[start] < 128) { return String.fromCharCode(uint8array[start]); } @@ -44,22 +48,14 @@ export function tryLatin(uint8array: Uint8Array, start: number, end: number): st ); } - if (stringByteLength <= 20) { - let basicLatin = true; - const latinBytes = []; - for (let i = start; i < end; i++) { - const byte = uint8array[i]; - if (byte > 127) { - basicLatin = false; - break; - } - latinBytes.push(byte); - } - - if (basicLatin) { - return String.fromCharCode(...latinBytes); + const latinBytes = []; + for (let i = start; i < end; i++) { + const byte = uint8array[i]; + if (byte > 127) { + return null; } + latinBytes.push(byte); } - return null; + return String.fromCharCode(...latinBytes); }