diff --git a/rollup.config.mjs b/rollup.config.mjs index b44e4cd7..17c402d8 100644 --- a/rollup.config.mjs +++ b/rollup.config.mjs @@ -13,7 +13,7 @@ const tsConfig = { module: 'esnext', moduleResolution: 'node', removeComments: true, - lib: ['es2021'], + lib: ['es2021', 'ES2022.Error'], importHelpers: false, noEmitHelpers: false, noEmitOnError: true, diff --git a/src/binary.ts b/src/binary.ts index b08067af..6c4d54fc 100644 --- a/src/binary.ts +++ b/src/binary.ts @@ -191,8 +191,8 @@ export class Binary extends BSONValue { if (encoding === 'hex') return ByteUtils.toHex(this.buffer); if (encoding === 'base64') return ByteUtils.toBase64(this.buffer); if (encoding === 'utf8' || encoding === 'utf-8') - return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength); - return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength); + return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false); + return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false); } /** @internal */ diff --git a/src/error.ts b/src/error.ts index 63e63517..3f2711e9 100644 --- a/src/error.ts +++ b/src/error.ts @@ -4,7 +4,7 @@ import { BSON_MAJOR_VERSION } from './constants'; * @public * @category Error * - * `BSONError` objects are thrown when BSON ecounters an error. + * `BSONError` objects are thrown when BSON encounters an error. * * This is the parent class for all the other errors thrown by this library. 
*/ @@ -23,8 +23,8 @@ export class BSONError extends Error { return 'BSONError'; } - constructor(message: string) { - super(message); + constructor(message: string, options?: { cause?: unknown }) { + super(message, options); } /** diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts index abb61046..28183387 100644 --- a/src/parser/deserializer.ts +++ b/src/parser/deserializer.ts @@ -236,7 +236,7 @@ function deserializeObject( if (i >= buffer.byteLength) throw new BSONError('Bad BSON Document: illegal CString'); // Represents the key - const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i); + const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i, false); // shouldValidateKey is true if the key should be validated, false otherwise let shouldValidateKey = true; @@ -266,7 +266,7 @@ function deserializeObject( ) { throw new BSONError('bad string length in bson'); } - value = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey); + value = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey); index = index + stringSize; } else if (elementType === constants.BSON_DATA_OID) { const oid = ByteUtils.allocate(12); @@ -476,7 +476,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const source = ByteUtils.toUTF8(buffer, index, i); + const source = ByteUtils.toUTF8(buffer, index, i, false); // Create the regexp index = i + 1; @@ -489,7 +489,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const regExpOptions = ByteUtils.toUTF8(buffer, index, i); + const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false); index = i + 1; // For each option add the corresponding 
one for javascript @@ -521,7 +521,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const source = ByteUtils.toUTF8(buffer, index, i); + const source = ByteUtils.toUTF8(buffer, index, i, false); index = i + 1; // Get the start search index @@ -533,7 +533,7 @@ function deserializeObject( // If are at the end of the buffer there is a problem with the document if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString'); // Return the C string - const regExpOptions = ByteUtils.toUTF8(buffer, index, i); + const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false); index = i + 1; // Set the object @@ -551,7 +551,7 @@ function deserializeObject( ) { throw new BSONError('bad string length in bson'); } - const symbol = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey); + const symbol = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey); value = promoteValues ? 
symbol : new BSONSymbol(symbol); index = index + stringSize; } else if (elementType === constants.BSON_DATA_TIMESTAMP) { @@ -587,7 +587,7 @@ function deserializeObject( ) { throw new BSONError('bad string length in bson'); } - const functionString = getValidatedString( + const functionString = ByteUtils.toUTF8( buffer, index, index + stringSize - 1, @@ -626,7 +626,7 @@ function deserializeObject( } // Javascript function - const functionString = getValidatedString( + const functionString = ByteUtils.toUTF8( buffer, index, index + stringSize - 1, @@ -678,7 +678,7 @@ function deserializeObject( throw new BSONError('Invalid UTF-8 string in BSON document'); } } - const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1); + const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false); // Update parse index position index = index + stringSize; @@ -728,24 +728,3 @@ function deserializeObject( return object; } - -function getValidatedString( - buffer: Uint8Array, - start: number, - end: number, - shouldValidateUtf8: boolean -) { - const value = ByteUtils.toUTF8(buffer, start, end); - // if utf8 validation is on, do the check - if (shouldValidateUtf8) { - for (let i = 0; i < value.length; i++) { - if (value.charCodeAt(i) === 0xfffd) { - if (!validateUtf8(buffer, start, end)) { - throw new BSONError('Invalid UTF-8 string in BSON document'); - } - break; - } - } - } - return value; -} diff --git a/src/utils/byte_utils.ts b/src/utils/byte_utils.ts index 5f8d05ef..d611c906 100644 --- a/src/utils/byte_utils.ts +++ b/src/utils/byte_utils.ts @@ -25,8 +25,8 @@ export type ByteUtils = { toHex: (buffer: Uint8Array) => string; /** Create a Uint8Array containing utf8 code units from a string */ fromUTF8: (text: string) => Uint8Array; - /** Create a string from utf8 code units */ - toUTF8: (buffer: Uint8Array, start: number, end: number) => string; + /** Create a string from utf8 code units, fatal=true will throw an error if UTF-8 bytes are invalid, 
fatal=false will insert replacement characters */ + toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string; /** Get the utf8 code unit count from a string if it were to be transformed to utf8 */ utf8ByteLength: (input: string) => number; /** Encode UTF8 bytes generated from `source` string into `destination` at byteOffset. Returns the number of bytes encoded. */ diff --git a/src/utils/latin.ts b/src/utils/latin.ts new file mode 100644 index 00000000..860497db --- /dev/null +++ b/src/utils/latin.ts @@ -0,0 +1,61 @@ +/** + * This function is an optimization for small basic latin strings. + * @internal + * @remarks + * ### Important characteristics: + * - If the uint8array or distance between start and end is 0 this function returns an empty string + * - If the byteLength of the string is 1, 2, or 3 we invoke String.fromCharCode and manually offset into the buffer + * - If the byteLength of the string is less than or equal to 20 an array of bytes is built and `String.fromCharCode.apply` is called with the result + * - If any byte exceeds 127 this function returns null + * + * @param uint8array - A sequence of bytes that may contain basic latin characters + * @param start - The start index from which to search the uint8array + * @param end - The index to stop searching the uint8array + * @returns string if all bytes are within the basic latin range, otherwise null + */ +export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null { + if (uint8array.length === 0) { + return ''; + } + + const stringByteLength = end - start; + if (stringByteLength === 0) { + return ''; + } + + if (stringByteLength > 20) { + return null; + } + + if (stringByteLength === 1 && uint8array[start] < 128) { + return String.fromCharCode(uint8array[start]); + } + + if (stringByteLength === 2 && uint8array[start] < 128 && uint8array[start + 1] < 128) { + return String.fromCharCode(uint8array[start]) + String.fromCharCode(uint8array[start + 
1]); + } + + if ( + stringByteLength === 3 && + uint8array[start] < 128 && + uint8array[start + 1] < 128 && + uint8array[start + 2] < 128 + ) { + return ( + String.fromCharCode(uint8array[start]) + + String.fromCharCode(uint8array[start + 1]) + + String.fromCharCode(uint8array[start + 2]) + ); + } + + const latinBytes = []; + for (let i = start; i < end; i++) { + const byte = uint8array[i]; + if (byte > 127) { + return null; + } + latinBytes.push(byte); + } + + return String.fromCharCode(...latinBytes); +} diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index 214b1e39..85811e14 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -1,4 +1,6 @@ import { BSONError } from '../error'; +import { validateUtf8 } from '../validate_utf8'; +import { tryLatin } from './latin'; type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary'; type NodeJsBuffer = ArrayBufferView & @@ -125,8 +127,25 @@ export const nodeJsByteUtils = { return Buffer.from(text, 'utf8'); }, - toUTF8(buffer: Uint8Array, start: number, end: number): string { - return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); + toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string { + const basicLatin = end - start <= 20 ? 
tryLatin(buffer, start, end) : null; + if (basicLatin != null) { + return basicLatin; + } + + const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); + if (fatal) { + // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation + for (let i = 0; i < string.length; i++) { + if (string.charCodeAt(i) === 0xfffd) { + if (!validateUtf8(buffer, start, end)) { + throw new BSONError('Invalid UTF-8 string in BSON document'); + } + break; + } + } + } + return string; }, utf8ByteLength(input: string): number { diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index cf93e43a..9e38efd4 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -1,4 +1,5 @@ import { BSONError } from '../error'; +import { tryLatin } from './latin'; type TextDecoder = { readonly encoding: string; @@ -172,8 +173,20 @@ export const webByteUtils = { return new TextEncoder().encode(text); }, - toUTF8(uint8array: Uint8Array, start: number, end: number): string { - return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end)); + toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string { + const basicLatin = end - start <= 20 ? 
tryLatin(uint8array, start, end) : null; + if (basicLatin != null) { + return basicLatin; + } + + if (fatal) { + try { + return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); + } catch (cause) { + throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); + } + } + return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); }, utf8ByteLength(input: string): number { diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index 81fba0c4..9526329a 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -400,7 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ { name: 'should create utf8 string from buffer input', - inputs: [Buffer.from('abc\u{1f913}', 'utf8')], + inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7, false], expectation({ output, error }) { expect(error).to.be.null; expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8').toString('utf8')); @@ -408,11 +408,26 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ }, { name: 'should return empty string for empty buffer input', - inputs: [Buffer.alloc(0)], + inputs: [Buffer.alloc(0), 0, 1, false], expectation({ output, error }) { expect(error).to.be.null; expect(output).to.be.a('string').with.lengthOf(0); } + }, + { + name: 'should throw an error if fatal is set and string is invalid', + inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { + name: 'should insert replacement character fatal is false and string is invalid', + inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false], + expectation({ error, output }) { + expect(error).to.not.exist; + expect(output).to.equal('abc\uFFFD'); + } } ]; const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [ @@ -596,6 +611,29 @@ describe('ByteUtils', () => { }); }); + describe('toUTF8 
basic latin optimization', () => { + afterEach(() => { + sinon.restore(); + }); + + context('Given a basic latin string', () => { + it('should not invoke Buffer.toString', () => { + const buffer = Buffer.from('abcdef', 'utf8'); + const spy = sinon.spy(buffer, 'toString'); + nodeJsByteUtils.toUTF8(buffer, 0, 6, false); + expect(spy).to.not.have.been.called; + }); + + it('should not invoke TextDecoder.decode', () => { + const utf8Bytes = Buffer.from('abcdef', 'utf8'); + const buffer = new Uint8Array(utf8Bytes.buffer, utf8Bytes.byteOffset, utf8Bytes.byteLength); + const spy = sinon.spy(TextDecoder.prototype, 'decode'); + webByteUtils.toUTF8(buffer, 0, 6, false); + expect(spy).to.not.have.been.called; + }); + }); + }); + describe('randomBytes fallback case when crypto is not present', () => { describe('web', function () { let bsonWithNoCryptoCtx; diff --git a/test/node/release.test.ts b/test/node/release.test.ts index b34f0517..c2a20cf4 100644 --- a/test/node/release.test.ts +++ b/test/node/release.test.ts @@ -46,6 +46,7 @@ const REQUIRED_FILES = [ 'src/utils/byte_utils.ts', 'src/utils/node_byte_utils.ts', 'src/utils/web_byte_utils.ts', + 'src/utils/latin.ts', 'src/validate_utf8.ts', 'vendor/base64/base64.js', 'vendor/base64/package.json', diff --git a/test/node/utils/latin.test.ts b/test/node/utils/latin.test.ts new file mode 100644 index 00000000..96e0e6fa --- /dev/null +++ b/test/node/utils/latin.test.ts @@ -0,0 +1,118 @@ +import { expect } from 'chai'; +import { tryLatin } from '../../../src/utils/latin'; +import * as sinon from 'sinon'; + +describe('tryLatin()', () => { + context('when given a buffer of length 0', () => { + it('returns an empty string', () => { + expect(tryLatin(new Uint8Array(), 0, 10)).to.equal(''); + }); + }); + + context('when the distance between end and start is 0', () => { + it('returns an empty string', () => { + expect(tryLatin(new Uint8Array([1, 2, 3]), 0, 0)).to.equal(''); + }); + }); + + let pushSpy; + let fromCharCodeSpy; + + 
beforeEach(() => { + pushSpy = sinon.spy(Array.prototype, 'push'); + fromCharCodeSpy = sinon.spy(String, 'fromCharCode'); + }); + + afterEach(() => { + sinon.restore(); + }); + + context('when there is 1 byte', () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array([128]), 0, 1)).be.null; + }); + }); + + it('calls fromCharCode once', () => { + tryLatin(new Uint8Array([95]), 0, 1); + expect(fromCharCodeSpy).to.have.been.calledOnce; + }); + + it('never calls array.push', () => { + tryLatin(new Uint8Array([95]), 0, 1); + expect(pushSpy).to.have.not.been.called; + }); + }); + + context('when there is 2 bytes', () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array([0, 128]), 0, 2)).be.null; + expect(tryLatin(new Uint8Array([128, 0]), 0, 2)).be.null; + expect(tryLatin(new Uint8Array([128, 128]), 0, 2)).be.null; + }); + }); + + it('calls fromCharCode twice', () => { + tryLatin(new Uint8Array([95, 105]), 0, 2); + expect(fromCharCodeSpy).to.have.been.calledTwice; + }); + + it('never calls array.push', () => { + tryLatin(new Uint8Array([95, 105]), 0, 2); + expect(pushSpy).to.have.not.been.called; + }); + }); + + context('when there is 3 bytes', () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array([0, 0, 128]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([0, 128, 0]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 0, 0]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 128, 128]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 128, 0]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([128, 0, 128]), 0, 3)).be.null; + expect(tryLatin(new Uint8Array([0, 128, 128]), 0, 3)).be.null; + }); + }); + + it('calls fromCharCode thrice', () => { + tryLatin(new Uint8Array([95, 105, 100]), 0, 3); + expect(fromCharCodeSpy).to.have.been.calledThrice; + }); + + it('never calls array.push', () => { + 
tryLatin(new Uint8Array([95, 105, 100]), 0, 3); + expect(pushSpy).to.have.not.been.called; + }); + }); + + for (let stringLength = 4; stringLength <= 20; stringLength++) { + context(`when there is ${stringLength} bytes`, () => { + context('that exceed 127', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array(stringLength).fill(128), 0, stringLength)).be.null; + }); + }); + + it('calls fromCharCode once', () => { + tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength); + expect(fromCharCodeSpy).to.have.been.calledOnce; + }); + + it(`calls array.push ${stringLength}`, () => { + tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength); + expect(pushSpy).to.have.callCount(stringLength); + }); + }); + } + + context('when there is >21 bytes', () => { + it('returns null', () => { + expect(tryLatin(new Uint8Array(21).fill(95), 0, 21)).be.null; + expect(tryLatin(new Uint8Array(201).fill(95), 0, 201)).be.null; + }); + }); +}); diff --git a/tsconfig.json b/tsconfig.json index b439bf5b..d11664fe 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -10,6 +10,7 @@ "skipLibCheck": true, "lib": [ "es2021", + "ES2022.Error" ], "outDir": "lib", "importHelpers": false,