Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(NODE-5861): optimize parsing basic latin strings #642

Merged
merged 9 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rollup.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ const tsConfig = {
module: 'esnext',
moduleResolution: 'node',
removeComments: true,
lib: ['es2021'],
lib: ['es2021', 'ES2022.Error'],
importHelpers: false,
noEmitHelpers: false,
noEmitOnError: true,
Expand Down
4 changes: 2 additions & 2 deletions src/binary.ts
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ export class Binary extends BSONValue {
if (encoding === 'hex') return ByteUtils.toHex(this.buffer);
if (encoding === 'base64') return ByteUtils.toBase64(this.buffer);
if (encoding === 'utf8' || encoding === 'utf-8')
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
}

/** @internal */
Expand Down
6 changes: 3 additions & 3 deletions src/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { BSON_MAJOR_VERSION } from './constants';
* @public
* @category Error
*
* `BSONError` objects are thrown when BSON ecounters an error.
* `BSONError` objects are thrown when BSON encounters an error.
*
* This is the parent class for all the other errors thrown by this library.
*/
Expand All @@ -23,8 +23,8 @@ export class BSONError extends Error {
return 'BSONError';
}

constructor(message: string) {
super(message);
constructor(message: string, options?: { cause?: unknown }) {
super(message, options);
}

/**
Expand Down
41 changes: 10 additions & 31 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ function deserializeObject(
if (i >= buffer.byteLength) throw new BSONError('Bad BSON Document: illegal CString');

// Represents the key
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i);
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i, false);

// shouldValidateKey is true if the key should be validated, false otherwise
let shouldValidateKey = true;
Expand Down Expand Up @@ -266,7 +266,7 @@ function deserializeObject(
) {
throw new BSONError('bad string length in bson');
}
value = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
value = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
index = index + stringSize;
} else if (elementType === constants.BSON_DATA_OID) {
const oid = ByteUtils.allocate(12);
Expand Down Expand Up @@ -476,7 +476,7 @@ function deserializeObject(
// If are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const source = ByteUtils.toUTF8(buffer, index, i);
const source = ByteUtils.toUTF8(buffer, index, i, false);
// Create the regexp
index = i + 1;

Expand All @@ -489,7 +489,7 @@ function deserializeObject(
// If are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
index = i + 1;

// For each option add the corresponding one for javascript
Expand Down Expand Up @@ -521,7 +521,7 @@ function deserializeObject(
// If are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const source = ByteUtils.toUTF8(buffer, index, i);
const source = ByteUtils.toUTF8(buffer, index, i, false);
index = i + 1;

// Get the start search index
Expand All @@ -533,7 +533,7 @@ function deserializeObject(
// If are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
index = i + 1;

// Set the object
Expand All @@ -551,7 +551,7 @@ function deserializeObject(
) {
throw new BSONError('bad string length in bson');
}
const symbol = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
const symbol = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
value = promoteValues ? symbol : new BSONSymbol(symbol);
index = index + stringSize;
} else if (elementType === constants.BSON_DATA_TIMESTAMP) {
Expand Down Expand Up @@ -587,7 +587,7 @@ function deserializeObject(
) {
throw new BSONError('bad string length in bson');
}
const functionString = getValidatedString(
const functionString = ByteUtils.toUTF8(
buffer,
index,
index + stringSize - 1,
Expand Down Expand Up @@ -626,7 +626,7 @@ function deserializeObject(
}

// Javascript function
const functionString = getValidatedString(
const functionString = ByteUtils.toUTF8(
buffer,
index,
index + stringSize - 1,
Expand Down Expand Up @@ -678,7 +678,7 @@ function deserializeObject(
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1);
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
// Update parse index position
index = index + stringSize;

Expand Down Expand Up @@ -728,24 +728,3 @@ function deserializeObject(

return object;
}

/**
 * Decodes `buffer[start, end)` as UTF-8 and, when requested, rejects invalid input.
 *
 * The bytes are always decoded with `ByteUtils.toUTF8`. Only when
 * `shouldValidateUtf8` is true and the decoded text contains a U+FFFD
 * replacement character are the raw bytes re-checked with `validateUtf8`.
 *
 * @param buffer - bytes holding the encoded string
 * @param start - index of the first byte of the string
 * @param end - index one past the last byte of the string
 * @param shouldValidateUtf8 - whether invalid UTF-8 should raise an error
 * @returns the decoded string
 * @throws BSONError if validation is enabled and `validateUtf8` rejects the bytes
 */
function getValidatedString(
  buffer: Uint8Array,
  start: number,
  end: number,
  shouldValidateUtf8: boolean
) {
  const decoded = ByteUtils.toUTF8(buffer, start, end);
  if (!shouldValidateUtf8) {
    return decoded;
  }

  // A replacement character is the only hint that decoding may have hit bad bytes,
  // so the byte-level validation is skipped entirely when none is present.
  const hasReplacementCharacter = decoded.indexOf('\ufffd') !== -1;
  if (hasReplacementCharacter && !validateUtf8(buffer, start, end)) {
    throw new BSONError('Invalid UTF-8 string in BSON document');
  }
  return decoded;
}
4 changes: 2 additions & 2 deletions src/utils/byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ export type ByteUtils = {
toHex: (buffer: Uint8Array) => string;
/** Create a Uint8Array containing utf8 code units from a string */
fromUTF8: (text: string) => Uint8Array;
/** Create a string from utf8 code units */
toUTF8: (buffer: Uint8Array, start: number, end: number) => string;
/** Create a string from utf8 code units, fatal=true will throw an error if UTF-8 bytes are invalid, fatal=false will insert replacement characters */
toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string;
/** Get the utf8 code unit count from a string if it were to be transformed to utf8 */
utf8ByteLength: (input: string) => number;
/** Encode UTF8 bytes generated from `source` string into `destination` at byteOffset. Returns the number of bytes encoded. */
Expand Down
66 changes: 66 additions & 0 deletions src/utils/latin.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/**
 * This function is an optimization for small basic latin strings.
 * @internal
 * @remarks
 * ### Important characteristics:
 * - If the uint8array is empty, or the requested range is empty (`end <= start`), this function returns an empty string
 * - If the requested range is not fully inside the uint8array this function returns null, leaving the decision to the caller's general purpose decoder
 * - If the byteLength of the string is greater than 20 this function returns null
 * - If the byteLength of the string is 1, 2, or 3 we invoke String.fromCharCode and manually offset into the buffer
 * - Otherwise an array of bytes is built and passed to a single `String.fromCharCode` call
 * - If any byte is greater than 127 this function returns null
 *
 * @param uint8array - A sequence of bytes that may contain basic latin characters
 * @param start - The start index from which to search the uint8array
 * @param end - The index to stop searching the uint8array (exclusive)
 * @returns string if all bytes are within the basic latin range, otherwise null
 */
export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null {
  if (uint8array.length === 0) {
    return '';
  }

  const stringByteLength = end - start;
  if (stringByteLength <= 0) {
    return '';
  }

  // Reading past either edge of a typed array yields `undefined`, which would
  // pass the `> 127` test and be rendered as a NUL character. Bail out instead.
  const inBounds = start >= 0 && end <= uint8array.length;
  if (!inBounds || stringByteLength > 20) {
    return null;
  }

  if (stringByteLength === 1 && uint8array[start] < 128) {
    return String.fromCharCode(uint8array[start]);
  }

  if (stringByteLength === 2 && uint8array[start] < 128 && uint8array[start + 1] < 128) {
    return String.fromCharCode(uint8array[start]) + String.fromCharCode(uint8array[start + 1]);
  }

  if (
    stringByteLength === 3 &&
    uint8array[start] < 128 &&
    uint8array[start + 1] < 128 &&
    uint8array[start + 2] < 128
  ) {
    return (
      String.fromCharCode(uint8array[start]) +
      String.fromCharCode(uint8array[start + 1]) +
      String.fromCharCode(uint8array[start + 2])
    );
  }

  const latinBytes: number[] = [];
  for (let i = start; i < end; i++) {
    const byte = uint8array[i];
    if (byte > 127) {
      // Not basic latin: the caller must fall back to a real UTF-8 decoder
      return null;
    }
    latinBytes.push(byte);
  }

  return String.fromCharCode(...latinBytes);
}
23 changes: 21 additions & 2 deletions src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { BSONError } from '../error';
import { validateUtf8 } from '../validate_utf8';
import { tryLatin } from './latin';

type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
type NodeJsBuffer = ArrayBufferView &
Expand Down Expand Up @@ -125,8 +127,25 @@ export const nodeJsByteUtils = {
return Buffer.from(text, 'utf8');
},

toUTF8(buffer: Uint8Array, start: number, end: number): string {
return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
/**
 * Decodes `buffer[start, end)` into a string.
 *
 * Ranges of at most 20 bytes are first offered to `tryLatin`; when it returns
 * a string that result is used directly. Otherwise the bytes are decoded with
 * the local buffer type's `toString('utf8', start, end)`.
 *
 * @param fatal - when true, a decoded string containing U+FFFD is re-checked
 * with `validateUtf8` and a `BSONError` is thrown if that check fails; when
 * false the decoded string is returned as is
 */
toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
  if (end - start <= 20) {
    const latin = tryLatin(buffer, start, end);
    if (latin != null) {
      return latin;
    }
  }

  const decoded = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
  if (!fatal) {
    return decoded;
  }

  // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
  // Only strings that contain a replacement character are validated byte by byte.
  const hasReplacementCharacter = decoded.indexOf('\ufffd') !== -1;
  if (hasReplacementCharacter && !validateUtf8(buffer, start, end)) {
    throw new BSONError('Invalid UTF-8 string in BSON document');
  }
  return decoded;
},

utf8ByteLength(input: string): number {
Expand Down
17 changes: 15 additions & 2 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { BSONError } from '../error';
import { tryLatin } from './latin';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -172,8 +173,20 @@ export const webByteUtils = {
return new TextEncoder().encode(text);
},

toUTF8(uint8array: Uint8Array, start: number, end: number): string {
return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end));
/**
 * Decodes `uint8array[start, end)` into a string.
 *
 * Ranges of at most 20 bytes are first offered to `tryLatin`; when it returns
 * a string that result is used directly. Otherwise a copy of the range is
 * decoded with a `TextDecoder` configured with the given `fatal` flag.
 *
 * @param fatal - when true, any error raised while decoding is rethrown as a
 * `BSONError` carrying the original error as its `cause`
 */
toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string {
  if (end - start <= 20) {
    const latin = tryLatin(uint8array, start, end);
    if (latin != null) {
      return latin;
    }
  }

  const decode = (): string =>
    new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));

  if (!fatal) {
    return decode();
  }

  try {
    return decode();
  } catch (cause) {
    throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
  }
},

utf8ByteLength(input: string): number {
Expand Down
42 changes: 40 additions & 2 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -400,19 +400,34 @@ const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [
const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
inputs: [Buffer.from('abc\u{1f913}', 'utf8')],
inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7, false],
expectation({ output, error }) {
expect(error).to.be.null;
expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8').toString('utf8'));
}
},
{
name: 'should return empty string for empty buffer input',
inputs: [Buffer.alloc(0)],
inputs: [Buffer.alloc(0), 0, 1, false],
expectation({ output, error }) {
expect(error).to.be.null;
expect(output).to.be.a('string').with.lengthOf(0);
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
}
];
const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
Expand Down Expand Up @@ -596,6 +611,29 @@ describe('ByteUtils', () => {
});
});

// Checks that toUTF8 on a short basic latin input does not call the
// platform decoder (Buffer.toString for node, TextDecoder.decode for web).
describe('toUTF8 basic latin optimization', () => {
afterEach(() => {
// Remove the spies installed by each test so they do not leak into other tests
sinon.restore();
});

context('Given a basic latin string', () => {
it('should not invoke Buffer.toString', () => {
const buffer = Buffer.from('abcdef', 'utf8');
// Spy on this buffer instance's toString
const spy = sinon.spy(buffer, 'toString');
nodeJsByteUtils.toUTF8(buffer, 0, 6, false);
expect(spy).to.not.have.been.called;
});

it('should not invoke TextDecoder.decode', () => {
const utf8Bytes = Buffer.from('abcdef', 'utf8');
// Wrap the same bytes in a plain Uint8Array view before passing them to the web implementation
const buffer = new Uint8Array(utf8Bytes.buffer, utf8Bytes.byteOffset, utf8Bytes.byteLength);
// Spy on the prototype so a decode call on any TextDecoder instance is observed
const spy = sinon.spy(TextDecoder.prototype, 'decode');
webByteUtils.toUTF8(buffer, 0, 6, false);
expect(spy).to.not.have.been.called;
});
});
});

describe('randomBytes fallback case when crypto is not present', () => {
describe('web', function () {
let bsonWithNoCryptoCtx;
Expand Down
1 change: 1 addition & 0 deletions test/node/release.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const REQUIRED_FILES = [
'src/utils/byte_utils.ts',
'src/utils/node_byte_utils.ts',
'src/utils/web_byte_utils.ts',
'src/utils/latin.ts',
'src/validate_utf8.ts',
'vendor/base64/base64.js',
'vendor/base64/package.json',
Expand Down
1 change: 1 addition & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"skipLibCheck": true,
"lib": [
"es2021",
"ES2022.Error"
],
"outDir": "lib",
"importHelpers": false,
Expand Down