Skip to content

Commit

Permalink
fix: wrong offset value when escape characters exist
Browse files Browse the repository at this point in the history
  • Loading branch information
SebastienGllmt authored Nov 13, 2024
1 parent bc28586 commit af7d3d9
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 35 deletions.
11 changes: 8 additions & 3 deletions packages/node/test/offset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";

const input1 = '{\n "string": "value",\n "number": 3,\n "object"';
const input2 = ': {\n "key": "vд"\n },\n "array": [\n -1,\n 12\n ]\n ';
const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
const input4 = '"escape": "\\"\\u00e1" }';

const offsets = [
[0, TokenType.LEFT_BRACE],
Expand Down Expand Up @@ -46,15 +47,19 @@ const offsets = [
[146, TokenType.STRING],
[152, TokenType.COLON],
[154, TokenType.NUMBER],
[159, TokenType.RIGHT_BRACE],
[158, TokenType.COMMA],
[159, TokenType.STRING],
[167, TokenType.COLON],
[169, TokenType.STRING],
[180, TokenType.RIGHT_BRACE],
];

test("offset", async () => {
let i = 0;

await runTokenizerTest(
new Tokenizer(),
[input1, input2, input3],
[input1, input2, input3, input4],
({ token, offset }) => {
expect(offset).toEqual(offsets[i][0]);
expect(token).toEqual(offsets[i][1]);
Expand Down
32 changes: 19 additions & 13 deletions packages/plainjs/dist/deno/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ export default class Tokenizer {
private separator?: string;
private separatorBytes?: Uint8Array;
private separatorIndex = 0;
private escapedCharsByteLength = 0;
private bufferedString: StringBuilder;
private bufferedNumber: StringBuilder;

Expand Down Expand Up @@ -300,6 +301,7 @@ export default class Tokenizer {

if (n === charset.QUOTATION_MARK) {
this.bufferedString.reset();
this.escapedCharsByteLength = 0;
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
Expand Down Expand Up @@ -336,7 +338,10 @@ export default class Tokenizer {
value: string,
offset: this.offset,
});
this.offset += this.bufferedString.byteLength + 1;
this.offset +=
this.escapedCharsByteLength +
this.bufferedString.byteLength +
1;
continue;
}

Expand Down Expand Up @@ -398,6 +403,7 @@ export default class Tokenizer {
const controlChar = escapedSequences[n];
if (controlChar) {
this.bufferedString.appendChar(controlChar);
this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
Expand Down Expand Up @@ -436,32 +442,32 @@ export default class Tokenizer {
this.unicode + String.fromCharCode(n),
16,
);
let unicodeString: string;
if (this.highSurrogate === undefined) {
if (intVal >= 0xd800 && intVal <= 0xdbff) {
//<55296,56319> - highSurrogate
this.highSurrogate = intVal;
this.state = TokenizerStates.STRING_DEFAULT;
continue;
} else {
this.bufferedString.appendBuf(
this.encoder.encode(String.fromCharCode(intVal)),
);
unicodeString = String.fromCharCode(intVal);
}
} else {
if (intVal >= 0xdc00 && intVal <= 0xdfff) {
//<56320,57343> - lowSurrogate
this.bufferedString.appendBuf(
this.encoder.encode(
String.fromCharCode(this.highSurrogate, intVal),
),
unicodeString = String.fromCharCode(
this.highSurrogate,
intVal,
);
} else {
this.bufferedString.appendBuf(
this.encoder.encode(
String.fromCharCode(this.highSurrogate),
),
);
unicodeString = String.fromCharCode(this.highSurrogate);
}
this.highSurrogate = undefined;
}
const unicodeBuffer = this.encoder.encode(unicodeString);
this.bufferedString.appendBuf(unicodeBuffer);
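// A `\uXXXX` escape occupies 6 bytes in the input but only
// unicodeBuffer.byteLength bytes in bufferedString; record the difference so
// the offset advance for the string token covers the raw input length.
// NOTE(review): the high-surrogate branch above `continue`s without adding its
// own 6 input bytes, so a surrogate pair (12 input bytes) looks under-counted
// by 6 here — confirm with a test such as "\ud83d\ude00".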
this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
Expand Down
32 changes: 19 additions & 13 deletions packages/plainjs/src/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ export default class Tokenizer {
private separator?: string;
private separatorBytes?: Uint8Array;
private separatorIndex = 0;
private escapedCharsByteLength = 0;
private bufferedString: StringBuilder;
private bufferedNumber: StringBuilder;

Expand Down Expand Up @@ -300,6 +301,7 @@ export default class Tokenizer {

if (n === charset.QUOTATION_MARK) {
this.bufferedString.reset();
this.escapedCharsByteLength = 0;
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
Expand Down Expand Up @@ -336,7 +338,10 @@ export default class Tokenizer {
value: string,
offset: this.offset,
});
this.offset += this.bufferedString.byteLength + 1;
this.offset +=
this.escapedCharsByteLength +
this.bufferedString.byteLength +
1;
continue;
}

Expand Down Expand Up @@ -398,6 +403,7 @@ export default class Tokenizer {
const controlChar = escapedSequences[n];
if (controlChar) {
this.bufferedString.appendChar(controlChar);
this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
Expand Down Expand Up @@ -436,32 +442,32 @@ export default class Tokenizer {
this.unicode + String.fromCharCode(n),
16,
);
let unicodeString: string;
if (this.highSurrogate === undefined) {
if (intVal >= 0xd800 && intVal <= 0xdbff) {
//<55296,56319> - highSurrogate
this.highSurrogate = intVal;
this.state = TokenizerStates.STRING_DEFAULT;
continue;
} else {
this.bufferedString.appendBuf(
this.encoder.encode(String.fromCharCode(intVal)),
);
unicodeString = String.fromCharCode(intVal);
}
} else {
if (intVal >= 0xdc00 && intVal <= 0xdfff) {
//<56320,57343> - lowSurrogate
this.bufferedString.appendBuf(
this.encoder.encode(
String.fromCharCode(this.highSurrogate, intVal),
),
unicodeString = String.fromCharCode(
this.highSurrogate,
intVal,
);
} else {
this.bufferedString.appendBuf(
this.encoder.encode(
String.fromCharCode(this.highSurrogate),
),
);
unicodeString = String.fromCharCode(this.highSurrogate);
}
this.highSurrogate = undefined;
}
const unicodeBuffer = this.encoder.encode(unicodeString);
this.bufferedString.appendBuf(unicodeBuffer);
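// A `\uXXXX` escape occupies 6 bytes in the input but only
// unicodeBuffer.byteLength bytes in bufferedString; record the difference so
// the offset advance for the string token covers the raw input length.
// NOTE(review): the high-surrogate branch above `continue`s without adding its
// own 6 input bytes, so a surrogate pair (12 input bytes) looks under-counted
// by 6 here — confirm with a test such as "\ud83d\ude00".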
this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
Expand Down
11 changes: 8 additions & 3 deletions packages/plainjs/test/offset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ import TokenType from "../src/utils/types/tokenType.js";

const input1 = '{\n "string": "value",\n "number": 3,\n "object"';
const input2 = ': {\n "key": "vд"\n },\n "array": [\n -1,\n 12\n ]\n ';
const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
const input4 = '"escape": "\\"\\u00e1" }';

const offsets = [
[0, TokenType.LEFT_BRACE],
Expand Down Expand Up @@ -46,15 +47,19 @@ const offsets = [
[146, TokenType.STRING],
[152, TokenType.COLON],
[154, TokenType.NUMBER],
[159, TokenType.RIGHT_BRACE],
[158, TokenType.COMMA],
[159, TokenType.STRING],
[167, TokenType.COLON],
[169, TokenType.STRING],
[180, TokenType.RIGHT_BRACE],
];

test("offset", async () => {
let i = 0;

await runTokenizerTest(
new Tokenizer(),
[input1, input2, input3],
[input1, input2, input3, input4],
({ token, offset }) => {
expect(offset).toEqual(offsets[i][0]);
expect(token).toEqual(offsets[i][1]);
Expand Down
11 changes: 8 additions & 3 deletions packages/whatwg/test/offset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";

const input1 = '{\n "string": "value",\n "number": 3,\n "object"';
const input2 = ': {\n "key": "vд"\n },\n "array": [\n -1,\n 12\n ]\n ';
const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
const input4 = '"escape": "\\"\\u00e1" }';

const offsets = [
[0, TokenType.LEFT_BRACE],
Expand Down Expand Up @@ -46,15 +47,19 @@ const offsets = [
[146, TokenType.STRING],
[152, TokenType.COLON],
[154, TokenType.NUMBER],
[159, TokenType.RIGHT_BRACE],
[158, TokenType.COMMA],
[159, TokenType.STRING],
[167, TokenType.COLON],
[169, TokenType.STRING],
[180, TokenType.RIGHT_BRACE],
];

test("offset", async () => {
let i = 0;

await runTokenizerTest(
new Tokenizer(),
[input1, input2, input3],
[input1, input2, input3, input4],
({ token, offset }) => {
expect(offset).toEqual(offsets[i][0]);
expect(token).toEqual(offsets[i][1]);
Expand Down

0 comments on commit af7d3d9

Please sign in to comment.