fix: wrong offset value when escape characters exist

juanjoDiaz · Nov 13, 2024 · af7d3d9 · af7d3d9
1 parent bc28586
commit af7d3d9
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 35 deletions.
diff --git a/packages/node/test/offset.ts b/packages/node/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,15 +47,19 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
   let i = 0;
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);

diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapedCharsByteLength = 0;
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
 
@@ -300,6 +301,7 @@ export default class Tokenizer {
 
             if (n === charset.QUOTATION_MARK) {
               this.bufferedString.reset();
+              this.escapedCharsByteLength = 0;
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }
@@ -336,7 +338,10 @@ export default class Tokenizer {
                 value: string,
                 offset: this.offset,
               });
-              this.offset += this.bufferedString.byteLength + 1;
+              this.offset +=
+                this.escapedCharsByteLength +
+                this.bufferedString.byteLength +
+                1;
               continue;
             }
 
@@ -398,6 +403,7 @@ export default class Tokenizer {
             const controlChar = escapedSequences[n];
             if (controlChar) {
               this.bufferedString.appendChar(controlChar);
+              this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }
@@ -436,32 +442,32 @@ export default class Tokenizer {
                 this.unicode + String.fromCharCode(n),
                 16,
               );
+              let unicodeString: string;
               if (this.highSurrogate === undefined) {
                 if (intVal >= 0xd800 && intVal <= 0xdbff) {
                   //<55296,56319> - highSurrogate
                   this.highSurrogate = intVal;
+                  this.state = TokenizerStates.STRING_DEFAULT;
+                  continue;
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(String.fromCharCode(intVal)),
-                  );
+                  unicodeString = String.fromCharCode(intVal);
                 }
               } else {
                 if (intVal >= 0xdc00 && intVal <= 0xdfff) {
                   //<56320,57343> - lowSurrogate
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate, intVal),
-                    ),
+                  unicodeString = String.fromCharCode(
+                    this.highSurrogate,
+                    intVal,
                   );
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate),
-                    ),
-                  );
+                  unicodeString = String.fromCharCode(this.highSurrogate);
                 }
                 this.highSurrogate = undefined;
               }
+              const unicodeBuffer = this.encoder.encode(unicodeString);
+              this.bufferedString.appendBuf(unicodeBuffer);
+              // len(\u0000)=6 minus the fact you're appending len(buf)
+              this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }

diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapedCharsByteLength = 0;
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
 
@@ -300,6 +301,7 @@ export default class Tokenizer {
 
             if (n === charset.QUOTATION_MARK) {
               this.bufferedString.reset();
+              this.escapedCharsByteLength = 0;
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }
@@ -336,7 +338,10 @@ export default class Tokenizer {
                 value: string,
                 offset: this.offset,
               });
-              this.offset += this.bufferedString.byteLength + 1;
+              this.offset +=
+                this.escapedCharsByteLength +
+                this.bufferedString.byteLength +
+                1;
               continue;
             }
 
@@ -398,6 +403,7 @@ export default class Tokenizer {
             const controlChar = escapedSequences[n];
             if (controlChar) {
               this.bufferedString.appendChar(controlChar);
+              this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }
@@ -436,32 +442,32 @@ export default class Tokenizer {
                 this.unicode + String.fromCharCode(n),
                 16,
               );
+              let unicodeString: string;
               if (this.highSurrogate === undefined) {
                 if (intVal >= 0xd800 && intVal <= 0xdbff) {
                   //<55296,56319> - highSurrogate
                   this.highSurrogate = intVal;
+                  this.state = TokenizerStates.STRING_DEFAULT;
+                  continue;
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(String.fromCharCode(intVal)),
-                  );
+                  unicodeString = String.fromCharCode(intVal);
                 }
               } else {
                 if (intVal >= 0xdc00 && intVal <= 0xdfff) {
                   //<56320,57343> - lowSurrogate
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate, intVal),
-                    ),
+                  unicodeString = String.fromCharCode(
+                    this.highSurrogate,
+                    intVal,
                   );
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate),
-                    ),
-                  );
+                  unicodeString = String.fromCharCode(this.highSurrogate);
                 }
                 this.highSurrogate = undefined;
               }
+              const unicodeBuffer = this.encoder.encode(unicodeString);
+              this.bufferedString.appendBuf(unicodeBuffer);
+              // len(\u0000)=6 minus the fact you're appending len(buf)
+              this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }

diff --git a/packages/plainjs/test/offset.ts b/packages/plainjs/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "../src/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,15 +47,19 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
   let i = 0;
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);

diff --git a/packages/whatwg/test/offset.ts b/packages/whatwg/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,15 +47,19 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
   let i = 0;
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);