Skip to content

Commit

Permalink
Merge pull request #724 from byroot/lookup-3
Browse files Browse the repository at this point in the history
Improve lookup tables for string escaping.
  • Loading branch information
byroot authored Jan 3, 2025
2 parents f745ec1 + dc7d766 commit 12965b9
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 111 deletions.
4 changes: 1 addition & 3 deletions benchmark/encoder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,10 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500)
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500)

# On these benchmarks we perform well, we're on par or better.
# On these benchmarks we perform well, we're on par or a bit better.
benchmark_encoding "integers", (1_000_000..1_001_000).to_a, except: %i(json_state)
benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json")
benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json")

# On twitter.json we're still about 6% slower, this is worth investigating.
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json")

# This benchmark spent the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation
Expand Down
187 changes: 79 additions & 108 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,73 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
raise_generator_error_str(invalid_object, str);
}

// Encoding of the escape table entries:
//   0       - single byte char that doesn't need to be escaped.
//   (x | 8) - char that needs to be escaped (e.g. 9 == (1 | 8), 11 == (3 | 8)).
// Other non-zero values are byte-length classes for non-ASCII bytes.
// CHAR_LENGTH_MASK keeps the low three bits of an entry, i.e. it strips the
// "needs escaping" bit (8) and leaves the byte length.
static const unsigned char CHAR_LENGTH_MASK = 7;

// Per-byte escape classes for the default (non script-safe) generator.
// Only the bytes listed here are flagged with 9 (single byte that must be
// escaped); every other entry, including all bytes >= 0x80, is left at 0
// by the implicit zero-initialization of the remaining elements.
static const unsigned char escape_table[256] = {
    // 0x00-0x1F: ASCII control characters
    [0x00] = 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    [0x10] = 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    // Printable ASCII characters that need a backslash escape
    ['"']  = 9,
    ['\\'] = 9,
};

// Per-byte classes used when the output must be ASCII only.
// ASCII bytes are either 0 (no escaping) or 9 (single byte to escape);
// bytes >= 0x80 carry the length class of the UTF-8 sequence they belong to.
static const unsigned char ascii_only_escape_table[256] = {
    // 0x00-0x1F: ASCII control characters
    [0x00] = 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    [0x10] = 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    // Printable ASCII characters that need a backslash escape
    ['"']  = 9,
    ['\\'] = 9,
    // 0x80-0xBF: continuation bytes
    [0x80] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x90] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0xA0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0xB0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0xC0-0xDF: first byte of a 2-byte code point
    [0xC0] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    [0xD0] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0-0xEF: first byte of a 3-byte code point
    [0xE0] = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0xF0-0xFF: first byte of a 4+ byte code point; 0xFE and 0xFF are flagged with 9
    [0xF0] = 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};

// Per-byte classes used in script-safe mode.
// On top of the usual escapes, '/' is flagged with 9 and the lead byte 0xE2
// is flagged with 11 (3 | 8) because it starts \u2028 and \u2029.
static const unsigned char script_safe_escape_table[256] = {
    // 0x00-0x1F: ASCII control characters
    [0x00] = 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    [0x10] = 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    // Printable ASCII characters that need a backslash escape
    ['"']  = 9,
    ['/']  = 9,
    ['\\'] = 9,
    // 0x80-0xBF: continuation bytes
    [0x80] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x90] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0xA0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0xB0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0xC0-0xDF: first byte of a 2-byte code point
    [0xC0] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    [0xD0] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0-0xEF: first byte of a 3-byte code point (0xE2 is the start of \u2028 and \u2029)
    [0xE0] = 3, 3, 11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0xF0-0xFF: first byte of a 4+ byte code point; 0xFE and 0xFF are flagged with 9
    [0xF0] = 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};

/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
Expand All @@ -106,13 +173,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
* - If out_script_safe: forwardslash, line separator (U+2028), and
* - If script_safe: forwardslash (/), line separator (U+2028), and
* paragraph separator (U+2029)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
Expand All @@ -131,7 +198,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca

if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 1: {
case 9: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
Expand All @@ -153,9 +220,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
}
break;
}
case 3: {
case 11: {
unsigned char b2 = ptr[pos + 1];
if (RB_UNLIKELY(out_script_safe && ch == 0xE2 && b2 == 0x80)) {
if (RB_UNLIKELY(b2 == 0x80)) {
unsigned char b3 = ptr[pos + 2];
if (b3 == 0xA8) {
FLUSH_POS(3);
Expand All @@ -167,6 +234,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
break;
}
}
ch_len = 3;
// fallthrough
}
default:
Expand All @@ -186,104 +254,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
RB_GC_GUARD(str);
}

// Lookup table indexed by byte value. 0 means the byte is copied as-is;
// any non-zero entry sends the byte down an escaping/multi-byte path.
static const char escape_table[256] = {
// ASCII Control Characters
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 3-byte code point (0xE0-0xEF)
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// First byte of a 4+ byte code point
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};

// Same layout as escape_table, with '/' (0x2F) additionally flagged so that
// it gets escaped in script-safe mode.
static const char script_safe_escape_table[256] = {
// ASCII Control Characters
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 3-byte code point (0xE0-0xEF)
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// First byte of a 4+ byte code point
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};

/* Appends the JSON-escaped content of str (without the wrapping '"'
 * characters) to out_buffer, treating every byte as a single character.
 *
 * out_buffer   - destination buffer, written through fbuffer_append.
 * str          - Ruby string whose bytes are encoded.
 * escape_table - per-byte lookup; a non-zero entry means the byte must be
 *                escaped.
 *
 * Stretches of bytes that need no escaping are copied with one append each.
 */
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
{
    const char *hex_digits = "0123456789abcdef";
    char unicode_escape[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

    const char *bytes = RSTRING_PTR(str);
    unsigned long total = RSTRING_LEN(str);

    unsigned long run_start = 0;
    unsigned long cursor;

    for (cursor = 0; cursor < total; cursor++) {
        unsigned char byte = bytes[cursor];

        if (!escape_table[byte]) {
            // Plain byte: it stays part of the pending unescaped run.
            continue;
        }

        // Flush the unescaped run that precedes this byte, if any.
        if (cursor > run_start) {
            fbuffer_append(out_buffer, &bytes[run_start], cursor - run_start);
        }
        run_start = cursor + 1;

        if (byte == '"') {
            fbuffer_append(out_buffer, "\\\"", 2);
        } else if (byte == '\\') {
            fbuffer_append(out_buffer, "\\\\", 2);
        } else if (byte == '/') {
            fbuffer_append(out_buffer, "\\/", 2);
        } else if (byte == '\b') {
            fbuffer_append(out_buffer, "\\b", 2);
        } else if (byte == '\f') {
            fbuffer_append(out_buffer, "\\f", 2);
        } else if (byte == '\n') {
            fbuffer_append(out_buffer, "\\n", 2);
        } else if (byte == '\r') {
            fbuffer_append(out_buffer, "\\r", 2);
        } else if (byte == '\t') {
            fbuffer_append(out_buffer, "\\t", 2);
        } else {
            // No short form: emit \u00XX using the byte's two hex digits.
            unicode_escape[2] = '0';
            unicode_escape[3] = '0';
            unicode_escape[4] = hex_digits[(byte >> 4) & 0xf];
            unicode_escape[5] = hex_digits[byte & 0xf];
            fbuffer_append(out_buffer, unicode_escape, 6);
        }
    }

    // Copy whatever unescaped tail remains after the last escaped byte.
    if (run_start < total) {
        fbuffer_append(out_buffer, &bytes[run_start], total - run_start);
    }

    RB_GC_GUARD(str);
}

static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
Expand All @@ -301,7 +272,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons

if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 1: {
case 9: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
Expand All @@ -325,6 +296,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
}
default: {
uint32_t wchar = 0;
ch_len = ch_len & CHAR_LENGTH_MASK;

switch(ch_len) {
case 2:
wchar = ptr[pos] & 0x1F;
Expand Down Expand Up @@ -935,13 +908,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat

switch(rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
break;
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else {
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
}
break;
default:
Expand Down

0 comments on commit 12965b9

Please sign in to comment.