Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode support via normalisation step #70

Merged
merged 6 commits into from
Feb 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/apitests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ on:
branches:
- main
push:
branches:
- main
branches: [ "*" ]

jobs:
build:
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@
[submodule "frontend/public/manamoji"]
path = frontend/public/manamoji
url = https://github.com/scryfall/manamoji-discord.git
[submodule "backend/utf8proc"]
path = backend/utf8proc
url = https://github.com/JuliaStrings/utf8proc
8 changes: 7 additions & 1 deletion backend/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,10 @@ set(ABSL_LIBS

add_subdirectory(./re2)

add_subdirectory(./utf8proc/)

option(USE_JEMALLOC OFF)
set(LIBS jansson CURL::libcurl pthread ${ABSL_LIBS} re2)
set(LIBS jansson CURL::libcurl pthread ${ABSL_LIBS} re2 utf8proc)
if(USE_JEMALLOC)
set(LIBS ${LIBS} jemalloc)
endif()
Expand Down Expand Up @@ -205,6 +207,8 @@ set(SRC
./mse/query_parser_internals.c
./mse/levenshtein_difference.h
./mse/levenshtein_difference.c
./mse/utf8_normalisation.h
./mse/utf8_normalisation.c
./strptime/strptime.h
./strptime/strptime.c
${PARSER_SRC}
Expand Down Expand Up @@ -256,6 +260,8 @@ set(TESTS
./tests/test_formats.c
./tests/test_levenshtein_difference.h
./tests/test_levenshtein_difference.c
./tests/test_utf8_normalisation.h
./tests/test_utf8_normalisation.c
./tests/system_test.h
./tests/system_test.c)
set(WEB_API
Expand Down
2 changes: 1 addition & 1 deletion backend/abseil
Submodule abseil updated 63 files
+0 −4 CMake/AbseilDll.cmake
+0 −19 absl/base/BUILD.bazel
+0 −16 absl/base/CMakeLists.txt
+4 −26 absl/base/config.h
+0 −4 absl/base/exception_safety_testing_test.cc
+0 −64 absl/base/inline_variable_test.cc
+0 −27 absl/base/inline_variable_test_a.cc
+0 −27 absl/base/inline_variable_test_b.cc
+0 −5 absl/base/internal/cycleclock.cc
+7 −7 absl/base/internal/cycleclock_config.h
+0 −5 absl/base/internal/fast_type_id.h
+0 −108 absl/base/internal/inline_variable.h
+0 −46 absl/base/internal/inline_variable_testing.h
+0 −9 absl/base/internal/spinlock.cc
+0 −2 absl/base/no_destructor.h
+0 −4 absl/base/no_destructor_test.cc
+0 −2 absl/cleanup/cleanup.h
+0 −2 absl/cleanup/cleanup_test.cc
+0 −9 absl/container/fixed_array.h
+0 −4 absl/container/internal/hashtablez_sampler.cc
+23 −29 absl/container/internal/layout.h
+470 −120 absl/container/internal/raw_hash_set.cc
+411 −631 absl/container/internal/raw_hash_set.h
+246 −31 absl/container/internal/raw_hash_set_test.cc
+0 −4 absl/crc/crc32c.cc
+7 −5 absl/crc/crc32c.h
+13 −31 absl/crc/internal/crc_x86_arm_combined.cc
+28 −73 absl/functional/internal/any_invocable.h
+0 −4 absl/hash/internal/hash.cc
+6 −6 absl/hash/internal/hash.h
+0 −1 absl/log/BUILD.bazel
+0 −2 absl/log/CMakeLists.txt
+0 −4 absl/log/internal/structured.h
+0 −41 absl/log/log_entry.cc
+0 −52 absl/numeric/int128.cc
+6 −8 absl/random/distributions.h
+0 −4 absl/status/status.cc
+0 −1 absl/strings/BUILD.bazel
+0 −1 absl/strings/CMakeLists.txt
+3 −3 absl/strings/ascii.h
+0 −4 absl/strings/cord.cc
+0 −30 absl/strings/cord_buffer.cc
+0 −4 absl/strings/internal/cord_rep_btree.cc
+0 −4 absl/strings/internal/cordz_info.cc
+0 −22 absl/strings/internal/str_format/extension.cc
+0 −5 absl/strings/internal/string_constant.h
+0 −5 absl/strings/string_view.cc
+0 −4 absl/synchronization/internal/futex_waiter.cc
+0 −5 absl/synchronization/internal/kernel_timeout.cc
+0 −4 absl/synchronization/internal/pthread_waiter.cc
+0 −4 absl/synchronization/internal/sem_waiter.cc
+0 −4 absl/synchronization/internal/stdcpp_waiter.cc
+0 −4 absl/synchronization/internal/waiter_base.cc
+0 −4 absl/synchronization/internal/win32_waiter.cc
+1 −0 absl/time/BUILD.bazel
+0 −1 absl/types/internal/optional.h
+2 −3 absl/types/internal/variant.h
+1 −3 absl/types/optional.h
+0 −24 absl/utility/BUILD.bazel
+0 −24 absl/utility/CMakeLists.txt
+0 −70 absl/utility/internal/if_constexpr.h
+0 −79 absl/utility/internal/if_constexpr_test.cc
+1 −2 absl/utility/utility.h
83 changes: 51 additions & 32 deletions backend/mse/card.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#include "./card.h"
#include "./io_utils.h"
#include "../testing_h/testing.h"
#include <string.h>
#include <stdlib.h>
#include "./io_utils.h"
#include "./utf8_normalisation.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

mse_colour_enum_t mse_parse_colour(char colour)
{
Expand All @@ -26,7 +27,6 @@ mse_colour_enum_t mse_parse_colour(char colour)
default:
return 0;
}

}

mse_colour_enum_t mse_parse_colours(const char *colours)
Expand Down Expand Up @@ -63,22 +63,34 @@ int mse_parse_card_json(json_t *json, mse_card_t *card)
ASSERT(json_is_string(name_o));
ASSERT(card->name = strdup(json_string_value(name_o)));

char *normalised = mse_normalise_utf8(card->name);
ASSERT(normalised != NULL);
free(card->name);
card->name = normalised;

// Read oracle
json_t *oracle_o = json_object_get(json, "text");
if (oracle_o != NULL) {
ASSERT(json_is_string(oracle_o));
ASSERT(card->oracle_text = strdup(json_string_value(oracle_o)));
ASSERT(card->oracle_text_lower = mse_to_lower(card->oracle_text));
} else {
oracle_o = json_object_get(json, "originalText");

if (oracle_o != NULL) {
ASSERT(json_is_string(oracle_o));
ASSERT(card->oracle_text = strdup(json_string_value(oracle_o)));
ASSERT(card->oracle_text_lower = mse_to_lower(card->oracle_text));
}
}

if (card->oracle_text != NULL) {
char *normalised = mse_normalise_utf8(card->oracle_text);
ASSERT(normalised != NULL);
free(card->oracle_text);
card->oracle_text= normalised;

ASSERT(card->oracle_text_lower = mse_to_lower(card->oracle_text));
}

// Read mana cost
json_t *mana_cost_o = json_object_get(json, "manaCost");
if (mana_cost_o != NULL) {
Expand All @@ -99,7 +111,8 @@ int mse_parse_card_json(json_t *json, mse_card_t *card)
ASSERT(card->types = malloc(sizeof(*card->types)));
}

ASSERT(card->types = realloc(card->types, sizeof(*card->types) * (1 + card->types_count)));
ASSERT(card->types = realloc(card->types, sizeof(*card->types) *
(1 + card->types_count)));

char *tmp;
ASSERT(tmp = strdup(json_string_value(value)));
Expand All @@ -118,7 +131,8 @@ int mse_parse_card_json(json_t *json, mse_card_t *card)
ASSERT(card->types = malloc(sizeof(*card->types)));
}

ASSERT(card->types = realloc(card->types, sizeof(*card->types) * (1 + card->types_count)));
ASSERT(card->types = realloc(card->types, sizeof(*card->types) *
(1 + card->types_count)));

char *tmp;
ASSERT(tmp = strdup(json_string_value(value)));
Expand All @@ -137,7 +151,8 @@ int mse_parse_card_json(json_t *json, mse_card_t *card)
ASSERT(card->types = malloc(sizeof(*card->types)));
}

ASSERT(card->types = realloc(card->types, sizeof(*card->types) * (1 + card->types_count)));
ASSERT(card->types = realloc(card->types, sizeof(*card->types) *
(1 + card->types_count)));

char *tmp;
ASSERT(tmp = strdup(json_string_value(value)));
Expand Down Expand Up @@ -196,17 +211,20 @@ int mse_parse_card_json(json_t *json, mse_card_t *card)
ASSERT(card->set_codes = malloc(sizeof(*card->set_codes)));
}

ASSERT(card->set_codes = realloc(card->set_codes,
sizeof(*card->set_codes) * (1 + card->set_codes_count)));
ASSERT(card->set_codes =
realloc(card->set_codes,
sizeof(*card->set_codes) * (1 + card->set_codes_count)));

ASSERT(mse_get_set_code(json_string_value(value), &card->set_codes[card->set_codes_count]));
ASSERT(mse_get_set_code(json_string_value(value),
&card->set_codes[card->set_codes_count]));
card->set_codes_count++;
}

ASSERT(card->name_lower = mse_to_lower(card->name));

// Read the format legalities
ASSERT(mse_card_formats_legalities_t_from_json(json, &card->format_legalities));
ASSERT(
mse_card_formats_legalities_t_from_json(json, &card->format_legalities));
return 1;
}

Expand Down Expand Up @@ -262,7 +280,8 @@ int mse_read_card(FILE *f, mse_card_t *card)
card->colour_identity = tmp;

ASSERT(mse_read_size_t(f, &card->set_codes_count));
ASSERT(card->set_codes = malloc(sizeof(*card->set_codes) * card->set_codes_count));
ASSERT(card->set_codes =
malloc(sizeof(*card->set_codes) * card->set_codes_count));
for (size_t i = 0; i < card->set_codes_count; i++) {
ASSERT(mse_read_set_code(f, &card->set_codes[i]));
}
Expand Down Expand Up @@ -318,32 +337,32 @@ void mse_free_card(mse_card_t *card)
memset(card, 0, sizeof(*card));
}

int mse_avl_cmp_card(void * restrict a, void * restrict b)
int mse_avl_cmp_card(void *restrict a, void *restrict b)
{
mse_card_t *ca = (mse_card_t *) a;
mse_card_t *cb = (mse_card_t *) b;
mse_card_t *ca = (mse_card_t *)a;
mse_card_t *cb = (mse_card_t *)b;
return mse_uuid_cmp(ca->id, cb->id);
}

int mse_avl_cmp_card_name(void * restrict a, void * restrict b)
int mse_avl_cmp_card_name(void *restrict a, void *restrict b)
{
mse_card_t *ca = (mse_card_t *) a;
mse_card_t *cb = (mse_card_t *) b;
mse_card_t *ca = (mse_card_t *)a;
mse_card_t *cb = (mse_card_t *)b;
return strcmp(ca->name, cb->name);
}

#define MSE_CARD_DOUBLE_AVL_CMP(a, b, field) \
mse_card_t *ca = (mse_card_t *) a; \
mse_card_t *cb = (mse_card_t *) b; \
if ((int) ca->field == (int) cb->field) { \
return mse_avl_cmp_card(a, b); \
} \
double cmp = ca->field - cb->field; \
if (cmp < 0) { \
return -1; \
} else { \
return 1; \
} \
#define MSE_CARD_DOUBLE_AVL_CMP(a, b, field) \
mse_card_t *ca = (mse_card_t *)a; \
mse_card_t *cb = (mse_card_t *)b; \
if ((int)ca->field == (int)cb->field) { \
return mse_avl_cmp_card(a, b); \
} \
double cmp = ca->field - cb->field; \
if (cmp < 0) { \
return -1; \
} else { \
return 1; \
}

int mse_avl_cmp_card_power(void *a, void *b)
{
Expand Down
9 changes: 7 additions & 2 deletions backend/mse/query_parser.peg
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "mse/interpretor.h"
#include "mse/query_parser.h"
#include "mse_query_parser.h"
#include "mse/utf8_normalisation.h"
}

%source{
Expand Down Expand Up @@ -152,12 +153,15 @@ REGEX_STRING <- '/' ( '\/' / [^/] )+ '/'
%%
int mse_parse_input_string(const char* input_string, mse_interp_node_t **root)
{
char *normalised = mse_normalise_utf8(input_string);
ASSERT(normalised != NULL);

*root = NULL;

mse_parser_status_t ret;
memset(&ret, 0, sizeof(ret));
ret.input_buffer = input_string;
ret.input_buffer_len = strlen(input_string);
ret.input_buffer_len = strlen(normalised);

mse_context_t *ctx = mse_create(&ret);
while (ret.error == 0 && mse_parse(ctx, NULL));
Expand All @@ -167,8 +171,9 @@ int mse_parse_input_string(const char* input_string, mse_interp_node_t **root)
__mse_free_parser_status(&ret);

if (ret.error) {
lprintf(LOG_ERROR, "Cannot parse '%s'\n", input_string);
lprintf(LOG_ERROR, "Cannot parse '%s'\n", normalised);
}
free(normalised);

*root = ret.root;
ASSERT(root != NULL);
Expand Down
10 changes: 10 additions & 0 deletions backend/mse/utf8_normalisation.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include "./utf8_normalisation.h"
#include <utf8proc.h>
#include <string.h>

char *mse_normalise_utf8(const char *utf8)
{
utf8proc_uint8_t *fold_str = NULL;
utf8proc_map((unsigned char *) utf8, 0, &fold_str, UTF8PROC_NULLTERM | UTF8PROC_STRIPMARK | UTF8PROC_DECOMPOSE);
return (char *) fold_str;
}
3 changes: 3 additions & 0 deletions backend/mse/utf8_normalisation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once

// Returns a normalised copy of the NUL-terminated UTF-8 string utf8 (decomposed
// with combining marks stripped). The result is heap allocated and must be
// released by the caller with free(); callers must check for a NULL result,
// which signals that normalisation failed.
char * mse_normalise_utf8(const char *utf8);
2 changes: 2 additions & 0 deletions backend/tests/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "./test_re2.h"
#include "./test_formats.h"
#include "./test_levenshtein_difference.h"
#include "./test_utf8_normalisation.h"

static int sanity_test()
{
Expand All @@ -29,6 +30,7 @@ static int sanity_test()
SUB_TEST(tests, {&sanity_test, "Sanity Test"},
{&test_re2, "Test re2 wrapper"},
{&test_levenshtein_difference, "Test Levenshtein difference"},
{&test_utf8_normalisation, "Test UTF8 normalisation"},
/* Test common funcs and types */
{&test_uuid, "Test UUID"},
{&test_io_utils, "Test IO utils"},
Expand Down
37 changes: 37 additions & 0 deletions backend/tests/test_utf8_normalisation.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include "./test_utf8_normalisation.h"
#include "../testing_h/testing.h"
#include "../mse/utf8_normalisation.h"
#include <stdlib.h>
#include <string.h>

#define TEST_STR_NORMAL "Testing 123."

// Plain ASCII has nothing to decompose or strip, so it must come back unchanged.
// Returns 1 on success.
static int test_utf8_normalisation_normal_input()
{
    char *normalised = mse_normalise_utf8(TEST_STR_NORMAL);
    ASSERT(normalised != NULL);

    // Compare and free before asserting, so the buffer is released even if
    // ASSERT leaves the function early on a mismatch.
    int unchanged = strcmp(TEST_STR_NORMAL, normalised) == 0;
    free(normalised);

    ASSERT(unchanged);
    return 1;
}

// A non-ASCII letter that is not an accented form ("ß") is expected to pass
// through normalisation untouched. Returns 1 on success.
static int test_utf8_normalisation_special_input_1()
{
    char *normalised = mse_normalise_utf8("Mein Code ist sheiße");
    ASSERT(normalised != NULL);

    // Compare and free before asserting, so the buffer is released even if
    // ASSERT leaves the function early on a mismatch.
    int unchanged = strcmp("Mein Code ist sheiße", normalised) == 0;
    free(normalised);

    ASSERT(unchanged);
    return 1;
}

// An accented letter must lose its diacritic: "Ç" is expected to become "C"
// while the rest of the string is left alone. Returns 1 on success.
static int test_utf8_normalisation_special_input_2()
{
    char *normalised = mse_normalise_utf8("Ça-va?");
    ASSERT(normalised != NULL);

    // Compare and free before asserting, so the buffer is released even if
    // ASSERT leaves the function early on a mismatch.
    int stripped = strcmp("Ca-va?", normalised) == 0;
    free(normalised);

    ASSERT(stripped);
    return 1;
}

// Groups the three normalisation cases under the name test_utf8_normalisation,
// pairing each test function with the label used to identify it.
// NOTE(review): SUB_TEST comes from testing.h and presumably defines the
// test_utf8_normalisation() entry point declared in the header - confirm.
SUB_TEST(test_utf8_normalisation, {&test_utf8_normalisation_normal_input, "Test normalisation with normal input"},
{&test_utf8_normalisation_special_input_1, "Test normalisation with special input 1"},
{&test_utf8_normalisation_special_input_2, "Test normalisation with special input 2"})
3 changes: 3 additions & 0 deletions backend/tests/test_utf8_normalisation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once

// Entry point for the UTF-8 normalisation test group; registered with the
// top-level test list in tests/main.c.
int test_utf8_normalisation();
1 change: 1 addition & 0 deletions backend/utf8proc
Submodule utf8proc added at a1b99d
Loading