From 9e79431f1e41677c9fd921e2facb8dbf284eacdc Mon Sep 17 00:00:00 2001 From: Frank Liu Date: Mon, 23 Oct 2023 07:30:48 -0700 Subject: [PATCH] [tokenizers] Upgrade huggingface tokenizers to 0.14.1 (#2818) --- .github/workflows/native_s3_huggingface.yml | 4 ++-- extensions/tokenizers/build.cmd | 2 +- extensions/tokenizers/build.sh | 2 +- extensions/tokenizers/rust/Cargo.toml | 2 +- extensions/tokenizers/rust/src/lib.rs | 8 ++++++-- gradle.properties | 2 +- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/native_s3_huggingface.yml b/.github/workflows/native_s3_huggingface.yml index d9ce2d29197..e5a3d57fe42 100644 --- a/.github/workflows/native_s3_huggingface.yml +++ b/.github/workflows/native_s3_huggingface.yml @@ -45,7 +45,7 @@ jobs: - name: Install Environment run: | yum -y update - yum -y install centos-release-scl-rh epel-release + yum -y install centos-release-scl-rh epel-release perl-core yum -y install devtoolset-7 git patch cmake3 libstdc++-static ln -s /usr/bin/cmake3 /usr/bin/cmake curl https://sh.rustup.rs -sSf | sh -s -- -y @@ -184,7 +184,7 @@ jobs: - name: Install Environment run: | yum -y update - yum -y install centos-release-scl-rh epel-release + yum -y install centos-release-scl-rh epel-release perl-core yum -y install devtoolset-7 git patch cmake3 libstdc++-static ln -s /usr/bin/cmake3 /usr/bin/cmake curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/extensions/tokenizers/build.cmd b/extensions/tokenizers/build.cmd index 3a481d33bab..d83f2c1ed74 100644 --- a/extensions/tokenizers/build.cmd +++ b/extensions/tokenizers/build.cmd @@ -3,7 +3,7 @@ @rem choco install rust -y @rem choco install jdk8 -y -set VERSION=python-v"%1" +set VERSION=v"%1" if exist "tokenizers" ( echo Found "tokenizers" diff --git a/extensions/tokenizers/build.sh b/extensions/tokenizers/build.sh index 4ba45a09965..229e8124914 100755 --- a/extensions/tokenizers/build.sh +++ b/extensions/tokenizers/build.sh @@ -10,7 +10,7 @@ elif [[ -n $(command 
-v sysctl) ]]; then fi PLATFORM=$(uname | tr '[:upper:]' '[:lower:]') -VERSION=python-v$1 +VERSION=v$1 ARCH=$2 pushd $WORK_DIR diff --git a/extensions/tokenizers/rust/Cargo.toml b/extensions/tokenizers/rust/Cargo.toml index f6b846f636c..3418c8f5129 100644 --- a/extensions/tokenizers/rust/Cargo.toml +++ b/extensions/tokenizers/rust/Cargo.toml @@ -6,7 +6,7 @@ edition = "2018" [dependencies] jni = "0.19.0" -tokenizers = { path = "../tokenizers/tokenizers", version = "*" } +tokenizers = { path = "../tokenizers/tokenizers", version = "*", features = ["http"] } [target.'cfg(target_os = "linux")'.dependencies] openssl = { version = "0.10", features = ["vendored"] } diff --git a/extensions/tokenizers/rust/src/lib.rs b/extensions/tokenizers/rust/src/lib.rs index d1c0c455c19..590099c2ecf 100644 --- a/extensions/tokenizers/rust/src/lib.rs +++ b/extensions/tokenizers/rust/src/lib.rs @@ -490,7 +490,7 @@ pub extern "system" fn Java_ai_djl_huggingface_tokenizers_jni_TokenizersLibrary_ } } let decoding: String = tokenizer - .decode(decode_ids, skip_special_tokens == JNI_TRUE) + .decode(&*decode_ids, skip_special_tokens == JNI_TRUE) .unwrap(); let ret = env .new_string(decoding) @@ -527,8 +527,12 @@ pub extern "system" fn Java_ai_djl_huggingface_tokenizers_jni_TokenizersLibrary_ } batch_decode_input.push(decode_ids); } + let mut references: Vec<&[u32]> = Vec::new(); + for reference in batch_decode_input.iter() { + references.push(reference); + } let decoding: Vec<String> = tokenizer - .decode_batch(batch_decode_input, skip_special_tokens == JNI_TRUE) + .decode_batch(&references, skip_special_tokens == JNI_TRUE) .unwrap(); let ret: jobjectArray = env .new_object_array(batch_len, "java/lang/String", JObject::null()) diff --git a/gradle.properties b/gradle.properties index 4740f67c285..0ef8f6e991b 100644 --- a/gradle.properties +++ b/gradle.properties @@ -20,7 +20,7 @@ trt_version=8.4.1 onnxruntime_version=1.16.0 paddlepaddle_version=2.3.2 sentencepiece_version=0.1.97 
-tokenizers_version=0.13.3 +tokenizers_version=0.14.1 fasttext_version=0.9.2 xgboost_version=1.7.5 lightgbm_version=3.2.110