From 52f8cc8f37fb3553e3401f6a1b7ec507f94cb50b Mon Sep 17 00:00:00 2001
From: opparco <parco.opaai@gmail.com>
Date: Wed, 30 Aug 2023 11:28:19 +0900
Subject: [PATCH] llama : fix bpe tokenize from byte

---
 llama.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index fcd6f276a0655..6c60e5a9be3d7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3319,9 +3319,15 @@ struct llm_tokenizer_bpe {
                         std::string byte_str(1, *j);
                         auto token_multibyte = vocab.token_to_id.find(byte_str);
                         if (token_multibyte == vocab.token_to_id.end()) {
-                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                            try {
+                                llama_token token_byte = llama_byte_to_token(vocab, *j);
+                                output.push_back(token_byte);
+                            } catch (const std::out_of_range & err) {
+                                fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                            }
+                        } else {
+                            output.push_back((*token_multibyte).second);
                         }
-                        output.push_back((*token_multibyte).second);
                     }
                 } else {
                     output.push_back((*token).second);