sentencepiece bpe compatible tokenizer #252

Merged
merged 1 commit on Mar 20, 2023
2 changes: 1 addition & 1 deletion Makefile
@@ -31,7 +31,7 @@ endif
#

CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS =

# OS specific
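The bump from -std=c++11 to -std=c++17 is presumably needed because the rewritten tokenizer in utils.cpp below uses std::string_view, a C++17 feature; a minimal sketch (not part of this PR) of the non-owning slicing it enables:

#include <cstdio>
#include <string_view>

int main() {
    std::string_view text = "hello";  // non-owning view over the characters, available only from C++17
    text.remove_prefix(1);            // cheap slicing without copying, as used in llama_tokenizer::tokenize below
    printf("%.*s\n", (int) text.size(), text.data());  // prints "ello"
    return 0;
}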
3 changes: 3 additions & 0 deletions README.md
@@ -11,6 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105

**TEMPORARY NOTICE:**
If you're updating to the latest master, you will need to regenerate your model files as the format has changed.

## Description

The main goal is to run the model using 4-bit quantization on a MacBook
4 changes: 3 additions & 1 deletion convert-pth-to-ggml.py
@@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):

keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
0x67676d6c, # magic: ggml in hex
0x67676d66, # magic: ggmf in hex
1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
ftype
@@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer):
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))

def process_and_write_variables(fout, model, ftype):

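Pulling the converter changes together, here is a sketch (not part of the diff) of the file layout it now writes; the struct and field names are illustrative only, derived from the hparams keys above and the loader changes in main.cpp below:

#include <cstdint>

// header of the new versioned model file, written field by field by write_header()
struct ggmf_header {
    uint32_t magic;    // 0x67676d66, ASCII "ggmf"; old files started with 0x67676d6c, "ggml"
    uint32_t version;  // 1
    int32_t  n_vocab;  // hparams["vocab_size"]
    int32_t  n_embd;   // hparams["dim"]
    int32_t  n_mult;   // hparams["multiple_of"]
    int32_t  n_head;   // hparams["n_heads"]
    int32_t  n_layer;  // hparams["n_layers"]
    int32_t  n_rot;    // dim / n_heads (kept for compatibility, marked obsolete)
    int32_t  ftype;    // float type flag passed to the converter
};

// write_tokens() then emits n_vocab records, each:
//   int32_t len;        byte length of the token text
//   char    text[len];  UTF-8 token text
//   float   score;      sentencepiece score, new in this version
// followed by the model tensors as before.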
21 changes: 20 additions & 1 deletion main.cpp
@@ -3,6 +3,7 @@
#include "utils.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
@@ -105,10 +106,24 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
if (magic == 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
__func__, fname.c_str());
return false;
}
if (magic != 0x67676d66) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
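// for reference: 0x67676d6c is ASCII "ggml" (the old, unversioned format), 0x67676d66 is "ggmf" (the new magic, followed by a format version field)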

uint32_t format_version;
fin.read((char *) &format_version, sizeof(format_version));

if (format_version != 1) {
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
__func__, fname.c_str(), format_version);
return false;
}
}

int n_ff = 0;
@@ -154,8 +169,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
word.resize(len);
fin.read((char *) word.data(), len);

float score;
fin.read((char *) &score, sizeof(score));

vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
vocab.score[i] = score;

//if (i < 30000) {
// fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
24 changes: 23 additions & 1 deletion quantize.cpp
@@ -3,6 +3,7 @@
#include "utils.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
@@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
if (magic == 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
__func__, fname_inp.c_str());
return false;
}
if (magic != 0x67676d66) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}

fout.write((char *) &magic, sizeof(magic));

uint32_t format_version;
finp.read((char *) &format_version, sizeof(format_version));

if (format_version != 1) {
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
__func__, fname_inp.c_str(), format_version);
return false;
}

Collaborator comment on the check above:
Suggestion: move the format_version to a shared header file of some sort, and then say (unsupported version 2, expected 1)

fout.write((char *) &format_version, sizeof(format_version));
}

llama_hparams hparams;
@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
finp.read ((char *) word.data(), len);
fout.write((char *) word.data(), len);

float score;
finp.read ((char *) &score, sizeof(score));
fout.write((char *) &score, sizeof(score));

vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
vocab.score[i] = score;
}
}

171 changes: 130 additions & 41 deletions utils.cpp
@@ -6,6 +6,7 @@
#include <regex>
#include <iostream>
#include <iterator>
#include <queue>
#include <string>
#include <math.h>

@@ -294,58 +295,146 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
return tokens;
}

// TODO: Calculate this constant from the vocabulary
#define MAX_TOKEN_LEN 18
// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
std::vector<gpt_vocab::id> res;
std::vector<int> score;
std::vector<gpt_vocab::id> prev;
int len = text.length();

score.resize(len + 1);
prev.resize(len + 1);

// Forward pass
for (int i = 0; i < len; i++) {
int max_len = std::min(len - i, MAX_TOKEN_LEN);
for (int sub_len = 1; sub_len <= max_len; sub_len++) {
auto sub = text.substr(i, sub_len);
auto token = vocab.token_to_id.find(sub);
if (token != vocab.token_to_id.end()) {
int token_score = sub.length() * sub.length();
int local_score = score[i] + token_score;
int next = i + sub_len;
if (score[next] < local_score) {
score[next] = local_score;
prev[next] = (*token).second;
static size_t utf8_len(char src) {
// sequence length of a UTF-8 character, from the high 4 bits of its leading byte:
// 0x0-0xB -> 1 (ASCII or continuation bytes), 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];
}

struct llama_sp_symbol {
using index = int;
index prev;
index next;
std::string_view text;
};

struct llama_sp_bigram {
struct comparator {
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
return (l.score < r.score) || (l.score == r.score && l.left > r.left);
}
};
using queue_storage = std::vector<llama_sp_bigram>;
using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
llama_sp_symbol::index left;
llama_sp_symbol::index right;
float score;
size_t size;
};

struct llama_tokenizer {
llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {}

void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) {
// split string into utf8 chars
int index = 0;
while (!text.empty()) {
llama_sp_symbol sym;
size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
sym.text = std::string_view(text.data(), char_len);
sym.prev = index - 1;
text.remove_prefix(char_len);
sym.next = text.empty() ? -1 : index + 1;
index++;
symbols_.emplace_back(std::move(sym));
}

// seed the work queue with all possible 2-character tokens.
for (size_t i = 1; i < symbols_.size(); ++i) {
try_add_bigram(i - 1, i);
}

// keep substituting the highest scoring pairs for as long as we can.
while (!work_queue_.empty()) {
auto bigram = work_queue_.top();
work_queue_.pop();

auto & left_sym = symbols_[bigram.left];
auto & right_sym = symbols_[bigram.right];

// if one of the symbols already got merged, skip it.
if (left_sym.text.empty() || right_sym.text.empty() ||
left_sym.text.size() + right_sym.text.size() != bigram.size) {
continue;
}

// merge the right sym into the left one
left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
right_sym.text = std::string_view("");

// remove the right sym from the chain
left_sym.next = right_sym.next;
if (right_sym.next >= 0) {
symbols_[right_sym.next].prev = bigram.left;
}

// find more substitutions
try_add_bigram(left_sym.prev, bigram.left);
try_add_bigram(bigram.left, left_sym.next);
}

for (int i = 0; i != -1; i = symbols_[i].next) {
auto& symbol = symbols_[i];
auto token = vocab_.token_to_id.find(std::string(symbol.text));

if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < symbol.text.size(); ++j) {
gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
output.push_back(token_id);
}
} else {
output.push_back((*token).second);
}
}
}

// Backward pass
int i = len;
while (i > 0) {
gpt_vocab::id token_id = prev[i];
if (token_id == 0) {
// TODO: Return error or something more meaningful
printf("failed to tokenize string!\n");
break;
private:
void try_add_bigram(int left, int right) {
if (left == -1 || right == -1) {
return;
}

std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
auto token = vocab_.token_to_id.find(std::string(text));

if (token == vocab_.token_to_id.end()) {
return;
}
res.push_back(token_id);
auto token = (*vocab.id_to_token.find(token_id)).second;
i -= token.length();

auto score = vocab_.score.find((*token).second);

if (score == vocab_.score.end()) {
return;
}

llama_sp_bigram bigram;
bigram.left = left;
bigram.right = right;
bigram.score = (*score).second;
bigram.size = text.size();
work_queue_.push(bigram);
}

if (bos) {
res.push_back(1); // TODO: replace with vocab.bos
const gpt_vocab & vocab_;
std::vector<llama_sp_symbol> symbols_;
llama_sp_bigram::queue work_queue_;
};

std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) {
llama_tokenizer tokenizer(vocab);
std::vector<gpt_vocab::id> output;

if (text.size() == 0) {
return output;
}

// Pieces are in reverse order so correct that
std::reverse(res.begin(), res.end());
if (bos) {
output.push_back(1);
}

return res;
tokenizer.tokenize(text, output);
return output;
}

bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
3 changes: 2 additions & 1 deletion utils.h
@@ -58,6 +58,7 @@ struct gpt_vocab {

std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::map<id, float> score;
};

void replace(std::string & str, const std::string & needle, const std::string & replacement);
@@ -79,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri

// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
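A minimal usage sketch of the new llama_tokenize entry point (not part of this PR); the toy vocabulary and its scores are made up for illustration, and in a real run they come from llama_model_load reading the regenerated model file:

#include "utils.h"

#include <cstdio>

int main() {
    // toy vocabulary; in practice token_to_id / id_to_token / score are filled by
    // llama_model_load() from the model file
    gpt_vocab vocab;
    const char * toks[] = { "<unk>", "<bos>", "<eos>", "h", "e", "l", "o", "he", "ll", "hello" };
    for (int i = 0; i < 10; i++) {
        vocab.token_to_id[toks[i]] = i;
        vocab.id_to_token[i] = toks[i];
        vocab.score[i] = (float) i;  // longer pieces get higher scores, purely for the demo
    }

    // bos = true prepends the hard-coded id 1
    std::vector<gpt_vocab::id> ids = llama_tokenize(vocab, "hello", true);
    for (gpt_vocab::id id : ids) {
        printf("%d -> '%s'\n", id, vocab.id_to_token.at(id).c_str());
    }
    return 0;
}

With these toy scores the merge loop emits "he", "ll", "o" rather than the full word, because the intermediate pieces "hell" and "llo" are not in the vocabulary; real sentencepiece scores from the model file drive the same machinery.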