Add tokenizer test + revert to C++11 #355

Merged: 2 commits, Mar 21, 2023
3 changes: 3 additions & 0 deletions .github/workflows/build.yml
@@ -54,6 +54,7 @@ jobs:
cd build
cmake ..
cmake --build . --config Release
ctest --output-on-failure

macOS-latest-make:
runs-on: macos-latest
@@ -90,6 +91,7 @@ jobs:
cd build
cmake ..
cmake --build . --config Release
ctest --output-on-failure

windows-latest-cmake:
runs-on: windows-latest
@@ -106,6 +108,7 @@ jobs:
cd build
cmake ..
cmake --build . --config Release
ctest --output-on-failure

- name: Get commit hash
id: commit
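All three CI jobs above now run the test suite with `ctest --output-on-failure` immediately after the CMake build. A minimal local equivalent of those steps, sketched in Python (not part of this PR; it assumes `cmake` and `ctest` are installed and on PATH):

```python
# Sketch only: mirrors the CI sequence added above (configure, build, ctest).
# Assumes cmake and ctest are available on PATH; not part of this PR.
import pathlib
import subprocess

build_dir = pathlib.Path("build")
build_dir.mkdir(exist_ok=True)

for cmd in (
    ["cmake", ".."],
    ["cmake", "--build", ".", "--config", "Release"],
    ["ctest", "--output-on-failure"],
):
    subprocess.run(cmd, cwd=build_dir, check=True)  # check=True stops at the first failing step
```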
58 changes: 51 additions & 7 deletions CMakeLists.txt
@@ -1,11 +1,37 @@
cmake_minimum_required(VERSION 3.12)
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
project("llama.cpp" C CXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON)

# configure project version
# TODO
else()
set(LLAMA_STANDALONE OFF)
endif()

if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)

option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
else()
set(BUILD_SHARED_LIBS_DEFAULT ON)
endif()
endif()


#
# Option list
#
@@ -34,6 +60,9 @@ option(LLAMA_FMA "llama: enable FMA"
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})

#
# Compile flags
#
@@ -187,17 +216,19 @@ add_executable(llama main.cpp)

add_executable(quantize quantize.cpp)

add_library(ggml OBJECT
ggml.c
ggml.h)

add_library(utils OBJECT
utils.cpp
utils.h)

target_include_directories(utils PUBLIC .)
target_compile_features(utils PUBLIC cxx_std_11) # don't bump

add_library(ggml OBJECT
ggml.c
ggml.h)

target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11)
target_compile_features(utils PUBLIC cxx_std_17)
target_compile_features(ggml PUBLIC c_std_11) # don't bump

#
# Linking
@@ -206,3 +237,16 @@ target_compile_features(utils PUBLIC cxx_std_17)
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
target_link_libraries(llama PRIVATE ggml utils)
target_link_libraries(quantize PRIVATE ggml utils)

#
# programs, examples and tests
#

if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
enable_testing()
add_subdirectory(tests)
endif ()

#if (LLAMA_BUILD_EXAMPLES)
# add_subdirectory(examples)
#endif()
3 changes: 2 additions & 1 deletion Makefile
@@ -30,8 +30,9 @@ endif
# Compile flags
#

# keep standard at C11 and C++11
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =

# OS specific
28 changes: 25 additions & 3 deletions convert-pth-to-ggml.py
@@ -10,26 +10,26 @@
# - Name (char[name_length])
# - Data (float[n_dims])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "use-f32" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#

import argparse
import os
import sys
import json
import struct
import numpy as np
import torch

from sentencepiece import SentencePieceProcessor

def parse_args():

parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
parser.add_argument('vocab_only', type=bool, default=False, help='only write vocab to file')
return parser.parse_args()

def get_n_parts(dim):
@@ -134,6 +134,27 @@ def main():
ftype_str = ["f32", "f16"]

hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

# if only writing vocab to file
if args.vocab_only:

fname_model = f"{dir_model}/consolidated.00.pth"
fname_out = f"{dir_model}/ggml-vocab.bin"

print(f"Extracting only the vocab from '{fname_model}'\n")

model = torch.load(fname_model, map_location="cpu")

with open(fname_out, "wb") as fout:
fout.write(struct.pack("i", hparams["vocab_size"]))
write_tokens(fout, tokenizer)

del model

print(f"Done. Output file: {fname_out}\n")

return

n_parts = get_n_parts(hparams["dim"])

for p in range(n_parts):
@@ -151,6 +172,7 @@ def main():
process_and_write_variables(fout, model, ftype)

del model

print(f"Done. Output file: {fname_out}, (part {p})\n")

if __name__ == "__main__":
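The `vocab_only` branch added above writes `models/ggml-vocab.bin` as a leading 32-bit `vocab_size` followed by whatever token records `write_tokens()` emits. A minimal sketch (not part of this PR) that reads back only that leading field; the per-token layout is deliberately left unparsed here:

```python
# Sketch only: read the leading int32 that the vocab_only branch writes with
# struct.pack("i", hparams["vocab_size"]). The token records appended by
# write_tokens() are not parsed. Not part of this PR.
import struct
import sys

def read_vocab_size(path: str) -> int:
    with open(path, "rb") as fin:
        (vocab_size,) = struct.unpack("i", fin.read(4))
    return vocab_size

if __name__ == "__main__":
    path = sys.argv[1] if len(sys.argv) > 1 else "models/ggml-vocab.bin"
    print(f"{path}: vocab_size = {read_vocab_size(path)}")
```

For the LLaMA vocabulary this should print 32000, which is the count the new tokenizer test also asserts.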
28 changes: 14 additions & 14 deletions main.cpp
@@ -90,7 +90,7 @@ struct llama_model {
};

// load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
bool llama_model_load(const std::string & fname, llama_model & model, llama_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

std::vector<char> f_buf(1024*1024);
@@ -544,9 +544,9 @@ bool llama_eval(
const llama_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const std::vector<llama_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();

const auto & hparams = model.hparams;
@@ -832,7 +832,7 @@ int main(int argc, char ** argv) {

int64_t t_load_us = 0;

gpt_vocab vocab;
llama_vocab vocab;
llama_model model;

// load the model
@@ -864,13 +864,13 @@ int main(int argc, char ** argv) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<llama_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);

params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

// prefix & suffix for instruct mode
const std::vector<gpt_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
const std::vector<gpt_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
const std::vector<llama_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
const std::vector<llama_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);

// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
@@ -879,8 +879,8 @@ int main(int argc, char ** argv) {
}

// tokenize the reverse prompt
std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
std::vector<std::vector<llama_vocab::id>> antipromptv_inp;

for (auto antiprompt : params.antiprompt) {
antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
}
@@ -925,14 +925,14 @@ int main(int argc, char ** argv) {
fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "\n\n");

std::vector<gpt_vocab::id> embd;
std::vector<llama_vocab::id> embd;

// determine the required inference memory per token:
size_t mem_per_token = 0;
llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);

int last_n_size = params.repeat_last_n;
std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
std::vector<llama_vocab::id> last_n_tokens(last_n_size);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);

if (params.interactive) {
@@ -980,7 +980,7 @@

const int n_vocab = model.hparams.n_vocab;

gpt_vocab::id id = 0;
llama_vocab::id id = 0;

{
const int64_t t_start_sample_us = ggml_time_us();
@@ -1066,7 +1066,7 @@ int main(int argc, char ** argv) {
} while (another_line);
if (params.use_color) printf(ANSI_COLOR_RESET);

std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
std::vector<llama_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

if (params.instruct) {
Binary file added models/ggml-vocab.bin
Binary file not shown.
2 changes: 1 addition & 1 deletion quantize.cpp
@@ -44,7 +44,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
return false;
}

gpt_vocab vocab;
llama_vocab vocab;

printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

4 changes: 4 additions & 0 deletions tests/CMakeLists.txt
@@ -0,0 +1,4 @@
set(TEST_TARGET test-tokenizer-0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE utils)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
69 changes: 69 additions & 0 deletions tests/test-tokenizer-0.cpp
@@ -0,0 +1,69 @@
#include "utils.h"

#include <cstdio>
#include <string>
#include <map>

static const std::map<std::string, std::vector<llama_vocab::id>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
{ " Hello World", { 1, 15043, 2787, }, },
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
};

int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}

const std::string fname = argv[1];

fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

llama_vocab vocab;

if (!llama_vocab_load(fname, vocab)) {
fprintf(stderr, "%s : failed to load vocab from: '%s'\n", __func__, fname.c_str());
return 1;
}

const int n_vocab = vocab.id_to_token.size();

if (n_vocab != 32000) {
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
return 2;
}

for (const auto & test_kv : k_tests) {
const auto res = llama_tokenize(vocab, test_kv.first, true);

bool correct = res.size() == test_kv.second.size();

for (int i = 0; i < (int) res.size() && correct; ++i) {
if (res[i] != test_kv.second[i]) {
correct = false;
}
}

if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");

return 3;
}
}

return 0;
}
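The expected sequences in `k_tests` follow `llama_tokenize(vocab, text, true)`, i.e. a leading BOS id of 1 followed by the token ids. A hedged helper (not part of this PR) for drafting candidate expectations for new entries from the original SentencePiece model; the `tokenizer.model` path is an assumption, and because llama.cpp's tokenizer is an independent implementation, any ids produced this way should still be verified by running `test-tokenizer-0` rather than copied blindly:

```python
# Sketch only: draft candidate k_tests entries from the original SentencePiece
# model. The model path is an assumption, and test-tokenizer-0 remains the
# reference check for llama.cpp's own tokenizer. Not part of this PR.
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor("models/tokenizer.model")  # assumed location

for text in ["Hello World", " Hello World!"]:
    ids = [1] + sp.encode(text)  # llama_tokenize(vocab, text, true) prepends BOS id 1
    formatted = ", ".join(str(i) for i in ids)
    print(f'{{ "{text}", {{ {formatted}, }}, }},')
```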