CUDA status beautify (ggml-org#20)
* CUDA:
1) Changed the tensor split calculation to work on free VRAM instead of total VRAM (the main GPU often has a few GB consumed by desktop work)
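A minimal sketch of the idea (not the rendered ggml-cuda.cu change, which is not shown below): default split proportions are derived from each device's free VRAM rather than its total VRAM. The function and variable names here are illustrative only.

#include <stddef.h> // size_t

// Sketch: derive default tensor split proportions from free VRAM per device.
// device_vram_free[] is assumed to be filled by a prior cudaMemGetInfo() pass.
static void tensor_split_from_free_vram(const size_t * device_vram_free,
                                        int num_devices,
                                        float * tensor_split /* out */) {
    size_t total_free = 0;
    for (int i = 0; i < num_devices; ++i) {
        total_free += device_vram_free[i];
    }
    for (int i = 0; i < num_devices; ++i) {
        // each device gets a share proportional to what it can actually hold
        tensor_split[i] = total_free ? (float) device_vram_free[i] / (float) total_free : 0.0f;
    }
}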

Changed the loading progress callback (a minimal usage sketch of the new callback follows the Todo below):
1) Added a numerical percentage to it
2) Added a status message that displays what it is doing right now ("Loading tensor (CPU)")
3) Normalized the length of the bar and redesigned it from a row of dots into an actual progress bar
Example:
[============================================>-----]  88%  Loading tensor (GPU-Split)
Todo:
The progress bar could be expanded to 3-4 lines so that it displays important flags and changes alongside the progress.
This would replace the list of log entries printed before the progress starts counting; earlier log lines would then only be printed for errors and warnings.
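As a usage-side sketch, the new callback signature (float progress, void * ctx, char * status) from libfalcon.h can be hooked up like this. The falcon_context_default_params() helper name is an assumption; only the progress_callback / progress_callback_user_data fields are taken from this diff.

#include <stdio.h>

static void my_progress(float progress, void * ctx, char * status) {
    (void) ctx;
    fprintf(stderr, "\rloaded %3u%%  %-30s", (unsigned) (100 * progress), status);
    if (progress >= 1.0f) fprintf(stderr, "\n");
    fflush(stderr);
}

/* when setting up the context (default-params helper name assumed):
   struct falcon_context_params params = falcon_context_default_params();
   params.progress_callback           = my_progress;
   params.progress_callback_user_data = NULL;                              */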

* 1) Added a tensor split prepare wrapper; this allows the tensor split given by -ts to be applied immediately

2) Created a CUDA status struct with getter and update functions.
Moved g_main_device and g_num_devices into the new struct.
typedef struct {
    int num_devices;
    int main_device_id;
    size_t total_vram;
    size_t total_free_vram;
    struct cudaDeviceProp device_props[GGML_CUDA_MAX_DEVICES];
    size_t device_vram_free[GGML_CUDA_MAX_DEVICES];
    size_t device_vram_total[GGML_CUDA_MAX_DEVICES];
} GPUStatus;
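The update/getter pair around this struct is declared in ggml-cuda.h further down, but the ggml-cuda.cu implementation is not rendered in this diff. A hedged sketch of how the struct could be populated via the CUDA runtime (error checking omitted; "-1 refreshes all devices" is inferred from the libfalcon.cpp call site):

#include <cuda_runtime.h>
#include "ggml-cuda.h"

static GPUStatus g_gpu_status; // assumed file-scope instance in ggml-cuda.cu

void ggml_cuda_update_gpu_status(int device_id) {
    cudaGetDeviceCount(&g_gpu_status.num_devices);
    for (int id = 0; id < g_gpu_status.num_devices; ++id) {
        if (device_id >= 0 && device_id != id) continue; // -1 refreshes every device
        cudaSetDevice(id);
        cudaGetDeviceProperties(&g_gpu_status.device_props[id], id);
        cudaMemGetInfo(&g_gpu_status.device_vram_free[id],
                       &g_gpu_status.device_vram_total[id]);
    }
    g_gpu_status.total_vram      = 0;
    g_gpu_status.total_free_vram = 0;
    for (int id = 0; id < g_gpu_status.num_devices; ++id) {
        g_gpu_status.total_vram      += g_gpu_status.device_vram_total[id];
        g_gpu_status.total_free_vram += g_gpu_status.device_vram_free[id];
    }
    cudaSetDevice(g_gpu_status.main_device_id); // restore the active device
}

const GPUStatus * ggml_cuda_get_system_gpu_status(void) {
    return &g_gpu_status;
}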

3) Replaced the previous log output with a print function that gives better CUDA information on init
Example:
CUDA Device Summary - 2 devices found
+------------------------------------+------------+-----------+-----------+-----------+-----------+
| Device                             | VRAM Total | VRAM Free | VRAM Used |   Split % | Device ID |
+------------------------------------+------------+-----------+-----------+-----------+-----------+
| NVIDIA GeForce RTX 4090            |   24563 MB |  23006 MB |   1557 MB |      0.0% |  0 (Main) |
+------------------------------------+------------+-----------+-----------+-----------+-----------+
| NVIDIA GeForce RTX 3090            |   24575 MB |  23318 MB |   1257 MB |     67.2% |  1        |
+------------------------------------+------------+-----------+-----------+-----------+-----------+
Total VRAM: 47.99 GB, Free VRAM: 45.24 GB
--------------------
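For reference, a hedged sketch of how one row of that table could be printed; the column widths are eyeballed from the example above and the split-percentage source is assumed, so the real ggml_cuda_print_gpu_status() (in the unrendered ggml-cuda.cu diff) may differ.

#include <stdio.h>
#include "ggml-cuda.h"

#define MB (1024*1024)

static void print_device_row(const GPUStatus * st, const float * tensor_split, int id) {
    const size_t total = st->device_vram_total[id];
    const size_t free  = st->device_vram_free[id];
    printf("| %-34s | %7zu MB | %6zu MB | %6zu MB | %8.1f%% | %2d %-6s |\n",
           st->device_props[id].name,
           total / MB, free / MB, (total - free) / MB,
           100.0 * tensor_split[id],
           id, id == st->main_device_id ? "(Main)" : "");
}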

4) Cleaned up libfalcon.cpp to use the new information instead of making CUDA requests and device changes directly

TODO: The VRAM reported by CUDA is not the same as that reported by CPU-Z, which in turn differs from HWiNFO64 (the latter is far off).

* CUDA:
1) Tensor split now allows disabling individual devices (usage proportion of 0)
- a disabled device currently still occupies minimal VRAM, and if it is the main device it is still used for non-split tensors
2) Corrected (reduced) the total free VRAM calculation when a device proportion is disabled (the main device is not taken into account)
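A sketch of the corrected sum under one reading of item 2: devices with a split proportion of 0 are skipped, while the main device keeps counting because it still holds non-split tensors. This is illustrative, not the rendered ggml-cuda.cu code.

#include "ggml-cuda.h"

static size_t usable_free_vram(const GPUStatus * st, const float * tensor_split) {
    size_t total_free = 0;
    for (int id = 0; id < st->num_devices; ++id) {
        if (tensor_split[id] == 0.0f && id != st->main_device_id) {
            continue; // disabled device: its free VRAM must not be counted
        }
        total_free += st->device_vram_free[id];
    }
    return total_free;
}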

* vram_total bugfix

* status table alignment

* bugfixes

* Added perf counters to the Makefile as well

---------
cmp-nct authored Jun 22, 2023
1 parent 4195227 commit 7e38f5c
Showing 8 changed files with 310 additions and 118 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -2,8 +2,9 @@ cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
# If CUDA toolkit is not found using msvc compiler switch to Community Edition (same compiler, just other kit..)
project("ggllm.cpp" C CXX)
add_definitions(-DGGML_PERF=1) # use "--debug-timings 1-3" to enable timing output
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include")
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64")
# include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include")
# include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64")

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
8 changes: 4 additions & 4 deletions Makefile
@@ -50,12 +50,12 @@ CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS =

ifdef LLAMA_DEBUG
CFLAGS += -O0 -g
CXXFLAGS += -O0 -g
CFLAGS += -O0 -g -DGGML_PERF=1
CXXFLAGS += -O0 -g -DGGML_PERF=1
LDFLAGS += -g
else
CFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
CFLAGS += -DNDEBUG -DGGML_PERF=1
CXXFLAGS += -DNDEBUG -DGGML_PERF=1
endif

# warnings
30 changes: 27 additions & 3 deletions examples/falcon_common.cpp
@@ -305,8 +305,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
ggml_cuda_set_main_device(params.main_gpu);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
fprintf(stderr, "warning: falcon.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
@@ -321,16 +322,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

bool all_zero = true;
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
if (params.tensor_split[i] != 0.0f) {
all_zero = false;
}
} else {
params.tensor_split[i] = 0.0f;
}
}
if (all_zero) {
fprintf(stderr, "Error: all tensor split proportions are zero\n");
exit(1);
}
ggml_cuda_set_tensor_split_prepare(params.tensor_split,split_arg.size());
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
fprintf(stderr, "warning: falcon.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
@@ -416,6 +425,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
process_escapes(params.prompt);
}


bool all_zero = true;
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (params.tensor_split[i] != 0.0f) {
all_zero = false;
break;
}
}
if (!all_zero) {
if (params.tensor_split[params.main_gpu] == 0.0f) {
fprintf(stderr, "Error: main GPU cannot have a tensor split proportion of zero.\n");
exit(1);
}
}

return true;
}

256 changes: 184 additions & 72 deletions ggml-cuda.cu

Large diffs are not rendered by default.
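As one example of what this unrendered diff adds, here is a hedged sketch of the tensor split "prepare" wrapper declared in ggml-cuda.h below: the -ts values are recorded before CUDA is initialized and applied once device information is available. The variable names and the deferred-apply hook are assumptions.

static float g_pending_tensor_split[GGML_CUDA_MAX_DEVICES];
static int   g_pending_tensor_split_n = 0;

void ggml_cuda_set_tensor_split_prepare(const float * tensor_split, int num_devices) {
    for (int i = 0; i < num_devices && i < GGML_CUDA_MAX_DEVICES; ++i) {
        g_pending_tensor_split[i] = tensor_split[i];
    }
    g_pending_tensor_split_n = num_devices;
}

// later, e.g. inside ggml_init_cublas(), once devices are known (assumed hook):
// if (g_pending_tensor_split_n > 0) {
//     ggml_cuda_set_tensor_split(g_pending_tensor_split);
// }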

16 changes: 15 additions & 1 deletion ggml-cuda.h
@@ -1,7 +1,8 @@
#pragma once

#include <cuda_runtime.h>
#include "ggml.h"


#ifdef __cplusplus
extern "C" {
#endif
@@ -11,8 +12,21 @@ extern "C" {
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
};
typedef struct {
int num_devices;
int main_device_id;
size_t total_vram;
size_t total_free_vram;
struct cudaDeviceProp device_props[GGML_CUDA_MAX_DEVICES];
size_t device_vram_free[GGML_CUDA_MAX_DEVICES];
size_t device_vram_total[GGML_CUDA_MAX_DEVICES];
} GPUStatus;
const GPUStatus* ggml_cuda_get_system_gpu_status();

void ggml_init_cublas(void);
void ggml_cuda_update_gpu_status(int device_id);
void ggml_cuda_print_gpu_status(const GPUStatus *status);
void ggml_cuda_set_tensor_split_prepare(const float * tensor_split, int num_devices);
void ggml_cuda_set_tensor_split(const float * tensor_split);

void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
2 changes: 1 addition & 1 deletion ggml.c
@@ -14615,7 +14615,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}
if (tensor->src0->backend != GGML_BACKEND_CPU)
printf("%s src0->backend != GGML_BACKEND_CPU (%s; %s)\n", ggml_op_name (tensor->op),tensor->name,tensor->src0->src0->name);
printf("%s src0->backend != GGML_BACKEND_CPU (%s, %s)\n", ggml_op_name(tensor->op),tensor->name, tensor->src0->name);
GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
107 changes: 74 additions & 33 deletions libfalcon.cpp
@@ -768,7 +768,7 @@ struct llama_model_loader {
}
}

void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
void load_all_data(falcon_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t data_size = 0;
size_t prefetch_size = 0;
size_t lock_size = 0;
@@ -789,7 +789,15 @@ struct llama_model_loader {
size_t done_size = 0;
for (falcon_load_tensor & lt : tensors_map.tensors) {
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
char *status="";
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU)
status = "Loading tensor (CPU)";
else if (lt.ggml_tensor->backend == GGML_BACKEND_GPU)
status = "Loading tensor (GPU-Main)";
else if (lt.ggml_tensor->backend == GGML_BACKEND_GPU_SPLIT)
status = "Loading tensor (GPU-Split)";

progress_callback((float) done_size / data_size, progress_callback_user_data,status);
}
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
lt.data = (uint8_t *) lt.ggml_tensor->data;
@@ -1084,7 +1092,7 @@ static void falcon_model_load_internal(
bool use_mmap,
bool use_mlock,
bool vocab_only,
llama_progress_callback progress_callback,
falcon_progress_callback progress_callback,
void * progress_callback_user_data) {

lctx.t_start_us = ggml_time_us();
@@ -1186,8 +1194,8 @@

(void) main_gpu;
#if defined(GGML_USE_CUBLAS)
if (n_gpu_layers > 0)
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
if (n_gpu_layers > 0)
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
@@ -1200,24 +1208,27 @@ if (n_gpu_layers > 0)
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
#endif

size_t vram_total=0;
size_t vram_free=0;
const size_t vram_reserved=512*MB; // that amount of VRAM is to stay free on GPU (headroom for other processes - may be reduced in pure server environments)
size_t vram_overhead = 1250*MB; // this amount of vram is estimated for non weight storage buffers on VRAM (no big difference between 7B and 40B, needs to increase when more work is offloaded in the future)

const size_t vram_reserved=512*MB; // that amount of VRAM is to stay free on GPU (needs to become a user parameter)
size_t vram_overhead = 1250*MB; // this amount of vram is estimated for non weight storage buffers on VRAM
size_t vram_free = 0; // for vram simulation below
size_t vram_total = 0; // for vram simulation below
#if defined(GGML_USE_CUBLAS)
const GPUStatus *system_gpu_status = ggml_cuda_get_system_gpu_status();
vram_free = system_gpu_status->total_free_vram;
vram_total = system_gpu_status->total_vram;
// cublas is used in 32 bit mode, temporary cuda storage/conversion buffers are needed for batch ingestion ( could be run in 16 bit mode without performance downgrade and save half the VRAM)
if (model.type == FALCON_40B && n_batch > 1)
{
vram_overhead += (1024 + 288 + 256) * MB;
fprintf(stderr, "%s: INFO: using n_batch (-b) > 1 will require additional VRAM of: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
vram_overhead += (1024 + 288 + 256) * MB; // todo: when can manually create one 1024 buffer manually, saves 500+mb vram
fprintf(stderr, "%s: INFO: using n_batch > 1 will require additional VRAM per device: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
}
if (model.type == FALCON_7B && n_batch > 1)
{
vram_overhead += (315 + 80 + 78) * MB;
fprintf(stderr, "%s: INFO: using n_batch (-b) > 1 will require additional VRAM of: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
vram_overhead += (315 + 80 + 78) * MB; // todo: manually create a 315mb buffer, saves 160mb vram
fprintf(stderr, "%s: INFO: using n_batch > 1 will require additional VRAM per device: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
}
cudaMemGetInfo(&vram_free, &vram_total); // this should go in ggml-cuda.cu but I don't want to make Johannes life harder by modifying that yet
fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (in use: %7.2f MB)\n", __func__, vram_free/MB*1.0, vram_total/MB*1.0, (vram_total-vram_free)/MB*1.0);
fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (in use: %7.2f MB)\n", __func__, system_gpu_status->total_free_vram/MB*1.0, system_gpu_status->total_vram/MB*1.0, (system_gpu_status->total_vram-system_gpu_status->total_free_vram)/MB*1.0);
#endif

// prepare memory for the weights
@@ -1253,9 +1264,13 @@

ggml_backend backend_norm;
ggml_backend backend_output;
// disabled norm/output offloading until further tests, causes silent crash at the moment
if (n_gpu_layers > int(n_layer) && false) { // NOLINT
backend_norm = LLAMA_BACKEND_OFFLOAD;
// output layer offloading is on by default now, it's one of the biggest CPU consumers
bool offload_output = true;
if (n_gpu_layers == 0) offload_output = false;

if (offload_output) { // NOLINT
// backend_norm = LLAMA_BACKEND_OFFLOAD; // this requires REPEAT on GPU (in f7b)
backend_norm = GGML_BACKEND_CPU;
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_norm = GGML_BACKEND_CPU;
Expand All @@ -1279,6 +1294,7 @@ if (n_gpu_layers > 0)
{
vram_weights += ggml_nbytes(model.lm_head);
vram_free -= ggml_nbytes(model.lm_head);
fprintf(stderr, "%s: Offloading Output head tensor (%ld MB)\n", __func__, ggml_nbytes(model.lm_head)/MB);
}

int i_gpu_start = n_layer - n_gpu_layers;
@@ -1370,9 +1386,6 @@

fprintf(stderr, "%s: offloading %d of %d layers to GPU, weights offloaded %7.2f MB\n",
__func__, n_gpu, hparams.n_layer, vram_weights / 1024.0 / 1024.0);
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
}
fprintf(stderr, "%s: estimated VRAM usage: %zu MB\n",
__func__, (vram_weights + vram_scratch + vram_overhead + MB - 1) / MB); // round up
#else
@@ -1385,22 +1398,27 @@
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}

/* moved into common so that it is set from begin on and can be visualized before evaluation starts
(void) tensor_split;
#if defined(GGML_USE_CUBLAS)
{
// optional tensor split by custom parameters if defined
ggml_cuda_set_tensor_split(tensor_split);
}
#endif
*/

ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
progress_callback(1.0f, progress_callback_user_data,"Tensors populated");
}

#if defined(GGML_USE_CUBLAS)
//size_t vram_free_simulated = vram_free;
cudaMemGetInfo(&vram_free, &vram_total); // this should go in ggml-cuda.cu but I don't want to make Johannes life harder by modifying that yet
ggml_cuda_update_gpu_status(-1);
vram_free = system_gpu_status->total_free_vram;
vram_total= system_gpu_status->total_vram;
fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (used: %7.2f MB)\n", __func__, vram_free/MB*1.0, vram_total/MB*1.0, (vram_total-vram_free)/MB*1.0);

#endif
@@ -1426,7 +1444,7 @@ static bool falcon_model_load(
bool use_mmap,
bool use_mlock,
bool vocab_only,
llama_progress_callback progress_callback,
falcon_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
falcon_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
@@ -1521,7 +1539,7 @@ static bool falcon_eval_internal(
offload_func_t offload_func_kqv = llama_nop;

#ifdef GGML_USE_CUBLAS
// todo: use either a flag in model/params or a backend test to determine if norm/output are on GPU
// todo: instead of n_layer use either a flag in model/params or a backend test to determine if norm/output are on GPU
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers;
}
@@ -1535,7 +1553,7 @@ static bool falcon_eval_internal(
offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start && il < i_gpu_last) {
if (il >= i_gpu_start && il <= i_gpu_last) {
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
}
#endif // GGML_USE_CUBLAS
@@ -1831,7 +1849,8 @@ static bool falcon_eval_internal(
if ((first && debug_timings <=2) || debug_timings > 2)
{
first = false;
ggml_graph_print_impl(&gf,true,false,GGML_OP_NONE); // GGML_OP_MUL_MAT
// ggml_graph_print_impl(&gf,true,false,GGML_OP_MUL_MAT); // GGML_OP_MUL_MAT / GGML_OP_NONE
ggml_graph_print_impl(&gf,true,false,GGML_OP_NONE); // GGML_OP_MUL_MAT / GGML_OP_NONE
}
}
// requires GGML_PERF to be defined for actual timing information
@@ -2751,15 +2770,37 @@ struct falcon_context * falcon_init_from_file(

unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
params.progress_callback_user_data = &cur_percentage;
params.progress_callback = [](float progress, void * ctx) {
unsigned * cur_percentage_p = (unsigned *) ctx;
params.progress_callback_user_data = &cur_percentage; // not sure why this is so complicated ? I left it for now
params.progress_callback = [](float progress, void * ctx, char *status) {
unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) {
unsigned * cur_percentage_p = (unsigned *) ctx;
static const int bar_width = 50;
bool completed = false;
if (percentage >= 100) {
completed = true;
if (!strlen(status))
status = "Completed";
}

if (percentage > *cur_percentage_p) {
*cur_percentage_p = percentage;
fprintf(stderr, ".");
fprintf(stderr, "\r["); // using '\r' to overwrite the current line
int progress_position = percentage * bar_width / 100;
for (int i = 0; i < bar_width; ++i) {
if (i < progress_position) {
fprintf(stderr, "=");
} else if (i == progress_position) {
fprintf(stderr, ">");
} else {
fprintf(stderr, "-");
}
}

fprintf(stderr, "] %3u%% %-30s", percentage, status);


fflush(stderr);
if (percentage >= 100) {
if (completed) {
fprintf(stderr, "\n");
}
}
4 changes: 2 additions & 2 deletions libfalcon.h
@@ -69,7 +69,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;

typedef void (*llama_progress_callback)(float progress, void *ctx);
typedef void (*falcon_progress_callback)(float progress, void *ctx, char *status);

struct falcon_context_params {
int n_ctx; // text context
@@ -89,7 +89,7 @@ extern "C" {
bool embedding; // embedding mode only

// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
falcon_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
};
