CUDA status beautify (ggml-org#20)
* CUDA:
1) Changed the tensor split calculation to work on free VRAM instead of total VRAM (the main GPU often has a few GB consumed by desktop work)
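A minimal sketch of the idea (not the rendered ggml-cuda.cu change, which is not shown below): default split proportions are derived from each device's free VRAM rather than its total VRAM. The function and variable names here are illustrative only.

#include <stddef.h> // size_t

// Sketch: derive default tensor split proportions from free VRAM per device.
// device_vram_free[] is assumed to be filled by a prior cudaMemGetInfo() pass.
static void tensor_split_from_free_vram(const size_t * device_vram_free,
                                        int num_devices,
                                        float * tensor_split /* out */) {
    size_t total_free = 0;
    for (int i = 0; i < num_devices; ++i) {
        total_free += device_vram_free[i];
    }
    for (int i = 0; i < num_devices; ++i) {
        // each device gets a share proportional to what it can actually hold
        tensor_split[i] = total_free ? (float) device_vram_free[i] / (float) total_free : 0.0f;
    }
}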

Changed the loading progress callback (a minimal usage sketch of the new callback follows the Todo below):
1) Added a numerical percentage to it
2) Added a status message that displays what it is doing right now ("Loading tensor (CPU)")
3) Normalized the length of the bar and redesigned it from a row of dots into an actual progress bar
Example:
[============================================>-----]  88%  Loading tensor (GPU-Split)
Todo:
The progress bar could be expanded to 3-4 lines so that it displays important flags and changes alongside the progress.
This would replace the list of log entries printed before the progress starts counting; earlier log lines would then only be printed for errors and warnings.
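As a usage-side sketch, the new callback signature (float progress, void * ctx, char * status) from libfalcon.h can be hooked up like this. The falcon_context_default_params() helper name is an assumption; only the progress_callback / progress_callback_user_data fields are taken from this diff.

#include <stdio.h>

static void my_progress(float progress, void * ctx, char * status) {
    (void) ctx;
    fprintf(stderr, "\rloaded %3u%%  %-30s", (unsigned) (100 * progress), status);
    if (progress >= 1.0f) fprintf(stderr, "\n");
    fflush(stderr);
}

/* when setting up the context (default-params helper name assumed):
   struct falcon_context_params params = falcon_context_default_params();
   params.progress_callback           = my_progress;
   params.progress_callback_user_data = NULL;                              */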

* 1) Added a tensor split prepare wrapper; this allows the tensor split given by -ts to be applied immediately

2) Created a CUDA status struct with getter and update functions.
Moved g_main_device and g_num_devices into the new struct.
typedef struct {
    int num_devices;
    int main_device_id;
    size_t total_vram;
    size_t total_free_vram;
    struct cudaDeviceProp device_props[GGML_CUDA_MAX_DEVICES];
    size_t device_vram_free[GGML_CUDA_MAX_DEVICES];
    size_t device_vram_total[GGML_CUDA_MAX_DEVICES];
} GPUStatus;
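The update/getter pair around this struct is declared in ggml-cuda.h further down, but the ggml-cuda.cu implementation is not rendered in this diff. A hedged sketch of how the struct could be populated via the CUDA runtime (error checking omitted; "-1 refreshes all devices" is inferred from the libfalcon.cpp call site):

#include <cuda_runtime.h>
#include "ggml-cuda.h"

static GPUStatus g_gpu_status; // assumed file-scope instance in ggml-cuda.cu

void ggml_cuda_update_gpu_status(int device_id) {
    cudaGetDeviceCount(&g_gpu_status.num_devices);
    for (int id = 0; id < g_gpu_status.num_devices; ++id) {
        if (device_id >= 0 && device_id != id) continue; // -1 refreshes every device
        cudaSetDevice(id);
        cudaGetDeviceProperties(&g_gpu_status.device_props[id], id);
        cudaMemGetInfo(&g_gpu_status.device_vram_free[id],
                       &g_gpu_status.device_vram_total[id]);
    }
    g_gpu_status.total_vram      = 0;
    g_gpu_status.total_free_vram = 0;
    for (int id = 0; id < g_gpu_status.num_devices; ++id) {
        g_gpu_status.total_vram      += g_gpu_status.device_vram_total[id];
        g_gpu_status.total_free_vram += g_gpu_status.device_vram_free[id];
    }
    cudaSetDevice(g_gpu_status.main_device_id); // restore the active device
}

const GPUStatus * ggml_cuda_get_system_gpu_status(void) {
    return &g_gpu_status;
}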

3) Replaced the previous log output with a print function that gives better CUDA information on init
Example:
CUDA Device Summary - 2 devices found
+------------------------------------+------------+-----------+-----------+-----------+-----------+
| Device                             | VRAM Total | VRAM Free | VRAM Used |   Split % | Device ID |
+------------------------------------+------------+-----------+-----------+-----------+-----------+
| NVIDIA GeForce RTX 4090            |   24563 MB |  23006 MB |   1557 MB |      0.0% |  0 (Main) |
+------------------------------------+------------+-----------+-----------+-----------+-----------+
| NVIDIA GeForce RTX 3090            |   24575 MB |  23318 MB |   1257 MB |     67.2% |  1        |
+------------------------------------+------------+-----------+-----------+-----------+-----------+
Total VRAM: 47.99 GB, Free VRAM: 45.24 GB
--------------------
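For reference, a hedged sketch of how one row of that table could be printed; the column widths are eyeballed from the example above and the split-percentage source is assumed, so the real ggml_cuda_print_gpu_status() (in the unrendered ggml-cuda.cu diff) may differ.

#include <stdio.h>
#include "ggml-cuda.h"

#define MB (1024*1024)

static void print_device_row(const GPUStatus * st, const float * tensor_split, int id) {
    const size_t total = st->device_vram_total[id];
    const size_t free  = st->device_vram_free[id];
    printf("| %-34s | %7zu MB | %6zu MB | %6zu MB | %8.1f%% | %2d %-6s |\n",
           st->device_props[id].name,
           total / MB, free / MB, (total - free) / MB,
           100.0 * tensor_split[id],
           id, id == st->main_device_id ? "(Main)" : "");
}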

4) Cleaned up libfalcon.cpp to use the new information instead of making CUDA requests and device changes directly

TODO: The VRAM reported by CUDA is not the same as that reported by CPU-Z, which in turn differs from HWiNFO64 (the latter is far off).

* CUDA:
1) Tensor split now allows disabling individual devices (usage proportion of 0)
- a disabled device currently still occupies minimal VRAM, and if it is the main device it is still used for non-split tensors
2) Corrected (reduced) the total free VRAM calculation when a device proportion is disabled (the main device is not taken into account)
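A sketch of the corrected sum under one reading of item 2: devices with a split proportion of 0 are skipped, while the main device keeps counting because it still holds non-split tensors. This is illustrative, not the rendered ggml-cuda.cu code.

#include "ggml-cuda.h"

static size_t usable_free_vram(const GPUStatus * st, const float * tensor_split) {
    size_t total_free = 0;
    for (int id = 0; id < st->num_devices; ++id) {
        if (tensor_split[id] == 0.0f && id != st->main_device_id) {
            continue; // disabled device: its free VRAM must not be counted
        }
        total_free += st->device_vram_free[id];
    }
    return total_free;
}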

* vram_total bugfix

* status table alignment

* bugfixes

* Added perf counters to the Makefile as well

---------
cmp-nct authored Jun 22, 2023
1 parent 4195227 commit 7e38f5c
Showing 8 changed files with 310 additions and 118 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -2,8 +2,9 @@ cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
# If CUDA toolkit is not found using msvc compiler switch to Community Edition (same compiler, just other kit..)
project("ggllm.cpp" C CXX)
add_definitions(-DGGML_PERF=1) # use "--debug-timings 1-3" to enable timing output
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include")
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64")
# include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include")
# include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64")

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
8 changes: 4 additions & 4 deletions Makefile
@@ -50,12 +50,12 @@ CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS =

ifdef LLAMA_DEBUG
CFLAGS += -O0 -g
CXXFLAGS += -O0 -g
CFLAGS += -O0 -g -DGGML_PERF=1
CXXFLAGS += -O0 -g -DGGML_PERF=1
LDFLAGS += -g
else
CFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
CFLAGS += -DNDEBUG -DGGML_PERF=1
CXXFLAGS += -DNDEBUG -DGGML_PERF=1
endif

# warnings
30 changes: 27 additions & 3 deletions examples/falcon_common.cpp
@@ -305,8 +305,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
ggml_cuda_set_main_device(params.main_gpu);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
fprintf(stderr, "warning: falcon.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
@@ -321,16 +322,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

bool all_zero = true;
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
if (params.tensor_split[i] != 0.0f) {
all_zero = false;
}
} else {
params.tensor_split[i] = 0.0f;
}
}
if (all_zero) {
fprintf(stderr, "Error: all tensor split proportions are zero\n");
exit(1);
}
ggml_cuda_set_tensor_split_prepare(params.tensor_split,split_arg.size());
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
fprintf(stderr, "warning: falcon.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
@@ -416,6 +425,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
process_escapes(params.prompt);
}


bool all_zero = true;
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (params.tensor_split[i] != 0.0f) {
all_zero = false;
break;
}
}
if (!all_zero) {
if (params.tensor_split[params.main_gpu] == 0.0f) {
fprintf(stderr, "Error: main GPU cannot have a tensor split proportion of zero.\n");
exit(1);
}
}

return true;
}

256 changes: 184 additions & 72 deletions ggml-cuda.cu

Large diffs are not rendered by default.
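As one example of what this unrendered diff adds, here is a hedged sketch of the tensor split "prepare" wrapper declared in ggml-cuda.h below: the -ts values are recorded before CUDA is initialized and applied once device information is available. The variable names and the deferred-apply hook are assumptions.

static float g_pending_tensor_split[GGML_CUDA_MAX_DEVICES];
static int   g_pending_tensor_split_n = 0;

void ggml_cuda_set_tensor_split_prepare(const float * tensor_split, int num_devices) {
    for (int i = 0; i < num_devices && i < GGML_CUDA_MAX_DEVICES; ++i) {
        g_pending_tensor_split[i] = tensor_split[i];
    }
    g_pending_tensor_split_n = num_devices;
}

// later, e.g. inside ggml_init_cublas(), once devices are known (assumed hook):
// if (g_pending_tensor_split_n > 0) {
//     ggml_cuda_set_tensor_split(g_pending_tensor_split);
// }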

16 changes: 15 additions & 1 deletion ggml-cuda.h
@@ -1,7 +1,8 @@
#pragma once

#include <cuda_runtime.h>
#include "ggml.h"


#ifdef __cplusplus
extern "C" {
#endif
@@ -11,8 +12,21 @@ extern "C" {
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
};
typedef struct {
int num_devices;
int main_device_id;
size_t total_vram;
size_t total_free_vram;
struct cudaDeviceProp device_props[GGML_CUDA_MAX_DEVICES];
size_t device_vram_free[GGML_CUDA_MAX_DEVICES];
size_t device_vram_total[GGML_CUDA_MAX_DEVICES];
} GPUStatus;
const GPUStatus* ggml_cuda_get_system_gpu_status();

void ggml_init_cublas(void);
void ggml_cuda_update_gpu_status(int device_id);
void ggml_cuda_print_gpu_status(const GPUStatus *status);
void ggml_cuda_set_tensor_split_prepare(const float * tensor_split, int num_devices);
void ggml_cuda_set_tensor_split(const float * tensor_split);

void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
2 changes: 1 addition & 1 deletion ggml.c
@@ -14615,7 +14615,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}
if (tensor->src0->backend != GGML_BACKEND_CPU)
printf("%s src0->backend != GGML_BACKEND_CPU (%s; %s)\n", ggml_op_name (tensor->op),tensor->name,tensor->src0->src0->name);
printf("%s src0->backend != GGML_BACKEND_CPU (%s, %s)\n", ggml_op_name(tensor->op),tensor->name, tensor->src0->name);
GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
107 changes: 74 additions & 33 deletions libfalcon.cpp
@@ -768,7 +768,7 @@ struct llama_model_loader {
}
}

void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
void load_all_data(falcon_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t data_size = 0;
size_t prefetch_size = 0;
size_t lock_size = 0;
@@ -789,7 +789,15 @@ struct llama_model_loader {
size_t done_size = 0;
for (falcon_load_tensor & lt : tensors_map.tensors) {
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
char *status="";
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU)
status = "Loading tensor (CPU)";
else if (lt.ggml_tensor->backend == GGML_BACKEND_GPU)
status = "Loading tensor (GPU-Main)";
else if (lt.ggml_tensor->backend == GGML_BACKEND_GPU_SPLIT)
status = "Loading tensor (GPU-Split)";

progress_callback((float) done_size / data_size, progress_callback_user_data,status);
}
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
lt.data = (uint8_t *) lt.ggml_tensor->data;
@@ -1084,7 +1092,7 @@ static void falcon_model_load_internal(
bool use_mmap,
bool use_mlock,
bool vocab_only,
llama_progress_callback progress_callback,
falcon_progress_callback progress_callback,
void * progress_callback_user_data) {

lctx.t_start_us = ggml_time_us();
@@ -1186,8 +1194,8 @@

(void) main_gpu;
#if defined(GGML_USE_CUBLAS)
if (n_gpu_layers > 0)
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
if (n_gpu_layers > 0)
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
@@ -1200,24 +1208,27 @@ if (n_gpu_layers > 0)
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
#endif

size_t vram_total=0;
size_t vram_free=0;
const size_t vram_reserved=512*MB; // that amount of VRAM is to stay free on GPU (headroom for other processes - may be reduced in pure server environments)
size_t vram_overhead = 1250*MB; // this amount of vram is estimated for non weight storage buffers on VRAM (no big difference between 7B and 40B, needs to increase when more work is offloaded in the future)

const size_t vram_reserved=512*MB; // that amount of VRAM is to stay free on GPU (needs to become a user parameter)
size_t vram_overhead = 1250*MB; // this amount of vram is estimated for non weight storage buffers on VRAM
size_t vram_free = 0; // for vram simulation below
size_t vram_total = 0; // for vram simulation below
#if defined(GGML_USE_CUBLAS)
const GPUStatus *system_gpu_status = ggml_cuda_get_system_gpu_status();
vram_free = system_gpu_status->total_free_vram;
vram_total = system_gpu_status->total_vram;
// cublas is used in 32 bit mode, temporary cuda storage/conversion buffers are needed for batch ingestion ( could be run in 16 bit mode without performance downgrade and save half the VRAM)
if (model.type == FALCON_40B && n_batch > 1)
{
vram_overhead += (1024 + 288 + 256) * MB;
fprintf(stderr, "%s: INFO: using n_batch (-b) > 1 will require additional VRAM of: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
vram_overhead += (1024 + 288 + 256) * MB; // todo: when can manually create one 1024 buffer manually, saves 500+mb vram
fprintf(stderr, "%s: INFO: using n_batch > 1 will require additional VRAM per device: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
}
if (model.type == FALCON_7B && n_batch > 1)
{
vram_overhead += (315 + 80 + 78) * MB;
fprintf(stderr, "%s: INFO: using n_batch (-b) > 1 will require additional VRAM of: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
vram_overhead += (315 + 80 + 78) * MB; // todo: manually create a 315mb buffer, saves 160mb vram
fprintf(stderr, "%s: INFO: using n_batch > 1 will require additional VRAM per device: %7.2f MB\n", __func__, vram_overhead/MB*1.0);
}
cudaMemGetInfo(&vram_free, &vram_total); // this should go in ggml-cuda.cu but I don't want to make Johannes life harder by modifying that yet
fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (in use: %7.2f MB)\n", __func__, vram_free/MB*1.0, vram_total/MB*1.0, (vram_total-vram_free)/MB*1.0);
fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (in use: %7.2f MB)\n", __func__, system_gpu_status->total_free_vram/MB*1.0, system_gpu_status->total_vram/MB*1.0, (system_gpu_status->total_vram-system_gpu_status->total_free_vram)/MB*1.0);
#endif

// prepare memory for the weights
@@ -1253,9 +1264,13 @@

ggml_backend backend_norm;
ggml_backend backend_output;
// disabled norm/output offloading until further tests, causes silent crash at the moment
if (n_gpu_layers > int(n_layer) && false) { // NOLINT
backend_norm = LLAMA_BACKEND_OFFLOAD;
// output layer offloading is on by default now, it's one of the biggest CPU consumers
bool offload_output = true;
if (n_gpu_layers == 0) offload_output = false;

if (offload_output) { // NOLINT
// backend_norm = LLAMA_BACKEND_OFFLOAD; // this requires REPEAT on GPU (in f7b)
backend_norm = GGML_BACKEND_CPU;
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_norm = GGML_BACKEND_CPU;
Expand All @@ -1279,6 +1294,7 @@ if (n_gpu_layers > 0)
{
vram_weights += ggml_nbytes(model.lm_head);
vram_free -= ggml_nbytes(model.lm_head);
fprintf(stderr, "%s: Offloading Output head tensor (%ld MB)\n", __func__, ggml_nbytes(model.lm_head)/MB);
}

int i_gpu_start = n_layer - n_gpu_layers;
@@ -1370,9 +1386,6 @@

fprintf(stderr, "%s: offloading %d of %d layers to GPU, weights offloaded %7.2f MB\n",
__func__, n_gpu, hparams.n_layer, vram_weights / 1024.0 / 1024.0);
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
}
fprintf(stderr, "%s: estimated VRAM usage: %zu MB\n",
__func__, (vram_weights + vram_scratch + vram_overhead + MB - 1) / MB); // round up
#else
@@ -1385,22 +1398,27 @@
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}

/* moved into common so that it is set from begin on and can be visualized before evaluation starts
(void) tensor_split;
#if defined(GGML_USE_CUBLAS)
{
// optional tensor split by custom parameters if defined
ggml_cuda_set_tensor_split(tensor_split);
}
#endif
*/

ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
progress_callback(1.0f, progress_callback_user_data,"Tensors populated");
}

#if defined(GGML_USE_CUBLAS)
//size_t vram_free_simulated = vram_free;
cudaMemGetInfo(&vram_free, &vram_total); // this should go in ggml-cuda.cu but I don't want to make Johannes life harder by modifying that yet
ggml_cuda_update_gpu_status(-1);
vram_free = system_gpu_status->total_free_vram;
vram_total= system_gpu_status->total_vram;
fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (used: %7.2f MB)\n", __func__, vram_free/MB*1.0, vram_total/MB*1.0, (vram_total-vram_free)/MB*1.0);

#endif
@@ -1426,7 +1444,7 @@ static bool falcon_model_load(
bool use_mmap,
bool use_mlock,
bool vocab_only,
llama_progress_callback progress_callback,
falcon_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
falcon_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
@@ -1521,7 +1539,7 @@ static bool falcon_eval_internal(
offload_func_t offload_func_kqv = llama_nop;

#ifdef GGML_USE_CUBLAS
// todo: use either a flag in model/params or a backend test to determine if norm/output are on GPU
// todo: instead of n_layer use either a flag in model/params or a backend test to determine if norm/output are on GPU
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers;
}
@@ -1535,7 +1553,7 @@ static bool falcon_eval_internal(
offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start && il < i_gpu_last) {
if (il >= i_gpu_start && il <= i_gpu_last) {
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
}
#endif // GGML_USE_CUBLAS
@@ -1831,7 +1849,8 @@ static bool falcon_eval_internal(
if ((first && debug_timings <=2) || debug_timings > 2)
{
first = false;
ggml_graph_print_impl(&gf,true,false,GGML_OP_NONE); // GGML_OP_MUL_MAT
// ggml_graph_print_impl(&gf,true,false,GGML_OP_MUL_MAT); // GGML_OP_MUL_MAT / GGML_OP_NONE
ggml_graph_print_impl(&gf,true,false,GGML_OP_NONE); // GGML_OP_MUL_MAT / GGML_OP_NONE
}
}
// requires GGML_PERF to be defined for actual timing information
@@ -2751,15 +2770,37 @@ struct falcon_context * falcon_init_from_file(

unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
params.progress_callback_user_data = &cur_percentage;
params.progress_callback = [](float progress, void * ctx) {
unsigned * cur_percentage_p = (unsigned *) ctx;
params.progress_callback_user_data = &cur_percentage; // not sure why this is so complicated ? I left it for now
params.progress_callback = [](float progress, void * ctx, char *status) {
unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) {
unsigned * cur_percentage_p = (unsigned *) ctx;
static const int bar_width = 50;
bool completed = false;
if (percentage >= 100) {
completed = true;
if (!strlen(status))
status = "Completed";
}

if (percentage > *cur_percentage_p) {
*cur_percentage_p = percentage;
fprintf(stderr, ".");
fprintf(stderr, "\r["); // using '\r' to overwrite the current line
int progress_position = percentage * bar_width / 100;
for (int i = 0; i < bar_width; ++i) {
if (i < progress_position) {
fprintf(stderr, "=");
} else if (i == progress_position) {
fprintf(stderr, ">");
} else {
fprintf(stderr, "-");
}
}

fprintf(stderr, "] %3u%% %-30s", percentage, status);


fflush(stderr);
if (percentage >= 100) {
if (completed) {
fprintf(stderr, "\n");
}
}
4 changes: 2 additions & 2 deletions libfalcon.h
@@ -69,7 +69,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;

typedef void (*llama_progress_callback)(float progress, void *ctx);
typedef void (*falcon_progress_callback)(float progress, void *ctx, char *status);

struct falcon_context_params {
int n_ctx; // text context
@@ -89,7 +89,7 @@ extern "C" {
bool embedding; // embedding mode only

// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
falcon_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
};
