From 6e2c28b17850964e5c07d5f418211722a9b09212 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Thu, 4 Jul 2024 15:50:31 +0800
Subject: [PATCH] refine vllm instruction (#272)

* refine vllm instruction

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 comps/llms/text-generation/vllm/README.md     | 86 ++++++++++++++-----
 .../vllm/launch_vllm_service.sh               |  8 +-
 2 files changed, 67 insertions(+), 27 deletions(-)

diff --git a/comps/llms/text-generation/vllm/README.md b/comps/llms/text-generation/vllm/README.md
index 6f202c2688..44ac1ae676 100644
--- a/comps/llms/text-generation/vllm/README.md
+++ b/comps/llms/text-generation/vllm/README.md
@@ -2,26 +2,81 @@
 [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving. It delivers state-of-the-art serving throughput with advanced features such as PagedAttention and continuous batching. Besides GPUs, vLLM already supports [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products). This guide provides an example of how to launch a vLLM serving endpoint on CPU and Gaudi accelerators.
 
-## Getting Started
+## vLLM on CPU
 
-### Launch vLLM Service
+First, let's enable vLLM on CPU.
 
-#### Launch a local server instance:
+### Build docker
 
 ```bash
-bash ./serving/vllm/launch_vllm_service.sh
+bash ./build_docker_vllm.sh
 ```
 
-The `./serving/vllm/launch_vllm_service.sh` accepts one parameter `hw_mode` to specify the hardware mode of the service, with the default being `cpu`, and the optional selection can be `hpu`.
+The `build_docker_vllm.sh` script accepts one parameter, `hw_mode`, which specifies the hardware mode of the service; the default is `cpu`, and the optional selection is `hpu`.
 
-For gated models such as `LLAMA-2`, you will have to pass -e HF_TOKEN=\<token\> to the docker run command above with a valid Hugging Face Hub read token.
+### Launch vLLM service
 
-Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get the access token and export `HF_TOKEN` environment with the token.
+```bash
+bash ./launch_vllm_service.sh
+```
+
+The `launch_vllm_service.sh` script accepts four parameters:
+
+- port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8008.
+- model_name: The model name used for the LLM, with the default set to `meta-llama/Meta-Llama-3-8B-Instruct`.
+- hw_mode: The hardware mode used for the LLM, with the default set to `cpu`; the optional selection is `hpu`.
+- parallel_number: The number of parallel nodes for `hpu` mode, with the default set to 1.
+
+If you want to customize the port or model name, you can run:
+
+```bash
+bash ./launch_vllm_service.sh ${port_number} ${model_name}
+```
+
+For gated models such as `LLAMA-2`, you will have to pass the `HUGGINGFACEHUB_API_TOKEN` environment variable. Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get an access token, then export `HUGGINGFACEHUB_API_TOKEN` with it:
+
+```bash
+export HUGGINGFACEHUB_API_TOKEN=<token>
+```
+
+## vLLM on Gaudi
+
+Next, let's enable vLLM on Gaudi.
+
+### Build docker
 
 ```bash
-export HF_TOKEN=<token>
+bash ./build_docker_vllm.sh hpu
 ```
 
+This sets `hw_mode` to `hpu`.
+
+### Launch vLLM service on a single node
+
+For small models, a single node is enough.
+
+```bash
+bash ./launch_vllm_service.sh ${port_number} ${model_name} hpu 1
+```
+
+This sets `hw_mode` to `hpu` and `parallel_number` to 1.
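+
+Before moving on, you can optionally check that the service container came up. This is a minimal sketch rather than part of the launch script; it assumes the Docker CLI is available on the host and uses the container name `vllm-service`, which `launch_vllm_service.sh` assigns via `--name`.
+
+```bash
+# List the running container started by launch_vllm_service.sh
+docker ps --filter "name=vllm-service"
+
+# Follow the server logs until the OpenAI-compatible endpoint reports it is ready
+docker logs -f vllm-service
+```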
+
+### Launch vLLM service on multiple nodes
+
+For a large model such as `meta-llama/Meta-Llama-3-70b`, we need to launch the service on multiple nodes.
+
+```bash
+bash ./launch_vllm_service.sh ${port_number} ${model_name} hpu ${parallel_number}
+```
+
+For example, to run `meta-llama/Meta-Llama-3-70b` with 8 cards, we can use the following command:
+
+```bash
+bash ./launch_vllm_service.sh 8008 meta-llama/Meta-Llama-3-70b hpu 8
+```
+
+## Query the service
+
 You can then send a request like the one below to check the service status:
 
 ```bash
@@ -34,18 +89,3 @@ curl http://127.0.0.1:8008/v1/completions \
   "temperature": 0
 }'
 ```
-
-#### Customize vLLM Service
-
-The `./serving/vllm/launch_vllm_service.sh` script accepts three parameters:
-
-- port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080.
-- model_name: The model name utilized for LLM, with the default set to "Intel/neural-chat-7b-v3-3".
-- hw_mode: The hardware mode utilized for LLM, with the default set to "cpu", and the optional selection can be "hpu"
-
-You have the flexibility to customize two parameters according to your specific needs. Additionally, you can set the vLLM endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`:
-
-```bash
-export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8008"
-export LLM_MODEL= # example: export LLM_MODEL="Intel/neural-chat-7b-v3-3"
-```
diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh
index 3e9dea219c..0c7e3f4206 100644
--- a/comps/llms/text-generation/vllm/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh
@@ -4,8 +4,8 @@
 
 # Set default values
 default_port=8008
+default_model="meta-llama/Meta-Llama-3-8B-Instruct"
 default_hw_mode="cpu"
-default_model=${LLM_MODEL_ID}
 default_parallel_number=1
 
 # Assign arguments to variables
@@ -18,7 +18,7 @@ parallel_number=${4:-$default_parallel_number}
 if [ "$#" -lt 0 ] || [ "$#" -gt 4 ]; then
     echo "Usage: $0 [port_number] [model_name] [hw_mode] [parallel_number]"
-    echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080."
-    echo "model_name: The model name utilized for LLM, with the default set to 'Intel/neural-chat-7b-v3-3'."
+    echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8008."
+    echo "model_name: The model name utilized for LLM, with the default set to 'meta-llama/Meta-Llama-3-8B-Instruct'."
echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'" echo "parallel_number: parallel nodes number for 'hpu' mode" exit 1 @@ -29,7 +29,7 @@ volume=$PWD/data # Build the Docker run command based on hardware mode if [ "$hw_mode" = "hpu" ]; then - docker run -it --runtime=habana --rm --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80" + docker run -d --rm--runtime=habana --rm --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80" else - docker run -it --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port 80" + docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port 80" fi