From 6e2c28b17850964e5c07d5f418211722a9b09212 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Thu, 4 Jul 2024 15:50:31 +0800
Subject: [PATCH] refine vllm instruction (#272)

* refine vllm instruction

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 comps/llms/text-generation/vllm/README.md     | 86 ++++++++++++++-----
 .../vllm/launch_vllm_service.sh               |  8 +-
 2 files changed, 67 insertions(+), 27 deletions(-)

diff --git a/comps/llms/text-generation/vllm/README.md b/comps/llms/text-generation/vllm/README.md
index 6f202c2688..44ac1ae676 100644
--- a/comps/llms/text-generation/vllm/README.md
+++ b/comps/llms/text-generation/vllm/README.md
@@ -2,26 +2,81 @@
 [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving. It delivers state-of-the-art serving throughput with advanced features such as PagedAttention and continuous batching. Besides GPUs, vLLM already supports [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products). This guide provides an example of how to launch a vLLM serving endpoint on CPU and Gaudi accelerators.
 
-## Getting Started
+## vLLM on CPU
 
-### Launch vLLM Service
+First, let's enable vLLM on CPU.
 
-#### Launch a local server instance:
+### Build docker
 
 ```bash
-bash ./serving/vllm/launch_vllm_service.sh
+bash ./build_docker_vllm.sh
 ```
 
-The `./serving/vllm/launch_vllm_service.sh` accepts one parameter `hw_mode` to specify the hardware mode of the service, with the default being `cpu`, and the optional selection can be `hpu`.
+The `build_docker_vllm.sh` script accepts one parameter, `hw_mode`, which specifies the hardware mode of the service; the default is `cpu`, and the optional selection is `hpu`.
 
-For gated models such as `LLAMA-2`, you will have to pass -e HF_TOKEN=\<token\> to the docker run command above with a valid Hugging Face Hub read token.
+### Launch vLLM service
 
-Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get the access token and export `HF_TOKEN` environment with the token.
+```bash
+bash ./launch_vllm_service.sh
+```
+
+The `launch_vllm_service.sh` script accepts four parameters:
+
+- port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8008.
+- model_name: The model name used for the LLM, with the default set to `meta-llama/Meta-Llama-3-8B-Instruct`.
+- hw_mode: The hardware mode used for the LLM, with the default set to `cpu`; the optional selection is `hpu`.
+- parallel_number: The number of parallel nodes for `hpu` mode, with the default set to 1.
+
+If you want to customize the port or model name, you can run:
+
+```bash
+bash ./launch_vllm_service.sh ${port_number} ${model_name}
+```
+
+For gated models such as `LLAMA-2`, you will have to pass the `HUGGINGFACEHUB_API_TOKEN` environment variable. Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get an access token, then export `HUGGINGFACEHUB_API_TOKEN` with it:
+
+```bash
+export HUGGINGFACEHUB_API_TOKEN=<token>
+```
+
+## vLLM on Gaudi
+
+Next, let's enable vLLM on Gaudi.
+
+### Build docker
 
 ```bash
-export HF_TOKEN=<token>
+bash ./build_docker_vllm.sh hpu
 ```
 
+This sets `hw_mode` to `hpu`.
+
+### Launch vLLM service on a single node
+
+For small models, a single node is enough.
+
+```bash
+bash ./launch_vllm_service.sh ${port_number} ${model_name} hpu 1
+```
+
+This sets `hw_mode` to `hpu` and `parallel_number` to 1.
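+
+Before moving on, you can optionally check that the service container came up. This is a minimal sketch rather than part of the launch script; it assumes the Docker CLI is available on the host and uses the container name `vllm-service`, which `launch_vllm_service.sh` assigns via `--name`.
+
+```bash
+# List the running container started by launch_vllm_service.sh
+docker ps --filter "name=vllm-service"
+
+# Follow the server logs until the OpenAI-compatible endpoint reports it is ready
+docker logs -f vllm-service
+```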
+
+### Launch vLLM service on multiple nodes
+
+For a large model such as `meta-llama/Meta-Llama-3-70b`, we need to launch the service on multiple nodes.
+
+```bash
+bash ./launch_vllm_service.sh ${port_number} ${model_name} hpu ${parallel_number}
+```
+
+For example, to run `meta-llama/Meta-Llama-3-70b` with 8 cards, we can use the following command:
+
+```bash
+bash ./launch_vllm_service.sh 8008 meta-llama/Meta-Llama-3-70b hpu 8
+```
+
+## Query the service
+
 You can then send a request like the one below to check the service status:
 
 ```bash
@@ -34,18 +89,3 @@ curl http://127.0.0.1:8008/v1/completions \
   "temperature": 0
 }'
 ```
-
-#### Customize vLLM Service
-
-The `./serving/vllm/launch_vllm_service.sh` script accepts three parameters:
-
-- port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080.
-- model_name: The model name utilized for LLM, with the default set to "Intel/neural-chat-7b-v3-3".
-- hw_mode: The hardware mode utilized for LLM, with the default set to "cpu", and the optional selection can be "hpu"
-
-You have the flexibility to customize two parameters according to your specific needs. Additionally, you can set the vLLM endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`:
-
-```bash
-export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8008"
-export LLM_MODEL= # example: export LLM_MODEL="Intel/neural-chat-7b-v3-3"
-```
diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh
index 3e9dea219c..0c7e3f4206 100644
--- a/comps/llms/text-generation/vllm/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh
@@ -4,8 +4,8 @@
 
 # Set default values
 default_port=8008
+default_model="meta-llama/Meta-Llama-3-8B-Instruct"
 default_hw_mode="cpu"
-default_model=${LLM_MODEL_ID}
 default_parallel_number=1
 
 # Assign arguments to variables
@@ -18,7 +18,7 @@ parallel_number=${4:-$default_parallel_number}
 if [ "$#" -lt 0 ] || [ "$#" -gt 4 ]; then
     echo "Usage: $0 [port_number] [model_name] [hw_mode] [parallel_number]"
-    echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080."
-    echo "model_name: The model name utilized for LLM, with the default set to 'Intel/neural-chat-7b-v3-3'."
+    echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8008."
+    echo "model_name: The model name utilized for LLM, with the default set to 'meta-llama/Meta-Llama-3-8B-Instruct'."
echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'" echo "parallel_number: parallel nodes number for 'hpu' mode" exit 1 @@ -29,7 +29,7 @@ volume=$PWD/data # Build the Docker run command based on hardware mode if [ "$hw_mode" = "hpu" ]; then - docker run -it --runtime=habana --rm --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80" + docker run -d --rm--runtime=habana --rm --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80" else - docker run -it --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port 80" + docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port 80" fi