Refine readme and docker for vLLM and RayServe of LLM microservice #218

Merged 9 commits on Jun 20, 2024
61 changes: 46 additions & 15 deletions comps/llms/README.md
@@ -130,7 +130,6 @@ export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export vLLM_LLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=${your_langchain_api_key}
export LANGCHAIN_PROJECT="opea/llms"
```

@@ -141,8 +140,8 @@ export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export RAY_Serve_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL=${your_hf_llm_model}
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=${your_langchain_api_key}
export LANGCHAIN_PROJECT="opea/llms"
export CHAT_PROCESSOR="ChatModelLlama"
```

## 2.2 Build Docker Image
@@ -156,16 +155,32 @@ docker build -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build

### 2.2.2 vLLM

Build the vLLM Docker image.

```bash
cd ../../
docker build -t opea/llm-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/Dockerfile .
bash build_docker_vllm.sh
```

Build the microservice Docker image.

```bash
cd ../../../../
docker build -t opea/llm-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/docker/Dockerfile.microservice .
```

### 2.2.3 Ray Serve

Build the Ray Serve Docker image.

```bash
cd ../../
docker build -t opea/llm-ray:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ray_serve/Dockerfile .
bash build_docker_rayserve.sh
```

Build the microservice Docker image.

```bash
cd ../../../../
docker build -t opea/llm-ray:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ray_serve/docker/Dockerfile.microservice .
```

To start a docker container, you have two options:
@@ -185,12 +200,28 @@ docker run -d --name="llm-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$htt

### 2.3.2 vLLM

Start the vLLM serving endpoint.

```bash
bash launch_vllm_service.sh
```
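
Before wiring in the microservice, you can optionally confirm that the endpoint is serving. A minimal check, assuming the endpoint is published on port 8008 (the default used in this README) and exposes vLLM's OpenAI-compatible completions API:

```bash
curl http://${your_ip}:8008/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${LLM_MODEL_ID}"'",
    "prompt": "What is Deep Learning?",
    "max_tokens": 32,
    "temperature": 0
  }'
```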

Start the vLLM microservice.

```bash
docker run -d --name="llm-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_LLM_ENDPOINT=$vLLM_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL_ID=$LLM_MODEL_ID opea/llm-vllm:latest
docker run --name="llm-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=${no_proxy} -e vLLM_LLM_ENDPOINT=$vLLM_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL_ID=$LLM_MODEL_ID opea/llm-vllm:latest
```

### 2.3.3 Ray Serve

Start the Ray Serve endpoint.

```bash
bash launch_ray_service.sh
```
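
A quick sanity check for the Ray Serve endpoint; a sketch, assuming the default port 8008 and the OpenAI-compatible chat API described in the Ray Serve README:

```bash
curl http://${your_ip}:8008/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${LLM_MODEL}"'",
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 32
  }'
```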

Start the Ray Serve microservice.

```bash
docker run -d --name="llm-ray-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e RAY_Serve_ENDPOINT=$RAY_Serve_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL=$LLM_MODEL opea/llm-ray:latest
```
@@ -250,11 +281,11 @@ curl http://${your_ip}:9000/v1/chat/completions \

## 4. Validated Model

| Model | TGI-Gaudi | vLLM-CPU | Ray |
| ------------------------- | --------- | -------- | --- |
| Intel/neural-chat-7b-v3-3 | ✓ | ✓ | ✓ |
| Llama-2-7b-chat-hf | ✓ | ✓ | ✓ |
| Llama-2-70b-chat-hf | ✓ | - | x |
| Meta-Llama-3-8B-Instruct | ✓ | ✓ | ✓ |
| Meta-Llama-3-70B-Instruct | ✓ | - | x |
| Phi-3 | x | Limit 4K | ✓ |
| Model | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | Ray |
| ------------------------- | --------- | -------- | ---------- | --- |
| Intel/neural-chat-7b-v3-3 | ✓ | ✓ | ✓ | ✓ |
| Llama-2-7b-chat-hf | ✓ | ✓ | ✓ | ✓ |
| Llama-2-70b-chat-hf | ✓ | - | ✓ | x |
| Meta-Llama-3-8B-Instruct | ✓ | ✓ | ✓ | ✓ |
| Meta-Llama-3-70B-Instruct | ✓ | - | ✓ | x |
| Phi-3 | x | Limit 4K | Limit 4K | ✓ |
4 changes: 2 additions & 2 deletions comps/llms/text-generation/ray_serve/README.md
@@ -21,7 +21,7 @@ export HF_TOKEN=<token>
You can then send requests to the OpenAI-compatible API as shown below to check the service status:

```bash
curl http://127.0.0.1:8080/v1/chat/completions \
curl http://127.0.0.1:8008/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": <model_name>,
@@ -45,6 +45,6 @@ The ./serving/ray/launch_ray_service.sh script accepts five parameters:
You have the flexibility to customize five parameters according to your specific needs. Additionally, you can set the Ray Gaudi endpoint by exporting the environment variable `RAY_Serve_ENDPOINT`:

```bash
export RAY_Serve_ENDPOINT="http://xxx.xxx.xxx.xxx:8080"
export RAY_Serve_ENDPOINT="http://xxx.xxx.xxx.xxx:8008"
export LLM_MODEL=<model_name> # example: export LLM_MODEL="meta-llama/Llama-2-7b-chat-hf"
```
comps/llms/text-generation/ray_serve/build_docker_rayserve.sh
@@ -7,7 +7,7 @@
cd docker

docker build \
-f Dockerfile ../../ \
-f Dockerfile.rayserve ../../ \
-t ray_serve:habana \
--network=host \
--build-arg http_proxy=${http_proxy} \
29 changes: 13 additions & 16 deletions comps/llms/text-generation/ray_serve/docker_compose_llm.yaml
@@ -16,28 +16,26 @@ version: "3.8"

services:
ray_service:
image: rayllm:habana
image: ray_serve:habana
container_name: ray-service
ports:
- "8008:80"
runtime: habana
ipc: host
volumes:
- "./data:/data"
environment:
- OMPI_MCA_btl_vader_single_copy_mechanism=none
- HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
- TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE}
- LLM_MODEL=${LLM_MODEL}
- CHAT_PROCESSOR=${CHAT_PROCESSOR}
OMPI_MCA_btl_vader_single_copy_mechanism: none
TRUST_REMOTE_CODE: True
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL: ${LLM_MODEL}
CHAT_PROCESSOR: ${CHAT_PROCESSOR}
HABANA_VISIBLE_DEVICES: all
cap_add:
- SYS_NICE
command: >
/bin/bash -c "ray start --head &&
python api_server_openai.py --port_number 80
--model_id_or_path ${LLM_MODEL}
--chat_processor ${CHAT_PROCESSOR}
--num_cpus_per_worker 8
--num_hpus_per_worker 1"
- sys_nice
command: /bin/bash -c "ray start --head && python api_server_openai.py --port_number 80 --model_id_or_path ${LLM_MODEL} --chat_processor ${CHAT_PROCESSOR} --num_cpus_per_worker 8 --num_hpus_per_worker 1"
llm:
image: opea/gen-ai-comps:llm-ray-server
image: opea/llm-ray:latest
container_name: llm-ray-server
ports:
- "9000:9000"
@@ -48,7 +46,6 @@ services:
RAY_Serve_ENDPOINT: ${RAY_Serve_ENDPOINT}
LLM_MODEL: ${LLM_MODEL}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
restart: unless-stopped

networks:
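
If you prefer Compose to the individual `docker run` commands, the file above can bring up both services together. A sketch, assuming `HUGGINGFACEHUB_API_TOKEN`, `LLM_MODEL`, `CHAT_PROCESSOR`, and `RAY_Serve_ENDPOINT` are already exported (use `docker-compose` instead if you are on the older standalone binary):

```bash
cd comps/llms/text-generation/ray_serve
docker compose -f docker_compose_llm.yaml up -d

# Follow the logs of the two services defined above
docker compose -f docker_compose_llm.yaml logs -f ray_service llm
```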
6 changes: 3 additions & 3 deletions comps/llms/text-generation/ray_serve/launch_ray_service.sh
@@ -5,8 +5,8 @@
# SPDX-License-Identifier: Apache-2.0

# Set default values
default_port=8080
default_model="meta-llama/Llama-2-7b-chat-hf"
default_port=8008
default_model=${LLM_MODEL}
default_chat_processor="ChatModelLlama"
default_num_cpus_per_worker=8
default_num_hpus_per_worker=1
@@ -31,4 +31,4 @@ if [ "$#" -lt 0 ] || [ "$#" -gt 5 ]; then
fi

# Build the Docker run command based on the number of cards
docker run -it --runtime=habana --name="ChatQnA_server" -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -p $port_number:$port_number -e HF_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TRUST_REMOTE_CODE=$TRUST_REMOTE_CODE ray_serve:habana /bin/bash -c "ray start --head && python api_server_openai.py --port_number $port_number --model_id_or_path $model_name --chat_processor $chat_processor --num_cpus_per_worker $num_cpus_per_worker --num_hpus_per_worker $num_hpus_per_worker"
docker run -it --runtime=habana --name="ray-service" -v $PWD/data:/data -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -p $port_number:80 -e HF_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TRUST_REMOTE_CODE=True ray_serve:habana /bin/bash -c "ray start --head && python api_server_openai.py --port_number 80 --model_id_or_path $model_name --chat_processor $chat_processor --num_cpus_per_worker $num_cpus_per_worker --num_hpus_per_worker $num_hpus_per_worker"
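
For reference, an illustrative invocation of the updated script; the five positional arguments line up with the defaults above, and the values shown are examples only:

```bash
# port, model, chat processor, CPUs per worker, HPUs per worker
bash launch_ray_service.sh 8008 meta-llama/Llama-2-7b-chat-hf ChatModelLlama 8 1
```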
4 changes: 2 additions & 2 deletions comps/llms/text-generation/vllm/README.md
@@ -25,7 +25,7 @@ export HF_TOKEN=<token>
You can then send requests like the one below to check the service status:

```bash
curl http://127.0.0.1:8080/v1/completions \
curl http://127.0.0.1:8008/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": <model_name>,
@@ -46,6 +46,6 @@ The `./serving/vllm/launch_vllm_service.sh` script accepts three parameters:
You have the flexibility to customize two parameters according to your specific needs. Additionally, you can set the vLLM endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`:

```bash
export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8080"
export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8008"
export LLM_MODEL=<model_name> # example: export LLM_MODEL="Intel/neural-chat-7b-v3-3"
```
2 changes: 1 addition & 1 deletion comps/llms/text-generation/vllm/docker_compose_llm.yaml
@@ -19,7 +19,7 @@ services:
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: /bin/sh -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --model $LLM_MODEL_ID --port 80"
llm:
image: opea/gen-ai-comps:llm-vllm-server
image: opea/llm-vllm:latest
container_name: llm-vllm-server
ports:
- "9000:9000"
17 changes: 9 additions & 8 deletions comps/llms/text-generation/vllm/launch_vllm_service.sh
@@ -1,25 +1,26 @@
#!/bin/bash


# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Set default values
default_port=8080
default_port=8008
default_hw_mode="cpu"
default_model="Intel/neural-chat-7b-v3-3"
default_model=${LLM_MODEL_ID}
default_parallel_number=1

# Assign arguments to variables
port_number=${1:-$default_port}
model_name=${2:-$default_model}
hw_mode=${3:-$default_hw_mode}
parallel_number=${4:-$default_parallel_number}

# Check if all required arguments are provided
if [ "$#" -lt 0 ] || [ "$#" -gt 3 ]; then
echo "Usage: $0 [port_number] [model_name] [hw_mode]"
if [ "$#" -lt 0 ] || [ "$#" -gt 4 ]; then
echo "Usage: $0 [port_number] [model_name] [hw_mode] [parallel_number]"
echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080."
echo "model_name: The model name utilized for LLM, with the default set to 'Intel/neural-chat-7b-v3-3'."
echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'"
echo "parallel_number: parallel nodes number for 'hpu' mode"
exit 1
fi

@@ -28,7 +29,7 @@ volume=$PWD/data

# Build the Docker run command based on hardware mode
if [ "$hw_mode" = "hpu" ]; then
docker run -it --runtime=habana --rm --name="ChatQnA_server" -p $port_number:$port_number -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number"
docker run -it --runtime=habana --rm --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80"
else
docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number"
docker run -it --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port 80"
fi
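
An illustrative invocation of the updated script (example values only; omitted arguments fall back to the defaults above):

```bash
# CPU serving on the default port, model taken from LLM_MODEL_ID
bash launch_vllm_service.sh

# HPU serving on port 8008 with tensor parallelism across 2 cards
bash launch_vllm_service.sh 8008 Intel/neural-chat-7b-v3-3 hpu 2
```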
5 changes: 3 additions & 2 deletions comps/llms/text-generation/vllm/llm.py
@@ -31,12 +31,13 @@ def post_process_text(text: str):
)
@traceable(run_type="llm")
def llm_generate(input: LLMParamsDoc):
llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8080")
llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8008")
model_name = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
llm = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=llm_endpoint + "/v1",
max_tokens=input.max_new_tokens,
model_name=os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct"),
model_name=model_name,
top_p=input.top_p,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
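
Once the wrapper is deployed (port 9000 in the README), the new endpoint and model defaults can be exercised end to end. A sketch; the request field names `query` and `streaming` are assumptions based on OPEA LLM microservice conventions and are not shown in this diff, while `max_new_tokens`, `top_p`, `temperature`, and `repetition_penalty` mirror the `LLMParamsDoc` attributes used above:

```bash
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{
    "query": "What is Deep Learning?",
    "max_new_tokens": 17,
    "top_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.03,
    "streaming": true
  }'
```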
1 change: 1 addition & 0 deletions comps/llms/text-generation/vllm/requirements.txt
@@ -6,6 +6,7 @@ langserve
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
setuptools==69.5.1
shortuuid
transformers
vllm