add milvus microservice #158

Merged
merged 4 commits on Jun 13, 2024
53 changes: 0 additions & 53 deletions CONTRIBUTING.md

This file was deleted.

10 changes: 5 additions & 5 deletions LEGAL_INFORMATION.md
@@ -5,7 +5,7 @@

## License

Generative AI Examples is licensed under [Apache License Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
Generative AI Components is licensed under [Apache License Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
This software includes components that have separate copyright notices and licensing terms.
Your use of the source code for these components is subject to the terms and conditions of the following licenses.

@@ -15,13 +15,13 @@ See the accompanying [license](/LICENSE) file for full license text and copyrigh

## Citation

If you use Generative AI Examples in your research, use the following BibTeX entry.
If you use Generative AI Components in your research, use the following BibTeX entry.

```
@misc{Generative AI Examples,
@misc{Generative AI Components,
author = {Liang Lv, Haihao Shen},
title = {Generative AI Examples},
howpublished = {\url{https://github.com/opea-project/GenAIExamples}},
title = {Generative AI Components},
howpublished = {\url{https://github.com/opea-project/GenAIComps}},
year = {2024}
}
```
5 changes: 3 additions & 2 deletions README.md
@@ -251,6 +251,7 @@ self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port

## Additional Content

- [Contribution](/CONTRIBUTING.md)
- [Code of Conduct](https://github.com/opea-project/docs/tree/main/community/CODE_OF_CONDUCT.md)
- [Contribution](https://github.com/opea-project/docs/tree/main/community/CONTRIBUTING.md)
- [Security Policy](https://github.com/opea-project/docs/tree/main/community/SECURITY.md)
- [Legal Information](/LEGAL_INFORMATION.md)
- [Security Policy](/SECURITY.md)
9 changes: 0 additions & 9 deletions SECURITY.md

This file was deleted.

55 changes: 55 additions & 0 deletions comps/dataprep/milvus/README.md
@@ -0,0 +1,55 @@
# Dataprep Microservice with Milvus

# 🚀Start Microservice with Python

## Install Requirements

```bash
pip install -r requirements.txt
```

## Start Milvus Server

Please refer to this [readme](../../../vectorstores/langchain/milvus/README.md).

## Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
export MILVUS=${your_milvus_host_ip}
export MILVUS_PORT=19530
export COLLECTION_NAME=${your_collection_name}
export TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint}
```

## Start Document Preparation Microservice for Milvus with Python Script

Start the document preparation microservice for Milvus with the command below.

```bash
python prepare_doc_milvus.py
```

# 🚀Start Microservice with Docker

## Build Docker Image

```bash
cd ../../../../
docker build -t opea/dataprep-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/milvus/docker/Dockerfile .
```

## Run Docker with CLI

```bash
docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -v /your_document_path/:/home/user/doc -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint} -e MILVUS=${your_milvus_host_ip} opea/dataprep-milvus:latest
```

# Invoke Microservice

Once the document preparation microservice for Milvus is started, you can use the command below to invoke it; the microservice converts the document into embeddings and saves them to the database.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name"}' http://localhost:6010/v1/dataprep
```
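
The same endpoint can also be called from Python. The snippet below is a minimal sketch, assuming the service is reachable at `localhost:6010` and the document is already visible inside the container under the mounted `/home/user/doc` directory; it only requires the `requests` package.

```python
# Minimal sketch: call the dataprep endpoint from Python.
# Assumes the service listens on localhost:6010 and the path refers to a file
# visible inside the container (e.g. via the -v mount shown above).
import requests

payload = {"path": "/home/user/doc/your_document_name"}
response = requests.post(
    "http://localhost:6010/v1/dataprep",
    json=payload,
    timeout=600,  # ingesting and embedding large documents can take a while
)
response.raise_for_status()
print(response.status_code, response.text)
```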
2 changes: 2 additions & 0 deletions comps/dataprep/milvus/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
13 changes: 13 additions & 0 deletions comps/dataprep/milvus/config.py
@@ -0,0 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "maidalun1020/bce-embedding-base_v1")
# Embedding endpoints
EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "")
# MILVUS configuration
MILVUS_HOST = os.getenv("MILVUS", "localhost")
MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530))
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_milvus")
31 changes: 31 additions & 0 deletions comps/dataprep/milvus/docker/Dockerfile
@@ -0,0 +1,31 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

ENV LANG C.UTF-8

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
libgl1-mesa-glx \
libjemalloc-dev \
vim

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/dataprep/milvus/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/dataprep/milvus

ENTRYPOINT ["python", "prepare_doc_milvus.py"]

66 changes: 66 additions & 0 deletions comps/dataprep/milvus/prepare_doc_milvus.py
@@ -0,0 +1,66 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
import sys

from config import COLLECTION_NAME, EMBED_MODEL, EMBEDDING_ENDPOINT, MILVUS_HOST, MILVUS_PORT
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
from langchain_milvus.vectorstores import Milvus

from comps.cores.mega.micro_service import opea_microservices, register_microservice
from comps.cores.proto.docarray import DocPath
from comps.cores.telemetry.opea_telemetry import opea_telemetry

current_script_path = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_script_path)
sys.path.append(parent_dir)
from utils import document_loader


@register_microservice(
name="opea_service@prepare_doc_milvus",
endpoint="/v1/dataprep",
host="0.0.0.0",
port=6010,
input_datatype=DocPath,
output_datatype=None,
)
# @opea_telemetry
def ingest_documents(doc_path: DocPath):
"""Ingest document to Milvus."""
doc_path = doc_path.path
print(f"Parsing document {doc_path}.")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
content = document_loader(doc_path)
chunks = text_splitter.split_text(content)

print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
# Create vectorstore
if EMBEDDING_ENDPOINT:
# create embeddings using TEI endpoint service
embedder = HuggingFaceHubEmbeddings(model=EMBEDDING_ENDPOINT)
else:
# create embeddings using local embedding model
embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)

# Batch size
batch_size = 32
num_chunks = len(chunks)
for i in range(0, num_chunks, batch_size):
batch_chunks = chunks[i : i + batch_size]
batch_texts = batch_chunks

_ = Milvus.from_texts(
texts=batch_texts,
embedding=embedder,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
)
print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")


if __name__ == "__main__":
opea_microservices["opea_service@prepare_doc_milvus"].start()
21 changes: 21 additions & 0 deletions comps/dataprep/milvus/requirements.txt
@@ -0,0 +1,21 @@
beautifulsoup4
docarray[full]
easyocr
fastapi
frontend==0.0.3
huggingface_hub
langchain
langchain-community
langchain_milvus
numpy
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
pandas
Pillow
pydantic==2.7.3
pymilvus==2.4.3
pymupdf==1.24.5
python-docx==0.8.11
sentence_transformers
shortuuid
68 changes: 68 additions & 0 deletions comps/retrievers/langchain/milvus/README.md
@@ -0,0 +1,68 @@
# Retriever Microservice with Milvus

# 🚀Start Microservice with Python

## Install Requirements

```bash
pip install -r requirements.txt
```

## Start Milvus Server

Please refer to this [readme](../../../vectorstores/langchain/milvus/README.md).

## Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
export MILVUS=${your_milvus_host_ip}
export MILVUS_PORT=19530
export COLLECTION_NAME=${your_collection_name}
export TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint}
```

## Start Retriever Service

```bash
export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060"
python retriever_milvus.py
```

# 🚀Start Microservice with Docker

## Build Docker Image

```bash
cd ../../../../
docker build -t opea/retriever-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/langchain/milvus/docker/Dockerfile .
```

## Run Docker with CLI

```bash
docker run -d --name="retriever-milvus-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint} -e MILVUS=${your_milvus_host_ip} opea/retriever-milvus:latest
```

# 🚀Consume Retriever Service

## Check Service Status

```bash
curl http://${your_ip}:7000/v1/health_check \
-X GET \
-H 'Content-Type: application/json'
```

## Consume Retriever Service

To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python.

```bash
your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
curl http://${your_ip}:7000/v1/retrieval \
-X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
-H 'Content-Type: application/json'
```
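
The same retrieval call can be made from Python. The sketch below mirrors the curl command above, assuming the retriever is reachable at `localhost:7000`; the mock 768-dimensional embedding stands in for the output of a real embedding service.

```python
# Sketch: query the retriever endpoint with a mock 768-dimensional embedding.
# Assumes the retriever service is reachable at localhost:7000; in a real
# pipeline the embedding would come from the TEI embedding service instead.
import random

import requests

payload = {
    "text": "What is the revenue of Nike in 2023?",
    "embedding": [random.uniform(-1, 1) for _ in range(768)],
}
response = requests.post("http://localhost:7000/v1/retrieval", json=payload, timeout=60)
response.raise_for_status()
print(response.json())
```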
2 changes: 2 additions & 0 deletions comps/retrievers/langchain/milvus/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
13 changes: 13 additions & 0 deletions comps/retrievers/langchain/milvus/config.py
@@ -0,0 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "maidalun1020/bce-embedding-base_v1")
# Embedding endpoints
EMBED_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "")
# MILVUS configuration
MILVUS_HOST = os.getenv("MILVUS", "localhost")
MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530))
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_milvus")