Skip to content

Commit

Permalink
Merge pull request #1 from mixedbread-ai/rui/mixedbread-ai-integration
Browse files Browse the repository at this point in the history
feat: add mixedbread ai integration
  • Loading branch information
juliuslipp authored Jul 9, 2024
2 parents 3f581e6 + d618105 commit dfd0724
Show file tree
Hide file tree
Showing 12 changed files with 14,058 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

### Enhancements

* **Add Mixedbread AI embedder** Adds a Mixedbread AI embedding encoder to support generating embeddings via the Mixedbread AI API.

### Features

### Fixes
Expand Down
42 changes: 42 additions & 0 deletions examples/embed/example_mixedbreadai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os

from unstructured.documents.elements import Text
from unstructured.embed.mixedbreadai import (
MixedbreadAIEmbeddingConfig,
MixedbreadAIEmbeddingEncoder,
)

# To use Mixedbread AI, you need to pass your
# Mixedbread AI API key (obtained from https://www.mixedbread.ai)
# as the ``api_key`` parameter.
#
# The ``model_name`` parameter is mandatory; see the available models
# at https://www.mixedbread.ai/docs/embeddings/models#whats-new-in-the-mixedbread-embed-model-family

# Build the encoder configuration. The API key is read from the MXBAI_API_KEY
# environment variable; ``None`` is passed when the variable is not set.
config = MixedbreadAIEmbeddingConfig(
    api_key=os.environ.get("MXBAI_API_KEY"),
    model_name="mixedbread-ai/mxbai-embed-large-v1",
)
embedding_encoder = MixedbreadAIEmbeddingEncoder(config=config)
embedding_encoder.initialize()

# Embed two short text elements
sentences = [Text("This is sentence 1"), Text("This is sentence 2")]
elements = embedding_encoder.embed_documents(elements=sentences)

# Embed a query string
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Print each element together with its embedding
for element in elements:
    print(element, element.embeddings)

# Print the query together with its embedding
print(query, query_embedding)

# Print the unit vector status and the number of dimensions
print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
3 changes: 3 additions & 0 deletions requirements/ingest/embed-mixedbreadai.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-c ../deps/constraints.txt
-c ../base.txt
mixedbread-ai
47 changes: 47 additions & 0 deletions requirements/ingest/embed-mixedbreadai.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements/ingest/embed-mixedbreadai.in
#
annotated-types==0.7.0
# via pydantic
anyio==3.7.1
# via
# -c requirements/ingest/../deps/constraints.txt
# httpx
certifi==2024.6.2
# via
# -c requirements/ingest/../base.txt
# -c requirements/ingest/../deps/constraints.txt
# httpcore
# httpx
exceptiongroup==1.2.1
# via anyio
h11==0.14.0
# via httpcore
httpcore==1.0.5
# via httpx
httpx==0.27.0
# via mixedbread-ai
idna==3.7
# via
# -c requirements/ingest/../base.txt
# anyio
# httpx
mixedbread-ai==2.2.6
# via -r requirements/ingest/embed-mixedbreadai.in
pydantic==2.7.4
# via mixedbread-ai
pydantic-core==2.18.4
# via pydantic
sniffio==1.3.1
# via
# anyio
# httpx
typing-extensions==4.12.2
# via
# -c requirements/ingest/../base.txt
# mixedbread-ai
# pydantic
# pydantic-core
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
"embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),
"embed-mixedbreadai": load_requirements("requirements/ingest/embed-mixedbreadai.in"),
"openai": load_requirements("requirements/ingest/embed-openai.in"),
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),
Expand Down
72 changes: 72 additions & 0 deletions test_unstructured/embed/test_mixedbreadai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os

import pytest

from unstructured.documents.elements import Text
from unstructured.embed.mixedbreadai import (
MixedbreadAIEmbeddingConfig,
MixedbreadAIEmbeddingEncoder,
)


def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Check that embedding via a mocked client keeps ``to_dict()`` working.

    ``create_client`` is patched to return a mock whose ``embeddings`` call
    produces one two-number embedding per input item, so no network access
    is needed.
    """

    def fake_embeddings(
        model,
        normalized,
        encoding_format,
        truncation_strategy,
        dimensions,
        prompt,
        request_options,
        input,
    ):
        # Parameter names are kept as-is: the stub is invoked in place of the
        # client's ``embeddings`` call and must accept the same arguments.
        response = mocker.MagicMock()
        response.data = [
            mocker.MagicMock(embedding=[index, index + 1]) for index in range(len(input))
        ]
        return response

    fake_client = mocker.MagicMock()
    fake_client.embeddings.side_effect = fake_embeddings

    # Make the encoder use the fake client instead of creating a real one
    mocker.patch.object(MixedbreadAIEmbeddingEncoder, "create_client", return_value=fake_client)

    config = MixedbreadAIEmbeddingConfig(
        api_key="api_key", model_name="mixedbread-ai/mxbai-embed-large-v1"
    )
    encoder = MixedbreadAIEmbeddingEncoder(config=config)
    encoder.initialize()

    documents = [Text("This is sentence 1"), Text("This is sentence 2")]
    elements = encoder.embed_documents(elements=documents)

    assert len(elements) == 2
    assert [element.to_dict()["text"] for element in elements] == [
        "This is sentence 1",
        "This is sentence 2",
    ]
    assert all(element.embeddings is not None for element in elements)


@pytest.mark.skipif(
    not os.environ.get("MXBAI_API_KEY"),
    reason="Export an env var called MXBAI_API_KEY "
    "containing the Mixedbread AI API key to run this test.",
)
def test_embed_documents_live():
    """Embed two text elements with a real encoder and check the results.

    Skipped unless the MXBAI_API_KEY environment variable is set to a
    non-empty value. No ``api_key`` is passed to the config here.
    """
    config = MixedbreadAIEmbeddingConfig(model_name="mixedbread-ai/mxbai-embed-large-v1")
    encoder = MixedbreadAIEmbeddingEncoder(config=config)
    encoder.initialize()

    documents = [Text("This is sentence 1"), Text("This is sentence 2")]
    elements = encoder.embed_documents(elements=documents)

    assert len(elements) == 2
    assert [element.to_dict()["text"] for element in elements] == [
        "This is sentence 1",
        "This is sentence 2",
    ]
    assert all(element.embeddings is not None for element in elements)
Loading

0 comments on commit dfd0724

Please sign in to comment.