From 8d07ffbc4d772de087c4136ebe69315b1b64bc5f Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 13 Nov 2025 15:19:22 +0300 Subject: [PATCH 01/27] update infinity-emb version to 0.0.77 and add optimum dependency --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9812a81..d82080a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ runpod~=1.7.0 -infinity-emb[all]==0.0.76 +infinity-emb[all]==0.0.77 +optimum==1.24.0 einops # deployment of custom code with nomic git+https://github.com/pytorch-labs/float8_experimental.git From 9acf3f0d24c41ced6c88b7472aeecb5d48011b6d Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 13 Nov 2025 18:00:27 +0300 Subject: [PATCH 02/27] added clip-based model --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2c81997..0bd5734 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: count: all capabilities: [gpu] environment: - MODEL_NAMES: "BAAI/bge-small-en-v1.5" + MODEL_NAMES: "BAAI/bge-small-en-v1.5;patrickjohncyh/fashion-clip" NVIDIA_VISIBLE_DEVICES: "all" volumes: - ./data/runpod-volume:/runpod-volume From 2c130ca6d61c1057b69ab1efecfffc6293b560cf Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 13 Nov 2025 18:00:40 +0300 Subject: [PATCH 03/27] multimodal tests --- .runpod/tests.json | 93 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 4 deletions(-) diff --git a/.runpod/tests.json b/.runpod/tests.json index 02b8a4f..ed0d318 100644 --- a/.runpod/tests.json +++ b/.runpod/tests.json @@ -1,12 +1,97 @@ { "tests": [ { - "name": "basic_test", + "name": "multimodal_text", "input": { - "model": "BAAI/bge-small-en-v1.5", - "input": "Hello, world!" 
+ "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": "A beautiful red dress" + } + }, + "expected_output": { + "status": "COMPLETED" }, "timeout": 10000 + }, + { + "name": "multimodal_image_url", + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": "https://images.unsplash.com/photo-1515372039744-b8f02a3ae446?w=400" + } + }, + "expected_output": { + "status": "COMPLETED" + }, + "timeout": 15000 + }, + { + "name": "multimodal_multiple_images", + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": [ + "https://images.unsplash.com/photo-1515372039744-b8f02a3ae446?w=400", + "https://images.unsplash.com/photo-1551028719-00167b16eac5?w=400" + ] + } + }, + "expected_output": { + "status": "COMPLETED" + }, + "timeout": 20000 + }, + { + "name": "multimodal_multiple_texts", + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": [ + "A red dress", + "A blue shirt", + "Black shoes" + ] + } + }, + "expected_output": { + "status": "COMPLETED" + }, + "timeout": 15000 + }, + { + "name": "multimodal_mixed_text_and_images", + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": [ + "A red summer dress", + "https://images.unsplash.com/photo-1515372039744-b8f02a3ae446?w=400", + "A blue winter coat", + "https://images.unsplash.com/photo-1551028719-00167b16eac5?w=400" + ] + } + }, + "expected_output": { + "status": "COMPLETED" + }, + "timeout": 25000 + }, + { + "name": "get_models_list", + "input": { + "openai_route": "/v1/models", + "openai_input": {} + }, + "expected_output": { + "status": "COMPLETED" + }, + "timeout": 5000 } ], "config": { @@ -16,7 +101,7 @@ "env": [ { "key": "MODEL_NAMES", - "value": "BAAI/bge-small-en-v1.5" + "value": "patrickjohncyh/fashion-clip" } ] } From 17fee84a900908ebe329aeda36dcfcece7f4d24b Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 13 Nov 2025 18:10:04 +0300 Subject: [PATCH 04/27] added multimodal routing (images and/or text) --- src/multimodal_utils.py | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/multimodal_utils.py diff --git a/src/multimodal_utils.py b/src/multimodal_utils.py new file mode 100644 index 0000000..fe55477 --- /dev/null +++ b/src/multimodal_utils.py @@ -0,0 +1,63 @@ +import base64 +import io +import re +from PIL import Image + + +def _is_url(text: str) -> bool: + """Check if string is a URL""" + return text.startswith('http://') or text.startswith('https://') + + +def _is_base64_image(data: str) -> tuple[bool, Image.Image | None]: + """ + Check if string is a base64-encoded image and decode it. + Supports both data URI format (data:image/...) and raw base64. + Returns (is_image, PIL_Image or None) + """ + try: + # Handle data URI format: data:image/png;base64,iVBORw0KG... 
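+        # Only data URIs declaring an image/* media type pass the regex below;
+        # other data: URIs are rejected. Any other string falls through to the
+        # raw-base64 attempt, where b64decode/Image.open raise for non-image
+        # payloads and the except clause turns that into (False, None).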
+ if data.startswith('data:'): + match = re.match(r'data:image/[^;]+;base64,(.+)', data) + if match: + base64_data = match.group(1) + else: + return False, None + else: + # Try raw base64 + base64_data = data + + img_bytes = base64.b64decode(base64_data) + + img = Image.open(io.BytesIO(img_bytes)) + img.load() # Force load to validate it's a real image + return True, img + except Exception: + + return False, None + + +async def parse_input_item(item: str | bytes | Image.Image) -> tuple[str, str | Image.Image | bytes]: + """ + Parse a single input item and determine if it's text or image. + Returns: (type, processed_data) + where type is 'text' or 'image' + and processed_data is either the original text or PIL.Image/URL/bytes + + Note: infinity_emb handles URL downloading internally, so we just pass URLs through. + """ + # Check if it's an image + if isinstance(item, (Image.Image, bytes)): + return 'image', item + + if isinstance(item, str): + # URL images + if _is_url(item): + return 'image', item + + # Base64 images + is_base64_img, img = _is_base64_image(item) + if is_base64_img and img is not None: + return 'image', img + + return 'text', item if isinstance(item, str) else str(item) From 3b6a67e0c4a54602bedc47ad904e71ccdd6fc00e Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 13 Nov 2025 18:10:43 +0300 Subject: [PATCH 05/27] support mixed text and image inputs, improve error handling and logging --- src/embedding_service.py | 109 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 8 deletions(-) diff --git a/src/embedding_service.py b/src/embedding_service.py index a5afc44..7f17e36 100644 --- a/src/embedding_service.py +++ b/src/embedding_service.py @@ -1,13 +1,20 @@ -from config import EmbeddingServiceConfig +import asyncio +import logging + from infinity_emb.engine import AsyncEngineArray, EngineArgs +from infinity_emb.primitives import ModelNotDeployedError +from PIL import Image + +from config import EmbeddingServiceConfig +from src.multimodal_utils import parse_input_item from utils import ( - OpenAIModelInfo, ModelInfo, + OpenAIModelInfo, list_embeddings_to_response, to_rerank_response, ) -import asyncio +logger = logging.getLogger(__name__) class EmbeddingService: @@ -56,24 +63,110 @@ def list_models(self) -> list[str]: async def route_openai_get_embeddings( self, - embedding_input: str | list[str], + embedding_input: str | list[str] | list[str | bytes | Image.Image], model_name: str, return_as_list: bool = False, ): - """returns embeddings for the input text""" + """ + Returns embeddings for the input text and/or images. + Supports mixed text and image inputs while preserving order. + """ if not self.is_running: await self.start() + + available_models = self.list_models() + if model_name not in available_models: + logger.error( + f"Requested model '{model_name}' not found. " + f"Available models: {available_models}" + ) + raise ValueError( + f"Model '{model_name}' is not available. 
" + f"Available models: {', '.join(available_models)}" + ) + if not isinstance(embedding_input, list): embedding_input = [embedding_input] - embeddings, usage = await self.engine_array[model_name].embed(embedding_input) + parsed_items = await asyncio.gather( + *[parse_input_item(item) for item in embedding_input] + ) + + # Separate text and image items while tracking their original indices + text_items = [] + text_indices = [] + image_items = [] + image_indices = [] + + for idx, (item_type, processed_data) in enumerate(parsed_items): + if item_type == "text": + text_items.append(processed_data) + text_indices.append(idx) + else: # item_type == 'image' + image_items.append(processed_data) + image_indices.append(idx) + + logger.info( + f"Processing embeddings for model '{model_name}': " + f"{len(text_items)} text items, {len(image_items)} image items " + f"(total: {len(embedding_input)} items)" + ) + + text_embeddings = [] + text_usage = 0 + image_embeddings = [] + image_usage = 0 + + if text_items: + logger.debug(f"Calling .embed() with {len(text_items)} text items") + text_embeddings, text_usage = await self.engine_array[model_name].embed( + text_items + ) + logger.debug(f"Successfully got {len(text_embeddings)} text embeddings") + + if image_items: + logger.debug(f"Calling .image_embed() with {len(image_items)} image items") + try: + image_embeddings, image_usage = await self.engine_array[ + model_name + ].image_embed(images=image_items) + logger.debug( + f"Successfully got {len(image_embeddings)} image embeddings" + ) + except ModelNotDeployedError as e: + error_msg = ( + f"Model '{model_name}' does not support image embeddings. " + f"Please use a multimodal model (e.g., 'jinaai/jina-clip-v1') " + f"or provide text-only input. Found {len(image_items)} image items in the request." + ) + logger.error(f"{error_msg} Original error: {e}") + raise ValueError(error_msg) from e + + # Merge embeddings back in original order + total_items = len(embedding_input) + ordered_embeddings: list = [None] * total_items + + for idx, embedding in zip(text_indices, text_embeddings): + ordered_embeddings[idx] = embedding + + for idx, embedding in zip(image_indices, image_embeddings): + ordered_embeddings[idx] = embedding + + total_usage = text_usage + image_usage + if return_as_list: return [ - list_embeddings_to_response(embeddings, model=model_name, usage=usage) + list_embeddings_to_response( + ordered_embeddings, + model=model_name, + usage=total_usage, # type: ignore[arg-type] + ) ] else: return list_embeddings_to_response( - embeddings, model=model_name, usage=usage + ordered_embeddings, + model=model_name, + usage=total_usage, # type: ignore[arg-type] ) async def infinity_rerank( From ed7b1f85391a4420d178d079f41845f52df71227 Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 13 Nov 2025 18:11:02 +0300 Subject: [PATCH 06/27] update README to reflect multimodal support for text and image inputs --- README.md | 177 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 151 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index b22672b..c799f69 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,9 @@ --- -High-throughput, OpenAI-compatible text embedding & reranker powered by [Infinity](https://github.com/michaelfeil/infinity) +High-throughput, OpenAI-compatible **text & image embedding** & reranker powered by [Infinity](https://github.com/michaelfeil/infinity) + +**✨ New: Multimodal Support!** Now supports text, images (URLs & base64), and mixed inputs. 
--- @@ -11,14 +13,21 @@ High-throughput, OpenAI-compatible text embedding & reranker powered by [Infinit --- 1. [Quickstart](#quickstart) -2. [Endpoint Configuration](#endpoint-configuration) -3. [API Specification](#api-specification) +2. [Multimodal Features](#multimodal-features) +3. [Endpoint Configuration](#endpoint-configuration) +4. [API Specification](#api-specification) 1. [List Models](#list-models) 2. [Create Embeddings](#create-embeddings) 3. [Rerank Documents](#rerank-documents) -4. [Usage](#usage) -5. [Further Documentation](#further-documentation) -6. [Acknowledgements](#acknowledgements) +5. [Usage](#usage) + 1. [List Models](#list-models-1) + 2. [Text Embeddings](#text-embeddings) + 3. [Image Embeddings](#image-embeddings) + 4. [Mixed Text & Image Inputs](#mixed-text--image-inputs) + 5. [Reranking](#reranking) +6. [Testing](#testing) +7. [Further Documentation](#further-documentation) +8. [Acknowledgements](#acknowledgements) --- @@ -31,18 +40,45 @@ High-throughput, OpenAI-compatible text embedding & reranker powered by [Infinit --- +## Multimodal Features + +### Supported Input Types + +- ✅ **Text** – traditional text embeddings +- ✅ **Image URLs** – `http://` or `https://` links to images (`.jpg`, `.png`, `.gif`, etc.) +- ✅ **Base64 Images** – data URI format (`data:image/png;base64,...`) +- ✅ **Mixed Inputs** – combine text and images in a single request (order preserved) + +### Automatic Type Detection + +The worker automatically detects whether your input is text or an image: +- URLs ending with image extensions → processed as images +- Data URI with `data:image/...;base64` → decoded and processed as images +- Everything else → processed as text + +### Multimodal Models + +To use image embeddings, deploy a multimodal model such as: +- `patrickjohncyh/fashion-clip` – Fashion-focused CLIP model +- `jinaai/jina-clip-v1` – General-purpose multimodal embeddings +- Any other CLIP-based model with `image_embed` support + +> **Note:** Text-only models (like `BAAI/bge-small-en-v1.5`) will reject image inputs with a clear error message. + +--- + ## Endpoint Configuration All behaviour is controlled through environment variables: -| Variable | Required | Default | Description | -| ------------------------ | -------- | ------- | ---------------------------------------------------------------------------------------------------------------- | -| `MODEL_NAMES` | **Yes** | — | One or more Hugging-Face model IDs. Separate multiple IDs with a semicolon.
Example: `BAAI/bge-small-en-v1.5` | -| `BATCH_SIZES` | No | `32` | Per-model batch size; semicolon-separated list matching `MODEL_NAMES`. | -| `BACKEND` | No | `torch` | Inference engine for _all_ models: `torch`, `optimum`, or `ctranslate2`. | -| `DTYPES` | No | `auto` | Precision per model (`auto`, `fp16`, `fp8`). Semicolon-separated, must match `MODEL_NAMES`. | -| `INFINITY_QUEUE_SIZE` | No | `48000` | Max items queueable inside the Infinity engine. | -| `RUNPOD_MAX_CONCURRENCY` | No | `300` | Max concurrent requests the RunPod wrapper will accept. | +| Variable | Required | Default | Description | +| ------------------------ | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `MODEL_NAMES` | **Yes** | — | One or more Hugging-Face model IDs. Separate multiple IDs with a semicolon.
Example: `BAAI/bge-small-en-v1.5;patrickjohncyh/fashion-clip` | +| `BATCH_SIZES` | No | `32` | Per-model batch size; semicolon-separated list matching `MODEL_NAMES`.
Example: `32;16` | +| `BACKEND` | No | `torch` | Inference engine for _all_ models: `torch`, `optimum`, or `ctranslate2`. | +| `DTYPES` | No | `auto` | Precision per model (`auto`, `fp16`, `fp8`). Semicolon-separated, must match `MODEL_NAMES`.
Example: `auto;auto` | +| `INFINITY_QUEUE_SIZE` | No | `48000` | Max items queueable inside the Infinity engine. | +| `RUNPOD_MAX_CONCURRENCY` | No | `300` | Max concurrent requests the RunPod wrapper will accept. | --- @@ -80,10 +116,10 @@ Except for transport (path + wrapper object) the JSON you send/receive is identi #### Request Fields (shared) -| Field | Type | Required | Description | -| ------- | ------------------- | -------- | ------------------------------------------------- | -| `model` | string | **Yes** | One of the IDs supplied via `MODEL_NAMES`. | -| `input` | string | array | **Yes** | A single text string _or_ list of texts to embed. | +| Field | Type | Required | Description | +| ------- | ------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------- | +| `model` | string | **Yes** | One of the IDs supplied via `MODEL_NAMES`. | +| `input` | string | array | **Yes** | Text string(s), image URL(s), base64 image(s), or mixed list. Automatically detects type. Order preserved for mixed inputs.| OpenAI route vs. Standard: @@ -146,34 +182,123 @@ Below are minimal `curl` snippets so you can copy-paste from any machine. > Replace `` with your endpoint ID and `` with a [RunPod API key](https://docs.runpod.io/get-started/api-keys). -### OpenAI-Compatible Calls +### List Models ```bash -# List models +# OpenAI-compatible format curl -H "Authorization: Bearer " \ https://api.runpod.ai/v2//openai/v1/models -# Create embeddings +# Standard RunPod format +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"input":{"openai_route":"/v1/models"}}' \ + https://api.runpod.ai/v2//runsync +``` + +### Text Embeddings + +```bash +# OpenAI-compatible format curl -X POST \ -H "Authorization: Bearer " \ -H "Content-Type: application/json" \ -d '{"model":"BAAI/bge-small-en-v1.5","input":"Hello world"}' \ https://api.runpod.ai/v2//openai/v1/embeddings + +# Standard RunPod format +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"input":{"model":"BAAI/bge-small-en-v1.5","input":"Hello world"}}' \ + https://api.runpod.ai/v2//runsync ``` -### Standard RunPod Calls +### Image Embeddings ```bash -# Create embeddings (wait for result) +# OpenAI-compatible format (image URL) curl -X POST \ + -H "Authorization: Bearer " \ -H "Content-Type: application/json" \ - -d '{"input":{"model":"BAAI/bge-small-en-v1.5","input":"Hello world"}}' \ + -d '{"model":"patrickjohncyh/fashion-clip","input":"https://example.com/image.jpg"}' \ + https://api.runpod.ai/v2//openai/v1/embeddings + +# Standard RunPod format (base64 image) +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"input":{"model":"patrickjohncyh/fashion-clip","input":"data:image/png;base64,iVBORw0KG..."}}' \ https://api.runpod.ai/v2//runsync +``` + +### Mixed Text & Image Inputs + +```bash +# OpenAI-compatible format +curl -X POST \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ + "model": "patrickjohncyh/fashion-clip", + "input": [ + "Hello, world!", + "https://example.com/image.jpg", + "Bye, world!" + ] + }' \ + https://api.runpod.ai/v2//openai/v1/embeddings + +# Standard RunPod format +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "input": { + "model": "patrickjohncyh/fashion-clip", + "input": [ + "Hello, world!", + "https://example.com/image.jpg", + "Bye, world!" 
+ ] + } + }' \ + https://api.runpod.ai/v2//runsync +``` + +**Note:** Output embeddings will be in the same order as inputs! + +### Reranking + +```bash +# OpenAI-compatible format +curl -X POST \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ + "model": "BAAI/bge-reranker-large", + "query": "Which product has warranty coverage?", + "docs": [ + "Product A comes with a 2-year warranty", + "Product B is available in red and blue colors", + "All electronics include a standard 1-year warranty" + ], + "return_docs": true + }' \ + https://api.runpod.ai/v2//openai/v1/rerank -# Rerank +# Standard RunPod format curl -X POST \ -H "Content-Type: application/json" \ - -d '{"input":{"model":"BAAI/bge-reranker-large","query":"Which product has warranty coverage?","docs":["Product A comes with a 2-year warranty","Product B is available in red and blue colors","All electronics include a standard 1-year warranty"],"return_docs":true}}' \ + -d '{ + "input": { + "model": "BAAI/bge-reranker-large", + "query": "Which product has warranty coverage?", + "docs": [ + "Product A comes with a 2-year warranty", + "Product B is available in red and blue colors", + "All electronics include a standard 1-year warranty" + ], + "return_docs": true + } + }' \ https://api.runpod.ai/v2//runsync ``` From 267e95971e9d5e2d9c34f4c0d3af57d9bea6e2c8 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 16:06:47 +0300 Subject: [PATCH 07/27] local test modality ignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b69f0a6..2ab3544 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,5 @@ cython_debug/ #.idea/ -data \ No newline at end of file +data +test_modality.py \ No newline at end of file From 6aa6a3f8d323f76ba4c80934afebdcd8e089b773 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 16:07:07 +0300 Subject: [PATCH 08/27] add httpx dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d82080a..3bb060a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ runpod~=1.7.0 infinity-emb[all]==0.0.77 optimum==1.24.0 einops # deployment of custom code with nomic +httpx>=0.27.0 git+https://github.com/pytorch-labs/float8_experimental.git From 6b66b7d032c169038a761eb1f18714635c7cb7ad Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 16:09:32 +0300 Subject: [PATCH 09/27] updated tests --- .runpod/tests.json | 60 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/.runpod/tests.json b/.runpod/tests.json index ed0d318..40e0da3 100644 --- a/.runpod/tests.json +++ b/.runpod/tests.json @@ -1,7 +1,24 @@ { "tests": [ { - "name": "multimodal_text", + "name": "text_embedding_explicit_modality", + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": "A beautiful red dress", + "extra_body": { + "modality": "text" + } + } + }, + "expected_output": { + "status": "COMPLETED" + }, + "timeout": 10000 + }, + { + "name": "text_embedding_default_modality", "input": { "openai_route": "/v1/embeddings", "openai_input": { @@ -15,12 +32,15 @@ "timeout": 10000 }, { - "name": "multimodal_image_url", + "name": "image_url_embedding", "input": { "openai_route": "/v1/embeddings", "openai_input": { "model": "patrickjohncyh/fashion-clip", - "input": "https://images.unsplash.com/photo-1515372039744-b8f02a3ae446?w=400" + "input": 
"https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg", + "extra_body": { + "modality": "image" + } } }, "expected_output": { @@ -29,15 +49,18 @@ "timeout": 15000 }, { - "name": "multimodal_multiple_images", + "name": "multiple_images", "input": { "openai_route": "/v1/embeddings", "openai_input": { "model": "patrickjohncyh/fashion-clip", "input": [ - "https://images.unsplash.com/photo-1515372039744-b8f02a3ae446?w=400", - "https://images.unsplash.com/photo-1551028719-00167b16eac5?w=400" - ] + "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg", + "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + ], + "extra_body": { + "modality": "image" + } } }, "expected_output": { @@ -46,7 +69,7 @@ "timeout": 20000 }, { - "name": "multimodal_multiple_texts", + "name": "multiple_texts", "input": { "openai_route": "/v1/embeddings", "openai_input": { @@ -55,7 +78,10 @@ "A red dress", "A blue shirt", "Black shoes" - ] + ], + "extra_body": { + "modality": "text" + } } }, "expected_output": { @@ -64,23 +90,21 @@ "timeout": 15000 }, { - "name": "multimodal_mixed_text_and_images", + "name": "audio_not_implemented", "input": { "openai_route": "/v1/embeddings", "openai_input": { "model": "patrickjohncyh/fashion-clip", - "input": [ - "A red summer dress", - "https://images.unsplash.com/photo-1515372039744-b8f02a3ae446?w=400", - "A blue winter coat", - "https://images.unsplash.com/photo-1551028719-00167b16eac5?w=400" - ] + "input": "audio data", + "extra_body": { + "modality": "audio" + } } }, "expected_output": { - "status": "COMPLETED" + "status": "FAILED" }, - "timeout": 25000 + "timeout": 5000 }, { "name": "get_models_list", From d86b8ac822470bd60c13316a829c920907da9416 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 16:09:49 +0300 Subject: [PATCH 10/27] refactor: enhance embedding service with HTTP client and improved modality validation --- src/embedding_service.py | 197 ++++++++++++++++++++++----------------- 1 file changed, 112 insertions(+), 85 deletions(-) diff --git a/src/embedding_service.py b/src/embedding_service.py index 7f17e36..1946a42 100644 --- a/src/embedding_service.py +++ b/src/embedding_service.py @@ -6,7 +6,8 @@ from PIL import Image from config import EmbeddingServiceConfig -from src.multimodal_utils import parse_input_item +from http_client import create_http_client +from multimodal_utils import validate_item_for_modality from utils import ( ModelInfo, OpenAIModelInfo, @@ -38,18 +39,28 @@ def __init__(self): self.engine_array = AsyncEngineArray.from_args(engine_args) self.is_running = False self.sepamore = asyncio.Semaphore(1) + self.http_client = None async def start(self): """starts the engine background loop""" async with self.sepamore: if not self.is_running: await self.engine_array.astart() + if self.http_client is None: + self.http_client = create_http_client() + logger.info("Created persistent HTTP client for image downloads") + self.is_running = True async def stop(self): """stops the engine background loop""" async with self.sepamore: if self.is_running: + if self.http_client is not None: + await self.http_client.aclose() + self.http_client = None + logger.info("Closed HTTP client") + await self.engine_array.astop() self.is_running = False @@ -65,109 +76,125 @@ async def route_openai_get_embeddings( self, embedding_input: str | list[str] | list[str | bytes | Image.Image], model_name: str, + modality: str = "text", return_as_list: bool = False, ): """ - Returns embeddings for the input text and/or images. 
- Supports mixed text and image inputs while preserving order. - """ - if not self.is_running: - await self.start() + Returns embeddings for the input based on specified modality. - available_models = self.list_models() - if model_name not in available_models: - logger.error( - f"Requested model '{model_name}' not found. " - f"Available models: {available_models}" - ) - raise ValueError( - f"Model '{model_name}' is not available. " - f"Available models: {', '.join(available_models)}" - ) + Args: + embedding_input: Input text(s) or image(s) to embed + model_name: Name of the model to use + modality: Type of input - "text", "image", or "audio" (not yet implemented) + return_as_list: Whether to return results as a list - if not isinstance(embedding_input, list): - embedding_input = [embedding_input] + Raises: + ValueError: If model not available, modality invalid, or validation fails + NotImplementedError: If modality is "audio" + """ + try: + if not self.is_running: + await self.start() - parsed_items = await asyncio.gather( - *[parse_input_item(item) for item in embedding_input] - ) + available_models = self.list_models() + if model_name not in available_models: + logger.error( + f"Requested model '{model_name}' not found. " + f"Available models: {available_models}" + ) + raise ValueError( + f"Model '{model_name}' is not available. " + f"Available models: {', '.join(available_models)}" + ) - # Separate text and image items while tracking their original indices - text_items = [] - text_indices = [] - image_items = [] - image_indices = [] - - for idx, (item_type, processed_data) in enumerate(parsed_items): - if item_type == "text": - text_items.append(processed_data) - text_indices.append(idx) - else: # item_type == 'image' - image_items.append(processed_data) - image_indices.append(idx) - - logger.info( - f"Processing embeddings for model '{model_name}': " - f"{len(text_items)} text items, {len(image_items)} image items " - f"(total: {len(embedding_input)} items)" - ) + if not isinstance(embedding_input, list): + embedding_input = [embedding_input] - text_embeddings = [] - text_usage = 0 - image_embeddings = [] - image_usage = 0 + # Validate all items for the specified modality in parallel + try: + validated_items = await asyncio.gather( + *[ + validate_item_for_modality( + item, modality, idx, client=self.http_client + ) + for idx, item in enumerate(embedding_input) + ] + ) + except (ValueError, NotImplementedError) as e: + logger.error(f"Validation failed for modality '{modality}': {e}") + raise - if text_items: - logger.debug(f"Calling .embed() with {len(text_items)} text items") - text_embeddings, text_usage = await self.engine_array[model_name].embed( - text_items + logger.info( + f"Processing {len(validated_items)} {modality} items for model '{model_name}'" ) - logger.debug(f"Successfully got {len(text_embeddings)} text embeddings") - if image_items: - logger.debug(f"Calling .image_embed() with {len(image_items)} image items") - try: - image_embeddings, image_usage = await self.engine_array[ - model_name - ].image_embed(images=image_items) + # Route to appropriate embedding method based on modality + if modality == "text": + logger.debug(f"Calling .embed() with {len(validated_items)} text items") + embeddings, usage = await self.engine_array[model_name].embed( + validated_items + ) + logger.debug(f"Successfully got {len(embeddings)} text embeddings") + + elif modality == "image": logger.debug( - f"Successfully got {len(image_embeddings)} image embeddings" + f"Calling .image_embed() 
with {len(validated_items)} image items" ) - except ModelNotDeployedError as e: - error_msg = ( - f"Model '{model_name}' does not support image embeddings. " - f"Please use a multimodal model (e.g., 'jinaai/jina-clip-v1') " - f"or provide text-only input. Found {len(image_items)} image items in the request." + try: + embeddings, usage = await self.engine_array[model_name].image_embed( + images=validated_items + ) + logger.debug(f"Successfully got {len(embeddings)} image embeddings") + except ModelNotDeployedError as e: + error_msg = ( + f"Model '{model_name}' does not support image embeddings. " + f"Please use a multimodal model (e.g., 'jinaai/jina-clip-v1') " + f"or use modality='text' instead." + ) + logger.error(f"{error_msg} Original error: {e}") + raise ValueError(error_msg) from e + + elif modality == "audio": + raise NotImplementedError( + "Audio modality is not yet implemented. " + "Currently supported modalities: 'text', 'image'" ) - logger.error(f"{error_msg} Original error: {e}") - raise ValueError(error_msg) from e - # Merge embeddings back in original order - total_items = len(embedding_input) - ordered_embeddings: list = [None] * total_items - - for idx, embedding in zip(text_indices, text_embeddings): - ordered_embeddings[idx] = embedding - - for idx, embedding in zip(image_indices, image_embeddings): - ordered_embeddings[idx] = embedding - - total_usage = text_usage + image_usage + else: + raise ValueError( + f"Invalid modality: '{modality}'. " + f"Supported modalities: 'text', 'image', 'audio' (not yet implemented)" + ) - if return_as_list: - return [ - list_embeddings_to_response( - ordered_embeddings, + if return_as_list: + return [ + list_embeddings_to_response( + embeddings, + model=model_name, + usage=usage, # type: ignore[arg-type] + ) + ] + else: + return list_embeddings_to_response( + embeddings, model=model_name, - usage=total_usage, # type: ignore[arg-type] + usage=usage, # type: ignore[arg-type] ) - ] - else: - return list_embeddings_to_response( - ordered_embeddings, - model=model_name, - usage=total_usage, # type: ignore[arg-type] + + except (ValueError, NotImplementedError) as e: + logger.warning( + f"Expected error in route_openai_get_embeddings: {type(e).__name__}: {e}" + ) + raise + except Exception as e: + logger.exception( + f"Unexpected error in route_openai_get_embeddings: " + f"model='{model_name}', modality='{modality}', " + f"input_length={len(embedding_input) if isinstance(embedding_input, list) else 1}" ) + raise RuntimeError( + f"Internal error while processing embeddings: {type(e).__name__}: {str(e)}" + ) from e async def infinity_rerank( self, query: str, docs: str, return_docs: str, model_name: str From 9fe3f61965a08da2eae632cc3bcbfc363023e11f Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 16:10:02 +0300 Subject: [PATCH 11/27] fix: add modality extraction for OpenAI embeddings handling --- src/handler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/handler.py b/src/handler.py index 8f38e78..6b88701 100644 --- a/src/handler.py +++ b/src/handler.py @@ -2,6 +2,7 @@ from utils import create_error_response from typing import Any from embedding_service import EmbeddingService +from multimodal_utils import extract_modality # Gracefully catch configuration errors (e.g. missing env vars) so the user sees # a clean message instead of a full Python traceback when the container starts. 
@@ -25,16 +26,20 @@ async def async_generator_handler(job: dict[str, Any]): if openai_route and openai_route == "/v1/models": call_fn, kwargs = embedding_service.route_openai_models, {} elif openai_route and openai_route == "/v1/embeddings": - model_name = openai_input.get("model") if not openai_input: return create_error_response("Missing input").model_dump() + model_name = openai_input.get("model") if not model_name: return create_error_response( "Did not specify model in openai_input" ).model_dump() + + modality = extract_modality(job_input, openai_input) + call_fn, kwargs = embedding_service.route_openai_get_embeddings, { "embedding_input": openai_input.get("input"), "model_name": model_name, + "modality": modality, "return_as_list": True, } else: @@ -51,9 +56,11 @@ async def async_generator_handler(job: dict[str, Any]): "model_name": job_input.get("model"), } elif job_input.get("input"): + modality = extract_modality(job_input) call_fn, kwargs = embedding_service.route_openai_get_embeddings, { "embedding_input": job_input.get("input"), "model_name": job_input.get("model"), + "modality": modality, } else: return create_error_response(f"Invalid input: {job}").model_dump() From 802f465900cb7cd712f6fe19932aba3639307c02 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 16:10:10 +0300 Subject: [PATCH 12/27] feat: implement http client with configurable settings and user agent --- src/http_client.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 src/http_client.py diff --git a/src/http_client.py b/src/http_client.py new file mode 100644 index 0000000..58af57c --- /dev/null +++ b/src/http_client.py @@ -0,0 +1,34 @@ +import httpx + +DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + +DEFAULT_TIMEOUT = 10.0 +DEFAULT_MAX_CONNECTIONS = 50 +DEFAULT_MAX_KEEPALIVE_CONNECTIONS = 20 + + +def create_http_client() -> httpx.AsyncClient: + """ + Create a configured httpx.AsyncClient. + + Returns: + httpx.AsyncClient with proper timeout, limits, and headers configured. + """ + limits = httpx.Limits( + max_connections=DEFAULT_MAX_CONNECTIONS, + max_keepalive_connections=DEFAULT_MAX_KEEPALIVE_CONNECTIONS, + ) + + timeout = httpx.Timeout(DEFAULT_TIMEOUT) + + headers = { + "User-Agent": DEFAULT_USER_AGENT, + } + + return httpx.AsyncClient( + limits=limits, + timeout=timeout, + headers=headers, + follow_redirects=True, + trust_env=True, + ) From c96b6575d0510753fac23364acd8c6424465c904 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 16:10:22 +0300 Subject: [PATCH 13/27] feat: enhance image handling with RGB conversion, URL downloading, and base64 support --- src/multimodal_utils.py | 209 +++++++++++++++++++++++++++++++++++----- 1 file changed, 186 insertions(+), 23 deletions(-) diff --git a/src/multimodal_utils.py b/src/multimodal_utils.py index fe55477..e29a80b 100644 --- a/src/multimodal_utils.py +++ b/src/multimodal_utils.py @@ -1,19 +1,77 @@ import base64 import io +import logging import re +from typing import Any from PIL import Image +logger = logging.getLogger(__name__) + + +def _ensure_rgb(img: Image.Image) -> Image.Image: + """ + Ensure image is in RGB format (required for CLIP models). 
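+    Palette, RGBA, and grayscale inputs are converted; images already in
+    RGB pass through unchanged.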
+ + Args: + img: PIL Image in any mode + + Returns: + PIL Image in RGB mode + """ + if img.mode != 'RGB': + logger.debug(f"Converting image from {img.mode} to RGB") + return img.convert('RGB') + return img + def _is_url(text: str) -> bool: """Check if string is a URL""" return text.startswith('http://') or text.startswith('https://') -def _is_base64_image(data: str) -> tuple[bool, Image.Image | None]: +async def _download_image_from_url(url: str, client) -> Image.Image: + """ + Download image from URL using httpx with proper User-Agent and timeout. + + Args: + url: HTTP(S) URL to download image from + client: httpx.AsyncClient instance with configured timeout and limits + + Returns: + PIL.Image in RGB format + + Raises: + ValueError: If download fails or content is not a valid image + """ + try: + logger.debug(f"Downloading image from URL: {url}") + response = await client.get(url) + response.raise_for_status() # Raises exception for 4xx/5xx status codes + + img_bytes = response.content + logger.debug(f"Downloaded {len(img_bytes)} bytes from {url} (status: {response.status_code})") + except Exception as e: + raise ValueError(f"Failed to download image from URL: {type(e).__name__}: {e}") from e + + try: + img = Image.open(io.BytesIO(img_bytes)) + img.load() # Force load to validate it's a real image + logger.debug(f"Successfully loaded image from URL: {img.size} {img.mode}") + + return _ensure_rgb(img) + except Exception as e: + raise ValueError(f"Failed to decode image from URL: {type(e).__name__}: {e}") from e + + +def _is_base64_image(data: str) -> Image.Image | None: """ - Check if string is a base64-encoded image and decode it. + Try to decode string as base64-encoded image. Supports both data URI format (data:image/...) and raw base64. - Returns (is_image, PIL_Image or None) + + Returns: + PIL.Image in RGB format, or None if not a valid base64 image + + Note: Converts all images to RGB format for compatibility with multimodal models. """ try: # Handle data URI format: data:image/png;base64,iVBORw0KG... @@ -21,43 +79,148 @@ def _is_base64_image(data: str) -> tuple[bool, Image.Image | None]: match = re.match(r'data:image/[^;]+;base64,(.+)', data) if match: base64_data = match.group(1) + logger.debug(f"Matched data URI, extracted base64 data (length: {len(base64_data)})") else: - return False, None + logger.debug("data: URI does not match expected format") + return None else: # Try raw base64 base64_data = data + logger.debug("Treating as raw base64") img_bytes = base64.b64decode(base64_data) + logger.debug(f"Decoded base64 to {len(img_bytes)} bytes") img = Image.open(io.BytesIO(img_bytes)) img.load() # Force load to validate it's a real image - return True, img - except Exception: + logger.debug(f"Successfully loaded image: {img.size} {img.mode}") - return False, None + return _ensure_rgb(img) + except Exception as e: + logger.warning(f"Failed to decode base64 image: {type(e).__name__}: {e}") + return None + + +def validate_text_item(item: Any) -> str: + """ + Validate and convert item to text string. + Raises ValueError if item cannot be converted to text. 
+ """ + if isinstance(item, str): + return item + + # Try to convert to string + try: + return str(item) + except Exception as e: + raise ValueError(f"Cannot convert item to text: {type(item).__name__}") from e -async def parse_input_item(item: str | bytes | Image.Image) -> tuple[str, str | Image.Image | bytes]: +async def validate_image_item(item: Any, client=None) -> Image.Image: """ - Parse a single input item and determine if it's text or image. - Returns: (type, processed_data) - where type is 'text' or 'image' - and processed_data is either the original text or PIL.Image/URL/bytes + Validate and process image item. + Accepts: PIL.Image, bytes, URL string, or base64 string. + Returns PIL.Image in RGB format. + Raises ValueError if item is not a valid image format. - Note: infinity_emb handles URL downloading internally, so we just pass URLs through. + Args: + item: The image item to validate (PIL.Image, bytes, URL string, or base64 string) + client: Optional httpx.AsyncClient for downloading URL images with timeout and User-Agent + + Note: All images are converted to RGB format for CLIP compatibility. """ - # Check if it's an image - if isinstance(item, (Image.Image, bytes)): - return 'image', item + # PIL Image - convert to RGB if needed + if isinstance(item, Image.Image): + return _ensure_rgb(item) + + # Bytes - decode to PIL Image + if isinstance(item, bytes): + try: + img = Image.open(io.BytesIO(item)) + img.load() + logger.debug(f"Loaded image from bytes: {img.size} {img.mode}") + return _ensure_rgb(img) + except Exception as e: + raise ValueError(f"Failed to decode image from bytes: {type(e).__name__}: {e}") from e if isinstance(item, str): - # URL images + # URL images - download with proper User-Agent and timeout if _is_url(item): - return 'image', item + if client is None: + raise ValueError("HTTP client required for downloading images from URLs") + return await _download_image_from_url(item, client) + + # Base64 images - decode and validate (already converted to RGB) + img = _is_base64_image(item) + if img is not None: + return img - # Base64 images - is_base64_img, img = _is_base64_image(item) - if is_base64_img and img is not None: - return 'image', img + # Not a valid image format + raise ValueError( + "String is not a valid image format (must be URL starting with http:// or https://, " + "or base64-encoded image with 'data:image/...' prefix)" + ) + + raise ValueError( + f"Invalid image type: {type(item).__name__}. " + f"Expected PIL.Image, bytes, URL string, or base64 string." + ) + + +async def validate_item_for_modality(item: Any, modality: str, index: int, client=None) -> Any: + """ + Validate a single item for the specified modality. + + Args: + item: The input item to validate + modality: One of "text", "image", "audio" + index: The index of the item in the batch (for error messages) + client: Optional httpx.AsyncClient for downloading URL images with timeout + + Returns: + Validated and processed item suitable for infinity_emb + + Raises: + ValueError: If item is not valid for the specified modality + NotImplementedError: If modality is "audio" + """ + try: + if modality == "text": + return validate_text_item(item) + elif modality == "image": + return await validate_image_item(item, client=client) + elif modality == "audio": + raise NotImplementedError( + "Audio modality is not yet implemented. " + "Currently supported modalities: 'text', 'image'" + ) + else: + raise ValueError( + f"Invalid modality: '{modality}'. 
" + f"Supported modalities: 'text', 'image', 'audio' (not yet implemented)" + ) + except (ValueError, NotImplementedError) as e: + # Re-raise with index information for better error messages + raise type(e)(f"Item at index {index}: {str(e)}") from e + + +def extract_modality(job_input: dict[str, Any], openai_input: dict[str, Any] | None = None) -> str: + """ + Extract modality parameter from job input. + Checks extra_body (OpenAI format) first, then direct input field. + Defaults to "text" for backward compatibility. - return 'text', item if isinstance(item, str) else str(item) + Args: + job_input: The main job input dictionary + openai_input: Optional OpenAI-format input dictionary + + Returns: + Modality string: "text", "image", or "audio" + """ + if openai_input: + # OpenAI format: check extra_body + extra_body = openai_input.get("extra_body", {}) + return extra_body.get("modality", "text") + else: + # Direct format: check top-level modality field + return job_input.get("modality", "text") \ No newline at end of file From 9db8204d216d60e462944b436c184db0ede0c115 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 23:13:37 +0300 Subject: [PATCH 14/27] remove extract_modality function and simplify modality extraction in handler --- src/handler.py | 5 ++--- src/multimodal_utils.py | 24 +----------------------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/src/handler.py b/src/handler.py index 6b88701..0d43789 100644 --- a/src/handler.py +++ b/src/handler.py @@ -2,7 +2,6 @@ from utils import create_error_response from typing import Any from embedding_service import EmbeddingService -from multimodal_utils import extract_modality # Gracefully catch configuration errors (e.g. missing env vars) so the user sees # a clean message instead of a full Python traceback when the container starts. @@ -34,7 +33,7 @@ async def async_generator_handler(job: dict[str, Any]): "Did not specify model in openai_input" ).model_dump() - modality = extract_modality(job_input, openai_input) + modality = openai_input.get("modality", "text") call_fn, kwargs = embedding_service.route_openai_get_embeddings, { "embedding_input": openai_input.get("input"), @@ -56,7 +55,7 @@ async def async_generator_handler(job: dict[str, Any]): "model_name": job_input.get("model"), } elif job_input.get("input"): - modality = extract_modality(job_input) + modality = job_input.get("modality", "text") call_fn, kwargs = embedding_service.route_openai_get_embeddings, { "embedding_input": job_input.get("input"), "model_name": job_input.get("model"), diff --git a/src/multimodal_utils.py b/src/multimodal_utils.py index e29a80b..53a8b28 100644 --- a/src/multimodal_utils.py +++ b/src/multimodal_utils.py @@ -46,7 +46,7 @@ async def _download_image_from_url(url: str, client) -> Image.Image: try: logger.debug(f"Downloading image from URL: {url}") response = await client.get(url) - response.raise_for_status() # Raises exception for 4xx/5xx status codes + response.raise_for_status() img_bytes = response.content logger.debug(f"Downloaded {len(img_bytes)} bytes from {url} (status: {response.status_code})") @@ -202,25 +202,3 @@ async def validate_item_for_modality(item: Any, modality: str, index: int, clien except (ValueError, NotImplementedError) as e: # Re-raise with index information for better error messages raise type(e)(f"Item at index {index}: {str(e)}") from e - - -def extract_modality(job_input: dict[str, Any], openai_input: dict[str, Any] | None = None) -> str: - """ - Extract modality parameter from job input. 
- Checks extra_body (OpenAI format) first, then direct input field. - Defaults to "text" for backward compatibility. - - Args: - job_input: The main job input dictionary - openai_input: Optional OpenAI-format input dictionary - - Returns: - Modality string: "text", "image", or "audio" - """ - if openai_input: - # OpenAI format: check extra_body - extra_body = openai_input.get("extra_body", {}) - return extra_body.get("modality", "text") - else: - # Direct format: check top-level modality field - return job_input.get("modality", "text") \ No newline at end of file From 3ba1f1564ba648585cf75a5327e5e21c9e2756d3 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 17 Nov 2025 23:20:17 +0300 Subject: [PATCH 15/27] fix: update CMD syntax in Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 0155c94..f99387f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,4 +29,4 @@ ADD src . COPY test_input.json /test_input.json # start the handler -CMD python -u /handler.py +CMD ["python", "-u", "/handler.py"] From f61a9cc4769230396f0dc94c57609fcec713bb1e Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 29 Nov 2025 16:17:18 +0300 Subject: [PATCH 16/27] docs: update README to clarify multimodal support and explicit modality selection --- README.md | 88 ++++++++++++++++++++----------------------------------- 1 file changed, 32 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index c799f69..d0bedc6 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ High-throughput, OpenAI-compatible **text & image embedding** & reranker powered by [Infinity](https://github.com/michaelfeil/infinity) -**✨ New: Multimodal Support!** Now supports text, images (URLs & base64), and mixed inputs. +**✨ New: Multimodal Support!** Now supports text and image embeddings (URLs & base64) with an explicit `modality` switch per request. --- @@ -23,8 +23,7 @@ High-throughput, OpenAI-compatible **text & image embedding** & reranker powered 1. [List Models](#list-models-1) 2. [Text Embeddings](#text-embeddings) 3. [Image Embeddings](#image-embeddings) - 4. [Mixed Text & Image Inputs](#mixed-text--image-inputs) - 5. [Reranking](#reranking) + 4. [Reranking](#reranking) 6. [Testing](#testing) 7. [Further Documentation](#further-documentation) 8. [Acknowledgements](#acknowledgements) @@ -42,19 +41,28 @@ High-throughput, OpenAI-compatible **text & image embedding** & reranker powered ## Multimodal Features -### Supported Input Types +### Supported Modalities - ✅ **Text** – traditional text embeddings - ✅ **Image URLs** – `http://` or `https://` links to images (`.jpg`, `.png`, `.gif`, etc.) 
- ✅ **Base64 Images** – data URI format (`data:image/png;base64,...`) -- ✅ **Mixed Inputs** – combine text and images in a single request (order preserved) -### Automatic Type Detection +Each request targets a single modality: -The worker automatically detects whether your input is text or an image: -- URLs ending with image extensions → processed as images -- Data URI with `data:image/...;base64` → decoded and processed as images -- Everything else → processed as text +| Modality | How to request | Notes | +| -------- | ------------------------------------------------ | ------------------------------------------------- | +| `text` | Default; or set `modality="text"` | Works with any deployed embedding model | +| `image` | Set `modality="image"` | Requires a multimodal model (see below) | +| `audio` | Planned | Returns a clear `NotImplementedError` for now | + +> **Tip:** For OpenAI-compatible requests, include `"modality": "…"` alongside `model` and `input`. For native `/runsync` requests, pass `modality` inside the `input` object. If omitted, the worker assumes `text`. + +### Explicit Modality Selection + +- The previous automatic detector has been replaced with an explicit `modality` flag so you stay in control of routing. +- All inputs are validated eagerly for the chosen modality with detailed, index-aware error messages. +- Image downloads run through a shared `httpx.AsyncClient` with tuned keep-alive limits, timeouts, and a desktop browser User-Agent—improving compatibility with CDNs that block generic clients. +- If you configure `extra_body` via an OpenAI SDK, the `modality` field still arrives at the server top-level—no additional parsing is required. ### Multimodal Models @@ -116,17 +124,18 @@ Except for transport (path + wrapper object) the JSON you send/receive is identi #### Request Fields (shared) -| Field | Type | Required | Description | -| ------- | ------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------- | -| `model` | string | **Yes** | One of the IDs supplied via `MODEL_NAMES`. | -| `input` | string | array | **Yes** | Text string(s), image URL(s), base64 image(s), or mixed list. Automatically detects type. Order preserved for mixed inputs.| +| Field | Type | Required | Description | +| ---------- | ------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------- | +| `model` | string | **Yes** | One of the IDs supplied via `MODEL_NAMES`. | +| `input` | string | array | **Yes** | Text string(s) or image URL/base64 list matching the selected modality. Order is preserved. | +| `modality` | string | No | Required for images. Accepts `text` (default) or `image`. For OpenAI requests supply via `extra_body.modality`. | OpenAI route vs. 
Standard: -| Flavour | Method | Path | Body | -| -------- | ------ | ---------------- | --------------------------------------------- | -| OpenAI | `POST` | `/v1/embeddings` | `{ "model": "…", "input": "…" }` | -| Standard | `POST` | `/runsync` | `{ "input": { "model": "…", "input": "…" } }` | +| Flavour | Method | Path | Body | +| -------- | ------ | ---------------- | ---------------------------------------------------------------------- | +| OpenAI | `POST` | `/v1/embeddings` | `{ "model": "…", "input": "…", "modality": "text" }` (modality optional for text) | +| Standard | `POST` | `/runsync` | `{ "input": { "model": "…", "input": "…", "modality": "text" } }` | #### Response (both flavours) @@ -203,13 +212,13 @@ curl -X POST \ curl -X POST \ -H "Authorization: Bearer " \ -H "Content-Type: application/json" \ - -d '{"model":"BAAI/bge-small-en-v1.5","input":"Hello world"}' \ + -d '{"model":"BAAI/bge-small-en-v1.5","input":"Hello world","modality":"text"}' \ https://api.runpod.ai/v2//openai/v1/embeddings # Standard RunPod format curl -X POST \ -H "Content-Type: application/json" \ - -d '{"input":{"model":"BAAI/bge-small-en-v1.5","input":"Hello world"}}' \ + -d '{"input":{"model":"BAAI/bge-small-en-v1.5","input":"Hello world","modality":"text"}}' \ https://api.runpod.ai/v2//runsync ``` @@ -220,50 +229,17 @@ curl -X POST \ curl -X POST \ -H "Authorization: Bearer " \ -H "Content-Type: application/json" \ - -d '{"model":"patrickjohncyh/fashion-clip","input":"https://example.com/image.jpg"}' \ + -d '{"model":"patrickjohncyh/fashion-clip","input":"https://example.com/image.jpg","modality":"image"}' \ https://api.runpod.ai/v2//openai/v1/embeddings # Standard RunPod format (base64 image) curl -X POST \ -H "Content-Type: application/json" \ - -d '{"input":{"model":"patrickjohncyh/fashion-clip","input":"data:image/png;base64,iVBORw0KG..."}}' \ - https://api.runpod.ai/v2//runsync -``` - -### Mixed Text & Image Inputs - -```bash -# OpenAI-compatible format -curl -X POST \ - -H "Authorization: Bearer " \ - -H "Content-Type: application/json" \ - -d '{ - "model": "patrickjohncyh/fashion-clip", - "input": [ - "Hello, world!", - "https://example.com/image.jpg", - "Bye, world!" - ] - }' \ - https://api.runpod.ai/v2//openai/v1/embeddings - -# Standard RunPod format -curl -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "input": { - "model": "patrickjohncyh/fashion-clip", - "input": [ - "Hello, world!", - "https://example.com/image.jpg", - "Bye, world!" - ] - } - }' \ + -d '{"input":{"model":"patrickjohncyh/fashion-clip","input":"data:image/png;base64,iVBORw0KG...","modality":"image"}}' \ https://api.runpod.ai/v2//runsync ``` -**Note:** Output embeddings will be in the same order as inputs! +> **Note:** Send one request per modality. If you need both text and image embeddings, issue two calls so each payload is validated consistently. 
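
If you call the endpoint through the OpenAI Python SDK instead of `curl`, a minimal sketch looks like the following; the endpoint ID, API key, and image URL are placeholders for your own deployment, and (as noted above) `extra_body` fields are merged into the top-level request JSON, so the worker receives `modality` alongside `model` and `input`.

```python
# Minimal sketch using the OpenAI Python SDK; <endpoint_id>, <api_key>,
# and the image URL are placeholders for your own deployment.
from openai import OpenAI

client = OpenAI(
    base_url="https://api.runpod.ai/v2/<endpoint_id>/openai/v1",
    api_key="<api_key>",
)

# extra_body is merged into the top-level request JSON, so the worker
# receives {"model": ..., "input": [...], "modality": "image"}.
response = client.embeddings.create(
    model="patrickjohncyh/fashion-clip",
    input=["https://example.com/image.jpg"],
    extra_body={"modality": "image"},
)

print(len(response.data[0].embedding))
```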
### Reranking From d8c8292530ac561d889878f80e37a46e4ad83d82 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 29 Nov 2025 22:12:35 +0300 Subject: [PATCH 17/27] fix: remove test_modality.py from .gitignore --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2ab3544..b69f0a6 100644 --- a/.gitignore +++ b/.gitignore @@ -160,5 +160,4 @@ cython_debug/ #.idea/ -data -test_modality.py \ No newline at end of file +data \ No newline at end of file From 96a9afb71618797c7718f04f9c516be98ae654f7 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 29 Nov 2025 22:13:20 +0300 Subject: [PATCH 18/27] docs: update README to reflect explicit modality selection --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index d0bedc6..d9a3f13 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,6 @@ Each request targets a single modality: ### Explicit Modality Selection -- The previous automatic detector has been replaced with an explicit `modality` flag so you stay in control of routing. - All inputs are validated eagerly for the chosen modality with detailed, index-aware error messages. - Image downloads run through a shared `httpx.AsyncClient` with tuned keep-alive limits, timeouts, and a desktop browser User-Agent—improving compatibility with CDNs that block generic clients. - If you configure `extra_body` via an OpenAI SDK, the `modality` field still arrives at the server top-level—no additional parsing is required. From 2e965d77f7626c848e94a24f490e237848e0267b Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 1 Dec 2025 14:23:37 +0300 Subject: [PATCH 19/27] test: add integration tests for explicit modality API using pytest --- tests/test_modality.py | 442 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 442 insertions(+) create mode 100644 tests/test_modality.py diff --git a/tests/test_modality.py b/tests/test_modality.py new file mode 100644 index 0000000..a776386 --- /dev/null +++ b/tests/test_modality.py @@ -0,0 +1,442 @@ +"""Integration tests for the explicit modality API using pytest.""" + +import base64 +import io +from collections.abc import Iterable, Mapping +from typing import Any + +import pytest +import requests +from PIL import Image + +BASE_URL = "http://localhost:8000" +RUNSYNC_URL = f"{BASE_URL}/runsync" +DEFAULT_TIMEOUT_SECONDS = 30 + + +def _post_runsync( + payload: Mapping[str, Any], timeout: int | float = DEFAULT_TIMEOUT_SECONDS +) -> requests.Response: + """Send a request to the worker and print useful diagnostics.""" + response = requests.post(RUNSYNC_URL, json=payload, timeout=timeout) + print(f"POST {RUNSYNC_URL} -> {response.status_code}") + return response + + +def _extract_output(result: Mapping[str, Any]) -> Mapping[str, Any] | None: + """Normalise RunPod /runsync output into a single mapping.""" + output = result.get("output") + if isinstance(output, list) and output: + return output[0] + if isinstance(output, Mapping): + return output + return None + + +def _extract_error_message(result: Mapping[str, Any]) -> str | None: + output = _extract_output(result) + if output and output.get("object") == "error": + message = output.get("message") + return str(message) if message is not None else None + return None + + +def generate_red_square_base64() -> str: + """Generate a 5x5 red square PNG encoded as a data URI.""" + img = Image.new("RGB", (5, 5), color=(255, 0, 0)) + buffer = io.BytesIO() + img.save(buffer, format="PNG") + buffer.seek(0) + img_base64 = 
base64.b64encode(buffer.read()).decode("utf-8") + return f"data:image/png;base64,{img_base64}" + + +@pytest.mark.parametrize( + "name,payload,expected_count", + [ + ( + "text modality", + { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": ["Hello world", "How are you?"], + "modality": "text", + }, + } + }, + 2, + ), + ( + "image modality (url)", + { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": [ + "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + ], + "modality": "image", + }, + } + }, + 1, + ), + ( + "image modality (base64)", + { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": [generate_red_square_base64()], + "modality": "image", + }, + } + }, + 1, + ), + ( + "default text modality", + { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": ["Default modality test"], + }, + } + }, + 1, + ), + ], +) +def test_modality_success( + name: str, payload: Mapping[str, Any], expected_count: int +) -> None: + response = _post_runsync(payload) + assert response.status_code == 200, f"HTTP error for {name}: {response.text}" + + result = response.json() + assert result.get("status") == "COMPLETED", ( + f"Unexpected status for {name}: {result}" + ) + + output = _extract_output(result) or {} + data = output.get("data", []) + assert isinstance(data, Iterable), f"Missing data for {name}: {output}" + + if isinstance(data, list): + assert len(data) == expected_count, ( + f"Expected {expected_count} embeddings for {name}, got {len(data)}" + ) + else: + pytest.fail(f"Unexpected data format for {name}: {type(data)}") + + +def test_wrong_modality_error() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": [ + "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + ], + "modality": "image", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + error_msg = _extract_error_message(result) + assert error_msg is not None, f"Expected error object, got: {result}" + + +def test_audio_not_implemented() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": ["audio data"], + "modality": "audio", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + error_msg = _extract_error_message(result) + assert error_msg is not None, f"Expected NotImplementedError output, got: {result}" + assert "not yet implemented" in error_msg.lower() + + +def test_validation_flexibility() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": ["https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg"], + "modality": "text", + }, + } + } + + response = _post_runsync(payload) + assert response.status_code == 200, f"HTTP error: {response.text}" + + result = response.json() + assert result.get("status") == "COMPLETED", ( + f"Expected success treating URL as text: {result}" + ) + + +@pytest.mark.parametrize( + "payload,expected_count", + [ + ( + { + "input": { 
+ "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": [], + "modality": "text", + }, + } + }, + 0, + ), + ( + { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": "Single text string", + "modality": "text", + }, + } + }, + 1, + ), + ], +) +def test_text_edge_cases(payload: Mapping[str, Any], expected_count: int) -> None: + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + if result.get("status") != "COMPLETED": + assert _extract_error_message(result) is not None + return + + output = _extract_output(result) or {} + data = output.get("data", []) + assert isinstance(data, list) + assert len(data) == expected_count + + +def test_edge_very_long_text() -> None: + long_text = "This is a test sentence. " * 200 + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": long_text, + "modality": "text", + }, + } + } + + response = _post_runsync(payload) + assert response.status_code == 200, f"HTTP error: {response.text}" + + result = response.json() + assert result.get("status") == "COMPLETED", f"Unexpected status: {result}" + + +def test_edge_empty_string() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": "", + "modality": "text", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + if result.get("status") == "COMPLETED": + output = _extract_output(result) or {} + data = output.get("data", []) + assert isinstance(data, list) + else: + assert _extract_error_message(result) is not None + + +def test_edge_invalid_modality() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": "test", + "modality": "video", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + error_msg = _extract_error_message(result) + assert error_msg is not None, f"Expected invalid modality error: {result}" + assert "invalid modality" in error_msg.lower() + + +def test_edge_missing_model() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "input": "test", + "modality": "text", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + error_msg = _extract_error_message(result) + assert error_msg is not None, f"Expected missing model error: {result}" + + +def test_edge_nonexistent_model() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "nonexistent/model-12345", + "input": "test", + "modality": "text", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + error_msg = _extract_error_message(result) + assert error_msg is not None, f"Expected model missing error: {result}" + assert "not available" in error_msg.lower() or "not found" in error_msg.lower() + + +def test_edge_invalid_image_url() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + 
"openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": "https://example.com/nonexistent-image-12345.jpg", + "modality": "image", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + return + + result = response.json() + assert result.get("status") in {"COMPLETED", "FAILED"}, ( + f"Unexpected status: {result}" + ) + + +def test_edge_special_characters() -> None: + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "BAAI/bge-small-en-v1.5", + "input": "Hello 世界! 🌍 Special chars: @#$%^&*()_+-=[]{}|;:',.<>?/~`", + "modality": "text", + }, + } + } + + response = _post_runsync(payload) + assert response.status_code == 200, f"HTTP error: {response.text}" + + result = response.json() + assert result.get("status") == "COMPLETED", f"Unexpected status: {result}" + + +def test_edge_corrupted_base64() -> None: + invalid_payloads = [ + "data:image/png;base64,NotValidBase64Data!!!", + "data:image/png;base64,iVBORw0KGgoAAAA", + "data:image/jpeg;base64,/9j/4AAQSkZJRg", + "data:image/png;base64,SGVsbG8gV29ybGQh", + ] + + for idx, invalid_base64 in enumerate(invalid_payloads, start=1): + print( + f"\n Test variant {idx}/{len(invalid_payloads)}: {invalid_base64[:50]}..." + ) + payload = { + "input": { + "openai_route": "/v1/embeddings", + "openai_input": { + "model": "patrickjohncyh/fashion-clip", + "input": invalid_base64, + "modality": "image", + }, + } + } + + response = _post_runsync(payload) + if response.status_code != 200: + assert response.status_code >= 400 + continue + + result = response.json() + error_msg = _extract_error_message(result) + assert error_msg is not None, f"Expected error for corrupted base64: {result}" + + print("\n✓ All corrupted base64 variants handled without crash") From abde653a3e8d4fb5457c76e514e82f640240ffb6 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 1 Dec 2025 14:26:19 +0300 Subject: [PATCH 20/27] docs: reorder sections in README for improved clarity --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d9a3f13..d2839ae 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,8 @@ High-throughput, OpenAI-compatible **text & image embedding** & reranker powered 2. [Text Embeddings](#text-embeddings) 3. [Image Embeddings](#image-embeddings) 4. [Reranking](#reranking) -6. [Testing](#testing) -7. [Further Documentation](#further-documentation) -8. [Acknowledgements](#acknowledgements) +6. [Further Documentation](#further-documentation) +7. [Acknowledgements](#acknowledgements) --- From a84f8cff083d3b763848f15336c5c8a76a34df3e Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 1 Dec 2025 14:32:39 +0300 Subject: [PATCH 21/27] docs: update section on modality selection to validation and image fetching defaults --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d2839ae..f5b3bfb 100644 --- a/README.md +++ b/README.md @@ -56,11 +56,10 @@ Each request targets a single modality: > **Tip:** For OpenAI-compatible requests, include `"modality": "…"` alongside `model` and `input`. For native `/runsync` requests, pass `modality` inside the `input` object. If omitted, the worker assumes `text`. -### Explicit Modality Selection +### Validation & Image Fetching Defaults - All inputs are validated eagerly for the chosen modality with detailed, index-aware error messages. 
-- Image downloads run through a shared `httpx.AsyncClient` with tuned keep-alive limits, timeouts, and a desktop browser User-Agent—improving compatibility with CDNs that block generic clients. -- If you configure `extra_body` via an OpenAI SDK, the `modality` field still arrives at the server top-level—no additional parsing is required. +- Image downloads run through a shared `httpx.AsyncClient` with tuned keep-alive limits, timeouts, and a desktop browser User-Agent—improving compatibility with CDNs that block generic clients. All of these knobs can be overridden using the `HTTP_CLIENT_*` environment variables listed below. ### Multimodal Models @@ -85,6 +84,10 @@ All behaviour is controlled through environment variables: | `DTYPES` | No | `auto` | Precision per model (`auto`, `fp16`, `fp8`). Semicolon-separated, must match `MODEL_NAMES`.
Example: `auto;auto` | | `INFINITY_QUEUE_SIZE` | No | `48000` | Max items queueable inside the Infinity engine. | | `RUNPOD_MAX_CONCURRENCY` | No | `300` | Max concurrent requests the RunPod wrapper will accept. | +| `HTTP_CLIENT_USER_AGENT` | No | Desktop Chrome UA | Override the browser-style User-Agent used for outbound image downloads. | +| `HTTP_CLIENT_TIMEOUT` | No | `10.0` | Request timeout (seconds) for outbound image fetches. | +| `HTTP_CLIENT_MAX_CONNECTIONS` | No | `50` | Concurrent connection pool size for the shared `httpx` client. | +| `HTTP_CLIENT_MAX_KEEPALIVE_CONNECTIONS` | No | `20` | Max keep-alive sockets retained by the shared `httpx` client. | --- From 6101244a4e7c75ca16e6fd772f1d9dcd9542a198 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 1 Dec 2025 14:32:50 +0300 Subject: [PATCH 22/27] fix: streamline modality specification in test cases by removing extra_body --- .runpod/tests.json | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/.runpod/tests.json b/.runpod/tests.json index 40e0da3..114e67f 100644 --- a/.runpod/tests.json +++ b/.runpod/tests.json @@ -7,9 +7,7 @@ "openai_input": { "model": "patrickjohncyh/fashion-clip", "input": "A beautiful red dress", - "extra_body": { - "modality": "text" - } + "modality": "text" } }, "expected_output": { @@ -38,9 +36,7 @@ "openai_input": { "model": "patrickjohncyh/fashion-clip", "input": "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg", - "extra_body": { - "modality": "image" - } + "modality": "image" } }, "expected_output": { @@ -58,9 +54,7 @@ "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg", "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" ], - "extra_body": { - "modality": "image" - } + "modality": "image" } }, "expected_output": { @@ -79,9 +73,7 @@ "A blue shirt", "Black shoes" ], - "extra_body": { - "modality": "text" - } + "modality": "text" } }, "expected_output": { @@ -96,9 +88,7 @@ "openai_input": { "model": "patrickjohncyh/fashion-clip", "input": "audio data", - "extra_body": { - "modality": "audio" - } + "modality": "audio" } }, "expected_output": { From bcfcc1810c2936c7c9c23a06f52e77ac53670c0a Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 1 Dec 2025 14:33:10 +0300 Subject: [PATCH 23/27] refactor: centralize HTTP client configuration using HttpClientConfig --- src/config.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++ src/http_client.py | 20 +++++++------- 2 files changed, 76 insertions(+), 11 deletions(-) diff --git a/src/config.py b/src/config.py index 6266ed5..bfa28cd 100644 --- a/src/config.py +++ b/src/config.py @@ -1,10 +1,20 @@ import os from dotenv import load_dotenv from functools import cached_property +from typing import Optional DEFAULT_BATCH_SIZE = 32 DEFAULT_BACKEND = "torch" +DEFAULT_HTTP_CLIENT_USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" +) +DEFAULT_HTTP_CLIENT_TIMEOUT_SECONDS = 10.0 +DEFAULT_HTTP_CLIENT_MAX_CONNECTIONS = 50 +DEFAULT_HTTP_CLIENT_MAX_KEEPALIVE_CONNECTIONS = 20 + if not os.environ.get("INFINITY_QUEUE_SIZE"): # how many items can be in the queue os.environ["INFINITY_QUEUE_SIZE"] = "48000" @@ -56,3 +66,60 @@ def dtypes(self) -> list[str]: @cached_property def runpod_max_concurrency(self) -> int: return int(os.environ.get("RUNPOD_MAX_CONCURRENCY", 300)) + + +class HttpClientConfig: + ENV_USER_AGENT = "HTTP_CLIENT_USER_AGENT" + ENV_TIMEOUT = "HTTP_CLIENT_TIMEOUT" + ENV_MAX_CONNECTIONS = 
"HTTP_CLIENT_MAX_CONNECTIONS" + ENV_MAX_KEEPALIVE = "HTTP_CLIENT_MAX_KEEPALIVE_CONNECTIONS" + + def __init__(self): + load_dotenv() + + def _get_env_float(self, key: str, default: float) -> float: + value = os.environ.get(key) + if value is None: + return default + try: + return float(value) + except ValueError as exc: + raise ValueError(f"Environment variable {key} must be a float, got {value!r}") from exc + + def _get_env_int(self, key: str, default: int) -> int: + value = os.environ.get(key) + if value is None: + return default + try: + return int(value) + except ValueError as exc: + raise ValueError(f"Environment variable {key} must be an integer, got {value!r}") from exc + + def _get_env_str(self, key: str, default: str) -> str: + value: Optional[str] = os.environ.get(key) + if value is None: + return default + value = value.strip() + return value or default + + @cached_property + def user_agent(self) -> str: + return self._get_env_str(self.ENV_USER_AGENT, DEFAULT_HTTP_CLIENT_USER_AGENT) + + @cached_property + def timeout_seconds(self) -> float: + return self._get_env_float(self.ENV_TIMEOUT, DEFAULT_HTTP_CLIENT_TIMEOUT_SECONDS) + + @cached_property + def max_connections(self) -> int: + return self._get_env_int( + self.ENV_MAX_CONNECTIONS, + DEFAULT_HTTP_CLIENT_MAX_CONNECTIONS, + ) + + @cached_property + def max_keepalive_connections(self) -> int: + return self._get_env_int( + self.ENV_MAX_KEEPALIVE, + DEFAULT_HTTP_CLIENT_MAX_KEEPALIVE_CONNECTIONS, + ) diff --git a/src/http_client.py b/src/http_client.py index 58af57c..dc01a8b 100644 --- a/src/http_client.py +++ b/src/http_client.py @@ -1,10 +1,8 @@ import httpx -DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" +from config import HttpClientConfig -DEFAULT_TIMEOUT = 10.0 -DEFAULT_MAX_CONNECTIONS = 50 -DEFAULT_MAX_KEEPALIVE_CONNECTIONS = 20 +_CLIENT_CONFIG = HttpClientConfig() def create_http_client() -> httpx.AsyncClient: @@ -15,16 +13,16 @@ def create_http_client() -> httpx.AsyncClient: httpx.AsyncClient with proper timeout, limits, and headers configured. 
""" limits = httpx.Limits( - max_connections=DEFAULT_MAX_CONNECTIONS, - max_keepalive_connections=DEFAULT_MAX_KEEPALIVE_CONNECTIONS, + max_connections=_CLIENT_CONFIG.max_connections, + max_keepalive_connections=_CLIENT_CONFIG.max_keepalive_connections, ) - - timeout = httpx.Timeout(DEFAULT_TIMEOUT) - + + timeout = httpx.Timeout(_CLIENT_CONFIG.timeout_seconds) + headers = { - "User-Agent": DEFAULT_USER_AGENT, + "User-Agent": _CLIENT_CONFIG.user_agent, } - + return httpx.AsyncClient( limits=limits, timeout=timeout, From 108ee4d7353336cd8f5038628c64f3847429683a Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 1 Dec 2025 15:07:17 +0300 Subject: [PATCH 24/27] fix: enhance error handling for modality in embedding routes --- src/embedding_service.py | 66 ++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/src/embedding_service.py b/src/embedding_service.py index 1946a42..d812d4f 100644 --- a/src/embedding_service.py +++ b/src/embedding_service.py @@ -129,42 +129,54 @@ async def route_openai_get_embeddings( ) # Route to appropriate embedding method based on modality - if modality == "text": - logger.debug(f"Calling .embed() with {len(validated_items)} text items") - embeddings, usage = await self.engine_array[model_name].embed( - validated_items - ) - logger.debug(f"Successfully got {len(embeddings)} text embeddings") + try: + if modality == "text": + logger.debug( + f"Calling .embed() with {len(validated_items)} text items" + ) + embeddings, usage = await self.engine_array[model_name].embed( + validated_items + ) + logger.debug( + f"Successfully got {len(embeddings)} text embeddings" + ) - elif modality == "image": - logger.debug( - f"Calling .image_embed() with {len(validated_items)} image items" - ) - try: + elif modality == "image": + logger.debug( + f"Calling .image_embed() with {len(validated_items)} image items" + ) embeddings, usage = await self.engine_array[model_name].image_embed( images=validated_items ) - logger.debug(f"Successfully got {len(embeddings)} image embeddings") - except ModelNotDeployedError as e: + logger.debug( + f"Successfully got {len(embeddings)} image embeddings" + ) + + elif modality == "audio": + raise NotImplementedError( + "Audio modality is not yet implemented. " + "Currently supported modalities: 'text', 'image'" + ) + + else: + raise ValueError( + f"Invalid modality: '{modality}'. " + f"Supported modalities: 'text', 'image', 'audio' (not yet implemented)" + ) + except ModelNotDeployedError as e: + if modality == "image": error_msg = ( f"Model '{model_name}' does not support image embeddings. " f"Please use a multimodal model (e.g., 'jinaai/jina-clip-v1') " f"or use modality='text' instead." ) - logger.error(f"{error_msg} Original error: {e}") - raise ValueError(error_msg) from e - - elif modality == "audio": - raise NotImplementedError( - "Audio modality is not yet implemented. " - "Currently supported modalities: 'text', 'image'" - ) - - else: - raise ValueError( - f"Invalid modality: '{modality}'. " - f"Supported modalities: 'text', 'image', 'audio' (not yet implemented)" - ) + else: + error_msg = ( + f"Model '{model_name}' is not deployed or does not support " + f"{modality} embeddings." 
+ ) + logger.error(f"{error_msg} Original error: {e}") + raise ValueError(error_msg) from e if return_as_list: return [ From 4c7838bcf89a6030e87087e5c0cba31443bfc464 Mon Sep 17 00:00:00 2001 From: vadim Date: Tue, 2 Dec 2025 07:38:04 +0300 Subject: [PATCH 25/27] fix: update default User-Agent string for HTTP client in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f5b3bfb..d8c55fe 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ All behaviour is controlled through environment variables: | `DTYPES` | No | `auto` | Precision per model (`auto`, `fp16`, `fp8`). Semicolon-separated, must match `MODEL_NAMES`.
Example: `auto;auto` | | `INFINITY_QUEUE_SIZE` | No | `48000` | Max items queueable inside the Infinity engine. | | `RUNPOD_MAX_CONCURRENCY` | No | `300` | Max concurrent requests the RunPod wrapper will accept. | -| `HTTP_CLIENT_USER_AGENT` | No | Desktop Chrome UA | Override the browser-style User-Agent used for outbound image downloads. | +| `HTTP_CLIENT_USER_AGENT` | No | `Mozilla/5.0 ... Chrome/120.0.0.0 Safari/537.36` | Override the browser-style User-Agent used for outbound image downloads. | | `HTTP_CLIENT_TIMEOUT` | No | `10.0` | Request timeout (seconds) for outbound image fetches. | | `HTTP_CLIENT_MAX_CONNECTIONS` | No | `50` | Concurrent connection pool size for the shared `httpx` client. | | `HTTP_CLIENT_MAX_KEEPALIVE_CONNECTIONS` | No | `20` | Max keep-alive sockets retained by the shared `httpx` client. | From b5cd5af9ce61a9101d0407d5ea5d289b546008b4 Mon Sep 17 00:00:00 2001 From: vadim Date: Tue, 2 Dec 2025 07:38:42 +0300 Subject: [PATCH 26/27] fix: improve error messages for model availability and modality capabilities --- src/embedding_service.py | 36 +++++++++++++++++++----------------- tests/test_modality.py | 7 ++++++- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/embedding_service.py b/src/embedding_service.py index d812d4f..0dc4e30 100644 --- a/src/embedding_service.py +++ b/src/embedding_service.py @@ -98,14 +98,16 @@ async def route_openai_get_embeddings( available_models = self.list_models() if model_name not in available_models: - logger.error( - f"Requested model '{model_name}' not found. " - f"Available models: {available_models}" + available_models_msg = ( + ", ".join(available_models) if available_models else "none" ) - raise ValueError( - f"Model '{model_name}' is not available. " - f"Available models: {', '.join(available_models)}" + error_msg = ( + f"Model '{model_name}' is not deployed. " + f"Available deployments: {available_models_msg}. " + f"Deploy the requested model or choose one of the available models." ) + logger.error(error_msg) + raise ValueError(error_msg) if not isinstance(embedding_input, list): embedding_input = [embedding_input] @@ -164,17 +166,17 @@ async def route_openai_get_embeddings( f"Supported modalities: 'text', 'image', 'audio' (not yet implemented)" ) except ModelNotDeployedError as e: - if modality == "image": - error_msg = ( - f"Model '{model_name}' does not support image embeddings. " - f"Please use a multimodal model (e.g., 'jinaai/jina-clip-v1') " - f"or use modality='text' instead." - ) - else: - error_msg = ( - f"Model '{model_name}' is not deployed or does not support " - f"{modality} embeddings." - ) + available_capabilities = sorted( + getattr(self.engine_array[model_name], "capabilities", set()) + ) + capabilities_hint = ( + ", ".join(available_capabilities) if available_capabilities else "none" + ) + error_msg = ( + f"Model '{model_name}' does not expose the '{modality}' capability. " + f"Detected capabilities: {capabilities_hint}. Deploy a model that supports " + f"this modality or adjust the request." 
+ ) logger.error(f"{error_msg} Original error: {e}") raise ValueError(error_msg) from e diff --git a/tests/test_modality.py b/tests/test_modality.py index a776386..870188d 100644 --- a/tests/test_modality.py +++ b/tests/test_modality.py @@ -158,6 +158,9 @@ def test_wrong_modality_error() -> None: result = response.json() error_msg = _extract_error_message(result) assert error_msg is not None, f"Expected error object, got: {result}" + lowered = error_msg.lower() + assert "does not expose the 'image' capability" in lowered + assert "detected capabilities" in lowered def test_audio_not_implemented() -> None: @@ -362,7 +365,9 @@ def test_edge_nonexistent_model() -> None: result = response.json() error_msg = _extract_error_message(result) assert error_msg is not None, f"Expected model missing error: {result}" - assert "not available" in error_msg.lower() or "not found" in error_msg.lower() + lowered = error_msg.lower() + assert "is not deployed" in lowered + assert "available deployments" in lowered def test_edge_invalid_image_url() -> None: From ceee11308c2a63da48a94f4923a368484dd56f20 Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 11 Dec 2025 07:13:11 +0300 Subject: [PATCH 27/27] fix: reorder imports for better organization --- src/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/utils.py b/src/utils.py index 1a59304..4d8b42e 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,8 +1,7 @@ -from http import HTTPStatus -from typing import Annotated, Any, Dict, List, Literal, Optional, Union -from typing import Any, Dict, Iterable, List, Optional, Union -from uuid import uuid4 import time +from http import HTTPStatus +from typing import Annotated, Any, Dict, Iterable, List, Literal, Optional, Union + import numpy as np import numpy.typing as npt from pydantic import BaseModel, Field, conlist
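
A quick sketch of how the `HttpClientConfig` from PATCH 23 picks up the `HTTP_CLIENT_*` variables documented in PATCH 21 (the override values below are arbitrary examples; `src/` is assumed to be importable, matching the `from config import HttpClientConfig` import in `src/http_client.py`):

```python
import os

# Illustrative overrides; the defaults in src/config.py are 10.0 s, 50, and 20.
os.environ["HTTP_CLIENT_TIMEOUT"] = "30"
os.environ["HTTP_CLIENT_MAX_CONNECTIONS"] = "100"

from config import HttpClientConfig  # assumed on PYTHONPATH, as in src/http_client.py

cfg = HttpClientConfig()
# Each value is a cached_property read from the environment on first access,
# so the overrides above must be set before these attributes are touched.
print(cfg.timeout_seconds)            # 30.0
print(cfg.max_connections)            # 100
print(cfg.max_keepalive_connections)  # 20 (default, not overridden)
```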