53 changes: 24 additions & 29 deletions fastembed/image/onnx_embedding.py
@@ -74,25 +74,21 @@ def __init__(
**kwargs: Any,
):
"""
Initializes an ONNX image embedding model with configurable device, threading, and loading options.

Args:
model_name (str): The name of the model to use.
cache_dir (str, optional): The path to the cache directory.
Can be set using the `FASTEMBED_CACHE_PATH` env variable.
Defaults to `fastembed_cache` in the system's temp directory.
threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
Defaults to False.
device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else

model_name: Name of the ONNX model to use, in the format <org>/<model>.
cache_dir: Optional directory for caching model files.
threads: Number of threads for ONNX runtime session.
providers: Optional list of ONNX runtime providers to use for inference.
cuda: If True, enables CUDA for inference; mutually exclusive with `providers`.
device_ids: Optional list of device IDs for parallel processing; used with `cuda=True`.
lazy_load: If True, defers model loading until first use.
device_id: Optional device ID for model loading in the current process.
specific_model_path: Optional path to a specific ONNX model directory.

Raises:
ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
ValueError: If `model_name` is not in the required <org>/<model> format.
"""

super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -112,11 +108,12 @@ def __init__(

self.model_description = self._get_model_description(model_name)
self.cache_dir = str(define_cache_dir(cache_dir))
self._specific_model_path = specific_model_path
self._model_dir = self.download_model(
self.model_description,
self.cache_dir,
local_files_only=self._local_files_only,
specific_model_path=specific_model_path,
specific_model_path=self._specific_model_path,
)

if not self.lazy_load:
@@ -153,19 +150,15 @@ def embed(
**kwargs: Any,
) -> Iterable[NumpyArray]:
"""
Encode a list of images into list of embeddings.
We use mean pooling with attention so that the model can handle variable-length inputs.

Generates embeddings for one or more images using the loaded ONNX model.

Args:
images: Iterator of image paths or single image path to embed
batch_size: Batch size for encoding -- higher values will use more memory, but be faster
parallel:
If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
If 0, use all available cores.
If None, don't use data-parallel processing, use default onnxruntime threading instead.

images: A single image input or an iterable of image inputs to embed.
batch_size: Number of images to process in each batch.
parallel: Number of parallel workers to use for data-parallel encoding. If 0, uses all available cores; if None, disables parallel processing.

Returns:
List of embeddings, one per document
An iterable of numpy arrays, each representing the embedding of an input image.
"""

yield from self._embed_images(
@@ -177,6 +170,8 @@ def embed(
providers=self.providers,
cuda=self.cuda,
device_ids=self.device_ids,
local_files_only=self._local_files_only,
specific_model_path=self._specific_model_path,
**kwargs,
)

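For context, here is a minimal usage sketch of the options documented above. It assumes the public ImageEmbedding wrapper; the model name and image paths are illustrative, not part of this change.

    from fastembed import ImageEmbedding

    # lazy_load defers ONNX session creation, as recommended for multi-GPU
    # data-parallel encoding where every worker loads its own copy of the model.
    model = ImageEmbedding(
        model_name="Qdrant/clip-ViT-B-32-vision",  # assumed supported checkpoint
        lazy_load=True,
        cuda=True,          # mutually exclusive with an explicit `providers` list
        device_ids=[0, 1],  # one worker per listed GPU when parallel > 1
    )

    # parallel=2 enables data-parallel encoding; parallel=0 would use all cores,
    # and parallel=None keeps default onnxruntime threading in this process.
    embeddings = list(
        model.embed(["page_1.png", "page_2.png"], batch_size=16, parallel=2)
    )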
23 changes: 23 additions & 0 deletions fastembed/image/onnx_image_model.py
@@ -97,8 +97,29 @@ def _embed_images(
providers: Optional[Sequence[OnnxProvider]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
local_files_only: bool = False,
specific_model_path: Optional[str] = None,
**kwargs: Any,
) -> Iterable[T]:
"""
Embeds images using the ONNX model, processing them sequentially or in parallel.

Depending on the input size and the `parallel` parameter, images are embedded either in batches on the main process or distributed across multiple worker processes. Supports additional configuration for model loading and caching.

Args:
model_name: Name of the ONNX model to use.
cache_dir: Directory for model caching.
images: Single image or iterable of images to embed.
batch_size: Number of images per batch.
parallel: Number of parallel worker processes to use; if None or input is small, runs sequentially.
cuda: Whether to use CUDA-enabled devices.
device_ids: List of device IDs for parallel workers.
local_files_only: If True, restricts model loading to local files.
specific_model_path: Path to a specific model file to load.

Yields:
Embeddings for each input image, post-processed as defined by the subclass.
"""
is_small = False

if isinstance(images, (str, Path, Image.Image)):
@@ -123,6 +144,8 @@
"model_name": model_name,
"cache_dir": cache_dir,
"providers": providers,
"local_files_only": local_files_only,
"specific_model_path": specific_model_path,
**kwargs,
}

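The two keys added to the worker params dict above are what carry offline and custom-path settings into spawned processes. A simplified sketch of the consuming side, assuming fastembed's worker pattern (class name and signature condensed for illustration):

    from typing import Any

    from fastembed.image.onnx_embedding import OnnxImageEmbedding

    class ImageEmbeddingWorkerSketch:
        """Hypothetical stand-in for fastembed's per-process embedding worker."""

        def __init__(self, model_name: str, cache_dir: str, **kwargs: Any) -> None:
            # Each worker rebuilds the model from the forwarded kwargs. Without
            # local_files_only and specific_model_path in that dict, a worker
            # could attempt a fresh download even when the parent process was
            # configured for offline use or a custom local model directory.
            self.model = OnnxImageEmbedding(
                model_name=model_name,
                cache_dir=cache_dir,
                threads=1,
                **kwargs,  # now includes local_files_only / specific_model_path
            )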
45 changes: 14 additions & 31 deletions fastembed/late_interaction/colbert.py
@@ -130,25 +130,9 @@ def __init__(
**kwargs: Any,
):
"""
Args:
model_name (str): The name of the model to use.
cache_dir (str, optional): The path to the cache directory.
Can be set using the `FASTEMBED_CACHE_PATH` env variable.
Defaults to `fastembed_cache` in the system's temp directory.
threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
Defaults to False.
device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else

Raises:
ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
Initializes a ColBERT model instance for ONNX-based late-interaction text embedding.

Configures model loading, device selection, threading, and caching options. Optionally supports lazy loading and specifying a custom ONNX model path. Raises a ValueError if the model name format is invalid.
"""

super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -169,11 +153,12 @@
self.model_description = self._get_model_description(model_name)
self.cache_dir = str(define_cache_dir(cache_dir))

self._specific_model_path = specific_model_path
self._model_dir = self.download_model(
self.model_description,
self.cache_dir,
local_files_only=self._local_files_only,
specific_model_path=specific_model_path,
specific_model_path=self._specific_model_path,
)
self.mask_token_id: Optional[int] = None
self.pad_token_id: Optional[int] = None
@@ -210,19 +195,15 @@ def embed(
**kwargs: Any,
) -> Iterable[NumpyArray]:
"""
Encode a list of documents into list of embeddings.
We use mean pooling with attention so that the model can handle variable-length inputs.

Generates token-level (late-interaction) embeddings for one or more documents.

Args:
documents: Iterator of documents or single document to embed
batch_size: Batch size for encoding -- higher values will use more memory, but be faster
parallel:
If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
If 0, use all available cores.
If None, don't use data-parallel processing, use default onnxruntime threading instead.

documents: A single document or an iterable of documents to embed.
batch_size: Number of documents to process per batch.
parallel: Number of parallel workers to use for data-parallel encoding. If 0, uses all available cores. If None, uses default threading.

Returns:
List of embeddings, one per document
An iterable of embeddings, one per input document.
"""
yield from self._embed_documents(
model_name=self.model_name,
@@ -233,6 +214,8 @@
providers=self.providers,
cuda=self.cuda,
device_ids=self.device_ids,
local_files_only=self._local_files_only,
specific_model_path=self._specific_model_path,
**kwargs,
)

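A short usage sketch for the ColBERT wrapper whose docstrings change above; the checkpoint name is one of fastembed's published models, and the printed shape is indicative:

    from fastembed import LateInteractionTextEmbedding

    model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

    docs = ["late interaction keeps one vector per token, not a pooled vector"]
    for embedding in model.embed(docs, batch_size=8, parallel=None):
        # ColBERT yields a (num_tokens, dim) matrix per document, later scored
        # against query token vectors with MaxSim.
        print(embedding.shape)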
2 changes: 1 addition & 1 deletion fastembed/late_interaction/token_embeddings.py
@@ -9,7 +9,7 @@
)
from fastembed.text.onnx_embedding import OnnxTextEmbedding
from fastembed.text.onnx_text_model import TextEmbeddingWorker
import numpy as np


supported_token_embeddings_models = [
DenseModelDescription(
62 changes: 23 additions & 39 deletions fastembed/late_interaction_multimodal/colpali.py
@@ -57,24 +57,9 @@ def __init__(
**kwargs: Any,
):
"""
Args:
model_name (str): The name of the model to use.
cache_dir (str, optional): The path to the cache directory.
Can be set using the `FASTEMBED_CACHE_PATH` env variable.
Defaults to `fastembed_cache` in the system's temp directory.
threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
Defaults to False.
device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
device_id (Optional[int], optional): The device id to use for loading the model in the worker process.

Raises:
ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
Initializes the ColPali multimodal embedding model with specified configuration.

Configures model loading, device and threading options, ONNX runtime providers, and cache directory. Supports lazy loading, CUDA acceleration, and custom model paths. Raises a ValueError if the model name format is invalid.
"""

super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -95,11 +80,12 @@
self.model_description = self._get_model_description(model_name)
self.cache_dir = str(define_cache_dir(cache_dir))

self._specific_model_path = specific_model_path
self._model_dir = self.download_model(
self.model_description,
self.cache_dir,
local_files_only=self._local_files_only,
specific_model_path=specific_model_path,
specific_model_path=self._specific_model_path,
)
self.mask_token_id = None
self.pad_token_id = None
@@ -213,18 +199,15 @@ def embed_text(
**kwargs: Any,
) -> Iterable[NumpyArray]:
"""
Encode a list of documents into list of embeddings.

Generates embeddings for one or more text documents.
Args:
documents: Iterator of documents or single document to embed
batch_size: Batch size for encoding -- higher values will use more memory, but be faster
parallel:
If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
If 0, use all available cores.
If None, don't use data-parallel processing, use default onnxruntime threading instead.

documents: A string or iterable of strings representing the documents to embed.
batch_size: Number of documents to process per batch.
parallel: Number of parallel workers to use for encoding. If 0, uses all available cores; if None, disables parallelism.

Returns:
List of embeddings, one per document
An iterable of NumPy arrays, each representing the embedding of a document.
"""
yield from self._embed_documents(
model_name=self.model_name,
@@ -235,6 +218,8 @@
providers=self.providers,
cuda=self.cuda,
device_ids=self.device_ids,
local_files_only=self._local_files_only,
specific_model_path=self._specific_model_path,
**kwargs,
)

@@ -246,18 +231,15 @@ def embed_image(
**kwargs: Any,
) -> Iterable[NumpyArray]:
"""
Encode a list of images into list of embeddings.

Generates embeddings for one or more images.
Args:
images: Iterator of image paths or single image path to embed
batch_size: Batch size for encoding -- higher values will use more memory, but be faster
parallel:
If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
If 0, use all available cores.
If None, don't use data-parallel processing, use default onnxruntime threading instead.

images: A single image input or an iterable of image inputs to embed.
batch_size: Number of images to process per batch.
parallel: Number of parallel workers to use for encoding. If 0, uses all available cores; if None, disables parallel processing.

Returns:
List of embeddings, one per document
An iterable of NumPy arrays, each representing the embedding of an input image.
"""
yield from self._embed_images(
model_name=self.model_name,
@@ -268,6 +250,8 @@
providers=self.providers,
cuda=self.cuda,
device_ids=self.device_ids,
local_files_only=self._local_files_only,
specific_model_path=self._specific_model_path,
**kwargs,
)

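The ColPali changes mirror the text-only case on both modalities. An illustrative sketch, assuming the public multimodal wrapper and a supported checkpoint name:

    from fastembed import LateInteractionMultimodalEmbedding

    model = LateInteractionMultimodalEmbedding("Qdrant/colpali-v1.3-fp16")

    # Queries and page images both become multi-vector embeddings, suited to
    # late-interaction (MaxSim) retrieval over document screenshots.
    query_embeddings = list(model.embed_text(["where is the revenue table?"]))
    page_embeddings = list(model.embed_image(["report_page_1.png"]))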
33 changes: 33 additions & 0 deletions fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -120,8 +120,30 @@ def _embed_documents(
providers: Optional[Sequence[OnnxProvider]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
local_files_only: bool = False,
specific_model_path: Optional[str] = None,
**kwargs: Any,
) -> Iterable[T]:
"""
Embeds a collection of text documents using the ONNX model, with optional parallel processing.

If the input is small or parallelism is not requested, processes documents in batches on the main process. Otherwise, distributes batches across parallel worker processes. Supports additional options for local file usage and specifying a model path.

Args:
model_name: Name of the ONNX model to use.
cache_dir: Directory for model caching.
documents: Single string or iterable of text documents to embed.
batch_size: Number of documents per batch.
parallel: Number of parallel worker processes to use. If None or input is small, runs in the main process.
providers: Optional sequence of ONNX runtime providers.
cuda: Whether to use CUDA-enabled devices.
device_ids: Optional list of device IDs for parallel workers.
local_files_only: If True, restricts model loading to local files.
specific_model_path: Optional path to a specific model file.

Yields:
Embeddings for each input document, in order.
"""
is_small = False

if isinstance(documents, str):
@@ -146,6 +168,8 @@
"model_name": model_name,
"cache_dir": cache_dir,
"providers": providers,
"local_files_only": local_files_only,
"specific_model_path": specific_model_path,
**kwargs,
}

@@ -183,8 +207,15 @@ def _embed_images(
providers: Optional[Sequence[OnnxProvider]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
local_files_only: bool = False,
specific_model_path: Optional[str] = None,
**kwargs: Any,
) -> Iterable[T]:
"""
Embeds images using the ONNX model, with optional parallel processing.

Processes a collection of images in batches, either sequentially or across parallel worker processes. Model loading can be restricted to local files or pointed at a specific model path. Yields post-processed embeddings for each image.
"""
is_small = False

if isinstance(images, (str, Path, Image.Image)):
@@ -209,6 +240,8 @@
"model_name": model_name,
"cache_dir": cache_dir,
"providers": providers,
"local_files_only": local_files_only,
"specific_model_path": specific_model_path,
**kwargs,
}

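Condensed view of the dispatch rule these docstrings describe. This is a hedged reconstruction, not the method body: the exact is_small condition is an assumption, and the worker-pool details are elided.

    from typing import Optional

    def choose_execution_path(
        n_inputs: int, batch_size: int, parallel: Optional[int]
    ) -> str:
        """Hypothetical condensation of the documented parallel semantics."""
        is_small = n_inputs < batch_size  # assumption: tiny inputs skip the pool
        if parallel is None or is_small:
            return "main-process"       # default onnxruntime threading
        if parallel == 0:
            return "workers:all-cores"  # pool sized to the available cores
        return f"workers:{parallel}"    # explicit data-parallel worker count

    assert choose_execution_path(3, 16, 4) == "main-process"
    assert choose_execution_path(1000, 16, 0) == "workers:all-cores"
    assert choose_execution_path(1000, 16, 4) == "workers:4"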