diff --git a/fastembed/image/onnx_embedding.py b/fastembed/image/onnx_embedding.py
index f82fbae3..ec4c3ef3 100644
--- a/fastembed/image/onnx_embedding.py
+++ b/fastembed/image/onnx_embedding.py
@@ -74,25 +74,21 @@ def __init__(
         **kwargs: Any,
     ):
         """
+        Initializes an ONNX image embedding model with configurable device, threading, and loading options.
+
         Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
+            model_name: Name of the ONNX model to use, in the format <org>/<model>.
+            cache_dir: Optional directory for caching model files.
+            threads: Number of threads for ONNX runtime session.
+            providers: Optional list of ONNX runtime providers to use for inference.
+            cuda: If True, enables CUDA for inference; mutually exclusive with `providers`.
+            device_ids: Optional list of device IDs for parallel processing; used with `cuda=True`.
+            lazy_load: If True, defers model loading until first use.
+            device_id: Optional device ID for model loading in the current process.
+            specific_model_path: Optional path to a specific ONNX model directory.
+
         Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+            ValueError: If `model_name` is not in the required <org>/<model> format.
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -112,11 +108,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
 
         if not self.lazy_load:
@@ -153,19 +150,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of images into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Generates embeddings for one or more images using the loaded ONNX model.
+
         Args:
-            images: Iterator of image paths or single image path to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            images: A single image input or an iterable of image inputs to embed.
+            batch_size: Number of images to process in each batch.
+            parallel: Number of parallel workers to use for data-parallel encoding. If 0, uses all available cores; if None, disables parallel processing.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of numpy arrays, each representing the embedding of an input image.
         """
         yield from self._embed_images(
@@ -177,6 +170,8 @@ def embed(
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
             **kwargs,
         )
diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py
index f26ee9e0..99d5cee0 100644
--- a/fastembed/image/onnx_image_model.py
+++ b/fastembed/image/onnx_image_model.py
@@ -97,8 +97,29 @@ def _embed_images(
         providers: Optional[Sequence[OnnxProvider]] = None,
         cuda: bool = False,
         device_ids: Optional[list[int]] = None,
+        local_files_only: bool = False,
+        specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[T]:
+        """
+        Embeds images using the ONNX model, processing them sequentially or in parallel.
+
+        Depending on the input size and the `parallel` parameter, images are embedded either in batches
+        on the main process or distributed across multiple worker processes. Supports additional
+        configuration for model loading and caching.
+
+        Args:
+            model_name: Name of the ONNX model to use.
+            cache_dir: Directory for model caching.
+            images: Single image or iterable of images to embed.
+            batch_size: Number of images per batch.
+            parallel: Number of parallel worker processes to use; if None or input is small, runs sequentially.
+            cuda: Whether to use CUDA-enabled devices.
+            device_ids: List of device IDs for parallel workers.
+            local_files_only: If True, restricts model loading to local files.
+            specific_model_path: Path to a specific model file to load.
+
+        Yields:
+            Embeddings for each input image, post-processed as defined by the subclass.
+        """
         is_small = False
 
         if isinstance(images, (str, Path, Image.Image)):
@@ -123,6 +144,8 @@ def _embed_images(
                 "model_name": model_name,
                 "cache_dir": cache_dir,
                 "providers": providers,
+                "local_files_only": local_files_only,
+                "specific_model_path": specific_model_path,
                 **kwargs,
             }
diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py
index cdbf9f50..946086ed 100644
--- a/fastembed/late_interaction/colbert.py
+++ b/fastembed/late_interaction/colbert.py
@@ -130,25 +130,9 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
-        Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        Initializes a Colbert model instance for ONNX-based late interaction text embedding.
+
+        Configures model loading, device selection, threading, and caching options. Optionally supports lazy loading and specifying a custom ONNX model path. Raises a ValueError if the model name format is invalid.
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -169,11 +153,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
         self.mask_token_id: Optional[int] = None
         self.pad_token_id: Optional[int] = None
@@ -210,19 +195,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Generates embeddings for one or more documents using mean pooling with attention.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A single document or an iterable of documents to embed.
+            batch_size: Number of documents to process per batch.
+            parallel: Number of parallel workers to use for data-parallel encoding. If 0, uses all available cores. If None, uses default threading.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of embeddings, one per input document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
@@ -233,6 +214,8 @@ def embed(
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
             **kwargs,
         )
diff --git a/fastembed/late_interaction/token_embeddings.py b/fastembed/late_interaction/token_embeddings.py
index 1d4ebc9c..ec4844ba 100644
--- a/fastembed/late_interaction/token_embeddings.py
+++ b/fastembed/late_interaction/token_embeddings.py
@@ -9,7 +9,7 @@
 )
 from fastembed.text.onnx_embedding import OnnxTextEmbedding
 from fastembed.text.onnx_text_model import TextEmbeddingWorker
-import numpy as np
+
 
 supported_token_embeddings_models = [
     DenseModelDescription(
diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index c43ff9d0..6f5a70b3 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -57,24 +57,9 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-
-        Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        Initializes the ColPali multimodal embedding model with specified configuration.
+
+        Configures model loading, device and threading options, ONNX runtime providers, and cache directory. Supports lazy loading, CUDA acceleration, and custom model paths. Raises a ValueError if the model name format is invalid.
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -95,11 +80,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
         self.mask_token_id = None
         self.pad_token_id = None
@@ -213,18 +199,15 @@ def embed_text(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of documents into list of embeddings.
-
+        Generates embeddings for one or more text documents.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A string or iterable of strings representing the documents to embed.
+            batch_size: Number of documents to process per batch.
+            parallel: Number of parallel workers to use for encoding. If 0, uses all available cores; if None, disables parallelism.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of NumPy arrays, each representing the embedding of a document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
@@ -235,6 +218,8 @@ def embed_text(
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
             **kwargs,
         )
@@ -246,18 +231,15 @@ def embed_image(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of images into list of embeddings.
-
+        Generates embeddings for one or more images.
+
         Args:
-            images: Iterator of image paths or single image path to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            images: A single image input or an iterable of image inputs to embed.
+            batch_size: Number of images to process per batch.
+            parallel: Number of parallel workers to use for encoding. If 0, uses all available cores; if None, disables parallel processing.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of NumPy arrays, each representing the embedding of an input image.
         """
         yield from self._embed_images(
             model_name=self.model_name,
@@ -268,6 +250,8 @@ def embed_image(
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
             **kwargs,
         )
diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
index 089ba1b7..13e6cd6d 100644
--- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
+++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -120,8 +120,30 @@ def _embed_documents(
         providers: Optional[Sequence[OnnxProvider]] = None,
         cuda: bool = False,
         device_ids: Optional[list[int]] = None,
+        local_files_only: bool = False,
+        specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[T]:
+        """
+        Embeds a collection of text documents using the ONNX model, with optional parallel processing.
+
+        If the input is small or parallelism is not requested, processes documents in batches on the main process. Otherwise, distributes batches across parallel worker processes. Supports additional options for local file usage and specifying a model path.
+
+        Args:
+            model_name: Name of the ONNX model to use.
+            cache_dir: Directory for model caching.
+            documents: Single string or iterable of text documents to embed.
+            batch_size: Number of documents per batch.
+            parallel: Number of parallel worker processes to use. If None or input is small, runs in the main process.
+            providers: Optional sequence of ONNX runtime providers.
+            cuda: Whether to use CUDA-enabled devices.
+            device_ids: Optional list of device IDs for parallel workers.
+            local_files_only: If True, restricts model loading to local files.
+            specific_model_path: Optional path to a specific model file.
+
+        Yields:
+            Embeddings for each input document, in order.
+        """
         is_small = False
 
         if isinstance(documents, str):
@@ -146,6 +168,8 @@ def _embed_documents(
                 "model_name": model_name,
                 "cache_dir": cache_dir,
                 "providers": providers,
+                "local_files_only": local_files_only,
+                "specific_model_path": specific_model_path,
                 **kwargs,
             }
@@ -183,8 +207,15 @@ def _embed_images(
         providers: Optional[Sequence[OnnxProvider]] = None,
         cuda: bool = False,
         device_ids: Optional[list[int]] = None,
+        local_files_only: bool = False,
+        specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[T]:
+        """
+        Embeds images using the ONNX model, with optional parallel processing.
+
+        Processes a collection of images in batches, either sequentially or in parallel using worker processes.
+        Supports loading models from local files only or a specific model path if specified.
+        Yields post-processed embeddings for each image.
+        """
         is_small = False
 
         if isinstance(images, (str, Path, Image.Image)):
@@ -209,6 +240,8 @@ def _embed_images(
                 "model_name": model_name,
                 "cache_dir": cache_dir,
                 "providers": providers,
+                "local_files_only": local_files_only,
+                "specific_model_path": specific_model_path,
                 **kwargs,
             }
diff --git a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
index 238f3acc..ccc08ca6 100644
--- a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
+++ b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
@@ -88,25 +88,9 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
-        Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. Xenova/ms-marco-MiniLM-L-6-v2.
+        Initializes an ONNX-based cross-encoder model for text re-ranking.
+
+        Configures model selection, caching, threading, device assignment, ONNX runtime providers, and model loading behavior. Downloads and prepares the ONNX model for inference, with support for custom model paths and lazy loading. Raises a ValueError if the model name format is invalid.
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
         self.providers = providers
@@ -131,11 +115,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
 
         if not self.lazy_load:
@@ -180,6 +165,17 @@ def rerank_pairs(
         parallel: Optional[int] = None,
         **kwargs: Any,
     ) -> Iterable[float]:
+        """
+        Reranks pairs of texts using the ONNX cross-encoder model.
+
+        Args:
+            pairs: An iterable of (query, document) string tuples to be scored.
+            batch_size: Number of pairs to process in each batch. Defaults to 64.
+            parallel: Optional number of parallel workers for processing.
+
+        Yields:
+            Relevance scores as floats for each input pair, in order.
+        """
         yield from self._rerank_pairs(
             model_name=self.model_name,
             cache_dir=str(self.cache_dir),
@@ -189,6 +185,8 @@ def rerank_pairs(
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
             **kwargs,
         )
diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py
index de022e17..675b5536 100644
--- a/fastembed/rerank/cross_encoder/onnx_text_model.py
+++ b/fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -94,8 +94,30 @@ def _rerank_pairs(
         providers: Optional[Sequence[OnnxProvider]] = None,
         cuda: bool = False,
         device_ids: Optional[list[int]] = None,
+        local_files_only: bool = False,
+        specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[float]:
+        """
+        Reranks a sequence of text pairs using the ONNX cross-encoder model, with optional parallel processing.
+
+        If parallel processing is enabled and the input is large, distributes batches across multiple worker processes; otherwise, processes batches in the current process. Supports additional options for model loading, including restricting to local files and specifying a model path.
+
+        Args:
+            model_name: Name of the ONNX model to use.
+            cache_dir: Directory for model caching.
+            pairs: Iterable of (query, document) text pairs to rerank.
+            batch_size: Number of pairs per inference batch.
+            parallel: Number of worker processes to use; if None or input is small, runs in the current process.
+            providers: Optional ONNX runtime providers.
+            cuda: Whether to use CUDA-enabled devices.
+            device_ids: Optional list of device IDs for parallel workers.
+            local_files_only: If True, restricts model loading to local files only.
+            specific_model_path: Optional path to a specific model file.
+
+        Yields:
+            Reranked scores as floats, in the same order as the input pairs.
+        """
         is_small = False
 
         if isinstance(pairs, tuple):
@@ -120,6 +142,8 @@ def _rerank_pairs(
                 "model_name": model_name,
                 "cache_dir": cache_dir,
                 "providers": providers,
+                "local_files_only": local_files_only,
+                "specific_model_path": specific_model_path,
                 **kwargs,
             }
diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py
index f6806e72..ecc71de6 100644
--- a/fastembed/sparse/bm25.py
+++ b/fastembed/sparse/bm25.py
@@ -101,6 +101,23 @@ def __init__(
         specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ):
+        """
+        Initializes a BM25 sparse embedding model with configurable parameters.
+
+        Args:
+            model_name: Name of the BM25 model to use.
+            cache_dir: Directory for caching model files.
+            k: BM25 term frequency saturation parameter.
+            b: BM25 document length normalization parameter.
+            avg_len: Average document length for normalization.
+            language: Language for stemming and stopword removal.
+            token_max_length: Maximum allowed token length.
+            disable_stemmer: If True, disables stemming and stopword removal.
+            specific_model_path: Path to a specific model directory, if provided.
+
+        Raises:
+            ValueError: If the specified language is not supported.
+        """
         super().__init__(model_name, cache_dir, **kwargs)
 
         if language not in supported_languages:
@@ -115,11 +132,12 @@ def __init__(
         model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
 
         self.token_max_length = token_max_length
@@ -160,7 +178,24 @@ def _embed_documents(
         documents: Union[str, Iterable[str]],
         batch_size: int = 256,
         parallel: Optional[int] = None,
+        local_files_only: bool = False,
+        specific_model_path: Optional[str] = None,
     ) -> Iterable[SparseEmbedding]:
+        """
+        Embeds documents into sparse BM25 representations, supporting batching and parallel processing.
+
+        Args:
+            model_name: Name of the BM25 model to use.
+            cache_dir: Directory for model files and cache.
+            documents: Single string or iterable of document strings to embed.
+            batch_size: Number of documents per batch for processing.
+            parallel: Number of parallel worker processes to use; if None or input is small, processes sequentially.
+            local_files_only: If True, restricts model loading to local files only.
+            specific_model_path: Optional path to a specific model directory.
+
+        Yields:
+            SparseEmbedding objects representing the BM25 embeddings of the input documents.
+        """
         is_small = False
 
         if isinstance(documents, str):
@@ -188,6 +223,8 @@ def _embed_documents(
                 "language": self.language,
                 "token_max_length": self.token_max_length,
                 "disable_stemmer": self.disable_stemmer,
+                "local_files_only": local_files_only,
+                "specific_model_path": specific_model_path,
             }
             pool = ParallelWorkerPool(
                 num_workers=parallel or 1,
@@ -206,19 +243,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[SparseEmbedding]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Embeds one or more documents as sparse BM25 vectors.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A single document or an iterable of documents to embed.
+            batch_size: Number of documents to process per batch.
+            parallel: Number of worker processes to use for parallel embedding. If 0, uses all available cores; if None, runs sequentially.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of SparseEmbedding objects, one per input document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
@@ -226,9 +259,16 @@ def embed(
             documents=documents,
             batch_size=batch_size,
             parallel=parallel,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
         )
 
     def _stem(self, tokens: list[str]) -> list[str]:
+        """
+        Filters and stems a list of tokens by removing punctuation, stopwords, and overly long tokens.
+
+        Tokens are lowercased, filtered, and optionally stemmed based on the model's configuration.
+        """
         stemmed_tokens: list[str] = []
         for token in tokens:
             lower_token = token.lower()
diff --git a/fastembed/sparse/bm42.py b/fastembed/sparse/bm42.py
index 5d72d6c7..9e6bb14f 100644
--- a/fastembed/sparse/bm42.py
+++ b/fastembed/sparse/bm42.py
@@ -69,27 +69,22 @@ def __init__(
         **kwargs: Any,
     ):
         """
+        Initializes the Bm42 sparse embedding model with specified configuration.
+
         Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The providers to use for onnxruntime.
-            alpha (float, optional): Parameter, that defines the importance of the token weight in the document
-                versus the importance of the token frequency in the corpus. Defaults to 0.5, based on empirical testing.
-                It is recommended to only change this parameter based on training data for a specific dataset.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
+            model_name: Name of the model to use.
+            cache_dir: Optional path to the cache directory.
+            threads: Optional number of threads for ONNX runtime.
+            providers: Optional ONNX runtime providers.
+            alpha: Controls the importance of token weight versus token frequency in scoring.
+            cuda: Whether to use CUDA for inference.
+            device_ids: Optional list of device IDs for parallel processing.
+            lazy_load: If True, delays model loading until first use.
+            device_id: Optional device ID for model loading in the current process.
+            specific_model_path: Optional path to a specific ONNX model directory.
+
         Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+            ValueError: If the model_name is not in the format <org>/<model>.
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -110,11 +105,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
 
         self.invert_vocab: dict[int, str] = {}
@@ -277,19 +273,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[SparseEmbedding]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Generates sparse embeddings for one or more documents using mean-pooled attention.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A single document or an iterable of documents to embed.
+            batch_size: Number of documents to process in each batch.
+            parallel: Number of parallel workers to use for encoding. If 0, uses all available cores; if None, uses default threading.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of SparseEmbedding objects, one for each input document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
@@ -301,6 +293,8 @@ def embed(
             cuda=self.cuda,
             device_ids=self.device_ids,
             alpha=self.alpha,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
         )
 
     @classmethod
diff --git a/fastembed/sparse/minicoil.py b/fastembed/sparse/minicoil.py
index 6ade9dfc..21c2d1e8 100644
--- a/fastembed/sparse/minicoil.py
+++ b/fastembed/sparse/minicoil.py
@@ -79,29 +79,12 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The providers to use for onnxruntime.
-            k (float, optional): The k parameter in the BM25 formula. Defines the saturation of the term frequency.
-                I.e. defines how fast the moment when additional terms stop to increase the score. Defaults to 1.2.
-            b (float, optional): The b parameter in the BM25 formula. Defines the importance of the document length.
-                Defaults to 0.75.
-            avg_len (float, optional): The average length of the documents in the corpus. Defaults to 150.0.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
+        Initializes a MiniCOIL sparse text embedding model with specified configuration.
+
+        Configures model parameters, BM25 weighting options, device and threading settings, and model file paths.
+        Downloads and prepares the model files, and loads the ONNX model immediately unless lazy loading is enabled.
+
         Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+            ValueError: If the model name is not in the format <org>/<model> (e.g., BAAI/bge-base-en).
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -127,11 +110,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
        )
 
         if not self.lazy_load:
@@ -184,19 +168,17 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[SparseEmbedding]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Generates sparse embeddings for one or more documents.
+
+        Encodes input documents into sparse vector representations using mean pooling with attention, supporting batching and optional parallel processing.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A single document or an iterable of documents to embed.
+            batch_size: Number of documents to process per batch.
+            parallel: Number of parallel workers to use; if 0, uses all available cores.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of sparse embeddings, one for each input document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
             cache_dir=self.cache_dir,
             documents=documents,
             batch_size=batch_size,
             parallel=parallel,
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
             k=self.k,
             b=self.b,
             avg_len=self.avg_len,
             is_query=False,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+            **kwargs,
         )
 
     def query_embed(
         self, query: Union[str, Iterable[str]], **kwargs: Any
     ) -> Iterable[SparseEmbedding]:
         """
-        Encode a list of queries into list of embeddings.
+        Encodes queries into sparse embeddings suitable for retrieval tasks.
+
+        Args:
+            query: A single query string or an iterable of query strings.
+
+        Yields:
+            Sparse embeddings representing each input query.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
             cache_dir=self.cache_dir,
             documents=query,
             k=self.k,
             b=self.b,
             avg_len=self.avg_len,
             is_query=True,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+            **kwargs,
         )
 
     @classmethod
diff --git a/fastembed/sparse/splade_pp.py b/fastembed/sparse/splade_pp.py
index b1354b7c..0faab5aa 100644
--- a/fastembed/sparse/splade_pp.py
+++ b/fastembed/sparse/splade_pp.py
@@ -76,25 +76,12 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
+        Initializes a SPLADE++ sparse embedding model instance with specified configuration.
+
+        Configures model loading, device selection, threading, and ONNX runtime options. Downloads the model files if necessary and loads the ONNX model immediately unless lazy loading is enabled.
+
         Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+            ValueError: If the model_name is not in the format <org>/<model> (e.g., BAAI/bge-base-en).
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
         self.providers = providers
@@ -114,11 +101,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
 
         if not self.lazy_load:
@@ -142,19 +130,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[SparseEmbedding]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Encodes one or more documents into sparse embeddings.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A single document or an iterable of documents to embed.
+            batch_size: Number of documents to process per batch.
+            parallel: Number of parallel workers to use for encoding. If >1, enables data-parallel processing; if 0, uses all available cores; if None, uses default threading.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of SparseEmbedding objects, one for each input document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
@@ -165,6 +149,8 @@ def embed(
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
             **kwargs,
         )
diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py
index 8f145b3e..9894e43c 100644
--- a/fastembed/text/onnx_embedding.py
+++ b/fastembed/text/onnx_embedding.py
@@ -210,25 +210,21 @@ def __init__(
         **kwargs: Any,
     ):
         """
+        Initializes an ONNX-based text embedding model with configurable device, threading, and model source options.
+
         Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
+            model_name: Name of the ONNX model to use, in the format <org>/<model>.
+            cache_dir: Directory for caching downloaded models. If not provided, defaults to a system temp directory.
+            threads: Number of threads for ONNX runtime session.
+            providers: Sequence of ONNX runtime providers to use. Mutually exclusive with `cuda` and `device_ids`.
+            cuda: Whether to use CUDA for inference. Mutually exclusive with `providers`.
+            device_ids: List of device IDs for data parallel processing. Should be used with `cuda=True`.
+            lazy_load: If True, delays model loading until first use, useful for multi-GPU or parallel encoding.
+            device_id: Device ID for loading the model in the current process.
+            specific_model_path: Path to a specific ONNX model directory to load from an external location.
+
         Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+            ValueError: If `model_name` is not in the required <org>/<model> format.
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
         self.providers = providers
@@ -247,11 +243,12 @@ def __init__(
         self.model_description = self._get_model_description(model_name)
         self.cache_dir = str(define_cache_dir(cache_dir))
+        self._specific_model_path = specific_model_path
         self._model_dir = self.download_model(
             self.model_description,
             self.cache_dir,
             local_files_only=self._local_files_only,
-            specific_model_path=specific_model_path,
+            specific_model_path=self._specific_model_path,
         )
 
         if not self.lazy_load:
@@ -265,19 +262,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Generates embeddings for one or more documents using the ONNX text embedding model.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A string or iterable of strings representing the documents to embed.
+            batch_size: Number of documents to process per batch. Larger values may improve speed at the cost of memory usage.
+            parallel: Number of parallel workers to use for data-parallel encoding. If 0, uses all available cores. If None, disables parallel processing.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of numpy arrays, each representing the embedding of a document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
@@ -288,6 +281,8 @@ def embed(
             providers=self.providers,
             cuda=self.cuda,
             device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
             **kwargs,
         )
diff --git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py
index 45a9dc0e..80b39259 100644
--- a/fastembed/text/onnx_text_model.py
+++ b/fastembed/text/onnx_text_model.py
@@ -108,8 +108,30 @@ def _embed_documents(
         providers: Optional[Sequence[OnnxProvider]] = None,
         cuda: bool = False,
         device_ids: Optional[list[int]] = None,
+        local_files_only: bool = False,
+        specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[T]:
+        """
+        Embeds a collection of documents using an ONNX text model, with optional batching and parallel processing.
+
+        Depending on the input size and parallelization settings, processes documents either sequentially or by distributing batches across multiple worker processes. Supports loading models from local files or a specific path.
+
+        Args:
+            model_name: Name of the ONNX model to use.
+            cache_dir: Directory for caching model files.
+            documents: A string or iterable of strings representing the documents to embed.
+            batch_size: Number of documents per batch for embedding.
+            parallel: Number of parallel worker processes to use; if None or input is small, runs sequentially.
+            providers: Optional sequence of ONNX runtime providers.
+            cuda: Whether to use CUDA-enabled devices.
+            device_ids: Optional list of device IDs for parallel processing.
+            local_files_only: If True, restricts model loading to local files only.
+            specific_model_path: Optional path to a specific model file.
+
+        Yields:
+            Embeddings for each document, in the same order as the input.
+        """
         is_small = False
 
         if isinstance(documents, str):
@@ -136,6 +158,8 @@ def _embed_documents(
                 "model_name": model_name,
                 "cache_dir": cache_dir,
                 "providers": providers,
+                "local_files_only": local_files_only,
+                "specific_model_path": specific_model_path,
                 **kwargs,
             }
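
Taken together, the hunks above follow a single pattern: each __init__ now stores specific_model_path on the instance, and every embed/embed_text/embed_image/rerank_pairs call forwards local_files_only and specific_model_path into the worker parameters, so data-parallel worker processes resolve the model from the same location as the parent process instead of re-resolving it themselves. A minimal usage sketch of the intended effect, assuming the public TextEmbedding wrapper forwards these keyword arguments down to OnnxTextEmbedding (the model directory path and documents below are placeholders):

from fastembed import TextEmbedding

# Placeholder path to a locally exported ONNX model directory.
local_model_dir = "/models/bge-small-en-v1.5-onnx"

# lazy_load=True defers session creation, so each parallel worker builds its own
# onnxruntime session; with this change the workers also receive specific_model_path
# (and local_files_only) instead of re-resolving the model on their own.
model = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    specific_model_path=local_model_dir,
    lazy_load=True,
)

documents = ["fastembed runs on onnxruntime", "embeddings are numpy arrays"] * 500

# parallel=2 -> two worker processes; parallel=0 -> all available cores;
# parallel=None -> single process with default onnxruntime threading.
embeddings = list(model.embed(documents, batch_size=256, parallel=2))
print(len(embeddings), embeddings[0].shape)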