diff --git a/.github/workflows/publish_dev_package.yml b/.github/workflows/publish_dev_package.yml
index 3277131a6..007418ebf 100644
--- a/.github/workflows/publish_dev_package.yml
+++ b/.github/workflows/publish_dev_package.yml
@@ -6,38 +6,88 @@ on:
       - master

 jobs:
-  build-n-publish:
-    name: Build and publish Python 🐍 distributions 📦 to TestPyPI
-    runs-on: ubuntu-latest
-    environment: testpypi
-    permissions:
-      id-token: write
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13, macos-14]
+
+    steps:
+    - name: Check out from Git
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
+    - name: Get history and tags for SCM versioning
+      run: |
+        git fetch --prune --unshallow || true
+        git fetch --depth=1 origin +refs/tags/*:refs/tags/* || true
+
+    - name: Build wheels
+      uses: pypa/cibuildwheel@v2.21.3
+      env:
+        CIBW_BUILD_VERBOSITY: 1
+
+    - name: Upload wheels
+      uses: actions/upload-artifact@v4
+      with:
+        name: wheels-${{ matrix.os }}
+        path: ./wheelhouse/*.whl
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+
     steps:
     - name: Check out from Git
       uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
     - name: Get history and tags for SCM versioning
       run: |
-        git fetch --prune --unshallow
-        git fetch --depth=1 origin +refs/tags/*:refs/tags/*
-    - name: Set up Python 3.13
+        git fetch --prune --unshallow || true
+        git fetch --depth=1 origin +refs/tags/*:refs/tags/* || true
+
+    - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: 3.13
-    - name: Install pypa/build
-      run: >-
-        python -m
-        pip install
-        build
-        --user
-    - name: Build a binary wheel and a source tarball
-      run: >-
-        python -m
-        build
-        --sdist
-        --wheel
-        --outdir dist/
-        .
+        python-version: '3.13'
+
+    - name: Build sdist
+      run: |
+        python -m pip install build
+        python -m build --sdist --outdir dist/
+
+    - name: Upload sdist
+      uses: actions/upload-artifact@v4
+      with:
+        name: sdist
+        path: dist/*.tar.gz
+
+  upload_testpypi:
+    name: Upload to TestPyPI
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+    environment: testpypi
+    permissions:
+      id-token: write
+
+    steps:
+    - name: Download all artifacts
+      uses: actions/download-artifact@v4
+      with:
+        pattern: wheels-*
+        merge-multiple: true
+        path: dist/
+
+    - name: Download sdist
+      uses: actions/download-artifact@v4
+      with:
+        name: sdist
+        path: dist/
+
     - name: Publish distribution 📦 to Test PyPI
       uses: pypa/gh-action-pypi-publish@release/v1
       with:
diff --git a/.github/workflows/publish_on_release.yml b/.github/workflows/publish_on_release.yml
index e84142a54..e30f985cc 100644
--- a/.github/workflows/publish_on_release.yml
+++ b/.github/workflows/publish_on_release.yml
@@ -5,38 +5,88 @@ on:
   types: [published]

 jobs:
-  build-n-publish:
-    name: Build and publish Python 🐍 distributions 📦 to PyPI
-    runs-on: ubuntu-latest
-    environment: pypi
-    permissions:
-      id-token: write
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13, macos-14]
+
+    steps:
+    - name: Check out from Git
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
+    - name: Get history and tags for SCM versioning
+      run: |
+        git fetch --prune --unshallow || true
+        git fetch --depth=1 origin +refs/tags/*:refs/tags/* || true
+
+    - name: Build wheels
+      uses: pypa/cibuildwheel@v2.21.3
+      env:
+        CIBW_BUILD_VERBOSITY: 1
+
+    - name: Upload wheels
+      uses: actions/upload-artifact@v4
+      with:
+        name: wheels-${{ matrix.os }}
+        path: ./wheelhouse/*.whl
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+
     steps:
     - name: Check out from Git
       uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
     - name: Get history and tags for SCM versioning
       run: |
-        git fetch --prune --unshallow
-        git fetch --depth=1 origin +refs/tags/*:refs/tags/*
-    - name: Set up Python 3.13
+        git fetch --prune --unshallow || true
+        git fetch --depth=1 origin +refs/tags/*:refs/tags/* || true
+
+    - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: 3.13
-    - name: Install pypa/build
-      run: >-
-        python -m
-        pip install
-        build
-        --user
-    - name: Build a binary wheel and a source tarball
-      run: >-
-        python -m
-        build
-        --sdist
-        --wheel
-        --outdir dist/
-        .
+        python-version: '3.13'
+
+    - name: Build sdist
+      run: |
+        python -m pip install build
+        python -m build --sdist --outdir dist/
+
+    - name: Upload sdist
+      uses: actions/upload-artifact@v4
+      with:
+        name: sdist
+        path: dist/*.tar.gz
+
+  upload_pypi:
+    name: Upload to PyPI
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      id-token: write
+
+    steps:
+    - name: Download all artifacts
+      uses: actions/download-artifact@v4
+      with:
+        pattern: wheels-*
+        merge-multiple: true
+        path: dist/
+
+    - name: Download sdist
+      uses: actions/download-artifact@v4
+      with:
+        name: sdist
+        path: dist/
+
     - name: Publish distribution 📦 to PyPI
       if: startsWith(github.ref, 'refs/tags')
       uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 000000000..b7b5c9792
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,13 @@
+include README.md
+include LICENSE
+include pyproject.toml
+include setup.py
+include _version_helper.py
+
+recursive-include pymilvus *.pyx *.pxd
+recursive-include pymilvus *.py
+recursive-exclude pymilvus *.pyc
+recursive-exclude pymilvus __pycache__
+
+global-exclude *.so
+global-exclude *.c
diff --git a/pymilvus/client/_fast_extract.pyx b/pymilvus/client/_fast_extract.pyx
new file mode 100644
index 000000000..fcccc7543
--- /dev/null
+++ b/pymilvus/client/_fast_extract.pyx
@@ -0,0 +1,113 @@
+# cython: language_level=3
+# cython: boundscheck=False
+# cython: wraparound=False
+# cython: cdivision=True
+
+from cpython.dict cimport PyDict_SetItem
+from cpython.list cimport PyList_GET_ITEM, PyList_GET_SIZE
+from cpython.object cimport PyObject
+
+
+cpdef void assign_scalar_fast(
+    list entity_rows,
+    str field_name,
+    list data_list,
+    list valid_data,
+    bint has_valid
+):
+    """Fast Cython implementation of scalar field assignment.
+
+    Args:
+        entity_rows: List of dictionaries to populate
+        field_name: Name of the field to set
+        data_list: List of values to assign (already converted from protobuf)
+        valid_data: List of validity flags (empty if no validity checks)
+        has_valid: Whether to check validity
+    """
+    cdef Py_ssize_t i
+    cdef Py_ssize_t row_count = PyList_GET_SIZE(entity_rows)
+    cdef dict row
+    cdef object value
+
+    # PyList_GET_ITEM returns a borrowed PyObject*; cast back with <object> before use.
+    if has_valid:
+        for i in range(row_count):
+            row = <object>PyList_GET_ITEM(entity_rows, i)
+            if <object>PyList_GET_ITEM(valid_data, i):
+                value = <object>PyList_GET_ITEM(data_list, i)
+            else:
+                value = None
+            PyDict_SetItem(row, field_name, value)
+    else:
+        for i in range(row_count):
+            row = <object>PyList_GET_ITEM(entity_rows, i)
+            value = <object>PyList_GET_ITEM(data_list, i)
+            PyDict_SetItem(row, field_name, value)
+
+
+cpdef void assign_array_fast(
+    list entity_rows,
+    str field_name,
+    list array_data,
+    list valid_data,
+    bint has_valid,
+    int element_type
+):
+    """Fast Cython implementation of array field assignment.
+
+    Args:
+        entity_rows: List of dictionaries to populate
+        field_name: Name of the field to set
+        array_data: List of array proto objects
+        valid_data: List of validity flags
+        has_valid: Whether to check validity
+        element_type: DataType of array elements
+    """
+    cdef Py_ssize_t i
+    cdef Py_ssize_t row_count = PyList_GET_SIZE(entity_rows)
+    cdef dict row
+    cdef object array_item
+    cdef object value
+
+    # element_type values from DataType enum
+    # INT64=5, BOOL=1, INT8/16/32=2/3/4, FLOAT=10, DOUBLE=11, STRING/VARCHAR=20/21
+
+    if has_valid:
+        for i in range(row_count):
+            row = <object>PyList_GET_ITEM(entity_rows, i)
+            if not <object>PyList_GET_ITEM(valid_data, i):
+                PyDict_SetItem(row, field_name, None)
+            else:
+                array_item = <object>PyList_GET_ITEM(array_data, i)
+                # Extract based on element type
+                if element_type == 5:  # INT64
+                    value = array_item.long_data.data
+                elif element_type == 1:  # BOOL
+                    value = array_item.bool_data.data
+                elif element_type in (2, 3, 4):  # INT8/16/32
+                    value = array_item.int_data.data
+                elif element_type == 10:  # FLOAT
+                    value = array_item.float_data.data
+                elif element_type == 11:  # DOUBLE
+                    value = array_item.double_data.data
+                else:  # STRING/VARCHAR
+                    value = array_item.string_data.data
+                PyDict_SetItem(row, field_name, value)
+    else:
+        for i in range(row_count):
+            row = <object>PyList_GET_ITEM(entity_rows, i)
+            array_item = <object>PyList_GET_ITEM(array_data, i)
+            # Extract based on element type
+            if element_type == 5:  # INT64
+                value = array_item.long_data.data
+            elif element_type == 1:  # BOOL
+                value = array_item.bool_data.data
+            elif element_type in (2, 3, 4):  # INT8/16/32
+                value = array_item.int_data.data
+            elif element_type == 10:  # FLOAT
+                value = array_item.float_data.data
+            elif element_type == 11:  # DOUBLE
+                value = array_item.double_data.data
+            else:  # STRING/VARCHAR
+                value = array_item.string_data.data
+            PyDict_SetItem(row, field_name, value)
diff --git a/pymilvus/client/entity_helper.py b/pymilvus/client/entity_helper.py
index 56fc07ad0..47853804c 100644
--- a/pymilvus/client/entity_helper.py
+++ b/pymilvus/client/entity_helper.py
@@ -6,6 +6,14 @@
 import numpy as np
 import orjson

+# Try to import Cython fast path, fallback to pure Python
+try:
+    from ._fast_extract import assign_array_fast, assign_scalar_fast
+
+    HAS_CYTHON = True
+except ImportError:
+    HAS_CYTHON = False
+
 from pymilvus.exceptions import (
     DataNotMatchException,
     ExceptionsMessage,
@@ -651,55 +659,39 @@ def extract_array_row_data_with_validity(field_data: Any, entity_rows: List[Dict
     field_name = field_data.field_name
     data = field_data.scalars.array_data.data
     element_type = field_data.scalars.array_data.element_type
+    valid_data = field_data.valid_data
+
+    # Use Cython fast path if available
+    if HAS_CYTHON:
+        assign_array_fast(
+            entity_rows, field_name, list(data), list(valid_data), True, int(element_type)
+        )
+        return
     if element_type == DataType.INT64:
-        [
-            entity_rows[i].__setitem__(
-                field_name, data[i].long_data.data if field_data.valid_data[i] else None
-            )
-            for i in range(row_count)
-        ]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].long_data.data if valid_data[i] else None
     elif element_type == DataType.BOOL:
-        [
-            entity_rows[i].__setitem__(
-                field_name, data[i].bool_data.data if field_data.valid_data[i] else None
-            )
-            for i in range(row_count)
-        ]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].bool_data.data if valid_data[i] else None
     elif element_type in (
         DataType.INT8,
         DataType.INT16,
         DataType.INT32,
     ):
-        [
-            entity_rows[i].__setitem__(
-                field_name, data[i].int_data.data if field_data.valid_data[i] else None
-            )
-            for i in range(row_count)
-        ]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].int_data.data if valid_data[i] else None
     elif element_type == DataType.FLOAT:
-        [
-            entity_rows[i].__setitem__(
-                field_name, data[i].float_data.data if field_data.valid_data[i] else None
-            )
-            for i in range(row_count)
-        ]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].float_data.data if valid_data[i] else None
     elif element_type == DataType.DOUBLE:
-        [
-            entity_rows[i].__setitem__(
-                field_name, data[i].double_data.data if field_data.valid_data[i] else None
-            )
-            for i in range(row_count)
-        ]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].double_data.data if valid_data[i] else None
     elif element_type in (
         DataType.STRING,
         DataType.VARCHAR,
     ):
-        [
-            entity_rows[i].__setitem__(
-                field_name, data[i].string_data.data if field_data.valid_data[i] else None
-            )
-            for i in range(row_count)
-        ]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].string_data.data if valid_data[i] else None
     else:
         raise MilvusException(message=f"Unsupported data type: {element_type}")

@@ -708,25 +700,36 @@ def extract_array_row_data_no_validity(field_data: Any, entity_rows: List[Dict],
     field_name = field_data.field_name
     data = field_data.scalars.array_data.data
     element_type = field_data.scalars.array_data.element_type
+
+    # Use Cython fast path if available
+    if HAS_CYTHON:
+        assign_array_fast(entity_rows, field_name, list(data), [], False, int(element_type))
+        return
     if element_type == DataType.INT64:
-        [entity_rows[i].__setitem__(field_name, data[i].long_data.data) for i in range(row_count)]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].long_data.data
     elif element_type == DataType.BOOL:
-        [entity_rows[i].__setitem__(field_name, data[i].bool_data.data) for i in range(row_count)]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].bool_data.data
     elif element_type in (
         DataType.INT8,
         DataType.INT16,
         DataType.INT32,
     ):
-        [entity_rows[i].__setitem__(field_name, data[i].int_data.data) for i in range(row_count)]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].int_data.data
     elif element_type == DataType.FLOAT:
-        [entity_rows[i].__setitem__(field_name, data[i].float_data.data) for i in range(row_count)]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].float_data.data
     elif element_type == DataType.DOUBLE:
-        [entity_rows[i].__setitem__(field_name, data[i].double_data.data) for i in range(row_count)]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].double_data.data
     elif element_type in (
         DataType.STRING,
         DataType.VARCHAR,
     ):
-        [entity_rows[i].__setitem__(field_name, data[i].string_data.data) for i in range(row_count)]
+        for i in range(row_count):
+            entity_rows[i][field_name] = data[i].string_data.data
     else:
         raise MilvusException(message=f"Unsupported data type: {element_type}")

@@ -765,13 +768,21 @@ def extract_row_data_from_fields_data_v2(
     valid_data = field_data.valid_data

     def assign_scalar(data: List[Any]) -> None:
-        if has_valid:
-            [
-                entity_rows[i].__setitem__(field_name, None if not valid_data[i] else data[i])
-                for i in range(row_count)
-            ]
+        # Convert protobuf RepeatedScalarFieldContainer to list for faster indexing
+        data_list = list(data)
+
+        # Use Cython fast path if available (2-3x faster)
+        if HAS_CYTHON:
+            assign_scalar_fast(
+                entity_rows, field_name, data_list, list(valid_data) if has_valid else [], has_valid
+            )
+        # Pure Python fallback
+        elif has_valid:
+            for i in range(row_count):
+                entity_rows[i][field_name] = None if not valid_data[i] else data_list[i]
         else:
-            [entity_rows[i].__setitem__(field_name, data[i]) for i in range(row_count)]
+            for i in range(row_count):
+                entity_rows[i][field_name] = data_list[i]

     if field_data.type == DataType.BOOL:
         data = field_data.scalars.bool_data.data
diff --git a/pymilvus/client/search_result.py b/pymilvus/client/search_result.py
index 3e5c26f4b..7508e9e08 100644
--- a/pymilvus/client/search_result.py
+++ b/pymilvus/client/search_result.py
@@ -92,7 +92,7 @@ def __init__(
                 self.lazy_field_data.append(field_data)
             else:
                 msg = f"Unsupported field type: {field_data.type}"
-                raise MilvusException(msg)
+                raise MilvusException(message=msg)
         super().__init__(top_k_res)

     def __str__(self) -> str:
@@ -218,7 +218,7 @@ def materialize(self):
                     item["entity"][field_name] = []
             else:
                 msg = f"Unsupported field type: {field_data.type}"
-                raise MilvusException(msg)
+                raise MilvusException(message=msg)

         self.has_materialized = True

@@ -555,7 +555,7 @@ def get_field_data(field_data: FieldData):
     if field_data.type == DataType._ARRAY_OF_VECTOR:
         return field_data.vectors.vector_array
     msg = f"Unsupported field type: {field_data.type}"
-    raise MilvusException(msg)
+    raise MilvusException(message=msg)


 class Hits(list):
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..797a4d089
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,55 @@
+"""
+Setup script for pymilvus with optional Cython extensions.
+
+This file enables automatic building of Cython extensions during install.
+Configuration is primarily in pyproject.toml.
+"""
+import sys
+
+from setuptools import Extension, setup
+
+# Try to use Cython if available
+try:
+    from Cython.Build import cythonize
+
+    USE_CYTHON = True
+except ImportError:
+    USE_CYTHON = False
+    print("Cython not found. Installing without compiled extensions.", file=sys.stderr)
+    print("For better performance, install Cython: pip install Cython", file=sys.stderr)
+
+
+def get_extensions():
+    """Build list of extensions to compile."""
+    if not USE_CYTHON:
+        return []
+
+    extensions = [
+        Extension(
+            "pymilvus.client._fast_extract",
+            sources=["pymilvus/client/_fast_extract.pyx"],
+            extra_compile_args=["-O3"],
+            language="c",
+        )
+    ]
+
+    return cythonize(
+        extensions,
+        compiler_directives={
+            "language_level": "3",
+            "boundscheck": False,
+            "wraparound": False,
+            "cdivision": True,
+            "embedsignature": True,
+        },
+        # force=False: only re-cythonize when the .pyx sources change
+        force=False,
+    )
+
+
+# setuptools will read most config from pyproject.toml
+# We only specify ext_modules here
+if __name__ == "__main__":
+    setup(
+        ext_modules=get_extensions(),
+    )
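
Reviewer note, not part of the patch: a quick way to confirm the optional extension actually loaded and to compare it against the pure-Python fallback. This is a minimal sketch assuming pymilvus is installed from this branch with Cython available; the field name "age", the row count, and the validity pattern are made up for illustration and the timings are not measured results.

```python
# Minimal check/benchmark sketch (illustrative only).
import timeit

from pymilvus.client import entity_helper

print("Cython fast path active:", entity_helper.HAS_CYTHON)

rows = [{} for _ in range(100_000)]            # entity_rows as the caller builds them
values = list(range(100_000))                  # scalar column already converted to a list
valid = [i % 10 != 0 for i in range(100_000)]  # validity flags (every 10th row is NULL)


def pure_python() -> None:
    # Mirrors the pure-Python fallback loop in entity_helper's assign_scalar
    for i in range(len(rows)):
        rows[i]["age"] = values[i] if valid[i] else None


print("pure python:", timeit.timeit(pure_python, number=10))

if entity_helper.HAS_CYTHON:
    from pymilvus.client._fast_extract import assign_scalar_fast

    def fast_path() -> None:
        assign_scalar_fast(rows, "age", values, valid, True)

    print("cython     :", timeit.timeit(fast_path, number=10))
```

If HAS_CYTHON prints False on a wheel produced by this CI, the .pyx was not compiled into the wheel; that is exactly the silent-fallback case the try/except in entity_helper.py is there to absorb.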
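A smaller, optional suggestion on setup.py: `-O3` is a GCC/Clang flag, and the matrix above also builds on windows-latest, where MSVC will typically warn (D9002) and ignore it. If an explicit optimization level is wanted on every platform, a per-platform switch could look like this hypothetical variant; the `/O2` choice is an assumption, not something the patch requires.

```python
# Hypothetical tweak to get_extensions() in setup.py: choose the optimization
# flag per platform instead of hard-coding the GCC/Clang-only -O3.
import sys

from setuptools import Extension

opt_flags = ["/O2"] if sys.platform == "win32" else ["-O3"]

extension = Extension(
    "pymilvus.client._fast_extract",
    sources=["pymilvus/client/_fast_extract.pyx"],
    extra_compile_args=opt_flags,
    language="c",
)
```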