diff --git a/README.md b/README.md index 93698abaf..2922c16c1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,12 @@ # DiskANN -[![DiskANN Pull Request Build and Test](https://github.com/microsoft/DiskANN/actions/workflows/pr-test.yml/badge.svg)](https://github.com/microsoft/DiskANN/actions/workflows/pr-test.yml) +[![DiskANN Paper](https://img.shields.io/badge/Paper-NeurIPS%3A_DiskANN-blue)](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf) +[![DiskANN Paper](https://img.shields.io/badge/Paper-Arxiv%3A_Fresh--DiskANN-blue)](https://arxiv.org/abs/2105.09613) +[![DiskANN Paper](https://img.shields.io/badge/Paper-Filtered--DiskANN-blue)](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) +[![DiskANN Main](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml/badge.svg?branch=main)](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml) +[![PyPI version](https://img.shields.io/pypi/v/diskannpy.svg)](https://pypi.org/project/diskannpy/) +[![Downloads shield](https://pepy.tech/badge/diskannpy)](https://pepy.tech/project/diskannpy) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) DiskANN is a suite of scalable, accurate and cost-effective approximate nearest neighbor search algorithms for large-scale vector search that support real-time changes and simple filters. This code is based on ideas from the [DiskANN](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf), [Fresh-DiskANN](https://arxiv.org/abs/2105.09613) and the [Filtered-DiskANN](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) papers with further improvements. @@ -12,8 +18,6 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio See [guidelines](CONTRIBUTING.md) for contributing to this project. 
- - ## Linux build: Install the following packages through apt-get @@ -71,12 +75,16 @@ OR for Visual Studio 2017 and earlier: ``` \cmake .. ``` -* This will create a diskann.sln solution. Open it from VisualStudio and build either Release or Debug configuration. - * Alternatively, use MSBuild: +**This will create a diskann.sln solution**. Now you can: + +- Open it from VisualStudio and build either Release or Debug configuration. +- `\cmake --build build` +- Use MSBuild: ``` msbuild.exe diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64" ``` - * This will also build gperftools submodule for libtcmalloc_minimal dependency. + +* This will also build gperftools submodule for libtcmalloc_minimal dependency. * Generated binaries are stored in the x64/Release or x64/Debug directories. ## Usage: @@ -88,16 +96,16 @@ Please see the following pages on using the compiled code: - [Commandline examples for using in-memory streaming indices](workflows/dynamic_index.md) - [Commandline interface for building and search in memory indices with label data and filters](workflows/filtered_in_memory.md) - [Commandline interface for building and search SSD based indices with label data and filters](workflows/filtered_ssd_index.md) -- To be added: Python interfaces and docker files +- [diskannpy - DiskANN as a python extension module](python/README.md) Please cite this software in your work as: ``` @misc{diskann-github, - author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan}}, + author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and 
Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan and Patel, Yash}}, title = {{DiskANN: Graph-structured Indices for Scalable, Fast, Fresh and Filtered Approximate Nearest Neighbor Search}}, url = {https://github.com/Microsoft/DiskANN}, - version = {0.5}, + version = {0.6.0}, year = {2023} } ``` diff --git a/pyproject.toml b/pyproject.toml index 55e8a465a..fb4349fab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta" [project] name = "diskannpy" -version = "0.5.0.rc5" +version = "0.6.0" description = "DiskANN Python extension module" readme = "python/README.md" @@ -25,9 +25,19 @@ authors = [ {name = "Dax Pryce", email = "daxpryce@microsoft.com"} ] +[project.optional-dependencies] +dev = ["black", "isort", "mypy"] + [tool.setuptools] package-dir = {"" = "python/src"} +[tool.isort] +profile = "black" +multi_line_output = 3 + +[tool.mypy] +plugins = "numpy.typing.mypy_plugin" + [tool.cibuildwheel] manylinux-x86_64-image = "manylinux_2_28" test-requires = ["scikit-learn~=1.2"] @@ -35,7 +45,6 @@ build-frontend = "build" skip = ["pp*", "*-win32", "*-manylinux_i686", "*-musllinux*"] test-command = "python -m unittest discover {project}/python/tests" - [tool.cibuildwheel.linux] before-build = [ "dnf makecache --refresh", diff --git a/python/README.md b/python/README.md index 28628187a..1365fb422 100644 --- a/python/README.md +++ b/python/README.md @@ -1,9 +1,17 @@ # diskannpy +[![DiskANN Paper](https://img.shields.io/badge/Paper-NeurIPS%3A_DiskANN-blue)](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf) +[![DiskANN Paper](https://img.shields.io/badge/Paper-Arxiv%3A_Fresh--DiskANN-blue)](https://arxiv.org/abs/2105.09613) +[![DiskANN 
Paper](https://img.shields.io/badge/Paper-Filtered--DiskANN-blue)](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) +[![DiskANN Main](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml/badge.svg?branch=main)](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml) +[![PyPI version](https://img.shields.io/pypi/v/diskannpy.svg)](https://pypi.org/project/diskannpy/) +[![Downloads shield](https://pepy.tech/badge/diskannpy)](https://pepy.tech/project/diskannpy) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) + ## Installation Packages published to PyPI will always be built using the latest numpy major.minor release (at this time, 1.25). -Conda distributions for versions 1.19-1.25 will be completed as a future effort. In the meantime, feel free to +Conda distributions for versions 1.19-1.25 will be completed as a future effort. In the meantime, feel free to clone this repository and build it yourself. ## Local Build Instructions @@ -16,11 +24,18 @@ build `diskannpy` with these additional instructions. In the root folder of DiskANN, there is a file `pyproject.toml`. You will need to edit the version of numpy in both the `[build-system.requires]` section, as well as the `[project.dependencies]` section. The version numbers must match. +#### Linux ```bash -python3.11 -m venv venv # versions from python3.8 and up should work. on windows, you might need to use py -3.11 -m venv venv -source venv/bin/activate # linux -# or -venv\Scripts\Activate.{ps1, bat} # windows +python3.11 -m venv venv # versions from python3.9 and up should work +source venv/bin/activate +pip install build +python -m build +``` + +#### Windows +```powershell +py -3.11 -m venv venv # versions from python3.9 and up should work +venv\Scripts\Activate.ps1 pip install build python -m build ``` @@ -31,10 +46,10 @@ The built wheel will be placed in the `dist` directory in your DiskANN root. 
Ins Please cite this software in your work as: ``` @misc{diskann-github, - author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan}}, + author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan and Patel, Yash}}, title = {{DiskANN: Graph-structured Indices for Scalable, Fast, Fresh and Filtered Approximate Nearest Neighbor Search}}, url = {https://github.com/Microsoft/DiskANN}, - version = {0.5}, + version = {0.6.0}, year = {2023} } -``` \ No newline at end of file +``` diff --git a/python/apps/in-mem-dynamic.py b/python/apps/in-mem-dynamic.py index 83e65b4fc..f97e1313f 100644 --- a/python/apps/in-mem-dynamic.py +++ b/python/apps/in-mem-dynamic.py @@ -40,26 +40,25 @@ def insert_and_search( npts, ndims = utils.get_bin_metadata(indexdata_file) if dtype_str == "float": - index = diskannpy.DynamicMemoryIndex( - "l2", np.float32, ndims, npts, Lb, graph_degree - ) - queries = utils.bin_to_numpy(np.float32, querydata_file) - data = utils.bin_to_numpy(np.float32, indexdata_file) + dtype = np.float32 elif dtype_str == "int8": - index = diskannpy.DynamicMemoryIndex( - "l2", np.int8, ndims, npts, Lb, graph_degree - ) - queries = utils.bin_to_numpy(np.int8, querydata_file) - data = utils.bin_to_numpy(np.int8, indexdata_file) + dtype = np.int8 elif dtype_str == "uint8": - index = diskannpy.DynamicMemoryIndex( - "l2", np.uint8, ndims, npts, Lb, graph_degree - ) - queries = 
utils.bin_to_numpy(np.uint8, querydata_file) - data = utils.bin_to_numpy(np.uint8, indexdata_file) + dtype = np.uint8 else: raise ValueError("data_type must be float, int8 or uint8") + index = diskannpy.DynamicMemoryIndex( + distance_metric="l2", + vector_dtype=dtype, + dimensions=ndims, + max_vectors=npts, + complexity=Lb, + graph_degree=graph_degree + ) + queries = diskannpy.vectors_from_file(querydata_file, dtype) + data = diskannpy.vectors_from_file(indexdata_file, dtype) + tags = np.zeros(npts, dtype=np.uintc) timer = utils.Timer() for i in range(npts): diff --git a/python/apps/insert-in-clustered-order.py b/python/apps/insert-in-clustered-order.py index 3ff1d7e25..25cb9d53c 100644 --- a/python/apps/insert-in-clustered-order.py +++ b/python/apps/insert-in-clustered-order.py @@ -24,26 +24,25 @@ def insert_and_search( npts, ndims = utils.get_bin_metadata(indexdata_file) if dtype_str == "float": - index = diskannpy.DynamicMemoryIndex( - "l2", np.float32, ndims, npts, Lb, graph_degree, False - ) - queries = utils.bin_to_numpy(np.float32, querydata_file) - data = utils.bin_to_numpy(np.float32, indexdata_file) + dtype = np.float32 elif dtype_str == "int8": - index = diskannpy.DynamicMemoryIndex( - "l2", np.int8, ndims, npts, Lb, graph_degree - ) - queries = utils.bin_to_numpy(np.int8, querydata_file) - data = utils.bin_to_numpy(np.int8, indexdata_file) + dtype = np.int8 elif dtype_str == "uint8": - index = diskannpy.DynamicMemoryIndex( - "l2", np.uint8, ndims, npts, Lb, graph_degree - ) - queries = utils.bin_to_numpy(np.uint8, querydata_file) - data = utils.bin_to_numpy(np.uint8, indexdata_file) + dtype = np.uint8 else: raise ValueError("data_type must be float, int8 or uint8") + index = diskannpy.DynamicMemoryIndex( + distance_metric="l2", + vector_dtype=dtype, + dimensions=ndims, + max_vectors=npts, + complexity=Lb, + graph_degree=graph_degree + ) + queries = diskannpy.vectors_from_file(querydata_file, dtype) + data = diskannpy.vectors_from_file(indexdata_file, 
dtype) + offsets, permutation = utils.cluster_and_permute( dtype_str, npts, ndims, data, num_clusters ) @@ -52,7 +51,7 @@ def insert_and_search( timer = utils.Timer() for c in range(num_clusters): cluster_index_range = range(offsets[c], offsets[c + 1]) - cluster_indices = np.array(permutation[cluster_index_range], dtype=np.uintc) + cluster_indices = np.array(permutation[cluster_index_range], dtype=np.uint32) cluster_data = data[cluster_indices, :] index.batch_insert(cluster_data, cluster_indices + 1, num_insert_threads) print('Inserted cluster', c, 'in', timer.elapsed(), 's') diff --git a/python/src/__init__.py b/python/src/__init__.py index bf0eb340d..c2e1b07f6 100644 --- a/python/src/__init__.py +++ b/python/src/__init__.py @@ -1,24 +1,138 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT license. -from ._builder import ( - build_disk_index, - build_memory_index, - numpy_to_diskann_file, -) -from ._common import ( - DistanceMetric, - QueryResponse, - QueryResponseBatch, - VectorDType, - VectorIdentifier, - VectorIdentifierBatch, - VectorLike, - VectorLikeBatch, - valid_dtype -) -from ._diskannpy import defaults +""" +# Documentation Overview +`diskannpy` is mostly structured around 2 distinct processes: [Index Builder Functions](#index-builders) and [Search Classes](#search-classes) + +It also includes a few nascent [utilities](#utilities). + +And lastly, it makes substantial use of type hints, with various shorthand [type aliases](#parameter-and-response-type-aliases) documented. +When reading the `diskannpy` code we refer to the type aliases, though `pdoc` helpfully expands them. 
+ +## Index Builders +- `build_disk_index` - To build an index that cannot fully fit into memory when searching +- `build_memory_index` - To build an index that can fully fit into memory when searching + +## Search Classes +- `StaticMemoryIndex` - for indices that can fully fit in memory and won't be changed during the search operations +- `StaticDiskIndex` - for indices that cannot fully fit in memory, thus relying on disk IO to search, and also won't be changed during search operations +- `DynamicMemoryIndex` - for indices that can fully fit in memory and will be mutated via insert/deletion operations as well as search operations + +## Parameter Defaults +- `diskannpy.defaults` - Default values exported from the C++ extension for Python users + +## Parameter and Response Type Aliases +- `DistanceMetric` - What distance metrics does `diskannpy` support? +- `VectorDType` - What vector datatypes does `diskannpy` support? +- `QueryResponse` - What can I expect as a response to my search? +- `QueryResponseBatch` - What can I expect as a response to my batch search? +- `VectorIdentifier` - What types do `diskannpy` support as vector identifiers? +- `VectorIdentifierBatch` - A batch of identifiers of the exact same type. The type can change, but they must **all** change. +- `VectorLike` - How does a vector look to `diskannpy`, to be inserted or searched with. +- `VectorLikeBatch` - A batch of those vectors, to be inserted or searched with. +- `Metadata` - DiskANN vector binary file metadata (num_points, vector_dim) + +## Utilities +- `vectors_to_file` - Turns a 2 dimensional `numpy.typing.NDArray[VectorDType]` with shape `(number_of_points, vector_dim)` into a DiskANN vector bin file. +- `vectors_from_file` - Reads a DiskANN vector bin file representing stored vectors into a numpy ndarray. 
+- `vectors_metadata_from_file` - Reads metadata stored in a DiskANN vector bin file without reading the entire file +- `tags_to_file` - Turns a 1 dimensional `numpy.typing.NDArray[VectorIdentifier]` into a DiskANN tags bin file. +- `tags_from_file` - Reads a DiskANN tags bin file representing stored tags into a numpy ndarray. +- `valid_dtype` - Checks if a given vector dtype is supported by `diskannpy` +""" + +from typing import Any, Literal, NamedTuple, Type, Union + +import numpy as np +from numpy import typing as npt + +DistanceMetric = Literal["l2", "mips", "cosine"] +""" Type alias for one of {"l2", "mips", "cosine"} """ +VectorDType = Union[Type[np.float32], Type[np.int8], Type[np.uint8]] +""" Type alias for one of {`numpy.float32`, `numpy.int8`, `numpy.uint8`} """ +VectorLike = npt.NDArray[VectorDType] +""" Type alias for something that can be treated as a vector """ +VectorLikeBatch = npt.NDArray[VectorDType] +""" Type alias for a batch of VectorLikes """ +VectorIdentifier = np.uint32 +""" +Type alias for a vector identifier, whether it be an implicit array index identifier from StaticMemoryIndex or +StaticDiskIndex, or an explicit tag identifier from DynamicMemoryIndex +""" +VectorIdentifierBatch = npt.NDArray[np.uint32] +""" Type alias for a batch of VectorIdentifiers """ + + +class QueryResponse(NamedTuple): + """ + Tuple with two values, identifiers and distances. Both are 1d arrays, positionally correspond, and will contain the + nearest neighbors from [0..k_neighbors) + """ + + identifiers: npt.NDArray[VectorIdentifier] + """ A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 1 dimensional """ + distances: npt.NDArray[np.float32] + """ + A `numpy.typing.NDAarray[numpy.float32]` of distances as calculated by the distance metric function, 1 dimensional + """ + + +class QueryResponseBatch(NamedTuple): + """ + Tuple with two values, identifiers and distances. 
Both are 2d arrays, with dimensionality determined by the + rows corresponding to the number of queries made, and the columns corresponding to the k neighbors + requested. The two 2d arrays have an implicit, position-based relationship + """ + + identifiers: npt.NDArray[VectorIdentifier] + """ + A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 2 dimensional. The row corresponds to index + of the query, and the column corresponds to the k neighbors requested + """ + distances: np.ndarray[np.float32] + """ + A `numpy.typing.NDAarray[numpy.float32]` of distances as calculated by the distance metric function, 2 dimensional. + The row corresponds to the index of the query, and the column corresponds to the distance of the query to the + *k-th* neighbor + """ + + +from . import defaults +from ._builder import build_disk_index, build_memory_index +from ._common import valid_dtype from ._dynamic_memory_index import DynamicMemoryIndex -from ._files import vectors_from_binary, vector_file_metadata +from ._files import ( + Metadata, + tags_from_file, + tags_to_file, + vectors_from_file, + vectors_metadata_from_file, + vectors_to_file, +) from ._static_disk_index import StaticDiskIndex from ._static_memory_index import StaticMemoryIndex + +__all__ = [ + "build_disk_index", + "build_memory_index", + "StaticDiskIndex", + "StaticMemoryIndex", + "DynamicMemoryIndex", + "defaults", + "DistanceMetric", + "VectorDType", + "QueryResponse", + "QueryResponseBatch", + "VectorIdentifier", + "VectorIdentifierBatch", + "VectorLike", + "VectorLikeBatch", + "Metadata", + "vectors_metadata_from_file", + "vectors_to_file", + "vectors_from_file", + "tags_to_file", + "tags_from_file", + "valid_dtype", +] diff --git a/python/src/_builder.py b/python/src/_builder.py index 8c9be32d9..18e9e9fa0 100644 --- a/python/src/_builder.py +++ b/python/src/_builder.py @@ -3,75 +3,47 @@ import os import shutil - from pathlib import Path -from typing import BinaryIO, Optional, Tuple, Union 
+from typing import Optional, Tuple, Union import numpy as np +from . import DistanceMetric, VectorDType, VectorIdentifierBatch, VectorLikeBatch from . import _diskannpy as _native_dap from ._common import ( - DistanceMetric, - VectorDType, - VectorLikeBatch, - VectorIdentifierBatch, _assert, - _assert_2d, - _assert_dtype, - _castable_dtype_or_raise, _assert_is_nonnegative_uint32, _assert_is_positive_uint32, + _castable_dtype_or_raise, _valid_metric, - _write_index_metadata + _write_index_metadata, + valid_dtype, ) -from ._files import vector_file_metadata from ._diskannpy import defaults - - -def _write_bin(data: np.ndarray, file_handler: BinaryIO): - if len(data.shape) == 1: - _ = file_handler.write(np.array([data.shape[0], 1], dtype=np.int32).tobytes()) - else: - _ = file_handler.write(np.array(data.shape, dtype=np.int32).tobytes()) - _ = file_handler.write(data.tobytes()) - - -def numpy_to_diskann_file(vectors: VectorLikeBatch, dtype: VectorDType, file_handler: BinaryIO): - """ - Utility function that writes a DiskANN binary vector formatted file to the location of your choosing. - - :param vectors: A 2d array of dtype ``numpy.single``, ``numpy.ubyte``, or ``numpy.byte`` - :type vectors: numpy.ndarray, dtype in set {numpy.single, numpy.ubyte, numpy.byte} - :param file_handler: An open binary file handler (typing.BinaryIO). 
- :type file_handler: io.BinaryIO - :raises ValueError: If vectors are the wrong shape or an unsupported dtype - :raises ValueError: If output_path is not a str or ``io.BinaryIO`` - """ - _assert_dtype(dtype) - _vectors = _castable_dtype_or_raise(vectors, expected=dtype, message=f"Unable to cast vectors to numpy array of type {dtype}") - _assert_2d(vectors, "vectors") - _write_bin(_vectors, file_handler) +from ._files import tags_to_file, vectors_metadata_from_file, vectors_to_file def _valid_path_and_dtype( - data: Union[str, VectorLikeBatch], vector_dtype: VectorDType, index_path: str + data: Union[str, VectorLikeBatch], + vector_dtype: VectorDType, + index_path: str, + index_prefix: str, ) -> Tuple[str, VectorDType]: if isinstance(data, str): vector_bin_path = data _assert( Path(data).exists() and Path(data).is_file(), "if data is of type `str`, it must both exist and be a file", - ) - vector_dtype_actual = vector_dtype + ) + vector_dtype_actual = valid_dtype(vector_dtype) else: - vector_bin_path = os.path.join(index_path, "vectors.bin") + vector_bin_path = os.path.join(index_path, f"{index_prefix}_vectors.bin") if Path(vector_bin_path).exists(): raise ValueError( f"The path {vector_bin_path} already exists. Remove it and try again." ) - with open(vector_bin_path, "wb") as temp_vector_bin: - numpy_to_diskann_file(vectors=data, dtype=data.dtype, file_handler=temp_vector_bin) - vector_dtype_actual = data.dtype + vector_dtype_actual = valid_dtype(data.dtype) + vectors_to_file(vector_file=vector_bin_path, vectors=data) return vector_bin_path, vector_dtype_actual @@ -88,51 +60,40 @@ def build_disk_index( pq_disk_bytes: int = defaults.PQ_DISK_BYTES, vector_dtype: Optional[VectorDType] = None, index_prefix: str = "ann", -): +) -> None: """ - This function will construct a DiskANN Disk Index and save it to disk. + This function will construct a DiskANN disk index. Disk indices are ideal for very large datasets that + are too large to fit in memory. 
Memory is still used, but it is primarily used to provide precise disk + locations for fast retrieval of smaller subsets of the index without compromising much on recall. If you provide a numpy array, it will save this array to disk in a temp location in the format DiskANN's PQ Flash Index builder requires. This temp folder is deleted upon index creation completion or error. - :param data: Either a ``str`` representing a path to a DiskANN vector bin file, or a numpy.ndarray, - of a supported dtype, in 2 dimensions. Note that vector_dtype must be provided if vector_path_or_np_array is a - ``str`` - :type data: Union[str, numpy.ndarray] - :param distance_metric: One of {"l2", "mips"}. L2 is supported for all 3 vector dtypes, but MIPS is only - available for single point floating numbers (numpy.single) - :type distance_metric: str - :param index_directory: The path on disk that the index will be created in. - :type index_directory: str - :param complexity: The size of queue to use when building the index for search. Values between 75 and 200 are - typical. Larger values will take more time to build but result in indices that provide higher recall for - the same search complexity. Use a value that is at least as large as R unless you are prepared to - somewhat compromise on quality - :type complexity: int - :param graph_degree: The degree of the graph index, typically between 60 and 150. A larger maximum degree will - result in larger indices and longer indexing times, but better search quality. - :type graph_degree int - :param search_memory_maximum: Build index with the expectation that the search will use at most - ``search_memory_maximum`` - :type search_memory_maximum: float - :param build_memory_maximum: Build index using at most ``build_memory_maximum`` - :type build_memory_maximum: float - :param num_threads: Number of threads to use when creating this index.0 indicates we should use all available - system threads. 
- :type num_threads: int - :param pq_disk_bytes: Use 0 to store uncompressed data on SSD. This allows the index to asymptote to 100% - recall. If your vectors are too large to store in SSD, this parameter provides the option to compress the - vectors using PQ for storing on SSD. This will trade off recall. You would also want this to be greater - than the number of bytes used for the PQ compressed data stored in-memory. Default is ``0``. - :type pq_disk_bytes: int (default = 0) - :param vector_dtype: Required if the provided ``vector_path_or_np_array`` is of type ``str``, else we use the - ``vector_path_or_np_array.dtype`` if np array. - :type vector_dtype: Optional[VectorDType], default is ``None``. - :param index_prefix: The prefix to give your index files. Defaults to ``ann``. - :type index_prefix: str, default="ann" - :raises ValueError: If vectors are not 2d numpy array or are not a supported dtype - :raises ValueError: If any numeric value is in an invalid range + ### Parameters + - **data**: Either a `str` representing a path to a DiskANN vector bin file, or a numpy.ndarray, + of a supported dtype, in 2 dimensions. Note that `vector_dtype` must be provided if data is a `str` + - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 + vector dtypes, but `mips` is only available for single precision floats. + - **index_directory**: The index files will be saved to this **existing** directory path + - **complexity**: The size of the candidate nearest neighbor list to use when building the index. Values between 75 + and 200 are typical. Larger values will take more time to build but result in indices that provide higher recall + for the same search complexity. Use a value that is at least as large as `graph_degree` unless you are prepared + to compromise on quality + - **graph_degree**: The degree of the graph index, typically between 60 and 150. 
A larger maximum degree will + result in larger indices and longer indexing times, but better search quality. + - **search_memory_maximum**: Build index with the expectation that the search will use at most + `search_memory_maximum`, in gb. + - **build_memory_maximum**: Build index using at most `build_memory_maximum` in gb. Building processes typically + require more memory, while search memory can be reduced. + - **num_threads**: Number of threads to use when creating this index. `0` is used to indicate all available + logical processors should be used. + - **pq_disk_bytes**: Use `0` to store uncompressed data on SSD. This allows the index to asymptote to 100% + recall. If your vectors are too large to store in SSD, this parameter provides the option to compress the + vectors using PQ for storing on SSD. This will trade off recall. You would also want this to be greater + than the number of bytes used for the PQ compressed data stored in-memory. Default is `0`. + - **vector_dtype**: Required if the provided `data` is of type `str`, else we use the `data.dtype` if np array. + - **index_prefix**: The prefix of the index files. Defaults to "ann". 
""" _assert( @@ -156,17 +117,17 @@ def build_disk_index( ) vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( - data, vector_dtype, index_directory + data, vector_dtype, index_directory, index_prefix ) - num_points, dimensions = vector_file_metadata(vector_bin_path) + num_points, dimensions = vectors_metadata_from_file(vector_bin_path) - if vector_dtype_actual == np.single: - _builder = _native_dap.build_disk_float_index - elif vector_dtype_actual == np.ubyte: + if vector_dtype_actual == np.uint8: _builder = _native_dap.build_disk_uint8_index - else: + elif vector_dtype_actual == np.int8: _builder = _native_dap.build_disk_int8_index + else: + _builder = _native_dap.build_disk_float_index index_prefix_path = os.path.join(index_directory, index_prefix) @@ -181,7 +142,9 @@ def build_disk_index( num_threads=num_threads, pq_disk_bytes=pq_disk_bytes, ) - _write_index_metadata(index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions) + _write_index_metadata( + index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions + ) def build_memory_index( @@ -198,46 +161,50 @@ def build_memory_index( vector_dtype: Optional[VectorDType] = None, filter_complexity: int = defaults.FILTER_COMPLEXITY, tags: Union[str, VectorIdentifierBatch] = "", - index_prefix: str = "ann" -): + index_prefix: str = "ann", +) -> None: """ - Builds a memory index and saves it to disk to be loaded into ``StaticMemoryIndex``. - - :param data: Either a ``str`` representing a path to a DiskANN vector bin file, or a numpy.ndarray, - of a supported dtype, in 2 dimensions. Note that vector_dtype must be provided if vector_path_or_np_array is a - ``str`` - :type data: Union[str, numpy.ndarray] - :param distance_metric: One of {"l2", "mips"}. L2 is supported for all 3 vector dtypes, but MIPS is only - available for single point floating numbers (numpy.single) - :type distance_metric: str - :param index_directory: The path on disk that the index will be created in. 
- :type index_directory: str - :param complexity: The size of queue to use when building the index for search. Values between 75 and 200 are - typical. Larger values will take more time to build but result in indices that provide higher recall for - the same search complexity. Use a value that is at least as large as R unless you are prepared to - somewhat compromise on quality - :type complexity: int - :param graph_degree: The degree of the graph index, typically between 60 and 150. A larger maximum degree will - result in larger indices and longer indexing times, but better search quality. - :type graph_degree int - :param num_threads: Number of threads to use when creating this index. 0 indicates we should use all available - system threads. - :type num_threads: int - :param alpha: - :param use_pq_build: - :param num_pq_bytes: - :param use_opq: - :param vector_dtype: Required if the provided ``vector_path_or_np_array`` is of type ``str``, else we use the - ``vector_path_or_np_array.dtype`` if np array. - :type vector_dtype: Optional[VectorDType], default is ``None``. - :param filter_complexity: Complexity to use when using filters. Default is 0. - :type filter_complexity: int - :param tags: uint32 ids corresponding to the ordinal position of the vectors provided to build the index. - Defaults to "". - :type tags: Union[str, VectorIdentifierBatch] - :param index_prefix: The prefix to give your index files. Defaults to ``ann``. - :type index_prefix: str, default="ann" - :return: + This function will construct a DiskANN memory index. Memory indices are ideal for smaller datasets whose + indices can fit into memory. Memory indices are faster than disk indices, but usually cannot scale to massive + sizes in an individual index on an individual machine. 
+ + `diskannpy`'s memory indices take two forms: a `diskannpy.StaticMemoryIndex`, which will not be mutated, only + searched upon, and a `diskannpy.DynamicMemoryIndex`, which can be mutated AND searched upon in the same process. + + ## Important Note: + You **must** determine the type of index you are building for. If you are building for a + `diskannpy.DynamicMemoryIndex`, you **must** supply a valid value for the `tags` parameter. **Do not supply + tags if the index is intended to be `diskannpy.StaticMemoryIndex`**! + + ### Parameters + + - **data**: Either a `str` representing a path to an existing DiskANN vector bin file, or a numpy.ndarray of a + supported dtype in 2 dimensions. Note that `vector_dtype` must be provided if `data` is a `str`. + - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 + vector dtypes, but `mips` is only available for single precision floats. + - **index_directory**: The index files will be saved to this **existing** directory path + - **complexity**: The size of the candidate nearest neighbor list to use when building the index. Values between 75 + and 200 are typical. Larger values will take more time to build but result in indices that provide higher recall + for the same search complexity. Use a value that is at least as large as `graph_degree` unless you are prepared + to compromise on quality + - **graph_degree**: The degree of the graph index, typically between 60 and 150. A larger maximum degree will + result in larger indices and longer indexing times, but better search quality. + - **num_threads**: Number of threads to use when creating this index. `0` is used to indicate all available + logical processors should be used. + - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the + graph. 
A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably more + distance comparisons compared to a lower alpha value. + - **use_pq_build**: Use product quantization during build. Product quantization is a lossy compression technique + that can reduce the size of the index on disk. This will trade off recall. Default is `True`. + - **num_pq_bytes**: The number of bytes used to store the PQ compressed data in memory. This will trade off recall. + Default is `0`. + - **use_opq**: Use optimized product quantization during build. + - **vector_dtype**: Required if the provided `data` is of type `str`, else we use the `data.dtype` if np array. + - **filter_complexity**: Complexity to use when using filters. Default is 0. + - **tags**: A `str` representing a path to a pre-built tags file on disk, or a `numpy.ndarray` of uint32 ids + corresponding to the ordinal position of the vectors provided to build the index. Defaults to "". **This value + must be provided if you want to build a memory index intended for use with `diskannpy.DynamicMemoryIndex`**. + - **index_prefix**: The prefix of the index files. Defaults to "ann". 
""" _assert( (isinstance(data, str) and vector_dtype is not None) @@ -247,7 +214,10 @@ def build_memory_index( dap_metric = _valid_metric(distance_metric) _assert_is_positive_uint32(complexity, "complexity") _assert_is_positive_uint32(graph_degree, "graph_degree") - _assert(alpha >= 1, "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)") + _assert( + alpha >= 1, + "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)", + ) _assert_is_nonnegative_uint32(num_threads, "num_threads") _assert_is_nonnegative_uint32(num_pq_bytes, "num_pq_bytes") _assert_is_nonnegative_uint32(filter_complexity, "filter_complexity") @@ -260,17 +230,17 @@ def build_memory_index( ) vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( - data, vector_dtype, index_directory + data, vector_dtype, index_directory, index_prefix ) - num_points, dimensions = vector_file_metadata(vector_bin_path) + num_points, dimensions = vectors_metadata_from_file(vector_bin_path) - if vector_dtype_actual == np.single: - _builder = _native_dap.build_memory_float_index - elif vector_dtype_actual == np.ubyte: + if vector_dtype_actual == np.uint8: _builder = _native_dap.build_memory_uint8_index - else: + elif vector_dtype_actual == np.int8: _builder = _native_dap.build_memory_int8_index + else: + _builder = _native_dap.build_memory_float_index index_prefix_path = os.path.join(index_directory, index_prefix) @@ -279,19 +249,14 @@ def build_memory_index( shutil.copy(tags, index_prefix_path + ".tags") elif not isinstance(tags, str): use_tags = True - tags_as_array = _castable_dtype_or_raise( - tags, - expected=np.uint32, - message="tags must be a numpy array of dtype np.uint32" - ) + tags_as_array = _castable_dtype_or_raise(tags, expected=np.uint32) _assert(len(tags_as_array.shape) == 1, "Provided tags must be 1 dimensional") _assert( tags_as_array.shape[0] == num_points, "Provided tags must contain an identical population to the number of points, " - 
f"{tags_as_array.shape[0]=}, {num_points=}" + f"{tags_as_array.shape[0]=}, {num_points=}", ) - with open(index_prefix_path + ".tags", "wb") as tags_out: - _write_bin(tags, tags_out) + tags_to_file(index_prefix_path + ".tags", tags_as_array) else: use_tags = False @@ -307,7 +272,9 @@ def build_memory_index( num_pq_bytes=num_pq_bytes, use_opq=use_opq, filter_complexity=filter_complexity, - use_tags=use_tags + use_tags=use_tags, ) - _write_index_metadata(index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions) + _write_index_metadata( + index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions + ) diff --git a/python/src/_builder.pyi b/python/src/_builder.pyi index 7527aebfd..5014880c6 100644 --- a/python/src/_builder.pyi +++ b/python/src/_builder.pyi @@ -1,11 +1,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT license. -from typing import BinaryIO, overload, Optional +from typing import BinaryIO, Optional, overload import numpy as np -from ._common import DistanceMetric, VectorDType, VectorLikeBatch, VectorIdentifierBatch +from . import DistanceMetric, VectorDType, VectorIdentifierBatch, VectorLikeBatch def numpy_to_diskann_file(vectors: np.ndarray, file_handler: BinaryIO): ... @overload @@ -21,7 +21,7 @@ def build_disk_index( pq_disk_bytes: int, vector_dtype: VectorDType, index_prefix: str, -): ... +) -> None: ... @overload def build_disk_index( data: VectorLikeBatch, @@ -34,7 +34,7 @@ def build_disk_index( num_threads: int, pq_disk_bytes: int, index_prefix: str, -): ... +) -> None: ... @overload def build_memory_index( data: VectorLikeBatch, @@ -52,7 +52,7 @@ def build_memory_index( filter_complexity: int, tags: Optional[VectorIdentifierBatch], index_prefix: str, -): ... +) -> None: ... @overload def build_memory_index( data: str, @@ -71,5 +71,4 @@ def build_memory_index( filter_complexity: int, tags: Optional[str], index_prefix: str, -): ... - +) -> None: ... 
diff --git a/python/src/_common.py b/python/src/_common.py index e2437a5c1..53f1dbcab 100644 --- a/python/src/_common.py +++ b/python/src/_common.py @@ -3,62 +3,37 @@ import os import warnings - from enum import Enum from pathlib import Path -from typing import List, Literal, NamedTuple, Optional, Tuple, Type, Union +from typing import Literal, NamedTuple, Optional, Tuple, Type, Union import numpy as np -import numpy.typing as npt +from . import ( + DistanceMetric, + VectorDType, + VectorIdentifierBatch, + VectorLike, + VectorLikeBatch, +) from . import _diskannpy as _native_dap -__ALL__ = [ - "DistanceMetric", - "QueryResponse", - "QueryResponseBatch", - "VectorDType", - "VectorLike", - "VectorLikeBatch", - "VectorIdentifier", - "VectorIdentifierBatch" -] +__ALL__ = ["valid_dtype"] _VALID_DTYPES = [np.float32, np.int8, np.uint8] -DistanceMetric = Literal["l2", "mips", "cosine"] -VectorDType = Union[Type[np.float32], Type[np.int8], Type[np.uint8]] -VectorLike = Union[List[int], List[float], npt.NDArray[VectorDType]] -VectorLikeBatch = Union[List[List[int]], List[List[float]], npt.NDArray[VectorDType]] -VectorIdentifier = Union[int, np.uintc] -VectorIdentifierBatch = Union[List[int], List[np.uintc], npt.NDArray[np.uintc]] - - -class QueryResponse(NamedTuple): - """ - Tuple with two values, distances and indices. Both are 1d arrays and positionally correspond - """ - distances: np.ndarray - indices: np.ndarray - -class QueryResponseBatch(NamedTuple): +def valid_dtype(dtype: Type) -> VectorDType: """ - Tuple with two values, distances and indices. Both are 2d arrays, with dimensionality determined by the - rows corresponding to the number of queries made, and the columns corresponding to the k neighbors - requested. The two 2d arrays have an implicit, position-based relationship + Utility method to determine whether the provided dtype is supported by `diskannpy`, and if so, the canonical + dtype we will use internally (e.g. 
np.single -> np.float32) """ - distances: np.ndarray - indices: np.ndarray - - -def valid_dtype(dtype: Type) -> VectorDType: _assert_dtype(dtype) - if np.can_cast(dtype, np.uint8): + if dtype == np.uint8: return np.uint8 - if np.can_cast(dtype, np.int8): + if dtype == np.int8: return np.int8 - if np.can_cast(dtype, np.float32): + if dtype == np.float32: return np.float32 @@ -88,21 +63,14 @@ def _assert_dtype(dtype: Type): def _castable_dtype_or_raise( - data: Union[VectorLike, VectorLikeBatch, VectorIdentifierBatch], - expected: np.dtype, - message: str + data: Union[VectorLike, VectorLikeBatch, VectorIdentifierBatch], expected: np.dtype ) -> np.ndarray: - if isinstance(data, list): - return np.array(data, dtype=expected) # may result in an overflow and invalid data, but at least warns - elif isinstance(data, np.ndarray): - try: - _vectors = data.astype(dtype=expected, casting="safe", copy=False) # we would prefer no copy - except TypeError as e: - e.args = (message, *e.args) - raise - return _vectors + if isinstance(data, np.ndarray) and np.can_cast(data.dtype, expected): + return data.astype(expected, casting="safe") else: - raise TypeError(f"expecting a VectorLike, VectorLikeBatch, or VectorIdentifierBatch, not a {type(data)}") + raise TypeError( + f"expecting a numpy ndarray of dtype {expected}, not a {type(data)}" + ) def _assert_2d(vectors: np.ndarray, name: str): @@ -152,11 +120,11 @@ class _DataType(Enum): @classmethod def from_type(cls, vector_dtype: VectorDType) -> "DataType": - if vector_dtype == np.single: + if vector_dtype == np.float32: return cls.FLOAT32 - if vector_dtype == np.byte: + if vector_dtype == np.int8: return cls.INT8 - if vector_dtype == np.ubyte: + if vector_dtype == np.uint8: return cls.UINT8 def to_type(self) -> VectorDType: @@ -204,25 +172,37 @@ def _build_metadata_path(index_path_and_prefix: str) -> str: def _write_index_metadata( - index_path_and_prefix: str, - dtype: VectorDType, - metric: _native_dap.Metric, - num_points: int, - 
dimensions: int + index_path_and_prefix: str, + dtype: VectorDType, + metric: _native_dap.Metric, + num_points: int, + dimensions: int, ): np.array( - [_DataType.from_type(dtype).value, _Metric.from_native(metric).value, num_points, dimensions], - dtype=np.uint64 + [ + _DataType.from_type(dtype).value, + _Metric.from_native(metric).value, + num_points, + dimensions, + ], + dtype=np.uint64, ).tofile(_build_metadata_path(index_path_and_prefix)) -def _read_index_metadata(index_path_and_prefix: str) -> Optional[Tuple[VectorDType, str, np.uint64, np.uint64]]: +def _read_index_metadata( + index_path_and_prefix: str, +) -> Optional[Tuple[VectorDType, str, np.uint64, np.uint64]]: path = _build_metadata_path(index_path_and_prefix) if not Path(path).exists(): return None else: metadata = np.fromfile(path, dtype=np.uint64, count=-1) - return _DataType(int(metadata[0])).to_type(), _Metric(int(metadata[1])).to_str(), metadata[2], metadata[3] + return ( + _DataType(int(metadata[0])).to_type(), + _Metric(int(metadata[1])).to_str(), + metadata[2], + metadata[3], + ) def _ensure_index_metadata( @@ -238,7 +218,7 @@ def _ensure_index_metadata( all([vector_dtype, distance_metric, dimensions]), "distance_metric, vector_dtype, and dimensions must provided if a corresponding metadata file has not " "been built for this index, such as when an index was built via the CLI tools or prior to the addition " - "of a metadata file" + "of a metadata file", ) _assert_dtype(vector_dtype) _assert_is_positive_uint32(max_vectors, "max_vectors") @@ -260,7 +240,10 @@ def _ensure_index_metadata( def _valid_index_prefix(index_directory: str, index_prefix: str) -> str: - _assert(index_directory is not None and index_directory != "", "index_directory cannot be None or empty") + _assert( + index_directory is not None and index_directory != "", + "index_directory cannot be None or empty", + ) _assert_existing_directory(index_directory, "index_directory") _assert(index_prefix != "", "index_prefix cannot be an 
empty string") return os.path.join(index_directory, index_prefix) diff --git a/python/src/_dynamic_memory_index.py b/python/src/_dynamic_memory_index.py index dc7fd2978..9570b8345 100644 --- a/python/src/_dynamic_memory_index.py +++ b/python/src/_dynamic_memory_index.py @@ -3,14 +3,12 @@ import os import warnings - -import numpy as np - from pathlib import Path from typing import Optional -from . import _diskannpy as _native_dap -from ._common import ( +import numpy as np + +from . import ( DistanceMetric, QueryResponse, QueryResponseBatch, @@ -19,6 +17,9 @@ VectorIdentifierBatch, VectorLike, VectorLikeBatch, +) +from . import _diskannpy as _native_dap +from ._common import ( _assert, _assert_2d, _assert_dtype, @@ -27,9 +28,9 @@ _assert_is_positive_uint32, _castable_dtype_or_raise, _ensure_index_metadata, - _valid_metric, _valid_index_prefix, - _write_index_metadata + _valid_metric, + _write_index_metadata, ) from ._diskannpy import defaults @@ -37,6 +38,18 @@ class DynamicMemoryIndex: + """ + A DynamicMemoryIndex instance is used to both search and mutate a `diskannpy` memory index. This index is unlike + either `diskannpy.StaticMemoryIndex` or `diskannpy.StaticDiskIndex` in the following ways: + + - It requires an explicit vector identifier for each vector added to it. + - Insert and (lazy) deletion operations are provided for a flexible, living index + + The mutable aspect of this index will absolutely impact search time performance as new vectors are added and + old deleted. `DynamicMemoryIndex.consolidate_deletes()` should be called periodically to restructure the index + to remove deleted vectors and improve per-search performance, at the cost of an expensive index consolidation to + occur. + """ @classmethod def from_file( @@ -59,17 +72,79 @@ def from_file( vector_dtype: Optional[VectorDType] = None, dimensions: Optional[int] = None, ) -> "DynamicMemoryIndex": + """ + The `from_file` classmethod is used to load a previously saved index from disk. 
This index *must* have been + created with a valid `tags` file or `tags` np.ndarray of `diskannpy.VectorIdentifier`s. It is *strongly* + recommended that you use the same parameters as the `diskannpy.build_memory_index()` function that created + the index. + + ### Parameters + - **index_directory**: The directory containing the index files. This directory must contain the following + files: + - `{index_prefix}.data` + - `{index_prefix}.tags` + - `{index_prefix}` + + It may also include the following optional files: + - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the + `index_directory` if the index was created from a numpy array + - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata + about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality. + If an index is built from the `diskann` cli tools, this file will not exist. + - **max_vectors**: Capacity of the memory index including space for future insertions. + - **complexity**: Complexity (a.k.a `L`) references the size of the list we store candidate approximate + neighbors in. It's used during save (which is an index rebuild), and it's used as an initial search size to + warm up our index and lower the latency for initial real searches. + - **graph_degree**: Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph + structure. This degree will be pruned throughout the course of the index build, but it will never grow beyond + this value. Higher R values require longer index build times, but may result in an index showing excellent + recall and latency characteristics. + - **saturate_graph**: If True, the adjacency list of each node will be saturated with neighbors to have exactly + `graph_degree` neighbors. If False, each node will have between 1 and `graph_degree` neighbors. 
+ - **max_occlusion_size**: The maximum number of points that can be considered by occlude_list function. + - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the + graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably + more distance comparisons compared to a lower alpha value. + - **num_threads**: Number of threads to use when creating this index. `0` indicates we should use all available + logical processors. + - **filter_complexity**: Complexity to use when using filters. Default is 0. + - **num_frozen_points**: Number of points to freeze. Default is 1. + - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the + life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of + `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search` + operation requests a space larger than can be accommodated by these values. + - **search_threads**: Should be set to the most common `num_threads` expected to be used during the + life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of + `initial_search_complexity` * `search_threads`. Note that it may be resized if a `batch_search` + operation requests a space larger than can be accommodated by these values. + - **concurrent_consolidation**: This flag dictates whether consolidation can be run alongside inserts and + deletes, or whether the index is locked down to changes while consolidation is ongoing. + - **index_prefix**: The prefix of the index files. Defaults to "ann". + - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 + vector dtypes, but `mips` is only available for single precision floats. Default is `None`. 
**This + value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, + you are required to provide it. + - **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a + `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it. + - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same + dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it + does not exist, you are required to provide it. + + ### Returns + A `diskannpy.DynamicMemoryIndex` object, with the index loaded from disk and ready to use for insertions, + deletions, and searches. + + """ index_prefix_path = _valid_index_prefix(index_directory, index_prefix) # do tags exist? tags_file = index_prefix_path + ".tags" - _assert(Path(tags_file).exists(), f"The file {tags_file} does not exist in {index_directory}") + _assert( + Path(tags_file).exists(), + f"The file {tags_file} does not exist in {index_directory}", + ) vector_dtype, dap_metric, num_vectors, dimensions = _ensure_index_metadata( - index_prefix_path, - vector_dtype, - distance_metric, - max_vectors, - dimensions + index_prefix_path, vector_dtype, distance_metric, max_vectors, dimensions ) index = cls( @@ -87,9 +162,10 @@ def from_file( num_frozen_points=num_frozen_points, initial_search_complexity=initial_search_complexity, search_threads=search_threads, - concurrent_consolidation=concurrent_consolidation + concurrent_consolidation=concurrent_consolidation, ) index._index.load(index_prefix_path) + index._num_vectors = num_vectors # current number of vectors loaded return index def __init__( @@ -108,58 +184,50 @@ def __init__( num_frozen_points: int = defaults.NUM_FROZEN_POINTS_DYNAMIC, initial_search_complexity: int = 0, search_threads: int = 0, - concurrent_consolidation: bool = True + concurrent_consolidation: bool = True, ): """ - 
The diskannpy.DynamicMemoryIndex represents our python API into a dynamic DiskANN InMemory Index library. - - This dynamic index is unlike the DiskIndex and StaticMemoryIndex, in that after loading it you can continue - to insert and delete vectors. - - Deletions are completed lazily, until the user executes `DynamicMemoryIndex.consolidate_deletes()` - :param distance_metric: If it exists, must be one of {"l2", "mips", "cosine"}. L2 is supported for all 3 vector dtypes, - but MIPS is only available for single point floating numbers (numpy.single). Default is ``None``. - :type distance_metric: str - :param vector_dtype: The vector dtype this index will be exposing. - :type vector_dtype: Union[Type[numpy.single], Type[numpy.byte], Type[numpy.ubyte]] - :param dimensions: The vector dimensionality of this index. All new vectors inserted must be the same - dimensionality. - :type dimensions: int - :param max_vectors: Capacity of the data store including space for future insertions - :type max_vectors: int - :param graph_degree: The degree of the graph index, typically between 60 and 150. A larger maximum degree will - result in larger indices and longer indexing times, but better search quality. - :type graph_degree: int - :param saturate_graph: - :type saturate_graph: bool - :param max_occlusion_size: - :type max_occlusion_size: int - :param alpha: - :type alpha: float - :param num_threads: - :type num_threads: int - :param filter_complexity: - :type filter_complexity: int - :param num_frozen_points: - :type num_frozen_points: int - :param initial_search_complexity: The working scratch memory allocated is predicated off of - initial_search_complexity * search_threads. If a larger list_size * num_threads value is - ultimately provided by the individual action executed in `batch_query` than provided in this constructor, - the scratch space is extended. 
If a smaller list_size * num_threads is provided by the action than the - constructor, the pre-allocated scratch space is used as-is. - :type initial_search_complexity: int - :param search_threads: Should be set to the most common batch_query num_threads size. The working - scratch memory allocated is predicated off of initial_search_list_size * initial_search_threads. If a - larger list_size * num_threads value is ultimately provided by the individual action executed in - `batch_query` than provided in this constructor, the scratch space is extended. If a smaller - list_size * num_threads is provided by the action than the constructor, the pre-allocated scratch space - is used as-is. - :type search_threads: int - :param concurrent_consolidation: - :type concurrent_consolidation: bool + The `diskannpy.DynamicMemoryIndex` represents our python API into a mutable DiskANN memory index. + + This constructor is used to create a new, empty index. If you wish to load a previously saved index from disk, + please use the `diskannpy.DynamicMemoryIndex.from_file` classmethod instead. + + ### Parameters + - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 + vector dtypes, but `mips` is only available for single precision floats. + - **vector_dtype**: One of {`np.float32`, `np.int8`, `np.uint8`}. The dtype of the vectors this index will + be storing. + - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same + dimensionality. + - **max_vectors**: Capacity of the data store including space for future insertions + - **graph_degree**: Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph + structure. This degree will be pruned throughout the course of the index build, but it will never grow beyond + this value. 
Higher `graph_degree` values require longer index build times, but may result in an index showing + excellent recall and latency characteristics. + - **saturate_graph**: If True, the adjacency list of each node will be saturated with neighbors to have exactly + `graph_degree` neighbors. If False, each node will have between 1 and `graph_degree` neighbors. + - **max_occlusion_size**: The maximum number of points that can be considered by occlude_list function. + - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the + graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably + more distance comparisons compared to a lower alpha value. + - **num_threads**: Number of threads to use when creating this index. `0` indicates we should use all available + logical processors. + - **filter_complexity**: Complexity to use when using filters. Default is 0. + - **num_frozen_points**: Number of points to freeze. Default is 1. + - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the + life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of + `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search` + operation requests a space larger than can be accommodated by these values. + - **search_threads**: Should be set to the most common `num_threads` expected to be used during the + life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of + `initial_search_complexity` * `search_threads`. Note that it may be resized if a `batch_search` + operation requests a space larger than can be accommodated by these values. 
+ - **concurrent_consolidation**: This flag dictates whether consolidation can be run alongside inserts and + deletes, or whether the index is locked down to changes while consolidation is ongoing. """ - + self._num_vectors = 0 + self._removed_num_vectors = 0 dap_metric = _valid_metric(distance_metric) self._dap_metric = dap_metric _assert_dtype(vector_dtype) @@ -171,7 +239,10 @@ def __init__( _assert_is_positive_uint32(max_vectors, "max_vectors") _assert_is_positive_uint32(complexity, "complexity") _assert_is_positive_uint32(graph_degree, "graph_degree") - _assert(alpha >= 1, "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)") + _assert( + alpha >= 1, + "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)", + ) _assert_is_nonnegative_uint32(max_occlusion_size, "max_occlusion_size") _assert_is_nonnegative_uint32(num_threads, "num_threads") _assert_is_nonnegative_uint32(filter_complexity, "filter_complexity") @@ -181,12 +252,17 @@ def __init__( ) _assert_is_nonnegative_uint32(search_threads, "search_threads") - if vector_dtype == np.single: - _index = _native_dap.DynamicMemoryFloatIndex - elif vector_dtype == np.ubyte: + self._max_vectors = max_vectors + self._complexity = complexity + self._graph_degree = graph_degree + + if vector_dtype == np.uint8: _index = _native_dap.DynamicMemoryUInt8Index - else: + elif vector_dtype == np.int8: _index = _native_dap.DynamicMemoryInt8Index + else: + _index = _native_dap.DynamicMemoryFloatIndex + self._index = _index( distance_metric=dap_metric, dimensions=dimensions, @@ -201,7 +277,7 @@ def __init__( num_frozen_points=num_frozen_points, initial_search_complexity=initial_search_complexity, search_threads=search_threads, - concurrent_consolidation=concurrent_consolidation + concurrent_consolidation=concurrent_consolidation, ) self._points_deleted = False @@ -209,31 +285,21 @@ def search( self, query: VectorLike, k_neighbors: int, complexity: int ) -> QueryResponse: """ - Searches the 
disk index by a single query vector in a 1d numpy array. - - numpy array dtype must match index. - - :param query: 1d numpy array of the same dimensionality and dtype of the index. - :type query: VectorLike - :param k_neighbors: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. (> 0) - :type k_neighbors: int - :param complexity: Size of list to use while searching. List size increases accuracy at the cost of latency. Must - be at least k_neighbors in size. - :type complexity: int - :return: Returns a tuple of 1-d numpy ndarrays; the first including the indices of the approximate nearest - neighbors, the second their distances. These are aligned arrays. + Searches the index by a single query vector. + + ### Parameters + - **query**: 1d numpy array of the same dimensionality and dtype of the index. + - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely + will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. + - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size + increases accuracy at the cost of latency. Must be at least k_neighbors in size. 
""" - _query = _castable_dtype_or_raise( - query, - expected=self._vector_dtype, - message=f"StaticMemoryIndex expected a query vector of dtype of {self._vector_dtype}" - ) + _query = _castable_dtype_or_raise(query, expected=self._vector_dtype) _assert(len(_query.shape) == 1, "query vector must be 1-d") _assert( _query.shape[0] == self._dimensions, f"query vector must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_query.shape[0]}" + f"query dimensionality: {_query.shape[0]}", ) _assert_is_positive_uint32(k_neighbors, "k_neighbors") _assert_is_nonnegative_uint32(complexity, "complexity") @@ -246,37 +312,32 @@ def search( return self._index.search(query=_query, knn=k_neighbors, complexity=complexity) def batch_search( - self, queries: VectorLikeBatch, k_neighbors: int, complexity: int, num_threads: int + self, + queries: VectorLikeBatch, + k_neighbors: int, + complexity: int, + num_threads: int, ) -> QueryResponseBatch: """ - Searches the disk index for many query vectors in a 2d numpy array. - - numpy array dtype must match index. + Searches the index by a batch of query vectors. This search is parallelized and far more efficient than searching for each vector individually. - :param queries: 2d numpy array, with column dimensionality matching the index and row dimensionality being the - number of queries intended to search for in parallel. Dtype must match dtype of the index. - :type queries: VectorLike - :param k_neighbors: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. (> 0) - :type k_neighbors: int - :param complexity: Size of list to use while searching. List size increases accuracy at the cost of latency. Must - be at least k_neighbors in size. - :type complexity: int - :param num_threads: Number of threads to use when searching this index. 
(>= 0), 0 = num_threads in system - :type num_threads: int - :return: Returns a tuple of 2-d numpy ndarrays; each row corresponds to the query vector in the same index, - and elements in row corresponding from 1..k_neighbors approximate nearest neighbors. The second ndarray - contains the distances, of the same form: row index will match query index, column index refers to - 1..k_neighbors distance. These are aligned arrays. + ### Parameters + - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the + number of queries intended to search for in parallel. Dtype must match dtype of the index. + - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely + will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. + - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size + increases accuracy at the cost of latency. Must be at least k_neighbors in size. + - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system """ - _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype, message=f"DynamicMemoryIndex expected a query vector of dtype of {self._vector_dtype}") + _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype) _assert_2d(_queries, "queries") _assert( _queries.shape[1] == self._dimensions, f"query vectors must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_queries.shape[1]}" + f"query dimensionality: {_queries.shape[1]}", ) _assert_is_positive_uint32(k_neighbors, "k_neighbors") @@ -301,15 +362,17 @@ def batch_search( def save(self, save_path: str, index_prefix: str = "ann"): """ Saves this index to file. - :param save_path: The path to save these index files to. 
- :type save_path: str - :param index_prefix: The prefix to use for the index files. Default is "ann". - :type index_prefix: str + + ### Parameters + - **save_path**: The path to save these index files to. + - **index_prefix**: The prefix of the index files. Defaults to "ann". """ if save_path == "": raise ValueError("save_path cannot be empty") if index_prefix == "": raise ValueError("index_prefix cannot be empty") + + index_prefix = index_prefix.format(complexity=self._complexity, graph_degree=self._graph_degree) _assert_existing_directory(save_path, "save_path") save_path = os.path.join(save_path, index_prefix) if self._points_deleted is True: @@ -321,56 +384,120 @@ def save(self, save_path: str, index_prefix: str = "ann"): "required." ) self._index.consolidate_delete() - self._index.save(save_path=save_path, compact_before_save=True) # we do not yet support uncompacted saves - _write_index_metadata(save_path, self._vector_dtype, self._dap_metric, self._index.num_points(), self._dimensions) + self._index.save( + save_path=save_path, compact_before_save=True + ) # we do not yet support uncompacted saves + _write_index_metadata( + save_path, + self._vector_dtype, + self._dap_metric, + self._index.num_points(), + self._dimensions, + ) def insert(self, vector: VectorLike, vector_id: VectorIdentifier): """ Inserts a single vector into the index with the provided vector_id. - :param vector: The vector to insert. Note that dtype must match. - :type vector: VectorLike - :param vector_id: The vector_id to use for this vector. + + If this insertion will overrun the `max_vectors` count boundaries of this index, `consolidate_delete()` will + be executed automatically. + + ### Parameters + - **vector**: The vector to insert. Note that dtype must match. + - **vector_id**: The vector_id to use for this vector. 
""" - _vector = _castable_dtype_or_raise(vector, expected=self._vector_dtype, message=f"DynamicMemoryIndex expected a query vector of dtype of {self._vector_dtype}") + _vector = _castable_dtype_or_raise(vector, expected=self._vector_dtype) _assert(len(vector.shape) == 1, "insert vector must be 1-d") _assert_is_positive_uint32(vector_id, "vector_id") - return self._index.insert(_vector, np.uintc(vector_id)) + if self._num_vectors + 1 > self._max_vectors: + if self._removed_num_vectors > 0: + warnings.warn(f"Inserting this vector would overrun the max_vectors={self._max_vectors} specified at index " + f"construction. We are attempting to consolidate_delete() to make space.") + self.consolidate_delete() + else: + raise RuntimeError(f"Inserting this vector would overrun the max_vectors={self._max_vectors} specified " + f"at index construction. Unable to make space by consolidating deletions. The insert" + f"operation has failed.") + status = self._index.insert(_vector, np.uint32(vector_id)) + if status == 0: + self._num_vectors += 1 + else: + raise RuntimeError( + f"Insert was unable to complete successfully; error code returned from diskann C++ lib: {status}" + ) + def batch_insert( - self, vectors: VectorLikeBatch, vector_ids: VectorIdentifierBatch, num_threads: int = 0 + self, + vectors: VectorLikeBatch, + vector_ids: VectorIdentifierBatch, + num_threads: int = 0, ): """ - :param vectors: The 2d numpy array of vectors to insert. - :type vectors: np.ndarray - :param vector_ids: The 1d array of vector ids to use. This array must have the same number of elements as - the vectors array has rows. The dtype of vector_ids must be ``np.uintc`` (or any alias that is your - platform's equivalent) - :param num_threads: Number of threads to use when inserting into this index. (>= 0), 0 = num_threads in system - :type num_threads: int + Inserts a batch of vectors into the index with the provided vector_ids. 
+ + If this batch insertion will overrun the `max_vectors` count boundaries of this index, `consolidate_delete()` + will be executed automatically. + + ### Parameters + - **vectors**: The 2d numpy array of vectors to insert. + - **vector_ids**: The 1d array of vector ids to use. This array must have the same number of elements as + the vectors array has rows. The dtype of vector_ids must be `np.uint32` + - **num_threads**: Number of threads to use when inserting into this index. (>= 0), 0 = num_threads in system """ - _query = _castable_dtype_or_raise(vectors, expected=self._vector_dtype, message=f"DynamicMemoryIndex expected a query vector of dtype of {self._vector_dtype}") + _query = _castable_dtype_or_raise(vectors, expected=self._vector_dtype) _assert(len(vectors.shape) == 2, "vectors must be a 2-d array") _assert( - vectors.shape[0] == vector_ids.shape[0], "Number of vectors must be equal to number of ids" + vectors.shape[0] == vector_ids.shape[0], + "Number of vectors must be equal to number of ids", ) _vectors = vectors.astype(dtype=self._vector_dtype, casting="safe", copy=False) - _vector_ids = vector_ids.astype(dtype=np.uintc, casting="safe", copy=False) - - return self._index.batch_insert( + _vector_ids = vector_ids.astype(dtype=np.uint32, casting="safe", copy=False) + + if self._num_vectors + _vector_ids.shape[0] > self._max_vectors: + if self._max_vectors + self._removed_num_vectors >= _vector_ids.shape[0]: + warnings.warn(f"Inserting these vectors, count={_vector_ids.shape[0]} would overrun the " + f"max_vectors={self._max_vectors} specified at index construction. We are attempting to " + f"consolidate_delete() to make space.") + self.consolidate_delete() + else: + raise RuntimeError(f"Inserting these vectors count={_vector_ids.shape[0]} would overrun the " + f"max_vectors={self._max_vectors} specified at index construction. Unable to make " + f"space by consolidating deletions. 
The batch insert operation has failed.") + + statuses = self._index.batch_insert( _vectors, _vector_ids, _vector_ids.shape[0], num_threads ) + successes = [] + failures = [] + for i in range(0, len(statuses)): + if statuses[i] == 0: + successes.append(i) + else: + failures.append(i) + self._num_vectors += len(successes) + if len(failures) == 0: + return + failed_ids = vector_ids[failures] + raise RuntimeError( + f"During batch insert, the following vector_ids were unable to be inserted into the index: {failed_ids}. " + f"{len(successes)} were successfully inserted" + ) + def mark_deleted(self, vector_id: VectorIdentifier): """ Mark vector for deletion. This is a soft delete that won't return the vector id in any results, but does not remove it from the underlying index files or memory structure. To execute a hard delete, call this method and - then call the much more expensive ``consolidate_delete`` method on this index. - :param vector_id: The vector id to delete. Must be a uint32. - :type vector_id: int + then call the much more expensive `consolidate_delete` method on this index. + ### Parameters + - **vector_id**: The vector id to delete. Must be a uint32. """ _assert_is_positive_uint32(vector_id, "vector_id") self._points_deleted = True - self._index.mark_deleted(np.uintc(vector_id)) + self._removed_num_vectors += 1 + # we do not decrement self._num_vectors until consolidate_delete + self._index.mark_deleted(np.uint32(vector_id)) def consolidate_delete(self): """ @@ -378,3 +505,5 @@ def consolidate_delete(self): """ self._index.consolidate_delete() self._points_deleted = False + self._num_vectors -= self._removed_num_vectors + self._removed_num_vectors = 0 diff --git a/python/src/_files.py b/python/src/_files.py index 32f118d0c..1c9fa2103 100644 --- a/python/src/_files.py +++ b/python/src/_files.py @@ -1,26 +1,111 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT license. 
+import warnings +from typing import BinaryIO, NamedTuple + import numpy as np import numpy.typing as npt -from typing import NamedTuple - -from ._common import VectorDType, _assert_existing_file +from . import VectorDType, VectorIdentifierBatch, VectorLikeBatch +from ._common import _assert, _assert_2d, _assert_dtype, _assert_existing_file class Metadata(NamedTuple): + """DiskANN binary vector files contain a small stanza containing some metadata about them.""" + num_vectors: int + """ The number of vectors in the file. """ dimensions: int + """ The dimensionality of the vectors in the file. """ -def vector_file_metadata(vector_file: str) -> Metadata: +def vectors_metadata_from_file(vector_file: str) -> Metadata: + """ + Read the metadata from a DiskANN binary vector file. + ### Parameters + - **vector_file**: The path to the vector file to read the metadata from. + + ### Returns + `diskannpy.Metadata` + """ _assert_existing_file(vector_file, "vector_file") - points, dims = np.fromfile(file=vector_file, dtype=np.uintc, count=2) + points, dims = np.fromfile(file=vector_file, dtype=np.int32, count=2) return Metadata(points, dims) -def vectors_from_binary(vector_file: str, dtype: VectorDType) -> npt.NDArray[VectorDType]: - points, dims = vector_file_metadata(vector_file) +def _write_bin(data: np.ndarray, file_handler: BinaryIO): + if len(data.shape) == 1: + _ = file_handler.write(np.array([data.shape[0], 1], dtype=np.int32).tobytes()) + else: + _ = file_handler.write(np.array(data.shape, dtype=np.int32).tobytes()) + _ = file_handler.write(data.tobytes()) + + +def vectors_to_file(vector_file: str, vectors: VectorLikeBatch) -> None: + """ + Utility function that writes a DiskANN binary vector formatted file to the location of your choosing. + + ### Parameters + - **vector_file**: The path to the vector file to write the vectors to. 
+ - **vectors**: A 2d array of dtype `numpy.float32`, `numpy.uint8`, or `numpy.int8` + """ + _assert_dtype(vectors.dtype) + _assert_2d(vectors, "vectors") + with open(vector_file, "wb") as fh: + _write_bin(vectors, fh) + + +def vectors_from_file(vector_file: str, dtype: VectorDType) -> npt.NDArray[VectorDType]: + """ + Read vectors from a DiskANN binary vector file. + + ### Parameters + - **vector_file**: The path to the vector file to read the vectors from. + - **dtype**: The data type of the vectors in the file. Ensure you match the data types exactly + + ### Returns + `numpy.typing.NDArray[dtype]` + """ + points, dims = vectors_metadata_from_file(vector_file) return np.fromfile(file=vector_file, dtype=dtype, offset=8).reshape(points, dims) + +def tags_to_file(tags_file: str, tags: VectorIdentifierBatch) -> None: + """ + Write tags to a DiskANN binary tag file. + + ### Parameters + - **tags_file**: The path to the tag file to write the tags to. + - **tags**: A 1d array of dtype `numpy.uint32` containing the tags to write. If you have a 2d array of tags with + one column, you can pass it here and it will be reshaped and copied to a new array. It is more efficient for you + to reshape on your own without copying it first, as it should be a constant time operation vs. linear time + + """ + _assert(np.can_cast(tags.dtype, np.uint32), "valid tags must be uint32") + _assert( + len(tags.shape) == 1 or tags.shape[1] == 1, + "tags must be 1d or 2d with 1 column", + ) + if len(tags.shape) == 2: + warnings.warn( + "Tags in 2d with one column will be reshaped and copied to a new array. " + "It is more efficient for you to reshape without copying first." + ) + tags = tags.reshape(tags.shape[0], copy=True) + with open(tags_file, "wb") as fh: + _write_bin(tags.astype(np.uint32), fh) + + +def tags_from_file(tags_file: str) -> VectorIdentifierBatch: + """ + Read tags from a DiskANN binary tag file and return them as a 1d array of dtype `numpy.uint32`. 
+ + ### Parameters + - **tags_file**: The path to the tag file to read the tags from. + """ + _assert_existing_file(tags_file, "tags_file") + points, dims = vectors_metadata_from_file( + tags_file + ) # tag files contain the same metadata stanza + return np.fromfile(file=tags_file, dtype=np.uint32, offset=8).reshape(points) diff --git a/python/src/_static_disk_index.py b/python/src/_static_disk_index.py index 9111ffcee..1ca93c0a4 100644 --- a/python/src/_static_disk_index.py +++ b/python/src/_static_disk_index.py @@ -7,14 +7,16 @@ import numpy as np -from . import _diskannpy as _native_dap -from ._common import ( +from . import ( DistanceMetric, QueryResponse, QueryResponseBatch, VectorDType, VectorLike, VectorLikeBatch, +) +from . import _diskannpy as _native_dap +from ._common import ( _assert, _assert_2d, _assert_is_nonnegative_uint32, @@ -29,6 +31,10 @@ class StaticDiskIndex: + """ + A StaticDiskIndex is a disk-backed index that is not mutable. + """ + def __init__( self, index_directory: str, @@ -41,29 +47,37 @@ def __init__( index_prefix: str = "ann", ): """ - The diskannpy.DiskIndex represents our python API into the DiskANN Product Quantization Flash Index library. - - This class is responsible for searching a DiskANN disk index. - - :param metric: One of {"l2", "mips"}. L2 is supported for all 3 vector dtypes, but MIPS is only - available for single point floating numbers (numpy.single) - :type metric: str - :param vector_dtype: The vector dtype this index will be exposing. 
- :type vector_dtype: Type[numpy.single], Type[numpy.byte], Type[numpy.ubyte] - :param index_directory: Path on disk where the disk index is stored - :type index_directory: str - :param num_threads: Number of threads used to load the index (>= 0) - :type num_threads: int - :param num_nodes_to_cache: Number of nodes to cache in memory (> -1) - :type num_nodes_to_cache: int - :param cache_mechanism: 1 -> use the generated sample_data.bin file for - the index to initialize a set of cached nodes, up to ``num_nodes_to_cache``, 2 -> ready the cache for up to - ``num_nodes_to_cache``, but do not initialize it with any nodes. Any other value disables node caching. - :param index_prefix: A shared prefix that all files in this index will use. Default is "ann". - :type index_prefix: str - :raises ValueError: If metric is not a valid metric - :raises ValueError: If vector dtype is not a supported dtype - :raises ValueError: If num_threads or num_nodes_to_cache is an invalid range. + ### Parameters + - **index_directory**: The directory containing the index files. This directory must contain the following + files: + - `{index_prefix}_sample_data.bin` + - `{index_prefix}_mem.index.data` + - `{index_prefix}_pq_compressed.bin` + - `{index_prefix}_pq_pivots.bin` + - `{index_prefix}_sample_ids.bin` + - `{index_prefix}_disk.index` + + It may also include the following optional files: + - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the + `index_directory` if the index was created from a numpy array + - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata + about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality. + If an index is built from the `diskann` cli tools, this file will not exist. + - **num_threads**: Number of threads to use when searching this index. 
(>= 0), 0 = num_threads in system + - **num_nodes_to_cache**: Number of nodes to cache in memory (> -1) + - **cache_mechanism**: 1 -> use the generated sample_data.bin file for + the index to initialize a set of cached nodes, up to `num_nodes_to_cache`, 2 -> ready the cache for up to + `num_nodes_to_cache`, but do not initialize it with any nodes. Any other value disables node caching. + - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 + vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This + value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, + you are required to provide it. + - **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a + `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it. + - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same + dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it + does not exist, you are required to provide it. + - **index_prefix**: The prefix of the index files. Defaults to "ann". 
""" index_prefix = _valid_index_prefix(index_directory, index_prefix) vector_dtype, metric, _, _ = _ensure_index_metadata( @@ -71,7 +85,7 @@ def __init__( vector_dtype, distance_metric, 1, # it doesn't matter because we don't need it in this context anyway - dimensions + dimensions, ) dap_metric = _valid_metric(metric) @@ -79,12 +93,12 @@ def __init__( _assert_is_nonnegative_uint32(num_nodes_to_cache, "num_nodes_to_cache") self._vector_dtype = vector_dtype - if vector_dtype == np.single: - _index = _native_dap.StaticDiskFloatIndex - elif vector_dtype == np.ubyte: + if vector_dtype == np.uint8: _index = _native_dap.StaticDiskUInt8Index - else: + elif vector_dtype == np.int8: _index = _native_dap.StaticDiskInt8Index + else: + _index = _native_dap.StaticDiskFloatIndex self._index = _index( distance_metric=dap_metric, index_path_prefix=os.path.join(index_directory, index_prefix), @@ -97,33 +111,22 @@ def search( self, query: VectorLike, k_neighbors: int, complexity: int, beam_width: int = 2 ) -> QueryResponse: """ - Searches the disk index by a single query vector in a 1d numpy array. - - numpy array dtype must match index. - - :param query: 1d numpy array of the same dimensionality and dtype of the index. - :type query: numpy.ndarray - :param k_neighbors: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. (> 0) - :type k_neighbors: int - :param complexity: Size of list to use while searching. List size increases accuracy at the cost of latency. Must - be at least k_neighbors in size. - :type complexity: int - :param beam_width: The beamwidth to be used for search. This is the maximum number of IO requests each query - will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, - but might result in slightly higher total number of IO requests to SSD per query. 
For the highest query - throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search. - Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will - involve some tuning overhead. - :type beam_width: int - :return: Returns a tuple of 1-d numpy ndarrays; the first including the indices of the approximate nearest - neighbors, the second their distances. These are aligned arrays. + Searches the index by a single query vector. + + ### Parameters + - **query**: 1d numpy array of the same dimensionality and dtype of the index. + - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely + will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. + - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size + increases accuracy at the cost of latency. Must be at least k_neighbors in size. + - **beam_width**: The beamwidth to be used for search. This is the maximum number of IO requests each query + will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, + but might result in slightly higher total number of IO requests to SSD per query. For the highest query + throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search. + Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will + involve some tuning overhead. 
""" - _query = _castable_dtype_or_raise( - query, - expected=self._vector_dtype, - message=f"DiskIndex expected a query vector of dtype of {self._vector_dtype}" - ) + _query = _castable_dtype_or_raise(query, expected=self._vector_dtype) _assert(len(_query.shape) == 1, "query vector must be 1-d") _assert_is_positive_uint32(k_neighbors, "k_neighbors") _assert_is_positive_uint32(complexity, "complexity") @@ -151,40 +154,26 @@ def batch_search( beam_width: int = 2, ) -> QueryResponseBatch: """ - Searches the disk index for many query vectors in a 2d numpy array. - - numpy array dtype must match index. + Searches the index by a batch of query vectors. This search is parallelized and far more efficient than searching for each vector individually. - :param queries: 2d numpy array, with column dimensionality matching the index and row dimensionality being the - number of queries intended to search for in parallel. Dtype must match dtype of the index. - :type queries: numpy.ndarray - :param k_neighbors: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. (> 0) - :type k_neighbors: int - :param complexity: Size of list to use while searching. List size increases accuracy at the cost of latency. Must - be at least k_neighbors in size. - :type complexity: int - :param num_threads: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system - :type num_threads: int - :param beam_width: The beamwidth to be used for search. This is the maximum number of IO requests each query - will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, - but might result in slightly higher total number of IO requests to SSD per query. For the highest query - throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search. 
- Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will - involve some tuning overhead. - :type beam_width: int - :return: Returns a tuple of 2-d numpy ndarrays; each row corresponds to the query vector in the same index, - and elements in row corresponding from 1..k_neighbors approximate nearest neighbors. The second ndarray - contains the distances, of the same form: row index will match query index, column index refers to - 1..k_neighbors distance. These are aligned arrays. + ### Parameters + - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the + number of queries intended to search for in parallel. Dtype must match dtype of the index. + - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely + will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. + - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size + increases accuracy at the cost of latency. Must be at least k_neighbors in size. + - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system + - **beam_width**: The beamwidth to be used for search. This is the maximum number of IO requests each query + will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, + but might result in slightly higher total number of IO requests to SSD per query. For the highest query + throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search. + Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will + involve some tuning overhead. 
""" - _queries = _castable_dtype_or_raise( - queries, - expected=self._vector_dtype, - message=f"DiskIndex expected a query vector of dtype of {self._vector_dtype}" - ) + _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype) _assert_2d(_queries, "queries") _assert_is_positive_uint32(k_neighbors, "k_neighbors") _assert_is_positive_uint32(complexity, "complexity") diff --git a/python/src/_static_memory_index.py b/python/src/_static_memory_index.py index c570b4e30..8b87cd561 100644 --- a/python/src/_static_memory_index.py +++ b/python/src/_static_memory_index.py @@ -3,32 +3,37 @@ import os import warnings +from typing import Optional import numpy as np -from typing import Optional - -from . import _diskannpy as _native_dap -from ._common import ( +from . import ( DistanceMetric, QueryResponse, QueryResponseBatch, VectorDType, VectorLike, VectorLikeBatch, +) +from . import _diskannpy as _native_dap +from ._common import ( _assert, _assert_is_nonnegative_uint32, _assert_is_positive_uint32, _castable_dtype_or_raise, _ensure_index_metadata, _valid_index_prefix, - _valid_metric + _valid_metric, ) __ALL__ = ["StaticMemoryIndex"] class StaticMemoryIndex: + """ + A StaticMemoryIndex is an immutable in-memory DiskANN index. + """ + def __init__( self, index_directory: str, @@ -40,19 +45,34 @@ def __init__( dimensions: Optional[int] = None, ): """ - The diskannpy.StaticMemoryIndex represents our python API into a static DiskANN InMemory Index library. - - This static index is intended for searching. - - :param index_directory: The directory the index files reside in - :type index_directory: str - :param initial_search_complexity: A positive integer that tunes how much work should be completed in the - conduct of a search. This can be overridden on a per search basis, but this initial value allows us - to pre-allocate a search scratch space. It is suggested that you set this value to the P95 of your - search complexity values. 
- :type initial_search_complexity: int
-        :param index_prefix: A shared prefix that all files in this index will use. Default is "ann".
-        :type index_prefix: str
+        ### Parameters
+        - **index_directory**: The directory containing the index files. This directory must contain the following
+            files:
+            - `{index_prefix}.data`
+            - `{index_prefix}`
+
+
+          It may also include the following optional files:
+            - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the
+              `index_directory` if the index was created from a numpy array
+            - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata
+              about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality.
+              If an index is built from the `diskann` cli tools, this file will not exist.
+        - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system
+        - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the
+            life of this `diskannpy.StaticMemoryIndex` object. The working scratch memory allocated is based off of
+            `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search`
+            operation requests a space larger than can be accommodated by these values.
+        - **index_prefix**: The prefix of the index files. Defaults to "ann".
+        - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3
+            vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This
+            value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist,
+            you are required to provide it.
+        - **vector_dtype**: The vector dtype this index has been built with.
**This value is only used if a + `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it. + - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same + dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it + does not exist, you are required to provide it. """ index_prefix = _valid_index_prefix(index_directory, index_prefix) vector_dtype, metric, num_points, dims = _ensure_index_metadata( @@ -60,7 +80,7 @@ def __init__( vector_dtype, distance_metric, 1, # it doesn't matter because we don't need it in this context anyway - dimensions + dimensions, ) dap_metric = _valid_metric(metric) @@ -72,12 +92,13 @@ def __init__( self._vector_dtype = vector_dtype self._dimensions = dims - if vector_dtype == np.single: - _index = _native_dap.StaticMemoryFloatIndex - elif vector_dtype == np.ubyte: + if vector_dtype == np.uint8: _index = _native_dap.StaticMemoryUInt8Index - else: + elif vector_dtype == np.int8: _index = _native_dap.StaticMemoryInt8Index + else: + _index = _native_dap.StaticMemoryFloatIndex + self._index = _index( distance_metric=dap_metric, num_points=num_points, @@ -87,40 +108,25 @@ def __init__( initial_search_complexity=initial_search_complexity, ) - def search(self, query: VectorLike, k_neighbors: int, complexity: int) -> QueryResponse: + def search( + self, query: VectorLike, k_neighbors: int, complexity: int + ) -> QueryResponse: """ - Searches the static in memory index by a single query vector in a 1d numpy array. - - numpy array dtype must match index. - - :param query: 1d numpy array of the same dimensionality and dtype of the index. - :type query: numpy.ndarray - :param k_neighbors: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. 
(> 0) - :type k_neighbors: int - :param complexity: Size of list to use while searching. List size increases accuracy at the cost of latency. Must - be at least k_neighbors in size. - :type complexity: int - :param beam_width: The beamwidth to be used for search. This is the maximum number of IO requests each query - will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, - but might result in slightly higher total number of IO requests to SSD per query. For the highest query - throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search. - Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will - involve some tuning overhead. - :type beam_width: int - :return: Returns a tuple of 1-d numpy ndarrays; the first including the indices of the approximate nearest - neighbors, the second their distances. These are aligned arrays. + Searches the index by a single query vector. + + ### Parameters + - **query**: 1d numpy array of the same dimensionality and dtype of the index. + - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely + will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. + - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size + increases accuracy at the cost of latency. Must be at least k_neighbors in size. 
""" - _query = _castable_dtype_or_raise( - query, - expected=self._vector_dtype, - message=f"StaticMemoryIndex expected a query vector of dtype of {self._vector_dtype}" - ) + _query = _castable_dtype_or_raise(query, expected=self._vector_dtype) _assert(len(_query.shape) == 1, "query vector must be 1-d") _assert( _query.shape[0] == self._dimensions, f"query vector must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_query.shape[0]}" + f"query dimensionality: {_query.shape[0]}", ) _assert_is_positive_uint32(k_neighbors, "k_neighbors") _assert_is_nonnegative_uint32(complexity, "complexity") @@ -133,38 +139,33 @@ def search(self, query: VectorLike, k_neighbors: int, complexity: int) -> QueryR return self._index.search(query=_query, knn=k_neighbors, complexity=complexity) def batch_search( - self, queries: VectorLikeBatch, k_neighbors: int, complexity: int, num_threads: int + self, + queries: VectorLikeBatch, + k_neighbors: int, + complexity: int, + num_threads: int, ) -> QueryResponseBatch: """ - Searches the static, in memory index for many query vectors in a 2d numpy array. - - numpy array dtype must match index. + Searches the index by a batch of query vectors. This search is parallelized and far more efficient than searching for each vector individually. - :param queries: 2d numpy array, with column dimensionality matching the index and row dimensionality being the - number of queries intended to search for in parallel. Dtype must match dtype of the index. - :type queries: numpy.ndarray - :param k_neighbors: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. (> 0) - :type k_neighbors: int - :param complexity: Size of list to use while searching. List size increases accuracy at the cost of latency. Must - be at least k_neighbors in size. 
- :type complexity: int - :param num_threads: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system - :type num_threads: int - :return: Returns a tuple of 2-d numpy ndarrays; each row corresponds to the query vector in the same index, - and elements in row corresponding from 1..k_neighbors approximate nearest neighbors. The second ndarray - contains the distances, of the same form: row index will match query index, column index refers to - 1..k_neighbors distance. These are aligned arrays. + ### Parameters + - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the + number of queries intended to search for in parallel. Dtype must match dtype of the index. + - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely + will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. + - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size + increases accuracy at the cost of latency. Must be at least k_neighbors in size. + - **num_threads**: Number of threads to use when searching this index. 
(>= 0), 0 = num_threads in system """ - _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype, message=f"StaticMemoryIndex expected a query vector of dtype of {self._vector_dtype}") + _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype) _assert(len(_queries.shape) == 2, "queries must must be 2-d np array") _assert( _queries.shape[1] == self._dimensions, f"query vectors must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_queries.shape[1]}" + f"query dimensionality: {_queries.shape[1]}", ) _assert_is_positive_uint32(k_neighbors, "k_neighbors") _assert_is_positive_uint32(complexity, "complexity") diff --git a/python/src/defaults.py b/python/src/defaults.py new file mode 100644 index 000000000..4e22983fd --- /dev/null +++ b/python/src/defaults.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +""" +# Parameter Defaults +These parameter defaults are re-exported from the C++ extension module, and used to keep the pythonic wrapper in sync with the C++. +""" +from ._diskannpy import defaults as _defaults + +ALPHA = _defaults.ALPHA +""" +Note that, as ALPHA is a `float32` (single precision float) in C++, when converted into Python it becomes a +`float64` (double precision float). The actual value is 1.2f. The alpha parameter (>=1) is used to control the nature +and number of points that are added to the graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) +to convergence, but probably more distance comparisons compared to a lower alpha value. +""" +NUM_THREADS = _defaults.NUM_THREADS +""" Number of threads to use. `0` will use all available detected logical processors """ +MAX_OCCLUSION_SIZE = _defaults.MAX_OCCLUSION_SIZE +""" +The maximum number of points that can be occluded by a single point. This is used to prevent a single point from +dominating the graph structure. 
If a point has more than `max_occlusion_size` neighbors closer to it than the current +point, it will not be added to the graph. This is a tradeoff between index build time and search quality. +""" +FILTER_COMPLEXITY = _defaults.FILTER_COMPLEXITY +""" +Complexity (a.k.a. `L`) references the size of the list we store candidate approximate neighbors in while doing a +filtered search. This value must be larger than `k_neighbors`, and larger values tend toward higher recall in the +resultant ANN search at the cost of more time. +""" +NUM_FROZEN_POINTS_STATIC = _defaults.NUM_FROZEN_POINTS_STATIC +""" Number of points frozen by default in a StaticMemoryIndex """ +NUM_FROZEN_POINTS_DYNAMIC = _defaults.NUM_FROZEN_POINTS_DYNAMIC +""" Number of points frozen by default in a DynamicMemoryIndex """ +SATURATE_GRAPH = _defaults.SATURATE_GRAPH +""" Whether to saturate the graph or not. Default is `True` """ +GRAPH_DEGREE = _defaults.GRAPH_DEGREE +""" +Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph structure. This degree will be +pruned throughout the course of the index build, but it will never grow beyond this value. Higher R values require +longer index build times, but may result in an index showing excellent recall and latency characteristics. +""" +COMPLEXITY = _defaults.COMPLEXITY +""" +Complexity (a.k.a `L`) references the size of the list we store candidate approximate neighbors in while doing build +or search tasks. It's used during index build as part of the index optimization processes. It's used in index search +classes both to help mitigate poor latencies during cold start, as well as on subsequent queries to conduct the search. +Large values will likely increase latency but also may improve recall, and tuning these values for your particular +index is certainly a reasonable choice. +""" +PQ_DISK_BYTES = _defaults.PQ_DISK_BYTES +""" +Use `0` to store uncompressed data on SSD. This allows the index to asymptote to 100% recall. 
If your vectors are
+too large to store in SSD, this parameter provides the option to compress the vectors using PQ for storing on SSD.
+This will trade off recall. You would also want this to be greater than the number of bytes used for the PQ
+compressed data stored in-memory. Default is `0`.
+"""
+USE_PQ_BUILD = _defaults.USE_PQ_BUILD
+"""
+Whether to use product quantization in the index building process. Product quantization is an approximation
+technique that can vastly speed up vector computations and comparisons in a spatial neighborhood, but it is still an
+approximation technique. It should be preferred when index creation times take longer than you can afford for your
+use case.
+"""
+NUM_PQ_BYTES = _defaults.NUM_PQ_BYTES
+"""
+The number of product quantization bytes to use. More bytes requires more resources in both memory and time, but is
+likely to result in better approximations.
+"""
+USE_OPQ = _defaults.USE_OPQ
+""" Whether to use Optimized Product Quantization or not. """
diff --git a/python/tests/fixtures/build_memory_index.py b/python/tests/fixtures/build_memory_index.py
index ccfdb1f6c..3c30bed25 100644
--- a/python/tests/fixtures/build_memory_index.py
+++ b/python/tests/fixtures/build_memory_index.py
@@ -2,7 +2,6 @@
 # Licensed under the MIT license.
 
import os - from tempfile import mkdtemp import diskannpy as dap @@ -12,11 +11,7 @@ def build_random_vectors_and_memory_index( - dtype, - metric, - with_tags: bool = False, - index_prefix: str = "ann", - seed: int = 12345 + dtype, metric, with_tags: bool = False, index_prefix: str = "ann", seed: int = 12345 ): query_vectors: np.ndarray = random_vectors(1000, 10, dtype=dtype, seed=seed) index_vectors: np.ndarray = random_vectors(10000, 10, dtype=dtype, seed=seed) @@ -24,7 +19,7 @@ def build_random_vectors_and_memory_index( if with_tags: rng = np.random.default_rng(seed) - tags = np.arange(start=1, stop=10001, dtype=np.uintc) + tags = np.arange(start=1, stop=10001, dtype=np.uint32) rng.shuffle(tags) else: tags = "" @@ -42,7 +37,7 @@ def build_random_vectors_and_memory_index( use_opq=False, filter_complexity=32, tags=tags, - index_prefix=index_prefix + index_prefix=index_prefix, ) return ( @@ -52,5 +47,5 @@ def build_random_vectors_and_memory_index( index_vectors, ann_dir, os.path.join(ann_dir, "vectors.bin"), - tags + tags, ) diff --git a/python/tests/fixtures/create_test_data.py b/python/tests/fixtures/create_test_data.py index 6e390bd2f..44e413ed6 100644 --- a/python/tests/fixtures/create_test_data.py +++ b/python/tests/fixtures/create_test_data.py @@ -11,18 +11,18 @@ def random_vectors(rows: int, dimensions: int, dtype, seed: int = 12345) -> np.ndarray: rng = np.random.default_rng(seed) - if dtype == np.single: + if dtype == np.float32: vectors = rng.random((rows, dimensions), dtype=dtype) - elif dtype == np.ubyte: + elif dtype == np.uint8: vectors = rng.integers( low=0, high=256, size=(rows, dimensions), dtype=dtype ) # low is inclusive, high is exclusive - elif dtype == np.byte: + elif dtype == np.int8: vectors = rng.integers( low=-128, high=128, size=(rows, dimensions), dtype=dtype ) # low is inclusive, high is exclusive else: - raise RuntimeError("Only np.single, np.byte, and np.ubyte are supported") + raise RuntimeError("Only np.float32, np.int8, and np.uint8 
are supported") return vectors diff --git a/python/tests/test_dynamic_memory_index.py b/python/tests/test_dynamic_memory_index.py index ec3737702..ff9c8981d 100644 --- a/python/tests/test_dynamic_memory_index.py +++ b/python/tests/test_dynamic_memory_index.py @@ -4,6 +4,7 @@ import shutil import tempfile import unittest +import warnings import diskannpy as dap import numpy as np @@ -12,18 +13,19 @@ def _calculate_recall( - result_set_tags: np.ndarray, - original_indices_to_tags: np.ndarray, - truth_set_indices: np.ndarray, - recall_at: int = 5 + result_set_tags: np.ndarray, + original_indices_to_tags: np.ndarray, + truth_set_indices: np.ndarray, + recall_at: int = 5, ) -> float: - found = 0 for i in range(0, result_set_tags.shape[0]): result_set_set = set(result_set_tags[i][0:recall_at]) truth_set_set = set() for knn_index in truth_set_indices[i][0:recall_at]: - truth_set_set.add(original_indices_to_tags[knn_index]) # mapped into our tag number instead + truth_set_set.add( + original_indices_to_tags[knn_index] + ) # mapped into our tag number instead found += len(result_set_set.intersection(truth_set_set)) return found / (result_set_tags.shape[0] * recall_at) @@ -32,12 +34,12 @@ class TestDynamicMemoryIndex(unittest.TestCase): @classmethod def setUpClass(cls) -> None: cls._test_matrix = [ - build_random_vectors_and_memory_index(np.single, "l2", with_tags=True), - build_random_vectors_and_memory_index(np.ubyte, "l2", with_tags=True), - build_random_vectors_and_memory_index(np.byte, "l2", with_tags=True), - build_random_vectors_and_memory_index(np.single, "cosine", with_tags=True), - build_random_vectors_and_memory_index(np.ubyte, "cosine", with_tags=True), - build_random_vectors_and_memory_index(np.byte, "cosine", with_tags=True), + build_random_vectors_and_memory_index(np.float32, "l2", with_tags=True), + build_random_vectors_and_memory_index(np.uint8, "l2", with_tags=True), + build_random_vectors_and_memory_index(np.int8, "l2", with_tags=True), + 
build_random_vectors_and_memory_index(np.float32, "cosine", with_tags=True), + build_random_vectors_and_memory_index(np.uint8, "cosine", with_tags=True), + build_random_vectors_and_memory_index(np.int8, "cosine", with_tags=True), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -58,9 +60,9 @@ def test_recall_and_batch(self): index_vectors, ann_dir, vector_bin_file, - generated_tags + generated_tags, ) in self._test_matrix: - with self.subTest(): + with self.subTest(msg=f"Testing dtype {dtype}"): index = dap.DynamicMemoryIndex.from_file( index_directory=ann_dir, max_vectors=11_000, @@ -82,7 +84,9 @@ def test_recall_and_batch(self): ) knn.fit(index_vectors) knn_distances, knn_indices = knn.kneighbors(query_vectors) - recall = _calculate_recall(diskann_neighbors, generated_tags, knn_indices, k) + recall = _calculate_recall( + diskann_neighbors, generated_tags, knn_indices, k + ) self.assertTrue( recall > 0.70, f"Recall [{recall}] was not over 0.7", @@ -96,9 +100,9 @@ def test_single(self): index_vectors, ann_dir, vector_bin_file, - generated_tags + generated_tags, ) in self._test_matrix: - with self.subTest(): + with self.subTest(msg=f"Testing dtype {dtype}"): index = dap.DynamicMemoryIndex( distance_metric="l2", vector_dtype=dtype, @@ -114,9 +118,6 @@ def test_single(self): ids, dists = index.search(query_vectors[0], k_neighbors=k, complexity=5) self.assertEqual(ids.shape[0], k) self.assertEqual(dists.shape[0], k) - ids, dists = index.search(query_vectors[0].tolist(), k_neighbors=k, complexity=5) - self.assertEqual(ids.shape[0], k) - self.assertEqual(dists.shape[0], k) def test_valid_metric(self): with self.assertRaises(ValueError): @@ -176,7 +177,7 @@ def test_valid_vector_dtype(self): index_vectors, ann_dir, vector_bin_file, - generated_tags + generated_tags, ) in self._test_matrix: with self.subTest(): index = dap.DynamicMemoryIndex( @@ -211,8 +212,10 @@ def test_value_ranges_ctor(self): index_vectors, ann_dir, vector_bin_file, - generated_tags - ) = 
build_random_vectors_and_memory_index(np.single, "l2", with_tags=True, index_prefix="not_ann") + generated_tags, + ) = build_random_vectors_and_memory_index( + np.single, "l2", with_tags=True, index_prefix="not_ann" + ) good_ranges = { "distance_metric": "l2", "vector_dtype": np.single, @@ -226,7 +229,7 @@ def test_value_ranges_ctor(self): "filter_complexity": 10, "num_frozen_points": 10, "initial_search_complexity": 32, - "search_threads": 0 + "search_threads": 0, } bad_ranges = { @@ -248,7 +251,10 @@ def test_value_ranges_ctor(self): kwargs = good_ranges.copy() kwargs[bad_value_key] = bad_ranges[bad_value_key] with self.subTest(): - with self.assertRaises(ValueError, msg=f"expected to fail with parameter {bad_value_key}={bad_ranges[bad_value_key]}"): + with self.assertRaises( + ValueError, + msg=f"expected to fail with parameter {bad_value_key}={bad_ranges[bad_value_key]}", + ): index = dap.DynamicMemoryIndex(saturate_graph=False, **kwargs) def test_value_ranges_search(self): @@ -265,7 +271,7 @@ def test_value_ranges_search(self): initial_search_complexity=32, max_vectors=10001, complexity=64, - graph_degree=32 + graph_degree=32, ) index.search(query=np.array([], dtype=np.single), **kwargs) @@ -291,7 +297,7 @@ def test_value_ranges_batch_search(self): initial_search_complexity=32, max_vectors=10001, complexity=64, - graph_degree=32 + graph_degree=32, ) index.batch_search( queries=np.array([[]], dtype=np.single), **kwargs @@ -310,18 +316,26 @@ def test_issue400(self): initial_search_complexity=32, max_vectors=10100, complexity=64, - graph_degree=32 + graph_degree=32, ) index.insert(np.array([1.0] * 10, dtype=np.single), 10099) index.insert(np.array([2.0] * 10, dtype=np.single), 10050) index.insert(np.array([3.0] * 10, dtype=np.single), 10053) - tags, distances = index.search(np.array([3.0] * 10, dtype=np.single), k_neighbors=5, complexity=64) + tags, distances = index.search( + np.array([3.0] * 10, dtype=np.single), k_neighbors=5, complexity=64 + ) 
self.assertIn(10053, tags) tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64) - self.assertIn(deletion_tag, tags, "deletion_tag should exist, as we have not deleted yet") + self.assertIn( + deletion_tag, tags, "deletion_tag should exist, as we have not deleted yet" + ) index.mark_deleted(deletion_tag) tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64) - self.assertNotIn(deletion_tag, tags, "deletion_tag should not exist, as we have marked it for deletion") + self.assertNotIn( + deletion_tag, + tags, + "deletion_tag should not exist, as we have marked it for deletion", + ) with tempfile.TemporaryDirectory() as tmpdir: index.save(tmpdir) @@ -331,11 +345,96 @@ def test_issue400(self): initial_search_complexity=32, max_vectors=10100, complexity=64, - graph_degree=32 + graph_degree=32, + ) + tags, distances = index2.search( + deletion_vector, k_neighbors=5, complexity=64 ) - tags, distances = index2.search(deletion_vector, k_neighbors=5, complexity=64) self.assertNotIn( deletion_tag, tags, - "deletion_tag should not exist, as we saved and reloaded the index without it" + "deletion_tag should not exist, as we saved and reloaded the index without it", ) + + def test_inserts_past_max_vectors(self): + def _tiny_index(): + return dap.DynamicMemoryIndex( + distance_metric="l2", + vector_dtype=np.float32, + dimensions=10, + max_vectors=2, + complexity=64, + graph_degree=32, + num_threads=16, + ) + + + rng = np.random.default_rng(12345) + + # insert 3 vectors and look for an exception + index = _tiny_index() + index.insert(rng.random(10, dtype=np.float32), 1) + index.insert(rng.random(10, dtype=np.float32), 2) + with self.assertRaises(RuntimeError): + index.insert(rng.random(10, dtype=np.float32), 3) + + # insert 2 vectors, delete 1, and insert another and expect a warning + index = _tiny_index() + index.insert(rng.random(10, dtype=np.float32), 1) + index.insert(rng.random(10, dtype=np.float32), 2) + index.mark_deleted(2) + 
with self.assertWarns(UserWarning): + self.assertEqual(index._removed_num_vectors, 1) + self.assertEqual(index._num_vectors, 2) + index.insert(rng.random(10, dtype=np.float32), 3) + self.assertEqual(index._removed_num_vectors, 0) + self.assertEqual(index._num_vectors, 2) + + # insert 3 batch and look for an exception + index = _tiny_index() + with self.assertRaises(RuntimeError): + index.batch_insert( + rng.random((3, 10), dtype=np.float32), + np.array([1,2,3], dtype=np.uint32) + ) + + + # insert 2 batch, remove 1, add 1 and expect a warning, remove 1, insert 2 batch and look for an exception + index = _tiny_index() + index.batch_insert( + rng.random((2, 10), dtype=np.float32), + np.array([1,2], dtype=np.uint32) + ) + index.mark_deleted(1) + with self.assertWarns(UserWarning): + index.insert(rng.random(10, dtype=np.float32), 3) + index.mark_deleted(2) + with self.assertRaises(RuntimeError): + index.batch_insert(rng.random((2,10), dtype=np.float32), np.array([4, 5], dtype=np.uint32)) + + # insert 1, remove it, add 2 batch, and expect a warning + index = _tiny_index() + index.insert(rng.random(10, dtype=np.float32), 1) + index.mark_deleted(1) + with self.assertWarns(UserWarning): + index.batch_insert(rng.random((2, 10), dtype=np.float32), np.array([10, 20], dtype=np.uint32)) + + # insert 2 batch, remove both, add 2 batch, and expect a warning + index = _tiny_index() + index.batch_insert(rng.random((2,10), dtype=np.float32), np.array([10, 20], dtype=np.uint32)) + index.mark_deleted(10) + index.mark_deleted(20) + with self.assertWarns(UserWarning): + index.batch_insert(rng.random((2, 10), dtype=np.float32), np.array([15, 25], dtype=np.uint32)) + + # insert 2 batch, remove both, consolidate_delete, add 2 batch and do not expect warning + index = _tiny_index() + index.batch_insert(rng.random((2,10), dtype=np.float32), np.array([10, 20], dtype=np.uint32)) + index.mark_deleted(10) + index.mark_deleted(20) + index.consolidate_delete() + with warnings.catch_warnings(): + 
warnings.simplefilter("error") # turns warnings into raised exceptions + index.batch_insert(rng.random((2, 10), dtype=np.float32), np.array([15, 25], dtype=np.uint32)) + + diff --git a/python/tests/test_static_disk_index.py b/python/tests/test_static_disk_index.py index 6cff484da..4ba544106 100644 --- a/python/tests/test_static_disk_index.py +++ b/python/tests/test_static_disk_index.py @@ -35,9 +35,9 @@ class TestStaticDiskIndex(unittest.TestCase): @classmethod def setUpClass(cls) -> None: cls._test_matrix = [ - _build_random_vectors_and_index(np.single, "l2"), - _build_random_vectors_and_index(np.ubyte, "l2"), - _build_random_vectors_and_index(np.byte, "l2"), + _build_random_vectors_and_index(np.float32, "l2"), + _build_random_vectors_and_index(np.uint8, "l2"), + _build_random_vectors_and_index(np.int8, "l2"), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -52,7 +52,7 @@ def tearDownClass(cls) -> None: def test_recall_and_batch(self): for metric, dtype, query_vectors, index_vectors, ann_dir in self._test_matrix: - with self.subTest(): + with self.subTest(msg=f"Testing dtype {dtype}"): index = dap.StaticDiskIndex( distance_metric="l2", vector_dtype=dtype, @@ -83,7 +83,7 @@ def test_recall_and_batch(self): def test_single(self): for metric, dtype, query_vectors, index_vectors, ann_dir in self._test_matrix: - with self.subTest(): + with self.subTest(msg=f"Testing dtype {dtype}"): index = dap.StaticDiskIndex( distance_metric="l2", vector_dtype=dtype, diff --git a/python/tests/test_static_memory_index.py b/python/tests/test_static_memory_index.py index 782466a84..cb7f0f01d 100644 --- a/python/tests/test_static_memory_index.py +++ b/python/tests/test_static_memory_index.py @@ -40,9 +40,9 @@ def test_recall_and_batch(self): index_vectors, ann_dir, vector_bin_file, - _ + _, ) in self._test_matrix: - with self.subTest(): + with self.subTest(msg=f"Testing dtype {dtype}"): index = dap.StaticMemoryIndex( index_directory=ann_dir, num_threads=16, @@ -66,7 +66,7 @@ def 
test_recall_and_batch(self): self.assertTrue( recall > 0.70, f"Recall [{recall}] was not over 0.7", - ) + ) def test_single(self): for ( @@ -76,9 +76,9 @@ def test_single(self): index_vectors, ann_dir, vector_bin_file, - _ + _, ) in self._test_matrix: - with self.subTest(): + with self.subTest(msg=f"Testing dtype {dtype}"): index = dap.StaticMemoryIndex( index_directory=ann_dir, num_threads=16, @@ -98,7 +98,7 @@ def test_value_ranges_ctor(self): index_vectors, ann_dir, vector_bin_file, - _ + _, ) = build_random_vectors_and_memory_index(np.single, "l2", "not_ann") good_ranges = { "index_directory": ann_dir,