Skip to content

Commit

Permalink
Halfway approach to the new IndexFactory, but it doesn't have the same feature set as the old way. Committing this for posterity, but ultimately reverting my changes.
Browse files Browse the repository at this point in the history
  • Loading branch information
daxpryce committed Oct 30, 2023
1 parent 3d58ceb commit 03dccb5
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 42 deletions.
7 changes: 3 additions & 4 deletions python/include/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@ void build_disk_index(diskann::Metric metric, const std::string &data_file_path,
uint32_t complexity, uint32_t graph_degree, double final_index_ram_limit,
double indexing_ram_budget, uint32_t num_threads, uint32_t pq_disk_bytes);

template <typename DT, typename TagT = DynamicIdType, typename LabelT = filterT>
void build_memory_index(diskann::Metric metric, const std::string &vector_bin_path,
void build_memory_index(const std::string &vector_dtype, diskann::Metric metric, const std::string &vector_bin_path,
const std::string &index_output_path, uint32_t graph_degree, uint32_t complexity,
float alpha, uint32_t num_threads, bool use_pq_build,
size_t num_pq_bytes, bool use_opq, uint32_t filter_complexity,
bool use_tags = false);
size_t num_pq_bytes, bool use_opq, bool use_tags = false, const std::string &label_path = "",
const std::string &universal_label = "", uint32_t filter_complexity = 0);

}
9 changes: 5 additions & 4 deletions python/src/_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,11 @@ def build_memory_index(
num_points, dimensions = vectors_metadata_from_file(vector_bin_path)

if vector_dtype_actual == np.uint8:
_builder = _native_dap.build_memory_uint8_index
_native_dtype = "uint8"
elif vector_dtype_actual == np.int8:
_builder = _native_dap.build_memory_int8_index
_native_dtype = "int8"
else:
_builder = _native_dap.build_memory_float_index
_native_dtype = "float"

index_prefix_path = os.path.join(index_directory, index_prefix)

Expand All @@ -288,7 +288,8 @@ def build_memory_index(
else:
use_tags = False

_builder(
_native_dap.build_memory_index(
vector_dtype=_native_dtype,
distance_metric=dap_metric,
data_file_path=vector_bin_path,
index_output_path=index_prefix_path,
Expand Down
65 changes: 40 additions & 25 deletions python/src/builder.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <cstdint>
#include "abstract_index.h"
#include "builder.h"
#include "common.h"
#include "disk_utils.h"
#include "index.h"
#include "index_factory.h"
#include "parameters.h"

namespace diskannpy
Expand All @@ -31,28 +34,48 @@ template void build_disk_index<uint8_t>(diskann::Metric, const std::string &, co
template void build_disk_index<int8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
double, double, uint32_t, uint32_t);

template <typename T, typename TagT, typename LabelT>
void build_memory_index(const diskann::Metric metric, const std::string &vector_bin_path,
void build_memory_index(const std::string &vector_dtype, const diskann::Metric metric, const std::string &vector_bin_path,
const std::string &index_output_path, const uint32_t graph_degree, const uint32_t complexity,
const float alpha, const uint32_t num_threads, const bool use_pq_build,
const size_t num_pq_bytes, const bool use_opq, const uint32_t filter_complexity,
const bool use_tags)
const size_t num_pq_bytes, const bool use_opq, const bool use_tags,
const std::string &label_path, const std::string &universal_label,
const uint32_t filter_complexity)
{
diskann::IndexWriteParameters index_build_params = diskann::IndexWriteParametersBuilder(complexity, graph_degree)
auto index_build_params = diskann::IndexWriteParametersBuilder(complexity, graph_degree)
.with_filter_list_size(filter_complexity)
.with_alpha(alpha)
.with_saturate_graph(false)
.with_num_threads(num_threads)
.build();
diskann::IndexSearchParams index_search_params =
diskann::IndexSearchParams(index_build_params.search_list_size, num_threads);
auto filter_params = diskann::IndexFilterParamsBuilder()
.with_universal_label(universal_label)
.with_label_file(label_path)
.with_save_path_prefix(index_output_path)
.build();

size_t data_num, data_dim;
diskann::get_bin_metadata(vector_bin_path, data_num, data_dim);

diskann::Index<T, TagT, LabelT> index(metric, data_dim, data_num,
std::make_shared<diskann::IndexWriteParameters>(index_build_params),
std::make_shared<diskann::IndexSearchParams>(index_search_params), 0,
use_tags, use_tags, false, use_pq_build, num_pq_bytes, use_opq);
auto config = diskann::IndexConfigBuilder()
.with_metric(metric)
.with_dimension(data_dim)
.with_max_points(data_num)
.with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY)
.with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY)
.with_data_type(vector_dtype)
.with_tag_type("uint32") // fixed type
.with_label_type("uint32") // fixed type
.is_dynamic_index(false)
.with_index_write_params(index_build_params)
.is_concurrent_consolidate(false)
.is_enable_tags(use_tags)
.is_filtered(!label_path.empty())
.is_use_opq(use_opq)
.is_pq_dist_build(use_pq_build)
.with_num_pq_chunks(num_pq_bytes)
.build();

auto index = diskann::IndexFactory(config).create_instance();

if (use_tags)
{
Expand All @@ -61,27 +84,19 @@ void build_memory_index(const diskann::Metric metric, const std::string &vector_
{
throw std::runtime_error("tags file not found at expected path: " + tags_file);
}
TagT *tags_data;
uint32_t *tags_data;
size_t tag_dims = 1;
diskann::load_bin(tags_file, tags_data, data_num, tag_dims);
std::vector<TagT> tags(tags_data, tags_data + data_num);
index.build(vector_bin_path.c_str(), data_num, tags);
std::vector<uint32_t> tags(tags_data, tags_data + data_num);
index->build(vector_bin_path, data_num, tags);
}
else
{
index.build(vector_bin_path.c_str(), data_num);
index->build(vector_bin_path, data_num, filter_params);
}

index.save(index_output_path.c_str());
index->save(index_output_path.c_str());
index.reset();
}

template void build_memory_index<float>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
float, uint32_t, bool, size_t, bool, uint32_t, bool);

template void build_memory_index<int8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
float, uint32_t, bool, size_t, bool, uint32_t, bool);

template void build_memory_index<uint8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
float, uint32_t, bool, size_t, bool, uint32_t, bool);

} // namespace diskannpy
1 change: 0 additions & 1 deletion python/src/diskann_bindings.cpp

This file was deleted.

16 changes: 8 additions & 8 deletions python/src/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,18 @@ using namespace pybind11::literals;
struct Variant
{
std::string disk_builder_name;
std::string memory_builder_name;
std::string dynamic_memory_index_name;
std::string static_memory_index_name;
std::string static_disk_index_name;
};

const Variant FloatVariant{"build_disk_float_index", "build_memory_float_index", "DynamicMemoryFloatIndex",
const Variant FloatVariant{"build_disk_float_index", "DynamicMemoryFloatIndex",
"StaticMemoryFloatIndex", "StaticDiskFloatIndex"};

const Variant UInt8Variant{"build_disk_uint8_index", "build_memory_uint8_index", "DynamicMemoryUInt8Index",
const Variant UInt8Variant{"build_disk_uint8_index", "DynamicMemoryUInt8Index",
"StaticMemoryUInt8Index", "StaticDiskUInt8Index"};

const Variant Int8Variant{"build_disk_int8_index", "build_memory_int8_index", "DynamicMemoryInt8Index",
const Variant Int8Variant{"build_disk_int8_index", "DynamicMemoryInt8Index",
"StaticMemoryInt8Index", "StaticDiskInt8Index"};

template <typename T> inline void add_variant(py::module_ &m, const Variant &variant)
Expand All @@ -46,10 +45,6 @@ template <typename T> inline void add_variant(py::module_ &m, const Variant &var
"index_prefix_path"_a, "complexity"_a, "graph_degree"_a, "final_index_ram_limit"_a, "indexing_ram_budget"_a,
"num_threads"_a, "pq_disk_bytes"_a);

m.def(variant.memory_builder_name.c_str(), &diskannpy::build_memory_index<T>, "distance_metric"_a,
"data_file_path"_a, "index_output_path"_a, "graph_degree"_a, "complexity"_a, "alpha"_a, "num_threads"_a,
"use_pq_build"_a, "num_pq_bytes"_a, "use_opq"_a, "filter_complexity"_a = 0, "use_tags"_a = false);

py::class_<diskannpy::StaticMemoryIndex<T>>(m, variant.static_memory_index_name.c_str())
.def(py::init<const diskann::Metric, const std::string &, const size_t, const size_t, const uint32_t,
const uint32_t>(),
Expand Down Expand Up @@ -122,6 +117,11 @@ PYBIND11_MODULE(_diskannpy, m)
default_values.attr("NUM_PQ_BYTES") = (uint32_t)0;
default_values.attr("USE_OPQ") = false;

m.def("build_memory_index", &diskannpy::build_memory_index, "vector_dtype"_a, "distance_metric"_a,
"data_file_path"_a, "index_output_path"_a, "graph_degree"_a, "complexity"_a, "alpha"_a, "num_threads"_a,
"use_pq_build"_a, "num_pq_bytes"_a, "use_opq"_a, "use_tags"_a = false, "label_path"_a = "",
"universal_label"_a = "", "filter_complexity"_a = 0);

add_variant<float>(m, FloatVariant);
add_variant<uint8_t>(m, UInt8Variant);
add_variant<int8_t>(m, Int8Variant);
Expand Down

0 comments on commit 03dccb5

Please sign in to comment.