diff --git a/CMakeLists.txt b/CMakeLists.txt index 89530f818..3d3d2b860 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,7 @@ # such behavior. # Contact for this feature: gopalrs. + # Some variables like MSVC are defined only after project(), so put that first. cmake_minimum_required(VERSION 3.15) project(diskann) @@ -52,6 +53,9 @@ endif() include_directories(${PROJECT_SOURCE_DIR}/include) +if(NOT PYBIND) + set(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS ON) +endif() # It's necessary to include tcmalloc headers only if calling into MallocExtension interface. # For using tcmalloc in DiskANN tools, it's enough to just link with tcmalloc. if (DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) @@ -92,7 +96,9 @@ if (MSVC) set(Boost_USE_STATIC_LIBS ON) endif() -find_package(Boost COMPONENTS program_options) +if(NOT MSVC) + find_package(Boost COMPONENTS program_options) +endif() # For Windows, fall back to nuget version if find_package didn't find it. if (MSVC AND NOT Boost_FOUND) @@ -219,13 +225,13 @@ if (MSVC) # Tell CMake how to build the tcmalloc linker library from the submodule. 
add_custom_target(build_libtcmalloc_minimal DEPENDS ${TCMALLOC_LINK_LIBRARY}) add_custom_command(OUTPUT ${TCMALLOC_LINK_LIBRARY} - COMMAND ${CMAKE_VS_MSBUILD_COMMAND} gperftools.sln /m /nologo - /t:libtcmalloc_minimal /p:Configuration="Release-Patch" - /property:Platform="x64" - /p:PlatformToolset=v${MSVC_TOOLSET_VERSION} - /p:WindowsTargetPlatformVersion=${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION} - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/gperftools) - + COMMAND ${CMAKE_VS_MSBUILD_COMMAND} gperftools.sln /m /nologo + /t:libtcmalloc_minimal /p:Configuration="Release-Patch" + /property:Platform="x64" + /p:PlatformToolset=v${MSVC_TOOLSET_VERSION} + /p:WindowsTargetPlatformVersion=${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/gperftools) + add_library(libtcmalloc_minimal_for_exe STATIC IMPORTED) add_library(libtcmalloc_minimal_for_dll STATIC IMPORTED) diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index e42c0b6cb..f414ef4ea 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -16,6 +16,15 @@ target_link_libraries(search_memory_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} $ add_executable(build_disk_index build_disk_index.cpp) target_link_libraries(build_disk_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options) +add_executable(split_subgraph_index split_subgraph_index.cpp) +target_link_libraries(split_subgraph_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options) + +add_executable(build_subgraph_index build_subgraph_index.cpp) +target_link_libraries(build_subgraph_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options) + +add_executable(merge_subgraph_index merge_subgraph_index.cpp) +target_link_libraries(merge_subgraph_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options) + add_executable(search_disk_index 
search_disk_index.cpp) target_link_libraries(search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
diff --git a/apps/build_subgraph_index.cpp b/apps/build_subgraph_index.cpp
new file mode 100644
index 000000000..bf2b30e86
--- /dev/null
+++ b/apps/build_subgraph_index.cpp
@@ -0,0 +1,193 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <omp.h>
+#include <boost/program_options.hpp>
+
+#include "utils.h"
+#include "disk_utils.h"
+#include "math_utils.h"
+#include "index.h"
+#include "partition.h"
+#include "program_options_utils.hpp"
+
+namespace po = boost::program_options;
+// Parses and validates the command line, then calls diskann::build_subgraph_index for shard --subshard_id.
+int main(int argc, char **argv)
+{
+    std::string data_type, dist_fn, data_path, index_path_prefix, codebook_prefix, label_file, universal_label,
+        label_type;
+    uint32_t num_threads, R, L, disk_PQ, build_PQ, QD, Lf, filter_threshold, subshard_id;
+    float B, M;
+    bool append_reorder_data = false;
+    bool use_opq = false;
+
+    po::options_description desc{program_options_utils::make_program_description(
+        "build_subgraph_index", "Per-shard build step of a distributed multishard disk index build.")};
+    try
+    {
+        desc.add_options()("help,h", "Print information on arguments");
+
+        // Required parameters
+        po::options_description required_configs("Required");
+        required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
+                                       program_options_utils::DATA_TYPE_DESCRIPTION);
+        required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
+                                       program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
+        required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
+                                       program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
+        required_configs.add_options()("data_path", po::value<std::string>(&data_path)->required(),
+                                       program_options_utils::INPUT_DATA_PATH);
+        required_configs.add_options()("search_DRAM_budget,B", po::value<float>(&B)->required(),
+                                       "DRAM budget in GB for searching the index to set the "
+                                       "compressed level for data while search happens");
+        required_configs.add_options()("build_DRAM_budget,M", po::value<float>(&M)->required(),
+                                       "DRAM budget in GB for building the index");
+
+        // Optional parameters
+        po::options_description optional_configs("Optional");
+        optional_configs.add_options()("num_threads,T",
+                                       po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
+                                       program_options_utils::NUMBER_THREADS_DESCRIPTION);
+        optional_configs.add_options()("max_degree,R", po::value<uint32_t>(&R)->default_value(64),
+                                       program_options_utils::MAX_BUILD_DEGREE);
+        optional_configs.add_options()("Lbuild,L", po::value<uint32_t>(&L)->default_value(100),
+                                       program_options_utils::GRAPH_BUILD_COMPLEXITY);
+        optional_configs.add_options()("QD", po::value<uint32_t>(&QD)->default_value(0),
+                                       " Quantized Dimension for compression");
+        optional_configs.add_options()("codebook_prefix", po::value<std::string>(&codebook_prefix)->default_value(""),
+                                       "Path prefix for pre-trained codebook");
+        optional_configs.add_options()("PQ_disk_bytes", po::value<uint32_t>(&disk_PQ)->default_value(0),
+                                       "Number of bytes to which vectors should be compressed "
+                                       "on SSD; 0 for no compression");
+        optional_configs.add_options()("append_reorder_data", po::bool_switch()->default_value(false),
+                                       "Include full precision data in the index. Use only in "
+                                       "conjuction with compressed data on SSD.");
+        optional_configs.add_options()("build_PQ_bytes", po::value<uint32_t>(&build_PQ)->default_value(0),
+                                       program_options_utils::BUIlD_GRAPH_PQ_BYTES);
+        optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false),
+                                       program_options_utils::USE_OPQ);
+        optional_configs.add_options()("label_file", po::value<std::string>(&label_file)->default_value(""),
+                                       program_options_utils::LABEL_FILE);
+        optional_configs.add_options()("universal_label", po::value<std::string>(&universal_label)->default_value(""),
+                                       program_options_utils::UNIVERSAL_LABEL);
+        optional_configs.add_options()("FilteredLbuild", po::value<uint32_t>(&Lf)->default_value(0),
+                                       program_options_utils::FILTERED_LBUILD);
+        optional_configs.add_options()("filter_threshold,F", po::value<uint32_t>(&filter_threshold)->default_value(0),
+                                       "Threshold to break up the existing nodes to generate new graph "
+                                       "internally where each node has a maximum F labels.");
+        optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
+                                       program_options_utils::LABEL_TYPE_DESCRIPTION);
+        optional_configs.add_options()("subshard_id", po::value<uint32_t>(&subshard_id)->default_value(0),
+                                       program_options_utils::SUBSHARD_ID_DESCRIPTION);
+
+        // Merge required and optional parameters
+        desc.add(required_configs).add(optional_configs);
+
+        po::variables_map vm;
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        if (vm.count("help"))
+        {
+            std::cout << desc;
+            return 0;
+        }
+        po::notify(vm);
+        if (vm["append_reorder_data"].as<bool>())
+            append_reorder_data = true;
+        if (vm["use_opq"].as<bool>())
+            use_opq = true;
+    }
+    catch (const std::exception &ex)
+    {
+        std::cerr << ex.what() << '\n';
+        return -1;
+    }
+    // use_filters just records whether a label file was given; it is forwarded to the diskann call below.
+    const bool use_filters = !label_file.empty();
+    diskann::Metric metric;
+    if (dist_fn == std::string("l2"))
+        metric = diskann::Metric::L2;
+    else if (dist_fn == std::string("mips"))
+        metric = diskann::Metric::INNER_PRODUCT;
+    else if (dist_fn == std::string("cosine"))
+        metric = diskann::Metric::COSINE;
+    else
+    {
+        std::cout << "Error. Only l2, mips and cosine distance functions are supported" << std::endl;
+        return -1;
+    }
+    // --append_reorder_data is rejected unless vectors are PQ-compressed on disk and the data type is float.
+    if (append_reorder_data)
+    {
+        if (disk_PQ == 0)
+        {
+            std::cout << "Error: It is not necessary to append data for reordering "
+                         "when vectors are not compressed on disk."
+                      << std::endl;
+            return -1;
+        }
+        if (data_type != std::string("float"))
+        {
+            std::cout << "Error: Appending data for reordering currently only "
+                         "supported for float data type."
+                      << std::endl;
+            return -1;
+        }
+    }
+    // Space-separated parameter string: R L B M num_threads disk_PQ append_reorder_data build_PQ QD.
+    std::string params = std::string(std::to_string(R)) + " " + std::string(std::to_string(L)) + " " +
+                         std::string(std::to_string(B)) + " " + std::string(std::to_string(M)) + " " +
+                         std::string(std::to_string(num_threads)) + " " + std::string(std::to_string(disk_PQ)) + " " +
+                         std::string(std::to_string(append_reorder_data)) + " " +
+                         std::string(std::to_string(build_PQ)) + " " + std::string(std::to_string(QD));
+    // Dispatch on label/data type; the sub-graph path is only wired up for int8 (other types are an error).
+    try
+    {
+        if (use_filters && label_type == "ushort")
+        { // NOTE(review): this branch runs the regular build_disk_index, not the sub-graph step -- confirm intended.
+            if (data_type == std::string("int8"))
+                return diskann::build_disk_index<int8_t, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else if (data_type == std::string("uint8"))
+                return diskann::build_disk_index<uint8_t, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else if (data_type == std::string("float"))
+                return diskann::build_disk_index<float, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else
+            {
+                diskann::cerr << "Error. Unsupported data type" << std::endl;
+                return -1;
+            }
+        }
+        else
+        {
+            if (data_type == std::string("int8"))
+                return diskann::build_subgraph_index<int8_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf, subshard_id);
+            //else if (data_type == std::string("uint8"))
+            //    return diskann::build_disk_index(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
+            //                                     metric, use_opq, codebook_prefix, use_filters, label_file,
+            //                                     universal_label, filter_threshold, Lf);
+            //else if (data_type == std::string("float"))
+            //    return diskann::build_disk_index(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
+            //                                     metric, use_opq, codebook_prefix, use_filters, label_file,
+            //                                     universal_label, filter_threshold, Lf);
+            else
+            {
+                diskann::cerr << "Error. Unsupported data type" << std::endl;
+                return -1;
+            }
+        }
+    }
+    catch (const std::exception &e)
+    {
+        std::cout << std::string(e.what()) << std::endl;
+        diskann::cerr << "Index build failed." << std::endl;
+        return -1;
+    }
+}
diff --git a/apps/merge_subgraph_index.cpp b/apps/merge_subgraph_index.cpp
new file mode 100644
index 000000000..402bb6eff
--- /dev/null
+++ b/apps/merge_subgraph_index.cpp
@@ -0,0 +1,193 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <omp.h>
+#include <boost/program_options.hpp>
+
+#include "utils.h"
+#include "disk_utils.h"
+#include "math_utils.h"
+#include "index.h"
+#include "partition.h"
+#include "program_options_utils.hpp"
+
+namespace po = boost::program_options;
+// Parses and validates the command line, then calls diskann::merge_subgraph_index (see --num_parts).
+int main(int argc, char **argv)
+{
+    std::string data_type, dist_fn, data_path, index_path_prefix, codebook_prefix, label_file, universal_label,
+        label_type;
+    uint32_t num_threads, R, L, disk_PQ, build_PQ, QD, Lf, filter_threshold, num_parts;
+    float B, M;
+    bool append_reorder_data = false;
+    bool use_opq = false;
+
+    po::options_description desc{program_options_utils::make_program_description(
+        "merge_subgraph_index", "Merge step of a distributed multishard disk index build.")};
+    try
+    {
+        desc.add_options()("help,h", "Print information on arguments");
+
+        // Required parameters
+        po::options_description required_configs("Required");
+        required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
+                                       program_options_utils::DATA_TYPE_DESCRIPTION);
+        required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
+                                       program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
+        required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
+                                       program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
+        required_configs.add_options()("data_path", po::value<std::string>(&data_path)->required(),
+                                       program_options_utils::INPUT_DATA_PATH);
+        required_configs.add_options()("search_DRAM_budget,B", po::value<float>(&B)->required(),
+                                       "DRAM budget in GB for searching the index to set the "
+                                       "compressed level for data while search happens");
+        required_configs.add_options()("build_DRAM_budget,M", po::value<float>(&M)->required(),
+                                       "DRAM budget in GB for building the index");
+
+        // Optional parameters
+        po::options_description optional_configs("Optional");
+        optional_configs.add_options()("num_threads,T",
+                                       po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
+                                       program_options_utils::NUMBER_THREADS_DESCRIPTION);
+        optional_configs.add_options()("max_degree,R", po::value<uint32_t>(&R)->default_value(64),
+                                       program_options_utils::MAX_BUILD_DEGREE);
+        optional_configs.add_options()("Lbuild,L", po::value<uint32_t>(&L)->default_value(100),
+                                       program_options_utils::GRAPH_BUILD_COMPLEXITY);
+        optional_configs.add_options()("QD", po::value<uint32_t>(&QD)->default_value(0),
+                                       " Quantized Dimension for compression");
+        optional_configs.add_options()("codebook_prefix", po::value<std::string>(&codebook_prefix)->default_value(""),
+                                       "Path prefix for pre-trained codebook");
+        optional_configs.add_options()("PQ_disk_bytes", po::value<uint32_t>(&disk_PQ)->default_value(0),
+                                       "Number of bytes to which vectors should be compressed "
+                                       "on SSD; 0 for no compression");
+        optional_configs.add_options()("append_reorder_data", po::bool_switch()->default_value(false),
+                                       "Include full precision data in the index. Use only in "
+                                       "conjuction with compressed data on SSD.");
+        optional_configs.add_options()("build_PQ_bytes", po::value<uint32_t>(&build_PQ)->default_value(0),
+                                       program_options_utils::BUIlD_GRAPH_PQ_BYTES);
+        optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false),
+                                       program_options_utils::USE_OPQ);
+        optional_configs.add_options()("label_file", po::value<std::string>(&label_file)->default_value(""),
+                                       program_options_utils::LABEL_FILE);
+        optional_configs.add_options()("universal_label", po::value<std::string>(&universal_label)->default_value(""),
+                                       program_options_utils::UNIVERSAL_LABEL);
+        optional_configs.add_options()("FilteredLbuild", po::value<uint32_t>(&Lf)->default_value(0),
+                                       program_options_utils::FILTERED_LBUILD);
+        optional_configs.add_options()("filter_threshold,F", po::value<uint32_t>(&filter_threshold)->default_value(0),
+                                       "Threshold to break up the existing nodes to generate new graph "
+                                       "internally where each node has a maximum F labels.");
+        optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
+                                       program_options_utils::LABEL_TYPE_DESCRIPTION);
+        optional_configs.add_options()("num_parts", po::value<uint32_t>(&num_parts)->default_value(0),
+                                       program_options_utils::NUM_PARTS_DESCRIPTION);
+
+        // Merge required and optional parameters
+        desc.add(required_configs).add(optional_configs);
+
+        po::variables_map vm;
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        if (vm.count("help"))
+        {
+            std::cout << desc;
+            return 0;
+        }
+        po::notify(vm);
+        if (vm["append_reorder_data"].as<bool>())
+            append_reorder_data = true;
+        if (vm["use_opq"].as<bool>())
+            use_opq = true;
+    }
+    catch (const std::exception &ex)
+    {
+        std::cerr << ex.what() << '\n';
+        return -1;
+    }
+    // use_filters just records whether a label file was given; it is forwarded to the diskann call below.
+    const bool use_filters = !label_file.empty();
+    diskann::Metric metric;
+    if (dist_fn == std::string("l2"))
+        metric = diskann::Metric::L2;
+    else if (dist_fn == std::string("mips"))
+        metric = diskann::Metric::INNER_PRODUCT;
+    else if (dist_fn == std::string("cosine"))
+        metric = diskann::Metric::COSINE;
+    else
+    {
+        std::cout << "Error. Only l2, mips and cosine distance functions are supported" << std::endl;
+        return -1;
+    }
+    // --append_reorder_data is rejected unless vectors are PQ-compressed on disk and the data type is float.
+    if (append_reorder_data)
+    {
+        if (disk_PQ == 0)
+        {
+            std::cout << "Error: It is not necessary to append data for reordering "
+                         "when vectors are not compressed on disk."
+                      << std::endl;
+            return -1;
+        }
+        if (data_type != std::string("float"))
+        {
+            std::cout << "Error: Appending data for reordering currently only "
+                         "supported for float data type."
+                      << std::endl;
+            return -1;
+        }
+    }
+    // Space-separated parameter string: R L B M num_threads disk_PQ append_reorder_data build_PQ QD.
+    std::string params = std::string(std::to_string(R)) + " " + std::string(std::to_string(L)) + " " +
+                         std::string(std::to_string(B)) + " " + std::string(std::to_string(M)) + " " +
+                         std::string(std::to_string(num_threads)) + " " + std::string(std::to_string(disk_PQ)) + " " +
+                         std::string(std::to_string(append_reorder_data)) + " " +
+                         std::string(std::to_string(build_PQ)) + " " + std::string(std::to_string(QD));
+    // Dispatch on label/data type; the sub-graph path is only wired up for int8 (other types are an error).
+    try
+    {
+        if (use_filters && label_type == "ushort")
+        { // NOTE(review): this branch runs the regular build_disk_index, not the merge step -- confirm intended.
+            if (data_type == std::string("int8"))
+                return diskann::build_disk_index<int8_t, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else if (data_type == std::string("uint8"))
+                return diskann::build_disk_index<uint8_t, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else if (data_type == std::string("float"))
+                return diskann::build_disk_index<float, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else
+            {
+                diskann::cerr << "Error. Unsupported data type" << std::endl;
+                return -1;
+            }
+        }
+        else
+        {
+            if (data_type == std::string("int8"))
+                return diskann::merge_subgraph_index<int8_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf, num_parts);
+            //else if (data_type == std::string("uint8"))
+            //    return diskann::build_disk_index(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
+            //                                     metric, use_opq, codebook_prefix, use_filters, label_file,
+            //                                     universal_label, filter_threshold, Lf);
+            //else if (data_type == std::string("float"))
+            //    return diskann::build_disk_index(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
+            //                                     metric, use_opq, codebook_prefix, use_filters, label_file,
+            //                                     universal_label, filter_threshold, Lf);
+            else
+            {
+                diskann::cerr << "Error. Unsupported data type" << std::endl;
+                return -1;
+            }
+        }
+    }
+    catch (const std::exception &e)
+    {
+        std::cout << std::string(e.what()) << std::endl;
+        diskann::cerr << "Index build failed." << std::endl;
+        return -1;
+    }
+}
diff --git a/apps/split_subgraph_index.cpp b/apps/split_subgraph_index.cpp
new file mode 100644
index 000000000..da96a5bc4
--- /dev/null
+++ b/apps/split_subgraph_index.cpp
@@ -0,0 +1,199 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <omp.h>
+#include <boost/program_options.hpp>
+
+#include "utils.h"
+#include "disk_utils.h"
+#include "math_utils.h"
+#include "index.h"
+#include "partition.h"
+#include "program_options_utils.hpp"
+
+namespace po = boost::program_options;
+
+// Parses and validates the command line, then calls diskann::split_subgraph_index.
+// Exit status: 0 after --help, -1 on a bad option/value or when the diskann call throws;
+// otherwise whatever the diskann entry point returns.
+int main(int argc, char **argv)
+{
+    std::string data_type, dist_fn, data_path, index_path_prefix, codebook_prefix, label_file, universal_label,
+        label_type;
+    uint32_t num_threads, R, L, disk_PQ, build_PQ, QD, Lf, filter_threshold;
+    float B, M;
+    bool append_reorder_data = false;
+    bool use_opq = false;
+
+    po::options_description desc{program_options_utils::make_program_description(
+        "split_subgraph_index", "Split step of a distributed multishard disk index build.")};
+    try
+    {
+        desc.add_options()("help,h", "Print information on arguments");
+
+        // Required parameters
+        po::options_description required_configs("Required");
+        required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
+                                       program_options_utils::DATA_TYPE_DESCRIPTION);
+        required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
+                                       program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
+        required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
+                                       program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
+        required_configs.add_options()("data_path", po::value<std::string>(&data_path)->required(),
+                                       program_options_utils::INPUT_DATA_PATH);
+        required_configs.add_options()("search_DRAM_budget,B", po::value<float>(&B)->required(),
+                                       "DRAM budget in GB for searching the index to set the "
+                                       "compressed level for data while search happens");
+        required_configs.add_options()("build_DRAM_budget,M", po::value<float>(&M)->required(),
+                                       "DRAM budget in GB for building the index");
+
+        // Optional parameters
+        po::options_description optional_configs("Optional");
+        optional_configs.add_options()("num_threads,T",
+                                       po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
+                                       program_options_utils::NUMBER_THREADS_DESCRIPTION);
+        optional_configs.add_options()("max_degree,R", po::value<uint32_t>(&R)->default_value(64),
+                                       program_options_utils::MAX_BUILD_DEGREE);
+        optional_configs.add_options()("Lbuild,L", po::value<uint32_t>(&L)->default_value(100),
+                                       program_options_utils::GRAPH_BUILD_COMPLEXITY);
+        optional_configs.add_options()("QD", po::value<uint32_t>(&QD)->default_value(0),
+                                       " Quantized Dimension for compression");
+        optional_configs.add_options()("codebook_prefix", po::value<std::string>(&codebook_prefix)->default_value(""),
+                                       "Path prefix for pre-trained codebook");
+        optional_configs.add_options()("PQ_disk_bytes", po::value<uint32_t>(&disk_PQ)->default_value(0),
+                                       "Number of bytes to which vectors should be compressed "
+                                       "on SSD; 0 for no compression");
+        optional_configs.add_options()("append_reorder_data", po::bool_switch()->default_value(false),
+                                       "Include full precision data in the index. Use only in "
+                                       "conjuction with compressed data on SSD.");
+        optional_configs.add_options()("build_PQ_bytes", po::value<uint32_t>(&build_PQ)->default_value(0),
+                                       program_options_utils::BUIlD_GRAPH_PQ_BYTES);
+        optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false),
+                                       program_options_utils::USE_OPQ);
+        optional_configs.add_options()("label_file", po::value<std::string>(&label_file)->default_value(""),
+                                       program_options_utils::LABEL_FILE);
+        optional_configs.add_options()("universal_label", po::value<std::string>(&universal_label)->default_value(""),
+                                       program_options_utils::UNIVERSAL_LABEL);
+        optional_configs.add_options()("FilteredLbuild", po::value<uint32_t>(&Lf)->default_value(0),
+                                       program_options_utils::FILTERED_LBUILD);
+        optional_configs.add_options()("filter_threshold,F", po::value<uint32_t>(&filter_threshold)->default_value(0),
+                                       "Threshold to break up the existing nodes to generate new graph "
+                                       "internally where each node has a maximum F labels.");
+        optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
+                                       program_options_utils::LABEL_TYPE_DESCRIPTION);
+
+        // Merge required and optional parameters
+        desc.add(required_configs).add(optional_configs);
+
+        po::variables_map vm;
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        if (vm.count("help"))
+        {
+            std::cout << desc;
+            return 0;
+        }
+        po::notify(vm);
+        if (vm["append_reorder_data"].as<bool>())
+            append_reorder_data = true;
+        if (vm["use_opq"].as<bool>())
+            use_opq = true;
+    }
+    catch (const std::exception &ex)
+    {
+        std::cerr << ex.what() << '\n';
+        return -1;
+    }
+
+    // use_filters just records whether a label file was given; it is forwarded to the diskann call below.
+    const bool use_filters = !label_file.empty();
+    diskann::Metric metric;
+    if (dist_fn == std::string("l2"))
+        metric = diskann::Metric::L2;
+    else if (dist_fn == std::string("mips"))
+        metric = diskann::Metric::INNER_PRODUCT;
+    else if (dist_fn == std::string("cosine"))
+        metric = diskann::Metric::COSINE;
+    else
+    {
+        std::cout << "Error. Only l2, mips and cosine distance functions are supported" << std::endl;
+        return -1;
+    }
+
+    // --append_reorder_data is rejected unless vectors are PQ-compressed on disk and the data type is float.
+    if (append_reorder_data)
+    {
+        if (disk_PQ == 0)
+        {
+            std::cout << "Error: It is not necessary to append data for reordering "
+                         "when vectors are not compressed on disk."
+                      << std::endl;
+            return -1;
+        }
+        if (data_type != std::string("float"))
+        {
+            std::cout << "Error: Appending data for reordering currently only "
+                         "supported for float data type."
+                      << std::endl;
+            return -1;
+        }
+    }
+
+    // Space-separated parameter string: R L B M num_threads disk_PQ append_reorder_data build_PQ QD.
+    std::string params = std::string(std::to_string(R)) + " " + std::string(std::to_string(L)) + " " +
+                         std::string(std::to_string(B)) + " " + std::string(std::to_string(M)) + " " +
+                         std::string(std::to_string(num_threads)) + " " + std::string(std::to_string(disk_PQ)) + " " +
+                         std::string(std::to_string(append_reorder_data)) + " " +
+                         std::string(std::to_string(build_PQ)) + " " + std::string(std::to_string(QD));
+
+    // Dispatch on label/data type; the sub-graph path is only wired up for int8 (other types are an error).
+    try
+    {
+        // NOTE(review): a ushort label file routes to the regular build_disk_index, not the split step -- confirm.
+        if (use_filters && label_type == "ushort")
+        {
+            if (data_type == std::string("int8"))
+                return diskann::build_disk_index<int8_t, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else if (data_type == std::string("uint8"))
+                return diskann::build_disk_index<uint8_t, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else if (data_type == std::string("float"))
+                return diskann::build_disk_index<float, uint16_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            else
+            {
+                diskann::cerr << "Error. Unsupported data type" << std::endl;
+                return -1;
+            }
+        }
+        else
+        {
+            if (data_type == std::string("int8"))
+                return diskann::split_subgraph_index<int8_t>(
+                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
+                    use_filters, label_file, universal_label, filter_threshold, Lf);
+            //else if (data_type == std::string("uint8"))
+            //    return diskann::build_disk_index(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
+            //                                     metric, use_opq, codebook_prefix, use_filters, label_file,
+            //                                     universal_label, filter_threshold, Lf);
+            //else if (data_type == std::string("float"))
+            //    return diskann::build_disk_index(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
+            //                                     metric, use_opq, codebook_prefix, use_filters, label_file,
+            //                                     universal_label, filter_threshold, Lf);
+            else
+            {
+                diskann::cerr << "Error. Unsupported data type" << std::endl;
+                return -1;
+            }
+        }
+    }
+    catch (const std::exception &e)
+    {
+        std::cout << std::string(e.what()) << std::endl;
+        diskann::cerr << "Index build failed." << std::endl;
+        return -1;
+    }
+}
diff --git a/include/disk_utils.h b/include/disk_utils.h index 08f046dcd..779b8d73d 100644 --- a/include/disk_utils.h +++ b/include/disk_utils.h @@ -74,6 +74,7 @@ template DISKANN_DLLEXPORT std::string preprocess_base_file(const std::string &infile, const std::string &indexPrefix, diskann::Metric &distMetric); +//================= template DISKANN_DLLEXPORT int build_merged_vamana_index(std::string base_file, diskann::Metric _compareMetric, uint32_t L, uint32_t R, double sampling_rate, double ram_budget, @@ -83,7 +84,36 @@ DISKANN_DLLEXPORT int build_merged_vamana_index(std::string base_file, diskann:: const std::string &label_file = std::string(""), const std::string &labels_to_medoids_file = std::string(""), const std::string &universal_label = "", const uint32_t Lf = 0); +template +DISKANN_DLLEXPORT int split_merged_vamana_index(std::string base_file, diskann::Metric _compareMetric, uint32_t L, + uint32_t R, double sampling_rate, double ram_budget, + std::string mem_index_path, std::string medoids_file, + std::string centroids_file, size_t build_pq_bytes, bool use_opq, + uint32_t num_threads, bool use_filters = false, + const std::string &label_file = std::string(""), + const std::string &labels_to_medoids_file = std::string(""), + const std::string &universal_label = "", const uint32_t Lf = 0); +template +DISKANN_DLLEXPORT int build_split_merged_vamana_index(std::string base_file, diskann::Metric _compareMetric, uint32_t L, + uint32_t R, double sampling_rate, double ram_budget, + std::string mem_index_path, std::string medoids_file, + std::string centroids_file, size_t build_pq_bytes, bool use_opq, + uint32_t num_threads, bool use_filters = false, + const std::string &label_file = std::string(""), + const std::string &labels_to_medoids_file = std::string(""), + const std::string &universal_label = "", const uint32_t Lf = 0, const uint32_t subshard_id = 0); +template +DISKANN_DLLEXPORT int merge_split_vamana_index(std::string 
base_file, diskann::Metric _compareMetric, uint32_t L, + uint32_t R, double sampling_rate, double ram_budget, + std::string mem_index_path, std::string medoids_file, + std::string centroids_file, size_t build_pq_bytes, bool use_opq, + uint32_t num_threads, bool use_filters = false, + const std::string &label_file = std::string(""), + const std::string &labels_to_medoids_file = std::string(""), + const std::string &universal_label = "", const uint32_t Lf = 0, const uint32_t num_parts = 0); + +//======================== template DISKANN_DLLEXPORT uint32_t optimize_beamwidth(std::unique_ptr> &_pFlashIndex, T *tuning_sample, uint64_t tuning_sample_num, @@ -100,6 +130,37 @@ DISKANN_DLLEXPORT int build_disk_index( const std::string &universal_label = "", const uint32_t filter_threshold = 0, const uint32_t Lf = 0); // default is empty string for no universal label +template +DISKANN_DLLEXPORT int split_subgraph_index( + const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, + diskann::Metric _compareMetric, bool use_opq = false, + const std::string &codebook_prefix = "", // default is empty for no codebook pass in + bool use_filters = false, + const std::string &label_file = std::string(""), // default is empty string for no label_file + const std::string &universal_label = "", const uint32_t filter_threshold = 0, + const uint32_t Lf = 0); // default is empty string for no universal label + +template +DISKANN_DLLEXPORT int build_subgraph_index( + const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, + diskann::Metric _compareMetric, bool use_opq = false, + const std::string &codebook_prefix = "", // default is empty for no codebook pass in + bool use_filters = false, + const std::string &label_file = std::string(""), // default is empty string for no label_file + const std::string &universal_label = "", const uint32_t filter_threshold = 0, + const uint32_t Lf = 0, + const uint32_t subshard_id = 0); // 
default is empty string for no universal label + +template +DISKANN_DLLEXPORT int merge_subgraph_index( + const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, + diskann::Metric _compareMetric, bool use_opq = false, + const std::string &codebook_prefix = "", // default is empty for no codebook pass in + bool use_filters = false, + const std::string &label_file = std::string(""), // default is empty string for no label_file + const std::string &universal_label = "", const uint32_t filter_threshold = 0, const uint32_t Lf = 0, + const uint32_t num_parts = 0); // default is empty string for no universal label + template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, const std::string mem_index_file, const std::string output_file, diff --git a/include/program_options_utils.hpp b/include/program_options_utils.hpp index 2be60595b..af7365700 100644 --- a/include/program_options_utils.hpp +++ b/include/program_options_utils.hpp @@ -77,5 +77,6 @@ const char *UNIVERSAL_LABEL = "in the labels file instead of listing all labels for a node. 
DiskANN will not automatically assign a " "universal label to a node."; const char *FILTERED_LBUILD = "Build complexity for filtered points, higher value results in better graphs"; - +const char *SUBSHARD_ID_DESCRIPTION = "multi shard id for distributed multishard index build"; +const char *NUM_PARTS_DESCRIPTION = "Number of parts to split the index for distributed multishard index build"; } // namespace program_options_utils diff --git a/include/utils.h b/include/utils.h index bb03d13f1..02735a84b 100644 --- a/include/utils.h +++ b/include/utils.h @@ -159,6 +159,11 @@ inline int delete_file(const std::string &fileName) { if (file_exists(fileName)) { + std::ifstream file(fileName, std::ifstream::ate | std::ifstream::binary); + std::streamsize size = file.tellg(); + file.close(); + diskann::cout << "Deleting file: " << fileName << " size: " << size << std::endl; + auto rc = ::remove(fileName.c_str()); if (rc != 0) { @@ -168,6 +173,9 @@ inline int delete_file(const std::string &fileName) "If you see this message, please contact the diskann team." << std::endl; } + + diskann::cout << "Deleted file: " << fileName << " size: " << size << std::endl; + return rc; } else @@ -731,7 +739,7 @@ inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t writer.write((char *)data, npts * ndims * sizeof(T)); writer.close(); - diskann::cout << "Finished writing bin." << std::endl; + diskann::cout << "Finished writing bin." 
<< filename << std::endl; return bytes_written; } @@ -942,6 +950,7 @@ template inline size_t save_data_in_base_dimensions(const std::string &filename, T *data, size_t npts, size_t ndims, size_t aligned_dim, size_t offset = 0) { + diskann::cout << "Start writing data in base dimentions cpk1 " << filename << std::endl; std::ofstream writer; //(filename, std::ios::binary | std::ios::out); open_file_to_write(writer, filename); int npts_i32 = (int)npts, ndims_i32 = (int)ndims; @@ -949,11 +958,18 @@ inline size_t save_data_in_base_dimensions(const std::string &filename, T *data, writer.seekp(offset, writer.beg); writer.write((char *)&npts_i32, sizeof(int)); writer.write((char *)&ndims_i32, sizeof(int)); + + diskann::cout << "Start writing data in base dimentions cpk2 " << filename << std::endl; for (size_t i = 0; i < npts; i++) { writer.write((char *)(data + i * aligned_dim), ndims * sizeof(T)); } writer.close(); + + std::ifstream file(filename, std::ifstream::ate | std::ifstream::binary); + std::streamsize size = file.tellg(); + file.close(); + diskann::cout << "Finished writing data in base dimentions." 
<< filename << " size: " << size << std::endl; return bytes_written; } diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 297619b4a..51c146bb5 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -3,7 +3,7 @@ #include "common_includes.h" -#if defined(RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) #include "gperftools/malloc_extension.h" #endif @@ -675,7 +675,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr Timer timer; int num_parts = partition_with_ram_budget(base_file, sampling_rate, ram_budget, 2 * R / 3, merged_index_prefix, 2); - diskann::cout << timer.elapsed_seconds_for_step("partitioning data") << std::endl; + diskann::cout << timer.elapsed_seconds_for_step("partitioning data ") << std::endl; std::string cur_centroid_filepath = merged_index_prefix + "_centroids.bin"; std::rename(cur_centroid_filepath.c_str(), centroids_file.c_str()); @@ -683,6 +683,10 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr timer.reset(); for (int p = 0; p < num_parts; p++) { +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) + MallocExtension::instance()->ReleaseFreeMemory(); +#endif + std::string shard_base_file = merged_index_prefix + "_subshard-" + std::to_string(p) + ".bin"; std::string shard_ids_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_ids_uint32.bin"; @@ -769,6 +773,231 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr return 0; } + +template +int split_merged_vamana_index(std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, + double sampling_rate, double ram_budget, std::string mem_index_path, + std::string medoids_file, std::string centroids_file, size_t build_pq_bytes, bool use_opq, + uint32_t num_threads, bool use_filters, const std::string 
&label_file, + const std::string &labels_to_medoids_file, const std::string &universal_label, + const uint32_t Lf) +{ + size_t base_num, base_dim; + diskann::get_bin_metadata(base_file, base_num, base_dim); + + double full_index_ram = estimate_ram_usage(base_num, (uint32_t)base_dim, sizeof(T), R); + + // TODO: Make this honest when there is filter support + if (full_index_ram < ram_budget * 1024 * 1024 * 1024) + { + diskann::cout << "Full index fits in RAM budget, should consume at most " + << full_index_ram / (1024 * 1024 * 1024) << "GiBs, so building in one shot" << std::endl; + diskann::cout << "you should not reach here !!! " << std::endl; + + return 1; + } + + // where the universal label is to be saved in the final graph + std::string final_index_universal_label_file = mem_index_path + "_universal_label.txt"; + + std::string merged_index_prefix = mem_index_path + "_tempFiles"; + + Timer timer; + int num_parts = + partition_with_ram_budget(base_file, sampling_rate, ram_budget, 2 * R / 3, merged_index_prefix, 2); + diskann::cout << timer.elapsed_seconds_for_step("partitioning data ") << std::endl; + + std::string cur_centroid_filepath = merged_index_prefix + "_centroids.bin"; + std::rename(cur_centroid_filepath.c_str(), centroids_file.c_str()); + diskann::cout << "num_parts " << num_parts << std::endl; + + //[TODO: jinweizhang need to read off set for graph merging and decide whether] + // need to write a offset for the output for distributed job to sub normal one shot or multi-shards jobs. 
+ + timer.reset(); + return num_parts; +} + +template +int build_split_merged_vamana_index(std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, + double sampling_rate, double ram_budget, std::string mem_index_path, + std::string medoids_file, std::string centroids_file, size_t build_pq_bytes, + bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, + const std::string &labels_to_medoids_file, const std::string &universal_label, + const uint32_t Lf, const uint32_t subshard_id) +{ + size_t base_num, base_dim; + diskann::get_bin_metadata(base_file, base_num, base_dim); + + double full_index_ram = estimate_ram_usage(base_num, (uint32_t)base_dim, sizeof(T), R); + + // TODO: Make this honest when there is filter support + if (full_index_ram < ram_budget * 1024 * 1024 * 1024) + { + diskann::cout << "Full index fits in RAM budget, should consume at most " + << full_index_ram / (1024 * 1024 * 1024) << "GiBs, so building in one shot" << std::endl; + diskann::cout << "bad idea " << std::endl; + return 0; + } + + // where the universal label is to be saved in the final graph + std::string final_index_universal_label_file = mem_index_path + "_universal_label.txt"; + + std::string merged_index_prefix = mem_index_path + "_tempFiles"; + + Timer timer; + //int num_parts = + // partition_with_ram_budget(base_file, sampling_rate, ram_budget, 2 * R / 3, merged_index_prefix, 2); + //diskann::cout << timer.elapsed_seconds_for_step("partitioning data ") << std::endl; + + //std::string cur_centroid_filepath = merged_index_prefix + "_centroids.bin"; + + //// {todo jinweizhang: need to change this as reading existing file or just remove ?} + //std::rename(cur_centroid_filepath.c_str(), centroids_file.c_str()); + + int p = subshard_id; + timer.reset(); + +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) + MallocExtension::instance()->ReleaseFreeMemory(); +#endif + + std::string shard_base_file 
= merged_index_prefix + "_subshard-" + std::to_string(p) + ".bin"; + + std::string shard_ids_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_ids_uint32.bin"; + + std::string shard_labels_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_labels.txt"; + + retrieve_shard_data_from_ids(base_file, shard_ids_file, shard_base_file); + + std::string shard_index_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_mem.index"; + + // [todo:jinweizhang] can we try sub graph with high degree here ? + diskann::IndexWriteParameters low_degree_params = diskann::IndexWriteParametersBuilder(L, 2 * R / 3) + .with_filter_list_size(Lf) + .with_saturate_graph(false) + .with_num_threads(num_threads) + .build(); + + uint64_t shard_base_dim, shard_base_pts; + get_bin_metadata(shard_base_file, shard_base_pts, shard_base_dim); + + diskann::Index _index(compareMetric, shard_base_dim, shard_base_pts, + std::make_shared(low_degree_params), nullptr, + defaults::NUM_FROZEN_POINTS_STATIC, false, false, false, build_pq_bytes > 0, + build_pq_bytes, use_opq); + if (!use_filters) + { + _index.build(shard_base_file.c_str(), shard_base_pts); + } + else + { + diskann::extract_shard_labels(label_file, shard_ids_file, shard_labels_file); + if (universal_label != "") + { // indicates no universal label + LabelT unv_label_as_num = 0; + _index.set_universal_label(unv_label_as_num); + } + _index.build_filtered_index(shard_base_file.c_str(), shard_labels_file, shard_base_pts); + } + _index.save(shard_index_file.c_str()); + // copy universal label file from first shard to the final destination + // index, since all shards anyway share the universal label + if (p == 0) + { + std::string shard_universal_label_file = shard_index_file + "_universal_label.txt"; + if (universal_label != "") + { + copy_file(shard_universal_label_file, final_index_universal_label_file); + } + } + + std::remove(shard_base_file.c_str()); + + diskann::cout << 
timer.elapsed_seconds_for_step("building indices on shards") << std::endl; + + return 0; + //diskann::merge_shards(merged_index_prefix + "_subshard-", "_mem.index", merged_index_prefix + "_subshard-", + // "_ids_uint32.bin", num_parts, R, mem_index_path, medoids_file, use_filters, + // labels_to_medoids_file); + //diskann::cout << timer.elapsed_seconds_for_step("merging indices") << std::endl; + + //// delete tempFiles + //for (int p = 0; p < num_parts; p++) + //{ + // std::string shard_base_file = merged_index_prefix + "_subshard-" + std::to_string(p) + ".bin"; + // std::string shard_id_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_ids_uint32.bin"; + // std::string shard_labels_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_labels.txt"; + // std::string shard_index_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_mem.index"; + // std::string shard_index_file_data = shard_index_file + ".data"; + + // std::remove(shard_base_file.c_str()); + // std::remove(shard_id_file.c_str()); + // std::remove(shard_index_file.c_str()); + // std::remove(shard_index_file_data.c_str()); + // if (use_filters) + // { + // std::string shard_index_label_file = shard_index_file + "_labels.txt"; + // std::string shard_index_univ_label_file = shard_index_file + "_universal_label.txt"; + // std::string shard_index_label_map_file = shard_index_file + "_labels_to_medoids.txt"; + // std::remove(shard_labels_file.c_str()); + // std::remove(shard_index_label_file.c_str()); + // std::remove(shard_index_label_map_file.c_str()); + // std::remove(shard_index_univ_label_file.c_str()); + // } + //} + //return 0; +} + + +template +int merge_split_vamana_index(std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, + double sampling_rate, double ram_budget, std::string mem_index_path, + std::string medoids_file, std::string centroids_file, size_t build_pq_bytes, bool use_opq, + uint32_t num_threads, bool use_filters, const 
std::string &label_file, + const std::string &labels_to_medoids_file, const std::string &universal_label, + const uint32_t Lf, const uint32_t num_parts) +{ + // where the universal label is to be saved in the final graph + Timer timer; + int total_parts = num_parts; + std::string final_index_universal_label_file = mem_index_path + "_universal_label.txt"; + + std::string merged_index_prefix = mem_index_path + "_tempFiles"; + + + diskann::merge_shards(merged_index_prefix + "_subshard-", "_mem.index", merged_index_prefix + "_subshard-", + "_ids_uint32.bin", total_parts, R, mem_index_path, medoids_file, use_filters, + labels_to_medoids_file); + diskann::cout << timer.elapsed_seconds_for_step("merging indices") << std::endl; + + // delete tempFiles + for (int p = 0; p < total_parts; p++) + { + std::string shard_base_file = merged_index_prefix + "_subshard-" + std::to_string(p) + ".bin"; + std::string shard_id_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_ids_uint32.bin"; + std::string shard_labels_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_labels.txt"; + std::string shard_index_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_mem.index"; + std::string shard_index_file_data = shard_index_file + ".data"; + + std::remove(shard_base_file.c_str()); + std::remove(shard_id_file.c_str()); + std::remove(shard_index_file.c_str()); + std::remove(shard_index_file_data.c_str()); + if (use_filters) + { + std::string shard_index_label_file = shard_index_file + "_labels.txt"; + std::string shard_index_univ_label_file = shard_index_file + "_universal_label.txt"; + std::string shard_index_label_map_file = shard_index_file + "_labels_to_medoids.txt"; + std::remove(shard_labels_file.c_str()); + std::remove(shard_index_label_file.c_str()); + std::remove(shard_index_label_map_file.c_str()); + std::remove(shard_index_univ_label_file.c_str()); + } + } + return 0; +} + // General purpose support for DiskANN interface // optimizes the 
beamwidth to maximize QPS for a given L_search subject to @@ -1283,7 +1512,7 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const // Gopal. Splitting diskann_dll into separate DLLs for search and build. // This code should only be available in the "build" DLL. -#if defined(RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) MallocExtension::instance()->ReleaseFreeMemory(); #endif @@ -1339,100 +1568,964 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const return 0; } -template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, - const std::string mem_index_file, - const std::string output_file, - const std::string reorder_data_file); -template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, - const std::string mem_index_file, - const std::string output_file, - const std::string reorder_data_file); -template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, const std::string mem_index_file, - const std::string output_file, - const std::string reorder_data_file); -template DISKANN_DLLEXPORT int8_t *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, - uint64_t warmup_dim, uint64_t warmup_aligned_dim); -template DISKANN_DLLEXPORT uint8_t *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, - uint64_t warmup_dim, uint64_t warmup_aligned_dim); -template DISKANN_DLLEXPORT float *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, - uint64_t warmup_dim, uint64_t warmup_aligned_dim); +template +int split_subgraph_index(const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, const std::string &universal_label, const 
uint32_t filter_threshold, + const uint32_t Lf) +{ + std::stringstream parser; + parser << std::string(indexBuildParameters); + std::string cur_param; + std::vector param_list; + while (parser >> cur_param) + { + param_list.push_back(cur_param); + } + if (param_list.size() < 5 || param_list.size() > 9) + { + diskann::cout << "Correct usage of parameters is R (max degree)\n" + "L (indexing list size, better if >= R)\n" + "B (RAM limit of final index in GB)\n" + "M (memory limit while indexing)\n" + "T (number of threads for indexing)\n" + "B' (PQ bytes for disk index: optional parameter for " + "very large dimensional data)\n" + "reorder (set true to include full precision in data file" + ": optional paramter, use only when using disk PQ\n" + "build_PQ_byte (number of PQ bytes for inde build; set 0 to use " + "full precision vectors)\n" + "QD Quantized Dimension to overwrite the derived dim from B " + << std::endl; + return -1; + } -#ifdef EXEC_ENV_OLS -template DISKANN_DLLEXPORT int8_t *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, - uint64_t &warmup_num, uint64_t warmup_dim, - uint64_t warmup_aligned_dim); -template DISKANN_DLLEXPORT uint8_t *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, - uint64_t &warmup_num, uint64_t warmup_dim, - uint64_t warmup_aligned_dim); -template DISKANN_DLLEXPORT float *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, - uint64_t &warmup_num, uint64_t warmup_dim, - uint64_t warmup_aligned_dim); -#endif + if (!std::is_same::value && + (compareMetric == diskann::Metric::INNER_PRODUCT || compareMetric == diskann::Metric::COSINE)) + { + std::stringstream stream; + stream << "Disk-index build currently only supports floating point data for Max " + "Inner Product Search/ cosine similarity. 
" + << std::endl; + throw diskann::ANNException(stream.str(), -1); + } -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, int8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, uint8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, float *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); + size_t disk_pq_dims = 0; + bool use_disk_pq = false; + size_t build_pq_bytes = 0; -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, int8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, uint8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, float *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); + // if there is a 6th parameter, it means we compress the disk index + // vectors also using PQ data (for very large dimensionality data). If the + // provided parameter is 0, it means we store full vectors. 
+ if (param_list.size() > 5) + { + disk_pq_dims = atoi(param_list[5].c_str()); + use_disk_pq = true; + if (disk_pq_dims == 0) + use_disk_pq = false; + } -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -// LabelT = uint16 -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const 
uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, + bool reorder_data = false; + if (param_list.size() >= 7) + { + if (1 == atoi(param_list[6].c_str())) + { + reorder_data = true; + } + } + + if (param_list.size() >= 8) + { + build_pq_bytes = atoi(param_list[7].c_str()); + } + + std::string base_file(dataFilePath); + std::string data_file_to_use = base_file; + std::string labels_file_original = label_file; + std::string index_prefix_path(indexFilePath); + std::string labels_file_to_use = index_prefix_path + "_label_formatted.txt"; + std::string pq_pivots_path_base = codebook_prefix; + std::string pq_pivots_path = file_exists(pq_pivots_path_base) ? pq_pivots_path_base + "_pq_pivots.bin" + : index_prefix_path + "_pq_pivots.bin"; + std::string pq_compressed_vectors_path = index_prefix_path + "_pq_compressed.bin"; + std::string mem_index_path = index_prefix_path + "_mem.index"; + std::string disk_index_path = index_prefix_path + "_disk.index"; + std::string medoids_path = disk_index_path + "_medoids.bin"; + std::string centroids_path = disk_index_path + "_centroids.bin"; + + std::string labels_to_medoids_path = disk_index_path + "_labels_to_medoids.txt"; + std::string mem_labels_file = mem_index_path + "_labels.txt"; + std::string disk_labels_file = disk_index_path + "_labels.txt"; + std::string mem_univ_label_file = mem_index_path + "_universal_label.txt"; + std::string disk_univ_label_file = disk_index_path + "_universal_label.txt"; + std::string disk_labels_int_map_file = disk_index_path + "_labels_map.txt"; + std::string dummy_remap_file = disk_index_path + "_dummy_remap.txt"; // remap will be used if we break-up points of + // high label-density to create copies + + std::string sample_base_prefix = index_prefix_path + "_sample"; + // optional, used if disk index file must store pq data + std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin"; + // optional, used if disk 
index must store pq data + std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin"; + std::string prepped_base = + index_prefix_path + + "_prepped_base.bin"; // temp file for storing pre-processed base file for cosine/ mips metrics + bool created_temp_file_for_processed_data = false; + + // output a new base file which contains extra dimension with sqrt(1 - + // ||x||^2/M^2) for every x, M is max norm of all points. Extra space on + // disk needed! + if (compareMetric == diskann::Metric::INNER_PRODUCT) + { + Timer timer; + std::cout << "Using Inner Product search, so need to pre-process base " + "data into temp file. Please ensure there is additional " + "(n*(d+1)*4) bytes for storing pre-processed base vectors, " + "apart from the interim indices created by DiskANN and the final index." + << std::endl; + data_file_to_use = prepped_base; + float max_norm_of_base = diskann::prepare_base_for_inner_products(base_file, prepped_base); + std::string norm_file = disk_index_path + "_max_base_norm.bin"; + diskann::save_bin(norm_file, &max_norm_of_base, 1, 1); + diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl; + created_temp_file_for_processed_data = true; + } + else if (compareMetric == diskann::Metric::COSINE) + { + Timer timer; + std::cout << "Normalizing data for cosine to temporary file, please ensure there is additional " + "(n*d*4) bytes for storing normalized base vectors, " + "apart from the interim indices created by DiskANN and the final index." 
+ << std::endl; + data_file_to_use = prepped_base; + diskann::normalize_data_file(base_file, prepped_base); + diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for cosine") << std::endl; + created_temp_file_for_processed_data = true; + } + + uint32_t R = (uint32_t)atoi(param_list[0].c_str()); + uint32_t L = (uint32_t)atoi(param_list[1].c_str()); + + double final_index_ram_limit = get_memory_budget(param_list[2]); + if (final_index_ram_limit <= 0) + { + std::cerr << "Insufficient memory budget (or string was not in right " + "format). Should be > 0." + << std::endl; + return -1; + } + double indexing_ram_budget = (float)atof(param_list[3].c_str()); + if (indexing_ram_budget <= 0) + { + std::cerr << "Not building index. Please provide more RAM budget" << std::endl; + return -1; + } + uint32_t num_threads = (uint32_t)atoi(param_list[4].c_str()); + + if (num_threads != 0) + { + omp_set_num_threads(num_threads); + mkl_set_num_threads(num_threads); + } + + diskann::cout << "Starting index build: R=" << R << " L=" << L << " Query RAM budget: " << final_index_ram_limit + << " Indexing ram budget: " << indexing_ram_budget << " T: " << num_threads << std::endl; + + auto s = std::chrono::high_resolution_clock::now(); + + // If there is filter support, we break-up points which have too many labels + // into replica dummy points which evenly distribute the filters. 
The rest + // of index build happens on the augmented base and labels + std::string augmented_data_file, augmented_labels_file; + if (use_filters) + { + convert_labels_string_to_int(labels_file_original, labels_file_to_use, disk_labels_int_map_file, + universal_label); + augmented_data_file = index_prefix_path + "_augmented_data.bin"; + augmented_labels_file = index_prefix_path + "_augmented_labels.txt"; + if (filter_threshold != 0) + { + dummy_remap_file = index_prefix_path + "_dummy_remap.txt"; + breakup_dense_points(data_file_to_use, labels_file_to_use, filter_threshold, augmented_data_file, + augmented_labels_file, + dummy_remap_file); // RKNOTE: This has large memory footprint, + // need to make this streaming + data_file_to_use = augmented_data_file; + labels_file_to_use = augmented_labels_file; + } + } + + size_t points_num, dim; + + Timer timer; + diskann::get_bin_metadata(data_file_to_use.c_str(), points_num, dim); + const double p_val = ((double)MAX_PQ_TRAINING_SET_SIZE / (double)points_num); + + if (use_disk_pq) + { + generate_disk_quantized_data(data_file_to_use, disk_pq_pivots_path, disk_pq_compressed_vectors_path, + compareMetric, p_val, disk_pq_dims); + } + size_t num_pq_chunks = (size_t)(std::floor)(uint64_t(final_index_ram_limit / points_num)); + + num_pq_chunks = num_pq_chunks <= 0 ? 1 : num_pq_chunks; + num_pq_chunks = num_pq_chunks > dim ? dim : num_pq_chunks; + num_pq_chunks = num_pq_chunks > MAX_PQ_CHUNKS ? 
MAX_PQ_CHUNKS : num_pq_chunks; + + if (param_list.size() >= 9 && atoi(param_list[8].c_str()) <= MAX_PQ_CHUNKS && atoi(param_list[8].c_str()) > 0) + { + std::cout << "Use quantized dimension (QD) to overwrite derived quantized " + "dimension from search_DRAM_budget (B)" + << std::endl; + num_pq_chunks = atoi(param_list[8].c_str()); + } + ///------------ early estimation for ram useage + size_t base_num, base_dim; + diskann::get_bin_metadata(data_file_to_use.c_str(), base_num, base_dim); + + double full_index_ram = estimate_ram_usage(base_num, (uint32_t)base_dim, sizeof(T), R); + + // TODO: Make this honest when there is filter support + if (full_index_ram < indexing_ram_budget * 1024 * 1024 * 1024) + { + diskann::cout << "Full index fits in RAM budget, should consume at most " + << full_index_ram / (1024 * 1024 * 1024) << "GiBs, so building in one shot" << std::endl; + diskann::cout << "Dont even need train OPQ, leave room for setting parition offset" << std::endl; + + return 1; + } + ///--------------early estimation for ram useage + /// + diskann::cout << "Compressing " << dim << "-dimensional data into " << num_pq_chunks << " bytes per vector." + << std::endl; + + generate_quantized_data(data_file_to_use, pq_pivots_path, pq_compressed_vectors_path, compareMetric, p_val, + num_pq_chunks, use_opq, codebook_prefix); + diskann::cout << timer.elapsed_seconds_for_step("generating quantized data") << std::endl; + +// Gopal. Splitting diskann_dll into separate DLLs for search and build. +// This code should only be available in the "build" DLL. +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) + MallocExtension::instance()->ReleaseFreeMemory(); +#endif + // Whether it is cosine or inner product, we still L2 metric due to the pre-processing. 
+ timer.reset(); + int num_parts = diskann::split_merged_vamana_index(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val, + indexing_ram_budget, mem_index_path, medoids_path, centroids_path, + build_pq_bytes, use_opq, num_threads, use_filters, labels_file_to_use, + labels_to_medoids_path, universal_label, Lf); + //diskann::cout << timer.elapsed_seconds_for_step("building merged vamana index") << std::endl; + + //timer.reset(); + //if (!use_disk_pq) + //{ + // diskann::create_disk_layout(data_file_to_use.c_str(), mem_index_path, disk_index_path); + //} + //else + //{ + // if (!reorder_data) + // diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path); + // else + // diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path, + // data_file_to_use.c_str()); + //} + //diskann::cout << timer.elapsed_seconds_for_step("generating disk layout") << std::endl; + + //double ten_percent_points = std::ceil(points_num * 0.1); + //double num_sample_points = + // ten_percent_points > MAX_SAMPLE_POINTS_FOR_WARMUP ? 
MAX_SAMPLE_POINTS_FOR_WARMUP : ten_percent_points; + //double sample_sampling_rate = num_sample_points / points_num; + //gen_random_slice(data_file_to_use.c_str(), sample_base_prefix, sample_sampling_rate); + //if (use_filters) + //{ + // copy_file(labels_file_to_use, disk_labels_file); + // std::remove(mem_labels_file.c_str()); + // if (universal_label != "") + // { + // copy_file(mem_univ_label_file, disk_univ_label_file); + // std::remove(mem_univ_label_file.c_str()); + // } + // std::remove(augmented_data_file.c_str()); + // std::remove(augmented_labels_file.c_str()); + // std::remove(labels_file_to_use.c_str()); + //} + //if (created_temp_file_for_processed_data) + // std::remove(prepped_base.c_str()); + //std::remove(mem_index_path.c_str()); + //if (use_disk_pq) + // std::remove(disk_pq_compressed_vectors_path.c_str()); + + //auto e = std::chrono::high_resolution_clock::now(); + //std::chrono::duration diff = e - s; + //diskann::cout << "Indexing time: " << diff.count() << std::endl; + + return num_parts; +} + + + +template +int build_subgraph_index(const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, const std::string &universal_label, const uint32_t filter_threshold, + const uint32_t Lf, const uint32_t subshard_id) +{ + std::stringstream parser; + parser << std::string(indexBuildParameters); + std::string cur_param; + std::vector param_list; + while (parser >> cur_param) + { + param_list.push_back(cur_param); + } + if (param_list.size() < 5 || param_list.size() > 9) + { + diskann::cout << "Correct usage of parameters is R (max degree)\n" + "L (indexing list size, better if >= R)\n" + "B (RAM limit of final index in GB)\n" + "M (memory limit while indexing)\n" + "T (number of threads for indexing)\n" + "B' (PQ bytes for disk index: optional parameter for " + "very large dimensional data)\n" + 
"reorder (set true to include full precision in data file" + ": optional paramter, use only when using disk PQ\n" + "build_PQ_byte (number of PQ bytes for inde build; set 0 to use " + "full precision vectors)\n" + "QD Quantized Dimension to overwrite the derived dim from B " + << std::endl; + return -1; + } + + if (!std::is_same::value && + (compareMetric == diskann::Metric::INNER_PRODUCT || compareMetric == diskann::Metric::COSINE)) + { + std::stringstream stream; + stream << "Disk-index build currently only supports floating point data for Max " + "Inner Product Search/ cosine similarity. " + << std::endl; + throw diskann::ANNException(stream.str(), -1); + } + + size_t disk_pq_dims = 0; + bool use_disk_pq = false; + size_t build_pq_bytes = 0; + + // if there is a 6th parameter, it means we compress the disk index + // vectors also using PQ data (for very large dimensionality data). If the + // provided parameter is 0, it means we store full vectors. + if (param_list.size() > 5) + { + disk_pq_dims = atoi(param_list[5].c_str()); + use_disk_pq = true; + if (disk_pq_dims == 0) + use_disk_pq = false; + } + + bool reorder_data = false; + if (param_list.size() >= 7) + { + if (1 == atoi(param_list[6].c_str())) + { + reorder_data = true; + } + } + + if (param_list.size() >= 8) + { + build_pq_bytes = atoi(param_list[7].c_str()); + } + + std::string base_file(dataFilePath); + std::string data_file_to_use = base_file; + std::string labels_file_original = label_file; + std::string index_prefix_path(indexFilePath); + std::string labels_file_to_use = index_prefix_path + "_label_formatted.txt"; + std::string pq_pivots_path_base = codebook_prefix; + std::string pq_pivots_path = file_exists(pq_pivots_path_base) ? 
pq_pivots_path_base + "_pq_pivots.bin" + : index_prefix_path + "_pq_pivots.bin"; + std::string pq_compressed_vectors_path = index_prefix_path + "_pq_compressed.bin"; + std::string mem_index_path = index_prefix_path + "_mem.index"; + std::string disk_index_path = index_prefix_path + "_disk.index"; + std::string medoids_path = disk_index_path + "_medoids.bin"; + std::string centroids_path = disk_index_path + "_centroids.bin"; + + std::string labels_to_medoids_path = disk_index_path + "_labels_to_medoids.txt"; + std::string mem_labels_file = mem_index_path + "_labels.txt"; + std::string disk_labels_file = disk_index_path + "_labels.txt"; + std::string mem_univ_label_file = mem_index_path + "_universal_label.txt"; + std::string disk_univ_label_file = disk_index_path + "_universal_label.txt"; + std::string disk_labels_int_map_file = disk_index_path + "_labels_map.txt"; + std::string dummy_remap_file = disk_index_path + "_dummy_remap.txt"; // remap will be used if we break-up points of + // high label-density to create copies + + std::string sample_base_prefix = index_prefix_path + "_sample"; + // optional, used if disk index file must store pq data + std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin"; + // optional, used if disk index must store pq data + std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin"; + std::string prepped_base = + index_prefix_path + + "_prepped_base.bin"; // temp file for storing pre-processed base file for cosine/ mips metrics + bool created_temp_file_for_processed_data = false; + + // output a new base file which contains extra dimension with sqrt(1 - + // ||x||^2/M^2) for every x, M is max norm of all points. Extra space on + // disk needed! + if (compareMetric == diskann::Metric::INNER_PRODUCT) + { + Timer timer; + std::cout << "Using Inner Product search, so need to pre-process base " + "data into temp file. 
Please ensure there is additional " + "(n*(d+1)*4) bytes for storing pre-processed base vectors, " + "apart from the interim indices created by DiskANN and the final index." + << std::endl; + data_file_to_use = prepped_base; + float max_norm_of_base = diskann::prepare_base_for_inner_products(base_file, prepped_base); + std::string norm_file = disk_index_path + "_max_base_norm.bin"; + diskann::save_bin(norm_file, &max_norm_of_base, 1, 1); + diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl; + created_temp_file_for_processed_data = true; + } + else if (compareMetric == diskann::Metric::COSINE) + { + Timer timer; + std::cout << "Normalizing data for cosine to temporary file, please ensure there is additional " + "(n*d*4) bytes for storing normalized base vectors, " + "apart from the interim indices created by DiskANN and the final index." + << std::endl; + data_file_to_use = prepped_base; + diskann::normalize_data_file(base_file, prepped_base); + diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for cosine") << std::endl; + created_temp_file_for_processed_data = true; + } + + uint32_t R = (uint32_t)atoi(param_list[0].c_str()); + uint32_t L = (uint32_t)atoi(param_list[1].c_str()); + + double final_index_ram_limit = get_memory_budget(param_list[2]); + if (final_index_ram_limit <= 0) + { + std::cerr << "Insufficient memory budget (or string was not in right " + "format). Should be > 0." + << std::endl; + return -1; + } + double indexing_ram_budget = (float)atof(param_list[3].c_str()); + if (indexing_ram_budget <= 0) + { + std::cerr << "Not building index. 
Please provide more RAM budget" << std::endl; + return -1; + } + uint32_t num_threads = (uint32_t)atoi(param_list[4].c_str()); + + if (num_threads != 0) + { + omp_set_num_threads(num_threads); + mkl_set_num_threads(num_threads); + } + + diskann::cout << "Starting index build: R=" << R << " L=" << L << " Query RAM budget: " << final_index_ram_limit + << " Indexing ram budget: " << indexing_ram_budget << " T: " << num_threads << std::endl; + + auto s = std::chrono::high_resolution_clock::now(); + + // If there is filter support, we break-up points which have too many labels + // into replica dummy points which evenly distribute the filters. The rest + // of index build happens on the augmented base and labels + std::string augmented_data_file, augmented_labels_file; + if (use_filters) + { + convert_labels_string_to_int(labels_file_original, labels_file_to_use, disk_labels_int_map_file, + universal_label); + augmented_data_file = index_prefix_path + "_augmented_data.bin"; + augmented_labels_file = index_prefix_path + "_augmented_labels.txt"; + if (filter_threshold != 0) + { + dummy_remap_file = index_prefix_path + "_dummy_remap.txt"; + breakup_dense_points(data_file_to_use, labels_file_to_use, filter_threshold, augmented_data_file, + augmented_labels_file, + dummy_remap_file); // RKNOTE: This has large memory footprint, + // need to make this streaming + data_file_to_use = augmented_data_file; + labels_file_to_use = augmented_labels_file; + } + } + + size_t points_num, dim; + + Timer timer; + diskann::get_bin_metadata(data_file_to_use.c_str(), points_num, dim); + const double p_val = ((double)MAX_PQ_TRAINING_SET_SIZE / (double)points_num); + +// if (use_disk_pq) +// { +// generate_disk_quantized_data(data_file_to_use, disk_pq_pivots_path, disk_pq_compressed_vectors_path, +// compareMetric, p_val, disk_pq_dims); +// } +// size_t num_pq_chunks = (size_t)(std::floor)(uint64_t(final_index_ram_limit / points_num)); +// +// num_pq_chunks = num_pq_chunks <= 0 ? 
1 : num_pq_chunks; +// num_pq_chunks = num_pq_chunks > dim ? dim : num_pq_chunks; +// num_pq_chunks = num_pq_chunks > MAX_PQ_CHUNKS ? MAX_PQ_CHUNKS : num_pq_chunks; +// +// if (param_list.size() >= 9 && atoi(param_list[8].c_str()) <= MAX_PQ_CHUNKS && atoi(param_list[8].c_str()) > 0) +// { +// std::cout << "Use quantized dimension (QD) to overwrite derived quantized " +// "dimension from search_DRAM_budget (B)" +// << std::endl; +// num_pq_chunks = atoi(param_list[8].c_str()); +// } +// +// diskann::cout << "Compressing " << dim << "-dimensional data into " << num_pq_chunks << " bytes per vector." +// << std::endl; +// +// generate_quantized_data(data_file_to_use, pq_pivots_path, pq_compressed_vectors_path, compareMetric, p_val, +// num_pq_chunks, use_opq, codebook_prefix); +// diskann::cout << timer.elapsed_seconds_for_step("generating quantized data") << std::endl; +// +//// Gopal. Splitting diskann_dll into separate DLLs for search and build. +//// This code should only be available in the "build" DLL. +//#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) +// MallocExtension::instance()->ReleaseFreeMemory(); +//#endif +// // Whether it is cosine or inner product, we still L2 metric due to the pre-processing. 
+// timer.reset(); + diskann::build_split_merged_vamana_index(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val, + indexing_ram_budget, mem_index_path, medoids_path, centroids_path, + build_pq_bytes, use_opq, num_threads, use_filters, labels_file_to_use, + labels_to_medoids_path, universal_label, Lf, subshard_id); + diskann::cout << timer.elapsed_seconds_for_step("building merged vamana index for shard") << subshard_id << std::endl; + + /* timer.reset(); + if (!use_disk_pq) + { + diskann::create_disk_layout(data_file_to_use.c_str(), mem_index_path, disk_index_path); + } + else + { + if (!reorder_data) + diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path); + else + diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path, + data_file_to_use.c_str()); + } + diskann::cout << timer.elapsed_seconds_for_step("generating disk layout") << std::endl; + + double ten_percent_points = std::ceil(points_num * 0.1); + double num_sample_points = + ten_percent_points > MAX_SAMPLE_POINTS_FOR_WARMUP ? 
MAX_SAMPLE_POINTS_FOR_WARMUP : ten_percent_points; + double sample_sampling_rate = num_sample_points / points_num; + gen_random_slice(data_file_to_use.c_str(), sample_base_prefix, sample_sampling_rate); + if (use_filters) + { + copy_file(labels_file_to_use, disk_labels_file); + std::remove(mem_labels_file.c_str()); + if (universal_label != "") + { + copy_file(mem_univ_label_file, disk_univ_label_file); + std::remove(mem_univ_label_file.c_str()); + } + std::remove(augmented_data_file.c_str()); + std::remove(augmented_labels_file.c_str()); + std::remove(labels_file_to_use.c_str()); + } + if (created_temp_file_for_processed_data) + std::remove(prepped_base.c_str()); + std::remove(mem_index_path.c_str()); + if (use_disk_pq) + std::remove(disk_pq_compressed_vectors_path.c_str()); + + auto e = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = e - s; + diskann::cout << "Indexing time: " << diff.count() << std::endl;*/ + + return 0; +} + +template +int merge_subgraph_index(const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, const std::string &codebook_prefix, + bool use_filters, const std::string &label_file, const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf, const uint32_t num_parts) +{ + std::stringstream parser; + parser << std::string(indexBuildParameters); + std::string cur_param; + std::vector param_list; + while (parser >> cur_param) + { + param_list.push_back(cur_param); + } + if (param_list.size() < 5 || param_list.size() > 9) + { + diskann::cout << "Correct usage of parameters is R (max degree)\n" + "L (indexing list size, better if >= R)\n" + "B (RAM limit of final index in GB)\n" + "M (memory limit while indexing)\n" + "T (number of threads for indexing)\n" + "B' (PQ bytes for disk index: optional parameter for " + "very large dimensional data)\n" + "reorder (set true to include full precision in data file" + ": 
optional paramter, use only when using disk PQ\n" + "build_PQ_byte (number of PQ bytes for inde build; set 0 to use " + "full precision vectors)\n" + "QD Quantized Dimension to overwrite the derived dim from B " + << std::endl; + return -1; + } + + if (!std::is_same::value && compareMetric == diskann::Metric::INNER_PRODUCT) + { + std::stringstream stream; + stream << "DiskANN currently only supports floating point data for Max " + "Inner Product Search. " + << std::endl; + throw diskann::ANNException(stream.str(), -1); + } + + size_t disk_pq_dims = 0; + bool use_disk_pq = false; + size_t build_pq_bytes = 0; + + // if there is a 6th parameter, it means we compress the disk index + // vectors also using PQ data (for very large dimensionality data). If the + // provided parameter is 0, it means we store full vectors. + if (param_list.size() > 5) + { + disk_pq_dims = atoi(param_list[5].c_str()); + use_disk_pq = true; + if (disk_pq_dims == 0) + use_disk_pq = false; + } + + bool reorder_data = false; + if (param_list.size() >= 7) + { + if (1 == atoi(param_list[6].c_str())) + { + reorder_data = true; + } + } + + if (param_list.size() >= 8) + { + build_pq_bytes = atoi(param_list[7].c_str()); + } + + std::string base_file(dataFilePath); + std::string data_file_to_use = base_file; + std::string labels_file_original = label_file; + std::string index_prefix_path(indexFilePath); + std::string labels_file_to_use = index_prefix_path + "_label_formatted.txt"; + std::string pq_pivots_path_base = codebook_prefix; + std::string pq_pivots_path = file_exists(pq_pivots_path_base) ? 
pq_pivots_path_base + "_pq_pivots.bin" + : index_prefix_path + "_pq_pivots.bin"; + std::string pq_compressed_vectors_path = index_prefix_path + "_pq_compressed.bin"; + std::string mem_index_path = index_prefix_path + "_mem.index"; + std::string disk_index_path = index_prefix_path + "_disk.index"; + std::string medoids_path = disk_index_path + "_medoids.bin"; + std::string centroids_path = disk_index_path + "_centroids.bin"; + + std::string labels_to_medoids_path = disk_index_path + "_labels_to_medoids.txt"; + std::string mem_labels_file = mem_index_path + "_labels.txt"; + std::string disk_labels_file = disk_index_path + "_labels.txt"; + std::string mem_univ_label_file = mem_index_path + "_universal_label.txt"; + std::string disk_univ_label_file = disk_index_path + "_universal_label.txt"; + std::string disk_labels_int_map_file = disk_index_path + "_labels_map.txt"; + std::string dummy_remap_file = disk_index_path + "_dummy_remap.txt"; // remap will be used if we break-up points of + // high label-density to create copies + + std::string sample_base_prefix = index_prefix_path + "_sample"; + // optional, used if disk index file must store pq data + std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin"; + // optional, used if disk index must store pq data + std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin"; + + // output a new base file which contains extra dimension with sqrt(1 - + // ||x||^2/M^2) for every x, M is max norm of all points. Extra space on + // disk needed! + if (compareMetric == diskann::Metric::INNER_PRODUCT) + { + Timer timer; + std::cout << "Using Inner Product search, so need to pre-process base " + "data into temp file. Please ensure there is additional " + "(n*(d+1)*4) bytes for storing pre-processed base vectors, " + "apart from the intermin indices and final index." 
+ << std::endl; + std::string prepped_base = index_prefix_path + "_prepped_base.bin"; + data_file_to_use = prepped_base; + float max_norm_of_base = diskann::prepare_base_for_inner_products(base_file, prepped_base); + std::string norm_file = disk_index_path + "_max_base_norm.bin"; + diskann::save_bin(norm_file, &max_norm_of_base, 1, 1); + diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl; + } + + uint32_t R = (uint32_t)atoi(param_list[0].c_str()); + uint32_t L = (uint32_t)atoi(param_list[1].c_str()); + + double final_index_ram_limit = get_memory_budget(param_list[2]); + if (final_index_ram_limit <= 0) + { + std::cerr << "Insufficient memory budget (or string was not in right " + "format). Should be > 0." + << std::endl; + return -1; + } + double indexing_ram_budget = (float)atof(param_list[3].c_str()); + if (indexing_ram_budget <= 0) + { + std::cerr << "Not building index. Please provide more RAM budget" << std::endl; + return -1; + } + uint32_t num_threads = (uint32_t)atoi(param_list[4].c_str()); + + if (num_threads != 0) + { + omp_set_num_threads(num_threads); + mkl_set_num_threads(num_threads); + } + + diskann::cout << "Starting index build: R=" << R << " L=" << L << " Query RAM budget: " << final_index_ram_limit + << " Indexing ram budget: " << indexing_ram_budget << " T: " << num_threads << std::endl; + + auto s = std::chrono::high_resolution_clock::now(); + + // If there is filter support, we break-up points which have too many labels + // into replica dummy points which evenly distribute the filters. 
The rest + // of index build happens on the augmented base and labels + std::string augmented_data_file, augmented_labels_file; + if (use_filters) + { + convert_labels_string_to_int(labels_file_original, labels_file_to_use, disk_labels_int_map_file, + universal_label); + augmented_data_file = index_prefix_path + "_augmented_data.bin"; + augmented_labels_file = index_prefix_path + "_augmented_labels.txt"; + if (filter_threshold != 0) + { + dummy_remap_file = index_prefix_path + "_dummy_remap.txt"; + breakup_dense_points(data_file_to_use, labels_file_to_use, filter_threshold, augmented_data_file, + augmented_labels_file, + dummy_remap_file); // RKNOTE: This has large memory footprint, + // need to make this streaming + data_file_to_use = augmented_data_file; + labels_file_to_use = augmented_labels_file; + } + } + + size_t points_num, dim; + + Timer timer; + diskann::get_bin_metadata(data_file_to_use.c_str(), points_num, dim); + const double p_val = ((double)MAX_PQ_TRAINING_SET_SIZE / (double)points_num); + + //if (use_disk_pq) + //{ + // generate_disk_quantized_data(data_file_to_use, disk_pq_pivots_path, disk_pq_compressed_vectors_path, + // compareMetric, p_val, disk_pq_dims); + //} + //size_t num_pq_chunks = (size_t)(std::floor)(uint64_t(final_index_ram_limit / points_num)); + + //num_pq_chunks = num_pq_chunks <= 0 ? 1 : num_pq_chunks; + //num_pq_chunks = num_pq_chunks > dim ? dim : num_pq_chunks; + //num_pq_chunks = num_pq_chunks > MAX_PQ_CHUNKS ? MAX_PQ_CHUNKS : num_pq_chunks; + + //if (param_list.size() >= 9 && atoi(param_list[8].c_str()) <= MAX_PQ_CHUNKS && atoi(param_list[8].c_str()) > 0) + //{ + // std::cout << "Use quantized dimension (QD) to overwrite derived quantized " + // "dimension from search_DRAM_budget (B)" + // << std::endl; + // num_pq_chunks = atoi(param_list[8].c_str()); + //} + + //diskann::cout << "Compressing " << dim << "-dimensional data into " << num_pq_chunks << " bytes per vector." 
+ // << std::endl; + + //generate_quantized_data(data_file_to_use, pq_pivots_path, pq_compressed_vectors_path, compareMetric, p_val, + // num_pq_chunks, use_opq, codebook_prefix); + //diskann::cout << timer.elapsed_seconds_for_step("generating quantized data") << std::endl; + +// Gopal. Splitting diskann_dll into separate DLLs for search and build. +// This code should only be available in the "build" DLL. +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) + MallocExtension::instance()->ReleaseFreeMemory(); +#endif + + timer.reset(); + diskann::merge_split_vamana_index(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val, + indexing_ram_budget, mem_index_path, medoids_path, centroids_path, + build_pq_bytes, use_opq, num_threads, use_filters, labels_file_to_use, + labels_to_medoids_path, universal_label, Lf, num_parts); + diskann::cout << timer.elapsed_seconds_for_step("merged vamana index") << std::endl; + + timer.reset(); + if (!use_disk_pq) + { + diskann::create_disk_layout(data_file_to_use.c_str(), mem_index_path, disk_index_path); + } + else + { + if (!reorder_data) + diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path); + else + diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path, + data_file_to_use.c_str()); + } + diskann::cout << timer.elapsed_seconds_for_step("generating disk layout") << std::endl; + + double ten_percent_points = std::ceil(points_num * 0.1); + double num_sample_points = + ten_percent_points > MAX_SAMPLE_POINTS_FOR_WARMUP ? 
MAX_SAMPLE_POINTS_FOR_WARMUP : ten_percent_points; + double sample_sampling_rate = num_sample_points / points_num; + gen_random_slice(data_file_to_use.c_str(), sample_base_prefix, sample_sampling_rate); + if (use_filters) + { + copy_file(labels_file_to_use, disk_labels_file); + std::remove(mem_labels_file.c_str()); + if (universal_label != "") + { + copy_file(mem_univ_label_file, disk_univ_label_file); + std::remove(mem_univ_label_file.c_str()); + } + std::remove(augmented_data_file.c_str()); + std::remove(augmented_labels_file.c_str()); + std::remove(labels_file_to_use.c_str()); + } + + std::remove(mem_index_path.c_str()); + if (use_disk_pq) + std::remove(disk_pq_compressed_vectors_path.c_str()); + + auto e = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = e - s; + diskann::cout << "Indexing time: " << diff.count() << std::endl; + + return 0; +} + +template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, + const std::string mem_index_file, + const std::string output_file, + const std::string reorder_data_file); +template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, + const std::string mem_index_file, + const std::string output_file, + const std::string reorder_data_file); +template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, const std::string mem_index_file, + const std::string output_file, + const std::string reorder_data_file); + +template DISKANN_DLLEXPORT int8_t *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, + uint64_t warmup_dim, uint64_t warmup_aligned_dim); +template DISKANN_DLLEXPORT uint8_t *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, + uint64_t warmup_dim, uint64_t warmup_aligned_dim); +template DISKANN_DLLEXPORT float *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, + uint64_t warmup_dim, uint64_t warmup_aligned_dim); + +#ifdef EXEC_ENV_OLS +template DISKANN_DLLEXPORT int8_t 
*load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, + uint64_t &warmup_num, uint64_t warmup_dim, + uint64_t warmup_aligned_dim); +template DISKANN_DLLEXPORT uint8_t *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, + uint64_t &warmup_num, uint64_t warmup_dim, + uint64_t warmup_aligned_dim); +template DISKANN_DLLEXPORT float *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, + uint64_t &warmup_num, uint64_t warmup_dim, + uint64_t warmup_aligned_dim); +#endif + +template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( + std::unique_ptr> &pFlashIndex, int8_t *tuning_sample, + uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); +template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( + std::unique_ptr> &pFlashIndex, uint8_t *tuning_sample, + uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); +template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( + std::unique_ptr> &pFlashIndex, float *tuning_sample, + uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); + +template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( + std::unique_ptr> &pFlashIndex, int8_t *tuning_sample, + uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); +template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( + std::unique_ptr> &pFlashIndex, uint8_t *tuning_sample, + uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); +template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( + std::unique_ptr> &pFlashIndex, float *tuning_sample, + uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); + +template DISKANN_DLLEXPORT int build_disk_index(const char 
*dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf); +template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf); +template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf); +// LabelT = uint16 +template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf); +template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf); +template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, diskann::Metric compareMetric, bool use_opq, const std::string 
&codebook_prefix, bool use_filters, const std::string &label_file, const std::string &universal_label, const uint32_t filter_threshold, const uint32_t Lf); +//---------- +template DISKANN_DLLEXPORT int split_subgraph_index(const char *dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf); + +//--------- + +//---------- +template DISKANN_DLLEXPORT int build_subgraph_index(const char *dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf, const uint32_t subshard_id); + +//---------- +template DISKANN_DLLEXPORT int merge_subgraph_index(const char *dataFilePath, const char *indexFilePath, + const char *indexBuildParameters, + diskann::Metric compareMetric, bool use_opq, + const std::string &codebook_prefix, bool use_filters, + const std::string &label_file, + const std::string &universal_label, + const uint32_t filter_threshold, const uint32_t Lf, const uint32_t num_parts); +//---------build inner-----======================================================================================================= template DISKANN_DLLEXPORT int build_merged_vamana_index( std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, @@ -1465,4 +2558,25 @@ template DISKANN_DLLEXPORT int build_merged_vamana_index( double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, const std::string 
&labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); -}; // namespace diskann + +//--------- +template DISKANN_DLLEXPORT int split_merged_vamana_index( + std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, + double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, + size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, + const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); +//--------- + +template DISKANN_DLLEXPORT int build_split_merged_vamana_index( + std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, + double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, + size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, + const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf, const uint32_t subshard_id); +//--------- +template DISKANN_DLLEXPORT int merge_split_vamana_index( + std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, + double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, + size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, + const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf, + const uint32_t num_parts); +}; // namespace diskann \ No newline at end of file diff --git a/src/dll/CMakeLists.txt b/src/dll/CMakeLists.txt index d00cfeb95..b4726668f 100644 --- a/src/dll/CMakeLists.txt +++ b/src/dll/CMakeLists.txt @@ -10,6 +10,9 @@ set(TARGET_DIR "$<$:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}>$<$ InMemGraphStore::load_impl(const std::str int 
InMemGraphStore::save_graph(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points, const uint32_t start) { + diskann::cout << "Start graph saving to ckp1 " << index_path_prefix << std::endl; std::ofstream out; open_file_to_write(out, index_path_prefix); - + diskann::cout << "Open_file finished " << index_path_prefix << std::endl; size_t file_offset = 0; out.seekp(file_offset, out.beg); + diskann::cout << "seekp finished " << index_path_prefix << std::endl; size_t index_size = 24; uint32_t max_degree = 0; + diskann::cout << "start writting bits chp1 " << index_path_prefix << std::endl; out.write((char *)&index_size, sizeof(uint64_t)); + diskann::cout << "start writting bits chp2 " << index_path_prefix << std::endl; out.write((char *)&_max_observed_degree, sizeof(uint32_t)); + diskann::cout << "start writting bits chp3 " << index_path_prefix << std::endl; uint32_t ep_u32 = start; out.write((char *)&ep_u32, sizeof(uint32_t)); + diskann::cout << "start writting bits chp4 " << index_path_prefix << std::endl; out.write((char *)&num_frozen_points, sizeof(size_t)); + diskann::cout << "start writting bits chp5 " << index_path_prefix << std::endl; + diskann::cout << "Start graph saving to ckp2 " << index_path_prefix << std::endl; // Note: num_points = _nd + _num_frozen_points for (uint32_t i = 0; i < num_points; i++) { @@ -226,6 +234,11 @@ int InMemGraphStore::save_graph(const std::string &index_path_prefix, const size out.write((char *)&index_size, sizeof(uint64_t)); out.write((char *)&max_degree, sizeof(uint32_t)); out.close(); + + std::ifstream file(index_path_prefix, std::ifstream::ate | std::ifstream::binary); + std::streamsize size = file.tellg(); + file.close(); + diskann::cout << "Graph saved to " << index_path_prefix << " size: " << size << std::endl; return (int)index_size; } diff --git a/src/index.cpp b/src/index.cpp index 3de3a3b7f..d101b5dc8 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -12,7 +12,7 @@ #include 
"tsl/robin_map.h" #include "tsl/robin_set.h" #include "windows_customizations.h" -#if defined(RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) #include "gperftools/malloc_extension.h" #endif @@ -1213,7 +1213,7 @@ void Index::prune_neighbors(const uint32_t location, std::vecto pruned_list.reserve(range); occlude_list(location, pool, alpha, range, max_candidate_size, pruned_list, scratch); - assert(pruned_list.size() <= range); + //assert(pruned_list.size() <= range); if (_saturate_graph && alpha > 1) { @@ -1234,12 +1234,12 @@ void Index::inter_insert(uint32_t n, std::vector &pru { const auto &src_pool = pruned_list; - assert(!src_pool.empty()); + //assert(!src_pool.empty()); for (auto des : src_pool) { - // des.loc is the loc of the neighbors of n - assert(des < _max_points + _num_frozen_pts); + //// des.loc is the loc of the neighbors of n + //assert(des < _max_points + _num_frozen_pts); // des_pool contains the neighbors of the neighbors of n std::vector copy_of_neighbors; bool prune_needed = false; @@ -1346,13 +1346,13 @@ template void Index 0); + /* assert(pruned_list.size() > 0);*/ { LockGuard guard(_locks[node]); _graph_store->set_neighbours(node, pruned_list); - assert(_graph_store->get_neighbours((location_t)node).size() <= _indexingRange); + //assert(_graph_store->get_neighbours((location_t)node).size() <= _indexingRange); } inter_insert(node, pruned_list, scratch); diff --git a/src/partition.cpp b/src/partition.cpp index 2d46f9faf..570d45c7d 100644 --- a/src/partition.cpp +++ b/src/partition.cpp @@ -11,7 +11,7 @@ #include "tsl/robin_map.h" #include "tsl/robin_set.h" -#if defined(RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) #include "gperftools/malloc_extension.h" #endif diff --git a/src/pq.cpp b/src/pq.cpp index 
c59fc2dce..a86bf39bf 100644 --- a/src/pq.cpp +++ b/src/pq.cpp @@ -2,7 +2,9 @@ // Licensed under the MIT license. #include "mkl.h" - +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) +#include "gperftools/malloc_extension.h" +#endif #include "pq.h" #include "partition.h" #include "math_utils.h" @@ -923,7 +925,7 @@ int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_cent } // Gopal. Splitting diskann_dll into separate DLLs for search and build. // This code should only be available in the "build" DLL. -#if defined(RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) +#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) MallocExtension::instance()->ReleaseFreeMemory(); #endif compressed_file_writer.close(); diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 133cee614..0af83e159 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -794,8 +794,8 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons std::string pq_table_bin = pivots_filepath; std::string pq_compressed_vectors = compressed_filepath; std::string _disk_index_file = index_filepath; - std::string medoids_file = std::string(_disk_index_file) + "_medoids.bin"; - std::string centroids_file = std::string(_disk_index_file) + "_centroids.bin"; + std::string medoids_file = "C:\\Users\\jinweizhang\\Downloads\\bannindex_disk.index_medoids.bin"; + std::string centroids_file = "C:\\Users\\jinweizhang\\Downloads\\bannindex_disk.index_centroids.bin"; std::string labels_file = std ::string(_disk_index_file) + "_labels.txt"; std::string labels_to_medoids = std ::string(_disk_index_file) + "_labels_to_medoids.txt";