This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch vector-index-dev
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/vector-index-dev by this push:
new 2a915e4b2ca enable faiss hnsw (#49745)
2a915e4b2ca is described below
commit 2a915e4b2ca613c8dda5ce55f0e5c79f935e1dbb
Author: zhiqiang <[email protected]>
AuthorDate: Wed Apr 2 10:00:40 2025 +0800
enable faiss hnsw (#49745)
```
CREATE TABLE `vector_table` (
`siteid` int(11) NULL DEFAULT "10" COMMENT "",
`embedding` array<float> NOT NULL COMMENT "",
`comment` text NULL,
INDEX idx_test_ann (`embedding`) USING ANN PROPERTIES(
"index_type"="hnsw",
"metric_type"="l2",
"dim"="8",
"max_degree"="100") COMMENT 'test diskann index',
INDEX idx_comment (`comment`) USING INVERTED PROPERTIES("support_phrase"
= "true", "parser" = "english", "lower_case" = "true") COMMENT 'inverted index
for comment' )
ENGINE=OLAP duplicate KEY(`siteid`) COMMENT "OLAP" DISTRIBUTED BY
HASH(`siteid`) BUCKETS 1 PROPERTIES ( "replication_num" = "1" );
INSERT INTO `vector_table` (`siteid`, `embedding`,`comment`) VALUES
(10, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,20],"emb1"),
(20, [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,30],"emb2")
--------------
Query OK, 2 rows affected (0.07 sec)
{'label':'label_858347013b14baf_b9db5d59b5e30322', 'status':'VISIBLE',
'txnId':'18029'}
```
```
I20250401 19:18:17.977408 3765348 faiss_vector_index.cpp:86] Faiss index
saved to faiss.idx, rows 2
```
---
be/CMakeLists.txt | 4 +
be/src/olap/rowset/segment_v2/ann_index_writer.cpp | 44 ++++++----
be/src/olap/rowset/segment_v2/ann_index_writer.h | 4 +-
be/src/vector/CMakeLists.txt | 16 +++-
be/src/vector/faiss_vector_index.cpp | 93 ++++++++++++++++++++++
be/src/vector/faiss_vector_index.h | 84 +++++++++++++++++++
be/src/vector/vector_index.h | 4 +-
build.sh | 1 -
8 files changed, 224 insertions(+), 26 deletions(-)
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 72dbf5cf185..f8a976a6e12 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -395,6 +395,10 @@ if (USE_DWARF)
add_compile_options(-gdwarf-5)
endif()
+if (BUILD_FAISS)
+ add_definitions(-DBUILD_FAISS)
+endif()
+
# For CMAKE_BUILD_TYPE=Debug
if (OS_MACOSX AND ARCH_ARM)
# Using -O0 may meet ARM64 branch out of range errors when linking with
tcmalloc.
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
index e08aafdcb69..5d7b70430f5 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
@@ -1,5 +1,9 @@
#include "olap/rowset/segment_v2/ann_index_writer.h"
+#ifdef BUILD_FAISS
+#include "vector/faiss_vector_index.h"
+#endif
+
namespace doris::segment_v2 {
AnnIndexColumnWriter::AnnIndexColumnWriter(const std::string& field_name,
@@ -27,22 +31,30 @@ std::string get_or_default(const std::map<std::string,
std::string>& properties,
}
Status AnnIndexColumnWriter::init_ann_index() {
- // if(get_or_default(_index_meta->properties(), INDEX_TYPE,
"")=="diskann"){
- // _vector_index_writer = std::make_shared<DiskannVectorIndex>(_dir);
- // std::shared_ptr<DiskannBuilderParameter> builderParameterPtr =
std::make_shared<DiskannBuilderParameter>();
- //
builderParameterPtr->with_dim(std::stoi(get_or_default(_index_meta->properties(),
DIM,"")))
- //
.with_L(std::stoi(get_or_default(_index_meta->properties(),
DISKANN_SEARCH_LIST,"")))
- //
.with_R(std::stoi(get_or_default(_index_meta->properties(),
DISKANN_MAX_DEGREE,"")))
- // .with_build_num_threads(8)
- // .with_sample_rate(1)
- // .with_indexing_ram_budget_mb(10*1024)
- // .with_search_ram_budget_mb(30)
- //
.with_mertic_type(VectorIndex::string_to_metric(get_or_default(_index_meta->properties(),
METRIC_TYPE,"")));
- //
_vector_index_writer->set_build_params(std::static_pointer_cast<BuilderParameter>(builderParameterPtr));
- // return Status::OK();
- // }else{
- return Status::NotSupported("ANN index is not support for now.");
- // }
+ _vector_index_writer = nullptr;
+ std::string index_type = get_or_default(_index_meta->properties(),
INDEX_TYPE, "");
+ if (index_type == "hnsw") {
+#ifdef BUILD_FAISS
+ std::shared_ptr<FaissVectorIndex> faiss_index_writer =
+ std::make_shared<FaissVectorIndex>(_dir);
+
+ FaissBuildParameter builderParameter;
+ builderParameter.index_type =
FaissBuildParameter::string_to_index_type("hnsw");
+ builderParameter.d =
std::stoi(get_or_default(_index_meta->properties(), DIM, "512"));
+ builderParameter.m =
std::stoi(get_or_default(_index_meta->properties(), MAX_DEGREE, "32"));
+ builderParameter.quantilizer =
FaissBuildParameter::string_to_quantilizer(
+ get_or_default(_index_meta->properties(), QUANTILIZER,
"flat"));
+ faiss_index_writer->set_build_params(builderParameter);
+ _vector_index_writer = faiss_index_writer;
+#else
+ return Status::NotSupported("Faiss index is not supported, please
build doris with faiss");
+#endif
+ }
+ if (_vector_index_writer == nullptr) {
+ return Status::NotSupported("Unsupported index type: " + index_type);
+ } else {
+ return Status::OK();
+ }
}
Status AnnIndexColumnWriter::open_index_directory() {
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.h
b/be/src/olap/rowset/segment_v2/ann_index_writer.h
index ce6e7c93a5d..2425fecc153 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.h
@@ -49,9 +49,9 @@ class AnnIndexColumnWriter : public IndexColumnWriter {
public:
static constexpr const char* INDEX_TYPE = "index_type";
static constexpr const char* METRIC_TYPE = "metric_type";
+ static constexpr const char* QUANTILIZER = "quantilizer";
static constexpr const char* DIM = "dim";
- static constexpr const char* DISKANN_MAX_DEGREE = "max_degree";
- static constexpr const char* DISKANN_SEARCH_LIST = "search_list";
+ static constexpr const char* MAX_DEGREE = "max_degree";
explicit AnnIndexColumnWriter(const std::string& field_name,
XIndexFileWriter* index_file_writer,
diff --git a/be/src/vector/CMakeLists.txt b/be/src/vector/CMakeLists.txt
index 8aa56be531d..816d7da34c8 100644
--- a/be/src/vector/CMakeLists.txt
+++ b/be/src/vector/CMakeLists.txt
@@ -18,17 +18,25 @@
# where to put generated libraries
set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/vector")
-set(HEADER_FILES
+set(VECTOR_LIB_SRC
vector_index.h
stream_wrapper.h
)
-# Use INTERFACE library type for header-only libraries
-add_library(vector INTERFACE)
+set(VECTOR_LIB_DEPENDENCIES)
+
if (BUILD_FAISS)
- target_link_libraries(vector INTERFACE faiss)
+ # append faiss src to VECTOR_LIB_SRC
+ list(APPEND VECTOR_LIB_SRC
+ faiss_vector_index.h
+ faiss_vector_index.cpp
+ )
+ list(APPEND VECTOR_LIB_DEPENDENCIES faiss)
endif()
+add_library(vector OBJECT ${VECTOR_LIB_SRC})
+target_link_libraries(vector PRIVATE ${VECTOR_LIB_DEPENDENCIES})
+
# Make the headers available to targets that link against the vector library
target_include_directories(vector INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
diff --git a/be/src/vector/faiss_vector_index.cpp
b/be/src/vector/faiss_vector_index.cpp
new file mode 100644
index 00000000000..d05fd920f60
--- /dev/null
+++ b/be/src/vector/faiss_vector_index.cpp
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "faiss_vector_index.h"
+
+#include <faiss/index_io.h>
+
+#include <memory>
+
+#include "CLucene/store/IndexOutput.h"
+#include "common/exception.h"
+#include "common/logging.h"
+#include "common/status.h"
+#include "faiss/IndexHNSW.h"
+#include "faiss/impl/io.h"
+
+struct FaissIndexWriter : faiss::IOWriter {
+public:
+ FaissIndexWriter() = default;
+ FaissIndexWriter(lucene::store::IndexOutput* output) : _output(output) {}
+ ~FaissIndexWriter() override {
+ if (_output != nullptr) {
+ _output->close();
+ delete _output;
+ }
+ }
+
+ size_t operator()(const void* ptr, size_t size, size_t nitems) override {
+ size_t bytes = size * nitems;
+ if (bytes > 0) {
+ try {
+ _output->writeBytes(reinterpret_cast<const uint8_t*>(ptr),
bytes);
+ } catch (const std::exception& e) {
+ throw doris::Exception(doris::ErrorCode::IO_ERROR,
+ "Failed to write vector index {}",
e.what());
+ }
+ }
+ return nitems;
+ };
+
+ lucene::store::IndexOutput* _output = nullptr;
+};
+
+doris::Status FaissVectorIndex::add(int n, const float* vec) {
+ DCHECK(vec != nullptr);
+
+ _index->add(n, vec);
+
+ return doris::Status::OK();
+}
+
+void FaissVectorIndex::set_build_params(const FaissBuildParameter& params) {
+ if (params.index_type == FaissBuildParameter::IndexType::BruteForce) {
+ _index = std::make_shared<faiss::IndexFlatL2>(params.d);
+ } else if (params.index_type == FaissBuildParameter::IndexType::HNSW) {
+ _index = std::make_shared<faiss::IndexHNSWFlat>(params.d, params.m);
+ } else {
+ throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
"Unsupported index type: {}",
+ static_cast<int>(params.index_type));
+ }
+}
+
+doris::Status FaissVectorIndex::search(const float* query_vec, int k,
SearchResult* result,
+ const SearchParameters* params) {
+ return doris::Status::OK();
+}
+
+doris::Status FaissVectorIndex::save() {
+ lucene::store::IndexOutput* idx_output = _dir->createOutput("faiss.idx");
+ auto writer = std::make_unique<FaissIndexWriter>(idx_output);
+ faiss::write_index(_index.get(), writer.get());
+ VLOG_DEBUG << fmt::format("Faiss index saved to faiss.idx, rows {}",
_index->ntotal);
+ return doris::Status::OK();
+}
+doris::Status FaissVectorIndex::load(Metric type) {
+ // Load the index from the directory
+ // This is a placeholder for actual implementation
+ return doris::Status::OK();
+}
\ No newline at end of file
diff --git a/be/src/vector/faiss_vector_index.h
b/be/src/vector/faiss_vector_index.h
new file mode 100644
index 00000000000..b99359efee7
--- /dev/null
+++ b/be/src/vector/faiss_vector_index.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/store/IndexInput.h>
+#include <CLucene/store/IndexOutput.h>
+#include <faiss/Index.h>
+
+#include <memory>
+#include <string>
+
+#include "common/status.h"
+#include "vector_index.h"
+
+struct FaissBuildParameter {
+ enum class IndexType { BruteForce, IVF, HNSW };
+
+ enum class Quantilizer { FLAT, SQ, PQ };
+
+ static IndexType string_to_index_type(const std::string& type) {
+ if (type == "brute_force") {
+ return IndexType::BruteForce;
+ } else if (type == "ivf") {
+ return IndexType::IVF;
+ } else if (type == "hnsw") {
+ return IndexType::HNSW;
+ }
+ return IndexType::HNSW; // default
+ }
+
+ static Quantilizer string_to_quantilizer(const std::string& type) {
+ if (type == "flat") {
+ return Quantilizer::FLAT;
+ } else if (type == "sq") {
+ return Quantilizer::SQ;
+ } else if (type == "pq") {
+ return Quantilizer::PQ;
+ }
+ return Quantilizer::FLAT; // default
+ }
+
+ // HNSW
+ int d = 0;
+ int m = 0;
+ IndexType index_type;
+ Quantilizer quantilizer;
+};
+
+class FaissVectorIndex : public VectorIndex {
+public:
+ FaissVectorIndex(std::shared_ptr<lucene::store::Directory> dir) :
_index(nullptr), _dir(dir) {}
+
+ doris::Status add(int n, const float* vec) override;
+
+ void set_build_params(const FaissBuildParameter& params);
+
+ doris::Status search(const float* query_vec, int k, SearchResult* result,
+ const SearchParameters* params = nullptr) override;
+
+ doris::Status save() override;
+
+ doris::Status load(Metric type) override;
+
+private:
+ std::shared_ptr<faiss::Index> _index;
+
+ std::shared_ptr<lucene::store::Directory> _dir;
+};
diff --git a/be/src/vector/vector_index.h b/be/src/vector/vector_index.h
index 717c1a79a3a..66442dcecf4 100644
--- a/be/src/vector/vector_index.h
+++ b/be/src/vector/vector_index.h
@@ -54,14 +54,12 @@ struct SearchParameters {
virtual ~SearchParameters() {}
};
-struct BuilderParameter {};
-
class VectorIndex {
public:
enum class Metric { L2, COSINE, INNER_PRODUCT, UNKNOWN };
virtual doris::Status add(int n, const float* vec) = 0;
- virtual void set_build_params(std::shared_ptr<BuilderParameter> params) =
0;
+
virtual doris::Status search(const float* query_vec, int k, SearchResult*
result,
const SearchParameters* params = nullptr) = 0;
//virtual Status save(FileWriter* writer);
diff --git a/build.sh b/build.sh
index 8f057b5b5c8..8f87c08ea10 100755
--- a/build.sh
+++ b/build.sh
@@ -646,7 +646,6 @@ if [[ "${BUILD_BE}" -eq 1 ]]; then
-DENABLE_CLANG_COVERAGE="${DENABLE_CLANG_COVERAGE}" \
-DDORIS_JAVA_HOME="${JAVA_HOME}" \
-DBUILD_AZURE="${BUILD_AZURE}" \
- -DBUILD_FAISS="${BUILD_FAISS}" \
"${DORIS_HOME}/be"
if [[ "${OUTPUT_BE_BINARY}" -eq 1 ]]; then
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org