This is an automated email from the ASF dual-hosted git repository.
leaves12138 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/paimon-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 6e93a04 feat: add global index infrastructure (#65)
6e93a04 is described below
commit 6e93a046fbec054e0f33b597aaded81b671fc5ab
Author: lszskye <[email protected]>
AuthorDate: Tue Jun 9 01:45:38 2026 -0700
feat: add global index infrastructure (#65)
---
.../global_index/bitmap_global_index_result.h | 95 ++++++++
.../bitmap_scored_global_index_result.h | 99 ++++++++
include/paimon/global_index/global_index_io_meta.h | 43 ++++
include/paimon/global_index/global_index_reader.h | 57 +++++
include/paimon/global_index/global_index_result.h | 141 +++++++++++
include/paimon/global_index/global_index_scan.h | 117 +++++++++
.../paimon/global_index/global_index_write_task.h | 62 +++++
include/paimon/global_index/global_index_writer.h | 49 ++++
include/paimon/global_index/global_indexer.h | 71 ++++++
.../paimon/global_index/global_indexer_factory.h | 60 +++++
include/paimon/global_index/indexed_split.h | 48 ++++
.../global_index/io/global_index_file_reader.h | 39 +++
.../global_index/io/global_index_file_writer.h | 47 ++++
.../global_index/bitmap_global_index_result.cpp | 104 ++++++++
.../bitmap_global_index_result_test.cpp | 208 ++++++++++++++++
.../bitmap_scored_global_index_result.cpp | 160 +++++++++++++
.../bitmap_scored_global_index_result_test.cpp | 263 +++++++++++++++++++++
.../complete_index_score_batch_reader.cpp | 107 +++++++++
.../complete_index_score_batch_reader.h | 70 ++++++
.../complete_index_score_batch_reader_test.cpp | 165 +++++++++++++
.../common/global_index/global_index_result.cpp | 164 +++++++++++++
.../global_index/global_index_result_test.cpp | 154 ++++++++++++
.../common/global_index/global_index_utils.h | 58 +++++
.../global_index/global_index_utils_test.cpp | 111 +++++++++
.../common/global_index/global_indexer_factory.cpp | 50 ++++
.../global_index/global_indexer_factory_test.cpp | 57 +++++
26 files changed, 2599 insertions(+)
diff --git a/include/paimon/global_index/bitmap_global_index_result.h
b/include/paimon/global_index/bitmap_global_index_result.h
new file mode 100644
index 0000000..4c0c8e9
--- /dev/null
+++ b/include/paimon/global_index/bitmap_global_index_result.h
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paimon/global_index/global_index_result.h"
+#include "paimon/utils/range.h"
+#include "paimon/utils/roaring_bitmap64.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+/// Represents a global index query result that **lazily materializes** its
matching row ids as a
+/// Roaring bitmap. The underlying 64-bit Roaring bitmap is **not constructed
during object
+/// creation**; instead, it is built on-demand the first time GetBitmap() is
called. This design
+/// avoids unnecessary computation and memory allocation when the bitmap is
not needed (e.g., during
+/// early stopping).
+class PAIMON_EXPORT BitmapGlobalIndexResult : public GlobalIndexResult {
+ public:
+    /// Callable that produces the bitmap of matching row ids, or an error.
+    using BitmapSupplier = std::function<Result<RoaringBitmap64>()>;
+
+    /// @param bitmap_supplier Callable stored for building the bitmap later. It is taken by
+    ///                        value and moved into the member, so its captured state is not
+    ///                        copied a second time.
+    explicit BitmapGlobalIndexResult(BitmapSupplier bitmap_supplier)
+        : bitmap_supplier_(std::move(bitmap_supplier)) {}
+
+    /// Row id iterator over a bitmap. `bitmap` is borrowed (raw pointer, not owned) and is used
+    /// to obtain the end position for HasNext().
+    class Iterator : public GlobalIndexResult::Iterator {
+     public:
+        Iterator(const RoaringBitmap64* bitmap, RoaringBitmap64::Iterator&& iter)
+            : bitmap_(bitmap), iter_(std::move(iter)) {}
+
+        bool HasNext() const override {
+            return iter_ != bitmap_->End();
+        }
+
+        /// @return The row id at the current position; the iterator is then advanced.
+        /// @note No exhaustion check is performed here; callers are expected to test
+        ///       HasNext() first.
+        int64_t Next() override {
+            const uint64_t value = *iter_;
+            ++iter_;
+            // The bitmap yields unsigned 64-bit values while the interface exposes int64_t
+            // row ids; spell the conversion out instead of relying on an implicit one.
+            return static_cast<int64_t>(value);
+        }
+
+     private:
+        const RoaringBitmap64* bitmap_;
+        RoaringBitmap64::Iterator iter_;
+    };
+
+    Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator() const override;
+
+    Result<std::shared_ptr<GlobalIndexResult>> And(
+        const std::shared_ptr<GlobalIndexResult>& other) override;
+
+    Result<std::shared_ptr<GlobalIndexResult>> Or(
+        const std::shared_ptr<GlobalIndexResult>& other) override;
+
+    Result<bool> IsEmpty() const override;
+
+    Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override;
+
+    std::string ToString() const override;
+
+    /// @return A non-owning, const pointer to the bitmap. The returned pointer is valid as long as
+    ///         this BitmapGlobalIndexResult object is alive. The caller must not modify the bitmap.
+    /// @note **Lazy initialization**: The bitmap is constructed only on the first call to this
+    ///       method. Subsequent calls return the cached instance. Construction may involve
+    ///       non-trivial CPU/IO cost (e.g., reading indexes or merging bitmaps), so avoid calling
+    ///       this if the bitmap is not actually required. **Not thread-safe**.
+    Result<const RoaringBitmap64*> GetBitmap() const;
+
+    /// Creates `BitmapGlobalIndexResult` for all row ids in the given ranges.
+    /// @note Overlapping or unsorted ranges are accepted.
+    static std::shared_ptr<BitmapGlobalIndexResult> FromRanges(const std::vector<Range>& ranges);
+
+ private:
+    // `initialized_` and `bitmap_` are mutable so that the const GetBitmap() can cache the
+    // bitmap it builds (see the @note on GetBitmap()).
+    mutable bool initialized_ = false;
+    BitmapSupplier bitmap_supplier_;
+    mutable RoaringBitmap64 bitmap_;
+};
+} // namespace paimon
diff --git a/include/paimon/global_index/bitmap_scored_global_index_result.h
b/include/paimon/global_index/bitmap_scored_global_index_result.h
new file mode 100644
index 0000000..a04f26d
--- /dev/null
+++ b/include/paimon/global_index/bitmap_scored_global_index_result.h
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paimon/global_index/global_index_result.h"
+#include "paimon/utils/roaring_bitmap64.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+/// Represents a scored global index result that combines a Roaring bitmap of
candidate row
+/// ids with an array of associated relevance scores.
+///
+/// **Important Ordering Note**: Inheriting from ScoredGlobalIndexResult, the
results are
+/// **NOT sorted by score**. Instead, both the bitmap and the score vector are
ordered by
+/// **ascending row id**. This design enables efficient merging and set
operations while preserving
+/// row id-to-score mapping.
+class PAIMON_EXPORT BitmapScoredGlobalIndexResult : public
ScoredGlobalIndexResult {
+ public:
+ BitmapScoredGlobalIndexResult(RoaringBitmap64&& bitmap,
std::vector<float>&& scores)
+ : bitmap_(std::move(bitmap)), scores_(std::move(scores)) {
+ assert(static_cast<size_t>(bitmap_.Cardinality()) == scores_.size());
+ }
+
+ class ScoredIterator : public ScoredGlobalIndexResult::ScoredIterator {
+ public:
+ ScoredIterator(const RoaringBitmap64* bitmap,
RoaringBitmap64::Iterator&& iter,
+ const float* scores)
+ : bitmap_(bitmap), iter_(std::move(iter)), scores_(scores) {}
+
+ bool HasNext() const override {
+ return iter_ != bitmap_->End();
+ }
+
+ std::pair<int64_t, float> NextWithScore() override {
+ uint64_t value = *iter_;
+ ++iter_;
+ return {value, scores_[cursor_++]};
+ }
+
+ private:
+ size_t cursor_ = 0;
+ const RoaringBitmap64* bitmap_;
+ RoaringBitmap64::Iterator iter_;
+ const float* scores_;
+ };
+
+ Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator()
const override;
+
+ Result<std::unique_ptr<ScoredGlobalIndexResult::ScoredIterator>>
CreateScoredIterator()
+ const override;
+
+ Result<std::shared_ptr<GlobalIndexResult>> And(
+ const std::shared_ptr<GlobalIndexResult>& other) override;
+
+ Result<std::shared_ptr<GlobalIndexResult>> Or(
+ const std::shared_ptr<GlobalIndexResult>& other) override;
+
+ Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset)
override;
+
+ Result<bool> IsEmpty() const override;
+
+ std::string ToString() const override;
+
+ /// @return A non-owning, const pointer to the bitmap. The row ids in the
bitmap are stored in
+ /// ascending order (as guaranteed by Roaring64 iteration).
+ Result<const RoaringBitmap64*> GetBitmap() const;
+
+ /// @return A const reference to a vector of float scores, where the i-th
element corresponds to
+ /// the i-th row id when iterating the bitmap in **ascending row
id order**.
+ const std::vector<float>& GetScores() const;
+
+ private:
+ RoaringBitmap64 bitmap_;
+ // ordered by row id
+ std::vector<float> scores_;
+};
+} // namespace paimon
diff --git a/include/paimon/global_index/global_index_io_meta.h
b/include/paimon/global_index/global_index_io_meta.h
new file mode 100644
index 0000000..f4497b7
--- /dev/null
+++ b/include/paimon/global_index/global_index_io_meta.h
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "paimon/memory/bytes.h"
+#include "paimon/utils/range.h"
+
+namespace paimon {
+/// Metadata describing a single file entry in a global index.
+struct PAIMON_EXPORT GlobalIndexIOMeta {
+ GlobalIndexIOMeta(const std::string& _file_path, int64_t _file_size,
+ const std::shared_ptr<Bytes>& _metadata)
+ : file_path(_file_path), file_size(_file_size), metadata(_metadata) {}
+
+ std::string file_path;
+ int64_t file_size;
+ /// Optional binary metadata associated with the file, such as serialized
+ /// secondary index structures or inline index bytes.
+ /// May be null if no additional metadata is available.
+ std::shared_ptr<Bytes> metadata;
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/global_index_reader.h
b/include/paimon/global_index/global_index_reader.h
new file mode 100644
index 0000000..36bb201
--- /dev/null
+++ b/include/paimon/global_index/global_index_reader.h
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "paimon/global_index/global_index_result.h"
+#include "paimon/predicate/full_text_search.h"
+#include "paimon/predicate/function_visitor.h"
+#include "paimon/predicate/vector_search.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+/// Reads and evaluates filter predicates against a global file index.
+///
+/// Derived classes are expected to implement the visitor methods (e.g.,
`VisitEqual`,
+/// `VisitIsNull`, etc.) to return index-based results that indicate which
+/// rows satisfy the given predicate.
+class PAIMON_EXPORT GlobalIndexReader : public
FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
+ public:
+ /// VisitVectorSearch performs approximate vector similarity search.
+ /// @warning `VisitVectorSearch` may return an error status when it is incorrectly invoked
+ /// (e.g., when it is called on a BitmapGlobalIndexReader).
+ virtual Result<std::shared_ptr<ScoredGlobalIndexResult>> VisitVectorSearch(
+ const std::shared_ptr<VectorSearch>& vector_search) = 0;
+
+ /// VisitFullTextSearch performs full text search.
+ virtual Result<std::shared_ptr<GlobalIndexResult>> VisitFullTextSearch(
+ const std::shared_ptr<FullTextSearch>& full_text_search) = 0;
+
+ /// @return true if the reader is thread-safe; false otherwise.
+ virtual bool IsThreadSafe() const = 0;
+
+ /// @return An identifier representing the index type. (e.g., "bitmap",
"lumina").
+ virtual std::string GetIndexType() const = 0;
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/global_index_result.h
b/include/paimon/global_index/global_index_result.h
new file mode 100644
index 0000000..bb16d6e
--- /dev/null
+++ b/include/paimon/global_index/global_index_result.h
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "paimon/memory/bytes.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/result.h"
+#include "paimon/utils/range.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+/// Global index result that holds the row ids.
+class PAIMON_EXPORT GlobalIndexResult : public
std::enable_shared_from_this<GlobalIndexResult> {
+ public:
+ virtual ~GlobalIndexResult() = default;
+
+ /// Iterator interface for traversing selected row ids.
+ class Iterator {
+ public:
+ virtual ~Iterator() = default;
+
+ /// Checks whether more row ids are available.
+ virtual bool HasNext() const = 0;
+
+ /// @return The next row id and advances the iterator.
+ virtual int64_t Next() = 0;
+ };
+
+ /// Checks whether the global index result contains no matching row ids.
+ ///
+ /// @return A `Result<bool>` where:
+ /// - `true` indicates the result is empty (no matching rows),
+ /// - `false` indicates at least one matching row exists,
+ /// - An error is returned only if internal state is corrupted or
I/O fails
+ /// (e.g., during lazy loading of index data).
+ virtual Result<bool> IsEmpty() const = 0;
+
+ /// Creates a new iterator over the selected row ids.
+ virtual Result<std::unique_ptr<Iterator>> CreateIterator() const = 0;
+
+ /// Returns non-overlapping, sorted ranges covering all row ids in
`GlobalIndexResult`.
+ Result<std::vector<Range>> ToRanges() const;
+
+ /// Computes the logical AND (intersection) between current result and
another.
+ virtual Result<std::shared_ptr<GlobalIndexResult>> And(
+ const std::shared_ptr<GlobalIndexResult>& other);
+
+ /// Computes the logical OR (union) between this result and another.
+ virtual Result<std::shared_ptr<GlobalIndexResult>> Or(
+ const std::shared_ptr<GlobalIndexResult>& other);
+
+ /// Adds the given offset to each row id in current result and returns the
new global index
+ /// result.
+ virtual Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t
offset) = 0;
+
+ virtual std::string ToString() const = 0;
+
+ /// Serializes a GlobalIndexResult object into a byte array.
+ ///
+ /// @note This method only supports the following concrete implementations:
+ /// - BitmapScoredGlobalIndexResult
+ /// - BitmapGlobalIndexResult
+ ///
+ /// @param global_index_result The GlobalIndexResult instance to serialize
(must not be null).
+ /// @param pool Memory pool used to allocate the output byte buffer.
+ /// @return A Result containing a unique pointer to the serialized Bytes
on success,
+ /// or an error status on failure.
+ static Result<PAIMON_UNIQUE_PTR<Bytes>> Serialize(
+ const std::shared_ptr<GlobalIndexResult>& global_index_result,
+ const std::shared_ptr<MemoryPool>& pool);
+
+ /// Deserializes a GlobalIndexResult object from a raw byte buffer.
+ ///
+ /// @note The concrete type of the deserialized object is determined by
metadata
+ /// embedded in the buffer. Currently, only the following types are
supported:
+ /// - BitmapScoredGlobalIndexResult
+ /// - BitmapGlobalIndexResult
+ ///
+ /// @param buffer Pointer to the serialized byte data (must not be null).
+ /// @param length Size of the buffer in bytes.
+ /// @param pool Memory pool used to allocate internal objects during
deserialization.
+ /// @return A Result containing a shared pointer to the reconstructed
GlobalIndexResult
+ /// on success, or an error status on failure.
+ static Result<std::shared_ptr<GlobalIndexResult>> Deserialize(
+ const char* buffer, size_t length, const std::shared_ptr<MemoryPool>&
pool);
+
+ private:
+ static constexpr int32_t VERSION = 1;
+};
+
+/// Represents the result with score of a query against a global index.
+/// This class encapsulates a set of search candidates (row id + score pairs)
and provides
+/// an iterator interface to traverse them.
+class PAIMON_EXPORT ScoredGlobalIndexResult : public GlobalIndexResult {
+ public:
+ /// An iterator over the scored results, returning (row_id, score) pairs.
+ ///
+ /// @note The results are **NOT sorted by score**. Instead, they are
returned in **ascending
+ /// order of row_id**.
+ class ScoredIterator {
+ public:
+ virtual ~ScoredIterator() = default;
+
+ /// Checks whether more row ids are available.
+ virtual bool HasNext() const = 0;
+
+ /// Retrieves the next (row_id, score) pair and advances the iterator.
+ ///
+ /// @return A pair where:
+ /// - first: the row id (returned in ascending order).
+ /// - second: the associated score computed by the index.
+ ///
+ /// @note The sequence is ordered by **row_id**, not by score.
+ virtual std::pair<int64_t, float> NextWithScore() = 0;
+ };
+
+ /// Creates a new iterator for traversing the scored results.
+ virtual Result<std::unique_ptr<ScoredIterator>> CreateScoredIterator()
const = 0;
+};
+} // namespace paimon
diff --git a/include/paimon/global_index/global_index_scan.h
b/include/paimon/global_index/global_index_scan.h
new file mode 100644
index 0000000..d925e2f
--- /dev/null
+++ b/include/paimon/global_index/global_index_scan.h
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "paimon/global_index/global_index_reader.h"
+#include "paimon/global_index/global_index_result.h"
+#include "paimon/predicate/predicate.h"
+#include "paimon/utils/range.h"
+#include "paimon/utils/row_range_index.h"
+#include "paimon/visibility.h"
+namespace paimon {
+class MemoryPool;
+class FileSystem;
+
+/// Represents a logical scan over a global index for a table.
+class PAIMON_EXPORT GlobalIndexScan {
+ public:
+ /// Creates a `GlobalIndexScan` instance for the specified table and
context.
+ /// @param table_path Root directory of the table.
+ /// @param snapshot_id Optional snapshot id to read from; if not
provided, uses the latest.
+ /// @param partitions Optional list of specific partitions to restrict
the scan scope.
+ /// Each map represents one partition (e.g., {"dt":
"2024-06-01"}).
+ /// If omitted (`std::nullopt`), scans all
partitions of the table.
+ /// @param options User defined configuration.
+ /// @param file_system File system for accessing index files.
+ /// If not provided (nullptr), it is inferred from
the `FILE_SYSTEM`
+ /// key in the `options` parameter.
+ /// @param executor The executor to be used for asynchronous
operations during global
+ /// index scan.
+ /// @param pool Memory pool for temporary allocations; if
nullptr, uses default.
+ /// @return A `Result` containing a unique pointer to the created scanner,
+ /// or an error if initialization fails (e.g., I/O error, invalid
snapshot id,
+ /// unknown partition).
+ static Result<std::unique_ptr<GlobalIndexScan>> Create(
+ const std::string& table_path, const std::optional<int64_t>&
snapshot_id,
+ const std::optional<std::vector<std::map<std::string, std::string>>>&
partitions,
+ const std::map<std::string, std::string>& options,
+ const std::shared_ptr<FileSystem>& file_system, const
std::shared_ptr<Executor>& executor,
+ const std::shared_ptr<MemoryPool>& pool);
+
+ /// Creates a `GlobalIndexScan` instance for the specified table and
context, with a
+ /// predicate-based partition filter.
+ /// @param root_path Root directory of the table.
+ /// @param snapshot_id Optional snapshot id to read from; if not
provided, uses the
+ /// latest snapshot.
+ /// @param partition_filters Optional partition-level predicate used for
partition pruning.
+ /// If nullptr, all partitions are scanned.
+ /// @param options User defined configuration.
+ /// @param file_system File system for accessing index files. If
nullptr, it is
+ /// inferred from the `FILE_SYSTEM` key in
`options`.
+ /// @param executor The executor to be used for asynchronous
operations during global
+ /// index scan.
+ /// @param pool Memory pool for temporary allocations; if
nullptr, uses default.
+ /// @return A `Result` containing a unique pointer to the created scanner,
+ /// or an error if initialization fails.
+ static Result<std::unique_ptr<GlobalIndexScan>> Create(
+ const std::string& root_path, const std::optional<int64_t>&
snapshot_id,
+ const std::shared_ptr<Predicate>& partition_filters,
+ const std::map<std::string, std::string>& options,
+ const std::shared_ptr<FileSystem>& file_system, const
std::shared_ptr<Executor>& executor,
+ const std::shared_ptr<MemoryPool>& pool);
+
+ virtual ~GlobalIndexScan() = default;
+
+ /// Creates several `GlobalIndexReader`s for a specific field.
+ /// @param field_name Name of the indexed column.
+ /// @param row_range_index Optional row range that limits the scan to a
sub-range of row ids.
+ /// If not provided, the entire row range is
considered.
+ /// @return A `Result` that is:
+ /// - Successful with several readers (with global row id) if the indexes exist and load
+ /// correctly;
+ /// - Successful with an empty vector if no index was built for the given field;
+ /// - An error if loading fails (e.g., file corruption, I/O error,
+ /// unsupported format).
+ virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>>
CreateReaders(
+ const std::string& field_name,
+ const std::optional<RowRangeIndex>& row_range_index) const = 0;
+
+ /// Creates several `GlobalIndexReader`s for a specific field (looked up by id).
+ /// @param field_id Field id of the indexed column.
+ /// @param row_range_index Optional row range that limits the scan to a
sub-range of row ids.
+ /// If not provided, the entire row range is
considered.
+ /// @return A `Result` that is:
+ /// - Successful with several readers (with global row id) if the indexes exist and load
+ /// correctly;
+ /// - Successful with an empty vector if no index was built for the given field;
+ /// - An error if loading fails (e.g., file corruption, I/O error,
+ /// unsupported format).
+ virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>>
CreateReaders(
+ int32_t field_id, const std::optional<RowRangeIndex>& row_range_index)
const = 0;
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/global_index_write_task.h
b/include/paimon/global_index/global_index_write_task.h
new file mode 100644
index 0000000..6151382
--- /dev/null
+++ b/include/paimon/global_index/global_index_write_task.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "paimon/global_index/indexed_split.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/result.h"
+#include "paimon/utils/range.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+/// Writes a range-level global index for a specific data split and field.
+class PAIMON_EXPORT GlobalIndexWriteTask {
+ public:
+ GlobalIndexWriteTask() = delete;
+ ~GlobalIndexWriteTask() = delete;
+ /// Builds and writes a global index for the specified data range.
+ ///
+ /// @param table_path Path to the table root directory where index files
are stored.
+ /// @param field_name Name of the indexed column (must be present in the
table schema).
+ /// @param index_type Type of global index to build (e.g., "bitmap",
"lumina").
+ /// @param indexed_split The indexed split containing the actual data (e.g., Parquet file) and
+ /// the row id range [from, to] of the data to build the index for.
+ /// The range must be fully contained within the data covered
+ /// by the given `indexed_split`.
+ /// @param options Index-specific configuration (e.g., false positive
rate for bloom
+ /// filters).
+ /// @param pool Memory pool for temporary allocations during index
construction.
+ /// If `nullptr`, the system's default memory pool
will be used.
+ /// @param file_system Specifies the file system for file operations.
+ /// If `nullptr`, use default file system.
+ /// @return A `Result` containing a shared pointer to the `CommitMessage`
with index metadata,
+ /// or an error if indexing fails (e.g., unsupported type, I/O
error).
+ static Result<std::shared_ptr<CommitMessage>> WriteIndex(
+ const std::string& table_path, const std::string& field_name, const
std::string& index_type,
+ const std::shared_ptr<IndexedSplit>& indexed_split,
+ const std::map<std::string, std::string>& options, const
std::shared_ptr<MemoryPool>& pool,
+ const std::shared_ptr<FileSystem>& file_system = nullptr);
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/global_index_writer.h
b/include/paimon/global_index/global_index_writer.h
new file mode 100644
index 0000000..9901ae1
--- /dev/null
+++ b/include/paimon/global_index/global_index_writer.h
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "paimon/global_index/global_index_io_meta.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+#include "paimon/visibility.h"
+struct ArrowArray;
+
+namespace paimon {
+/// Abstract interface for building a global index from Arrow data batches.
+class PAIMON_EXPORT GlobalIndexWriter {
+ public:
+ virtual ~GlobalIndexWriter() = default;
+
+ /// Builds index structures from a batch of columnar data.
+ ///
+ /// @param arrow_array A valid C ArrowArray pointer representing a struct
array.
+ /// Must not be nullptr, and must conform to the
expected schema.
+ /// @param relative_row_ids local row id calculated by `row_id -
range.from`.
+ /// @return `Status::OK()` on success; otherwise, an error indicating
malformed
+ /// input, I/O failure, or unsupported type, etc.
+ virtual Status AddBatch(::ArrowArray* arrow_array, std::vector<int64_t>&&
relative_row_ids) = 0;
+
+ /// Finalizes the index build process and returns metadata for persisted
index.
+ virtual Result<std::vector<GlobalIndexIOMeta>> Finish() = 0;
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/global_indexer.h
b/include/paimon/global_index/global_indexer.h
new file mode 100644
index 0000000..3cf5f2c
--- /dev/null
+++ b/include/paimon/global_index/global_indexer.h
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paimon/global_index/global_index_io_meta.h"
+#include "paimon/global_index/global_index_reader.h"
+#include "paimon/global_index/global_index_writer.h"
+#include "paimon/global_index/io/global_index_file_reader.h"
+#include "paimon/global_index/io/global_index_file_writer.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/visibility.h"
+
+struct ArrowSchema;
+
+namespace paimon {
+/// Interface for creating global index readers and writers.
+///
+/// Both factory methods are `const`: creating a reader or writer does not modify the indexer
+/// itself. The returned objects are handed out as shared pointers.
+class PAIMON_EXPORT GlobalIndexer {
+ public:
+    virtual ~GlobalIndexer() = default;
+
+    /// Creates a writer for building a global index on a specific field.
+    ///
+    /// @param field_name   Name of the field to be indexed.
+    /// @param arrow_schema Schema of the input Arrow struct array.
+    ///                     It must contain the field specified by field_name and may
+    ///                     include additional associated fields used during index construction.
+    /// @param file_writer  I/O handler for persisting index data to storage.
+    /// @param pool         Memory pool for temporary allocations; if nullptr, uses default.
+    /// @return A `Result` containing a shared pointer to the created `GlobalIndexWriter`,
+    ///         or an error if the field is not found, unsupported, or initialization fails, etc.
+    virtual Result<std::shared_ptr<GlobalIndexWriter>> CreateWriter(
+        const std::string& field_name, ::ArrowSchema* arrow_schema,
+        const std::shared_ptr<GlobalIndexFileWriter>& file_writer,
+        const std::shared_ptr<MemoryPool>& pool) const = 0;
+
+    /// Creates a reader for querying a pre-built global index.
+    ///
+    /// @param arrow_schema Schema of the indexed data; used to interpret predicate literals.
+    /// @param file_reader  I/O handler for reading index artifacts from storage.
+    /// @param files        List of index file metadata entries produced during writing
+    ///                     (the type returned by `GlobalIndexWriter::Finish()`).
+    /// @param pool         Memory pool for temporary allocations; if nullptr, uses default.
+    /// @return A `Result` containing a shared pointer to the created `GlobalIndexReader`,
+    ///         or an error if the index cannot be loaded or is incompatible, etc.
+    virtual Result<std::shared_ptr<GlobalIndexReader>> CreateReader(
+        ::ArrowSchema* arrow_schema, const std::shared_ptr<GlobalIndexFileReader>& file_reader,
+        const std::vector<GlobalIndexIOMeta>& files,
+        const std::shared_ptr<MemoryPool>& pool) const = 0;
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/global_indexer_factory.h
b/include/paimon/global_index/global_indexer_factory.h
new file mode 100644
index 0000000..6b0783a
--- /dev/null
+++ b/include/paimon/global_index/global_indexer_factory.h
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "paimon/factories/factory.h"
+#include "paimon/result.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+class GlobalIndexer;
+
+/// Factory for creating `GlobalIndexer` instances based on index type identifiers.
+///
+/// `Get()` is the static entry point that locates a registered factory by identifier;
+/// `Create()` is the per-factory hook that concrete factories implement.
+class PAIMON_EXPORT GlobalIndexerFactory : public Factory {
+ public:
+    ~GlobalIndexerFactory() override = default;
+
+    /// Suffix appended to a base index identifier to form the global index identifier
+    /// (e.g., "bitmap" becomes "bitmap-global").
+    static const char GLOBAL_INDEX_IDENTIFIER_SUFFIX[];
+
+    /// Creates a `GlobalIndexer` instance by looking up a registered factory using an identifier.
+    ///
+    /// The provided `identifier` is automatically appended with `GLOBAL_INDEX_IDENTIFIER_SUFFIX`
+    /// (e.g., "-global") to form the full key used for factory lookup. This ensures namespace
+    /// separation between file and global index types.
+    ///
+    /// @param identifier The base name of the index type (e.g., "bitmap").
+    /// @param options Configuration parameters for the indexer.
+    /// @return A `Result` containing a unique pointer to the created `GlobalIndexer`; the
+    ///         contained pointer is nullptr if no matching factory is registered. An error is
+    ///         returned if creation fails.
+    static Result<std::unique_ptr<GlobalIndexer>> Get(
+        const std::string& identifier, const std::map<std::string, std::string>& options);
+
+    /// Creates a `GlobalIndexer` using the current factory's implementation and the given options.
+    ///
+    /// @param options Configuration parameters for the indexer.
+    /// @return A `Result` containing the created `GlobalIndexer`, or an error.
+    virtual Result<std::unique_ptr<GlobalIndexer>> Create(
+        const std::map<std::string, std::string>& options) const = 0;
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/indexed_split.h
b/include/paimon/global_index/indexed_split.h
new file mode 100644
index 0000000..9f1b0d9
--- /dev/null
+++ b/include/paimon/global_index/indexed_split.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "paimon/table/source/data_split.h"
+#include "paimon/utils/range.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+/// Indexed split for global index reading operation.
+///
+/// Pairs a physical `DataSplit` with the row ranges selected by the index and a per-row score.
+/// NOTE(review): no destructor is declared here; destruction through a base pointer relies on
+/// `Split` declaring a virtual destructor - confirm.
+class PAIMON_EXPORT IndexedSplit : public Split {
+ public:
+    /// @returns The underlying physical data split containing actual data file details.
+    virtual std::shared_ptr<DataSplit> GetDataSplit() const = 0;
+
+    /// @returns A list of row intervals [start, end] indicating which rows
+    ///          are relevant (e.g., passed predicate pushdown).
+    virtual const std::vector<Range>& RowRanges() const = 0;
+
+    /// @returns A score for **each individual row** included in `RowRanges()`,
+    ///          in the order they appear when traversing the ranges.
+    ///          NOTE(review): what is returned for a split without scores (presumably an
+    ///          empty vector) is not specified here - confirm against the implementations.
+    virtual const std::vector<float>& Scores() const = 0;
+};
+} // namespace paimon
diff --git a/include/paimon/global_index/io/global_index_file_reader.h
b/include/paimon/global_index/io/global_index_file_reader.h
new file mode 100644
index 0000000..ec553d3
--- /dev/null
+++ b/include/paimon/global_index/io/global_index_file_reader.h
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "paimon/result.h"
+#include "paimon/visibility.h"
+namespace paimon {
+class InputStream;
+/// Abstract interface for reading global index files from storage.
+class PAIMON_EXPORT GlobalIndexFileReader {
+ public:
+    virtual ~GlobalIndexFileReader() = default;
+
+    /// Opens an input stream for reading the specified global index file.
+    ///
+    /// @param file_path Path of the index file to open.
+    ///                  NOTE(review): presumably the path produced by
+    ///                  `GlobalIndexFileWriter::ToPath()` - confirm against callers.
+    /// @return A `Result` owning the opened `InputStream`, or an error.
+    virtual Result<std::unique_ptr<InputStream>> GetInputStream(
+        const std::string& file_path) const = 0;
+};
+
+} // namespace paimon
diff --git a/include/paimon/global_index/io/global_index_file_writer.h
b/include/paimon/global_index/io/global_index_file_writer.h
new file mode 100644
index 0000000..639169c
--- /dev/null
+++ b/include/paimon/global_index/io/global_index_file_writer.h
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "paimon/fs/file_system.h"
+namespace paimon {
+/// Abstract interface for writing global index files to storage.
+///
+/// All methods work on file names; `ToPath()` maps a file name to its index file path.
+/// NOTE(review): `Result`, `OutputStream`, `int64_t` and `PAIMON_EXPORT` reach this header only
+/// through "paimon/fs/file_system.h" - consider including their headers directly.
+class PAIMON_EXPORT GlobalIndexFileWriter {
+ public:
+    virtual ~GlobalIndexFileWriter() = default;
+
+    /// Generates a unique file name for a new index file using the given prefix.
+    /// @note This function may be called multiple times if the index consists of multiple files.
+    virtual Result<std::string> NewFileName(const std::string& prefix) const = 0;
+
+    /// Opens a new output stream for writing index data to the specified file.
+    ///
+    /// @param file_name Name of the file to write.
+    /// @return A `Result` owning the opened `OutputStream`, or an error.
+    virtual Result<std::unique_ptr<OutputStream>> NewOutputStream(
+        const std::string& file_name) const = 0;
+
+    /// Gets the size of the file with the given file name.
+    /// NOTE(review): presumably in bytes - confirm against the implementations.
+    virtual Result<int64_t> GetFileSize(const std::string& file_name) const = 0;
+
+    /// Gets the index file path for the given file name.
+    virtual std::string ToPath(const std::string& file_name) const = 0;
+};
+
+} // namespace paimon
diff --git a/src/paimon/common/global_index/bitmap_global_index_result.cpp
b/src/paimon/common/global_index/bitmap_global_index_result.cpp
new file mode 100644
index 0000000..2533cac
--- /dev/null
+++ b/src/paimon/common/global_index/bitmap_global_index_result.cpp
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "paimon/global_index/bitmap_global_index_result.h"
+
+namespace paimon {
+/// Creates an iterator over the row ids held by this result's bitmap.
+///
+/// The bitmap is resolved through `GetBitmap()` first, so a failing supplier surfaces here.
+///
+/// @return The iterator, or the error produced while resolving the bitmap.
+Result<std::unique_ptr<GlobalIndexResult::Iterator>> BitmapGlobalIndexResult::CreateIterator()
+    const {
+    PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* resolved, GetBitmap());
+    return std::make_unique<BitmapGlobalIndexResult::Iterator>(resolved, resolved->Begin());
+}
+
+/// Intersects this result with `other`.
+///
+/// When `other` is also a `BitmapGlobalIndexResult`, the returned result only stores a
+/// supplier; both bitmaps are resolved and combined with `RoaringBitmap64::And` when that
+/// supplier is invoked. Any other result type is delegated to `GlobalIndexResult::And`.
+///
+/// @param other The right-hand operand.
+/// @return The intersection, or the error reported by the base-class fallback.
+Result<std::shared_ptr<GlobalIndexResult>> BitmapGlobalIndexResult::And(
+    const std::shared_ptr<GlobalIndexResult>& other) {
+    auto rhs = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(other);
+    if (!rhs) {
+        return GlobalIndexResult::And(other);
+    }
+    // Both operands are captured as shared pointers so they stay alive for the supplier.
+    auto lhs = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(shared_from_this());
+    auto intersect = [rhs, lhs]() -> Result<RoaringBitmap64> {
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* lhs_bitmap, lhs->GetBitmap());
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* rhs_bitmap, rhs->GetBitmap());
+        return RoaringBitmap64::And(*lhs_bitmap, *rhs_bitmap);
+    };
+    return std::make_shared<BitmapGlobalIndexResult>(intersect);
+}
+
+/// Unions this result with `other`.
+///
+/// When `other` is also a `BitmapGlobalIndexResult`, the returned result only stores a
+/// supplier; both bitmaps are resolved and combined with `RoaringBitmap64::Or` when that
+/// supplier is invoked. Any other result type is delegated to `GlobalIndexResult::Or`.
+///
+/// @param other The right-hand operand.
+/// @return The union, or the error reported by the base-class fallback.
+Result<std::shared_ptr<GlobalIndexResult>> BitmapGlobalIndexResult::Or(
+    const std::shared_ptr<GlobalIndexResult>& other) {
+    auto rhs = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(other);
+    if (!rhs) {
+        return GlobalIndexResult::Or(other);
+    }
+    // Both operands are captured as shared pointers so they stay alive for the supplier.
+    auto lhs = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(shared_from_this());
+    auto unite = [rhs, lhs]() -> Result<RoaringBitmap64> {
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* lhs_bitmap, lhs->GetBitmap());
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* rhs_bitmap, rhs->GetBitmap());
+        return RoaringBitmap64::Or(*lhs_bitmap, *rhs_bitmap);
+    };
+    return std::make_shared<BitmapGlobalIndexResult>(unite);
+}
+
+/// Returns a new result whose row ids are this result's row ids shifted by `offset`.
+///
+/// The shift is deferred: the returned result stores a supplier that resolves this result's
+/// bitmap and rebuilds it id by id when invoked.
+///
+/// @param offset Value added to every row id; may be negative.
+/// @return The shifted result. Resolving its bitmap fails with `Status::Invalid` if adding
+///         `offset` to some row id would overflow `int64_t` (previously undefined behavior),
+///         or with the error of this result's own supplier.
+Result<std::shared_ptr<GlobalIndexResult>> BitmapGlobalIndexResult::AddOffset(int64_t offset) {
+    auto supplier = [offset, result = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(
+                                 shared_from_this())]() -> Result<RoaringBitmap64> {
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* bitmap, result->GetBitmap());
+        RoaringBitmap64 bitmap64;
+        for (auto iter = bitmap->Begin(); iter != bitmap->End(); ++iter) {
+            const int64_t row_id = *iter;
+            // Signed overflow is undefined behavior, so test the bounds before adding.
+            const bool overflows = (offset > 0 && row_id > INT64_MAX - offset) ||
+                                   (offset < 0 && row_id < INT64_MIN - offset);
+            if (overflows) {
+                return Status::Invalid("row id " + std::to_string(row_id) + " plus offset " +
+                                       std::to_string(offset) + " overflows int64");
+            }
+            bitmap64.Add(row_id + offset);
+        }
+        return bitmap64;
+    };
+    return std::make_shared<BitmapGlobalIndexResult>(supplier);
+}
+
+/// Returns the bitmap, invoking the supplier the first time it is needed.
+///
+/// A successful supplier call is cached (`initialized_` is set), so later calls return the
+/// stored bitmap. A failed call leaves `initialized_` unset, so the supplier runs again on the
+/// next request.
+///
+/// @return Pointer to the bitmap stored in this object, or the supplier's error.
+Result<const RoaringBitmap64*> BitmapGlobalIndexResult::GetBitmap() const {
+    if (initialized_) {
+        return &bitmap_;
+    }
+    PAIMON_ASSIGN_OR_RAISE(bitmap_, bitmap_supplier_());
+    initialized_ = true;
+    return &bitmap_;
+}
+
+/// Reports whether the bitmap contains no row ids.
+///
+/// @return The emptiness flag, or the error produced while resolving the bitmap.
+Result<bool> BitmapGlobalIndexResult::IsEmpty() const {
+    auto resolved = GetBitmap();
+    if (!resolved.ok()) {
+        return resolved.status();
+    }
+    return resolved.value()->IsEmpty();
+}
+
+std::string BitmapGlobalIndexResult::ToString() const {
+ auto bitmap = GetBitmap();
+ if (!bitmap.ok()) {
+ return bitmap.status().ToString();
+ }
+ return bitmap.value()->ToString();
+}
+
+/// Builds a result containing every row id covered by `ranges`.
+///
+/// Each `Range` is inclusive on both ends, while `RoaringBitmap64::AddRange` takes an exclusive
+/// upper bound, hence the `to + 1` below. The ranges are copied into the supplier, so the
+/// argument does not need to outlive the returned result.
+///
+/// @param ranges Inclusive row id ranges.
+/// @return A result whose bitmap is built when the supplier is invoked.
+std::shared_ptr<BitmapGlobalIndexResult> BitmapGlobalIndexResult::FromRanges(
+    const std::vector<Range>& ranges) {
+    BitmapGlobalIndexResult::BitmapSupplier supplier = [ranges]() -> Result<RoaringBitmap64> {
+        RoaringBitmap64 bitmap;
+        for (const auto& range : ranges) {
+            // NOTE(review): assumes `Range::to` is int64_t - confirm.
+            if (range.to == INT64_MAX) {
+                // `to + 1` would overflow (undefined behavior), so add [from, to) as a range
+                // and the last id on its own.
+                if (range.from < range.to) {
+                    bitmap.AddRange(range.from, range.to);
+                }
+                bitmap.Add(range.to);
+            } else {
+                bitmap.AddRange(range.from, range.to + 1);
+            }
+        }
+        return bitmap;
+    };
+    return std::make_shared<BitmapGlobalIndexResult>(supplier);
+}
+} // namespace paimon
diff --git a/src/paimon/common/global_index/bitmap_global_index_result_test.cpp
b/src/paimon/common/global_index/bitmap_global_index_result_test.cpp
new file mode 100644
index 0000000..fedceaf
--- /dev/null
+++ b/src/paimon/common/global_index/bitmap_global_index_result_test.cpp
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "paimon/global_index/bitmap_global_index_result.h"
+
+#include "gtest/gtest.h"
+#include "paimon/testing/utils/testharness.h"
+#include "paimon/utils/roaring_bitmap32.h"
+namespace paimon::test {
+/// Fixture for the `BitmapGlobalIndexResult` tests.
+class BitmapGlobalIndexResultTest : public ::testing::Test {
+ public:
+    void SetUp() override {}
+    void TearDown() override {}
+
+    /// Vector-backed `GlobalIndexResult` used as a non-bitmap operand, so that `And`/`Or` on a
+    /// bitmap result go through the generic `GlobalIndexResult` path.
+    class FakeGlobalIndexResult : public GlobalIndexResult {
+     public:
+        explicit FakeGlobalIndexResult(const std::vector<int64_t>& values) : values_(values) {}
+
+        /// Walks the backing vector from front to back.
+        class Iterator : public GlobalIndexResult::Iterator {
+         public:
+            Iterator(const std::vector<int64_t>* values,
+                     const std::vector<int64_t>::const_iterator& iter)
+                : values_(values), iter_(iter) {}
+            bool HasNext() const override {
+                return !(iter_ == values_->end());
+            }
+            int64_t Next() override {
+                // Read the current element, then step past it.
+                return *iter_++;
+            }
+            const std::vector<int64_t>* values_;
+            std::vector<int64_t>::const_iterator iter_;
+        };
+
+        Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator() const override {
+            return std::make_unique<Iterator>(&values_, values_.begin());
+        }
+
+        std::string ToString() const override {
+            return "fake";
+        }
+
+        Result<bool> IsEmpty() const override {
+            return values_.empty();
+        }
+
+        /// Returns a new fake result holding every value shifted by `offset`.
+        Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override {
+            std::vector<int64_t> shifted;
+            shifted.reserve(values_.size());
+            for (const int64_t value : values_) {
+                shifted.push_back(value + offset);
+            }
+            return std::make_shared<FakeGlobalIndexResult>(shifted);
+        }
+
+     private:
+        std::vector<int64_t> values_;
+    };
+};
+/// Checks that iterating a bitmap result yields exactly the ids it was built from, and that
+/// `IsEmpty()` agrees with the input.
+TEST_F(BitmapGlobalIndexResultTest, TestIterator) {
+    auto check_iterator = [](const std::vector<int64_t>& expected_ids) {
+        auto bitmap_supplier = [&]() -> Result<RoaringBitmap64> {
+            return RoaringBitmap64::From(expected_ids);
+        };
+        auto index_result = std::make_shared<BitmapGlobalIndexResult>(bitmap_supplier);
+        // Assert emptiness for every input, not only the empty one, so that a result which
+        // wrongly reports "empty" for non-empty ids is caught as well.
+        ASSERT_OK_AND_ASSIGN(bool is_empty, index_result->IsEmpty());
+        ASSERT_EQ(is_empty, expected_ids.empty());
+
+        ASSERT_OK_AND_ASSIGN(auto iter, index_result->CreateIterator());
+        std::vector<int64_t> result_ids;
+        while (iter->HasNext()) {
+            result_ids.push_back(iter->Next());
+        }
+        ASSERT_EQ(result_ids, expected_ids);
+    };
+
+    check_iterator({});
+    // Includes the largest 32-bit and 64-bit ids to cover both boundaries.
+    check_iterator({1, 4, 7, RoaringBitmap32::MAX_VALUE, RoaringBitmap64::MAX_VALUE});
+    check_iterator({100, 101, 102, 103});
+}
+
+/// Intersecting two bitmap results yields the ids common to both.
+TEST_F(BitmapGlobalIndexResultTest, TestAnd) {
+    // Builds one bitmap result per id list, intersects them and compares the string form.
+    auto expect_and = [](const std::vector<int64_t>& lhs_ids, const std::vector<int64_t>& rhs_ids,
+                         const std::string& expected) {
+        auto lhs = std::make_shared<BitmapGlobalIndexResult>(
+            [&lhs_ids]() -> Result<RoaringBitmap64> { return RoaringBitmap64::From(lhs_ids); });
+        auto rhs = std::make_shared<BitmapGlobalIndexResult>(
+            [&rhs_ids]() -> Result<RoaringBitmap64> { return RoaringBitmap64::From(rhs_ids); });
+
+        ASSERT_OK_AND_ASSIGN(auto intersection, lhs->And(rhs));
+        ASSERT_EQ(intersection->ToString(), expected);
+    };
+    expect_and({1, 2, 3}, {1, 2, 7}, "{1,2}");
+    expect_and({1, 2, 3}, {1, 2, 3}, "{1,2,3}");
+    expect_and({1, 2, 3}, {100, 200, 300}, "{}");
+    expect_and({1, 2, 3}, {}, "{}");
+    expect_and({}, {}, "{}");
+    expect_and({1, 2, 3, RoaringBitmap64::MAX_VALUE}, {1, RoaringBitmap64::MAX_VALUE},
+               "{1,9223372036854775807}");
+}
+
+/// Intersecting a bitmap result with a non-bitmap result still yields the common ids.
+TEST_F(BitmapGlobalIndexResultTest, TestBitmapResultAndOtherResult) {
+    auto bitmap_result = std::make_shared<BitmapGlobalIndexResult>(
+        []() -> Result<RoaringBitmap64> { return RoaringBitmap64::From({1, 2, 3}); });
+    auto vector_result = std::make_shared<FakeGlobalIndexResult>(std::vector<int64_t>{1, 2, 7});
+
+    ASSERT_OK_AND_ASSIGN(auto intersection, bitmap_result->And(vector_result));
+    ASSERT_EQ(intersection->ToString(), "{1,2}");
+}
+
+/// Uniting two bitmap results yields every id present in either.
+TEST_F(BitmapGlobalIndexResultTest, TestOr) {
+    // Builds one bitmap result per id list, unites them and compares the string form.
+    auto expect_or = [](const std::vector<int64_t>& lhs_ids, const std::vector<int64_t>& rhs_ids,
+                        const std::string& expected) {
+        auto lhs = std::make_shared<BitmapGlobalIndexResult>(
+            [&lhs_ids]() -> Result<RoaringBitmap64> { return RoaringBitmap64::From(lhs_ids); });
+        auto rhs = std::make_shared<BitmapGlobalIndexResult>(
+            [&rhs_ids]() -> Result<RoaringBitmap64> { return RoaringBitmap64::From(rhs_ids); });
+
+        ASSERT_OK_AND_ASSIGN(auto united, lhs->Or(rhs));
+        ASSERT_EQ(united->ToString(), expected);
+    };
+    expect_or({1, 2, 3}, {1, 2, 7}, "{1,2,3,7}");
+    expect_or({1, 2, 3}, {1, 2, 3}, "{1,2,3}");
+    expect_or({1, 2, 3}, {100, 200, 300}, "{1,2,3,100,200,300}");
+    expect_or({1, 2, 3}, {}, "{1,2,3}");
+    expect_or({}, {}, "{}");
+    expect_or({1, 2, 3, RoaringBitmap64::MAX_VALUE}, {1, RoaringBitmap64::MAX_VALUE},
+              "{1,2,3,9223372036854775807}");
+}
+
+/// Uniting a bitmap result with a non-bitmap result still yields every id of both.
+TEST_F(BitmapGlobalIndexResultTest, TestBitmapResultOrOtherResult) {
+    auto bitmap_result = std::make_shared<BitmapGlobalIndexResult>(
+        []() -> Result<RoaringBitmap64> { return RoaringBitmap64::From({1, 2, 3}); });
+    auto vector_result = std::make_shared<FakeGlobalIndexResult>(std::vector<int64_t>{1, 2, 7});
+
+    ASSERT_OK_AND_ASSIGN(auto united, bitmap_result->Or(vector_result));
+    ASSERT_EQ(united->ToString(), "{1,2,3,7}");
+}
+
+TEST_F(BitmapGlobalIndexResultTest, TestInvalidBitmapResult) {
+ auto bitmap_supplier = [&]() -> Result<RoaringBitmap64> {
+ return Status::Invalid("invalid supplier");
+ };
+ auto result = std::make_shared<BitmapGlobalIndexResult>(bitmap_supplier);
+ ASSERT_TRUE(result->ToString().find("Invalid: invalid supplier") !=
std::string::npos);
+}
+
+/// `FromRanges` treats both ends of every range as inclusive.
+TEST_F(BitmapGlobalIndexResultTest, TestFromRanges) {
+    // One multi-row range.
+    auto multi_row = BitmapGlobalIndexResult::FromRanges({Range(0, 5)});
+    ASSERT_EQ(multi_row->ToString(), "{0,1,2,3,4,5}");
+
+    // A range covering exactly one row.
+    auto single_row = BitmapGlobalIndexResult::FromRanges({Range(10, 10)});
+    ASSERT_EQ(single_row->ToString(), "{10}");
+
+    // Several disjoint ranges.
+    auto disjoint = BitmapGlobalIndexResult::FromRanges({Range(0, 5), Range(10, 10)});
+    ASSERT_EQ(disjoint->ToString(), "{0,1,2,3,4,5,10}");
+}
+
+/// `AddOffset` shifts every row id and leaves an empty result empty.
+TEST_F(BitmapGlobalIndexResultTest, TestAddOffset) {
+    auto six_rows = BitmapGlobalIndexResult::FromRanges({Range(0, 5)});
+
+    // A zero offset keeps the ids unchanged.
+    ASSERT_OK_AND_ASSIGN(auto shifted_by_zero, six_rows->AddOffset(0));
+    ASSERT_EQ(shifted_by_zero->ToString(), "{0,1,2,3,4,5}");
+
+    ASSERT_OK_AND_ASSIGN(auto shifted_by_ten, six_rows->AddOffset(10));
+    ASSERT_EQ(shifted_by_ten->ToString(), "{10,11,12,13,14,15}");
+
+    // Shifting an empty result has nothing to move.
+    auto no_rows = BitmapGlobalIndexResult::FromRanges({});
+    ASSERT_OK_AND_ASSIGN(auto shifted_empty, no_rows->AddOffset(10));
+    ASSERT_EQ(shifted_empty->ToString(), "{}");
+}
+} // namespace paimon::test
diff --git
a/src/paimon/common/global_index/bitmap_scored_global_index_result.cpp
b/src/paimon/common/global_index/bitmap_scored_global_index_result.cpp
new file mode 100644
index 0000000..4beb10e
--- /dev/null
+++ b/src/paimon/common/global_index/bitmap_scored_global_index_result.cpp
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+
+#include "fmt/format.h"
+#include "fmt/ranges.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+
+namespace paimon {
+namespace {
+/// Pairs the i-th row id produced by iterating `bitmap` with `scores[i]`.
+///
+/// `scores` is indexed without a bounds check, so it must hold at least as many entries as
+/// `bitmap` has row ids.
+std::map<int64_t, float> CreateIdToScoreMap(const RoaringBitmap64& bitmap,
+                                            const std::vector<float>& scores) {
+    std::map<int64_t, float> score_by_id;
+    size_t position = 0;
+    auto iter = bitmap.Begin();
+    while (iter != bitmap.End()) {
+        score_by_id[*iter] = scores[position];
+        ++iter;
+        ++position;
+    }
+    return score_by_id;
+}
+/// Collects the score of every row id in `bitmap`, in bitmap iteration order.
+///
+/// The map is taken by const reference and read with `find`, so a lookup can no longer insert
+/// entries into the caller's map. A row id missing from the map yields 0.0f, the value the
+/// previous `operator[]` lookup produced.
+///
+/// @param bitmap Row ids to look up.
+/// @param id_to_score Score of each known row id.
+/// @return One score per row id of `bitmap`.
+std::vector<float> GetScoresFromMap(const RoaringBitmap64& bitmap,
+                                    const std::map<int64_t, float>& id_to_score) {
+    std::vector<float> scores;
+    scores.reserve(bitmap.Cardinality());
+    for (auto iter = bitmap.Begin(); iter != bitmap.End(); ++iter) {
+        const auto entry = id_to_score.find(*iter);
+        scores.push_back(entry != id_to_score.end() ? entry->second : 0.0f);
+    }
+    return scores;
+}
+} // namespace
+/// Creates an iterator over the row ids only; scores are not exposed through it.
+Result<std::unique_ptr<GlobalIndexResult::Iterator>> BitmapScoredGlobalIndexResult::CreateIterator()
+    const {
+    const RoaringBitmap64* row_ids = &bitmap_;
+    return std::make_unique<BitmapGlobalIndexResult::Iterator>(row_ids, row_ids->Begin());
+}
+
+/// Creates an iterator that is given the bitmap, its begin position and the score array.
+Result<std::unique_ptr<ScoredGlobalIndexResult::ScoredIterator>>
+BitmapScoredGlobalIndexResult::CreateScoredIterator() const {
+    const RoaringBitmap64* row_ids = &bitmap_;
+    return std::make_unique<BitmapScoredGlobalIndexResult::ScoredIterator>(
+        row_ids, row_ids->Begin(), scores_.data());
+}
+
+/// Intersects this scored result with `other`.
+///
+/// The outcome depends on the dynamic type of `other`, checked in this order:
+/// - `BitmapScoredGlobalIndexResult`: returns an unscored `BitmapGlobalIndexResult`, because
+///   the same row id may carry different scores on the two sides.
+/// - `BitmapGlobalIndexResult`: returns a `BitmapScoredGlobalIndexResult` that keeps this
+///   result's scores for the surviving row ids.
+/// - anything else: delegated to `GlobalIndexResult::And`.
+Result<std::shared_ptr<GlobalIndexResult>> BitmapScoredGlobalIndexResult::And(
+    const std::shared_ptr<GlobalIndexResult>& other) {
+    if (auto scored_rhs = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(other)) {
+        // Scores are dropped on purpose; only the row ids are intersected, and only when the
+        // supplier is invoked.
+        auto self = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(shared_from_this());
+        auto intersect = [scored_rhs, self]() -> Result<RoaringBitmap64> {
+            PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* rhs_ids, scored_rhs->GetBitmap());
+            PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* self_ids, self->GetBitmap());
+            return RoaringBitmap64::And(*rhs_ids, *self_ids);
+        };
+        return std::make_shared<BitmapGlobalIndexResult>(intersect);
+    }
+    if (auto plain_rhs = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(other)) {
+        // Every surviving row id comes from this result, so its score is always known.
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* rhs_ids, plain_rhs->GetBitmap());
+        std::map<int64_t, float> score_by_id = CreateIdToScoreMap(bitmap_, scores_);
+        auto common_ids = RoaringBitmap64::And(bitmap_, *rhs_ids);
+        std::vector<float> common_scores = GetScoresFromMap(common_ids, score_by_id);
+        return std::make_shared<BitmapScoredGlobalIndexResult>(std::move(common_ids),
+                                                               std::move(common_scores));
+    }
+    return GlobalIndexResult::And(other);
+}
+
+/// Unites this scored result with `other`.
+///
+/// The outcome depends on the dynamic type of `other`, checked in this order:
+/// - `BitmapScoredGlobalIndexResult`: returns a `BitmapScoredGlobalIndexResult`, provided the
+///   two sides share no row id; a shared row id is rejected with `Status::Invalid`.
+/// - `BitmapGlobalIndexResult`: returns an unscored `BitmapGlobalIndexResult`, because the
+///   score of the row ids contributed by `other` is unknown.
+/// - anything else: delegated to `GlobalIndexResult::Or`.
+Result<std::shared_ptr<GlobalIndexResult>> BitmapScoredGlobalIndexResult::Or(
+    const std::shared_ptr<GlobalIndexResult>& other) {
+    if (auto scored_rhs = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(other)) {
+        // Start from this side's scores, then merge the other side's, refusing duplicates.
+        std::map<int64_t, float> score_by_id = CreateIdToScoreMap(bitmap_, scores_);
+        const auto& rhs_ids = scored_rhs->bitmap_;
+        const auto& rhs_scores = scored_rhs->scores_;
+        size_t position = 0;
+        for (auto iter = rhs_ids.Begin(); iter != rhs_ids.End(); ++iter, ++position) {
+            const auto row_id = *iter;
+            if (score_by_id.count(row_id) != 0) {
+                return Status::Invalid(
+                    "not support two BitmapScoredGlobalIndexResult or with same row id");
+            }
+            score_by_id[row_id] = rhs_scores[position];
+        }
+        auto united_ids = RoaringBitmap64::Or(bitmap_, rhs_ids);
+        std::vector<float> united_scores = GetScoresFromMap(united_ids, score_by_id);
+        return std::make_shared<BitmapScoredGlobalIndexResult>(std::move(united_ids),
+                                                               std::move(united_scores));
+    }
+
+    if (auto plain_rhs = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(other)) {
+        // Only the row ids are united, and only when the supplier is invoked.
+        auto self = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(shared_from_this());
+        auto unite = [plain_rhs, self]() -> Result<RoaringBitmap64> {
+            PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* rhs_bitmap, plain_rhs->GetBitmap());
+            PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* self_bitmap, self->GetBitmap());
+            return RoaringBitmap64::Or(*rhs_bitmap, *self_bitmap);
+        };
+        return std::make_shared<BitmapGlobalIndexResult>(unite);
+    }
+    return GlobalIndexResult::Or(other);
+}
+
+/// Returns a new scored result whose row ids are this result's row ids shifted by `offset`.
+///
+/// Unlike `BitmapGlobalIndexResult::AddOffset`, the shift is performed immediately. The scores
+/// are copied unchanged and stay aligned with the ids in iteration order.
+///
+/// @param offset Value added to every row id; may be negative.
+/// @return The shifted result, or `Status::Invalid` if adding `offset` to some row id would
+///         overflow `int64_t` (previously undefined behavior).
+Result<std::shared_ptr<GlobalIndexResult>> BitmapScoredGlobalIndexResult::AddOffset(
+    int64_t offset) {
+    PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* bitmap, GetBitmap());
+    RoaringBitmap64 bitmap64;
+    for (auto iter = bitmap->Begin(); iter != bitmap->End(); ++iter) {
+        const int64_t row_id = *iter;
+        // Signed overflow is undefined behavior, so test the bounds before adding.
+        const bool overflows = (offset > 0 && row_id > INT64_MAX - offset) ||
+                               (offset < 0 && row_id < INT64_MIN - offset);
+        if (overflows) {
+            return Status::Invalid(
+                fmt::format("row id {} plus offset {} overflows int64", row_id, offset));
+        }
+        bitmap64.Add(row_id + offset);
+    }
+    auto scores = GetScores();
+    return std::make_shared<BitmapScoredGlobalIndexResult>(std::move(bitmap64), std::move(scores));
+}
+
+/// Reports whether this result holds no row ids. The bitmap is stored directly in this
+/// object, so no error can occur.
+Result<bool> BitmapScoredGlobalIndexResult::IsEmpty() const {
+    const bool has_no_rows = bitmap_.IsEmpty();
+    return has_no_rows;
+}
+
+/// Returns a pointer to the row id bitmap stored in this object; the pointer is only valid
+/// while this object is alive.
+Result<const RoaringBitmap64*> BitmapScoredGlobalIndexResult::GetBitmap() const {
+    const RoaringBitmap64* row_ids = &bitmap_;
+    return row_ids;
+}
+
+/// Returns a reference to the stored scores; the reference is only valid while this object
+/// is alive.
+const std::vector<float>& BitmapScoredGlobalIndexResult::GetScores() const {
+    const std::vector<float>& stored_scores = scores_;
+    return stored_scores;
+}
+
+/// Renders the result as "row ids: <bitmap>, scores: {s1,s2,...}", with every score printed
+/// to two decimal places.
+std::string BitmapScoredGlobalIndexResult::ToString() const {
+    std::vector<std::string> score_texts;
+    score_texts.reserve(scores_.size());
+    for (const float score : scores_) {
+        score_texts.emplace_back(fmt::format("{:.2f}", score));
+    }
+    const std::string row_ids_text = bitmap_.ToString();
+    return fmt::format("row ids: {}, scores: {{{}}}", row_ids_text, fmt::join(score_texts, ","));
+}
+
+} // namespace paimon
diff --git
a/src/paimon/common/global_index/bitmap_scored_global_index_result_test.cpp
b/src/paimon/common/global_index/bitmap_scored_global_index_result_test.cpp
new file mode 100644
index 0000000..25540da
--- /dev/null
+++ b/src/paimon/common/global_index/bitmap_scored_global_index_result_test.cpp
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+
+#include "gtest/gtest.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/testing/utils/testharness.h"
+#include "paimon/utils/roaring_bitmap32.h"
+
+namespace paimon::test {
+/// Fixture for the BitmapScoredGlobalIndexResult tests. It also provides
+/// FakeGlobalIndexResult, a vector-backed GlobalIndexResult used by the tests as an operand
+/// that is not bitmap based.
+class BitmapScoredGlobalIndexResultTest : public ::testing::Test {
+ public:
+    void SetUp() override {}
+    void TearDown() override {}
+
+    /// Minimal GlobalIndexResult that yields the values of a vector in stored order.
+    class FakeGlobalIndexResult : public GlobalIndexResult {
+     public:
+        explicit FakeGlobalIndexResult(const std::vector<int64_t>& values) : values_(values) {}
+
+        /// Iterator over a borrowed vector; it does not own the vector it walks.
+        class Iterator : public GlobalIndexResult::Iterator {
+         public:
+            Iterator(const std::vector<int64_t>* values,
+                     const std::vector<int64_t>::const_iterator& iter)
+                : values_(values), iter_(iter) {}
+            bool HasNext() const override {
+                return !(iter_ == values_->end());
+            }
+            int64_t Next() override {
+                // Read the current element, then step past it.
+                return *iter_++;
+            }
+            const std::vector<int64_t>* values_;
+            std::vector<int64_t>::const_iterator iter_;
+        };
+
+        Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator() const override {
+            return std::make_unique<Iterator>(&values_, values_.begin());
+        }
+
+        std::string ToString() const override {
+            return "fake";
+        }
+
+        Result<bool> IsEmpty() const override {
+            return values_.empty();
+        }
+
+        Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override {
+            std::vector<int64_t> shifted;
+            shifted.reserve(values_.size());
+            for (int64_t value : values_) {
+                shifted.push_back(value + offset);
+            }
+            return std::make_shared<FakeGlobalIndexResult>(shifted);
+        }
+
+     private:
+        std::vector<int64_t> values_;
+    };
+};
+
+/// Checks that both the plain iterator and the scored iterator walk the expected row ids in
+/// order, and that the scored iterator pairs each id with its score.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestIterator) {
+    auto check_iterator = [](const std::vector<int64_t>& expected_ids,
+                             const std::vector<float>& expected_scores) {
+        ASSERT_EQ(expected_ids.size(), expected_scores.size());
+        // The constructor consumes its score vector, so hand it a copy.
+        std::vector<float> scores_copy = expected_scores;
+
+        auto index_result = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(expected_ids), std::move(scores_copy));
+        // Emptiness must agree with the input in both directions, not only for empty input.
+        ASSERT_EQ(index_result->IsEmpty().value(), expected_ids.empty());
+
+        // check iterator
+        ASSERT_OK_AND_ASSIGN(auto iter, index_result->CreateIterator());
+        for (auto expected_id : expected_ids) {
+            ASSERT_TRUE(iter->HasNext());
+            ASSERT_EQ(iter->Next(), expected_id);
+        }
+        ASSERT_FALSE(iter->HasNext());
+
+        // check scored iterator
+        ASSERT_OK_AND_ASSIGN(auto scored_iter, index_result->CreateScoredIterator());
+        for (size_t i = 0; i < expected_ids.size(); i++) {
+            ASSERT_TRUE(scored_iter->HasNext());
+            auto [id, score] = scored_iter->NextWithScore();
+            ASSERT_EQ(id, expected_ids[i]);
+            ASSERT_NEAR(score, expected_scores[i], 0.01);
+        }
+        ASSERT_FALSE(scored_iter->HasNext());
+    };
+
+    check_iterator({}, {});
+    check_iterator({1, 4, 7, RoaringBitmap32::MAX_VALUE, RoaringBitmap64::MAX_VALUE},
+                   {1.0f, 2.1f, 3.2f, 4.5f, 6.7f});
+    check_iterator({100, 101, 102, 103}, {100.1f, 200.2f, 0.12f, 0.34f});
+}
+
+/// Intersects two scored results (constant scores 1.1 and 1.2) and compares the textual form
+/// of the outcome; the expected strings list row ids only.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestAnd) {
+    auto expect_and = [](const std::vector<int64_t>& left_ids,
+                         const std::vector<int64_t>& right_ids,
+                         const std::string& expected_str) {
+        auto lhs = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(left_ids), std::vector<float>(left_ids.size(), 1.1f));
+        auto rhs = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(right_ids), std::vector<float>(right_ids.size(), 1.2f));
+        ASSERT_OK_AND_ASSIGN(auto intersection, lhs->And(rhs));
+        ASSERT_EQ(intersection->ToString(), expected_str);
+    };
+
+    expect_and({1, 2, 3}, {1, 2, 7}, "{1,2}");
+    expect_and({1, 2, 3}, {1, 2, 3}, "{1,2,3}");
+    expect_and({1, 2, 3}, {100, 200, 300}, "{}");
+    expect_and({1, 2, 3}, {}, "{}");
+    expect_and({}, {}, "{}");
+    expect_and({1, 2, 3, RoaringBitmap64::MAX_VALUE}, {1, RoaringBitmap64::MAX_VALUE},
+               "{1,9223372036854775807}");
+}
+
+/// Intersects a scored result with a BitmapGlobalIndexResult; the expected strings show that
+/// the surviving row ids keep their scores.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestAndBitmapResult) {
+    auto expect_and = [](const std::vector<int64_t>& left_ids, std::vector<float>&& left_scores,
+                         const std::vector<int64_t>& right_ids,
+                         const std::string& expected_str) {
+        // The supplier only borrows right_ids; it is evaluated before this lambda returns.
+        auto right_supplier = [&right_ids]() -> Result<RoaringBitmap64> {
+            return RoaringBitmap64::From(right_ids);
+        };
+        auto bitmap_operand = std::make_shared<BitmapGlobalIndexResult>(right_supplier);
+        auto scored_operand = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(left_ids), std::move(left_scores));
+
+        ASSERT_OK_AND_ASSIGN(auto intersection, scored_operand->And(bitmap_operand));
+        ASSERT_EQ(intersection->ToString(), expected_str);
+    };
+
+    expect_and({1, 2, 3}, {1.1f, 1.2f, 1.3f}, {1, 2, 7}, "row ids: {1,2}, scores: {1.10,1.20}");
+    expect_and({1, 2, 3}, {100.1f, 100.2f, 100.3f}, {1, 2, 3},
+               "row ids: {1,2,3}, scores: {100.10,100.20,100.30}");
+    expect_and({1, 2, 3}, {1.1f, 1.2f, 1.3f}, {100, 200, 300}, "row ids: {}, scores: {}");
+    expect_and({1, 2, 3}, {1.1f, 1.2f, 1.3f}, {}, "row ids: {}, scores: {}");
+    expect_and({}, {}, {}, "row ids: {}, scores: {}");
+    expect_and({1, 2, 3, RoaringBitmap64::MAX_VALUE}, {0.12f, 0.13f, 0.14f, 0.15f},
+               {1, RoaringBitmap64::MAX_VALUE},
+               "row ids: {1,9223372036854775807}, scores: {0.12,0.15}");
+}
+
+/// Intersects a scored result with the vector-backed fake result instead of a bitmap-based
+/// one; the expected string lists row ids only.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestAndOtherResult) {
+    auto fake_operand =
+        std::make_shared<FakeGlobalIndexResult>(std::vector<int64_t>({1l, 2l, 7l}));
+    auto scored_operand = std::make_shared<BitmapScoredGlobalIndexResult>(
+        RoaringBitmap64::From({1, 2, 3}), std::vector<float>({1.1f, 1.2f, 1.3f}));
+
+    ASSERT_OK_AND_ASSIGN(auto intersection, scored_operand->And(fake_operand));
+    ASSERT_EQ(intersection->ToString(), "{1,2}");
+}
+
+/// Unions two scored results whose row ids do not overlap; the expected strings list every
+/// row id in ascending order together with its score.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestOr) {
+    auto expect_or = [](const std::vector<int64_t>& left_ids, std::vector<float>&& left_scores,
+                        const std::vector<int64_t>& right_ids,
+                        std::vector<float>&& right_scores, const std::string& expected_str) {
+        auto lhs = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(left_ids), std::move(left_scores));
+        auto rhs = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(right_ids), std::move(right_scores));
+        ASSERT_OK_AND_ASSIGN(auto merged, lhs->Or(rhs));
+        ASSERT_EQ(merged->ToString(), expected_str);
+    };
+
+    expect_or({1, 2, 3}, {1.1f, 1.2f, 1.3f}, {100, 200, 300}, {100.1f, 200.1f, 300.1f},
+              "row ids: {1,2,3,100,200,300}, scores: {1.10,1.20,1.30,100.10,200.10,300.10}");
+    expect_or({1, 2, 3}, {1.1f, 1.2f, 1.3f}, {}, {},
+              "row ids: {1,2,3}, scores: {1.10,1.20,1.30}");
+    expect_or({}, {}, {}, {}, "row ids: {}, scores: {}");
+    expect_or({1, 2, 3, RoaringBitmap64::MAX_VALUE}, {1.1f, 1.2f, 1.3f, 1.4f},
+              {RoaringBitmap32::MAX_VALUE}, {0.12f},
+              "row ids: {1,2,3,2147483647,9223372036854775807}, scores: "
+              "{1.10,1.20,1.30,0.12,1.40}");
+}
+
+/// Unions a scored result (constant score 1.1) with a BitmapGlobalIndexResult; the expected
+/// strings list row ids only.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestOrBitmapResult) {
+    auto expect_or = [](const std::vector<int64_t>& left_ids,
+                        const std::vector<int64_t>& right_ids,
+                        const std::string& expected_str) {
+        // The supplier only borrows right_ids; it is evaluated before this lambda returns.
+        auto right_supplier = [&right_ids]() -> Result<RoaringBitmap64> {
+            return RoaringBitmap64::From(right_ids);
+        };
+        auto bitmap_operand = std::make_shared<BitmapGlobalIndexResult>(right_supplier);
+        auto scored_operand = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(left_ids), std::vector<float>(left_ids.size(), 1.1f));
+
+        ASSERT_OK_AND_ASSIGN(auto merged, scored_operand->Or(bitmap_operand));
+        ASSERT_EQ(merged->ToString(), expected_str);
+    };
+
+    expect_or({1, 2, 3}, {1, 2, 7}, "{1,2,3,7}");
+    expect_or({1, 2, 3}, {1, 2, 3}, "{1,2,3}");
+    expect_or({1, 2, 3}, {100, 200, 300}, "{1,2,3,100,200,300}");
+    expect_or({1, 2, 3}, {}, "{1,2,3}");
+    expect_or({}, {}, "{}");
+    expect_or({1, 2, 3, RoaringBitmap64::MAX_VALUE}, {1, RoaringBitmap64::MAX_VALUE},
+              "{1,2,3,9223372036854775807}");
+}
+
+/// Unions a scored result with the vector-backed fake result; the expected string lists row
+/// ids only.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestOrOtherResult) {
+    auto fake_operand =
+        std::make_shared<FakeGlobalIndexResult>(std::vector<int64_t>({1l, 2l, 7l}));
+    auto scored_operand = std::make_shared<BitmapScoredGlobalIndexResult>(
+        RoaringBitmap64::From({1, 2, 3}), std::vector<float>({1.1f, 1.2f, 1.3f}));
+
+    ASSERT_OK_AND_ASSIGN(auto merged, scored_operand->Or(fake_operand));
+    ASSERT_EQ(merged->ToString(), "{1,2,3,7}");
+}
+
+/// Two scored results that share row ids (1 and 2 here) cannot be unioned; the call is
+/// expected to fail with the message asserted below.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestInvalidOr) {
+    auto lhs = std::make_shared<BitmapScoredGlobalIndexResult>(
+        RoaringBitmap64::From(std::vector<int64_t>{1, 2, 3}),
+        std::vector<float>{1.1f, 1.2f, 1.3f});
+    auto rhs = std::make_shared<BitmapScoredGlobalIndexResult>(
+        RoaringBitmap64::From(std::vector<int64_t>{1, 2, 7}),
+        std::vector<float>{2.1f, 2.2f, 2.3f});
+    ASSERT_NOK_WITH_MSG(lhs->Or(rhs),
+                        "not support two BitmapScoredGlobalIndexResult or with same row id");
+}
+
+/// AddOffset(10) must shift every row id by 10 and leave the scores as they were, for both a
+/// populated and an empty result.
+TEST_F(BitmapScoredGlobalIndexResultTest, TestAddOffset) {
+    {
+        // populated result
+        auto populated = std::make_shared<BitmapScoredGlobalIndexResult>(
+            RoaringBitmap64::From(std::vector<int64_t>{1, 2, 3}),
+            std::vector<float>{1.1f, 1.2f, 1.3f});
+        ASSERT_OK_AND_ASSIGN(auto shifted, populated->AddOffset(10));
+        ASSERT_EQ(shifted->ToString(), "row ids: {11,12,13}, scores: {1.10,1.20,1.30}");
+    }
+    {
+        // empty result
+        std::vector<int64_t> no_ids;
+        std::vector<float> no_scores;
+        auto empty = std::make_shared<BitmapScoredGlobalIndexResult>(RoaringBitmap64::From(no_ids),
+                                                                     std::move(no_scores));
+        ASSERT_OK_AND_ASSIGN(auto shifted, empty->AddOffset(10));
+        ASSERT_EQ(shifted->ToString(), "row ids: {}, scores: {}");
+    }
+}
+} // namespace paimon::test
diff --git
a/src/paimon/common/global_index/complete_index_score_batch_reader.cpp
b/src/paimon/common/global_index/complete_index_score_batch_reader.cpp
new file mode 100644
index 0000000..15cdc8f
--- /dev/null
+++ b/src/paimon/common/global_index/complete_index_score_batch_reader.cpp
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/common/global_index/complete_index_score_batch_reader.h"
+
+#include <cstddef>
+
+#include "arrow/api.h"
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/array/util.h"
+#include "arrow/c/abi.h"
+#include "arrow/c/bridge.h"
+#include "arrow/scalar.h"
+#include "paimon/common/reader/reader_utils.h"
+#include "paimon/common/table/special_fields.h"
+#include "paimon/common/types/row_kind.h"
+#include "paimon/common/utils/arrow/mem_utils.h"
+#include "paimon/common/utils/arrow/status_utils.h"
+#include "paimon/status.h"
+namespace paimon {
+/// Takes ownership of `reader`, stores a copy of `scores`, and derives the Arrow memory pool
+/// from `pool` through GetArrowPool.
+CompleteIndexScoreBatchReader::CompleteIndexScoreBatchReader(
+ std::unique_ptr<BatchReader>&& reader, const std::vector<float>& scores,
+ const std::shared_ptr<MemoryPool>& pool)
+ : arrow_pool_(GetArrowPool(pool)), reader_(std::move(reader)),
scores_(scores) {}
+
+/// Produces the next batch by reading it together with its selection bitmap and then
+/// applying that bitmap through ReaderUtils::ApplyBitmapToReadBatch.
+Result<BatchReader::ReadBatch> CompleteIndexScoreBatchReader::NextBatch() {
+    PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap scored_batch, NextBatchWithBitmap());
+    arrow::MemoryPool* const arrow_pool = arrow_pool_.get();
+    return ReaderUtils::ApplyBitmapToReadBatch(std::move(scored_batch), arrow_pool);
+}
+
+/// Resolves, once, the position of the index score column inside `struct_type` and records
+/// the names of all fields of that struct type.
+///
+/// The lookup is skipped when an index has already been resolved. When the column cannot be
+/// resolved the stored index stays -1 and the lookup is repeated on the next call.
+void CompleteIndexScoreBatchReader::UpdateScoreFieldIndex(const arrow::StructType* struct_type) {
+    if (index_score_field_idx_ != -1) {
+        return;
+    }
+    index_score_field_idx_ = struct_type->GetFieldIndex(SpecialFields::IndexScore().Name());
+    // A failed lookup leaves the index at -1, so this function may run again: start from an
+    // empty list to avoid appending the same field names a second time.
+    field_names_with_score_.clear();
+    field_names_with_score_.reserve(struct_type->num_fields());
+    for (const auto& field : struct_type->fields()) {
+        field_names_with_score_.push_back(field->name());
+    }
+}
+/// Reads the next batch from the wrapped reader and rewrites its index score column.
+///
+/// EOF batches are returned untouched, and so is every batch when no scores were supplied.
+/// Otherwise each row selected by the batch bitmap receives the next unread entry of
+/// `scores_` (in row order) and every unselected row receives null.
+///
+/// @return the batch with the score column replaced, or an Invalid status when the batch is
+///         not a struct array, the score column cannot be resolved, or fewer scores were
+///         supplied than rows are selected.
+Result<BatchReader::ReadBatchWithBitmap> CompleteIndexScoreBatchReader::NextBatchWithBitmap() {
+    PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap,
+                           reader_->NextBatchWithBitmap());
+    if (BatchReader::IsEofBatch(batch_with_bitmap)) {
+        return batch_with_bitmap;
+    }
+    if (scores_.empty()) {
+        // Indicates score field all null.
+        return batch_with_bitmap;
+    }
+
+    auto& [batch, bitmap] = batch_with_bitmap;
+    auto& [c_array, c_schema] = batch;
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Array> arrow_array,
+                                      arrow::ImportArray(c_array.get(), c_schema.get()));
+    auto struct_array = std::dynamic_pointer_cast<arrow::StructArray>(arrow_array);
+    if (!struct_array) {
+        return Status::Invalid("cannot cast array to StructArray in CompleteIndexScoreBatchReader");
+    }
+    auto struct_type = struct_array->struct_type();
+    UpdateScoreFieldIndex(struct_type);
+    if (index_score_field_idx_ < 0) {
+        // The lookup yields -1 when the column cannot be resolved; indexing with it below
+        // would be out of range.
+        return Status::Invalid(
+            "cannot find index score field in batch in CompleteIndexScoreBatchReader");
+    }
+
+    // prepare index score array
+    std::unique_ptr<arrow::ArrayBuilder> index_score_builder;
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::MakeBuilder(
+        arrow_pool_.get(), SpecialFields::IndexScore().Type(), &index_score_builder));
+    auto typed_builder = dynamic_cast<arrow::FloatBuilder*>(index_score_builder.get());
+    if (typed_builder == nullptr) {
+        // Checked at runtime so that release builds do not dereference a null builder.
+        return Status::Invalid(
+            "index score builder is not a FloatBuilder in CompleteIndexScoreBatchReader");
+    }
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->Reserve(struct_array->length()));
+    // When the bitmap covers as many rows as the batch holds, every row is selected and the
+    // per-row bitmap probe can be skipped.
+    bool all_not_null = (struct_array->length() == bitmap.Cardinality());
+    for (int64_t i = 0; i < struct_array->length(); i++) {
+        if (all_not_null || bitmap.Contains(i)) {
+            if (score_cursor_ >= scores_.size()) {
+                return Status::Invalid(
+                    "score count is less than selected row count in "
+                    "CompleteIndexScoreBatchReader");
+            }
+            PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->Append(scores_[score_cursor_++]));
+        } else {
+            PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->AppendNull());
+        }
+    }
+    std::shared_ptr<arrow::Array> index_score_array;
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->Finish(&index_score_array));
+    // update index score array to struct array
+    arrow::ArrayVector array_vec = struct_array->fields();
+    array_vec[index_score_field_idx_] = index_score_array;
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::StructArray> array_with_score,
+                                      arrow::StructArray::Make(array_vec, field_names_with_score_));
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(
+        arrow::ExportArray(*array_with_score, c_array.get(), c_schema.get()));
+    return batch_with_bitmap;
+}
+} // namespace paimon
diff --git a/src/paimon/common/global_index/complete_index_score_batch_reader.h
b/src/paimon/common/global_index/complete_index_score_batch_reader.h
new file mode 100644
index 0000000..ac97b7e
--- /dev/null
+++ b/src/paimon/common/global_index/complete_index_score_batch_reader.h
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/api.h"
+#include "arrow/array/array_base.h"
+#include "paimon/reader/batch_reader.h"
+#include "paimon/result.h"
+
+namespace paimon {
+class MemoryPool;
+class Metrics;
+/// A batch reader that enriches the output Arrow array with index score information.
+/// It assumes the input data already contains the `_INDEX_SCORE` column,
+/// and ensures this score is properly updated in the returned batches.
+///
+/// @pre The read schema must include the `_INDEX_SCORE` field.
+class CompleteIndexScoreBatchReader : public BatchReader {
+ public:
+ /// @param reader wrapped reader that supplies the batches; ownership is taken.
+ /// @param scores scores to write into the score column; a copy is stored. An empty
+ ///               vector makes the reader pass batches through unchanged.
+ /// @param pool   memory pool from which the Arrow pool used by this reader is derived.
+ CompleteIndexScoreBatchReader(std::unique_ptr<BatchReader>&& reader,
+ const std::vector<float>& scores,
+ const std::shared_ptr<MemoryPool>& pool);
+
+ /// Returns the next batch after the selection bitmap has been applied to it.
+ Result<ReadBatch> NextBatch() override;
+
+ /// Returns the next batch with its selection bitmap, with the score column filled in.
+ Result<ReadBatchWithBitmap> NextBatchWithBitmap() override;
+
+ /// Forwards to the wrapped reader.
+ void Close() override {
+ reader_->Close();
+ }
+
+ /// Forwards to the wrapped reader.
+ std::shared_ptr<Metrics> GetReaderMetrics() const override {
+ return reader_->GetReaderMetrics();
+ }
+
+ private:
+ /// Resolves the score column position and the field names from `struct_type`.
+ void UpdateScoreFieldIndex(const arrow::StructType* struct_type);
+
+ private:
+ // Position in scores_ of the next score to hand out; advances across batches.
+ size_t score_cursor_ = 0;
+ // Position of the score column in the batch struct; -1 until it has been resolved.
+ int32_t index_score_field_idx_ = -1;
+ // Field names of the batch struct, used when the struct array is rebuilt.
+ std::vector<std::string> field_names_with_score_;
+ std::unique_ptr<arrow::MemoryPool> arrow_pool_;
+ std::unique_ptr<BatchReader> reader_;
+ std::vector<float> scores_;
+};
+} // namespace paimon
diff --git
a/src/paimon/common/global_index/complete_index_score_batch_reader_test.cpp
b/src/paimon/common/global_index/complete_index_score_batch_reader_test.cpp
new file mode 100644
index 0000000..e14b28f
--- /dev/null
+++ b/src/paimon/common/global_index/complete_index_score_batch_reader_test.cpp
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/common/global_index/complete_index_score_batch_reader.h"
+
+#include "arrow/api.h"
+#include "arrow/array/array_base.h"
+#include "arrow/c/abi.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/json_simple.h"
+#include "gtest/gtest.h"
+#include "paimon/common/table/special_fields.h"
+#include "paimon/common/types/data_field.h"
+#include "paimon/format/file_format.h"
+#include "paimon/format/file_format_factory.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/status.h"
+#include "paimon/testing/mock/mock_file_batch_reader.h"
+#include "paimon/testing/utils/read_result_collector.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace paimon::test {
+/// Fixture that owns the memory pool and builds a CompleteIndexScoreBatchReader on top of a
+/// MockFileBatchReader.
+class CompleteIndexScoreBatchReaderTest : public ::testing::Test {
+ public:
+    void SetUp() override {
+        pool_ = GetDefaultPool();
+    }
+    void TearDown() override {
+        pool_.reset();
+    }
+
+    /// Builds a reader over `src_array` whose mock source is created with `selected_bitmap`.
+    std::unique_ptr<BatchReader> PrepareCompleteIndexScoreBatchReader(
+        const std::shared_ptr<arrow::Array>& src_array, const RoaringBitmap32& selected_bitmap,
+        const std::vector<float>& scores, int32_t batch_size) const {
+        std::unique_ptr<MockFileBatchReader> mock_source = std::make_unique<MockFileBatchReader>(
+            src_array, src_array->type(), selected_bitmap, batch_size);
+        return std::make_unique<CompleteIndexScoreBatchReader>(std::move(mock_source), scores,
+                                                               pool_);
+    }
+
+    /// Builds a reader over `src_array` whose mock source is created without a bitmap.
+    std::unique_ptr<BatchReader> PrepareCompleteIndexScoreBatchReader(
+        const std::shared_ptr<arrow::Array>& src_array, const std::vector<float>& scores,
+        int32_t batch_size) const {
+        std::unique_ptr<MockFileBatchReader> mock_source =
+            std::make_unique<MockFileBatchReader>(src_array, src_array->type(), batch_size);
+        return std::make_unique<CompleteIndexScoreBatchReader>(std::move(mock_source), scores,
+                                                               pool_);
+    }
+
+ private:
+    std::shared_ptr<MemoryPool> pool_;
+};
+
+/// Three rows with a null score column and three scores: every row is expected to come back
+/// with its score filled in, in row order.
+TEST_F(CompleteIndexScoreBatchReaderTest, TestSimple) {
+ arrow::FieldVector fields = {
+ arrow::field("f0", arrow::utf8()),
+ arrow::field("f1", arrow::int32()),
+ arrow::field("_INDEX_SCORE", arrow::float32()),
+ arrow::field("_ROW_ID", arrow::int64()),
+ };
+
+ // Source data: the score column is null in every row.
+ auto src_array =
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
+ ["Alice", 10, null, 0],
+ ["Bob", 11, null, 1],
+ ["Cathy", 12, null, 2]
+ ])")
+ .ValueOrDie();
+
+ std::vector<float> scores = {1.23f, 2.34f, 100.10f};
+ // The mock source is created with batch_size 1 and no selection bitmap.
+ auto reader = PrepareCompleteIndexScoreBatchReader(src_array, scores,
+ /*batch_size=*/1);
+
+ ASSERT_OK_AND_ASSIGN(auto result_array,
ReadResultCollector::CollectResult(reader.get()));
+
+ std::shared_ptr<arrow::ChunkedArray> expected_array;
+ auto array_status =
+
arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow::struct_(fields), {R"([
+ ["Alice", 10, 1.23, 0],
+ ["Bob", 11, 2.34, 1],
+ ["Cathy", 12, 100.10, 2]
+])"},
+ &expected_array);
+ ASSERT_TRUE(array_status.ok());
+ // Approximate comparison because the scores are floating point.
+ ASSERT_TRUE(expected_array->ApproxEquals(*result_array));
+ reader->Close();
+}
+
+/// Four rows, of which positions 0 and 3 are selected by the bitmap, and two scores: only
+/// the two selected rows are expected in the result, each with its score.
+TEST_F(CompleteIndexScoreBatchReaderTest, TestWithBitmap) {
+ arrow::FieldVector fields = {
+ arrow::field("f0", arrow::utf8()),
+ arrow::field("f1", arrow::int32()),
+ arrow::field("_INDEX_SCORE", arrow::float32()),
+ arrow::field("_ROW_ID", arrow::int64()),
+ };
+
+ // Source data: the score column is null in every row.
+ auto src_array =
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
+ ["Alice", 10, null, 0],
+ ["Bob", 11, null, 1],
+ ["Cathy", 12, null, 2],
+ ["David", 13, null, 4]
+ ])")
+ .ValueOrDie();
+
+ // One score per selected position.
+ std::vector<float> scores = {1.23f, -19.12f};
+ auto selected_bitmap = RoaringBitmap32::From({0, 3});
+ // The mock source is created with batch_size 2 and the selection bitmap above.
+ auto reader = PrepareCompleteIndexScoreBatchReader(src_array,
selected_bitmap, scores,
+ /*batch_size=*/2);
+
+ ASSERT_OK_AND_ASSIGN(auto result_array,
ReadResultCollector::CollectResult(reader.get()));
+
+ std::shared_ptr<arrow::ChunkedArray> expected_array;
+ auto array_status =
+
arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow::struct_(fields), {R"([
+ ["Alice", 10, 1.23, 0],
+ ["David", 13, -19.12, 4]
+])"},
+ &expected_array);
+ ASSERT_TRUE(array_status.ok());
+ // Approximate comparison because the scores are floating point.
+ ASSERT_TRUE(expected_array->ApproxEquals(*result_array));
+ reader->Close();
+}
+
+/// With an empty score vector the reader is expected to return the source rows unchanged,
+/// i.e. with the score column still null.
+TEST_F(CompleteIndexScoreBatchReaderTest, TestReadWithNullScores) {
+ arrow::FieldVector fields = {
+ arrow::field("f0", arrow::utf8()),
+ arrow::field("f1", arrow::int32()),
+ arrow::field("_INDEX_SCORE", arrow::float32()),
+ arrow::field("_ROW_ID", arrow::int64()),
+ };
+
+ auto src_array =
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
+ ["Alice", 10, null, 0],
+ ["Bob", 11, null, 1],
+ ["Cathy", 12, null, 2]
+ ])")
+ .ValueOrDie();
+
+ // scores is empty, indicates all null score
+ auto reader = PrepareCompleteIndexScoreBatchReader(src_array,
/*scores=*/{},
+ /*batch_size=*/1);
+
+ ASSERT_OK_AND_ASSIGN(auto result_array,
ReadResultCollector::CollectResult(reader.get()));
+
+ // The expected output is the source array itself, compared exactly.
+ auto expected_array = std::make_shared<arrow::ChunkedArray>(src_array);
+ ASSERT_TRUE(expected_array->Equals(*result_array));
+ reader->Close();
+}
+
+} // namespace paimon::test
diff --git a/src/paimon/common/global_index/global_index_result.cpp
b/src/paimon/common/global_index/global_index_result.cpp
new file mode 100644
index 0000000..f329b03
--- /dev/null
+++ b/src/paimon/common/global_index/global_index_result.cpp
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/global_index/global_index_result.h"
+
+#include "fmt/format.h"
+#include "paimon/common/io/memory_segment_output_stream.h"
+#include "paimon/common/memory/memory_segment_utils.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/io/byte_array_input_stream.h"
+#include "paimon/io/data_input_stream.h"
+#include "paimon/memory/bytes.h"
+#include "paimon/memory/memory_pool.h"
+namespace paimon {
+namespace {
+/// Appends a bitmap section followed by a score section to `out`.
+///
+/// Layout written, in order: int32 byte length of the serialized bitmap, the serialized
+/// bitmap bytes, int32 score count, then one float per score.
+void WriteBitmapAndScores(const RoaringBitmap64* bitmap, const std::vector<float>& scores,
+                          MemorySegmentOutputStream* out, MemoryPool* pool) {
+    // bitmap section
+    std::shared_ptr<Bytes> serialized_bitmap = bitmap->Serialize(pool);
+    const auto serialized_len = static_cast<int32_t>(serialized_bitmap->size());
+    out->WriteValue<int32_t>(serialized_len);
+    out->WriteBytes(serialized_bitmap);
+
+    // score section
+    const auto score_count = static_cast<int32_t>(scores.size());
+    out->WriteValue<int32_t>(score_count);
+    for (size_t idx = 0; idx < scores.size(); ++idx) {
+        out->WriteValue<float>(scores[idx]);
+    }
+}
+
+} // namespace
+/// Builds a lazily evaluated intersection of this result with `other`.
+///
+/// Nothing is iterated here: the returned BitmapGlobalIndexResult holds a supplier that, when
+/// invoked, drains the iterators of both operands into bitmaps and intersects them. Both
+/// operands are kept alive by the supplier.
+///
+/// @param other right operand; must not be null.
+/// @return the lazy result, or an Invalid status when `other` is null.
+Result<std::shared_ptr<GlobalIndexResult>> GlobalIndexResult::And(
+    const std::shared_ptr<GlobalIndexResult>& other) {
+    if (!other) {
+        // Fail at the call site instead of dereferencing null when the supplier runs later.
+        return Status::Invalid("other GlobalIndexResult is null in GlobalIndexResult::And");
+    }
+    auto supplier = [other, result = shared_from_this()]() -> Result<RoaringBitmap64> {
+        PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<GlobalIndexResult::Iterator> iter1,
+                               result->CreateIterator());
+        PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<GlobalIndexResult::Iterator> iter2,
+                               other->CreateIterator());
+        RoaringBitmap64 bitmap1;
+        while (iter1->HasNext()) {
+            bitmap1.Add(iter1->Next());
+        }
+        RoaringBitmap64 bitmap2;
+        while (iter2->HasNext()) {
+            bitmap2.Add(iter2->Next());
+        }
+        bitmap1 &= bitmap2;
+        return bitmap1;
+    };
+    return std::make_shared<BitmapGlobalIndexResult>(supplier);
+}
+
+/// Builds a lazily evaluated union of this result with `other`.
+///
+/// Nothing is iterated here: the returned BitmapGlobalIndexResult holds a supplier that, when
+/// invoked, drains the iterators of both operands into a single bitmap. Both operands are
+/// kept alive by the supplier.
+///
+/// @param other right operand; must not be null.
+/// @return the lazy result, or an Invalid status when `other` is null.
+Result<std::shared_ptr<GlobalIndexResult>> GlobalIndexResult::Or(
+    const std::shared_ptr<GlobalIndexResult>& other) {
+    if (!other) {
+        // Fail at the call site instead of dereferencing null when the supplier runs later.
+        return Status::Invalid("other GlobalIndexResult is null in GlobalIndexResult::Or");
+    }
+    auto supplier = [other, result = shared_from_this()]() -> Result<RoaringBitmap64> {
+        PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<GlobalIndexResult::Iterator> iter1,
+                               result->CreateIterator());
+        PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<GlobalIndexResult::Iterator> iter2,
+                               other->CreateIterator());
+        RoaringBitmap64 bitmap;
+        while (iter1->HasNext()) {
+            bitmap.Add(iter1->Next());
+        }
+        while (iter2->HasNext()) {
+            bitmap.Add(iter2->Next());
+        }
+        return bitmap;
+    };
+    return std::make_shared<BitmapGlobalIndexResult>(supplier);
+}
+
+/// Serializes `global_index_result` into a byte buffer.
+///
+/// The buffer starts with the int32 VERSION, followed by the bitmap and score sections
+/// written by WriteBitmapAndScores. A BitmapGlobalIndexResult is written with an empty score
+/// section; a BitmapScoredGlobalIndexResult is written with its scores. Any other result
+/// type is rejected with an Invalid status.
+Result<PAIMON_UNIQUE_PTR<Bytes>> GlobalIndexResult::Serialize(
+    const std::shared_ptr<GlobalIndexResult>& global_index_result,
+    const std::shared_ptr<MemoryPool>& pool) {
+    MemorySegmentOutputStream out(MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool);
+    out.WriteValue<int32_t>(VERSION);
+
+    // Try the plain bitmap type first; the scored type is only probed when that fails.
+    auto plain_result = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(global_index_result);
+    std::shared_ptr<BitmapScoredGlobalIndexResult> scored_result;
+    if (!plain_result) {
+        scored_result =
+            std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(global_index_result);
+    }
+    if (!plain_result && !scored_result) {
+        return Status::Invalid(
+            "invalid GlobalIndexResult, must be BitmapGlobalIndexResult or "
+            "BitmapScoredGlobalIndexResult");
+    }
+
+    if (plain_result) {
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* row_ids, plain_result->GetBitmap());
+        WriteBitmapAndScores(row_ids, {}, &out, pool.get());
+    } else {
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* row_ids, scored_result->GetBitmap());
+        WriteBitmapAndScores(row_ids, scored_result->GetScores(), &out, pool.get());
+    }
+    return MemorySegmentUtils::CopyToBytes(out.Segments(), 0, out.CurrentSize(), pool.get());
+}
+
+/// Rebuilds a GlobalIndexResult from the first `length` bytes at `buffer`.
+///
+/// Layout read, in order: int32 version, int32 bitmap byte length, bitmap bytes, int32 score
+/// count, then one float per score. A score count of 0 yields a BitmapGlobalIndexResult;
+/// otherwise the count must equal the bitmap cardinality and a BitmapScoredGlobalIndexResult
+/// is returned.
+///
+/// @return the rebuilt result, or an Invalid status for an unknown version, an implausible
+///         bitmap length, or a score count that does not match the number of row ids. Read
+///         and bitmap decoding errors are propagated.
+Result<std::shared_ptr<GlobalIndexResult>> GlobalIndexResult::Deserialize(
+    const char* buffer, size_t length, const std::shared_ptr<MemoryPool>& pool) {
+    auto input_stream = std::make_shared<ByteArrayInputStream>(buffer, length);
+    DataInputStream in(input_stream);
+    PAIMON_ASSIGN_OR_RAISE(int32_t version, in.ReadValue<int32_t>());
+    if (version != VERSION) {
+        return Status::Invalid(fmt::format("invalid version {} for GlobalIndexResult", version));
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t bitmap_bytes_len, in.ReadValue<int32_t>());
+    // The length prefix comes from the payload itself: reject values that are negative or
+    // larger than the whole input before using them as an allocation size.
+    if (bitmap_bytes_len < 0 || static_cast<size_t>(bitmap_bytes_len) > length) {
+        return Status::Invalid(
+            fmt::format("invalid bitmap length {} for GlobalIndexResult", bitmap_bytes_len));
+    }
+    auto bitmap_bytes = Bytes::AllocateBytes(bitmap_bytes_len, pool.get());
+    PAIMON_RETURN_NOT_OK(in.ReadBytes(bitmap_bytes.get()));
+    RoaringBitmap64 bitmap;
+    PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bitmap_bytes->data(), bitmap_bytes->size()));
+
+    PAIMON_ASSIGN_OR_RAISE(int32_t score_len, in.ReadValue<int32_t>());
+    if (score_len == 0) {
+        // No scores were stored: hand the decoded bitmap to the supplier without copying it.
+        return std::make_shared<BitmapGlobalIndexResult>(
+            [bitmap = std::move(bitmap)]() -> Result<RoaringBitmap64> { return bitmap; });
+    }
+    if (score_len < 0 || score_len != bitmap.Cardinality()) {
+        return Status::Invalid("row id count mismatches score count");
+    }
+    std::vector<float> scores;
+    scores.reserve(score_len);
+    for (int32_t i = 0; i < score_len; i++) {
+        PAIMON_ASSIGN_OR_RAISE(float score, in.ReadValue<float>());
+        scores.push_back(score);
+    }
+    return std::make_shared<BitmapScoredGlobalIndexResult>(std::move(bitmap), std::move(scores));
+}
+
+/// Groups the row ids produced by the iterator into ranges of consecutive values.
+///
+/// Ids are consumed in iterator order. An id extends the current range when it is exactly one
+/// greater than the last id of that range; any other id closes the range and opens a new one.
+/// Each emitted Range is built from the first and the last id of its run.
+///
+/// @return the ranges in iteration order; empty when the result has no row ids.
+Result<std::vector<Range>> GlobalIndexResult::ToRanges() const {
+    std::vector<Range> ranges;
+    PAIMON_ASSIGN_OR_RAISE(bool empty, IsEmpty());
+    if (empty) {
+        return ranges;
+    }
+    PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<Iterator> iter, CreateIterator());
+    if (!iter->HasNext()) {
+        // Never call Next() on an exhausted iterator, even if IsEmpty() reported rows.
+        return ranges;
+    }
+    int64_t range_start = iter->Next();
+    int64_t range_end = range_start;
+    while (iter->HasNext()) {
+        int64_t current = iter->Next();
+        // Adjacency is tested with unsigned arithmetic so that a range ending at the largest
+        // int64 value does not trigger signed overflow in `range_end + 1`.
+        const bool extends_range =
+            current > range_end &&
+            static_cast<uint64_t>(current) - static_cast<uint64_t>(range_end) == 1;
+        if (extends_range) {
+            // Extend the current range
+            range_end = current;
+        } else {
+            ranges.emplace_back(range_start, range_end);
+            range_start = current;
+            range_end = current;
+        }
+    }
+    // Add the last range
+    ranges.emplace_back(range_start, range_end);
+    return ranges;
+}
+
+} // namespace paimon
diff --git a/src/paimon/common/global_index/global_index_result_test.cpp
b/src/paimon/common/global_index/global_index_result_test.cpp
new file mode 100644
index 0000000..73c6d05
--- /dev/null
+++ b/src/paimon/common/global_index/global_index_result_test.cpp
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/global_index/global_index_result.h"
+
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace paimon::test {
+/// Fixture for GlobalIndexResult tests. It provides FakeGlobalIndexResult, a minimal
+/// vector-backed GlobalIndexResult that is neither of the two bitmap-based result classes.
+class GlobalIndexResultTest : public ::testing::Test {
+ public:
+    void SetUp() override {}
+    void TearDown() override {}
+
+    /// GlobalIndexResult that yields the ids of a plain vector in stored order.
+    class FakeGlobalIndexResult : public GlobalIndexResult {
+     public:
+        explicit FakeGlobalIndexResult(const std::vector<int64_t>& values) : values_(values) {}
+
+        /// Forward cursor over the vector owned by the enclosing fake result.
+        /// The vector is borrowed, so the result must outlive the iterator.
+        class Iterator : public GlobalIndexResult::Iterator {
+         public:
+            Iterator(const std::vector<int64_t>* values,
+                     const std::vector<int64_t>::const_iterator& iter)
+                : values_(values), iter_(iter) {}
+
+            bool HasNext() const override {
+                return !(iter_ == values_->end());
+            }
+
+            /// Returns the id under the cursor and advances; performs no bounds check.
+            int64_t Next() override {
+                const int64_t current = *iter_;
+                ++iter_;
+                return current;
+            }
+
+            const std::vector<int64_t>* values_;
+            std::vector<int64_t>::const_iterator iter_;
+        };
+
+        Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator() const override {
+            return std::make_unique<Iterator>(&values_, values_.begin());
+        }
+
+        Result<bool> IsEmpty() const override {
+            return values_.empty();
+        }
+
+        std::string ToString() const override {
+            return "fake";
+        }
+
+        /// Returns a new fake result whose ids are this result's ids shifted by `offset`.
+        Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override {
+            std::vector<int64_t> shifted;
+            shifted.reserve(values_.size());
+            for (const int64_t value : values_) {
+                shifted.push_back(value + offset);
+            }
+            return std::make_shared<FakeGlobalIndexResult>(shifted);
+        }
+
+     private:
+        std::vector<int64_t> values_;
+    };
+};
+
+// And()/Or() on two non-bitmap results: checks the textual form of the combined result and
+// the ranges produced by ToRanges().
+TEST_F(GlobalIndexResultTest, TestSimple) {
+    const std::vector<int64_t> lhs_ids = {1, 3, 5, 100};
+    const std::vector<int64_t> rhs_ids = {100, 5, 4, 3, 200};
+    auto lhs = std::make_shared<FakeGlobalIndexResult>(lhs_ids);
+    auto rhs = std::make_shared<FakeGlobalIndexResult>(rhs_ids);
+
+    // Intersection: ids present on both sides.
+    ASSERT_OK_AND_ASSIGN(auto intersection, lhs->And(rhs));
+    ASSERT_EQ(intersection->ToString(), "{3,5,100}");
+    ASSERT_OK_AND_ASSIGN(auto intersection_ranges, intersection->ToRanges());
+    const std::vector<Range> expected_intersection_ranges = {Range(3, 3), Range(5, 5),
+                                                             Range(100, 100)};
+    ASSERT_EQ(intersection_ranges, expected_intersection_ranges);
+
+    // Union: 3, 4 and 5 are contiguous and therefore collapse into one range.
+    ASSERT_OK_AND_ASSIGN(auto union_result, lhs->Or(rhs));
+    ASSERT_EQ(union_result->ToString(), "{1,3,4,5,100,200}");
+    ASSERT_OK_AND_ASSIGN(auto union_ranges, union_result->ToRanges());
+    const std::vector<Range> expected_union_ranges = {Range(1, 1), Range(3, 5), Range(100, 100),
+                                                      Range(200, 200)};
+    ASSERT_EQ(union_ranges, expected_union_ranges);
+}
+
+// Round-trip test for a result without scores: a fixed byte buffer is deserialized, checked
+// against an equivalent BitmapGlobalIndexResult, and then serialized back to the same bytes.
+TEST_F(GlobalIndexResultTest, TestSerializeAndDeserializeSimple) {
+ auto pool = GetDefaultPool();
+ // 81 bytes in total. The literal values line up as: 4-byte version (1), 4-byte bitmap
+ // length (69), 69 bytes of bitmap payload, 4-byte score count (0).
+ // NOTE(review): that reading assumes big-endian integers - confirm against the stream code.
+ std::vector<uint8_t> byte_buffer = {
+ 0, 0, 0, 1, 0, 0, 0, 69, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 59,
+ 48, 3, 0, 5, 0, 0, 5, 0, 255, 127, 0, 0, 0, 128, 2, 0, 245,
133, 0, 0, 37,
+ 0, 0, 0, 47, 0, 0, 0, 49, 0, 0, 0, 55, 0, 0, 0, 2, 0,
1, 0, 4, 0,
+ 10, 0, 0, 0, 255, 255, 1, 0, 0, 0, 2, 0, 255, 224, 0, 0, 0,
0};
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<GlobalIndexResult> index_result,
+
GlobalIndexResult::Deserialize(reinterpret_cast<char*>(byte_buffer.data()),
+ byte_buffer.size(),
pool));
+ // With no scores in the buffer the concrete type must be the plain bitmap result.
+ auto typed_result =
std::dynamic_pointer_cast<BitmapGlobalIndexResult>(index_result);
+ ASSERT_TRUE(typed_result);
+
+ // Expected ids include values on both sides of 2^31 (2147483648).
+ auto bitmap = RoaringBitmap64::From(
+ {1l, 2l, 3l, 4l, 5l, 10l, 2247483647l, 2147483647l, 2147483648l,
2147483649l, 2147483650l});
+ auto expected_result = std::make_shared<BitmapGlobalIndexResult>(
+ [bitmap]() -> Result<RoaringBitmap64> { return bitmap; });
+ // Results are compared through their string form.
+ ASSERT_EQ(expected_result->ToString(), typed_result->ToString());
+ // Serializing the deserialized result must reproduce the input buffer byte for byte.
+ ASSERT_OK_AND_ASSIGN(auto serialize_bytes,
GlobalIndexResult::Serialize(index_result, pool));
+ ASSERT_EQ(byte_buffer, std::vector<uint8_t>(serialize_bytes->data(),
+ serialize_bytes->data() +
serialize_bytes->size()));
+}
+
+// Round-trip test for a result with scores: a fixed byte buffer is deserialized, checked
+// against an equivalent BitmapScoredGlobalIndexResult, and serialized back to the same bytes.
+TEST_F(GlobalIndexResultTest, TestSerializeAndDeserializeWithScore) {
+ auto pool = GetDefaultPool();
+ // 100 bytes in total. The literal values line up as: 4-byte version (1), 4-byte bitmap
+ // length (64), 64 bytes of bitmap payload, 4-byte score count (6), then six 4-byte floats.
+ // NOTE(review): that reading assumes big-endian encoding - confirm against the stream code.
+ std::vector<uint8_t> byte_buffer = {
+ 0, 0, 0, 1, 0, 0, 0, 64, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0,
+ 58, 48, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 255, 127, 0,
0, 0, 128, 2, 0,
+ 245, 133, 0, 0, 40, 0, 0, 0, 42, 0, 0, 0, 44, 0, 0,
0, 50, 0, 0, 0,
+ 10, 0, 255, 255, 1, 0, 3, 0, 5, 0, 255, 224, 0, 0, 0,
6, 63, 129, 71, 174,
+ 191, 168, 245, 195, 64, 135, 92, 41, 66, 74, 245, 195, 194, 200, 128,
0, 64, 6, 102, 102};
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<GlobalIndexResult> index_result,
+
GlobalIndexResult::Deserialize(reinterpret_cast<char*>(byte_buffer.data()),
+ byte_buffer.size(),
pool));
+ // A non-zero score count must yield the scored result type.
+ auto typed_result =
std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(index_result);
+ ASSERT_TRUE(typed_result);
+
+ // Six ids and six scores: the two lists have the same length.
+ auto bitmap = RoaringBitmap64::From(
+ {10l, 2147483647l, 2147483649l, 2147483651l, 2147483653l,
2247483647l});
+ std::vector<float> scores = {1.01f, -1.32f, 4.23f, 50.74f, -100.25f,
2.10f};
+ auto expected_result =
+ std::make_shared<BitmapScoredGlobalIndexResult>(std::move(bitmap),
std::move(scores));
+ // Results are compared through their string form.
+ ASSERT_EQ(expected_result->ToString(), typed_result->ToString());
+ // Serializing the deserialized result must reproduce the input buffer byte for byte.
+ ASSERT_OK_AND_ASSIGN(auto serialize_bytes,
GlobalIndexResult::Serialize(index_result, pool));
+ ASSERT_EQ(byte_buffer, std::vector<uint8_t>(serialize_bytes->data(),
+ serialize_bytes->data() +
serialize_bytes->size()));
+}
+
+// Serialize() must reject a result that is not one of the two bitmap-based result classes.
+TEST_F(GlobalIndexResultTest, TestInvalidSerialize) {
+    const std::vector<int64_t> row_ids = {1, 3, 5, 100};
+    auto unsupported_result = std::make_shared<FakeGlobalIndexResult>(row_ids);
+    auto pool = GetDefaultPool();
+    ASSERT_NOK_WITH_MSG(GlobalIndexResult::Serialize(unsupported_result, pool),
+                        "invalid GlobalIndexResult, must be BitmapGlobalIndexResult or "
+                        "BitmapScoredGlobalIndexResult");
+}
+} // namespace paimon::test
diff --git a/src/paimon/common/global_index/global_index_utils.h
b/src/paimon/common/global_index/global_index_utils.h
new file mode 100644
index 0000000..0aed101
--- /dev/null
+++ b/src/paimon/common/global_index/global_index_utils.h
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+#include <cstdint>
+#include <optional>
+#include <vector>
+
+#include "arrow/c/abi.h"
+#include "arrow/c/helpers.h"
+#include "fmt/format.h"
+#include "paimon/common/utils/scope_guard.h"
+#include "paimon/status.h"
+namespace paimon {
+/// Static-only helper class for global index code; it cannot be instantiated.
+class GlobalIndexUtils {
+ public:
+    GlobalIndexUtils() = delete;
+    ~GlobalIndexUtils() = delete;
+
+    /// Validates a batch of relative row ids against the Arrow array they belong to.
+    ///
+    /// Ownership of `c_arrow_array`: when a length or first-id check fails, the array is
+    /// released before returning. On success, and when the pointer is null, nothing is
+    /// released, so after an OK status the caller still owns the array.
+    ///
+    /// @param c_arrow_array array the row ids refer to; must not be null.
+    /// @param relative_row_ids row ids, one per array element.
+    /// @param expected_next_row_id when set and `relative_row_ids` is non-empty, the value
+    ///        the first row id must equal.
+    /// @return OK when all checks pass, otherwise Invalid with a descriptive message.
+    static Status CheckRelativeRowIds(::ArrowArray* c_arrow_array,
+                                      const std::vector<int64_t>& relative_row_ids,
+                                      std::optional<int64_t> expected_next_row_id) {
+        if (c_arrow_array == nullptr) {
+            return Status::Invalid("CheckRelativeRowIds failed: null c_arrow_array");
+        }
+        const int64_t array_length = c_arrow_array->length;
+        // Armed for every failure exit below; disarmed just before the successful return.
+        ScopeGuard release_on_failure(
+            [c_arrow_array]() -> void { ArrowArrayRelease(c_arrow_array); });
+
+        const auto row_id_count = static_cast<int64_t>(relative_row_ids.size());
+        if (row_id_count != array_length) {
+            return Status::Invalid(fmt::format(
+                "relative_row_ids length {} mismatch arrow_array length {} in CheckRelativeRowIds",
+                relative_row_ids.size(), array_length));
+        }
+
+        // The first-id check only applies when there is both an id and an expectation.
+        const bool must_check_first_id =
+            !relative_row_ids.empty() && expected_next_row_id.has_value();
+        if (must_check_first_id && relative_row_ids.front() != expected_next_row_id.value()) {
+            return Status::Invalid(
+                fmt::format("first relative_row_ids {} mismatch inner expected_next_row_id {} in "
+                            "CheckRelativeRowIds",
+                            relative_row_ids.front(), expected_next_row_id.value()));
+        }
+
+        release_on_failure.Release();
+        return Status::OK();
+    }
+};
+} // namespace paimon
diff --git a/src/paimon/common/global_index/global_index_utils_test.cpp
b/src/paimon/common/global_index/global_index_utils_test.cpp
new file mode 100644
index 0000000..c31aebd
--- /dev/null
+++ b/src/paimon/common/global_index/global_index_utils_test.cpp
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/common/global_index/global_index_utils.h"
+
+#include <optional>
+#include <vector>
+
+#include "arrow/api.h"
+#include "arrow/c/bridge.h"
+#include "gtest/gtest.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace paimon::test {
+
+/// Fixture for GlobalIndexUtils tests; provides a helper that builds exported Arrow arrays.
+class GlobalIndexUtilsTest : public ::testing::Test {
+ public:
+    /// Builds an Arrow int32 array holding `values` and exports it into a C `ArrowArray`.
+    ///
+    /// The returned struct is owned by the caller, who must release it unless it is passed to
+    /// code that releases it. If building or exporting fails, a test failure is recorded and
+    /// the returned struct is left zero-initialized (null `release` callback) rather than
+    /// holding indeterminate values.
+    static ArrowArray CreateInt32Array(const std::vector<int32_t>& values) {
+        arrow::Int32Builder builder;
+        EXPECT_TRUE(builder.AppendValues(values).ok());
+        std::shared_ptr<arrow::Array> array;
+        EXPECT_TRUE(builder.Finish(&array).ok());
+        // Value-initialize: EXPECT_* failures do not abort, so the struct may be returned
+        // without ever having been filled in by ExportArray.
+        ArrowArray c_array{};
+        if (array != nullptr) {
+            EXPECT_TRUE(arrow::ExportArray(*array, &c_array).ok());
+        }
+        return c_array;
+    }
+};
+
+// A null array pointer is rejected before any other check runs.
+TEST_F(GlobalIndexUtilsTest, TestNullArrowArray) {
+    const std::vector<int64_t> relative_ids = {0, 1, 2};
+    ::ArrowArray* const missing_array = nullptr;
+    ASSERT_NOK_WITH_MSG(
+        GlobalIndexUtils::CheckRelativeRowIds(missing_array, relative_ids, std::nullopt),
+        "CheckRelativeRowIds failed: null c_arrow_array");
+}
+
+// An empty array paired with an empty row id list passes validation.
+TEST_F(GlobalIndexUtilsTest, TestEmptyArrayReturnsOk) {
+    ArrowArray empty_array = CreateInt32Array({});
+    const std::vector<int64_t> no_row_ids;
+    ASSERT_OK(GlobalIndexUtils::CheckRelativeRowIds(&empty_array, no_row_ids, std::nullopt));
+    // On success CheckRelativeRowIds does not release the array (its guard is disarmed),
+    // so it is released here.
+    const bool still_owned = !ArrowArrayIsReleased(&empty_array);
+    if (still_owned) {
+        ArrowArrayRelease(&empty_array);
+    }
+}
+
+// An empty array with a non-empty row id list is a length mismatch.
+TEST_F(GlobalIndexUtilsTest, TestEmptyArrayReturnsInvalid) {
+    const std::vector<int64_t> relative_ids = {0, 1, 2};
+    ArrowArray empty_array = CreateInt32Array({});
+    ASSERT_NOK_WITH_MSG(
+        GlobalIndexUtils::CheckRelativeRowIds(&empty_array, relative_ids, std::nullopt),
+        "relative_row_ids length 3 mismatch arrow_array length 0 in CheckRelativeRowIds");
+}
+
+// Matching lengths and no expected first id: only the length check applies, and it passes.
+TEST_F(GlobalIndexUtilsTest, TestMatchingRowIdsAndNoExpectedNextRowId) {
+    ArrowArray three_values = CreateInt32Array({10, 20, 30});
+    const std::vector<int64_t> relative_ids = {0, 1, 2};
+    const std::optional<int64_t> no_expectation = std::nullopt;
+    ASSERT_OK(GlobalIndexUtils::CheckRelativeRowIds(&three_values, relative_ids, no_expectation));
+    // The success path leaves the array unreleased, so the test releases it.
+    const bool still_owned = !ArrowArrayIsReleased(&three_values);
+    if (still_owned) {
+        ArrowArrayRelease(&three_values);
+    }
+}
+
+// Matching lengths and an expected first id equal to row_ids[0]: validation passes.
+TEST_F(GlobalIndexUtilsTest, TestMatchingRowIdsWithCorrectExpectedNextRowId) {
+    ArrowArray three_values = CreateInt32Array({10, 20, 30});
+    const std::vector<int64_t> relative_ids = {5, 6, 7};
+    const std::optional<int64_t> expected_first_id = 5;
+    ASSERT_OK(
+        GlobalIndexUtils::CheckRelativeRowIds(&three_values, relative_ids, expected_first_id));
+    // The success path leaves the array unreleased, so the test releases it.
+    const bool still_owned = !ArrowArrayIsReleased(&three_values);
+    if (still_owned) {
+        ArrowArrayRelease(&three_values);
+    }
+}
+
+// Two row ids for a three-element array: the length check fails.
+TEST_F(GlobalIndexUtilsTest, TestMismatchedLength) {
+    const std::vector<int64_t> relative_ids = {0, 1};
+    ArrowArray three_values = CreateInt32Array({10, 20, 30});
+    ASSERT_NOK_WITH_MSG(
+        GlobalIndexUtils::CheckRelativeRowIds(&three_values, relative_ids, std::nullopt),
+        "relative_row_ids length 2 mismatch arrow_array length 3 in CheckRelativeRowIds");
+}
+
+// Lengths match, but the first row id (5) differs from the expected one (100).
+TEST_F(GlobalIndexUtilsTest, TestMismatchedExpectedNextRowId) {
+    ArrowArray three_values = CreateInt32Array({10, 20, 30});
+    const std::vector<int64_t> relative_ids = {5, 6, 7};
+    const std::optional<int64_t> expected_first_id = 100;
+    ASSERT_NOK_WITH_MSG(
+        GlobalIndexUtils::CheckRelativeRowIds(&three_values, relative_ids, expected_first_id),
+        "first relative_row_ids 5 mismatch inner expected_next_row_id 100 in CheckRelativeRowIds");
+}
+
+} // namespace paimon::test
diff --git a/src/paimon/common/global_index/global_indexer_factory.cpp
b/src/paimon/common/global_index/global_indexer_factory.cpp
new file mode 100644
index 0000000..11dbc98
--- /dev/null
+++ b/src/paimon/common/global_index/global_indexer_factory.cpp
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/global_index/global_indexer_factory.h"
+
+#include <cassert>
+#include <utility>
+
+#include "paimon/factories/factory_creator.h"
+#include "paimon/global_index/global_indexer.h"
+#include "paimon/status.h"
+
+namespace paimon {
+const char GlobalIndexerFactory::GLOBAL_INDEX_IDENTIFIER_SUFFIX[] = "-global";
+
+/// Looks up the global indexer factory registered for `identifier` and creates an indexer.
+///
+/// The lookup key is the identifier followed by GLOBAL_INDEX_IDENTIFIER_SUFFIX; the legacy
+/// identifier "lumina-vector-ann" is mapped to "lumina" first.
+///
+/// @param identifier index type identifier.
+/// @param options options forwarded to the factory's Create().
+/// @return the factory's Create() result; an empty unique_ptr (not an error) when nothing
+///         usable as a GlobalIndexerFactory is found for the key; Invalid when the
+///         FactoryCreator instance is null.
+Result<std::unique_ptr<GlobalIndexer>> GlobalIndexerFactory::Get(
+    const std::string& identifier, const std::map<std::string, std::string>& options) {
+    // Compatibility: "lumina-vector-ann" was the old identifier for lumina global index.
+    std::string lookup_key = identifier;
+    if (lookup_key == "lumina-vector-ann") {
+        lookup_key = "lumina";
+    }
+    lookup_key += GLOBAL_INDEX_IDENTIFIER_SUFFIX;
+
+    auto creator = FactoryCreator::GetInstance();
+    if (creator == nullptr) {
+        assert(false);
+        return Status::Invalid("factory creator is null pointer");
+    }
+
+    auto indexer_factory = dynamic_cast<GlobalIndexerFactory*>(creator->Create(lookup_key));
+    if (indexer_factory != nullptr) {
+        return indexer_factory->Create(options);
+    }
+    // An unknown index type is not an error: return a null indexer so the caller can skip it.
+    return std::unique_ptr<GlobalIndexer>();
+}
+} // namespace paimon
diff --git a/src/paimon/common/global_index/global_indexer_factory_test.cpp
b/src/paimon/common/global_index/global_indexer_factory_test.cpp
new file mode 100644
index 0000000..77e819c
--- /dev/null
+++ b/src/paimon/common/global_index/global_indexer_factory_test.cpp
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/global_index/global_indexer_factory.h"
+
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "paimon/common/global_index/bitmap/bitmap_global_index.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace paimon::test {
+// The "bitmap" identifier resolves to an indexer whose dynamic type is BitmapGlobalIndex.
+TEST(GlobalIndexerFactoryTest, TestSimple) {
+    const std::map<std::string, std::string> no_options;
+    ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalIndexer> created,
+                         GlobalIndexerFactory::Get("bitmap", no_options));
+
+    auto* as_bitmap_index = dynamic_cast<BitmapGlobalIndex*>(created.get());
+    ASSERT_TRUE(as_bitmap_index);
+}
+
+// An unregistered identifier yields an OK status carrying a null indexer.
+TEST(GlobalIndexerFactoryTest, TestNonExist) {
+    const std::map<std::string, std::string> no_options;
+    ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalIndexer> created,
+                         GlobalIndexerFactory::Get("nonexist", no_options));
+    ASSERT_TRUE(created == nullptr);
+}
+
+// The legacy identifier "lumina-vector-ann" must behave like "lumina": either both lookups
+// produce an indexer or both produce null, depending on whether the lumina module is linked.
+TEST(GlobalIndexerFactoryTest, TestLuminaVectorAnnCompatibility) {
+    const std::map<std::string, std::string> no_options;
+    ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalIndexer> legacy_indexer,
+                         GlobalIndexerFactory::Get("lumina-vector-ann", no_options));
+    ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalIndexer> current_indexer,
+                         GlobalIndexerFactory::Get("lumina", no_options));
+
+    const bool legacy_found = static_cast<bool>(legacy_indexer);
+    const bool current_found = static_cast<bool>(current_indexer);
+    ASSERT_EQ(legacy_found, current_found);
+}
+} // namespace paimon::test