This is an automated email from the ASF dual-hosted git repository.
zhouyuan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git
The following commit(s) were added to refs/heads/main by this push:
new dd16714827 [VL][Delta] Add roaring bitmap facilities for Delta DV
support (#12001)
dd16714827 is described below
commit dd167148274a41984860ae36e3a1e3b37cf8ddef
Author: Mohammad Linjawi <[email protected]>
AuthorDate: Tue May 5 17:29:23 2026 +0300
[VL][Delta] Add roaring bitmap facilities for Delta DV support (#12001)
This PR is the first step in the split Delta deletion-vector (DV) stack.
It introduces the native roaring bitmap facilities that later PRs use for
Delta DV payload handling, without adding Delta scan integration yet.
Main changes:
add FindRoaring.cmake to discover CRoaring from an existing Velox build,
pkg-config, or FetchContent
wire roaring into the native Velox build
add gluten::delta::RoaringBitmapArray, a small 64-bit roaring bitmap
wrapper for Delta DV payloads
use the same payload shape expected by the later native DV reader:
4-byte magic number
CRoaring portable Roaring64Map serialization
add focused native unit coverage for:
serialization / deserialization round-trip
invalid magic-number rejection
This PR is intentionally infrastructure-only:
no Delta scan offload behavior changes yet
no JVM-side Delta scan integration yet
Those pieces will be added in follow-up stacked PRs.
issue #11901
Co-authored-by: Mohammad Linjawi <[email protected]>
---
cpp/CMake/BuildRoaring.cmake | 32 ++++
cpp/CMake/FindRoaring.cmake | 180 +++++++++++++++++++++
cpp/velox/CMakeLists.txt | 5 +
cpp/velox/compute/delta/RoaringBitmapArray.cpp | 86 ++++++++++
cpp/velox/compute/delta/RoaringBitmapArray.h | 60 +++++++
cpp/velox/compute/delta/tests/CMakeLists.txt | 24 +++
.../compute/delta/tests/RoaringBitmapArrayTest.cpp | 53 ++++++
7 files changed, 440 insertions(+)
diff --git a/cpp/CMake/BuildRoaring.cmake b/cpp/CMake/BuildRoaring.cmake
new file mode 100644
index 0000000000..964a832258
--- /dev/null
+++ b/cpp/CMake/BuildRoaring.cmake
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
contributor
+# license agreements. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership. The ASF licenses this
+# file to You under the Apache License, Version 2.0 (the "License"); you may
not
+# use this file except in compliance with the License. You may obtain a copy
of
+# the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+include_guard(GLOBAL)
+include(FetchContent)
+
+if(NOT DEFINED GLUTEN_ROARING_VERSION)
+ set(GLUTEN_ROARING_VERSION "4.3.11")
+endif()
+
+set(ENABLE_ROARING_TESTS
+ OFF
+ CACHE BOOL "" FORCE)
+
+message(STATUS "Building roaring from source")
+FetchContent_Declare(
+ roaring_fetch
+ GIT_REPOSITORY "https://github.com/RoaringBitmap/CRoaring.git"
+ GIT_TAG "v${GLUTEN_ROARING_VERSION}"
+ GIT_SHALLOW TRUE)
+FetchContent_MakeAvailable(roaring_fetch)
diff --git a/cpp/CMake/FindRoaring.cmake b/cpp/CMake/FindRoaring.cmake
new file mode 100644
index 0000000000..0d36108373
--- /dev/null
+++ b/cpp/CMake/FindRoaring.cmake
@@ -0,0 +1,180 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
contributor
+# license agreements. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership. The ASF licenses this
+# file to You under the Apache License, Version 2.0 (the "License"); you may
not
+# use this file except in compliance with the License. You may obtain a copy
of
+# the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+set(_roaring_pkgconfig_hints "")
+set(_roaring_include_hints "")
+set(_roaring_library_hints "")
+set(GLUTEN_ROARING_VERSION "4.3.11")
+
+foreach(_root ${VELOX_BUILD_PATH} ${VELOX_HOME}/_build/release
+ ${VELOX_HOME}/_build/debug)
+ if(_root)
+ list(APPEND _roaring_pkgconfig_hints "${_root}/_deps/roaring-build")
+ list(APPEND _roaring_include_hints "${_root}/_deps/roaring-src/include"
+ "${_root}/_deps/roaring-src/cpp"
+ "${_root}/_deps/roaring-build/include")
+ list(APPEND _roaring_library_hints "${_root}/_deps/roaring-build/src"
+ "${_root}/_deps/roaring-build")
+ endif()
+endforeach()
+
+if(DEFINED VELOX_HOME)
+ list(APPEND _roaring_pkgconfig_hints
+ "${VELOX_HOME}/deps-install/lib/pkgconfig"
+ "${VELOX_HOME}/deps-install/lib64/pkgconfig")
+ list(APPEND _roaring_include_hints "${VELOX_HOME}/deps-install/include")
+ list(APPEND _roaring_library_hints "${VELOX_HOME}/deps-install/lib"
+ "${VELOX_HOME}/deps-install/lib64")
+endif()
+
+list(REMOVE_DUPLICATES _roaring_pkgconfig_hints)
+list(REMOVE_DUPLICATES _roaring_include_hints)
+list(REMOVE_DUPLICATES _roaring_library_hints)
+
+function(_gluten_roaring_add_headers target_name)
+ find_path(
+ Roaring_INCLUDE_DIR
+ NAMES roaring/roaring.h
+ HINTS ${_roaring_include_hints})
+ find_path(
+ Roaring_CPP_INCLUDE_DIR
+ NAMES roaring/roaring64map.hh
+ HINTS ${_roaring_include_hints})
+ if(Roaring_INCLUDE_DIR)
+ target_include_directories(${target_name}
+ INTERFACE "${Roaring_INCLUDE_DIR}")
+ endif()
+ if(Roaring_CPP_INCLUDE_DIR)
+ target_include_directories(${target_name}
+ INTERFACE "${Roaring_CPP_INCLUDE_DIR}")
+ endif()
+endfunction()
+
+function(_gluten_roaring_enable_pic target_name)
+ if(NOT TARGET ${target_name})
+ return()
+ endif()
+
+ get_target_property(_gluten_roaring_imported ${target_name} IMPORTED)
+ if(NOT _gluten_roaring_imported)
+ set_target_properties(${target_name} PROPERTIES POSITION_INDEPENDENT_CODE
+ ON)
+ endif()
+endfunction()
+
+# Check if roaring target already exists.
+if(TARGET roaring)
+ _gluten_roaring_enable_pic(roaring)
+ _gluten_roaring_add_headers(roaring)
+ set(Roaring_FOUND TRUE)
+ message(STATUS "Target roaring was already found.")
+ return()
+endif()
+
+find_package(PkgConfig QUIET)
+set(_roaring_found_via_pkgconfig OFF)
+
+if(PkgConfig_FOUND)
+ set(_roaring_saved_pkg_config_path "$ENV{PKG_CONFIG_PATH}")
+ list(JOIN _roaring_pkgconfig_hints ":" _roaring_hint_pkg_config_path)
+ if(_roaring_hint_pkg_config_path)
+ if(_roaring_saved_pkg_config_path STREQUAL "")
+ set(ENV{PKG_CONFIG_PATH} "${_roaring_hint_pkg_config_path}")
+ else()
+ set(ENV{PKG_CONFIG_PATH}
+ "${_roaring_hint_pkg_config_path}:${_roaring_saved_pkg_config_path}")
+ endif()
+ endif()
+ pkg_check_modules(Roaring QUIET IMPORTED_TARGET roaring)
+ set(ENV{PKG_CONFIG_PATH} "${_roaring_saved_pkg_config_path}")
+ if(Roaring_FOUND)
+ list(APPEND _roaring_include_hints ${Roaring_INCLUDE_DIRS})
+ list(APPEND _roaring_library_hints ${Roaring_LIBRARY_DIRS})
+ if(DEFINED Roaring_INCLUDEDIR)
+ list(APPEND _roaring_include_hints "${Roaring_INCLUDEDIR}")
+ endif()
+ if(DEFINED Roaring_LIBDIR)
+ list(APPEND _roaring_library_hints "${Roaring_LIBDIR}")
+ endif()
+ set(_roaring_found_via_pkgconfig ON)
+ endif()
+endif()
+
+list(REMOVE_DUPLICATES _roaring_include_hints)
+list(REMOVE_DUPLICATES _roaring_library_hints)
+
+find_path(
+ Roaring_INCLUDE_DIR
+ NAMES roaring/roaring.h
+ HINTS ${_roaring_include_hints})
+find_path(
+ Roaring_CPP_INCLUDE_DIR
+ NAMES roaring/roaring64map.hh
+ HINTS ${_roaring_include_hints})
+find_library(
+ Roaring_LIBRARY
+ NAMES roaring
+ HINTS ${_roaring_library_hints})
+
+if((NOT Roaring_LIBRARY)
+ AND DEFINED pkgcfg_lib_Roaring_roaring
+ AND EXISTS "${pkgcfg_lib_Roaring_roaring}")
+ set(Roaring_LIBRARY "${pkgcfg_lib_Roaring_roaring}")
+endif()
+
+if(Roaring_INCLUDE_DIR
+ AND Roaring_CPP_INCLUDE_DIR
+ AND Roaring_LIBRARY)
+ add_library(roaring UNKNOWN IMPORTED)
+ set_target_properties(
+ roaring
+ PROPERTIES IMPORTED_LOCATION "${Roaring_LIBRARY}"
+ INTERFACE_INCLUDE_DIRECTORIES
+ "${Roaring_INCLUDE_DIR};${Roaring_CPP_INCLUDE_DIR}")
+ _gluten_roaring_add_headers(roaring)
+ set(Roaring_FOUND TRUE)
+ message(STATUS "Found roaring via direct library lookup.")
+ return()
+endif()
+
+if(_roaring_found_via_pkgconfig)
+ add_library(roaring INTERFACE)
+ target_link_libraries(roaring INTERFACE PkgConfig::Roaring)
+ _gluten_roaring_add_headers(roaring)
+ set(Roaring_FOUND TRUE)
+ message(STATUS "Found roaring via pkg-config imported target fallback.")
+ return()
+endif()
+
+include(BuildRoaring)
+
+if(TARGET roaring)
+ _gluten_roaring_enable_pic(roaring)
+ _gluten_roaring_add_headers(roaring)
+ set(Roaring_FOUND TRUE)
+ message(STATUS "Found roaring via FetchContent.")
+ return()
+endif()
+
+set(Roaring_FOUND FALSE)
+
+if(Roaring_FIND_REQUIRED)
+ message(
+ FATAL_ERROR
+ "Failed to find roaring. Set VELOX_HOME/VELOX_BUILD_PATH to a Velox
build "
+ "that contains roaring in _deps, or provide roaring via
PKG_CONFIG_PATH.")
+elseif(NOT Roaring_FIND_QUIETLY)
+ message(WARNING "Failed to find roaring.")
+endif()
diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt
index b9d3e65afe..5034c1601a 100644
--- a/cpp/velox/CMakeLists.txt
+++ b/cpp/velox/CMakeLists.txt
@@ -75,6 +75,8 @@ if(NOT DEFINED VELOX_BUILD_PATH)
endif()
endif()
+find_package(Roaring REQUIRED)
+
set(VELOX_PROTO_SRC_DIR
${GLUTEN_HOME}/backends-velox/src/main/resources/org/apache/gluten/proto)
message(STATUS "Set Gluten Proto Directory in ${VELOX_PROTO_SRC_DIR}")
@@ -157,6 +159,7 @@ set(VELOX_SRCS
compute/VeloxRuntime.cc
compute/VeloxPlanConverter.cc
compute/WholeStageResultIterator.cc
+ compute/delta/RoaringBitmapArray.cpp
compute/iceberg/IcebergPlanConverter.cc
jni/JniFileSystem.cc
jni/JniUdf.cc
@@ -323,6 +326,7 @@ endif()
target_link_libraries(velox PUBLIC facebook::velox)
target_link_libraries(velox PUBLIC Folly::folly)
+target_link_libraries(velox PUBLIC roaring)
find_re2()
target_link_libraries(velox PUBLIC ${RE2_LIBRARY})
@@ -387,6 +391,7 @@ target_link_libraries(velox PUBLIC ICU::i18n ICU::uc
ICU::data)
if(BUILD_TESTS)
add_subdirectory(tests)
+ add_subdirectory(compute/delta/tests)
endif()
if(BUILD_BENCHMARKS)
diff --git a/cpp/velox/compute/delta/RoaringBitmapArray.cpp
b/cpp/velox/compute/delta/RoaringBitmapArray.cpp
new file mode 100644
index 0000000000..7da3364d0b
--- /dev/null
+++ b/cpp/velox/compute/delta/RoaringBitmapArray.cpp
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute/delta/RoaringBitmapArray.h"
+
+#include <cstring>
+
+#include "velox/common/base/Exceptions.h"
+
+namespace gluten::delta {
+
+namespace {
+
+uint32_t readUint32LittleEndian(const char* data) {
+ const auto* bytes = reinterpret_cast<const uint8_t*>(data);
+ return static_cast<uint32_t>(bytes[0]) | (static_cast<uint32_t>(bytes[1]) <<
8) |
+ (static_cast<uint32_t>(bytes[2]) << 16) |
(static_cast<uint32_t>(bytes[3]) << 24);
+}
+
+void writeUint32LittleEndian(char* data, uint32_t value) {
+ auto* bytes = reinterpret_cast<uint8_t*>(data);
+ bytes[0] = static_cast<uint8_t>(value & 0xFF);
+ bytes[1] = static_cast<uint8_t>((value >> 8) & 0xFF);
+ bytes[2] = static_cast<uint8_t>((value >> 16) & 0xFF);
+ bytes[3] = static_cast<uint8_t>((value >> 24) & 0xFF);
+}
+
+} // namespace
+
+void RoaringBitmapArray::addSafe(uint64_t value) {
+ bitmap_.add(value);
+}
+
+bool RoaringBitmapArray::containsSafe(uint64_t value) const {
+ return bitmap_.contains(value);
+}
+
+void RoaringBitmapArray::serialize(char* buffer) const {
+ VELOX_CHECK_NOT_NULL(buffer, "RoaringBitmapArray serialization buffer is
null");
+ writeUint32LittleEndian(buffer, kPortableSerializationFormatMagicNumber);
+ bitmap_.write(buffer + sizeof(uint32_t), true);
+}
+
+void RoaringBitmapArray::deserialize(const char* buffer, size_t size) {
+ VELOX_CHECK_NOT_NULL(buffer, "RoaringBitmapArray input buffer is null");
+ VELOX_CHECK_GE(size, sizeof(uint32_t), "RoaringBitmapArray payload is too
small: {}", size);
+ const auto magic = readUint32LittleEndian(buffer);
+ VELOX_CHECK_EQ(
+ magic, kPortableSerializationFormatMagicNumber, "Unexpected
RoaringBitmapArray magic number {}", magic);
+ bitmap_ = roaring::Roaring64Map::readSafe(buffer + sizeof(uint32_t), size -
sizeof(uint32_t));
+}
+
+size_t RoaringBitmapArray::serializedSizeInBytes() const {
+ return sizeof(uint32_t) + bitmap_.getSizeInBytes(true);
+}
+
+} // namespace gluten::delta
diff --git a/cpp/velox/compute/delta/RoaringBitmapArray.h
b/cpp/velox/compute/delta/RoaringBitmapArray.h
new file mode 100644
index 0000000000..68eb2c9db3
--- /dev/null
+++ b/cpp/velox/compute/delta/RoaringBitmapArray.h
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <roaring/roaring64map.hh>
+
+namespace gluten::delta {
+
+/// Minimal 64-bit roaring bitmap wrapper for Delta deletion-vector payloads.
+/// This intentionally models the payload format consumed by
DeltaDeletionVectorReader:
+/// a 4-byte magic number followed by CRoaring's portable Roaring64Map
serialization.
+class RoaringBitmapArray {
+ public:
+ static constexpr uint32_t kPortableSerializationFormatMagicNumber =
1681511377;
+
+ void addSafe(uint64_t value);
+ bool containsSafe(uint64_t value) const;
+
+ void serialize(char* buffer) const;
+ void deserialize(const char* buffer, size_t size);
+
+ size_t serializedSizeInBytes() const;
+
+ private:
+ roaring::Roaring64Map bitmap_;
+};
+
+} // namespace gluten::delta
diff --git a/cpp/velox/compute/delta/tests/CMakeLists.txt
b/cpp/velox/compute/delta/tests/CMakeLists.txt
new file mode 100644
index 0000000000..ec86cab0d7
--- /dev/null
+++ b/cpp/velox/compute/delta/tests/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(velox_roaring_bitmap_array_test RoaringBitmapArrayTest.cpp)
+
+target_link_libraries(velox_roaring_bitmap_array_test velox roaring
+ GTest::gtest GTest::gtest_main)
+
+add_test(
+ NAME velox_roaring_bitmap_array_test
+ COMMAND velox_roaring_bitmap_array_test
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/cpp/velox/compute/delta/tests/RoaringBitmapArrayTest.cpp
b/cpp/velox/compute/delta/tests/RoaringBitmapArrayTest.cpp
new file mode 100644
index 0000000000..0eca672608
--- /dev/null
+++ b/cpp/velox/compute/delta/tests/RoaringBitmapArrayTest.cpp
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute/delta/RoaringBitmapArray.h"
+
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+namespace gluten::delta {
+namespace {
+
+TEST(RoaringBitmapArrayTest, SerializeRoundTrip) {
+ RoaringBitmapArray bitmap;
+ bitmap.addSafe(1);
+ bitmap.addSafe(7);
+ bitmap.addSafe(1ULL << 33);
+
+ std::vector<char> serialized(bitmap.serializedSizeInBytes());
+ bitmap.serialize(serialized.data());
+
+ RoaringBitmapArray restored;
+ restored.deserialize(serialized.data(), serialized.size());
+
+ EXPECT_TRUE(restored.containsSafe(1));
+ EXPECT_TRUE(restored.containsSafe(7));
+ EXPECT_TRUE(restored.containsSafe(1ULL << 33));
+ EXPECT_FALSE(restored.containsSafe(2));
+}
+
+TEST(RoaringBitmapArrayTest, RejectsBadMagic) {
+ RoaringBitmapArray bitmap;
+ std::vector<char> invalid(sizeof(uint32_t), '\0');
+ EXPECT_ANY_THROW(bitmap.deserialize(invalid.data(), invalid.size()));
+}
+
+} // namespace
+} // namespace gluten::delta
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]