This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d437efea333 [chore](thirdparty)(paimon-cpp) reuse Doris Arrow stack
and isolate external headers (#60946)
d437efea333 is described below
commit d437efea3331da63de72ba5939d9283f1bad7385
Author: Chenjunwei <[email protected]>
AuthorDate: Mon Mar 2 20:20:43 2026 +0800
[chore](thirdparty)(paimon-cpp) reuse Doris Arrow stack and isolate
external headers (#60946)
## Summary
Split the thirdparty-only changes out of #60883 into an independent PR, so
that the `thirdparty` changes can be merged first.
## Included Files
- `thirdparty/build-thirdparty.sh`
- `thirdparty/download-thirdparty.sh`
- `thirdparty/paimon-cpp-cache.cmake`
- `thirdparty/patches/apache-arrow-17.0.0-paimon.patch`
- `thirdparty/patches/paimon-cpp-buildutils-static-deps.patch`
## Why Split
- Keep this PR focused on `thirdparty` integration only.
- Reduce rebase/conflict risk for the original feature branch.
## Follow-up
1. Merge this PR first.
2. Rebase the original feature branch on latest `master`.
3. Keep non-thirdparty logic in the original PR.
---
thirdparty/build-thirdparty.sh | 38 ++--
thirdparty/download-thirdparty.sh | 10 +
thirdparty/paimon-cpp-cache.cmake | 47 +++--
.../patches/apache-arrow-17.0.0-paimon.patch | 224 +++++++++++++++++++++
.../paimon-cpp-buildutils-static-deps.patch | 160 ++++++++++++++-
5 files changed, 446 insertions(+), 33 deletions(-)
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index fc413fbb896..d858596d1bd 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -1090,6 +1090,10 @@ build_arrow() {
-DARROW_BUILD_STATIC=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON
-DARROW_USE_GLOG=ON \
-DARROW_WITH_SNAPPY=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON
-DARROW_JSON=ON \
-DARROW_WITH_UTF8PROC=OFF -DARROW_WITH_RE2=ON -DARROW_ORC=ON \
+ -DARROW_COMPUTE=ON \
+ -DARROW_FILESYSTEM=ON \
+ -DARROW_DATASET=ON \
+ -DARROW_ACERO=ON \
-DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
-DCMAKE_INSTALL_LIBDIR=lib64 \
-DARROW_BOOST_USE_SHARED=OFF \
@@ -1137,6 +1141,8 @@ build_arrow() {
cp -rf ./brotli_ep/src/brotli_ep-install/lib/libbrotlicommon-static.a
"${TP_INSTALL_DIR}/lib64/libbrotlicommon.a"
strip_lib libarrow.a
strip_lib libparquet.a
+ strip_lib libarrow_dataset.a
+ strip_lib libarrow_acero.a
}
# abseil
@@ -2028,20 +2034,26 @@ build_paimon_cpp() {
# These libraries are built but not installed by default
echo "Installing paimon-cpp internal dependencies..."
- # Install paimon-cpp Arrow deps used by paimon parquet static libs.
- # Keep them in an isolated directory to avoid clashing with Doris Arrow.
+ # Arrow deps: When PAIMON_USE_EXTERNAL_ARROW=ON (Plan B), paimon-cpp
+ # reuses Doris's Arrow and does NOT build arrow_ep, so the paimon_deps
+ # directory is not needed. When building its own Arrow (legacy), copy
+ # arrow artefacts into an isolated directory to avoid clashing with Doris.
local paimon_deps_dir="${TP_INSTALL_DIR}/paimon-cpp/lib64/paimon_deps"
- mkdir -p "${paimon_deps_dir}"
- for paimon_arrow_dep in \
- libarrow.a \
- libarrow_filesystem.a \
- libarrow_dataset.a \
- libarrow_acero.a \
- libparquet.a; do
- if [ -f "arrow_ep-install/lib/${paimon_arrow_dep}" ]; then
- cp -v "arrow_ep-install/lib/${paimon_arrow_dep}"
"${paimon_deps_dir}/${paimon_arrow_dep}"
- fi
- done
+ if [ -d "arrow_ep-install/lib" ]; then
+ mkdir -p "${paimon_deps_dir}"
+ for paimon_arrow_dep in \
+ libarrow.a \
+ libarrow_filesystem.a \
+ libarrow_dataset.a \
+ libarrow_acero.a \
+ libparquet.a; do
+ if [ -f "arrow_ep-install/lib/${paimon_arrow_dep}" ]; then
+ cp -v "arrow_ep-install/lib/${paimon_arrow_dep}"
"${paimon_deps_dir}/${paimon_arrow_dep}"
+ fi
+ done
+ else
+ echo " arrow_ep-install not found (PAIMON_USE_EXTERNAL_ARROW=ON?) –
skipping paimon_deps Arrow copy"
+ fi
# Install roaring_bitmap, renamed to avoid conflict with Doris's
croaringbitmap
if [ -f "release/libroaring_bitmap.a" ]; then
diff --git a/thirdparty/download-thirdparty.sh
b/thirdparty/download-thirdparty.sh
index 447b8852618..d913c389ee0 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -425,6 +425,16 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
fi
cd -
fi
+ if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-17.0.0" ]]; then
+ cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}"
+ if [[ ! -f "${PATCHED_MARK}" ]]; then
+ # Paimon-cpp parquet patches: row-group-aware batch reader,
max_row_group_size,
+ # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty
fix.
+ patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"
+ touch "${PATCHED_MARK}"
+ fi
+ cd -
+ fi
echo "Finished patching ${ARROW_SOURCE}"
fi
diff --git a/thirdparty/paimon-cpp-cache.cmake
b/thirdparty/paimon-cpp-cache.cmake
index 40623dd10f0..dbebd94a0cc 100644
--- a/thirdparty/paimon-cpp-cache.cmake
+++ b/thirdparty/paimon-cpp-cache.cmake
@@ -57,18 +57,29 @@ set(LZ4_LIBRARY "${DORIS_LIB_DIR}/liblz4.a" CACHE FILEPATH
"LZ4 library")
set(LZ4_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "LZ4 include directory")
# ============================================================================
-# glog - Reuse from Doris (version 0.6.0)
-# Note: Paimon-cpp uses 0.7.1, but 0.6.0 is compatible
+# glog - NOT reused from Doris
+# paimon-cpp's build_glog() unconditionally calls externalproject_add() to
+# build glog 0.7.1. Any GLOG_ROOT/GLOG_LIBRARY/GLOG_INCLUDE_DIR set here
+# would be overwritten by that macro, so we skip them entirely.
# ============================================================================
-set(GLOG_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "glog root directory")
-set(GLOG_LIBRARY "${DORIS_LIB_DIR}/libglog.a" CACHE FILEPATH "glog library")
-set(GLOG_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "glog include
directory")
# ============================================================================
-# Arrow, Protobuf, Thrift - NOT reusing from Doris
-# paimon-cpp will build its own versions with symbol visibility=hidden
-# to prevent conflicts with Doris's versions
+# Arrow - Reuse from Doris (Doris Arrow now includes
COMPUTE/DATASET/ACERO/FILESYSTEM)
+# Doris's Arrow 17.0.0 is built with the full module set that paimon-cpp
+# needs, so we skip paimon-cpp's internal externalproject_add(arrow_ep ...).
# ============================================================================
+set(PAIMON_USE_EXTERNAL_ARROW ON CACHE BOOL "Use pre-built Arrow from Doris
instead of building from source")
+
+set(DORIS_LIB64_DIR "${DORIS_THIRDPARTY_DIR}/lib64" CACHE PATH "Doris lib64
directory")
+
+set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "Arrow
include directory")
+set(PAIMON_EXTERNAL_ARROW_LIB "${DORIS_LIB64_DIR}/libarrow.a" CACHE FILEPATH
"Arrow core library")
+set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "${DORIS_LIB64_DIR}/libarrow_dataset.a"
CACHE FILEPATH "Arrow Dataset library")
+set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "${DORIS_LIB64_DIR}/libarrow_acero.a"
CACHE FILEPATH "Arrow Acero library")
+set(PAIMON_EXTERNAL_PARQUET_LIB "${DORIS_LIB64_DIR}/libparquet.a" CACHE
FILEPATH "Parquet library")
+set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB
"${DORIS_LIB64_DIR}/libarrow_bundled_dependencies.a" CACHE FILEPATH "Arrow
bundled dependencies library")
+
+# Protobuf, Thrift - still built separately by paimon-cpp
# ============================================================================
# Snappy - Reuse from Doris
@@ -103,17 +114,23 @@ endif()
if(NOT EXISTS "${SNAPPY_LIBRARY}")
message(FATAL_ERROR "Snappy library not found: ${SNAPPY_LIBRARY}")
endif()
-if(NOT EXISTS "${GLOG_LIBRARY}")
- message(FATAL_ERROR "glog library not found: ${GLOG_LIBRARY}")
-endif()
message(STATUS "========================================")
message(STATUS "Paimon-cpp Library Reuse Configuration")
message(STATUS "========================================")
message(STATUS "Reusing from Doris:")
-message(STATUS " ✓ ZLIB, ZSTD, LZ4, Snappy, glog")
+message(STATUS " ✓ ZLIB, ZSTD, LZ4, Snappy")
+if(PAIMON_USE_EXTERNAL_ARROW)
+ message(STATUS " ✓ Arrow, Parquet, Arrow Dataset, Arrow Acero (Plan B)")
+else()
+ message(STATUS " ✗ Arrow (building separately, symbol visibility=hidden)")
+endif()
message(STATUS "")
-message(STATUS "Building separately (symbol visibility=hidden):")
-message(STATUS " - Arrow, Protobuf, Thrift, ORC")
-message(STATUS " - RapidJSON, TBB")
+message(STATUS "Building separately:")
+if(NOT PAIMON_USE_EXTERNAL_ARROW)
+ message(STATUS " - Arrow, Protobuf, Thrift, ORC")
+else()
+ message(STATUS " - Protobuf, Thrift, ORC")
+endif()
+message(STATUS " - glog, RapidJSON, TBB")
message(STATUS "========================================")
diff --git a/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
new file mode 100644
index 00000000000..4e53117b79b
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
@@ -0,0 +1,224 @@
+diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
+index ec3890a41f..943f69bb6c 100644
+--- a/cpp/src/parquet/arrow/schema.cc
++++ b/cpp/src/parquet/arrow/schema.cc
+@@ -178,7 +178,7 @@ static Status GetTimestampMetadata(const
::arrow::TimestampType& type,
+
+ // The user is explicitly asking for Impala int96 encoding, there is no
+ // logical type.
+- if (arrow_properties.support_deprecated_int96_timestamps()) {
++ if (arrow_properties.support_deprecated_int96_timestamps() && target_unit
== ::arrow::TimeUnit::NANO) {
+ *physical_type = ParquetType::INT96;
+ return Status::OK();
+ }
+
+diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
+index 285e2a5973..aa6f92f077 100644
+--- a/cpp/src/parquet/arrow/reader.cc
++++ b/cpp/src/parquet/arrow/reader.cc
+@@ -1013,25 +1013,32 @@ Status FileReaderImpl::GetRecordBatchReader(const
std::vector<int>& row_groups,
+ return Status::OK();
+ }
+
+- int64_t num_rows = 0;
++ std::vector<int64_t> num_rows;
+ for (int row_group : row_groups) {
+- num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
++
num_rows.push_back(parquet_reader()->metadata()->RowGroup(row_group)->num_rows());
+ }
+
+ using ::arrow::RecordBatchIterator;
++ int row_group_idx = 0;
+
+ // NB: This lambda will be invoked outside the scope of this call to
+ // `GetRecordBatchReader()`, so it must capture `readers` and
`batch_schema` by value.
+ // `this` is a non-owning pointer so we are relying on the parent
FileReader outliving
+ // this RecordBatchReader.
+ ::arrow::Iterator<RecordBatchIterator> batches =
::arrow::MakeFunctionIterator(
+- [readers, batch_schema, num_rows,
++ [readers, batch_schema, num_rows, row_group_idx,
+ this]() mutable -> ::arrow::Result<RecordBatchIterator> {
+ ::arrow::ChunkedArrayVector columns(readers.size());
+
+- // don't reserve more rows than necessary
+- int64_t batch_size = std::min(properties().batch_size(), num_rows);
+- num_rows -= batch_size;
++ int64_t batch_size = 0;
++ if (!num_rows.empty()) {
++ // don't reserve more rows than necessary
++ batch_size = std::min(properties().batch_size(),
num_rows[row_group_idx]);
++ num_rows[row_group_idx] -= batch_size;
++ if (num_rows[row_group_idx] == 0 && (num_rows.size() - 1) !=
row_group_idx) {
++ row_group_idx++;
++ }
++ }
+
+ RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
+ reader_properties_.use_threads(),
static_cast<int>(readers.size()),
+diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
+index 4fd7ef1b47..87326a54f1 100644
+--- a/cpp/src/parquet/arrow/writer.cc
++++ b/cpp/src/parquet/arrow/writer.cc
+@@ -314,6 +314,14 @@ class FileWriterImpl : public FileWriter {
+ return Status::OK();
+ }
+
++ int64_t GetBufferedSize() override {
++ if (row_group_writer_ == nullptr) {
++ return 0;
++ }
++ return row_group_writer_->total_compressed_bytes() +
++ row_group_writer_->total_compressed_bytes_written();
++ }
++
+ Status Close() override {
+ if (!closed_) {
+ // Make idempotent
+@@ -418,10 +426,13 @@ class FileWriterImpl : public FileWriter {
+
+ // Max number of rows allowed in a row group.
+ const int64_t max_row_group_length =
this->properties().max_row_group_length();
++ const int64_t max_row_group_size =
this->properties().max_row_group_size();
+
+ // Initialize a new buffered row group writer if necessary.
+ if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
+- row_group_writer_->num_rows() >= max_row_group_length) {
++ row_group_writer_->num_rows() >= max_row_group_length ||
++ (row_group_writer_->total_compressed_bytes_written() +
++ row_group_writer_->total_compressed_bytes() >= max_row_group_size)) {
+ RETURN_NOT_OK(NewBufferedRowGroup());
+ }
+
+diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h
+index 4a1a033a7b..0f13d05e44 100644
+--- a/cpp/src/parquet/arrow/writer.h
++++ b/cpp/src/parquet/arrow/writer.h
+@@ -138,6 +138,9 @@ class PARQUET_EXPORT FileWriter {
+ /// option in this case.
+ virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch)
= 0;
+
++ /// \brief Return the buffered size in bytes.
++ virtual int64_t GetBufferedSize() = 0;
++
+ /// \brief Write the footer and close the file.
+ virtual ::arrow::Status Close() = 0;
+ virtual ~FileWriter();
+diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
+index 4d3acb491e..3906ff3c59 100644
+--- a/cpp/src/parquet/properties.h
++++ b/cpp/src/parquet/properties.h
+@@ -139,6 +139,7 @@ static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+ static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT =
kDefaultDataPageSize;
+ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+ static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
++static constexpr int64_t DEFAULT_MAX_ROW_GROUP_SIZE = 128 * 1024 * 1024;
+ static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+ static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
+@@ -232,6 +233,7 @@ class PARQUET_EXPORT WriterProperties {
+ dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+ write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+ max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
++ max_row_group_size_(DEFAULT_MAX_ROW_GROUP_SIZE),
+ pagesize_(kDefaultDataPageSize),
+ version_(ParquetVersion::PARQUET_2_6),
+ data_page_version_(ParquetDataPageVersion::V1),
+@@ -244,6 +246,7 @@ class PARQUET_EXPORT WriterProperties {
+ dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
+ write_batch_size_(properties.write_batch_size()),
+ max_row_group_length_(properties.max_row_group_length()),
++ max_row_group_size_(properties.max_row_group_size()),
+ pagesize_(properties.data_pagesize()),
+ version_(properties.version()),
+ data_page_version_(properties.data_page_version()),
+@@ -321,6 +324,13 @@ class PARQUET_EXPORT WriterProperties {
+ return this;
+ }
+
++ /// Specify the max bytes size to put in a single row group.
++ /// Default 128 M.
++ Builder* max_row_group_size(int64_t max_row_group_size) {
++ max_row_group_size_ = max_row_group_size;
++ return this;
++ }
++
+ /// Specify the data page size.
+ /// Default 1MB.
+ Builder* data_pagesize(int64_t pg_size) {
+@@ -664,7 +674,7 @@ class PARQUET_EXPORT WriterProperties {
+
+ return std::shared_ptr<WriterProperties>(new WriterProperties(
+ pool_, dictionary_pagesize_limit_, write_batch_size_,
max_row_group_length_,
+- pagesize_, version_, created_by_, page_checksum_enabled_,
++ max_row_group_size_, pagesize_, version_, created_by_,
page_checksum_enabled_,
+ std::move(file_encryption_properties_), default_column_properties_,
+ column_properties, data_page_version_, store_decimal_as_integer_,
+ std::move(sorting_columns_)));
+@@ -675,6 +685,7 @@ class PARQUET_EXPORT WriterProperties {
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
++ int64_t max_row_group_size_;
+ int64_t pagesize_;
+ ParquetVersion::type version_;
+ ParquetDataPageVersion data_page_version_;
+@@ -705,6 +716,8 @@ class PARQUET_EXPORT WriterProperties {
+
+ inline int64_t max_row_group_length() const { return max_row_group_length_;
}
+
++ inline int64_t max_row_group_size() const { return max_row_group_size_; }
++
+ inline int64_t data_pagesize() const { return pagesize_; }
+
+ inline ParquetDataPageVersion data_page_version() const {
+@@ -810,7 +823,7 @@ class PARQUET_EXPORT WriterProperties {
+ private:
+ explicit WriterProperties(
+ MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t
write_batch_size,
+- int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type
version,
++ int64_t max_row_group_length, int64_t max_row_group_size, int64_t
pagesize, ParquetVersion::type version,
+ const std::string& created_by, bool page_write_checksum_enabled,
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+ const ColumnProperties& default_column_properties,
+@@ -821,6 +834,7 @@ class PARQUET_EXPORT WriterProperties {
+ dictionary_pagesize_limit_(dictionary_pagesize_limit),
+ write_batch_size_(write_batch_size),
+ max_row_group_length_(max_row_group_length),
++ max_row_group_size_(max_row_group_size),
+ pagesize_(pagesize),
+ parquet_data_page_version_(data_page_version),
+ parquet_version_(version),
+@@ -836,6 +850,7 @@ class PARQUET_EXPORT WriterProperties {
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
++ int64_t max_row_group_size_;
+ int64_t pagesize_;
+ ParquetDataPageVersion parquet_data_page_version_;
+ ParquetVersion::type parquet_version_;
+diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+index 9df922afa2..5c8b3d4d07 100644
+--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
+ REQUIRED_VERSION
+ 0.11.0)
+
+- string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
++ if(NOT Thrift_VERSION)
++ if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)
++ set(Thrift_VERSION "${thrift_PC_VERSION}")
++ elseif(DEFINED ThriftAlt_VERSION AND ThriftAlt_VERSION)
++ set(Thrift_VERSION "${ThriftAlt_VERSION}")
++ elseif(DEFINED THRIFT_VERSION AND THRIFT_VERSION)
++ set(Thrift_VERSION "${THRIFT_VERSION}")
++ endif()
++ endif()
++ if(NOT Thrift_VERSION)
++ message(FATAL_ERROR "Thrift_VERSION is empty after resolving Thrift
dependency")
++ endif()
++
++ string(REPLACE "." ";" Thrift_VERSION_LIST "${Thrift_VERSION}")
+ list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR)
+ list(GET Thrift_VERSION_LIST 1 Thrift_VERSION_MINOR)
+ list(GET Thrift_VERSION_LIST 2 Thrift_VERSION_PATCH)
diff --git a/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
index 7de7d2875ca..31af1db7f0f 100644
--- a/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
+++ b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
@@ -41,7 +41,7 @@ diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyT
@@ -923,6 +920,13 @@ macro(build_orc)
-DBUILD_TOOLS=OFF
-DBUILD_CPP_ENABLE_METRICS=ON)
-
+
+ if(ORC_RPATH)
+ list(APPEND ORC_CMAKE_ARGS
+ "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}"
@@ -82,7 +82,7 @@ diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyT
+ set(THIRDPARTY_ZLIB_STATIC_LIB
+
"${THIRDPARTY_ZLIB_ROOT}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z${CMAKE_STATIC_LIBRARY_SUFFIX}"
+ )
-
+
# Strip lto flags (which may be added by dh_auto_configure)
# See https://github.com/protocolbuffers/protobuf/issues/7092
@@ -778,6 +781,10 @@ macro(build_protobuf)
@@ -97,6 +97,156 @@ diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyT
-Dprotobuf_DEBUG_POSTFIX=)
set(PROTOBUF_CONFIGURE SOURCE_SUBDIR "cmake" CMAKE_ARGS
${PROTOBUF_CMAKE_ARGS})
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -34,6 +34,16 @@ set(EP_COMMON_TOOLCHAIN
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
+ "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
+
++option(PAIMON_USE_EXTERNAL_ARROW "Reuse external Arrow/Parquet instead of
building arrow_ep" OFF)
++set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "" CACHE PATH
++ "Include directory for external Arrow/Parquet headers")
++set(PAIMON_EXTERNAL_ARROW_LIB "" CACHE FILEPATH "Path to external libarrow.a")
++set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "" CACHE FILEPATH "Path to external
libarrow_dataset.a")
++set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "" CACHE FILEPATH "Path to external
libarrow_acero.a")
++set(PAIMON_EXTERNAL_PARQUET_LIB "" CACHE FILEPATH "Path to external
libparquet.a")
++set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB "" CACHE FILEPATH
++ "Path to external libarrow_bundled_dependencies.a")
++
+ macro(set_urls URLS)
+ set(${URLS} ${ARGN})
+ endmacro()
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -961,5 +961,95 @@ macro(build_orc)
+ endmacro()
+
+ macro(build_arrow)
+- message(STATUS "Building Arrow from source")
++ if(PAIMON_USE_EXTERNAL_ARROW)
++ set(ARROW_INCLUDE_DIR
"${CMAKE_CURRENT_BINARY_DIR}/doris_external_arrow_include")
++ file(MAKE_DIRECTORY "${ARROW_INCLUDE_DIR}")
++ if(NOT EXISTS "${ARROW_INCLUDE_DIR}/arrow")
++ execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++ "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/arrow"
++ "${ARROW_INCLUDE_DIR}/arrow")
++ endif()
++ if(EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++ AND NOT EXISTS "${ARROW_INCLUDE_DIR}/parquet")
++ execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++ "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++ "${ARROW_INCLUDE_DIR}/parquet")
++ endif()
++
++ if(NOT PAIMON_EXTERNAL_ARROW_INCLUDE_DIR)
++ message(FATAL_ERROR
++ "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR must be set when
PAIMON_USE_EXTERNAL_ARROW=ON"
++ )
++ endif()
++ if(NOT EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}")
++ message(FATAL_ERROR
++ "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR not found:
${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}"
++ )
++ endif()
++
++ foreach(_paimon_external_lib
++ IN ITEMS PAIMON_EXTERNAL_ARROW_LIB
++ PAIMON_EXTERNAL_ARROW_DATASET_LIB
++ PAIMON_EXTERNAL_ARROW_ACERO_LIB
++ PAIMON_EXTERNAL_PARQUET_LIB
++ PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB)
++ if(NOT ${_paimon_external_lib})
++ message(FATAL_ERROR
++ "${_paimon_external_lib} must be set when
PAIMON_USE_EXTERNAL_ARROW=ON")
++ endif()
++ if(NOT EXISTS "${${_paimon_external_lib}}")
++ message(FATAL_ERROR
++ "${_paimon_external_lib} not found:
${${_paimon_external_lib}}")
++ endif()
++ endforeach()
++
++ add_library(arrow STATIC IMPORTED)
++ set_target_properties(arrow
++ PROPERTIES IMPORTED_LOCATION
"${PAIMON_EXTERNAL_ARROW_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(arrow_dataset STATIC IMPORTED)
++ set_target_properties(arrow_dataset
++ PROPERTIES IMPORTED_LOCATION
++
"${PAIMON_EXTERNAL_ARROW_DATASET_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(arrow_acero STATIC IMPORTED)
++ set_target_properties(arrow_acero
++ PROPERTIES IMPORTED_LOCATION
++ "${PAIMON_EXTERNAL_ARROW_ACERO_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(parquet STATIC IMPORTED)
++ set_target_properties(parquet
++ PROPERTIES IMPORTED_LOCATION
"${PAIMON_EXTERNAL_PARQUET_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(arrow_bundled_dependencies STATIC IMPORTED)
++ set_target_properties(arrow_bundled_dependencies
++ PROPERTIES IMPORTED_LOCATION
++
"${PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ target_link_libraries(arrow_acero INTERFACE arrow)
++
++ target_link_libraries(arrow_dataset INTERFACE arrow_acero)
++
++ target_link_libraries(arrow
++ INTERFACE zstd
++ snappy
++ lz4
++ zlib
++ arrow_bundled_dependencies)
++
++ target_link_libraries(parquet
++ INTERFACE zstd snappy lz4 zlib
arrow_bundled_dependencies
++ arrow_dataset)
++ else()
++ message(STATUS "Building Arrow from source")
+
+ get_target_property(ARROW_SNAPPY_INCLUDE_DIR snappy
INTERFACE_INCLUDE_DIRECTORIES)
+ get_filename_component(ARROW_SNAPPY_ROOT "${ARROW_SNAPPY_INCLUDE_DIR}"
DIRECTORY)
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1121,6 +1121,7 @@ macro(build_arrow)
+ zlib
+ arrow_bundled_dependencies
+ arrow_dataset)
++ endif()
+
+ endmacro(build_arrow)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -326,10 +326,10 @@ if(PAIMON_ENABLE_LUMINA)
+ include_directories("${CMAKE_SOURCE_DIR}/third_party/lumina/include")
+ endif()
+
++include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
+ include_directories(SYSTEM ${TBB_INCLUDE_DIR})
+
+-include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ add_compile_definitions("GLOG_USE_GLOG_EXPORT")
+
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+
diff --git a/src/paimon/common/logging/logging.cpp
b/src/paimon/common/logging/logging.cpp
--- a/src/paimon/common/logging/logging.cpp
+++ b/src/paimon/common/logging/logging.cpp
@@ -116,7 +266,7 @@ diff --git a/src/paimon/common/memory/memory_pool.cpp
b/src/paimon/common/memory
@@ -55,7 +55,7 @@ void* MemoryPoolImpl::Malloc(uint64_t size, uint64_t
alignment) {
return memptr;
}
-
+
-void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size,
size_t alignment) {
+void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size,
uint64_t alignment) {
if (alignment == 0) {
@@ -144,7 +294,7 @@ diff --git a/src/paimon/format/blob/blob_format_writer.cpp
b/src/paimon/format/b
+ read_len = static_cast<uint32_t>(
+ std::min<uint64_t>(file_length - total_read_length,
tmp_buffer_->size()));
}
-
+
// write bin length
--- a/cmake_modules/arrow.diff
@@ -160,7 +310,7 @@ diff --git a/src/paimon/format/blob/blob_format_writer.cpp
b/src/paimon/format/b
+@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
+ REQUIRED_VERSION
+ 0.11.0)
-+
++
+- string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
++ if(NOT Thrift_VERSION)
++ if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]