Copilot commented on code in PR #60194:
URL: https://github.com/apache/doris/pull/60194#discussion_r2721030592


##########
be/src/vec/exec/format/table/paimon_doris_file_system.cpp:
##########
@@ -0,0 +1,664 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "paimon_doris_file_system.h"
+
+#include <algorithm>
+#include <cctype>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "common/status.h"
+#include "gen_cpp/Types_types.h"
+#include "io/file_factory.h"
+#include "io/fs/file_reader.h"
+#include "io/fs/file_system.h"
+#include "io/fs/file_writer.h"
+#include "io/fs/local_file_system.h"
+#include "paimon/factories/factory.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/fs/file_system_factory.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+struct ParsedUri {
+    std::string scheme;
+    std::string authority;
+};
+
+std::string to_lower(std::string value) {
+    std::ranges::transform(value, value.begin(),
+                           [](unsigned char c) { return 
static_cast<char>(std::tolower(c)); });
+    return value;
+}
+
+ParsedUri parse_uri(const std::string& path) {
+    ParsedUri parsed;
+    size_t scheme_pos = path.find("://");
+    size_t delim_len = 3;
+    if (scheme_pos == std::string::npos) {
+        scheme_pos = path.find(":/");
+        delim_len = 2;
+    }
+    if (scheme_pos == std::string::npos || scheme_pos == 0) {
+        return parsed;
+    }
+    parsed.scheme = to_lower(path.substr(0, scheme_pos));
+    size_t authority_start = scheme_pos + delim_len;
+    if (authority_start >= path.size() || path[authority_start] == '/') {
+        return parsed;
+    }
+    size_t next_slash = path.find('/', authority_start);
+    if (next_slash == std::string::npos) {
+        parsed.authority = path.substr(authority_start);
+    } else {
+        parsed.authority = path.substr(authority_start, next_slash - 
authority_start);
+    }
+    return parsed;
+}
+
+bool is_s3_scheme(const std::string& scheme) {
+    return scheme == "s3" || scheme == "s3a" || scheme == "s3n" || scheme == 
"oss" ||
+           scheme == "obs" || scheme == "cos" || scheme == "cosn" || scheme == 
"gs" ||
+           scheme == "abfs" || scheme == "abfss" || scheme == "wasb" || scheme 
== "wasbs";
+}
+
+bool is_hdfs_scheme(const std::string& scheme) {
+    return scheme == "hdfs" || scheme == "viewfs" || scheme == "local";
+}
+
+bool is_http_scheme(const std::string& scheme) {
+    return scheme == "http" || scheme == "https";
+}
+
+doris::TFileType::type map_scheme_to_file_type(const std::string& scheme) {
+    if (scheme.empty()) {
+        return doris::TFileType::FILE_HDFS;
+    }
+    if (scheme == "file") {
+        return doris::TFileType::FILE_LOCAL;
+    }
+    if (is_hdfs_scheme(scheme)) {
+        return doris::TFileType::FILE_HDFS;
+    }
+    if (is_s3_scheme(scheme)) {
+        return doris::TFileType::FILE_S3;
+    }
+    if (is_http_scheme(scheme)) {
+        return doris::TFileType::FILE_HTTP;
+    }
+    if (scheme == "ofs" || scheme == "gfs" || scheme == "jfs") {
+        return doris::TFileType::FILE_BROKER;
+    }
+    return doris::TFileType::FILE_HDFS;
+}
+
+std::string replace_scheme(const std::string& path, const std::string& scheme) 
{
+    size_t scheme_pos = path.find("://");
+    size_t delim_len = 3;
+    if (scheme_pos == std::string::npos) {
+        scheme_pos = path.find(":/");
+        delim_len = 2;
+    }
+    if (scheme_pos == std::string::npos) {
+        return path;
+    }
+    return scheme + "://" + path.substr(scheme_pos + delim_len);
+}
+
+std::string normalize_local_path(const std::string& path) {
+    if (!path.starts_with("file:")) {
+        return path;
+    }
+    constexpr size_t file_prefix_len = 5;
+    size_t start = file_prefix_len;
+    if (path.compare(start, 2, "//") == 0 && path.size() - start > 2) {
+        size_t next_slash = path.find('/', start + 2);
+        if (next_slash == std::string::npos) {
+            return "";
+        }
+        start = next_slash;
+    }
+    return path.substr(start);
+}
+
+std::string normalize_path_for_type(const std::string& path, const 
std::string& scheme,
+                                    doris::TFileType::type type) {
+    if (type == doris::TFileType::FILE_LOCAL) {
+        return normalize_local_path(path);
+    }
+    if (type == doris::TFileType::FILE_S3 && scheme != "s3" && 
!is_http_scheme(scheme)) {
+        return replace_scheme(path, "s3");
+    }
+    return path;
+}
+
+std::string build_fs_cache_key(doris::TFileType::type type, const ParsedUri& 
uri,
+                               const std::string& default_fs_name) {
+    switch (type) {
+    case doris::TFileType::FILE_LOCAL:
+        return "local";
+    case doris::TFileType::FILE_S3:
+        return "s3://" + uri.authority;
+    case doris::TFileType::FILE_HTTP:
+        return "http://"; + uri.authority;
+    case doris::TFileType::FILE_BROKER:
+        return "broker";
+    case doris::TFileType::FILE_HDFS:
+    default:
+        if (!uri.scheme.empty() || !uri.authority.empty()) {
+            return uri.scheme + "://" + uri.authority;
+        }
+        return default_fs_name;
+    }
+}
+
+paimon::Status to_paimon_status(const doris::Status& status) {
+    if (status.ok()) {
+        return paimon::Status::OK();
+    }
+    switch (status.code()) {
+    case doris::ErrorCode::NOT_FOUND:
+    case doris::ErrorCode::DIR_NOT_EXIST:
+        return paimon::Status::NotExist(status.to_string());
+    case doris::ErrorCode::ALREADY_EXIST:
+    case doris::ErrorCode::FILE_ALREADY_EXIST:
+        return paimon::Status::Exist(status.to_string());
+    case doris::ErrorCode::INVALID_ARGUMENT:
+    case doris::ErrorCode::INVALID_INPUT_SYNTAX:
+        return paimon::Status::Invalid(status.to_string());
+    case doris::ErrorCode::NOT_IMPLEMENTED_ERROR:
+        return paimon::Status::NotImplemented(status.to_string());
+    default:
+        return paimon::Status::IOError(status.to_string());
+    }
+}
+
+std::string join_path(const std::string& base, const std::string& child) {
+    if (base.empty()) {
+        return child;
+    }
+    if (base.back() == '/') {
+        return base + child;
+    }
+    return base + "/" + child;
+}
+
+std::string parent_path_no_scheme(const std::string& path) {
+    if (path.empty()) {
+        return "";
+    }
+    size_t end = path.size();
+    while (end > 1 && path[end - 1] == '/') {
+        --end;
+    }
+    size_t pos = path.rfind('/', end - 1);
+    if (pos == std::string::npos) {
+        return "";
+    }
+    if (pos == 0) {
+        return "/";
+    }
+    return path.substr(0, pos);
+}
+
+std::string parent_path(const std::string& path) {
+    ParsedUri uri = parse_uri(path);
+    if (uri.scheme.empty()) {
+        return parent_path_no_scheme(path);
+    }
+    size_t scheme_pos = path.find("://");
+    size_t delim_len = 3;
+    if (scheme_pos == std::string::npos) {
+        scheme_pos = path.find(":/");
+        delim_len = 2;
+    }
+    if (scheme_pos == std::string::npos) {
+        return parent_path_no_scheme(path);
+    }
+    size_t start = scheme_pos + delim_len;
+    size_t slash = path.find('/', start);
+    if (slash == std::string::npos) {
+        return "";
+    }
+    std::string path_part = path.substr(slash);
+    std::string parent_part = parent_path_no_scheme(path_part);
+    if (parent_part.empty()) {
+        return "";
+    }
+    std::string prefix = uri.scheme + "://";
+    if (!uri.authority.empty()) {
+        prefix += uri.authority;
+    }
+    return prefix + parent_part;
+}
+
+class DorisInputStream : public InputStream {
+public:
+    DorisInputStream(doris::io::FileReaderSPtr reader, std::string path)
+            : reader_(std::move(reader)), path_(std::move(path)) {}
+
+    Status Seek(int64_t offset, SeekOrigin origin) override {
+        int64_t target = 0;
+        if (origin == SeekOrigin::FS_SEEK_SET) {
+            target = offset;
+        } else if (origin == SeekOrigin::FS_SEEK_CUR) {
+            target = position_ + offset;
+        } else if (origin == SeekOrigin::FS_SEEK_END) {
+            target = static_cast<int64_t>(reader_->size()) + offset;
+        } else {
+            return Status::Invalid("unknown seek origin");
+        }
+        if (target < 0) {
+            return Status::Invalid("seek position is negative");
+        }
+        position_ = target;
+        return Status::OK();
+    }
+
+    Result<int64_t> GetPos() const override { return position_; }
+
+    Result<int32_t> Read(char* buffer, uint32_t size) override {
+        size_t bytes_read = 0;
+        doris::Status status = reader_->read_at(position_, 
doris::Slice(buffer, size), &bytes_read);
+        if (!status.ok()) {
+            return to_paimon_status(status);
+        }
+        position_ += static_cast<int64_t>(bytes_read);
+        return static_cast<int32_t>(bytes_read);
+    }
+
+    Result<int32_t> Read(char* buffer, uint32_t size, uint64_t offset) 
override {
+        size_t bytes_read = 0;
+        doris::Status status = reader_->read_at(offset, doris::Slice(buffer, 
size), &bytes_read);
+        if (!status.ok()) {
+            return to_paimon_status(status);
+        }
+        return static_cast<int32_t>(bytes_read);
+    }
+
+    void ReadAsync(char* buffer, uint32_t size, uint64_t offset,
+                   std::function<void(Status)>&& callback) override {
+        Result<int32_t> result = Read(buffer, size, offset);
+        Status status = Status::OK();
+        if (!result.ok()) {
+            status = result.status();
+        }
+        callback(status);
+    }
+
+    Result<std::string> GetUri() const override { return path_; }
+
+    Result<uint64_t> Length() const override { return 
static_cast<uint64_t>(reader_->size()); }
+
+    Status Close() override { return to_paimon_status(reader_->close()); }
+
+private:
+    doris::io::FileReaderSPtr reader_;
+    std::string path_;
+    int64_t position_ = 0;
+};
+
+class DorisOutputStream : public OutputStream {
+public:
+    DorisOutputStream(doris::io::FileWriterPtr writer, std::string path)
+            : writer_(std::move(writer)), path_(std::move(path)) {}
+
+    Result<int32_t> Write(const char* buffer, uint32_t size) override {
+        doris::Status status = writer_->append(doris::Slice(buffer, size));
+        if (!status.ok()) {
+            return to_paimon_status(status);
+        }
+        return static_cast<int32_t>(size);
+    }
+
+    Status Flush() override { return Status::OK(); }
+
+    Result<int64_t> GetPos() const override {
+        return static_cast<int64_t>(writer_->bytes_appended());
+    }
+
+    Result<std::string> GetUri() const override { return path_; }
+
+    Status Close() override { return to_paimon_status(writer_->close()); }
+
+private:
+    doris::io::FileWriterPtr writer_;
+    std::string path_;
+};
+
+class DorisBasicFileStatus : public BasicFileStatus {
+public:
+    DorisBasicFileStatus(std::string path, bool is_dir) : 
path_(std::move(path)), is_dir_(is_dir) {}
+
+    bool IsDir() const override { return is_dir_; }
+    std::string GetPath() const override { return path_; }
+
+private:
+    std::string path_;
+    bool is_dir_;
+};
+
+class DorisFileStatus : public FileStatus {
+public:
+    DorisFileStatus(std::string path, bool is_dir, uint64_t length, int64_t 
mtime)
+            : path_(std::move(path)), is_dir_(is_dir), length_(length), 
mtime_(mtime) {}
+
+    uint64_t GetLen() const override { return length_; }
+    bool IsDir() const override { return is_dir_; }
+    std::string GetPath() const override { return path_; }
+    int64_t GetModificationTime() const override { return mtime_; }
+
+private:
+    std::string path_;
+    bool is_dir_;
+    uint64_t length_;
+    int64_t mtime_;
+};
+
+class DorisFileSystem : public FileSystem {
+public:
+    explicit DorisFileSystem(std::map<std::string, std::string> options)
+            : options_(std::move(options)) {
+        auto it = options_.find("fs.defaultFS");
+        if (it != options_.end()) {
+            default_fs_name_ = it->second;
+        }
+    }
+
+    Result<std::unique_ptr<InputStream>> Open(const std::string& path) const 
override {
+        PAIMON_ASSIGN_OR_RAISE(auto resolved, resolve_path(path));
+        auto& fs = resolved.first;
+        auto& normalized_path = resolved.second;
+        doris::io::FileReaderSPtr reader;
+        doris::io::FileReaderOptions reader_options = 
doris::io::FileReaderOptions::DEFAULT;
+        doris::Status status = fs->open_file(normalized_path, &reader, 
&reader_options);
+        if (!status.ok()) {
+            return to_paimon_status(status);
+        }
+        return std::make_unique<DorisInputStream>(std::move(reader), 
normalized_path);
+    }
+
+    Result<std::unique_ptr<OutputStream>> Create(const std::string& path,
+                                                 bool overwrite) const 
override {
+        PAIMON_ASSIGN_OR_RAISE(auto resolved, resolve_path(path));
+        auto& fs = resolved.first;
+        auto& normalized_path = resolved.second;
+        if (!overwrite) {
+            bool exists = false;
+            doris::Status exists_status = fs->exists(normalized_path, &exists);
+            if (!exists_status.ok()) {
+                return to_paimon_status(exists_status);
+            }
+            if (exists) {
+                return Status::Exist("file already exists: ", normalized_path);

Review Comment:
   This call returns Status::Exist with two arguments. Elsewhere in this file 
paimon::Status helpers are called with a single message string, so this likely 
won’t compile against paimon::Status API. Consider building a single message 
string (e.g., concatenation/format) and passing that to Status::Exist, or use 
the correct overload if one exists.
   ```suggestion
                   return Status::Exist(std::string("file already exists: ") + 
normalized_path);
   ```



##########
thirdparty/patches/paimon-cpp-b1ffd6f73e5edf57aac24ec2eaf6d2ef9e9a9850.patch:
##########
@@ -0,0 +1,651 @@
+diff --git a/.gitignore b/.gitignore
+index bbf839e..adda78d 100644
+--- a/.gitignore
++++ b/.gitignore
+@@ -16,6 +16,7 @@
+ build
+ build-release
+ build-debug
++build-asan
+ 
+ # IDE settings
+ .idea
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index f942510..ef5b68c 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -44,11 +44,20 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+ 
+ option(PAIMON_BUILD_STATIC "Build static library" ON)
+ option(PAIMON_BUILD_SHARED "Build shared library" ON)
++option(PAIMON_USE_BSYMBOLIC "Link shared libraries with -Bsymbolic" OFF)
++option(PAIMON_LINK_SHARED_STDLIB "Link shared libraries with shared 
libstdc++/libgcc" ON)
+ option(PAIMON_BUILD_TESTS "Build tests" OFF)
+ option(PAIMON_USE_ASAN "Use Address Sanitizer" OFF)
+ option(PAIMON_USE_UBSAN "Use Undefined Behavior Sanitizer" OFF)
+ option(PAIMON_ENABLE_AVRO "Whether to enable avro file format" ON)
+ option(PAIMON_ENABLE_ORC "Whether to enable orc file format" ON)
++option(PAIMON_USE_EXTERNAL_ORC "Use external ORC target (orc::orc) instead of 
bundled ORC" OFF)
++option(PAIMON_ORC_V1 "Build with ORC 1.x compatibility" OFF)
++option(PAIMON_USE_EXTERNAL_GLOG "Use external glog library instead of bundled 
glog" OFF)
++set(GLOG_INCLUDE_DIR "" CACHE PATH "External glog include directory")
++set(GLOG_STATIC_LIB "" CACHE FILEPATH "External glog static library")
++set(GFLAGS_INCLUDE_DIR "" CACHE PATH "External gflags include directory")
++set(GFLAGS_STATIC_LIB "" CACHE FILEPATH "External gflags static library")
+ option(PAIMON_ENABLE_LANCE "Whether to enable lance file format" OFF)
+ option(PAIMON_ENABLE_JINDO "Whether to enable jindo file system" OFF)
+ option(PAIMON_ENABLE_LUMINA "Whether to enable lumina vector index" ON)
+@@ -56,6 +65,9 @@ option(PAIMON_ENABLE_LUMINA "Whether to enable lumina vector 
index" ON)
+ if(PAIMON_ENABLE_ORC)
+     add_definitions(-DPAIMON_ENABLE_ORC)
+ endif()
++if(PAIMON_ORC_V1)
++    add_definitions(-DPAIMON_ORC_V1)
++endif()
+ if(PAIMON_ENABLE_AVRO)
+     add_definitions(-DPAIMON_ENABLE_AVRO)
+ endif()
+@@ -79,7 +91,7 @@ set(CLANG_TIDY_BIN "clang-tidy")
+ set(PAIMON_SOURCE_DIR ${PROJECT_SOURCE_DIR})
+ set(PAIMON_BINARY_DIR ${PROJECT_BINARY_DIR})
+ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 
"${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules")
+-set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build_support")
++set(BUILD_SUPPORT_DIR "${PAIMON_SOURCE_DIR}/build_support")
+ set(PAIMON_CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+ 
+ include(GNUInstallDirs)
+@@ -101,8 +113,8 @@ endif()
+ 
+ set(LINT_EXCLUSIONS_FILE ${BUILD_SUPPORT_DIR}/lint_exclusions.txt)
+ 
+-set(LINT_SOURCE_DIRS ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/src
+-                     ${CMAKE_SOURCE_DIR}/test ${CMAKE_SOURCE_DIR}/examples)
++set(LINT_SOURCE_DIRS ${PAIMON_SOURCE_DIR}/include ${PAIMON_SOURCE_DIR}/src
++                     ${PAIMON_SOURCE_DIR}/test ${PAIMON_SOURCE_DIR}/examples)
+ 
+ set(CLANG_TIDY_OPTIONS
+     ${PAIMON_LINT_QUIET}
+@@ -121,7 +133,7 @@ if(PAIMON_LINT_GIT_DIFF_MODE)
+                       COMMAND git diff-index --name-only --diff-filter=d 
--cached
+                               ${PAIMON_LINT_GIT_TARGET_COMMIT} > 
${GIT_DIFF_FILE_PATH}
+                       # convert to absolute path
+-                      COMMAND sed -i \"s|^|${CMAKE_SOURCE_DIR}/|\" 
${GIT_DIFF_FILE_PATH}
++                      COMMAND sed -i \"s|^|${PAIMON_SOURCE_DIR}/|\" 
${GIT_DIFF_FILE_PATH}
+                       # filter with ${LINT_SOURCE_DIRS} dirs
+                       COMMAND sed -nE -i '\\@${LINT_DIR_REGEX}@p' 
${GIT_DIFF_FILE_PATH}
+                       COMMENT "Generate git_diff_files.txt for git-diff lint"
+@@ -280,15 +292,15 @@ set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+ set(CMAKE_C_VISIBILITY_PRESET hidden)
+ 
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+-include_directories("${CMAKE_SOURCE_DIR}/third_party/roaring_bitmap")
+-include_directories("${CMAKE_SOURCE_DIR}/third_party/xxhash")
++include_directories("${PAIMON_SOURCE_DIR}/third_party/roaring_bitmap")
++include_directories("${PAIMON_SOURCE_DIR}/third_party/xxhash")
+ 
+ if(PAIMON_ENABLE_LANCE)
+-    include_directories("${CMAKE_SOURCE_DIR}/third_party/lance")
++    include_directories("${PAIMON_SOURCE_DIR}/third_party/lance")
+ endif()
+ 
+ if(PAIMON_ENABLE_LUMINA)
+-    include_directories("${CMAKE_SOURCE_DIR}/third_party/lumina/include")
++    include_directories("${PAIMON_SOURCE_DIR}/third_party/lumina/include")
+ endif()
+ 
+ include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
+@@ -300,9 +312,9 @@ add_compile_definitions("GLOG_USE_GLOG_EXPORT")
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+ find_package(Threads REQUIRED)
+ set(PAIMON_VERSION_SCRIPT_FLAGS
+-    "-Wl,--version-script=${CMAKE_SOURCE_DIR}/src/paimon/symbols.map")
++    "-Wl,--version-script=${PAIMON_SOURCE_DIR}/src/paimon/symbols.map")
+ 
+-set(ENV{PAIMON_TEST_DATA} "${CMAKE_SOURCE_DIR}/test/test_data")
++set(ENV{PAIMON_TEST_DATA} "${PAIMON_SOURCE_DIR}/test/test_data")
+ 
+ if(PAIMON_BUILD_TESTS)
+     if(NOT PAIMON_ENABLE_ORC)
+@@ -322,7 +334,7 @@ if(PAIMON_BUILD_TESTS)
+     add_dependencies(unittest paimon-tests)
+ 
+     include_directories(SYSTEM ${GTEST_INCLUDE_DIR})
+-    include_directories("${CMAKE_SOURCE_DIR}/test/")
++    include_directories("${PAIMON_SOURCE_DIR}/test/")
+ 
+     set(TEST_STATIC_LINK_LIBS
+         "-Wl,--whole-archive"
+diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake
+index c63d17f..73db743 100644
+--- a/cmake_modules/BuildUtils.cmake
++++ b/cmake_modules/BuildUtils.cmake
+@@ -55,7 +55,17 @@ function(add_paimon_lib LIB_NAME)
+     # Necessary to make static linking into other shared libraries work 
properly
+     set_property(TARGET ${LIB_NAME}_objlib PROPERTY POSITION_INDEPENDENT_CODE 
1)
+     if(ARG_DEPENDENCIES)
+-        add_dependencies(${LIB_NAME}_objlib ${ARG_DEPENDENCIES})
++        set(_paimon_objlib_deps)
++        foreach(_paimon_dep IN LISTS ARG_DEPENDENCIES)
++            if(TARGET ${_paimon_dep})
++                list(APPEND _paimon_objlib_deps ${_paimon_dep})
++            endif()
++        endforeach()
++        if(_paimon_objlib_deps)
++            add_dependencies(${LIB_NAME}_objlib ${_paimon_objlib_deps})
++        endif()
++        unset(_paimon_objlib_deps)
++        unset(_paimon_dep)
+     endif()
+     set(LIB_DEPS $<TARGET_OBJECTS:${LIB_NAME}_objlib>)
+     set(LIB_INCLUDES)
+@@ -106,10 +116,17 @@ function(add_paimon_lib LIB_NAME)
+         target_link_options(${LIB_NAME}_shared
+                             PRIVATE
+                             -Wl,--exclude-libs,ALL
+-                            -Wl,-Bsymbolic
+                             -Wl,-z,defs
+                             -Wl,--gc-sections)
+ 
++        if(PAIMON_USE_BSYMBOLIC)
++            target_link_options(${LIB_NAME}_shared PRIVATE -Wl,-Bsymbolic)
++        endif()
++
++        if(PAIMON_LINK_SHARED_STDLIB AND NOT APPLE AND CMAKE_CXX_COMPILER_ID 
MATCHES "GNU|Clang")
++            target_link_options(${LIB_NAME}_shared PRIVATE -shared-libstdc++ 
-shared-libgcc)
++        endif()
++
+         install(TARGETS ${LIB_NAME}_shared ${INSTALL_IS_OPTIONAL}
+                 EXPORT ${LIB_NAME}_targets
+                 RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR}
+diff --git a/cmake_modules/DefineOptions.cmake 
b/cmake_modules/DefineOptions.cmake
+index 0211f40..c4a6b2f 100644
+--- a/cmake_modules/DefineOptions.cmake
++++ b/cmake_modules/DefineOptions.cmake
+@@ -95,6 +95,11 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL 
"${CMAKE_CURRENT_SOURCE_DIR}")
+     define_option(PAIMON_BUILD_STATIC "Build static libraries" OFF)
+ 
+     define_option(PAIMON_BUILD_SHARED "Build shared libraries" ON)
++
++    define_option(PAIMON_USE_BSYMBOLIC "Link shared libraries with 
-Bsymbolic" OFF)
++    define_option(PAIMON_LINK_SHARED_STDLIB "Link shared libraries with 
shared libstdc++/libgcc"
++                  ON)
++    define_option(PAIMON_USE_EXTERNAL_RAPIDJSON "Use external RapidJSON 
headers" OFF)
+     #----------------------------------------------------------------------
+     set_option_category("Test")
+ 
+diff --git a/cmake_modules/SetupCxxFlags.cmake 
b/cmake_modules/SetupCxxFlags.cmake
+index e4b2710..2d705ba 100644
+--- a/cmake_modules/SetupCxxFlags.cmake
++++ b/cmake_modules/SetupCxxFlags.cmake
+@@ -41,6 +41,15 @@ set(CMAKE_CXX_EXTENSIONS OFF)
+ # shared libraries
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ 
++# Use global-dynamic TLS model to avoid relocation issues when linking
++# with other libraries that may use different TLS models (e.g., when
++# rapidjson templates are instantiated in both this library and the
++# consuming application)
++if(NOT MSVC)
++    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftls-model=global-dynamic")
++    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ftls-model=global-dynamic")
++endif()
++
+ string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+ 
+ set(UNKNOWN_COMPILER_MESSAGE
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+index 3f07104..dfe2ba3 100644
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1011,6 +1011,17 @@ endmacro()
+ 
+ build_fmt()
+-build_rapidjson()
++if(PAIMON_USE_EXTERNAL_RAPIDJSON)
++    if(NOT RAPIDJSON_INCLUDE_DIR)
++        message(FATAL_ERROR
++                "PAIMON_USE_EXTERNAL_RAPIDJSON=ON but RAPIDJSON_INCLUDE_DIR 
is not set")
++    endif()
++
++    file(MAKE_DIRECTORY "${RAPIDJSON_INCLUDE_DIR}")
++    add_library(RapidJSON INTERFACE IMPORTED)
++    target_include_directories(RapidJSON SYSTEM INTERFACE 
"${RAPIDJSON_INCLUDE_DIR}")
++else()
++    build_rapidjson()
++endif()
+ build_snappy()
+ build_zstd()
+ build_zlib()
+@@ -1018,14 +1018,56 @@ build_zlib()
+ build_lz4()
+ build_arrow()
+ build_tbb()
+-build_glog()
++if(PAIMON_USE_EXTERNAL_GLOG)
++    if(NOT GLOG_INCLUDE_DIR OR NOT GLOG_STATIC_LIB)
++        message(FATAL_ERROR
++                "PAIMON_USE_EXTERNAL_GLOG=ON but GLOG_INCLUDE_DIR or 
GLOG_STATIC_LIB is not set")
++    endif()
++    add_library(glog STATIC IMPORTED)
++    set(_glog_interface_includes "${GLOG_INCLUDE_DIR}")
++    set(_glog_interface_libs "")
++    if(GFLAGS_STATIC_LIB)
++        list(APPEND _glog_interface_libs "${GFLAGS_STATIC_LIB}")
++        if(GFLAGS_INCLUDE_DIR)
++            list(APPEND _glog_interface_includes "${GFLAGS_INCLUDE_DIR}")
++        endif()
++    endif()
++    set_target_properties(glog
++                          PROPERTIES IMPORTED_LOCATION "${GLOG_STATIC_LIB}"
++                                     INTERFACE_SYSTEM_INCLUDE_DIRECTORIES
++                                     "${_glog_interface_includes}"
++                                     INTERFACE_LINK_LIBRARIES 
"${_glog_interface_libs}"
++                                     INTERFACE_COMPILE_DEFINITIONS 
"GLOG_USE_GLOG_EXPORT")
++    unset(_glog_interface_includes)
++    unset(_glog_interface_libs)
++else()
++    build_glog()
++endif()
+ 
+ if(PAIMON_ENABLE_AVRO)
+     build_avro()
+ endif()
+ if(PAIMON_ENABLE_ORC)
+-    build_protobuf()
+-    build_orc()
++    if(PAIMON_USE_EXTERNAL_ORC)
++        if(TARGET orc::orc)
++            message(STATUS "Using external ORC target: orc::orc")
++        elseif(TARGET orc)
++            add_library(orc::orc ALIAS orc)
++            message(STATUS "Using external ORC target: orc (aliased to 
orc::orc)")
++        else()
++            message(FATAL_ERROR
++                    "PAIMON_USE_EXTERNAL_ORC=ON but no ORC target found. 
Define target orc::orc "
++                    "(or orc) before including ThirdpartyToolchain.cmake.")
++        endif()
++
++        get_target_property(_orc_include_dirs orc::orc 
INTERFACE_INCLUDE_DIRECTORIES)
++        if(_orc_include_dirs AND NOT _orc_include_dirs STREQUAL "NOTFOUND")
++            set(ORC_INCLUDE_DIR "${_orc_include_dirs}")
++        endif()
++    else()
++        build_protobuf()
++        build_orc()
++    endif()
+ endif()
+ if(PAIMON_ENABLE_JINDO)
+     build_jindosdk_c()
+diff --git a/cmake_modules/arrow.diff b/cmake_modules/arrow.diff
+index 997cb6b..c7b1cf4 100644
+--- a/cmake_modules/arrow.diff
++++ b/cmake_modules/arrow.diff
+@@ -194,5 +194,22 @@ index 4d3acb491e..3906ff3c59 100644
+    int64_t max_row_group_length_;
+ +  int64_t max_row_group_size_;
+    int64_t pagesize_;
+-   ParquetDataPageVersion parquet_data_page_version_;
+-   ParquetVersion::type parquet_version_;
++  ParquetDataPageVersion parquet_data_page_version_;
++  ParquetVersion::type parquet_version_;
++
++diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
++--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
++@@ -1789,8 +1789,11 @@ if(ARROW_WITH_THRIFT)
++                      REQUIRED_VERSION
++                      0.11.0)
++ 
++-  string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
+++  if(NOT Thrift_VERSION)
+++    set(Thrift_VERSION "${ARROW_THRIFT_BUILD_VERSION}")
+++  endif()
+++  string(REPLACE "." ";" Thrift_VERSION_LIST "${Thrift_VERSION}")
++   list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR)
++   list(GET Thrift_VERSION_LIST 1 Thrift_VERSION_MINOR)
++   list(GET Thrift_VERSION_LIST 2 Thrift_VERSION_PATCH)
++ endif()
+diff --git a/cmake_modules/san-config.cmake b/cmake_modules/san-config.cmake
+index 13399f6..6a8b6db 100644
+--- a/cmake_modules/san-config.cmake
++++ b/cmake_modules/san-config.cmake
+@@ -17,6 +17,9 @@ if(PAIMON_USE_ASAN)
+         target_compile_options(paimon_sanitizer_flags INTERFACE 
-fsanitize=address
+                                                                 
-fno-omit-frame-pointer)
+         target_link_options(paimon_sanitizer_flags INTERFACE 
-fsanitize=address)
++        if(PAIMON_LINK_SHARED_STDLIB AND NOT APPLE)
++            target_link_options(paimon_sanitizer_flags INTERFACE 
-shared-libasan)
++        endif()
+         message(STATUS "Address Sanitizer enabled")
+     else()
+         message(WARNING "Address Sanitizer is only supported for GCC and 
Clang compilers")
+diff --git a/scripts/check_paimon_deps.sh b/scripts/check_paimon_deps.sh
+new file mode 100755
+index 0000000..4b54e5b
+--- /dev/null
++++ b/scripts/check_paimon_deps.sh
+@@ -0,0 +1,82 @@
++#!/bin/bash
++# Paimon-cpp 依赖库检查脚本
++
++# 默认 PAIMON_HOME
++PAIMON_HOME="${PAIMON_HOME:-/mnt/disk1/chenjunwei/paimon-install-asan}"

Review Comment:
   The dependency check script added to the paimon-cpp patch hardcodes a 
developer-specific default PAIMON_HOME 
("/mnt/disk1/chenjunwei/paimon-install-asan"). This will be wrong in almost all 
environments and can mislead users. Prefer no default (require PAIMON_HOME to 
be set) or derive it relative to the script location, and keep the 
output/messages environment-agnostic.
   ```suggestion
   @@ -0,0 +1,86 @@
   +#!/bin/bash
   +# Paimon-cpp 依赖库检查脚本
   +
   +# 默认 PAIMON_HOME:优先使用环境变量,否则相对于当前脚本目录推导
   +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
   +if [ -z "${PAIMON_HOME:-}" ]; then
   +    PAIMON_HOME="$(cd "$SCRIPT_DIR/.." && pwd)"
   +fi
   ```



##########
be/src/vec/exec/scan/file_scanner.cpp:
##########
@@ -67,6 +67,8 @@
 #include "vec/exec/format/table/iceberg_reader.h"
 #include "vec/exec/format/table/lakesoul_jni_reader.h"
 #include "vec/exec/format/table/max_compute_jni_reader.h"
+#include "vec/exec/format/table/paimon_cpp_reader.h"
+#include "vec/exec/format/table/paimon_predicate_converter.h"

Review Comment:
   These paimon-cpp headers are included unconditionally, but the project only 
adds PAIMON_HOME include paths when ENABLE_PAIMON_CPP is ON. With 
ENABLE_PAIMON_CPP=OFF this will fail to compile due to missing "paimon/..." 
headers (and Vec/CMakeLists.txt globs all *.cpp, so paimon_cpp_reader.cpp will 
be built too). Please gate the includes/usage (and the new paimon-cpp .cpp 
files) behind the CMake option (e.g., exclude sources when disabled and/or add 
a compile definition when enabled).
   ```suggestion
   #include "vec/exec/format/table/max_compute_jni_reader.h"
   #ifdef ENABLE_PAIMON_CPP
   #include "vec/exec/format/table/paimon_cpp_reader.h"
   #include "vec/exec/format/table/paimon_predicate_converter.h"
   #endif
   ```



##########
fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonSource.java:
##########
@@ -70,4 +70,12 @@ public ExternalCatalog getCatalog() {
     public String getFileFormatFromTableProperties() {
         return originTable.options().getOrDefault("file.format", "parquet");
     }
+
+    public String getTableLocation() {
+        if (originTable instanceof FileStoreTable) {
+            return ((FileStoreTable) originTable).location().toString();
+        }
+        // Fallback to path option
+        return originTable.options().get("path");
+    }

Review Comment:
   getTableLocation() dereferences originTable without a null check, but the 
@VisibleForTesting no-arg constructor explicitly sets originTable = null. This 
will now throw NPE in unit tests (and any test helpers using the no-arg 
constructor). Consider returning null when originTable is null (or throwing a 
clearer exception in non-test paths).



##########
be/CMakeLists.txt:
##########
@@ -544,6 +562,91 @@ set(COMMON_THIRDPARTY
     ${COMMON_THIRDPARTY}
 )
 
+if (ENABLE_PAIMON_CPP)
+    if (NOT PAIMON_HOME)
+        set(_paimon_default_home "${THIRDPARTY_DIR}/paimon-cpp")
+        if (EXISTS 
"${_paimon_default_home}/lib/cmake/Paimon/PaimonConfig.cmake")
+            set(PAIMON_HOME "${_paimon_default_home}" CACHE PATH "" FORCE)
+        endif()
+        unset(_paimon_default_home)
+    endif()
+    if (NOT PAIMON_HOME)
+        message(FATAL_ERROR "ENABLE_PAIMON_CPP=ON but PAIMON_HOME is not set")
+    endif()
+    set(Paimon_DIR "${PAIMON_HOME}/lib/cmake/Paimon" CACHE PATH "" FORCE)
+    find_package(Paimon CONFIG REQUIRED NO_DEFAULT_PATH)
+    include_directories("${PAIMON_HOME}/include")
+    # Link glog after paimon static libs to satisfy RawLog__ in static link.
+    list(REMOVE_ITEM COMMON_THIRDPARTY glog)
+
+    # Paimon static library dependencies
+    set(paimon_fmt_lib "${PAIMON_HOME}/lib64/paimon_deps/libfmtd.a")
+    if (NOT EXISTS "${paimon_fmt_lib}")
+        set(paimon_fmt_lib "${PAIMON_HOME}/lib64/paimon_deps/libfmt.a")
+    endif()
+    add_library(paimon_fmt STATIC IMPORTED)
+    set_target_properties(paimon_fmt PROPERTIES IMPORTED_LOCATION 
"${paimon_fmt_lib}")
+    set(paimon_tbb_lib "${PAIMON_HOME}/lib64/paimon_deps/libtbb_debug.a")
+    if (NOT EXISTS "${paimon_tbb_lib}")
+        set(paimon_tbb_lib "${PAIMON_HOME}/lib64/paimon_deps/libtbb.a")
+    endif()
+    add_library(paimon_tbb STATIC IMPORTED)
+    set_target_properties(paimon_tbb PROPERTIES IMPORTED_LOCATION 
"${paimon_tbb_lib}")
+    add_library(paimon_roaring STATIC IMPORTED)
+    set_target_properties(paimon_roaring PROPERTIES IMPORTED_LOCATION 
"${PAIMON_HOME}/lib64/paimon_deps/libroaring_bitmap.a")
+    add_library(paimon_xxhash STATIC IMPORTED)
+    set_target_properties(paimon_xxhash PROPERTIES IMPORTED_LOCATION 
"${PAIMON_HOME}/lib64/paimon_deps/libxxhash.a")
+    add_library(paimon_arrow_dataset STATIC IMPORTED)
+    set_target_properties(paimon_arrow_dataset PROPERTIES IMPORTED_LOCATION 
"${PAIMON_HOME}/lib64/paimon_deps/libarrow_dataset.a")
+    add_library(paimon_arrow_acero STATIC IMPORTED)
+    set_target_properties(paimon_arrow_acero PROPERTIES IMPORTED_LOCATION 
"${PAIMON_HOME}/lib64/paimon_deps/libarrow_acero.a")
+    add_library(paimon_arrow STATIC IMPORTED)
+    set_target_properties(paimon_arrow PROPERTIES IMPORTED_LOCATION 
"${PAIMON_HOME}/lib64/paimon_deps/libarrow.a")
+    set(paimon_arrow_bundled_deps_lib 
"${PAIMON_HOME}/lib64/paimon_deps/libarrow_bundled_dependencies.a")
+    if (EXISTS "${paimon_arrow_bundled_deps_lib}")
+        add_library(paimon_arrow_bundled_dependencies STATIC IMPORTED)
+        set_target_properties(paimon_arrow_bundled_dependencies PROPERTIES 
IMPORTED_LOCATION "${paimon_arrow_bundled_deps_lib}")
+    endif()
+    add_library(paimon_parquet STATIC IMPORTED)
+    set_target_properties(paimon_parquet PROPERTIES IMPORTED_LOCATION 
"${PAIMON_HOME}/lib64/paimon_deps/libparquet.a")
+    if(NOT PAIMON_USE_EXTERNAL_ORC)
+        add_library(paimon_orc STATIC IMPORTED)
+        set_target_properties(paimon_orc PROPERTIES IMPORTED_LOCATION 
"${PAIMON_HOME}/lib64/paimon_deps/liborc.a")
+    endif()

Review Comment:
   The imported paimon dependency libraries (fmt/tbb fallbacks, 
roaring/xxhash/arrow/parquet/orc) are not validated after selecting paths. If 
any file is missing, CMake will fail later with a less actionable error. 
Consider adding explicit EXISTS checks and a clear FATAL_ERROR listing the 
missing artifacts under PAIMON_HOME, especially after the fallback selection 
for fmt/tbb.



##########
fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java:
##########
@@ -218,9 +218,19 @@ private void setPaimonParams(TFileRangeDesc rangeDesc, 
PaimonSplit paimonSplit)
 
         String fileFormat = getFileFormat(paimonSplit.getPathString());
         if (split != null) {
-            // use jni reader
+            // use jni reader or paimon-cpp reader
             rangeDesc.setFormatType(TFileFormatType.FORMAT_JNI);
-            fileDesc.setPaimonSplit(PaimonUtil.encodeObjectToString(split));
+            // Use Paimon native serialization for paimon-cpp reader
+            if (sessionVariable.isEnablePaimonCppReader() && split instanceof 
DataSplit) {
+                
fileDesc.setPaimonSplit(PaimonUtil.encodeDataSplitToString((DataSplit) split));
+            } else {
+                
fileDesc.setPaimonSplit(PaimonUtil.encodeObjectToString(split));
+            }
+            // Set table location for paimon-cpp reader
+            String tableLocation = source.getTableLocation();
+            if (tableLocation != null) {
+                fileDesc.setPaimonTable(tableLocation);
+            }

Review Comment:
   TPaimonFileDesc.paimon_table is marked deprecated in 
gensrc/thrift/PlanNodes.thrift. Reusing it for paimon-cpp table location makes 
the API harder to evolve and risks removal/behavior changes. Prefer adding a 
new non-deprecated thrift field (or updating the thrift definition/comment if 
this field is intended to be used again) for the paimon-cpp table root/location.
   ```suggestion
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to