This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new f1e1020e6fc [fix](parquet)fix parquet write timestamp int96 type. 
(1/2) (#61760)
f1e1020e6fc is described below

commit f1e1020e6fc4e7eaae8b89fe2dbd1ce2a9e1e7da
Author: daidai <[email protected]>
AuthorDate: Sat Mar 28 09:15:03 2026 +0800

    [fix](parquet)fix parquet write timestamp int96 type. (1/2) (#61760)
    
    ### What problem does this PR solve?
    PR #60946
    Problem Summary:
    This pull request fixes a problem caused by a patch introduced in #60946,
    which made Doris exports fail to write Parquet INT96 timestamps. The
    issue is resolved by adding a new Arrow patch that introduces a writer
    property which forces timestamps to be written as INT96.
    
    This PR only updates thirdparty; the next PR will update the BE code.
---
 thirdparty/download-thirdparty.sh                  |  4 +
 ...arrow-17.0.0-force-write-int96-timestamps.patch | 98 ++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/thirdparty/download-thirdparty.sh 
b/thirdparty/download-thirdparty.sh
index f57cab9de1a..75ba6313529 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -431,6 +431,10 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
             # Paimon-cpp parquet patches: row-group-aware batch reader, 
max_row_group_size,
             # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty 
fix.
             patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"
+
+            # apache-arrow-17.0.0-force-write-int96-timestamps.patch : 
+            # Introducing the parameter that forces writing int96 timestamps 
for compatibility with Paimon cpp. 
+            patch -p1 
<"${TP_PATCH_DIR}/apache-arrow-17.0.0-force-write-int96-timestamps.patch"
             touch "${PATCHED_MARK}"
         fi
         cd -
diff --git 
a/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch 
b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch
new file mode 100644
index 00000000000..5a754247566
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch
@@ -0,0 +1,98 @@
+diff -ruN 
arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc 
arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc
+--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc     
2026-03-27 01:23:23.651831424 +0800
++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc  2026-03-27 
01:28:36.855281965 +0800
+@@ -178,7 +178,8 @@
+ 
+   // The user is explicitly asking for Impala int96 encoding, there is no
+   // logical type.
+-  if (arrow_properties.support_deprecated_int96_timestamps() && target_unit 
== ::arrow::TimeUnit::NANO) {
++  if (arrow_properties.force_write_int96_timestamps() ||
++      (arrow_properties.support_deprecated_int96_timestamps() && target_unit 
== ::arrow::TimeUnit::NANO)) {
+     *physical_type = ParquetType::INT96;
+     return Status::OK();
+   }
+diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h 
arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h
+--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h        
2026-03-27 01:23:23.643831362 +0800
++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h     2026-03-27 
01:27:47.717897537 +0800
+@@ -980,6 +980,7 @@
+    public:
+     Builder()
+         : write_timestamps_as_int96_(false),
++          force_write_int96_timestamps_(false),
+           coerce_timestamps_enabled_(false),
+           coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
+           truncated_timestamps_allowed_(false),
+@@ -1005,6 +1006,21 @@
+       return this;
+     }
+ 
++    /// \brief Force writing legacy int96 timestamps.
++    ///
++    /// This bypasses unit-based guards and writes INT96 whenever timestamp
++    /// metadata is resolved.
++    Builder* enable_force_write_int96_timestamps() {
++      force_write_int96_timestamps_ = true;
++      return this;
++    }
++
++    /// \brief Disable forcing legacy int96 timestamps (default).
++    Builder* disable_force_write_int96_timestamps() {
++      force_write_int96_timestamps_ = false;
++      return this;
++    }
++
+     /// \brief Coerce all timestamps to the specified time unit.
+     /// \param unit time unit to truncate to.
+     /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to 
microseconds.
+@@ -1085,7 +1101,8 @@
+     /// Create the final properties.
+     std::shared_ptr<ArrowWriterProperties> build() {
+       return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
+-          write_timestamps_as_int96_, coerce_timestamps_enabled_, 
coerce_timestamps_unit_,
++          write_timestamps_as_int96_, force_write_int96_timestamps_,
++          coerce_timestamps_enabled_, coerce_timestamps_unit_,
+           truncated_timestamps_allowed_, store_schema_, 
compliant_nested_types_,
+           engine_version_, use_threads_, executor_));
+     }
+@@ -1093,6 +1110,8 @@
+    private:
+     bool write_timestamps_as_int96_;
+ 
++    bool force_write_int96_timestamps_;
++
+     bool coerce_timestamps_enabled_;
+     ::arrow::TimeUnit::type coerce_timestamps_unit_;
+     bool truncated_timestamps_allowed_;
+@@ -1107,6 +1126,8 @@
+ 
+   bool support_deprecated_int96_timestamps() const { return 
write_timestamps_as_int96_; }
+ 
++  bool force_write_int96_timestamps() const { return 
force_write_int96_timestamps_; }
++
+   bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; 
}
+   ::arrow::TimeUnit::type coerce_timestamps_unit() const {
+     return coerce_timestamps_unit_;
+@@ -1138,6 +1159,7 @@
+ 
+  private:
+   explicit ArrowWriterProperties(bool write_nanos_as_int96,
++                                 bool force_write_int96_timestamps,
+                                  bool coerce_timestamps_enabled,
+                                  ::arrow::TimeUnit::type 
coerce_timestamps_unit,
+                                  bool truncated_timestamps_allowed, bool 
store_schema,
+@@ -1145,6 +1167,7 @@
+                                  EngineVersion engine_version, bool 
use_threads,
+                                  ::arrow::internal::Executor* executor)
+       : write_timestamps_as_int96_(write_nanos_as_int96),
++        force_write_int96_timestamps_(force_write_int96_timestamps),
+         coerce_timestamps_enabled_(coerce_timestamps_enabled),
+         coerce_timestamps_unit_(coerce_timestamps_unit),
+         truncated_timestamps_allowed_(truncated_timestamps_allowed),
+@@ -1155,6 +1178,7 @@
+         executor_(executor) {}
+ 
+   const bool write_timestamps_as_int96_;
++  const bool force_write_int96_timestamps_;
+   const bool coerce_timestamps_enabled_;
+   const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+   const bool truncated_timestamps_allowed_;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to