This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new bf071a7e007 branch-4.0:[fix](parquet)fix parquet write timestamp int96 
type. (1/2). (#63779)
bf071a7e007 is described below

commit bf071a7e007cf641cf10f9c75a9909d77ba9d46b
Author: daidai <[email protected]>
AuthorDate: Thu May 28 14:40:19 2026 +0800

    branch-4.0:[fix](parquet)fix parquet write timestamp int96 type. (1/2). 
(#63779)
    
    ### What problem does this PR solve?
    Problem Summary:
    pick #61760
---
 thirdparty/download-thirdparty.sh                  |  3 +
 ...arrow-17.0.0-force-write-int96-timestamps.patch | 97 ++++++++++++++++++++++
 thirdparty/vars.sh                                 |  2 +-
 3 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/thirdparty/download-thirdparty.sh 
b/thirdparty/download-thirdparty.sh
index 89040c09d6a..d5350ac9a30 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -380,6 +380,9 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
             # std::string objects in RELRO, then crash while initializing them.
             patch -p1 
<"${TP_PATCH_DIR}/apache-arrow-17.0.0-status-inline-static-fix.patch"
 
+            # apache-arrow-17.0.0-force-write-int96-timestamps.patch : 
+            # Introduces a parameter that forces writing int96 timestamps, 
for compatibility with branch-4.0 builds. 
+            patch -p1 
<"${TP_PATCH_DIR}/apache-arrow-17.0.0-force-write-int96-timestamps.patch"
             touch "${PATCHED_MARK}"
         fi
         cd -
diff --git 
a/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch 
b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch
new file mode 100644
index 00000000000..30ca13aa1c0
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch
@@ -0,0 +1,97 @@
+diff -ruN arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc 
arrow-apache-arrow-branch40-17.0.0/cpp/src/parquet/arrow/schema.cc
+--- arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc  2024-07-11 
16:57:21.000000000 +0800
++++ arrow-apache-arrow-branch40-17.0.0/cpp/src/parquet/arrow/schema.cc 
2026-05-28 10:47:43.886820058 +0800
+@@ -178,7 +178,8 @@
+ 
+   // The user is explicitly asking for Impala int96 encoding, there is no
+   // logical type.
+-  if (arrow_properties.support_deprecated_int96_timestamps()) {
++  if (arrow_properties.force_write_int96_timestamps() || 
++      arrow_properties.support_deprecated_int96_timestamps()) {
+     *physical_type = ParquetType::INT96;
+     return Status::OK();
+   }
+diff -ruN arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h 
arrow-apache-arrow-branch40-17.0.0/cpp/src/parquet/properties.h
+--- arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h     2024-07-11 
16:57:21.000000000 +0800
++++ arrow-apache-arrow-branch40-17.0.0/cpp/src/parquet/properties.h    
2026-05-28 10:53:44.136083775 +0800
+@@ -965,6 +965,7 @@
+    public:
+     Builder()
+         : write_timestamps_as_int96_(false),
++          force_write_int96_timestamps_(false),
+           coerce_timestamps_enabled_(false),
+           coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
+           truncated_timestamps_allowed_(false),
+@@ -990,6 +991,21 @@
+       return this;
+     }
+ 
++    /// \brief Force writing legacy int96 timestamps.
++    ///
++    /// This bypasses unit-based guards and writes INT96 whenever timestamp
++    /// metadata is resolved.
++    Builder* enable_force_write_int96_timestamps() {
++      force_write_int96_timestamps_ = true;
++      return this;
++    }
++
++    /// \brief Disable forcing legacy int96 timestamps (default).
++    Builder* disable_force_write_int96_timestamps() {
++      force_write_int96_timestamps_ = false;
++      return this;
++    }
++
+     /// \brief Coerce all timestamps to the specified time unit.
+     /// \param unit time unit to truncate to.
+     /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to 
microseconds.
+@@ -1070,14 +1086,15 @@
+     /// Create the final properties.
+     std::shared_ptr<ArrowWriterProperties> build() {
+       return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
+-          write_timestamps_as_int96_, coerce_timestamps_enabled_, 
coerce_timestamps_unit_,
++          write_timestamps_as_int96_, force_write_int96_timestamps_,
++          coerce_timestamps_enabled_, coerce_timestamps_unit_,
+           truncated_timestamps_allowed_, store_schema_, 
compliant_nested_types_,
+           engine_version_, use_threads_, executor_));
+     }
+ 
+    private:
+     bool write_timestamps_as_int96_;
+-
++    bool force_write_int96_timestamps_;
+     bool coerce_timestamps_enabled_;
+     ::arrow::TimeUnit::type coerce_timestamps_unit_;
+     bool truncated_timestamps_allowed_;
+@@ -1092,6 +1109,8 @@
+ 
+   bool support_deprecated_int96_timestamps() const { return 
write_timestamps_as_int96_; }
+ 
++  bool force_write_int96_timestamps() const { return 
force_write_int96_timestamps_; }
++
+   bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; 
}
+   ::arrow::TimeUnit::type coerce_timestamps_unit() const {
+     return coerce_timestamps_unit_;
+@@ -1123,6 +1142,7 @@
+ 
+  private:
+   explicit ArrowWriterProperties(bool write_nanos_as_int96,
++                                 bool force_write_int96_timestamps,
+                                  bool coerce_timestamps_enabled,
+                                  ::arrow::TimeUnit::type 
coerce_timestamps_unit,
+                                  bool truncated_timestamps_allowed, bool 
store_schema,
+@@ -1130,6 +1150,7 @@
+                                  EngineVersion engine_version, bool 
use_threads,
+                                  ::arrow::internal::Executor* executor)
+       : write_timestamps_as_int96_(write_nanos_as_int96),
++        force_write_int96_timestamps_(force_write_int96_timestamps),
+         coerce_timestamps_enabled_(coerce_timestamps_enabled),
+         coerce_timestamps_unit_(coerce_timestamps_unit),
+         truncated_timestamps_allowed_(truncated_timestamps_allowed),
+@@ -1140,6 +1161,7 @@
+         executor_(executor) {}
+ 
+   const bool write_timestamps_as_int96_;
++  const bool force_write_int96_timestamps_;
+   const bool coerce_timestamps_enabled_;
+   const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+   const bool truncated_timestamps_allowed_;
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index ac36a438055..7356cba4c11 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -134,7 +134,7 @@ BZIP_SOURCE=bzip2-1.0.8
 BZIP_MD5SUM="67e051268d0c475ea773822f7500d0e5"
 
 # lzo2
-LZO2_DOWNLOAD="https://fossies.org/linux/misc/lzo-2.10.tar.gz";
+LZO2_DOWNLOAD="https://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz";
 LZO2_NAME=lzo-2.10.tar.gz
 LZO2_SOURCE=lzo-2.10
 LZO2_MD5SUM="39d3f3f9c55c87b1e5d6888e1420f4b5"


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to