This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new c34b80a2706 branch-4.1: [fix](hive) Fix Hive DATE timezone shift in 
external readers #61330 (#61722)
c34b80a2706 is described below

commit c34b80a27064dca77fe62667dbdd7542602cb16e
Author: Socrates <[email protected]>
AuthorDate: Thu Mar 26 09:09:53 2026 +0800

    branch-4.1: [fix](hive) Fix Hive DATE timezone shift in external readers 
#61330 (#61722)
    
    Cherry-pick #61330 to branch-4.1
    
    ### What problem does this PR solve?
    
    - Related PR: #61330
    
    Fix Hive external table DATE columns being shifted by one day in west
    time zones when reading ORC/Parquet files.
    
    This backport keeps DATE semantics time-zone-independent for Hive
    external ORC/Parquet reads and includes the matching unit and regression
    coverage from the merged master change.
    
    ### Cherry-pick commit
    
    - `18e5dda9732` - [fix](hive) Fix Hive DATE timezone shift in external
    readers (#61330)
---
 be/src/format/orc/vorc_reader.cpp                  |  3 -
 be/src/format/orc/vorc_reader.h                    |  5 +-
 be/src/format/parquet/parquet_column_convert.h     | 10 +---
 be/test/format/orc/orc_read_lines.cpp              | 36 ++++++++---
 be/test/format/parquet/parquet_expr_test.cpp       | 70 ++++++++++++++++++++++
 .../hive/test_hive_date_timezone.out               | 56 +++++++++++++++++
 .../hive/test_hive_date_timezone.groovy            | 60 +++++++++++++++++++
 7 files changed, 217 insertions(+), 23 deletions(-)

diff --git a/be/src/format/orc/vorc_reader.cpp 
b/be/src/format/orc/vorc_reader.cpp
index b569f7e85a8..ba48aab565d 100644
--- a/be/src/format/orc/vorc_reader.cpp
+++ b/be/src/format/orc/vorc_reader.cpp
@@ -191,9 +191,6 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* 
state,
                   state == nullptr ? true : 
state->query_options().enable_orc_filter_by_min_max),
           _dict_cols_has_converted(false) {
     TimezoneUtils::find_cctz_time_zone(ctz, _time_zone);
-    VecDateTimeValue t;
-    t.from_unixtime(0, ctz);
-    _offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1.
     _meta_cache = meta_cache;
     _init_profile();
     _init_system_properties();
diff --git a/be/src/format/orc/vorc_reader.h b/be/src/format/orc/vorc_reader.h
index c5d04c652dd..e4b53221cc5 100644
--- a/be/src/format/orc/vorc_reader.h
+++ b/be/src/format/orc/vorc_reader.h
@@ -511,8 +511,8 @@ private:
                     }
                 }
 
-                // because the date api argument is int32_t, we should cast to 
int32_t.
-                int32_t date_value = cast_set<int32_t>(data->data[i]) + 
_offset_days;
+                // ORC DATE stores a logical day count without time zone 
semantics.
+                int32_t date_value = cast_set<int32_t>(data->data[i]);
                 if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
                     v.create_from_date_v2(date_dict[date_value], TIME_DATE);
                     // we should cast to date if using date v1.
@@ -655,7 +655,6 @@ private:
     int64_t _range_size;
     std::string _ctz;
 
-    int32_t _offset_days = 0;
     cctz::time_zone _time_zone;
 
     // The columns of the table to be read (contain columns that do not exist)
diff --git a/be/src/format/parquet/parquet_column_convert.h 
b/be/src/format/parquet/parquet_column_convert.h
index be7ac3a9bcc..0d9fa12466a 100644
--- a/be/src/format/parquet/parquet_column_convert.h
+++ b/be/src/format/parquet/parquet_column_convert.h
@@ -39,7 +39,6 @@ struct ConvertParams {
     static const cctz::time_zone utc0;
     // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set 
local time zone
     const cctz::time_zone* ctz = nullptr;
-    size_t offset_days = 0;
     int64_t second_mask = 1;
     int64_t scale_to_nano_factor = 1;
     const FieldSchema* field_schema = nullptr;
@@ -110,11 +109,6 @@ struct ConvertParams {
             }
         }
 
-        if (ctz) {
-            VecDateTimeValue t;
-            t.from_unixtime(0, *ctz);
-            offset_days = t.day() == 31 ? -1 : 0;
-        }
         is_type_compatibility = field_schema_->is_type_compatibility;
     }
 };
@@ -642,9 +636,7 @@ class Int32ToDate : public PhysicalToLogicalConverter {
         date_day_offset_dict& date_dict = date_day_offset_dict::get();
 
         for (int i = 0; i < rows; i++) {
-            int64_t date_value = (int64_t)src_data[i] + 
_convert_params->offset_days;
-            data.push_back_without_reserve(
-                    
date_dict[cast_set<int32_t>(date_value)].to_date_int_val());
+            
data.push_back_without_reserve(date_dict[src_data[i]].to_date_int_val());
         }
 
         return Status::OK();
diff --git a/be/test/format/orc/orc_read_lines.cpp 
b/be/test/format/orc/orc_read_lines.cpp
index d1452141ad6..f1ece335987 100644
--- a/be/test/format/orc/orc_read_lines.cpp
+++ b/be/test/format/orc/orc_read_lines.cpp
@@ -57,7 +57,8 @@ public:
     OrcReadLinesTest() {}
 };
 
-static void read_orc_line(int64_t line, std::string block_dump) {
+static void read_orc_line(int64_t line, std::string block_dump,
+                          const std::string& time_zone = "CST") {
     auto runtime_state = RuntimeState::create_unique();
 
     std::vector<std::string> column_names = {"col1", "col2", "col3", "col4", 
"col5",
@@ -119,7 +120,6 @@ static void read_orc_line(int64_t line, std::string 
block_dump) {
     io::IOContext io_ctx;
     io::FileReaderStats file_reader_stats;
     io_ctx.file_reader_stats = &file_reader_stats;
-    std::string time_zone = "CST";
     auto reader = OrcReader::create_unique(nullptr, runtime_state.get(), 
params, range, 100,
                                            time_zone, &io_ctx, nullptr, true);
     auto local_fs = io::global_local_filesystem();
@@ -143,7 +143,8 @@ static void read_orc_line(int64_t line, std::string 
block_dump) {
     std::unordered_map<std::string, std::tuple<std::string, const 
SlotDescriptor*>>
             partition_columns;
     std::unordered_map<std::string, VExprContextSPtr> missing_columns;
-    static_cast<void>(reader->set_fill_columns(partition_columns, 
missing_columns));
+    auto st = reader->set_fill_columns(partition_columns, missing_columns);
+    EXPECT_TRUE(st.ok()) << st;
     BlockUPtr block = Block::create_unique();
     for (const auto& slot_desc : tuple_desc->slots()) {
         auto data_type = slot_desc->type();
@@ -158,7 +159,8 @@ static void read_orc_line(int64_t line, std::string 
block_dump) {
 
     bool eof = false;
     size_t read_row = 0;
-    static_cast<void>(reader->get_next_block(block.get(), &read_row, &eof));
+    st = reader->get_next_block(block.get(), &read_row, &eof);
+    EXPECT_TRUE(st.ok()) << st;
     auto row_id_string_column = static_cast<const ColumnString&>(
             
*block->get_by_position(block->get_position_by_name("row_id")).column.get());
     for (auto i = 0; i < row_id_string_column.size(); i++) {
@@ -185,7 +187,7 @@ static void read_orc_line(int64_t line, std::string 
block_dump) {
         slot_info.is_file_slot = true;
         params.required_slots.emplace_back(slot_info);
     }
-    runtime_state->_timezone = "CST";
+    runtime_state->_timezone = time_zone;
 
     std::unique_ptr<RuntimeProfile> runtime_profile;
     runtime_profile = std::make_unique<RuntimeProfile>("ExternalRowIDFetcher");
@@ -196,9 +198,9 @@ static void read_orc_line(int64_t line, std::string 
block_dump) {
     ExternalFileMappingInfo external_info(0, range, false);
     int64_t init_reader_ms = 0;
     int64_t get_block_ms = 0;
-    auto st = vf->read_lines_from_range(range, {line}, block.get(), 
external_info, &init_reader_ms,
-                                        &get_block_ms);
-    EXPECT_TRUE(st.ok());
+    st = vf->read_lines_from_range(range, {line}, block.get(), external_info, 
&init_reader_ms,
+                                   &get_block_ms);
+    EXPECT_TRUE(st.ok()) << st;
     EXPECT_EQ(block->dump_data(1), block_dump);
 }
 
@@ -375,4 +377,22 @@ TEST_F(OrcReadLinesTest, test9) {
     read_orc_line(9, block_dump);
 }
 
+TEST_F(OrcReadLinesTest, date_should_not_shift_in_west_timezone) {
+    std::string block_dump =
+            
"+----------------------+--------------------+----------------------+------------------"
+            
"----+----------------------+---------------------+-------------------+----------------"
+            
"--------+----------------------+\n|col1(Nullable(BIGINT))|col2(Nullable(BOOL))|col3("
+            
"Nullable(String))|col4(Nullable(DateV2))|col5(Nullable(DOUBLE))|col6(Nullable(FLOAT))|"
+            
"col7(Nullable(INT))|col8(Nullable(SMALLINT))|col9(Nullable(String))|\n+---------------"
+            
"-------+--------------------+----------------------+----------------------+-----------"
+            
"-----------+---------------------+-------------------+------------------------+-------"
+            "---------------+\n|                     1|                   1|   
              "
+            "doris|            1900-01-01|                 1.567|              
  1.567|            "
+            "  12345|                       1|                 "
+            
"doris|\n+----------------------+--------------------+----------------------+----------"
+            
"------------+----------------------+---------------------+-------------------+--------"
+            "----------------+----------------------+\n";
+    read_orc_line(1, block_dump, "America/Mexico_City");
+}
+
 } // namespace doris
diff --git a/be/test/format/parquet/parquet_expr_test.cpp 
b/be/test/format/parquet/parquet_expr_test.cpp
index 159ea128584..73441901db7 100644
--- a/be/test/format/parquet/parquet_expr_test.cpp
+++ b/be/test/format/parquet/parquet_expr_test.cpp
@@ -292,6 +292,69 @@ public:
         p_reader->_ctz = &ctz;
     }
 
+    std::string read_date_column_dump(const std::string& timezone_name) {
+        TDescriptorTable local_desc_table;
+        TTableDescriptor local_table_desc;
+        create_table_desc(local_desc_table, local_table_desc, {"date_col"},
+                          {TPrimitiveType::DATEV2});
+        DescriptorTbl* local_desc_tbl = nullptr;
+        ObjectPool local_obj_pool;
+        static_cast<void>(
+                DescriptorTbl::create(&local_obj_pool, local_desc_table, 
&local_desc_tbl));
+
+        auto tuple_desc = local_desc_tbl->get_tuple_descriptor(0);
+        auto slot_descs = tuple_desc->slots();
+        auto local_fs = io::global_local_filesystem();
+        io::FileReaderSPtr local_file_reader;
+        static_cast<void>(local_fs->open_file(file_path, &local_file_reader));
+
+        cctz::time_zone local_ctz;
+        TimezoneUtils::find_cctz_time_zone(timezone_name, local_ctz);
+
+        std::vector<std::string> column_names;
+        std::unordered_map<std::string, uint32_t> col_name_to_block_idx;
+        for (int i = 0; i < slot_descs.size(); i++) {
+            column_names.push_back(slot_descs[i]->col_name());
+            col_name_to_block_idx[slot_descs[i]->col_name()] = i;
+        }
+
+        TFileScanRangeParams scan_params;
+        TFileRangeDesc scan_range;
+        scan_range.start_offset = 0;
+        scan_range.size = local_file_reader->size();
+
+        auto local_reader = ParquetReader::create_unique(
+                nullptr, scan_params, scan_range, scan_range.size, &local_ctz, 
nullptr, nullptr);
+        local_reader->set_file_reader(local_file_reader);
+        phmap::flat_hash_map<int, 
std::vector<std::shared_ptr<ColumnPredicate>>> tmp;
+        static_cast<void>(local_reader->init_reader(column_names, 
&col_name_to_block_idx, {}, tmp,
+                                                    tuple_desc, nullptr, 
nullptr, nullptr,
+                                                    nullptr));
+
+        std::unordered_map<std::string, std::tuple<std::string, const 
SlotDescriptor*>>
+                partition_columns;
+        std::unordered_map<std::string, VExprContextSPtr> missing_columns;
+        static_cast<void>(local_reader->set_fill_columns(partition_columns, 
missing_columns));
+
+        bool eof = false;
+        std::string dump;
+        while (!eof) {
+            BlockUPtr block = Block::create_unique();
+            for (const auto& slot_desc : tuple_desc->slots()) {
+                auto data_type = make_nullable(slot_desc->type());
+                MutableColumnPtr data_column = data_type->create_column();
+                block->insert(ColumnWithTypeAndName(std::move(data_column), 
data_type,
+                                                    slot_desc->col_name()));
+            }
+
+            size_t read_row = 0;
+            Status st = local_reader->get_next_block(block.get(), &read_row, 
&eof);
+            EXPECT_TRUE(st.ok()) << st;
+            dump += block->dump_data();
+        }
+        return dump;
+    }
+
     static void create_table_desc(TDescriptorTable& t_desc_table, 
TTableDescriptor& t_table_desc,
                                   std::vector<std::string> table_column_names,
                                   std::vector<TPrimitiveType::type> types) {
@@ -400,6 +463,13 @@ TEST_F(ParquetExprTest, test_min_max) {
     }
 }
 
+TEST_F(ParquetExprTest, date_should_not_shift_in_west_timezone) {
+    std::string dump = read_date_column_dump("-06:00");
+    EXPECT_NE(dump.find("2020-01-01"), std::string::npos);
+    EXPECT_NE(dump.find("2020-01-06"), std::string::npos);
+    EXPECT_EQ(dump.find("2019-12-31"), std::string::npos);
+}
+
 TEST_F(ParquetExprTest, test_ge_2) { // int64_col = 10000000001   [10000000000 
, 10000000000+3)
                                      // int64_col = 10000000001   [10000000000 
, 10000000000+3)
     int loc = 2;
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_date_timezone.out 
b/regression-test/data/external_table_p0/hive/test_hive_date_timezone.out
new file mode 100644
index 00000000000..0982d2b10c5
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/test_hive_date_timezone.out
@@ -0,0 +1,56 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !orc_date_utc --
+2023-10-22
+2020-01-01
+\N
+\N
+\N
+2019-12-31
+2022-05-20
+\N
+2023-01-01
+2023-01-01
+2023-01-01
+2023-01-01
+
+-- !parquet_date_utc --
+2023-10-22
+2020-01-01
+\N
+\N
+\N
+2019-12-31
+2022-05-20
+\N
+2023-01-01
+2023-01-01
+2023-01-01
+2023-01-01
+
+-- !orc_date_west_tz --
+2023-10-22
+2020-01-01
+\N
+\N
+\N
+2019-12-31
+2022-05-20
+\N
+2023-01-01
+2023-01-01
+2023-01-01
+2023-01-01
+
+-- !parquet_date_west_tz --
+2023-10-22
+2020-01-01
+\N
+\N
+\N
+2019-12-31
+2022-05-20
+\N
+2023-01-01
+2023-01-01
+2023-01-01
+2023-01-01
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_date_timezone.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_date_timezone.groovy
new file mode 100644
index 00000000000..26371b8f5c7
--- /dev/null
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_date_timezone.groovy
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_date_timezone", "p0,external") {
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return
+    }
+
+    for (String hivePrefix : ["hive3"]) {
+        setHivePrefix(hivePrefix)
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        String hmsPort = context.config.otherConfigs.get(hivePrefix + 
"HmsPort")
+        String hdfsPort = context.config.otherConfigs.get(hivePrefix + 
"HdfsPort")
+        String catalogName = "test_hive_date_timezone_${hivePrefix}"
+
+        sql """drop catalog if exists ${catalogName}"""
+        sql """
+            create catalog if not exists ${catalogName} properties (
+                'type'='hms',
+                'hadoop.username' = 'hadoop',
+                'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfsPort}',
+                'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hmsPort}'
+            );
+        """
+
+        try {
+            sql """set enable_fallback_to_original_planner=false"""
+            sql """switch ${catalogName}"""
+            sql """use `schema_change`"""
+
+            sql """set time_zone = 'UTC'"""
+            qt_orc_date_utc """select date_col from 
orc_primitive_types_to_date order by id"""
+            qt_parquet_date_utc """select date_col from 
parquet_primitive_types_to_date order by id"""
+
+            sql """set time_zone = 'America/Mexico_City'"""
+            qt_orc_date_west_tz """select date_col from 
orc_primitive_types_to_date order by id"""
+            qt_parquet_date_west_tz """select date_col from 
parquet_primitive_types_to_date order by id"""
+        } finally {
+            sql """set time_zone = default"""
+            sql """switch internal"""
+            sql """drop catalog if exists ${catalogName}"""
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to