This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 16b404a48fc branch-4.0: [bug](parquet) fix parquet type not handle
float16 type #58528 (#58630)
16b404a48fc is described below
commit 16b404a48fc21c3831e731e104feb966ae762494
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Jan 13 10:11:18 2026 +0800
branch-4.0: [bug](parquet) fix parquet type not handle float16 type #58528
(#58630)
Requires PR https://github.com/apache/doris/pull/57959 to be merged
before this change is picked to other branches.
Cherry-picked from #58528
Co-authored-by: zhangstar333 <[email protected]>
---
.../exec/format/parquet/parquet_column_convert.cpp | 5 ++
.../exec/format/parquet/parquet_column_convert.h | 84 ++++++++++++++++++++++
be/src/vec/exec/format/parquet/schema_desc.cpp | 2 +
.../tvf/test_hdfs_parquet_group0.out | 38 +++++-----
.../tvf/test_hdfs_tvf_float16.out | 19 +++++
.../tvf/test_hdfs_tvf_float16.groovy | 46 ++++++++++++
6 files changed, 175 insertions(+), 19 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
index d0b2d62d68c..d703d6b1a5a 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
@@ -231,6 +231,11 @@ std::unique_ptr<PhysicalToLogicalConverter>
PhysicalToLogicalConverter::get_conv
// for FixedSizeBinary
physical_converter =
std::make_unique<FixedSizeBinaryConverter>(parquet_schema.type_length);
+ } else if (src_logical_primitive == TYPE_FLOAT &&
+ src_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY &&
+ parquet_schema.logicalType.__isset.FLOAT16) {
+ physical_converter =
+
std::make_unique<Float16PhysicalConverter>(parquet_schema.type_length);
} else {
physical_converter =
std::make_unique<ConsistentPhysicalConverter>();
}
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h
b/be/src/vec/exec/format/parquet/parquet_column_convert.h
index b4749a3c829..e8b29131bdf 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.h
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h
@@ -20,6 +20,7 @@
#include <gen_cpp/parquet_types.h>
#include "common/cast_set.h"
+#include "runtime/primitive_type.h"
#include "vec/columns/column_varbinary.h"
#include "vec/core/extended_types.h"
#include "vec/core/field.h"
@@ -354,6 +355,89 @@ public:
}
};
+class Float16PhysicalConverter : public PhysicalToLogicalConverter {
+private:
+ int _type_length;
+
+public:
+ Float16PhysicalConverter(int type_length) : _type_length(type_length) {
+ DCHECK_EQ(_type_length, 2);
+ }
+
+ Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr&
src_logical_column) override {
+ ColumnPtr from_col = remove_nullable(src_physical_col);
+ MutableColumnPtr to_col =
remove_nullable(src_logical_column)->assume_mutable();
+
+ const auto* src_data = assert_cast<const ColumnUInt8*>(from_col.get());
+ size_t length = src_data->size();
+ size_t num_values = length / _type_length;
+ auto* to_float_column = assert_cast<ColumnFloat32*>(to_col.get());
+ size_t start_idx = to_float_column->size();
+ to_float_column->resize(start_idx + num_values);
+ auto& to_float_column_data = to_float_column->get_data();
+ const uint8_t* ptr = src_data->get_data().data();
+ for (int i = 0; i < num_values; ++i) {
+ size_t offset = i * _type_length;
+ const uint8_t* data_ptr = ptr + offset;
+ uint16_t raw;
+ memcpy(&raw, data_ptr, sizeof(uint16_t));
+ float value = half_to_float(raw);
+ to_float_column_data[start_idx + i] = value;
+ }
+
+ return Status::OK();
+ }
+
+ float half_to_float(uint16_t h) {
+ // uint16_t h: half precision floating point
+ // bit 15: sign(1 bit)
+ // bits 14..10 : exponent(5 bits)
+ // bits 9..0 : mantissa(10 bits)
+
+ // sign bit placed to float32 bit31
+ uint32_t sign = (h & 0x8000U) << 16; // 0x8000 << 16 = 0x8000_0000
+ // exponent:(5 bits)
+ uint32_t exp = (h & 0x7C00U) >> 10; // 0x7C00 = 0111 1100 0000 (half
exponent mask)
+ // mantissa(10 bits)
+ uint32_t mant = (h & 0x03FFU); // 10-bit fraction
+
+ // cases:Zero/Subnormal, Normal, Inf/NaN
+ if (exp == 0) {
+ // exp==0: Zero or Subnormal ----------
+ if (mant == 0) {
+ // ±0.0
+ // sign = either 0x00000000 or 0x80000000
+ return std::bit_cast<float>(sign);
+ } else {
+ // ---------- Subnormal ----------
+ // half subnormal:
+ // value = (-1)^sign * (mant / 2^10) * 2^(1 - bias)
+ // half bias = 15 → exponent = 1 - 15 = -14
+ float f = (static_cast<float>(mant) / 1024.0F) *
std::powf(2.0F, -14.0F);
+ return sign ? -f : f;
+ }
+ } else if (exp == 0x1F) {
+ // exp==31: Inf or NaN ----------
+ // float32:
+ // exponent = 255 (0xFF)
+ // mantissa = mant << 13
+ uint32_t f = sign | 0x7F800000U | (mant << 13);
+ return std::bit_cast<float>(f);
+ } else {
+ // Normalized ----------
+ // float32 exponent:
+ // exp32 = exp16 - bias16 + bias32
+ // bias16 = 15
+ // bias32 = 127
+ //
+ // so: exp32 = exp + (127 - 15)
+ uint32_t f = sign | ((exp + (127 - 15)) << 23) // place to float32
exponent
+ | (mant << 13); // mantissa align
to 23 bits
+ return std::bit_cast<float>(f);
+ }
+ }
+};
+
class UUIDVarBinaryConverter : public PhysicalToLogicalConverter {
public:
UUIDVarBinaryConverter(int type_length) : _type_length(type_length) {}
diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp
b/be/src/vec/exec/format/parquet/schema_desc.cpp
index 677898da6e7..a3ef4fb222b 100644
--- a/be/src/vec/exec/format/parquet/schema_desc.cpp
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -308,6 +308,8 @@ std::pair<DataTypePtr, bool>
FieldDescriptor::convert_to_doris_type(
} else if (logicalType.__isset.UUID) {
ans.first =
DataTypeFactory::instance().create_data_type(TYPE_VARBINARY,
nullable, -1, -1, 16);
+ } else if (logicalType.__isset.FLOAT16) {
+ ans.first = DataTypeFactory::instance().create_data_type(TYPE_FLOAT,
nullable);
} else {
throw Exception(Status::InternalError("Not supported parquet
logicalType"));
}
diff --git
a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
index 3cb34820fcc..0e21a8fad6f 100644
--- a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
+++ b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
@@ -46,16 +46,16 @@ apple_banana_mango9
\
-- !test_4 --
-H H 7.949828 7.949828 10.52116725573211
10.52116725573211 64983 64983 572141000000 572141000000 03125
03125 913.768 913.768
-I I 9.15677 9.15677 9.88245783267187 9.88245783267187
22192 22192 984601000000 984601000000 16947 16947 846.436 846.436
-'I 'I 10.33757 10.33757 9.820388586168541
9.820388586168541 24191 24191 293650000000 293650000000 03795
03795 1003.858 1003.858
-YH YH 9.951282 9.951282 9.734161208084622
9.734161208084622 8481 8481 65530000000 65530000000 04625
04625 1047.888 1047.888
-`I `I 10.09058 10.09058 10.82052847541742
10.82052847541742 7403 7403 51248000000 51248000000 01038
01038 1104.934 1104.934
-xI xI 10.64394 10.64394 9.606258827775427
9.606258827775427 79368 79368 246066000000 246066000000 11914
11914 932.398 932.398
-{H {H 11.40748 11.40748 10.19677609665696
10.19677609665696 41157 41157 41079000000 41079000000 00363
00363 968.825 968.825
-�H �H 8.781187 8.781187 10.82951904202407
10.82951904202407 55546 55546 455117000000 455117000000 17477
17477 1038.969 1038.969
-�H �H 9.121848 9.121848 8.006939628945844
8.006939628945844 54041 54041 263845000000 263845000000 16780
16780 1010.255 1010.255
-�H �H 9.665876 9.665876 8.703527671925238
8.703527671925238 80205 80205 516087000000 516087000000 05073
05073 985.256 985.256
+10.125 10.125 9.15677 9.15677 9.88245783267187 9.88245783267187
22192 22192 984601000000 984601000000 16947 16947 846.436 846.436
+10.30469 10.30469 10.33757 10.33757
9.820388586168541 9.820388586168541 24191 24191 293650000000
293650000000 03795 03795 1003.858 1003.858
+10.75 10.75 10.09058 10.09058 10.82052847541742
10.82052847541742 7403 7403 51248000000 51248000000 01038
01038 1104.934 1104.934
+10.9375 10.9375 10.64394 10.64394 9.606258827775427
9.606258827775427 79368 79368 246066000000 246066000000 11914
11914 932.398 932.398
+8.046875 8.046875 7.949828 7.949828
10.52116725573211 10.52116725573211 64983 64983 572141000000
572141000000 03125 03125 913.768 913.768
+8.695312 8.695312 9.951282 9.951282
9.734161208084622 9.734161208084622 8481 8481 65530000000
65530000000 04625 04625 1047.888 1047.888
+8.960938 8.960938 11.40748 11.40748
10.19677609665696 10.19677609665696 41157 41157 41079000000
41079000000 00363 00363 968.825 968.825
+9.148438 9.148438 9.665876 9.665876
8.703527671925238 8.703527671925238 80205 80205 516087000000
516087000000 05073 05073 985.256 985.256
+9.6875 9.6875 8.781187 8.781187 10.82951904202407
10.82951904202407 55546 55546 455117000000 455117000000 17477
17477 1038.969 1038.969
+9.984375 9.984375 9.121848 9.121848
8.006939628945844 8.006939628945844 54041 54041 263845000000
263845000000 16780 16780 1010.255 1010.255
-- !test_5 --
{"a":{1:1, 2:0}} 1 1
@@ -76,13 +76,13 @@ xI xI 10.64394 10.64394
9.606258827775427 9.606258827775427 79368 79368 24606600
-- !test_7 --
\N
-��
-�<
-�@
-�~
-��
-��
-��
+-0.0
+-1.0
+-2.0
+0.0
+1.0
+2.0
+NaN
-- !test_8 --
6 true 0 0 0 0 0.0 0 04/01/09
0 2009-04-01T08:00
@@ -351,8 +351,8 @@ abc 5 2 true [1, 2]
-- !test_39 --
\N
-��
-�~
+0.0
+NaN
-- !test_40 --
1.00
diff --git
a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_float16.out
b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_float16.out
new file mode 100644
index 00000000000..116f2f0d9bf
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_float16.out
@@ -0,0 +1,19 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !test_39 --
+\N
+0.0
+NaN
+
+-- !test_7 --
+\N
+-0.0
+-1.0
+-2.0
+0.0
+1.0
+2.0
+NaN
+
+-- !desc --
+x float Yes false \N NONE
+
diff --git
a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_float16.groovy
b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_float16.groovy
new file mode 100644
index 00000000000..0f2d933600d
--- /dev/null
+++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_float16.groovy
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
// Regression test: reading parquet FLOAT16 columns through the HDFS table-valued function.
suite("test_hdfs_tvf_float16","external,hive,tvf,external_docker") {
    String hdfs_port = context.config.otherConfigs.get("hive2HdfsPort")
    String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")

    // It's okay to use random `hdfsUser`, but can not be empty.
    def hdfsUserName = "doris"
    def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
    def uri = ""

    // Only meaningful when the Hive docker environment is available.
    String enabled = context.config.otherConfigs.get("enableHiveTest")
    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
        return
    }

    // FLOAT16 file whose values are only zeros and NaN.
    uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/float16_zeros_and_nans.parquet"
    order_qt_test_39 """ select * from HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "parquet") """

    // FLOAT16 file with non-zero values (signed zeros, +-1, +-2, NaN).
    uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/float16_nonzeros_and_nans.parquet"
    order_qt_test_7 """ select * from HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "parquet") """

    // FLOAT16 should be exposed to Doris as a nullable `float` column.
    order_qt_desc """ desc function HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "parquet") """
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]