This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 16b404a48fc branch-4.0: [bug](parquet) fix parquet type not handle
float16 type #58528 (#58630)
16b404a48fc is described below
commit 16b404a48fc21c3831e731e104feb966ae762494
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Jan 13 10:11:18 2026 +0800
branch-4.0: [bug](parquet) fix parquet type not handle float16 type #58528
(#58630)
Requires PR https://github.com/apache/doris/pull/57959 to be merged
before this change is picked to other branches.
Cherry-picked from #58528
Co-authored-by: zhangstar333 <[email protected]>
---
.../exec/format/parquet/parquet_column_convert.cpp | 5 ++
.../exec/format/parquet/parquet_column_convert.h | 84 ++++++++++++++++++++++
be/src/vec/exec/format/parquet/schema_desc.cpp | 2 +
.../tvf/test_hdfs_parquet_group0.out | 38 +++++-----
.../tvf/test_hdfs_tvf_float16.out | 19 +++++
.../tvf/test_hdfs_tvf_float16.groovy | 46 ++++++++++++
6 files changed, 175 insertions(+), 19 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
index d0b2d62d68c..d703d6b1a5a 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
@@ -231,6 +231,11 @@ std::unique_ptr<PhysicalToLogicalConverter>
PhysicalToLogicalConverter::get_conv
// for FixedSizeBinary
physical_converter =
std::make_unique<FixedSizeBinaryConverter>(parquet_schema.type_length);
+ } else if (src_logical_primitive == TYPE_FLOAT &&
+ src_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY &&
+ parquet_schema.logicalType.__isset.FLOAT16) {
+ physical_converter =
+
std::make_unique<Float16PhysicalConverter>(parquet_schema.type_length);
} else {
physical_converter =
std::make_unique<ConsistentPhysicalConverter>();
}
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h
b/be/src/vec/exec/format/parquet/parquet_column_convert.h
index b4749a3c829..e8b29131bdf 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.h
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h
@@ -20,6 +20,7 @@
#include <gen_cpp/parquet_types.h>
#include "common/cast_set.h"
+#include "runtime/primitive_type.h"
#include "vec/columns/column_varbinary.h"
#include "vec/core/extended_types.h"
#include "vec/core/field.h"
@@ -354,6 +355,89 @@ public:
}
};
+class Float16PhysicalConverter : public PhysicalToLogicalConverter {
+private:
+ int _type_length;
+
+public:
+ Float16PhysicalConverter(int type_length) : _type_length(type_length) {
+ DCHECK_EQ(_type_length, 2);
+ }
+
+ Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr&
src_logical_column) override {
+ ColumnPtr from_col = remove_nullable(src_physical_col);
+ MutableColumnPtr to_col =
remove_nullable(src_logical_column)->assume_mutable();
+
+ const auto* src_data = assert_cast<const ColumnUInt8*>(from_col.get());
+ size_t length = src_data->size();
+ size_t num_values = length / _type_length;
+ auto* to_float_column = assert_cast<ColumnFloat32*>(to_col.get());
+ size_t start_idx = to_float_column->size();
+ to_float_column->resize(start_idx + num_values);
+ auto& to_float_column_data = to_float_column->get_data();
+ const uint8_t* ptr = src_data->get_data().data();
+ for (int i = 0; i < num_values; ++i) {
+ size_t offset = i * _type_length;
+ const uint8_t* data_ptr = ptr + offset;
+ uint16_t raw;
+ memcpy(&raw, data_ptr, sizeof(uint16_t));
+ float value = half_to_float(raw);
+ to_float_column_data[start_idx + i] = value;
+ }
+
+ return Status::OK();
+ }
+
+ float half_to_float(uint16_t h) {
+ // uint16_t h: half precision floating point
+ // bit 15: sign(1 bit)
+ // bits 14..10 : exponent(5 bits)
+ // bits 9..0 : mantissa(10 bits)
+
+ // sign bit placed to float32 bit31
+ uint32_t sign = (h & 0x8000U) << 16; // 0x8000 << 16 = 0x8000_0000
+ // exponent:(5 bits)
+ uint32_t exp = (h & 0x7C00U) >> 10; // 0x7C00 = 0111 1100 0000 (half
exponent mask)
+ // mantissa(10 bits)
+ uint32_t mant = (h & 0x03FFU); // 10-bit fraction
+
+ // cases:Zero/Subnormal, Normal, Inf/NaN
+ if (exp == 0) {
+ // exp==0: Zero or Subnormal ----------
+ if (mant == 0) {
+ // ±0.0
+ // sign = either 0x00000000 or 0x80000000
+ return std::bit_cast<float>(sign);
+ } else {
+ // ---------- Subnormal ----------
+ // half subnormal:
+ // value = (-1)^sign * (mant / 2^10) * 2^(1 - bias)
+ // half bias = 15 → exponent = 1 - 15 = -14
+ float f = (static_cast<float>(mant) / 1024.0F) *
std::powf(2.0F, -14.0F);
+ return sign ? -f : f;
+ }
+ } else if (exp == 0x1F) {
+ // exp==31: Inf or NaN ----------
+ // float32:
+ // exponent = 255 (0xFF)
+ // mantissa = mant << 13
+ uint32_t f = sign | 0x7F800000U | (mant << 13);
+ return std::bit_cast<float>(f);
+ } else {
+ // Normalized ----------
+ // float32 exponent:
+ // exp32 = exp16 - bias16 + bias32
+ // bias16 = 15
+ // bias32 = 127
+ //
+ // so: exp32 = exp + (127 - 15)
+ uint32_t f = sign | ((exp + (127 - 15)) << 23) // place to float32
exponent
+ | (mant << 13); // mantissa align
to 23 bits
+ return std::bit_cast<float>(f);
+ }
+ }
+};
+
class UUIDVarBinaryConverter : public PhysicalToLogicalConverter {
public:
UUIDVarBinaryConverter(int type_length) : _type_length(type_length) {}
diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp
b/be/src/vec/exec/format/parquet/schema_desc.cpp
index 677898da6e7..a3ef4fb222b 100644
--- a/be/src/vec/exec/format/parquet/schema_desc.cpp
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -308,6 +308,8 @@ std::pair<DataTypePtr, bool>
FieldDescriptor::convert_to_doris_type(
} else if (logicalType.__isset.UUID) {
ans.first =
DataTypeFactory::instance().create_data_type(TYPE_VARBINARY,
nullable, -1, -1, 16);
+ } else if (logicalType.__isset.FLOAT16) {
+ ans.first = DataTypeFactory::instance().create_data_type(TYPE_FLOAT,
nullable);
} else {
throw Exception(Status::InternalError("Not supported parquet
logicalType"));
}
diff --git
a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
index 3cb34820fcc..0e21a8fad6f 100644
--- a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
+++ b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
@@ -46,16 +46,16 @@ apple_banana_mango9
\
-- !test_4 --
-H H 7.949828 7.949828 10.52116725573211
10.52116725573211 64983 64983 572141000000 572141000000 03125
03125 913.768 913.768
-I I 9.15677 9.15677 9.88245783267187 9.88245783267187
22192 22192 984601000000 984601000000 16947 16947 846.436 846.436
-'I 'I 10.33757 10.33757 9.820388586168541
9.820388586168541 24191 24191 293650000000 293650000000 03795
03795 1003.858 1003.858
-YH YH 9.951282 9.951282 9.734161208084622
9.734161208084622 8481 8481 65530000000 65530000000 04625
04625 1047.888 1047.888
-`I `I 10.09058 10.09058 10.82052847541742
10.82052847541742 7403 7403 51248000000 51248000000 01038
01038 1104.934 1104.934
-xI xI 10.64394 10.64394 9.606258827775427
9.606258827775427 79368 79368 246066000000 246066000000 11914
11914 932.398 932.398
-{H {H 11.40748 11.40748 10.19677609665696
10.19677609665696 41157 41157 41079000000 41079000000 00363
00363 968.825 968.825
-�H �H 8.781187 8.781187 10.82951904202407
10.82951904202407 55546 55546 455117000000 455117000000 17477
17477 1038.969 1038.969
-�H �H 9.121848 9.121848 8.006939628945844
8.006939628945844 54041 54041 263845000000 263845000000 16780
16780 1010.255 1010.255
-�H �H 9.665876 9.665876 8.703527671925238
8.703527671925238 80205 80205 516087000000 516087000000 05073
05073 985.256 985.256
+10.125 10.125 9.15677 9.15677 9.88245783267187 9.88245783267187
22192 22192 984601000000 984601000000 16947 16947 846.436 846.436
+10.30469 10.30469 10.33757 10.33757
9.820388586168541 9.820388586168541 24191 24191 293650000000
293650000000 03795 03795 1003.858 1003.858
+10.75 10.75 10.09058 10.09058 10.82052847541742
10.82052847541742 7403 7403 51248000000 51248000000 01038
01038 1104.934 1104.934
+10.9375 10.9375 10.64394 10.64394 9.606258827775427
9.606258827775427 79368 79368 246066000000 246066000000 11914
11914 932.398 932.398
+8.046875 8.046875 7.949828 7.949828
10.52116725573211 10.52116725573211 64983 64983 572141000000
572141000000 03125 03125 913.768 913.768
+8.695312 8.695312 9.951282 9.951282
9.734161208084622 9.734161208084622 8481 8481 65530000000
65530000000 04625 04625 1047.888 1047.888
+8.960938 8.960938 11.40748 11.40748
10.19677609665696 10.19677609665696 41157 41157 41079000000
41079000000 00363 00363 968.825 968.825
+9.148438 9.148438 9.665876 9.665876
8.703527671925238 8.703527671925238 80205 80205 516087000000
516087000000 05073 05073 985.256 985.256
+9.6875 9.6875 8.781187 8.781187 10.82951904202407
10.82951904202407 55546 55546 455117000000 455117000000 17477
17477 1038.969 1038.969
+9.984375 9.984375 9.121848 9.121848
8.006939628945844 8.006939628945844 54041 54041 263845000000
263845000000 16780 16780 1010.255 1010.255
-- !test_5 --
{"a":{1:1, 2:0}} 1 1
@@ -76,13 +76,13 @@ xI xI 10.64394 10.64394
9.606258827775427 9.606258827775427 79368 79368 24606600
-- !test_7 --
\N
-��
-�<
-�@
-�~
-��
-��
-��
+-0.0
+-1.0
+-2.0
+0.0
+1.0
+2.0
+NaN
-- !test_8 --
6 true 0 0 0 0 0.0 0 04/01/09
0 2009-04-01T08:00
@@ -351,8 +351,8 @@ abc 5 2 true [1, 2]
-- !test_39 --
\N
-��
-�~
+0.0
+NaN
-- !test_40 --
1.00
diff --git
a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_float16.out
b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_float16.out
new file mode 100644
index 00000000000..116f2f0d9bf
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf_float16.out
@@ -0,0 +1,19 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !test_39 --
+\N
+0.0
+NaN
+
+-- !test_7 --
+\N
+-0.0
+-1.0
+-2.0
+0.0
+1.0
+2.0
+NaN
+
+-- !desc --
+x float Yes false \N NONE
+
diff --git
a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_float16.groovy
b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_float16.groovy
new file mode 100644
index 00000000000..0f2d933600d
--- /dev/null
+++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_float16.groovy
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
// Regression test: reading parquet FLOAT16 columns through the HDFS table-valued function.
suite("test_hdfs_tvf_float16","external,hive,tvf,external_docker") {
    String hdfs_port = context.config.otherConfigs.get("hive2HdfsPort")
    String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")

    // It's okay to use random `hdfsUser`, but can not be empty.
    def hdfsUserName = "doris"
    def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
    def uri = ""

    // Only meaningful when the Hive docker environment is available.
    String enabled = context.config.otherConfigs.get("enableHiveTest")
    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
        return
    }

    // FLOAT16 file whose values are only zeros and NaN.
    uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/float16_zeros_and_nans.parquet"
    order_qt_test_39 """ select * from HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "parquet") """

    // FLOAT16 file with non-zero values (signed zeros, +-1, +-2, NaN).
    uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/float16_nonzeros_and_nans.parquet"
    order_qt_test_7 """ select * from HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "parquet") """

    // FLOAT16 should be exposed to Doris as a nullable `float` column.
    order_qt_desc """ desc function HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "parquet") """
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]