This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
commit 8308bc96b93205f2a376ca3d427f923322d8f49e
Author: wuwenchi <[email protected]>
AuthorDate: Tue Jan 23 10:42:42 2024 +0800

    [fix](paimon)set timestamp's scale for parquet which has no logical type (#30119)
---
 .../exec/format/parquet/parquet_column_convert.h | 28 +++++++++++
 .../paimon/paimon_timestamp_types.out            | 13 +++++
 .../paimon/paimon_timestamp_types.groovy         | 55 ++++++++++++++++++++++
 3 files changed, 96 insertions(+)

diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h
index b5c3ffb7c88..39ee29f663f 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.h
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h
@@ -127,6 +127,30 @@ struct ConvertParams {
     DecimalScaleParams decimal_scale;
     FieldSchema* field_schema = nullptr;
 
+    /**
+     * Some frameworks such as Paimon may write non-standard parquet files: the timestamp field has
+     * neither a logicalType nor a converted_type to indicate its precision. We have to reset the time mask.
+     */
+    void reset_time_scale_if_missing(int scale) {
+        const auto& schema = field_schema->parquet_schema;
+        if (!schema.__isset.logicalType && !schema.__isset.converted_type) {
+            int ts_scale = 9;
+            if (scale <= 3) {
+                ts_scale = 3;
+            } else if (scale <= 6) {
+                ts_scale = 6;
+            }
+            second_mask = common::exp10_i64(ts_scale);
+            scale_to_nano_factor = common::exp10_i64(9 - ts_scale);
+
+            // The missing parquet metadata makes it impossible for us to know the time zone information,
+            // so we default to UTC here.
+            if (ctz == nullptr) {
+                ctz = const_cast<cctz::time_zone*>(&utc0);
+            }
+        }
+    }
+
     void init(FieldSchema* field_schema_, cctz::time_zone* ctz_) {
         field_schema = field_schema_;
         if (ctz_ != nullptr) {
@@ -671,8 +695,12 @@ inline Status get_converter(tparquet::Type::type parquet_physical_type, Primitiv
         break;
     case TypeIndex::DateTimeV2:
         if (tparquet::Type::INT96 == parquet_physical_type) {
+            // int96 only stores nanoseconds in a standard parquet file
+            convert_params->reset_time_scale_if_missing(9);
             *converter = std::make_unique<Int96toTimestamp>();
         } else if (tparquet::Type::INT64 == parquet_physical_type) {
+            convert_params->reset_time_scale_if_missing(
+                    remove_nullable(dst_data_type)->get_scale());
             *converter = std::make_unique<Int64ToTimestamp>();
         }
         break;
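For context, the new method buckets the destination column's scale into one of the three precisions parquet data can actually carry (millis, micros, nanos) and derives the two factors the reader uses afterwards. Below is a minimal standalone sketch of that bucketing; exp10_i64, TimeScaleParams and pick_time_scale are illustrative stand-ins (the real code uses common::exp10_i64 and stores the factors on ConvertParams), not the committed implementation:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for common::exp10_i64 (not part of this diff): 10^n as int64.
    static int64_t exp10_i64(int n) {
        int64_t v = 1;
        for (int i = 0; i < n; ++i) v *= 10;
        return v;
    }

    struct TimeScaleParams {
        int64_t second_mask = 0;          // 10^ts_scale: ticks per second at the chosen precision
        int64_t scale_to_nano_factor = 0; // multiplier from that precision up to nanoseconds
    };

    // Same bucketing as reset_time_scale_if_missing:
    // scale <= 3 -> millis, scale <= 6 -> micros, otherwise nanos.
    TimeScaleParams pick_time_scale(int scale) {
        int ts_scale = 9;
        if (scale <= 3) {
            ts_scale = 3;
        } else if (scale <= 6) {
            ts_scale = 6;
        }
        return {exp10_i64(ts_scale), exp10_i64(9 - ts_scale)};
    }

    int main() {
        assert(pick_time_scale(3).second_mask == 1000);            // millis
        assert(pick_time_scale(3).scale_to_nano_factor == 1000000);
        assert(pick_time_scale(6).second_mask == 1000000);          // micros
        assert(pick_time_scale(6).scale_to_nano_factor == 1000);
        assert(pick_time_scale(9).second_mask == 1000000000);       // nanos
        assert(pick_time_scale(9).scale_to_nano_factor == 1);
        return 0;
    }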
diff --git a/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
new file mode 100644
index 00000000000..641424b160e
--- /dev/null
+++ b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
@@ -0,0 +1,13 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !c1 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
+-- !c2 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
+-- !c3 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
+-- !c4 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
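The expected results above only come out right if the raw INT64 values are split at the correct precision. The converter itself is not part of this diff, but a plausible reading of the two factors is that second_mask separates whole seconds and scale_to_nano_factor lifts the remainder to nanoseconds; the sketch below illustrates that under this assumption (split_raw_timestamp and the sample value are hypothetical, not the actual Int64ToTimestamp code):

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    // Illustrative only: split a raw INT64 timestamp into seconds and nanoseconds
    // using the two factors set by reset_time_scale_if_missing. This is an assumption
    // about how the converter consumes them; Int64ToTimestamp is not shown in this diff.
    void split_raw_timestamp(int64_t raw, int64_t second_mask, int64_t scale_to_nano_factor,
                             int64_t* out_seconds, int64_t* out_nanos) {
        *out_seconds = raw / second_mask;
        *out_nanos = (raw % second_mask) * scale_to_nano_factor;
    }

    int main() {
        // A microsecond-precision value with no logicalType: 1706000000123456 us.
        const int64_t raw_micros = 1706000000123456LL;
        int64_t s = 0, ns = 0;

        // Correct factors for micros (scale 6): second_mask = 10^6, factor = 10^3.
        split_raw_timestamp(raw_micros, 1000000LL, 1000LL, &s, &ns);
        std::printf("micros interpretation: %" PRId64 " s, %" PRId64 " ns\n", s, ns);
        // -> 1706000000 s, 123456000 ns

        // If the factors do not match the data's actual precision (here: treated as millis),
        // the result is off by orders of magnitude, which is what the expected output guards against.
        split_raw_timestamp(raw_micros, 1000LL, 1000000LL, &s, &ns);
        std::printf("millis interpretation: %" PRId64 " s, %" PRId64 " ns\n", s, ns);
        // -> 1706000000123 s, 456000000 ns (clearly wrong)
        return 0;
    }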
diff --git a/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy b/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy
new file mode 100644
index 00000000000..dbb1f1d038c
--- /dev/null
+++ b/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("paimon_timestamp_types", "p2,external,paimon,external_remote,external_remote_paimon") {
+
+    def ts_orc = """select * from ts_orc"""
+    def ts_parquet = """select * from ts_parquet"""
+
+    String enabled = context.config.otherConfigs.get("enableExternalPaimonTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        String catalog_name = "paimon_timestamp_catalog"
+        String user_name = context.config.otherConfigs.get("extHiveHmsUser")
+        String hiveHost = context.config.otherConfigs.get("extHiveHmsHost")
+        String hivePort = context.config.otherConfigs.get("extHdfsPort")
+
+        sql """drop catalog if exists ${catalog_name};"""
+        sql """
+            create catalog if not exists ${catalog_name} properties (
+                "type" = "paimon",
+                "paimon.catalog.type" = "filesystem",
+                "warehouse" = "hdfs://${hiveHost}/${hivePort}/paimon/paimon1",
+                "hadoop.username" = "${user_name}"
+            );
+        """
+        logger.info("catalog " + catalog_name + " created")
+        sql """switch ${catalog_name};"""
+        logger.info("switched to catalog " + catalog_name)
+        sql """use db1;"""
+        logger.info("use db1")
+
+        sql """set force_jni_scanner=true"""
+        qt_c1 ts_orc
+        qt_c2 ts_parquet
+
+        sql """set force_jni_scanner=false"""
+        qt_c3 ts_orc
+        qt_c4 ts_parquet
+
+    }
+}
