Gabriel39 commented on code in PR #63444:
URL: https://github.com/apache/doris/pull/63444#discussion_r3286070321


##########
be/src/format/new_parquet/column_reader.cpp:
##########
@@ -0,0 +1,1191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/column_reader.h"
+
+#include <arrow/array/array_binary.h>
+#include <parquet/api/reader.h>
+#include <parquet/api/schema.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "core/column/column.h"
+#include "core/column/column_decimal.h"
+#include "core/column/column_struct.h"
+#include "core/data_type/data_type_array.h"
+#include "core/data_type/data_type_factory.hpp"
+#include "core/data_type/data_type_map.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type/data_type_number.h"
+#include "core/data_type/data_type_string.h"
+#include "core/data_type/data_type_struct.h"
+#include "core/value/vdatetime_value.h"
+#include "format/new_parquet/parquet_column_schema.h"
+
+namespace doris::parquet {
+namespace {
+
+DataTypePtr make_nullable_if_needed(DataTypePtr type, const 
::parquet::ColumnDescriptor* column) {
+    if (type != nullptr && column != nullptr && column->max_definition_level() 
> 0) {
+        return make_nullable(type);
+    }
+    return type;
+}
+
+DataTypePtr create_type(PrimitiveType type, bool nullable, int precision = 0, 
int scale = 0) {
+    return DataTypeFactory::instance().create_data_type(type, nullable, 
precision, scale);
+}
+
+bool has_non_physical_annotation(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() != ::parquet::ConvertedType::NONE ||
+           (logical_type != nullptr && logical_type->is_valid() && 
!logical_type->is_none());
+}
+
+bool is_decimal_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == ::parquet::ConvertedType::DECIMAL ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_decimal());
+}
+
+bool is_timestamp_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MILLIS ||
+           column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MICROS ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_timestamp());
+}
+
+bool is_string_like_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr || is_decimal_column(column)) {

Review Comment:
   这里为什么要加 is_decimal_column 的判断?



##########
be/src/format/new_parquet/column_reader.cpp:
##########
@@ -0,0 +1,1191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/column_reader.h"
+
+#include <arrow/array/array_binary.h>
+#include <parquet/api/reader.h>
+#include <parquet/api/schema.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "core/column/column.h"
+#include "core/column/column_decimal.h"
+#include "core/column/column_struct.h"
+#include "core/data_type/data_type_array.h"
+#include "core/data_type/data_type_factory.hpp"
+#include "core/data_type/data_type_map.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type/data_type_number.h"
+#include "core/data_type/data_type_string.h"
+#include "core/data_type/data_type_struct.h"
+#include "core/value/vdatetime_value.h"
+#include "format/new_parquet/parquet_column_schema.h"
+
+namespace doris::parquet {
+namespace {
+
+DataTypePtr make_nullable_if_needed(DataTypePtr type, const 
::parquet::ColumnDescriptor* column) {
+    if (type != nullptr && column != nullptr && column->max_definition_level() 
> 0) {
+        return make_nullable(type);
+    }
+    return type;
+}
+
+DataTypePtr create_type(PrimitiveType type, bool nullable, int precision = 0, 
int scale = 0) {
+    return DataTypeFactory::instance().create_data_type(type, nullable, 
precision, scale);
+}
+
+bool has_non_physical_annotation(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() != ::parquet::ConvertedType::NONE ||
+           (logical_type != nullptr && logical_type->is_valid() && 
!logical_type->is_none());
+}
+
+bool is_decimal_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == ::parquet::ConvertedType::DECIMAL ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_decimal());
+}
+
+bool is_timestamp_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MILLIS ||
+           column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MICROS ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_timestamp());
+}
+
+bool is_string_like_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr || is_decimal_column(column)) {
+        return false;
+    }
+    return column->physical_type() == ::parquet::Type::BYTE_ARRAY ||
+           column->physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY;
+}
+
+PrimitiveType decimal_primitive_type(int precision) {
+    return precision > 38 ? TYPE_DECIMAL256 : TYPE_DECIMAL128I;
+}
+
+DataTypePtr converted_type_to_doris_type(const ::parquet::ColumnDescriptor* 
column) {
+    switch (column->converted_type()) {
+    case ::parquet::ConvertedType::UTF8:
+    case ::parquet::ConvertedType::ENUM:
+    case ::parquet::ConvertedType::JSON:
+    case ::parquet::ConvertedType::BSON:
+        return create_type(TYPE_STRING, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::DECIMAL:
+        return create_type(decimal_primitive_type(column->type_precision()),
+                           column->max_definition_level() > 0, 
column->type_precision(),
+                           column->type_scale());
+    case ::parquet::ConvertedType::DATE:
+        return create_type(TYPE_DATEV2, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::TIME_MILLIS:
+        return create_type(TYPE_TIMEV2, column->max_definition_level() > 0, 0, 
3);
+    case ::parquet::ConvertedType::TIME_MICROS:
+        return create_type(TYPE_TIMEV2, column->max_definition_level() > 0, 0, 
6);
+    case ::parquet::ConvertedType::TIMESTAMP_MILLIS:
+        return create_type(TYPE_DATETIMEV2, column->max_definition_level() > 
0, 0, 3);
+    case ::parquet::ConvertedType::TIMESTAMP_MICROS:
+        return create_type(TYPE_DATETIMEV2, column->max_definition_level() > 
0, 0, 6);
+    case ::parquet::ConvertedType::INT_8:
+        return create_type(TYPE_TINYINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_8:
+    case ::parquet::ConvertedType::INT_16:
+        return create_type(TYPE_SMALLINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_16:
+    case ::parquet::ConvertedType::INT_32:
+        return create_type(TYPE_INT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_32:
+    case ::parquet::ConvertedType::INT_64:
+        return create_type(TYPE_BIGINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_64:
+        return create_type(TYPE_LARGEINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::NONE:
+    default:
+        return nullptr;
+    }
+}
+
+DataTypePtr logical_type_to_doris_type(const ::parquet::ColumnDescriptor* 
column) {

Review Comment:
   能不能把 convert type 的逻辑都统一到这里来?



##########
be/src/format/new_parquet/column_reader.cpp:
##########
@@ -0,0 +1,1191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/column_reader.h"
+
+#include <arrow/array/array_binary.h>
+#include <parquet/api/reader.h>
+#include <parquet/api/schema.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "core/column/column.h"
+#include "core/column/column_decimal.h"
+#include "core/column/column_struct.h"
+#include "core/data_type/data_type_array.h"
+#include "core/data_type/data_type_factory.hpp"
+#include "core/data_type/data_type_map.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type/data_type_number.h"
+#include "core/data_type/data_type_string.h"
+#include "core/data_type/data_type_struct.h"
+#include "core/value/vdatetime_value.h"
+#include "format/new_parquet/parquet_column_schema.h"
+
+namespace doris::parquet {
+namespace {
+
+DataTypePtr make_nullable_if_needed(DataTypePtr type, const 
::parquet::ColumnDescriptor* column) {
+    if (type != nullptr && column != nullptr && column->max_definition_level() 
> 0) {
+        return make_nullable(type);
+    }
+    return type;
+}
+
+DataTypePtr create_type(PrimitiveType type, bool nullable, int precision = 0, 
int scale = 0) {
+    return DataTypeFactory::instance().create_data_type(type, nullable, 
precision, scale);
+}
+
+bool has_non_physical_annotation(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() != ::parquet::ConvertedType::NONE ||
+           (logical_type != nullptr && logical_type->is_valid() && 
!logical_type->is_none());
+}
+
+bool is_decimal_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == ::parquet::ConvertedType::DECIMAL ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_decimal());
+}
+
+bool is_timestamp_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MILLIS ||
+           column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MICROS ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_timestamp());
+}
+
+bool is_string_like_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr || is_decimal_column(column)) {
+        return false;
+    }
+    return column->physical_type() == ::parquet::Type::BYTE_ARRAY ||
+           column->physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY;
+}
+
+PrimitiveType decimal_primitive_type(int precision) {
+    return precision > 38 ? TYPE_DECIMAL256 : TYPE_DECIMAL128I;
+}
+
+DataTypePtr converted_type_to_doris_type(const ::parquet::ColumnDescriptor* 
column) {
+    switch (column->converted_type()) {
+    case ::parquet::ConvertedType::UTF8:
+    case ::parquet::ConvertedType::ENUM:
+    case ::parquet::ConvertedType::JSON:
+    case ::parquet::ConvertedType::BSON:
+        return create_type(TYPE_STRING, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::DECIMAL:
+        return create_type(decimal_primitive_type(column->type_precision()),
+                           column->max_definition_level() > 0, 
column->type_precision(),
+                           column->type_scale());
+    case ::parquet::ConvertedType::DATE:
+        return create_type(TYPE_DATEV2, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::TIME_MILLIS:
+        return create_type(TYPE_TIMEV2, column->max_definition_level() > 0, 0, 
3);
+    case ::parquet::ConvertedType::TIME_MICROS:
+        return create_type(TYPE_TIMEV2, column->max_definition_level() > 0, 0, 
6);
+    case ::parquet::ConvertedType::TIMESTAMP_MILLIS:
+        return create_type(TYPE_DATETIMEV2, column->max_definition_level() > 
0, 0, 3);
+    case ::parquet::ConvertedType::TIMESTAMP_MICROS:
+        return create_type(TYPE_DATETIMEV2, column->max_definition_level() > 
0, 0, 6);
+    case ::parquet::ConvertedType::INT_8:
+        return create_type(TYPE_TINYINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_8:
+    case ::parquet::ConvertedType::INT_16:
+        return create_type(TYPE_SMALLINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_16:
+    case ::parquet::ConvertedType::INT_32:
+        return create_type(TYPE_INT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_32:
+    case ::parquet::ConvertedType::INT_64:
+        return create_type(TYPE_BIGINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_64:
+        return create_type(TYPE_LARGEINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::NONE:
+    default:
+        return nullptr;
+    }
+}
+
+DataTypePtr logical_type_to_doris_type(const ::parquet::ColumnDescriptor* 
column) {
+    const auto& logical_type = column->logical_type();
+    if (logical_type == nullptr || !logical_type->is_valid() || 
logical_type->is_none()) {
+        return nullptr;
+    }
+    const bool nullable = column->max_definition_level() > 0;
+    if (logical_type->is_string() || logical_type->is_enum() || 
logical_type->is_JSON() ||
+        logical_type->is_BSON() || logical_type->is_UUID()) {
+        return create_type(TYPE_STRING, nullable);
+    }
+    if (logical_type->is_decimal()) {
+        const auto& decimal_type = static_cast<const 
::parquet::DecimalLogicalType&>(*logical_type);
+        return create_type(decimal_primitive_type(decimal_type.precision()), 
nullable,
+                           decimal_type.precision(), decimal_type.scale());
+    }
+    if (logical_type->is_date()) {
+        return create_type(TYPE_DATEV2, nullable);
+    }
+    if (logical_type->is_time()) {
+        const auto& time_type = static_cast<const 
::parquet::TimeLogicalType&>(*logical_type);
+        int scale = 0;
+        if (time_type.time_unit() == ::parquet::LogicalType::TimeUnit::MILLIS) 
{
+            scale = 3;
+        } else if (time_type.time_unit() == 
::parquet::LogicalType::TimeUnit::MICROS) {
+            scale = 6;
+        } else {
+            return nullptr;
+        }
+        return create_type(TYPE_TIMEV2, nullable, 0, scale);
+    }
+    if (logical_type->is_timestamp()) {
+        const auto& timestamp_type =
+                static_cast<const 
::parquet::TimestampLogicalType&>(*logical_type);
+        int scale = 0;
+        if (timestamp_type.time_unit() == 
::parquet::LogicalType::TimeUnit::MILLIS) {
+            scale = 3;
+        } else if (timestamp_type.time_unit() == 
::parquet::LogicalType::TimeUnit::MICROS) {
+            scale = 6;
+        } else {
+            return nullptr;
+        }
+        return create_type(TYPE_DATETIMEV2, nullable, 0, scale);
+    }
+    if (logical_type->is_int()) {
+        const auto& int_type = static_cast<const 
::parquet::IntLogicalType&>(*logical_type);
+        switch (int_type.bit_width()) {
+        case 8:
+            return create_type(int_type.is_signed() ? TYPE_TINYINT : 
TYPE_SMALLINT, nullable);
+        case 16:
+            return create_type(int_type.is_signed() ? TYPE_SMALLINT : 
TYPE_INT, nullable);
+        case 32:
+            return create_type(int_type.is_signed() ? TYPE_INT : TYPE_BIGINT, 
nullable);
+        case 64:
+            return create_type(int_type.is_signed() ? TYPE_BIGINT : 
TYPE_LARGEINT, nullable);
+        default:
+            return nullptr;
+        }
+    }
+    return nullptr;
+}
+
+DataTypePtr direct_flat_primitive_doris_type(const 
::parquet::ColumnDescriptor* column) {
+    if (column == nullptr || column->max_repetition_level() != 0 ||
+        column->max_definition_level() > 1 || 
has_non_physical_annotation(column)) {
+        return nullptr;
+    }
+
+    const bool nullable = column->max_definition_level() > 0;
+    switch (column->physical_type()) {
+    case ::parquet::Type::BOOLEAN:
+        return create_type(TYPE_BOOLEAN, nullable);
+    case ::parquet::Type::INT32:
+        return create_type(TYPE_INT, nullable);
+    case ::parquet::Type::INT64:
+        return create_type(TYPE_BIGINT, nullable);
+    case ::parquet::Type::FLOAT:
+        return create_type(TYPE_FLOAT, nullable);
+    case ::parquet::Type::DOUBLE:
+        return create_type(TYPE_DOUBLE, nullable);
+    default:
+        return nullptr;
+    }
+}
+
+bool supports_record_reader(const ::parquet::ColumnDescriptor* descriptor) {
+    if (descriptor == nullptr || descriptor->max_repetition_level() != 0 ||
+        descriptor->max_definition_level() > 1) {
+        return false;
+    }
+    switch (descriptor->physical_type()) {
+    case ::parquet::Type::BOOLEAN:
+    case ::parquet::Type::INT32:
+    case ::parquet::Type::INT64:
+    case ::parquet::Type::FLOAT:
+    case ::parquet::Type::DOUBLE:
+    case ::parquet::Type::BYTE_ARRAY:
+    case ::parquet::Type::FIXED_LEN_BYTE_ARRAY:
+        return true;
+    default:
+        return false;
+    }
+}
+
+class PrimitiveColumnReader final : public ParquetColumnReader {

Review Comment:
   建议改名为 ScalarColumnReader?



##########
be/src/format/new_parquet/column_reader.cpp:
##########
@@ -0,0 +1,1191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/column_reader.h"
+
+#include <arrow/array/array_binary.h>
+#include <parquet/api/reader.h>
+#include <parquet/api/schema.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "core/column/column.h"
+#include "core/column/column_decimal.h"
+#include "core/column/column_struct.h"
+#include "core/data_type/data_type_array.h"
+#include "core/data_type/data_type_factory.hpp"
+#include "core/data_type/data_type_map.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type/data_type_number.h"
+#include "core/data_type/data_type_string.h"
+#include "core/data_type/data_type_struct.h"
+#include "core/value/vdatetime_value.h"
+#include "format/new_parquet/parquet_column_schema.h"
+
+namespace doris::parquet {
+namespace {
+
+DataTypePtr make_nullable_if_needed(DataTypePtr type, const 
::parquet::ColumnDescriptor* column) {
+    if (type != nullptr && column != nullptr && column->max_definition_level() 
> 0) {
+        return make_nullable(type);
+    }
+    return type;
+}
+
+DataTypePtr create_type(PrimitiveType type, bool nullable, int precision = 0, 
int scale = 0) {
+    return DataTypeFactory::instance().create_data_type(type, nullable, 
precision, scale);
+}
+
+bool has_non_physical_annotation(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() != ::parquet::ConvertedType::NONE ||
+           (logical_type != nullptr && logical_type->is_valid() && 
!logical_type->is_none());
+}
+
+bool is_decimal_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == ::parquet::ConvertedType::DECIMAL ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_decimal());
+}
+
+bool is_timestamp_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr) {
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MILLIS ||
+           column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MICROS ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_timestamp());
+}
+
+bool is_string_like_column(const ::parquet::ColumnDescriptor* column) {
+    if (column == nullptr || is_decimal_column(column)) {
+        return false;
+    }
+    return column->physical_type() == ::parquet::Type::BYTE_ARRAY ||
+           column->physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY;
+}
+
+PrimitiveType decimal_primitive_type(int precision) {
+    return precision > 38 ? TYPE_DECIMAL256 : TYPE_DECIMAL128I;
+}
+
+DataTypePtr converted_type_to_doris_type(const ::parquet::ColumnDescriptor* 
column) {
+    switch (column->converted_type()) {
+    case ::parquet::ConvertedType::UTF8:
+    case ::parquet::ConvertedType::ENUM:
+    case ::parquet::ConvertedType::JSON:
+    case ::parquet::ConvertedType::BSON:
+        return create_type(TYPE_STRING, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::DECIMAL:
+        return create_type(decimal_primitive_type(column->type_precision()),
+                           column->max_definition_level() > 0, 
column->type_precision(),
+                           column->type_scale());
+    case ::parquet::ConvertedType::DATE:
+        return create_type(TYPE_DATEV2, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::TIME_MILLIS:
+        return create_type(TYPE_TIMEV2, column->max_definition_level() > 0, 0, 
3);
+    case ::parquet::ConvertedType::TIME_MICROS:
+        return create_type(TYPE_TIMEV2, column->max_definition_level() > 0, 0, 
6);
+    case ::parquet::ConvertedType::TIMESTAMP_MILLIS:
+        return create_type(TYPE_DATETIMEV2, column->max_definition_level() > 
0, 0, 3);
+    case ::parquet::ConvertedType::TIMESTAMP_MICROS:
+        return create_type(TYPE_DATETIMEV2, column->max_definition_level() > 
0, 0, 6);
+    case ::parquet::ConvertedType::INT_8:
+        return create_type(TYPE_TINYINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_8:
+    case ::parquet::ConvertedType::INT_16:
+        return create_type(TYPE_SMALLINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_16:
+    case ::parquet::ConvertedType::INT_32:
+        return create_type(TYPE_INT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_32:
+    case ::parquet::ConvertedType::INT_64:
+        return create_type(TYPE_BIGINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::UINT_64:
+        return create_type(TYPE_LARGEINT, column->max_definition_level() > 0);
+    case ::parquet::ConvertedType::NONE:
+    default:
+        return nullptr;
+    }
+}
+
+DataTypePtr logical_type_to_doris_type(const ::parquet::ColumnDescriptor* 
column) {
+    const auto& logical_type = column->logical_type();
+    if (logical_type == nullptr || !logical_type->is_valid() || 
logical_type->is_none()) {
+        return nullptr;
+    }
+    const bool nullable = column->max_definition_level() > 0;
+    if (logical_type->is_string() || logical_type->is_enum() || 
logical_type->is_JSON() ||
+        logical_type->is_BSON() || logical_type->is_UUID()) {
+        return create_type(TYPE_STRING, nullable);
+    }
+    if (logical_type->is_decimal()) {
+        const auto& decimal_type = static_cast<const 
::parquet::DecimalLogicalType&>(*logical_type);
+        return create_type(decimal_primitive_type(decimal_type.precision()), 
nullable,
+                           decimal_type.precision(), decimal_type.scale());
+    }
+    if (logical_type->is_date()) {
+        return create_type(TYPE_DATEV2, nullable);
+    }
+    if (logical_type->is_time()) {
+        const auto& time_type = static_cast<const 
::parquet::TimeLogicalType&>(*logical_type);
+        int scale = 0;
+        if (time_type.time_unit() == ::parquet::LogicalType::TimeUnit::MILLIS) 
{
+            scale = 3;
+        } else if (time_type.time_unit() == 
::parquet::LogicalType::TimeUnit::MICROS) {
+            scale = 6;
+        } else {
+            return nullptr;
+        }
+        return create_type(TYPE_TIMEV2, nullable, 0, scale);
+    }
+    if (logical_type->is_timestamp()) {
+        const auto& timestamp_type =
+                static_cast<const 
::parquet::TimestampLogicalType&>(*logical_type);
+        int scale = 0;
+        if (timestamp_type.time_unit() == 
::parquet::LogicalType::TimeUnit::MILLIS) {
+            scale = 3;
+        } else if (timestamp_type.time_unit() == 
::parquet::LogicalType::TimeUnit::MICROS) {
+            scale = 6;
+        } else {
+            return nullptr;
+        }
+        return create_type(TYPE_DATETIMEV2, nullable, 0, scale);
+    }
+    if (logical_type->is_int()) {
+        const auto& int_type = static_cast<const 
::parquet::IntLogicalType&>(*logical_type);
+        switch (int_type.bit_width()) {
+        case 8:
+            return create_type(int_type.is_signed() ? TYPE_TINYINT : 
TYPE_SMALLINT, nullable);
+        case 16:
+            return create_type(int_type.is_signed() ? TYPE_SMALLINT : 
TYPE_INT, nullable);
+        case 32:
+            return create_type(int_type.is_signed() ? TYPE_INT : TYPE_BIGINT, 
nullable);
+        case 64:
+            return create_type(int_type.is_signed() ? TYPE_BIGINT : 
TYPE_LARGEINT, nullable);
+        default:
+            return nullptr;
+        }
+    }
+    return nullptr;
+}
+
+DataTypePtr direct_flat_primitive_doris_type(const 
::parquet::ColumnDescriptor* column) {
+    if (column == nullptr || column->max_repetition_level() != 0 ||
+        column->max_definition_level() > 1 || 
has_non_physical_annotation(column)) {
+        return nullptr;
+    }
+
+    const bool nullable = column->max_definition_level() > 0;
+    switch (column->physical_type()) {
+    case ::parquet::Type::BOOLEAN:
+        return create_type(TYPE_BOOLEAN, nullable);
+    case ::parquet::Type::INT32:
+        return create_type(TYPE_INT, nullable);
+    case ::parquet::Type::INT64:
+        return create_type(TYPE_BIGINT, nullable);
+    case ::parquet::Type::FLOAT:
+        return create_type(TYPE_FLOAT, nullable);
+    case ::parquet::Type::DOUBLE:
+        return create_type(TYPE_DOUBLE, nullable);
+    default:
+        return nullptr;
+    }
+}
+
+bool supports_record_reader(const ::parquet::ColumnDescriptor* descriptor) {
+    if (descriptor == nullptr || descriptor->max_repetition_level() != 0 ||
+        descriptor->max_definition_level() > 1) {
+        return false;
+    }
+    switch (descriptor->physical_type()) {
+    case ::parquet::Type::BOOLEAN:
+    case ::parquet::Type::INT32:
+    case ::parquet::Type::INT64:
+    case ::parquet::Type::FLOAT:
+    case ::parquet::Type::DOUBLE:
+    case ::parquet::Type::BYTE_ARRAY:
+    case ::parquet::Type::FIXED_LEN_BYTE_ARRAY:
+        return true;
+    default:
+        return false;
+    }
+}
+
+class PrimitiveColumnReader final : public ParquetColumnReader {
+public:
+    PrimitiveColumnReader(int file_column_id, const 
::parquet::ColumnDescriptor* descriptor,
+                          DataTypePtr type, std::string name,
+                          std::shared_ptr<::parquet::internal::RecordReader> 
record_reader)
+            : _file_column_id(file_column_id),
+              _parquet_column_ordinal(file_column_id),
+              _descriptor(descriptor),
+              _type(std::move(type)),
+              _name(std::move(name)),
+              _record_reader(std::move(record_reader)) {}
+
+    int file_column_id() const override { return _file_column_id; }
+    int parquet_column_ordinal() const override { return 
_parquet_column_ordinal; }
+    const DataTypePtr& type() const override { return _type; }
+    const std::string& name() const override { return _name; }
+
+    Status read_batch(int64_t batch_rows, MutableColumnPtr* result_column,
+                      int64_t* rows_read) override;
+    Status skip(int64_t rows) override;
+    Status read_selected(const std::vector<uint16_t>& selection, uint16_t 
selected_rows,
+                         int64_t batch_rows, MutableColumnPtr* result_column) 
override;
+
+    const ::parquet::ColumnDescriptor* descriptor() const { return 
_descriptor; }
+    const std::shared_ptr<::parquet::internal::RecordReader>& record_reader() 
const {
+        return _record_reader;
+    }
+
+private:
+    int _file_column_id = -1;
+    int _parquet_column_ordinal = -1;

Review Comment:
   _file_column_id 和 _parquet_column_ordinal 的区别是什么?构造函数里两者都用 file_column_id 初始化,请加注释说明各自的含义。



##########
be/src/format/new_parquet/column_reader.cpp:
##########
@@ -0,0 +1,1191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/column_reader.h"
+
+#include <arrow/array/array_binary.h>
+#include <parquet/api/reader.h>
+#include <parquet/api/schema.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "core/column/column.h"
+#include "core/column/column_decimal.h"
+#include "core/column/column_struct.h"
+#include "core/data_type/data_type_array.h"
+#include "core/data_type/data_type_factory.hpp"
+#include "core/data_type/data_type_map.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type/data_type_number.h"
+#include "core/data_type/data_type_string.h"
+#include "core/data_type/data_type_struct.h"
+#include "core/value/vdatetime_value.h"
+#include "format/new_parquet/parquet_column_schema.h"
+
+namespace doris::parquet {
+namespace {
+
+DataTypePtr make_nullable_if_needed(DataTypePtr type, const 
::parquet::ColumnDescriptor* column) {  // Returns make_nullable(type) when the column's max definition level is positive.
+    if (type != nullptr && column != nullptr && column->max_definition_level() 
> 0) {
+        return make_nullable(type);
+    }
+    return type;  // Null type, null column, or max definition level 0: type is returned unchanged.
+}
+
+DataTypePtr create_type(PrimitiveType type, bool nullable, int precision = 0, 
int scale = 0) {  // Forwards all four arguments to DataTypeFactory::create_data_type; precision and scale default to 0.
+    return DataTypeFactory::instance().create_data_type(type, nullable, 
precision, scale);
+}
+
+bool has_non_physical_annotation(const ::parquet::ColumnDescriptor* column) {  // True when the column has a converted type other than NONE, or a valid logical type that is not none.
+    if (column == nullptr) {  // A null descriptor yields false.
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() != ::parquet::ConvertedType::NONE ||
+           (logical_type != nullptr && logical_type->is_valid() && 
!logical_type->is_none());  // The logical type is consulted only when it is non-null and valid.
+}
+
+bool is_decimal_column(const ::parquet::ColumnDescriptor* column) {  // True when the converted type is DECIMAL or a valid logical type reports is_decimal().
+    if (column == nullptr) {  // A null descriptor yields false.
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == ::parquet::ConvertedType::DECIMAL ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_decimal());  // The logical type is consulted only when it is non-null and valid.
+}
+
+bool is_timestamp_column(const ::parquet::ColumnDescriptor* column) {  // True when the converted type is TIMESTAMP_MILLIS or TIMESTAMP_MICROS, or a valid logical type reports is_timestamp().
+    if (column == nullptr) {  // A null descriptor yields false.
+        return false;
+    }
+    const auto& logical_type = column->logical_type();
+    return column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MILLIS ||
+           column->converted_type() == 
::parquet::ConvertedType::TIMESTAMP_MICROS ||
+           (logical_type != nullptr && logical_type->is_valid() && 
logical_type->is_timestamp());  // The logical type is consulted only when it is non-null and valid.
+}
+
+bool is_string_like_column(const ::parquet::ColumnDescriptor* column) {  // True when the physical type is BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY and the column is not a decimal.
+    if (column == nullptr || is_decimal_column(column)) {  // Null descriptors and columns accepted by is_decimal_column() yield false.
+        return false;
+    }
+    return column->physical_type() == ::parquet::Type::BYTE_ARRAY ||
+           column->physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY;
+}
+
+PrimitiveType decimal_primitive_type(int precision) {  // Picks the decimal primitive type from the precision alone.
+    return precision > 38 ? TYPE_DECIMAL256 : TYPE_DECIMAL128I;  // Above 38 gives TYPE_DECIMAL256; 38 or below gives TYPE_DECIMAL128I.
+}
+
+DataTypePtr converted_type_to_doris_type(const ::parquet::ColumnDescriptor* 
column) {
+    switch (column->converted_type()) {

Review Comment:
   这里为什么只用 converted_type,而不用 logical type?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to