morningman commented on code in PR #58972: URL: https://github.com/apache/doris/pull/58972#discussion_r2612499349
########## be/src/vec/exec/format/table/parquet_metadata_reader.cpp: ########## @@ -0,0 +1,632 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/table/parquet_metadata_reader.h" + +#include <fmt/format.h> + +#include <algorithm> +#include <cctype> +#include <memory> +#include <sstream> +#include <unordered_map> + +#include "common/logging.h" +#include "io/fs/file_reader.h" +#include "io/fs/local_file_system.h" +#include "io/io_common.h" +#include "runtime/runtime_state.h" +#include "vec/columns/column.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/common/assert_cast.h" +#include "vec/core/block.h" +#include "vec/core/types.h" +#include "vec/exec/format/parquet/parquet_thrift_util.h" +#include "vec/exec/format/parquet/schema_desc.h" +#include "vec/exec/format/parquet/vparquet_file_metadata.h" + +namespace doris::vectorized { +namespace { + +constexpr const char* MODE_SCHEMA = "parquet_schema"; +constexpr const char* MODE_METADATA = "parquet_metadata"; + +enum SchemaColumnIndex { + SCHEMA_FILE_NAME = 0, + SCHEMA_COLUMN_NAME, + SCHEMA_COLUMN_PATH, + SCHEMA_PHYSICAL_TYPE, + SCHEMA_LOGICAL_TYPE, + SCHEMA_REPETITION_LEVEL, + SCHEMA_DEFINITION_LEVEL, + SCHEMA_TYPE_LENGTH, + SCHEMA_PRECISION, + SCHEMA_SCALE, + SCHEMA_IS_NULLABLE +}; + +enum MetadataColumnIndex { + META_FILE_NAME = 0, + META_ROW_GROUP_ID, + META_COLUMN_ID, + META_COLUMN_NAME, + META_COLUMN_PATH, + META_PHYSICAL_TYPE, + META_LOGICAL_TYPE, + META_TYPE_LENGTH, + META_CONVERTED_TYPE, + META_NUM_VALUES, + META_NULL_COUNT, + META_DISTINCT_COUNT, + META_ENCODINGS, + META_COMPRESSION, + META_DATA_PAGE_OFFSET, + META_INDEX_PAGE_OFFSET, + META_DICTIONARY_PAGE_OFFSET, + META_TOTAL_COMPRESSED_SIZE, + META_TOTAL_UNCOMPRESSED_SIZE, + META_STATISTICS_MIN, + META_STATISTICS_MAX +}; + +std::string join_path(const std::vector<std::string>& items) { Review Comment: Use `join()` in `be/src/util/string_util.h` ########## be/src/vec/exec/format/table/parquet_metadata_reader.cpp: ########## @@ -0,0 +1,632 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/table/parquet_metadata_reader.h" + +#include <fmt/format.h> + +#include <algorithm> +#include <cctype> +#include <memory> +#include <sstream> +#include <unordered_map> + +#include "common/logging.h" +#include "io/fs/file_reader.h" +#include "io/fs/local_file_system.h" +#include "io/io_common.h" +#include "runtime/runtime_state.h" +#include "vec/columns/column.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/common/assert_cast.h" +#include "vec/core/block.h" +#include "vec/core/types.h" +#include "vec/exec/format/parquet/parquet_thrift_util.h" +#include "vec/exec/format/parquet/schema_desc.h" +#include "vec/exec/format/parquet/vparquet_file_metadata.h" + +namespace doris::vectorized { +namespace { + +constexpr const char* MODE_SCHEMA = "parquet_schema"; +constexpr const char* MODE_METADATA = "parquet_metadata"; + +enum SchemaColumnIndex { + SCHEMA_FILE_NAME = 0, + SCHEMA_COLUMN_NAME, + SCHEMA_COLUMN_PATH, + SCHEMA_PHYSICAL_TYPE, + SCHEMA_LOGICAL_TYPE, + SCHEMA_REPETITION_LEVEL, + SCHEMA_DEFINITION_LEVEL, + SCHEMA_TYPE_LENGTH, + SCHEMA_PRECISION, + SCHEMA_SCALE, + SCHEMA_IS_NULLABLE +}; + +enum MetadataColumnIndex { + META_FILE_NAME = 0, + META_ROW_GROUP_ID, + META_COLUMN_ID, + META_COLUMN_NAME, + META_COLUMN_PATH, + META_PHYSICAL_TYPE, + META_LOGICAL_TYPE, + META_TYPE_LENGTH, + META_CONVERTED_TYPE, + META_NUM_VALUES, + META_NULL_COUNT, + META_DISTINCT_COUNT, + META_ENCODINGS, + META_COMPRESSION, + META_DATA_PAGE_OFFSET, + META_INDEX_PAGE_OFFSET, + META_DICTIONARY_PAGE_OFFSET, + META_TOTAL_COMPRESSED_SIZE, + META_TOTAL_UNCOMPRESSED_SIZE, + META_STATISTICS_MIN, + META_STATISTICS_MAX +}; + +std::string join_path(const std::vector<std::string>& items) { + if (items.empty()) { + return ""; + } + std::ostringstream oss; + for (size_t i = 0; i < items.size(); ++i) { + if (i != 0) { + oss << "."; + } + oss << items[i]; + } + return oss.str(); +} + +template <typename ColumnType, typename T> +void insert_numeric_impl(MutableColumnPtr& column, T value) { + if (auto* nullable = check_and_get_column<ColumnNullable>(*column)) { + nullable->get_null_map_data().push_back(0); + auto& nested = nullable->get_nested_column(); + assert_cast<ColumnType&>(nested).insert_value(value); + } else { + assert_cast<ColumnType&>(*column).insert_value(value); + } +} + +void insert_int32(MutableColumnPtr& column, Int32 value) { + insert_numeric_impl<ColumnInt32>(column, value); +} + +void insert_int64(MutableColumnPtr& column, Int64 value) { + insert_numeric_impl<ColumnInt64>(column, value); +} + +void insert_bool(MutableColumnPtr& column, bool value) { + insert_numeric_impl<ColumnUInt8>(column, static_cast<UInt8>(value)); +} + +void insert_string(MutableColumnPtr& column, const std::string& value) { + if (auto* nullable = check_and_get_column<ColumnNullable>(*column)) { + nullable->get_null_map_data().push_back(0); + auto& nested = nullable->get_nested_column(); + assert_cast<ColumnString&>(nested).insert_data(value.c_str(), value.size()); + } else { + assert_cast<ColumnString&>(*column).insert_data(value.c_str(), value.size()); + } +} + +void insert_null(MutableColumnPtr& column) { + if (auto* nullable = check_and_get_column<ColumnNullable>(*column)) { + nullable->get_null_map_data().push_back(1); + nullable->get_nested_column().insert_default(); + } else { + column->insert_default(); + } +} + +std::string physical_type_to_string(tparquet::Type::type type) { + switch (type) { + case tparquet::Type::BOOLEAN: + return "BOOLEAN"; + case tparquet::Type::INT32: + return "INT32"; + case tparquet::Type::INT64: + return "INT64"; + case tparquet::Type::INT96: + return "INT96"; + case tparquet::Type::FLOAT: + return "FLOAT"; + case tparquet::Type::DOUBLE: + return "DOUBLE"; + case tparquet::Type::BYTE_ARRAY: + return "BYTE_ARRAY"; + case tparquet::Type::FIXED_LEN_BYTE_ARRAY: + return "FIXED_LEN_BYTE_ARRAY"; + default: + return "UNKNOWN"; + } +} + +std::string compression_to_string(tparquet::CompressionCodec::type codec) { + switch (codec) { + case tparquet::CompressionCodec::UNCOMPRESSED: + return "UNCOMPRESSED"; + case tparquet::CompressionCodec::SNAPPY: + return "SNAPPY"; + case tparquet::CompressionCodec::GZIP: + return "GZIP"; + case tparquet::CompressionCodec::LZO: + return "LZO"; + case tparquet::CompressionCodec::BROTLI: + return "BROTLI"; + case tparquet::CompressionCodec::LZ4: + return "LZ4"; + case tparquet::CompressionCodec::ZSTD: + return "ZSTD"; + case tparquet::CompressionCodec::LZ4_RAW: + return "LZ4_RAW"; + default: + return "UNKNOWN"; + } +} + +std::string converted_type_to_string(tparquet::ConvertedType::type type) { + switch (type) { + case tparquet::ConvertedType::UTF8: + return "UTF8"; + case tparquet::ConvertedType::MAP: + return "MAP"; + case tparquet::ConvertedType::MAP_KEY_VALUE: + return "MAP_KEY_VALUE"; + case tparquet::ConvertedType::LIST: + return "LIST"; + case tparquet::ConvertedType::ENUM: + return "ENUM"; + case tparquet::ConvertedType::DECIMAL: + return "DECIMAL"; + case tparquet::ConvertedType::DATE: + return "DATE"; + case tparquet::ConvertedType::TIME_MILLIS: + return "TIME_MILLIS"; + case tparquet::ConvertedType::TIME_MICROS: + return "TIME_MICROS"; + case tparquet::ConvertedType::TIMESTAMP_MILLIS: + return "TIMESTAMP_MILLIS"; + case tparquet::ConvertedType::TIMESTAMP_MICROS: + return "TIMESTAMP_MICROS"; + case tparquet::ConvertedType::UINT_8: + return "UINT_8"; + case tparquet::ConvertedType::UINT_16: + return "UINT_16"; + case tparquet::ConvertedType::UINT_32: + return "UINT_32"; + case tparquet::ConvertedType::UINT_64: + return "UINT_64"; + case tparquet::ConvertedType::INT_8: + return "INT_8"; + case tparquet::ConvertedType::INT_16: + return "INT_16"; + case tparquet::ConvertedType::INT_32: + return "INT_32"; + case tparquet::ConvertedType::INT_64: + return "INT_64"; + case tparquet::ConvertedType::JSON: + return "JSON"; + case tparquet::ConvertedType::BSON: + return "BSON"; + case tparquet::ConvertedType::INTERVAL: + return "INTERVAL"; + default: + return "UNKNOWN"; + } +} + +std::string logical_type_to_string(const tparquet::SchemaElement& element) { + if (element.__isset.logicalType) { + const auto& logical = element.logicalType; + if (logical.__isset.STRING) { + return "STRING"; + } else if (logical.__isset.MAP) { + return "MAP"; + } else if (logical.__isset.LIST) { + return "LIST"; + } else if (logical.__isset.ENUM) { + return "ENUM"; + } else if (logical.__isset.DECIMAL) { + return "DECIMAL"; + } else if (logical.__isset.DATE) { + return "DATE"; + } else if (logical.__isset.TIME) { + return "TIME"; + } else if (logical.__isset.TIMESTAMP) { + return "TIMESTAMP"; + } else if (logical.__isset.INTEGER) { + return "INTEGER"; + } else if (logical.__isset.UNKNOWN) { + return "UNKNOWN"; + } else if (logical.__isset.JSON) { + return "JSON"; + } else if (logical.__isset.BSON) { + return "BSON"; + } else if (logical.__isset.UUID) { + return "UUID"; + } else if (logical.__isset.FLOAT16) { + return "FLOAT16"; + } else if (logical.__isset.VARIANT) { + return "VARIANT"; + } else if (logical.__isset.GEOMETRY) { + return "GEOMETRY"; + } else if (logical.__isset.GEOGRAPHY) { + return "GEOGRAPHY"; + } + } + if (element.__isset.converted_type) { + return converted_type_to_string(element.converted_type); + } + return ""; +} + +std::string encodings_to_string(const std::vector<tparquet::Encoding::type>& encodings) { + std::ostringstream oss; Review Comment: use `fmt` lib instead of `std::ostringstream` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
