This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 0b3a6f50b8809a91bd5039214f755bb8611ae103
Author: Ashin Gau <[email protected]>
AuthorDate: Sun Aug 20 22:59:18 2023 +0800

    [fix](parquet) the key column of map type in parquet may be nullable (#23180)
    
    Fix errors when reading a map type with a nullable key column in a parquet file. 
`ParquetReader` supports reading nullable key columns, but a check was added to prevent 
reading them. Unfortunately, the error returned by this check was not propagated 
correctly, causing the BE to crash and emit meaningless error logs in be.out:
    ```
    ...
    11# 
doris::vectorized::ParquetReader::get_columns(std::unordered_map<std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char> >, doris::TypeDescriptor, 
std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > >, 
std::allocator<std::pair<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const, doris::TypeDes [...]
    12# doris::vectorized::VFileScanner::_get_next_reader() in 
/root/yun_you_external/output/be/lib/doris_be
    13# doris::vectorized::VFileScanner::_get_block_impl(doris::RuntimeState*, 
doris::vectorized::Block*, bool*) at 
/root/doris/be/src/vec/exec/scan/vfile_scanner.cpp:241
    ...
    ```
---
 be/src/vec/exec/format/parquet/schema_desc.cpp                        | 2 +-
 be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp             | 3 +--
 regression-test/data/external_table_p2/hive/test_complex_types.out    | 3 +++
 .../suites/external_table_p2/hive/test_complex_types.groovy           | 4 ++++
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp 
b/be/src/vec/exec/format/parquet/schema_desc.cpp
index dcc5140e89..05eaf30a80 100644
--- a/be/src/vec/exec/format/parquet/schema_desc.cpp
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -480,7 +480,7 @@ Status FieldDescriptor::parse_map_field(const 
std::vector<tparquet::SchemaElemen
     }
     auto& map_key = t_schemas[curr_pos + 2];
     if (!is_required_node(map_key)) {
-        return Status::InvalidArgument("the third level(map key) in map group 
must be required");
+        LOG(WARNING) << "Filed " << map_schema.name << " is map type, but with 
nullable key column";
     }
 
     if (map_key_value.num_children == 1) {
diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp 
b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
index c199e72da7..95960f489e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
@@ -32,8 +32,7 @@ Status FileMetaData::init_schema() {
     if (_metadata.schema[0].num_children <= 0) {
         Status::Corruption("Invalid parquet schema");
     }
-    _schema.parse_from_thrift(_metadata.schema);
-    return Status();
+    return _schema.parse_from_thrift(_metadata.schema);
 }
 
 const tparquet::FileMetaData& FileMetaData::to_thrift() {
diff --git a/regression-test/data/external_table_p2/hive/test_complex_types.out 
b/regression-test/data/external_table_p2/hive/test_complex_types.out
index 8c4f9d04f4..c414a60a99 100644
--- a/regression-test/data/external_table_p2/hive/test_complex_types.out
+++ b/regression-test/data/external_table_p2/hive/test_complex_types.out
@@ -29,3 +29,6 @@
 2      \N      \N      \N      {"h", 10}
 3      [5, NULL]       [[6, 7], [8, NULL], NULL]       {"f":1, "g":NULL}       
{NULL, 9}
 
+-- !map_with_nullable_key --
+\N     \N      \N      \N      \N      \N      \N      \N      \N              
test            test    
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 [...]
+
diff --git 
a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy 
b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
index c86c8c2562..5422e1d9a4 100644
--- a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
@@ -52,5 +52,9 @@ suite("test_complex_types", "p2") {
         qt_array_last """select max(array_last(i -> i > 0, capacity)) from byd 
where array_last(i -> i > 0, capacity) < 0.99"""
 
         qt_offsets_check """select * from complex_offsets_check order by id"""
+
+        qt_map_with_nullable_key """select * from parquet_all_types limit 1"""
+
+        sql """drop catalog ${catalog_name};"""
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to