This is an automated email from the ASF dual-hosted git repository. huajianlan pushed a commit to branch nested_column_prune in repository https://gitbox.apache.org/repos/asf/doris.git
commit 0279d9c575853e06e59d84eb57a7b317414e12f2 Author: kakachen <[email protected]> AuthorDate: Thu Oct 30 19:15:40 2025 +0800 Fix some tests's bugs of external table. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 17 +- .../table/hive/hive_orc_nested_column_utils.cpp | 21 +++ be/src/vec/exec/format/table/hive_reader.cpp | 40 ++--- .../iceberg/iceberg_orc_nested_column_utils.cpp | 20 +++ .../iceberg_parquet_nested_column_utils.cpp | 16 +- be/src/vec/exec/format/table/iceberg_reader.cpp | 22 +-- .../hive/hive_reader_create_column_ids_test.cpp | 184 ++++++++++++--------- .../iceberg_reader_create_column_ids_test.cpp | 122 +++++++++----- 8 files changed, 274 insertions(+), 168 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index cfbad191772..11b75b820ca 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1984,12 +1984,9 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, } } else { // Normal processing: convert ORC column to Doris column - auto status = _orc_column_to_doris_column<false>( + RETURN_IF_ERROR(_orc_column_to_doris_column<false>( key_col_name, doris_key_column, doris_key_type, root_node->get_key_node(), - orc_key_type, orc_map->keys.get(), element_size); - if (!status.ok()) { - return status; - } + orc_key_type, orc_map->keys.get(), element_size)); } // Handle value column: if still missing, fill with default values @@ -2005,14 +2002,14 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, } else { mutable_value_column->insert_many_defaults(element_size); } - return Status::OK(); } else { // Normal processing: convert ORC column to Doris column - return _orc_column_to_doris_column<false>(value_col_name, doris_value_column, - doris_value_type, root_node->get_value_node(), - orc_value_type, orc_map->elements.get(), - element_size); + RETURN_IF_ERROR(_orc_column_to_doris_column<false>( + value_col_name, doris_value_column, doris_value_type, + root_node->get_value_node(), orc_value_type, orc_map->elements.get(), + element_size)); } + return doris_map.deduplicate_keys(); } case PrimitiveType::TYPE_STRUCT: { if (orc_column_type->getKind() != orc::TypeKind::STRUCT) { diff --git a/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp b/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp index 6e9fa381fc7..1cd02967f0d 100644 --- a/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp +++ b/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp @@ -95,6 +95,27 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids( } else if (i == 1) { child_field_name = "VALUES"; } + + // Special handling for Orc MAP structure: + // When accessing only VALUES, we still need KEY structure for levels + // Check if we're at key child (i==0) and only VALUES is requested (no KEYS) + if (i == 0) { + bool has_keys_access = child_paths_by_table_col_name.find("KEYS") != + child_paths_by_table_col_name.end(); + bool has_values_access = child_paths_by_table_col_name.find("VALUES") != + child_paths_by_table_col_name.end(); + + // If only VALUES is accessed (not KEYS), still include key structure for deduplicate_keys + if (!has_keys_access && has_values_access) { + uint64_t key_start_id = child->getColumnId(); + uint64_t key_max_id = child->getMaximumColumnId(); + for (uint64_t id = key_start_id; id <= key_max_id; ++id) { + column_ids.insert(id); + } + has_child_columns = true; + continue; // Skip further processing of key child + } + } break; default: child_field_name = ""; diff --git a/be/src/vec/exec/format/table/hive_reader.cpp b/be/src/vec/exec/format/table/hive_reader.cpp index 81c8cf47f42..b9bc01fe7b6 100644 --- a/be/src/vec/exec/format/table/hive_reader.cpp +++ b/be/src/vec/exec/format/table/hive_reader.cpp @@ -166,8 +166,6 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, } const orc::Type* orc_field = it->second; - const auto& all_access_paths = slot->all_access_paths(); - // primitive (non-nested) types: direct mapping by name if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { @@ -179,13 +177,13 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, } // complex types: - - // collect and process all_access_paths -> column_ids + const auto& all_access_paths = slot->all_access_paths(); process_access_paths(orc_field, all_access_paths, column_ids); - // collect and process predicate_access_paths -> filter_column_ids const auto& predicate_access_paths = slot->predicate_access_paths(); - process_access_paths(orc_field, predicate_access_paths, filter_column_ids); + if (!predicate_access_paths.empty()) { + process_access_paths(orc_field, predicate_access_paths, filter_column_ids); + } } return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); @@ -261,8 +259,6 @@ ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( } const orc::Type* orc_field = it->second; - const auto& all_access_paths = slot->all_access_paths(); - // primitive (non-nested) types: direct mapping by pos if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { @@ -273,14 +269,14 @@ ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( continue; } + const auto& all_access_paths = slot->all_access_paths(); // complex types - - // collect and process all_access_paths -> column_ids process_access_paths(orc_field, all_access_paths, column_ids); - // collect and process predicate_access_paths -> filter_column_ids const auto& predicate_access_paths = slot->predicate_access_paths(); - process_access_paths(orc_field, predicate_access_paths, filter_column_ids); + if (!predicate_access_paths.empty()) { + process_access_paths(orc_field, predicate_access_paths, filter_column_ids); + } } return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); @@ -438,8 +434,6 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel } auto field_schema = it->second; - const auto& all_access_paths = slot->all_access_paths(); - // primitive (non-nested) types: direct mapping by name if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { @@ -452,13 +446,13 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel } // complex types: - - // collect and process all_access_paths -> column_ids + const auto& all_access_paths = slot->all_access_paths(); process_access_paths(field_schema, all_access_paths, column_ids); - // collect and process predicate_access_paths -> filter_column_ids const auto& predicate_access_paths = slot->predicate_access_paths(); - process_access_paths(field_schema, predicate_access_paths, filter_column_ids); + if (!predicate_access_paths.empty()) { + process_access_paths(field_schema, predicate_access_paths, filter_column_ids); + } } return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); @@ -538,8 +532,6 @@ ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index( } auto field_schema = it->second; - const auto& all_access_paths = slot->all_access_paths(); - // primitive (non-nested) types: direct mapping by position if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { @@ -551,12 +543,14 @@ ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index( continue; } - // collect and process all_access_paths -> column_ids + // complex types: + const auto& all_access_paths = slot->all_access_paths(); process_access_paths(field_schema, all_access_paths, column_ids); - // collect and process predicate_access_paths -> filter_column_ids const auto& predicate_access_paths = slot->predicate_access_paths(); - process_access_paths(field_schema, predicate_access_paths, filter_column_ids); + if (!predicate_access_paths.empty()) { + process_access_paths(field_schema, predicate_access_paths, filter_column_ids); + } } return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); diff --git a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp index 5dbbbfb2af2..e3aacded188 100644 --- a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp +++ b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp @@ -97,6 +97,26 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids( } else if (i == 1) { child_field_id = "VALUES"; } + // Special handling for Orc MAP structure: + // When accessing only VALUES, we still need KEY structure for levels + // Check if we're at key child (i==0) and only VALUES is requested (no KEYS) + if (i == 0) { + bool has_keys_access = + child_paths_by_field_id.find("KEYS") != child_paths_by_field_id.end(); + bool has_values_access = + child_paths_by_field_id.find("VALUES") != child_paths_by_field_id.end(); + + // If only VALUES is accessed (not KEYS), still include key structure for deduplicate_keys + if (!has_keys_access && has_values_access) { + uint64_t key_start_id = child->getColumnId(); + uint64_t key_max_id = child->getMaximumColumnId(); + for (uint64_t id = key_start_id; id <= key_max_id; ++id) { + column_ids.insert(id); + } + has_child_columns = true; + continue; // Skip further processing of key child + } + } break; default: child_field_id = ""; diff --git a/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp b/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp index cb2b2e6b8c7..7741d498f99 100644 --- a/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp +++ b/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp @@ -114,7 +114,10 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( uint64_t key_start_id = child.get_column_id(); uint64_t key_max_id = child.get_max_column_id(); for (uint64_t id = key_start_id; id <= key_max_id; ++id) { - column_ids.insert(id); + auto inserted = column_ids.insert(id); + if (inserted.second) { + std::cout << "[IcebergNested] added column id: " << id << std::endl; + } } has_child_columns = true; continue; // Skip further processing of key child @@ -144,7 +147,10 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( uint64_t start_id = child.get_column_id(); uint64_t max_column_id = child.get_max_column_id(); for (uint64_t id = start_id; id <= max_column_id; ++id) { - column_ids.insert(id); + auto inserted = column_ids.insert(id); + if (inserted.second) { + std::cout << "[IcebergNested] added column id: " << id << std::endl; + } } has_child_columns = true; } else { @@ -166,7 +172,11 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids( // This ensures parent struct/container nodes are included when their children are needed if (has_child_columns) { // Set automatically handles deduplication, so no need to check if it already exists - column_ids.insert(field_schema.get_column_id()); + auto inserted = column_ids.insert(field_schema.get_column_id()); + if (inserted.second) { + std::cout << "[IcebergNested] added parent column id: " << field_schema.get_column_id() + << std::endl; + } } } diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp index b08229165ea..1a010dc4454 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_reader.cpp @@ -544,8 +544,6 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f } auto field_schema = it->second; - const auto& all_access_paths = slot->all_access_paths(); - // primitive (non-nested) types: direct mapping by name if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { @@ -558,13 +556,13 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f } // complex types: - - // collect and process all_access_paths -> column_ids + const auto& all_access_paths = slot->all_access_paths(); process_access_paths(field_schema, all_access_paths, column_ids); - // collect and process predicate_access_paths -> filter_column_ids const auto& predicate_access_paths = slot->predicate_access_paths(); - process_access_paths(field_schema, predicate_access_paths, filter_column_ids); + if (!predicate_access_paths.empty()) { + process_access_paths(field_schema, predicate_access_paths, filter_column_ids); + } } return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); } @@ -737,8 +735,6 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, } const orc::Type* orc_field = it->second; - const auto& all_access_paths = slot->all_access_paths(); - // primitive (non-nested) types: direct mapping by name if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && slot->col_type() != TYPE_MAP)) { @@ -749,14 +745,14 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type, continue; } - // nested types: - - // collect and process all_access_paths -> column_ids + // complex types: + const auto& all_access_paths = slot->all_access_paths(); process_access_paths(orc_field, all_access_paths, column_ids); - // collect and process predicate_access_paths -> filter_column_ids const auto& predicate_access_paths = slot->predicate_access_paths(); - process_access_paths(orc_field, predicate_access_paths, filter_column_ids); + if (!predicate_access_paths.empty()) { + process_access_paths(orc_field, predicate_access_paths, filter_column_ids); + } } return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); diff --git a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp index 7efca721fcd..d46acbdc134 100644 --- a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp +++ b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp @@ -738,7 +738,8 @@ protected: } // Helper function: Run Parquet test with different column ID extraction methods - void run_parquet_test_with_method(const std::vector<ColumnAccessPathConfig>& access_configs, + void run_parquet_test_with_method(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, const std::set<uint64_t>& expected_column_ids, const std::set<uint64_t>& expected_filter_column_ids, bool use_top_level_method = false, @@ -760,12 +761,13 @@ protected: TTableDescriptor t_table_desc; // Define all columns according to the schema - std::vector<std::string> table_column_names = { - "name", "profile", "tags", "friends", "recent_activity", - "attributes", "complex_attributes"}; - std::vector<int> table_column_positions = {1, 2, 3, 4, 5, 6, 7}; - std::vector<TPrimitiveType::type> table_column_types = { - // TPrimitiveType::BIGINT, // id + std::vector<std::string> all_table_column_names = {"id", "name", + "profile", "tags", + "friends", "recent_activity", + "attributes", "complex_attributes"}; + std::vector<int> all_table_column_positions = {0, 1, 2, 3, 4, 5, 6, 7}; + std::vector<TPrimitiveType::type> all_table_column_types = { + TPrimitiveType::BIGINT, // id TPrimitiveType::STRING, // name TPrimitiveType::STRUCT, // profile TPrimitiveType::ARRAY, // tags @@ -775,6 +777,18 @@ protected: TPrimitiveType::MAP // complex_attributes }; + std::vector<int> table_column_positions; + std::vector<TPrimitiveType::type> table_column_types; + for (const auto& col_name : table_column_names) { + auto it = std::find(all_table_column_names.begin(), all_table_column_names.end(), + col_name); + if (it != all_table_column_names.end()) { + int idx = std::distance(all_table_column_names.begin(), it); + table_column_positions.push_back(idx); + table_column_types.push_back(all_table_column_types[idx]); + } + } + const TupleDescriptor* tuple_descriptor = create_tuple_descriptor( &desc_tbl, obj_pool, t_desc_table, t_table_desc, table_column_names, table_column_positions, table_column_types, access_configs); @@ -795,7 +809,8 @@ protected: } // Helper function: Run ORC test with different column ID extraction methods - void run_orc_test_with_method(const std::vector<ColumnAccessPathConfig>& access_configs, + void run_orc_test_with_method(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, const std::set<uint64_t>& expected_column_ids, const std::set<uint64_t>& expected_filter_column_ids, bool use_top_level_method = false, @@ -817,12 +832,13 @@ protected: TTableDescriptor t_table_desc; // Define all columns according to the schema - std::vector<std::string> table_column_names = { - "name", "profile", "tags", "friends", "recent_activity", - "attributes", "complex_attributes"}; - std::vector<int> table_column_positions = {1, 2, 3, 4, 5, 6, 7}; - std::vector<TPrimitiveType::type> table_column_types = { - // TPrimitiveType::BIGINT, // id + std::vector<std::string> all_table_column_names = {"id", "name", + "profile", "tags", + "friends", "recent_activity", + "attributes", "complex_attributes"}; + std::vector<int> all_table_column_positions = {0, 1, 2, 3, 4, 5, 6, 7}; + std::vector<TPrimitiveType::type> all_table_column_types = { + TPrimitiveType::BIGINT, // id TPrimitiveType::STRING, // name TPrimitiveType::STRUCT, // profile TPrimitiveType::ARRAY, // tags @@ -832,6 +848,18 @@ protected: TPrimitiveType::MAP // complex_attributes }; + std::vector<int> table_column_positions; + std::vector<TPrimitiveType::type> table_column_types; + for (const auto& col_name : table_column_names) { + auto it = std::find(all_table_column_names.begin(), all_table_column_names.end(), + col_name); + if (it != all_table_column_names.end()) { + int idx = std::distance(all_table_column_names.begin(), it); + table_column_positions.push_back(idx); + table_column_types.push_back(all_table_column_types[idx]); + } + } + const TupleDescriptor* tuple_descriptor = create_tuple_descriptor( &desc_tbl, obj_pool, t_desc_table, t_table_desc, table_column_names, table_column_positions, table_column_types, access_configs); @@ -868,18 +896,19 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_1) { {"profile", "contact", "email"}}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) + std::vector<std::string> table_column_names = {"name", "profile"}; std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15, 16, 18}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_2) { @@ -915,20 +944,21 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_2) { access_config.predicate_paths = {{"profile", "address", "coordinates", "lat"}, {"profile", "contact", "email"}}; + std::vector<std::string> table_column_names = {"name", "profile"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_3) { @@ -964,19 +994,20 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_3) { access_config.predicate_paths = {{"profile", "address", "coordinates"}, {"profile", "contact", "email"}}; + std::vector<std::string> table_column_names = {"name", "profile"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_4) { @@ -988,19 +1019,21 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_4) { access_config.all_column_paths = {}; access_config.predicate_paths = {}; + std::vector<std::string> table_column_names = {"name", "profile"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) - std::set<uint64_t> expected_column_ids = {2}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) + std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18}; std::set<uint64_t> expected_filter_column_ids = {}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids, - true); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) { @@ -1049,18 +1082,20 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) { access_configs.push_back(access_config); } + std::vector<std::string> table_column_names = {"name", "friends", "recent_activity"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28}; - run_parquet_test_with_method(access_configs, expected_column_ids, expected_filter_column_ids); - run_parquet_test_with_method(access_configs, expected_column_ids, expected_filter_column_ids, - true); + run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method(access_configs, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method(access_configs, expected_column_ids, expected_filter_column_ids, true); + run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_6) { @@ -1146,33 +1181,32 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_6) { } { + std::vector<std::string> table_column_names = {"name", "complex_attributes"}; // parquet values should access keys // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) - std::set<uint64_t> parquet_expected_column_ids = { - 2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, 54, 61, 62, 63, 64, - 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) - std::set<uint64_t> parquet_expected_filter_column_ids = {36, 37, 38, 39, 40}; - - run_parquet_test_with_method(access_configs, parquet_expected_column_ids, - parquet_expected_filter_column_ids); - run_parquet_test_with_method(access_configs, parquet_expected_column_ids, - parquet_expected_filter_column_ids, true); + std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, + 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; + std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; + + run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); } { + std::vector<std::string> table_column_names = {"name", "complex_attributes"}; + // orc values should access keys because need to deduplicate by keys // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) - std::set<uint64_t> orc_expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, - 53, 54, 61, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) - std::set<uint64_t> orc_expected_filter_column_ids = {36, 37, 38, 39, 40}; - run_orc_test_with_method(access_configs, orc_expected_column_ids, - orc_expected_filter_column_ids); - run_orc_test_with_method(access_configs, orc_expected_column_ids, - orc_expected_filter_column_ids, true); + std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, + 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; + std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; + run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); } } diff --git a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp index 47e0ad87a4a..b1152f159bd 100644 --- a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp +++ b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp @@ -770,7 +770,8 @@ protected: } // Helper function: run Parquet test with different column ID extraction methods - void run_parquet_test_with_method(const std::vector<ColumnAccessPathConfig>& access_configs, + void run_parquet_test_with_method(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, const std::set<uint64_t>& expected_column_ids, const std::set<uint64_t>& expected_filter_column_ids, bool use_top_level_method = false, @@ -792,12 +793,13 @@ protected: TTableDescriptor t_table_desc; // Define all columns according to the schema - std::vector<std::string> table_column_names = { - "name", "profile", "tags", "friends", "recent_activity", - "attributes", "complex_attributes"}; - std::vector<int> table_column_positions = {1, 2, 3, 4, 5, 6, 7}; - std::vector<TPrimitiveType::type> table_column_types = { - // TPrimitiveType::BIGINT, // id + std::vector<std::string> all_table_column_names = {"id", "name", + "profile", "tags", + "friends", "recent_activity", + "attributes", "complex_attributes"}; + std::vector<int> all_table_column_positions = {0, 1, 2, 3, 4, 5, 6, 7}; + std::vector<TPrimitiveType::type> all_table_column_types = { + TPrimitiveType::BIGINT, // id TPrimitiveType::STRING, // name TPrimitiveType::STRUCT, // profile TPrimitiveType::ARRAY, // tags @@ -807,6 +809,18 @@ protected: TPrimitiveType::MAP // complex_attributes }; + std::vector<int> table_column_positions; + std::vector<TPrimitiveType::type> table_column_types; + for (const auto& col_name : table_column_names) { + auto it = std::find(all_table_column_names.begin(), all_table_column_names.end(), + col_name); + if (it != all_table_column_names.end()) { + int idx = std::distance(all_table_column_names.begin(), it); + table_column_positions.push_back(idx); + table_column_types.push_back(all_table_column_types[idx]); + } + } + const TupleDescriptor* tuple_descriptor = create_tuple_descriptor( &desc_tbl, obj_pool, t_desc_table, t_table_desc, table_column_names, table_column_positions, table_column_types, access_configs); @@ -827,7 +841,8 @@ protected: } // Helper function: run Orc test with different column ID extraction methods - void run_orc_test_with_method(const std::vector<ColumnAccessPathConfig>& access_configs, + void run_orc_test_with_method(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, const std::set<uint64_t>& expected_column_ids, const std::set<uint64_t>& expected_filter_column_ids, bool use_top_level_method = false, @@ -849,12 +864,13 @@ protected: TTableDescriptor t_table_desc; // Define all columns according to the schema - std::vector<std::string> table_column_names = { - "name", "profile", "tags", "friends", "recent_activity", - "attributes", "complex_attributes"}; - std::vector<int> table_column_positions = {1, 2, 3, 4, 5, 6, 7}; - std::vector<TPrimitiveType::type> table_column_types = { - // TPrimitiveType::BIGINT, // id + std::vector<std::string> all_table_column_names = {"id", "name", + "profile", "tags", + "friends", "recent_activity", + "attributes", "complex_attributes"}; + std::vector<int> all_table_column_positions = {0, 1, 2, 3, 4, 5, 6, 7}; + std::vector<TPrimitiveType::type> all_table_column_types = { + TPrimitiveType::BIGINT, // id TPrimitiveType::STRING, // name TPrimitiveType::STRUCT, // profile TPrimitiveType::ARRAY, // tags @@ -864,6 +880,18 @@ protected: TPrimitiveType::MAP // complex_attributes }; + std::vector<int> table_column_positions; + std::vector<TPrimitiveType::type> table_column_types; + for (const auto& col_name : table_column_names) { + auto it = std::find(all_table_column_names.begin(), all_table_column_names.end(), + col_name); + if (it != all_table_column_names.end()) { + int idx = std::distance(all_table_column_names.begin(), it); + table_column_positions.push_back(idx); + table_column_types.push_back(all_table_column_types[idx]); + } + } + const TupleDescriptor* tuple_descriptor = create_tuple_descriptor( &desc_tbl, obj_pool, t_desc_table, t_table_desc, table_column_names, table_column_positions, table_column_types, access_configs); @@ -898,14 +926,15 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_1) { {"3", "11", "*", "23"}}; access_config.predicate_paths = {{"3", "9", "14", "15"}, {"3", "10", "17"}}; + std::vector<std::string> table_column_names = {"name", "profile"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15, 16, 18}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_2) { @@ -940,15 +969,16 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_2) { access_config.all_column_paths = {{"3"}}; access_config.predicate_paths = {{"3", "9", "14", "15"}, {"3", "10", "17"}}; + std::vector<std::string> table_column_names = {"name", "profile"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_3) { @@ -987,14 +1017,15 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_3) { // access_config.predicate_paths = {{"profile", "address", "coordinates"}, // {"profile", "contact", "email"}}; + std::vector<std::string> table_column_names = {"name", "profile"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_4) { @@ -1006,14 +1037,16 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_4) { access_config.all_column_paths = {}; access_config.predicate_paths = {}; + std::vector<std::string> table_column_names = {"name", "profile"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) - std::set<uint64_t> expected_column_ids = {2}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) + std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18}; std::set<uint64_t> expected_filter_column_ids = {}; - run_parquet_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method({access_config}, expected_column_ids, expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) { @@ -1060,14 +1093,15 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) { access_configs.push_back(access_config); } + std::vector<std::string> table_column_names = {"name", "friends", "recent_activity"}; // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28}; - run_parquet_test_with_method(access_configs, expected_column_ids, expected_filter_column_ids); - run_orc_test_with_method(access_configs, expected_column_ids, expected_filter_column_ids); + run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_6) { @@ -1148,29 +1182,29 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_6) { } { + std::vector<std::string> table_column_names = {"name", "complex_attributes"}; // parquet values should access keys // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; - run_parquet_test_with_method(access_configs, expected_column_ids, + run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, expected_filter_column_ids); } { + std::vector<std::string> table_column_names = {"name", "complex_attributes"}; + // orc values should access keys because need to deduplicate by keys // column_ids should contain all necessary column IDs (set automatically deduplicates) - // Expected IDs based on the schema: name(2), profile(3), address(4), coordinates(7), lat(8), lng(9), contact(10), email(11), hobbies(15), element(16), level(18) - std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, - 53, 54, 61, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; - // Expected IDs based on the schema: profile(3), address(4), coordinates(7), lat(8), contact(10), email(11) + std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, + 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; - run_orc_test_with_method(access_configs, expected_column_ids, expected_filter_column_ids); + run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
