This is an automated email from the ASF dual-hosted git repository.

suxiaogang223 pushed a commit to branch refact_reader_branch
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/refact_reader_branch by this 
push:
     new aa1381cf73e [fix](be) Stabilize parquet nested scalar levels
aa1381cf73e is described below

commit aa1381cf73e5049d0999dc0523c5c4a9d97da0ef
Author: Socrates <[email protected]>
AuthorDate: Fri May 29 10:34:54 2026 +0800

    [fix](be) Stabilize parquet nested scalar levels
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
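    Problem Summary: Avoid stale level buffers for required nested leaves: when
    the leaf's max definition (or repetition) level is 0, fill the level buffer
    with the constant level instead of copying from the reader's level buffer.
    Also preserve the value-slot mapping for nullable nested scalars expected
    by Arrow RecordReader: every level at or above the value-slot definition
    level consumes one value slot, including null entries.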
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test: Manual test
        - Ran `git diff --check`. BE unit test validation on Fedora follows,
          using: ./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*'
    - Behavior changed: No
    - Does this need documentation: No
---
 be/src/format/new_parquet/column_reader.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/be/src/format/new_parquet/column_reader.cpp 
b/be/src/format/new_parquet/column_reader.cpp
index a177e025879..4d24100685e 100644
--- a/be/src/format/new_parquet/column_reader.cpp
+++ b/be/src/format/new_parquet/column_reader.cpp
@@ -482,7 +482,7 @@ Status read_nested_scalar_batch(ScalarColumnReader& 
column_reader, int64_t batch
                 column_reader.name());
     }
     batch->def_levels.resize(static_cast<size_t>(batch->levels_written));
-    if (def_levels == nullptr) {
+    if (column_reader.descriptor()->max_definition_level() == 0 || def_levels 
== nullptr) {
         std::fill(batch->def_levels.begin(), batch->def_levels.end(),
                   column_reader.descriptor()->max_definition_level());
     } else {
@@ -496,7 +496,7 @@ Status read_nested_scalar_batch(ScalarColumnReader& 
column_reader, int64_t batch
                 column_reader.name());
     }
     batch->rep_levels.resize(static_cast<size_t>(batch->levels_written));
-    if (rep_levels == nullptr) {
+    if (column_reader.descriptor()->max_repetition_level() == 0 || rep_levels 
== nullptr) {
         std::fill(batch->rep_levels.begin(), batch->rep_levels.end(), 0);
     } else {
         std::copy(rep_levels, rep_levels + batch->levels_written, 
batch->rep_levels.begin());
@@ -507,8 +507,7 @@ Status read_nested_scalar_batch(ScalarColumnReader& 
column_reader, int64_t batch
     const int16_t max_definition_level = 
column_reader.descriptor()->max_definition_level();
     NullMap value_null_map;
     for (int64_t level_idx = 0; level_idx < batch->levels_written; 
++level_idx) {
-        const bool has_value = batch->def_levels[level_idx] == 
max_definition_level;
-        if (batch->def_levels[level_idx] >= value_slot_definition_level && 
has_value) {
+        if (batch->def_levels[level_idx] >= value_slot_definition_level) {
             if (value_idx >= batch->values_written) {
                 return Status::Corruption(
                         "Nested parquet reader returned fewer values than 
definition levels for "
@@ -516,10 +515,8 @@ Status read_nested_scalar_batch(ScalarColumnReader& 
column_reader, int64_t batch
                         column_reader.name());
             }
             batch->value_indices[level_idx] = value_idx++;
-        }
-        if (batch->def_levels[level_idx] >= value_slot_definition_level) {
             if (column_reader.type()->is_nullable()) {
-                value_null_map.push_back(!has_value);
+                value_null_map.push_back(batch->def_levels[level_idx] != 
max_definition_level);
             }
         }
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to