This is an automated email from the ASF dual-hosted git repository.
suxiaogang223 pushed a commit to branch refact_reader_branch
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/refact_reader_branch by this
push:
new aa1381cf73e [fix](be) Stabilize parquet nested scalar levels
aa1381cf73e is described below
commit aa1381cf73e5049d0999dc0523c5c4a9d97da0ef
Author: Socrates <[email protected]>
AuthorDate: Fri May 29 10:34:54 2026 +0800
[fix](be) Stabilize parquet nested scalar levels
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary: Avoid reading stale level buffers for required nested leaves
(columns whose max definition or repetition level is 0), and preserve the
value-slot mapping for nullable nested scalars that Arrow's RecordReader
expects.
### Release note
None
### Check List (For Author)
- Test: Manual test
- Ran git diff --check. BE unit test validation on Fedora follows, using
./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*'.
- Behavior changed: No
- Does this need documentation: No
---
be/src/format/new_parquet/column_reader.cpp | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/be/src/format/new_parquet/column_reader.cpp
b/be/src/format/new_parquet/column_reader.cpp
index a177e025879..4d24100685e 100644
--- a/be/src/format/new_parquet/column_reader.cpp
+++ b/be/src/format/new_parquet/column_reader.cpp
@@ -482,7 +482,7 @@ Status read_nested_scalar_batch(ScalarColumnReader&
column_reader, int64_t batch
column_reader.name());
}
batch->def_levels.resize(static_cast<size_t>(batch->levels_written));
- if (def_levels == nullptr) {
+ if (column_reader.descriptor()->max_definition_level() == 0 || def_levels
== nullptr) {
std::fill(batch->def_levels.begin(), batch->def_levels.end(),
column_reader.descriptor()->max_definition_level());
} else {
@@ -496,7 +496,7 @@ Status read_nested_scalar_batch(ScalarColumnReader&
column_reader, int64_t batch
column_reader.name());
}
batch->rep_levels.resize(static_cast<size_t>(batch->levels_written));
- if (rep_levels == nullptr) {
+ if (column_reader.descriptor()->max_repetition_level() == 0 || rep_levels
== nullptr) {
std::fill(batch->rep_levels.begin(), batch->rep_levels.end(), 0);
} else {
std::copy(rep_levels, rep_levels + batch->levels_written,
batch->rep_levels.begin());
@@ -507,8 +507,7 @@ Status read_nested_scalar_batch(ScalarColumnReader&
column_reader, int64_t batch
const int16_t max_definition_level =
column_reader.descriptor()->max_definition_level();
NullMap value_null_map;
for (int64_t level_idx = 0; level_idx < batch->levels_written;
++level_idx) {
- const bool has_value = batch->def_levels[level_idx] ==
max_definition_level;
- if (batch->def_levels[level_idx] >= value_slot_definition_level &&
has_value) {
+ if (batch->def_levels[level_idx] >= value_slot_definition_level) {
if (value_idx >= batch->values_written) {
return Status::Corruption(
"Nested parquet reader returned fewer values than
definition levels for "
@@ -516,10 +515,8 @@ Status read_nested_scalar_batch(ScalarColumnReader&
column_reader, int64_t batch
column_reader.name());
}
batch->value_indices[level_idx] = value_idx++;
- }
- if (batch->def_levels[level_idx] >= value_slot_definition_level) {
if (column_reader.type()->is_nullable()) {
- value_null_map.push_back(!has_value);
+ value_null_map.push_back(batch->def_levels[level_idx] !=
max_definition_level);
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]