This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d2a171805c GH-47012: [C++][Parquet] Reserve values correctly when
reading BYTE_ARRAY and FLBA (#47013)
d2a171805c is described below
commit d2a171805c63caa27f05232695b753e07c32cb1d
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jul 9 09:35:19 2025 +0200
GH-47012: [C++][Parquet] Reserve values correctly when reading BYTE_ARRAY
and FLBA (#47013)
### Rationale for this change
When reading a Parquet leaf column as Arrow, we [presize the Arrow
builder](https://github.com/apache/arrow/blob/a0cc2d8ed35dce7ee6c3e7cbcc4867216a9ef16f/cpp/src/parquet/arrow/reader.cc#L487-L488)
so as to avoid spurious reallocations during incremental Parquet decoding
calls.
However, the Reserve method on RecordReader will [only properly reserve
values](https://github.com/apache/arrow/blob/a0cc2d8ed35dce7ee6c3e7cbcc4867216a9ef16f/cpp/src/parquet/column_reader.cc#L1693-L1696)
for physical types other than FLBA and BYTE_ARRAY.
The result is that, on some of our micro-benchmarks, we spend a significant
amount of time reallocating data on the ArrayBuilder.
### What changes are included in this PR?
Properly reserve space on Array builders when reading Parquet data as
Arrow. Note that, when reading into Binary or LargeBinary, this doesn't avoid
reallocations for the actual value data, since space is reserved per value
rather than per byte. However, for FixedSizeBinary and BinaryView,
this is sufficient to avoid any reallocations.
Benchmark numbers on my local machine (Ubuntu 24.04):
```
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Non-regressions: (250)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
benchmark baseline contender change %
counters
BM_ReadColumnPlain<false,Float16LogicalType>/null_probability:-1 3.295
GiB/sec 7.834 GiB/sec 137.771
{'family_index': 10, 'per_family_instance_index': 0, 'run_name':
'BM_ReadColumnPlain<false,Float16LogicalType>/null_probability:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 118}
BM_ReadColumnByteStreamSplit<false,Float16LogicalType>/null_probability:-1
3.453 GiB/sec 8.148 GiB/sec 135.957 {'family_index':
12, 'per_family_instance_index': 0, 'run_name':
'BM_ReadColumnByteStreamSplit<false,Float16LogicalType>/null_probability:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 119}
BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:100
1.360 GiB/sec 1.780 GiB/sec 30.870 {'family_index':
13, 'per_family_instance_index': 4, 'run_name':
'BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:100',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 49}
BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:100 1.360
GiB/sec 1.780 GiB/sec 30.861
{'family_index': 11, 'per_family_instance_index': 4, 'run_name':
'BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:100',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 49}
BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:0
1.292 GiB/sec 1.662 GiB/sec 28.666
{'family_index': 13, 'per_family_instance_index': 0, 'run_name':
'BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:0',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 47}
BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:0 1.304 GiB/sec
1.665 GiB/sec 27.691 {'family_index':
11, 'per_family_instance_index': 0, 'run_name':
'BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:0',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 46}
BM_ReadBinaryViewColumn/null_probability:99/unique_values:32 959.085 MiB/sec
1.185 GiB/sec 26.568 {'family_index':
15, 'per_family_instance_index': 4, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:99/unique_values:32', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:99
1.012 GiB/sec 1.210 GiB/sec 19.557 {'family_index':
13, 'per_family_instance_index': 3, 'run_name':
'BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:99',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 36}
BM_ReadBinaryViewColumnDeltaByteArray/null_probability:99/unique_values:-1
1.011 GiB/sec 1.187 GiB/sec 17.407 {'family_index':
17, 'per_family_instance_index': 3, 'run_name':
'BM_ReadBinaryViewColumnDeltaByteArray/null_probability:99/unique_values:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:99 1.024
GiB/sec 1.201 GiB/sec 17.206
{'family_index': 11, 'per_family_instance_index': 3, 'run_name':
'BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:99',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 36}
BM_ReadBinaryViewColumn/null_probability:99/unique_values:-1 1.023 GiB/sec
1.197 GiB/sec 17.016 {'family_index':
15, 'per_family_instance_index': 7, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:99/unique_values:-1', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
BM_ReadBinaryColumn/null_probability:99/unique_values:32 541.347 MiB/sec
632.640 MiB/sec 16.864
{'family_index': 14, 'per_family_instance_index': 4, 'run_name':
'BM_ReadBinaryColumn/null_probability:99/unique_values:32', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 9}
BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:1 954.762 MiB/sec
1.084 GiB/sec 16.272 {'family_index':
11, 'per_family_instance_index': 1, 'run_name':
'BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 33}
BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:1
970.997 MiB/sec 1.100 GiB/sec 15.969
{'family_index': 13, 'per_family_instance_index': 1, 'run_name':
'BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 34}
BM_ReadBinaryColumn/null_probability:99/unique_values:-1 592.605 MiB/sec
666.605 MiB/sec 12.487
{'family_index': 14, 'per_family_instance_index': 7, 'run_name':
'BM_ReadBinaryColumn/null_probability:99/unique_values:-1', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 10}
BM_ReadBinaryColumnDeltaByteArray/null_probability:99/unique_values:-1 587.604
MiB/sec 659.154 MiB/sec 12.177 {'family_index':
16, 'per_family_instance_index': 3, 'run_name':
'BM_ReadBinaryColumnDeltaByteArray/null_probability:99/unique_values:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 10}
BM_ReadBinaryViewColumn/null_probability:50/unique_values:-1 867.001 MiB/sec
962.427 MiB/sec 11.006 {'family_index':
15, 'per_family_instance_index': 6, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:50/unique_values:-1', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:50 473.040
MiB/sec 522.948 MiB/sec 10.551
{'family_index': 11, 'per_family_instance_index': 2, 'run_name':
'BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:50',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 17}
BM_ReadBinaryViewColumn/null_probability:0/unique_values:-1 1.633 GiB/sec
1.800 GiB/sec 10.197 {'family_index':
15, 'per_family_instance_index': 1, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:0/unique_values:-1', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 5}
BM_ReadStructOfListColumn/50 466.944 MiB/sec 513.407 MiB/sec 9.951
{'family_index':
20, 'per_family_instance_index': 2, 'run_name': 'BM_ReadStructOfListColumn/50',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 27}
BM_ReadBinaryViewColumnDeltaByteArray/null_probability:50/unique_values:-1
894.649 MiB/sec 976.595 MiB/sec 9.160
{'family_index': 17, 'per_family_instance_index': 2, 'run_name':
'BM_ReadBinaryViewColumnDeltaByteArray/null_probability:50/unique_values:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:50
479.717 MiB/sec 523.293 MiB/sec 9.084
{'family_index': 13, 'per_family_instance_index': 2, 'run_name':
'BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:50',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 17}
BM_ReadBinaryColumn/null_probability:50/unique_values:-1 613.860 MiB/sec
667.963 MiB/sec 8.814
{'family_index': 14, 'per_family_instance_index': 6, 'run_name':
'BM_ReadBinaryColumn/null_probability:50/unique_values:-1', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_ReadBinaryViewColumnDeltaByteArray/null_probability:1/unique_values:-1
1.479 GiB/sec 1.608 GiB/sec 8.761
{'family_index': 17, 'per_family_instance_index': 1, 'run_name':
'BM_ReadBinaryViewColumnDeltaByteArray/null_probability:1/unique_values:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
BM_ReadBinaryViewColumnDeltaByteArray/null_probability:0/unique_values:-1
1.628 GiB/sec 1.762 GiB/sec 8.235
{'family_index': 17, 'per_family_instance_index': 0, 'run_name':
'BM_ReadBinaryViewColumnDeltaByteArray/null_probability:0/unique_values:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 5}
BM_ReadStructOfListColumn/0 760.221 MiB/sec 822.339 MiB/sec 8.171
{'family_index':
20, 'per_family_instance_index': 0, 'run_name': 'BM_ReadStructOfListColumn/0',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 47}
BM_ReadBinaryViewColumn/null_probability:1/unique_values:32 843.826 MiB/sec
912.397 MiB/sec 8.126 {'family_index':
15, 'per_family_instance_index': 2, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:1/unique_values:32', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_ReadBinaryViewColumn/null_probability:50/unique_values:32 699.538 MiB/sec
755.468 MiB/sec 7.995 {'family_index':
15, 'per_family_instance_index': 3, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:50/unique_values:32', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_ByteStreamSplitDecode_FLBA_Generic<16>/1024 3.724 GiB/sec 4.007
GiB/sec 7.597
{'family_index': 4, 'per_family_instance_index': 0, 'run_name':
'BM_ByteStreamSplitDecode_FLBA_Generic<16>/1024', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 176027}
BM_ReadBinaryViewColumn/null_probability:1/unique_values:-1 1.474 GiB/sec
1.586 GiB/sec 7.591 {'family_index':
15, 'per_family_instance_index': 5, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:1/unique_values:-1', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
BM_ReadBinaryColumn/null_probability:0/unique_values:-1 1.114 GiB/sec
1.192 GiB/sec 7.005
{'family_index': 14, 'per_family_instance_index': 1, 'run_name':
'BM_ReadBinaryColumn/null_probability:0/unique_values:-1', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_ReadBinaryColumn/null_probability:1/unique_values:-1 1.022 GiB/sec
1.091 GiB/sec 6.715
{'family_index': 14, 'per_family_instance_index': 5, 'run_name':
'BM_ReadBinaryColumn/null_probability:1/unique_values:-1', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 4}
BM_ReadBinaryColumnDeltaByteArray/null_probability:0/unique_values:-1 1.101
GiB/sec 1.174 GiB/sec 6.557 {'family_index':
16, 'per_family_instance_index': 0, 'run_name':
'BM_ReadBinaryColumnDeltaByteArray/null_probability:0/unique_values:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
BM_DecodeArrowBooleanPlain/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:5000
18.019 MiB/sec 19.100 MiB/sec 5.997 {'family_index': 33,
'per_family_instance_index': 14, 'run_name':
'BM_DecodeArrowBooleanPlain/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:5000',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6295}
BM_ReadBinaryViewColumn/null_probability:0/unique_values:32 893.151 MiB/sec
945.900 MiB/sec 5.906 {'family_index':
15, 'per_family_instance_index': 0, 'run_name':
'BM_ReadBinaryViewColumn/null_probability:0/unique_values:32', 'repetitions':
1, 'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_DecodeArrowBooleanPlain/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:1000
20.243 MiB/sec 21.404 MiB/sec 5.733 {'family_index': 33,
'per_family_instance_index': 10, 'run_name':
'BM_DecodeArrowBooleanPlain/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:1000',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7257}
BM_ReadBinaryColumnDeltaByteArray/null_probability:50/unique_values:-1 620.583
MiB/sec 655.859 MiB/sec 5.684 {'family_index':
16, 'per_family_instance_index': 2, 'run_name':
'BM_ReadBinaryColumnDeltaByteArray/null_probability:50/unique_values:-1',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_ReadBinaryColumn/null_probability:0/unique_values:32 751.375 MiB/sec
793.728 MiB/sec 5.637
{'family_index': 14, 'per_family_instance_index': 0, 'run_name':
'BM_ReadBinaryColumn/null_probability:0/unique_values:32', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_ReadBinaryColumn/null_probability:50/unique_values:32 537.693 MiB/sec
567.159 MiB/sec 5.480
{'family_index': 14, 'per_family_instance_index': 3, 'run_name':
'BM_ReadBinaryColumn/null_probability:50/unique_values:32', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 3}
BM_DecodeArrowBooleanPlain/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:100
44.112 MiB/sec 46.474 MiB/sec 5.355 {'family_index': 33,
'per_family_instance_index': 6, 'run_name':
'BM_DecodeArrowBooleanPlain/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:100',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 15273}
BM_DecodeArrowBooleanRle/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:1000
20.750 MiB/sec 21.843 MiB/sec 5.265 {'family_index': 30,
'per_family_instance_index': 10, 'run_name':
'BM_DecodeArrowBooleanRle/DecodeArrowWithNull/num_values:16384/null_in_ten_thousand:1000',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7387}
BM_ReadColumn<false,Int32Type>/-1/10 7.621 GiB/sec 8.019 GiB/sec
5.223
{'family_index': 0, 'per_family_instance_index': 1, 'run_name':
'BM_ReadColumn<false,Int32Type>/-1/10', 'repetitions': 1, 'repetition_index':
0, 'threads': 1, 'iterations': 137}
[ ... snip non-significant changes ... ]
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Regressions: (4)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
benchmark baseline
contender change %
counters
BM_ReadListColumn/99 1.452 GiB/sec
1.379 GiB/sec -5.006 {'family_index': 21,
'per_family_instance_index': 3, 'run_name': 'BM_ReadListColumn/99',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 129}
BM_ArrowBinaryViewDict/DecodeArrowNonNull_Dense/1024 270.542 MiB/sec
256.345 MiB/sec -5.248 {'family_index': 27, 'per_family_instance_index': 0,
'run_name': 'BM_ArrowBinaryViewDict/DecodeArrowNonNull_Dense/1024',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 32060}
BM_ArrowBinaryPlain/DecodeArrow_Dict/65536 172.371 MiB/sec
162.455 MiB/sec -5.753 {'family_index': 18,
'per_family_instance_index': 3, 'run_name':
'BM_ArrowBinaryPlain/DecodeArrow_Dict/65536', 'repetitions': 1,
'repetition_index': 0, 'threads': 1, 'iterations': 319}
BM_ArrowBinaryPlain/DecodeArrowNonNull_Dict/1024 189.008 MiB/sec
176.900 MiB/sec -6.406 {'family_index': 19, 'per_family_instance_index':
0, 'run_name': 'BM_ArrowBinaryPlain/DecodeArrowNonNull_Dict/1024',
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 22292}
```
### Are these changes tested?
By existing tests.
### Are there any user-facing changes?
No.
* GitHub Issue: #47012
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
ci/scripts/cpp_test.sh | 3 +++
cpp/build-support/run-test.sh | 1 +
cpp/src/parquet/column_reader.cc | 14 +++++++++++++-
3 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
index 60d9dd0a3b..88c06849c8 100755
--- a/ci/scripts/cpp_test.sh
+++ b/ci/scripts/cpp_test.sh
@@ -126,6 +126,9 @@ fi
if [ "${ARROW_FUZZING}" == "ON" ]; then
# Fuzzing regression tests
+ # Some fuzz regression files may trigger huge memory allocations,
+ # let the allocator return null instead of aborting.
+ export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1"
"${binary_output_dir}/arrow-ipc-stream-fuzz"
"${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-*
"${binary_output_dir}/arrow-ipc-stream-fuzz"
"${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-*
"${binary_output_dir}/arrow-ipc-file-fuzz"
"${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-*
diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh
index f176586a0f..3e3034a3c8 100755
--- a/cpp/build-support/run-test.sh
+++ b/cpp/build-support/run-test.sh
@@ -77,6 +77,7 @@ function setup_sanitizers() {
# Set up suppressions for AddressSanitizer
ASAN_OPTIONS="$ASAN_OPTIONS
suppressions=$ROOT/build-support/asan-suppressions.txt"
+ ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1"
export ASAN_OPTIONS
# Set up suppressions for LeakSanitizer
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 1e681d8e90..eb9df9f2f4 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1684,7 +1684,7 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
}
}
- void ReserveValues(int64_t extra_values) {
+ virtual void ReserveValues(int64_t extra_values) {
const int64_t new_values_capacity =
UpdateCapacity(values_capacity_, values_written_, extra_values);
if (new_values_capacity > values_capacity_) {
@@ -1968,6 +1968,12 @@ class FLBARecordReader final : public
TypedRecordReader<FLBAType>,
return ::arrow::ArrayVector{std::move(chunk)};
}
+ void ReserveValues(int64_t extra_values) override {
+ ARROW_DCHECK(!uses_values_);
+ TypedRecordReader::ReserveValues(extra_values);
+ PARQUET_THROW_NOT_OK(array_builder_.Reserve(extra_values));
+ }
+
void ReadValuesDense(int64_t values_to_read) override {
int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
static_cast<int>(values_to_read), &array_builder_);
@@ -2042,6 +2048,12 @@ class ByteArrayChunkedRecordReader final : public
TypedRecordReader<ByteArrayTyp
return result;
}
+ void ReserveValues(int64_t extra_values) override {
+ ARROW_DCHECK(!uses_values_);
+ TypedRecordReader::ReserveValues(extra_values);
+ PARQUET_THROW_NOT_OK(accumulator_.builder->Reserve(extra_values));
+ }
+
void ReadValuesDense(int64_t values_to_read) override {
int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
static_cast<int>(values_to_read), &accumulator_);