scott-routledge2 commented on code in PR #43661: URL: https://github.com/apache/arrow/pull/43661#discussion_r2380199206
########## cpp/src/arrow/dataset/parquet_scan_benchmark.cc: ########## @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/testing/gtest_util.h" +#include "benchmark/benchmark.h" + +#include "arrow/api.h" +#include "arrow/compute/initialize.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_parquet.h" +#include "arrow/dataset/scanner.h" +#include "arrow/io/memory.h" +#include "parquet/arrow/writer.h" + +namespace arrow { +namespace dataset { + +using parquet::arrow::WriteTable; + +Result<std::shared_ptr<Buffer>> WriteStringColParquetBuffer(int64_t nrows) { + auto schema = arrow::schema({arrow::field("my_string_col", arrow::utf8())}); + + arrow::StringBuilder builder; + for (int64_t i = 0; i < nrows; i++) { + ARROW_RETURN_NOT_OK(builder.Append("row_" + std::to_string(i))); + } + std::shared_ptr<arrow::Array> arr; + ARROW_RETURN_NOT_OK(builder.Finish(&arr)); + auto table = arrow::Table::Make(schema, {arr}); + + ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create()); + ARROW_RETURN_NOT_OK(WriteTable(*table, arrow::default_memory_pool(), sink)); + return sink->Finish(); +} + +static void ParquetScanToTableCastStrings(benchmark::State& state) { + // 
GH-43660: Scan parquet data including a String column using a dataset object with + // LargeString in schema. + int64_t nrows = 100'000; + int64_t batch_size = 100; + bool use_threads = false; + auto format = std::make_shared<ParquetFileFormat>(); + + // Create a buffer with a single String column and wrap with FileFragment + ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> buffer, + WriteStringColParquetBuffer(nrows)); + auto buffer_reader = std::make_shared<arrow::io::BufferReader>(buffer); + FileSource source(buffer_reader, buffer->size()); + ASSERT_OK_AND_ASSIGN(auto fragment, format->MakeFragment(source)); + std::vector<std::shared_ptr<FileFragment>> fragments{fragment}; + + // Create a dataset from FileFragment and set schema to LargeString (require casting). + auto schema = arrow::schema({field("my_string_col", arrow::large_utf8())}); + ASSERT_OK_AND_ASSIGN(auto dataset, FileSystemDataset::Make( + schema, compute::literal(true), format, + /*filesystem=*/nullptr, std::move(fragments))); + + ASSERT_OK_AND_ASSIGN(auto builder, dataset->NewScan()); + ASSERT_OK(builder->BatchSize(batch_size)); + ASSERT_OK(builder->UseThreads(use_threads)); + ASSERT_OK_AND_ASSIGN(auto scanner, builder->Finish()); + + for (auto _ : state) { + ASSERT_OK_AND_ASSIGN(auto table, scanner->ToTable()); + benchmark::DoNotOptimize(table); + } + + state.SetItemsProcessed(state.iterations() * nrows); +} + +BENCHMARK(ParquetScanToTableCastStrings); + +} // namespace dataset Review Comment: Local results (comparing to main): ``` ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Non-regressions: (20) 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ benchmark baseline contender change % counters ParquetScanToTableCastStrings/num_batches:1000/batch_size:1000 1.068M items/sec 10.574M items/sec 890.391 {'family_index': 0, 'per_family_instance_index': 2, 'run_name': 'ParquetScanToTableCastStrings/num_batches:1000/batch_size:1000', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1} ParquetScanToTableCastStrings/num_batches:1000/batch_size:100 761.915K items/sec 1.065M items/sec 39.760 {'family_index': 0, 'per_family_instance_index': 1, 'run_name': 'ParquetScanToTableCastStrings/num_batches:1000/batch_size:100', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4} ParquetScanToTableCastStrings/num_batches:1000/batch_size:10 86.631K items/sec 107.530K items/sec 24.124 {'family_index': 0, 'per_family_instance_index': 0, 'run_name': 'ParquetScanToTableCastStrings/num_batches:1000/batch_size:10', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 5} ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time 41.041 MiB/sec 49.586 MiB/sec 20.820 {'family_index': 1, 'per_family_instance_index': 4, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6} ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:0/real_time 457.346 KiB/sec 518.384 KiB/sec 13.346 {'family_index': 1, 'per_family_instance_index': 0, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6} 
ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:0/real_time 4.421 MiB/sec 5.011 MiB/sec 13.339 {'family_index': 1, 'per_family_instance_index': 2, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6} ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:1/real_time 634.336 KiB/sec 718.653 KiB/sec 13.292 {'family_index': 1, 'per_family_instance_index': 1, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9} MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:0/real_time 4.563 MiB/sec 5.089 MiB/sec 11.535 {'family_index': 0, 'per_family_instance_index': 2, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7} MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:1/real_time 624.277 KiB/sec 680.752 KiB/sec 9.047 {'family_index': 0, 'per_family_instance_index': 1, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9} MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:0/real_time 483.179 KiB/sec 522.102 KiB/sec 8.055 {'family_index': 0, 'per_family_instance_index': 0, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7} ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:1/real_time 6.578 MiB/sec 7.080 MiB/sec 7.645 {'family_index': 1, 'per_family_instance_index': 3, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9} MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time 46.819 MiB/sec 49.794 MiB/sec 
6.353 {'family_index': 0, 'per_family_instance_index': 4, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7} ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time 67.173 MiB/sec 70.629 MiB/sec 5.145 {'family_index': 1, 'per_family_instance_index': 5, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9} GetFilteredFragments/single_file 1.501M items/sec 1.557M items/sec 3.709 {'family_index': 3, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/single_file', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 105, 'num_filtered_fragments': 1.0, 'num_fragments': 10000.0} MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:1/real_time 6.327 MiB/sec 6.481 MiB/sec 2.430 {'family_index': 0, 'per_family_instance_index': 3, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9} GetFilteredFragments/range 229.364K items/sec 232.693K items/sec 1.451 {'family_index': 4, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/range', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 16, 'num_filtered_fragments': 9800.0, 'num_fragments': 10000.0} MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time 61.721 MiB/sec 62.435 MiB/sec 1.156 {'family_index': 0, 'per_family_instance_index': 5, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9} GetFilteredFragments/single_dir 3.419M items/sec 3.444M items/sec 0.733 {'family_index': 1, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/single_dir', 'repetitions': 1, 'repetition_index': 0, 
'threads': 1, 'iterations': 236, 'num_filtered_fragments': 100.0, 'num_fragments': 10000.0} GetFilteredFragments/multi_dir 40.060K items/sec 40.215K items/sec 0.386 {'family_index': 2, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/multi_dir', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 3, 'num_filtered_fragments': 100.0, 'num_fragments': 10000.0} GetAllFragments 35.682M items/sec 35.433M items/sec -0.698 {'family_index': 0, 'per_family_instance_index': 0, 'run_name': 'GetAllFragments', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 2282, 'num_fragments': 10000.0} ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
