scott-routledge2 commented on code in PR #43661:
URL: https://github.com/apache/arrow/pull/43661#discussion_r2380199206


##########
cpp/src/arrow/dataset/parquet_scan_benchmark.cc:
##########
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/testing/gtest_util.h"
+#include "benchmark/benchmark.h"
+
+#include "arrow/api.h"
+#include "arrow/compute/initialize.h"
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/file_parquet.h"
+#include "arrow/dataset/scanner.h"
+#include "arrow/io/memory.h"
+#include "parquet/arrow/writer.h"
+
+namespace arrow {
+namespace dataset {
+
+using parquet::arrow::WriteTable;
+
+// Serializes a one-column table to Parquet in memory and returns the resulting buffer.
+//
+// The column is named "my_string_col", has type utf8, and holds `nrows` values of the
+// form "row_<i>" for i in [0, nrows). Any failing Arrow/Parquet call is propagated as
+// the error of the returned Result.
+Result<std::shared_ptr<Buffer>> WriteStringColParquetBuffer(int64_t nrows) {
+  // Build the string values first: "row_0", "row_1", ...
+  arrow::StringBuilder string_builder;
+  int64_t row = 0;
+  while (row < nrows) {
+    ARROW_RETURN_NOT_OK(string_builder.Append("row_" + std::to_string(row)));
+    ++row;
+  }
+  std::shared_ptr<arrow::Array> values;
+  ARROW_RETURN_NOT_OK(string_builder.Finish(&values));
+
+  // Wrap the single array in a table with the matching one-field schema.
+  auto string_schema = arrow::schema({arrow::field("my_string_col", arrow::utf8())});
+  auto string_table = arrow::Table::Make(string_schema, {values});
+
+  // Write the table into a growable in-memory stream and hand back its contents.
+  ARROW_ASSIGN_OR_RAISE(auto out_stream, arrow::io::BufferOutputStream::Create());
+  ARROW_RETURN_NOT_OK(WriteTable(*string_table, arrow::default_memory_pool(), out_stream));
+  ARROW_ASSIGN_OR_RAISE(auto parquet_buffer, out_stream->Finish());
+  return parquet_buffer;
+}
+
+// Benchmarks Scanner::ToTable() over an in-memory Parquet fragment whose single String
+// column has to be cast to the LargeString type declared in the dataset schema.
+//
+// All setup (writing the Parquet buffer, building the dataset and the scanner) happens
+// once, before the timed loop; each benchmark iteration runs one ToTable() call with
+// threading disabled. Throughput is reported as rows ("items") processed.
+static void ParquetScanToTableCastStrings(benchmark::State& state) {
+  // GH-43660: Scan parquet data including a String column using a dataset object with
+  // LargeString in schema.
+  const int64_t nrows = 100'000;
+  const int64_t batch_size = 100;
+  const bool use_threads = false;
+  auto format = std::make_shared<ParquetFileFormat>();
+
+  // Create a buffer with a single String column and wrap with FileFragment
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> buffer,
+                       WriteStringColParquetBuffer(nrows));
+  auto buffer_reader = std::make_shared<arrow::io::BufferReader>(buffer);
+  FileSource source(buffer_reader, buffer->size());
+  ASSERT_OK_AND_ASSIGN(auto fragment, format->MakeFragment(source));
+  std::vector<std::shared_ptr<FileFragment>> fragments{fragment};
+
+  // Create a dataset from FileFragment and set schema to LargeString (require casting).
+  auto schema = arrow::schema({field("my_string_col", arrow::large_utf8())});
+  ASSERT_OK_AND_ASSIGN(auto dataset, FileSystemDataset::Make(
+                                         schema, compute::literal(true), format,
+                                         /*filesystem=*/nullptr, std::move(fragments)));
+
+  ASSERT_OK_AND_ASSIGN(auto builder, dataset->NewScan());
+  ASSERT_OK(builder->BatchSize(batch_size));
+  ASSERT_OK(builder->UseThreads(use_threads));
+  ASSERT_OK_AND_ASSIGN(auto scanner, builder->Finish());
+
+  // Timed region: one ToTable() call per iteration on the scanner built above.
+  for (auto _ : state) {
+    ASSERT_OK_AND_ASSIGN(auto table, scanner->ToTable());
+    // Keep the result observable so the scan cannot be optimized away.
+    benchmark::DoNotOptimize(table);
+  }
+
+  // Every iteration scans all `nrows` rows.
+  state.SetItemsProcessed(state.iterations() * nrows);
+}
+
+BENCHMARK(ParquetScanToTableCastStrings);  // registered with no extra arguments
+
+}  // namespace dataset

Review Comment:
   Local results (comparing to main):
   ```
   ------------------------------------------------------------------------------------------------------------------------
   Non-regressions: (20)
   ------------------------------------------------------------------------------------------------------------------------
   benchmark  baseline  contender  change %  counters
   ParquetScanToTableCastStrings/num_batches:1000/batch_size:1000  1.068M items/sec  10.574M items/sec  890.391  {'family_index': 0, 'per_family_instance_index': 2, 'run_name': 'ParquetScanToTableCastStrings/num_batches:1000/batch_size:1000', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1}
   ParquetScanToTableCastStrings/num_batches:1000/batch_size:100  761.915K items/sec  1.065M items/sec  39.760  {'family_index': 0, 'per_family_instance_index': 1, 'run_name': 'ParquetScanToTableCastStrings/num_batches:1000/batch_size:100', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
   ParquetScanToTableCastStrings/num_batches:1000/batch_size:10  86.631K items/sec  107.530K items/sec  24.124  {'family_index': 0, 'per_family_instance_index': 0, 'run_name': 'ParquetScanToTableCastStrings/num_batches:1000/batch_size:10', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 5}
   ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time  41.041 MiB/sec  49.586 MiB/sec  20.820  {'family_index': 1, 'per_family_instance_index': 4, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6}
   ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:0/real_time  457.346 KiB/sec  518.384 KiB/sec  13.346  {'family_index': 1, 'per_family_instance_index': 0, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6}
   ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:0/real_time  4.421 MiB/sec  5.011 MiB/sec  13.339  {'family_index': 1, 'per_family_instance_index': 2, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6}
   ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:1/real_time  634.336 KiB/sec  718.653 KiB/sec  13.292  {'family_index': 1, 'per_family_instance_index': 1, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:10/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
   MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:0/real_time  4.563 MiB/sec  5.089 MiB/sec  11.535  {'family_index': 0, 'per_family_instance_index': 2, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7}
   MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:1/real_time  624.277 KiB/sec  680.752 KiB/sec  9.047  {'family_index': 0, 'per_family_instance_index': 1, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
   MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:0/real_time  483.179 KiB/sec  522.102 KiB/sec  8.055  {'family_index': 0, 'per_family_instance_index': 0, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:10/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7}
   ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:1/real_time  6.578 MiB/sec  7.080 MiB/sec  7.645  {'family_index': 1, 'per_family_instance_index': 3, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:100/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
   MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time  46.819 MiB/sec  49.794 MiB/sec  6.353  {'family_index': 0, 'per_family_instance_index': 4, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:0/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7}
   ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time  67.173 MiB/sec  70.629 MiB/sec  5.145  {'family_index': 1, 'per_family_instance_index': 5, 'run_name': 'ScanOnlyBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
   GetFilteredFragments/single_file  1.501M items/sec  1.557M items/sec  3.709  {'family_index': 3, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/single_file', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 105, 'num_filtered_fragments': 1.0, 'num_fragments': 10000.0}
   MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:1/real_time  6.327 MiB/sec  6.481 MiB/sec  2.430  {'family_index': 0, 'per_family_instance_index': 3, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:100/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
   GetFilteredFragments/range  229.364K items/sec  232.693K items/sec  1.451  {'family_index': 4, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/range', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 16, 'num_filtered_fragments': 9800.0, 'num_fragments': 10000.0}
   MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time  61.721 MiB/sec  62.435 MiB/sec  1.156  {'family_index': 0, 'per_family_instance_index': 5, 'run_name': 'MinimalEndToEndBench/num_batches:1000/batch_size:1000/scan_alg:1/real_time', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9}
   GetFilteredFragments/single_dir  3.419M items/sec  3.444M items/sec  0.733  {'family_index': 1, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/single_dir', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 236, 'num_filtered_fragments': 100.0, 'num_fragments': 10000.0}
   GetFilteredFragments/multi_dir  40.060K items/sec  40.215K items/sec  0.386  {'family_index': 2, 'per_family_instance_index': 0, 'run_name': 'GetFilteredFragments/multi_dir', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 3, 'num_filtered_fragments': 100.0, 'num_fragments': 10000.0}
   GetAllFragments  35.682M items/sec  35.433M items/sec  -0.698  {'family_index': 0, 'per_family_instance_index': 0, 'run_name': 'GetAllFragments', 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 2282, 'num_fragments': 10000.0}
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to