This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new dc501147 chore(dev/benchmarks): Add benchmarks for `ArrowArrayAppend()` (#401)
dc501147 is described below

commit dc50114756b7e9067b42181a6a86f928effc6e68
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Mar 18 09:28:33 2024 -0300

    chore(dev/benchmarks): Add benchmarks for `ArrowArrayAppend()` (#401)
    
    This PR adds a set of benchmarks for building arrays using
    `ArrowArrayAppendXXX()` and adds a few missing ones for `ArrowArrayView`
    like `ArrowArrayViewGetString()`.
    
    (Report output in details)
    
    <details>
    
    # Benchmark Report
    
    ## Configurations
    
    These benchmarks were run with the following configurations:
    
    | preset_name | preset_description                                   |
    |:------------|:-----------------------------------------------------|
    | local       | Uses the nanoarrow C sources from this checkout.     |
    | v0.4.0      | Uses the nanoarrow C sources from the 0.4.0 release. |
    
    ## Summary
    
    A quick and dirty summary of benchmark results between this checkout and
    the last released version.
    
    | benchmark_label | v0.4.0 | local | change | pct_change |
    |:----------------------------------------------------------|---------:|---------:|-------:|-----------:|
    | [ArrayAppendInt16](#arrayappendint16) | 2.68ms | 2.66ms | 1ns | -0.9% |
    | [ArrayAppendInt32](#arrayappendint32) | 3.12ms | 3.08ms | 1ns | -1.3% |
    | [ArrayAppendInt64](#arrayappendint64) | 3.79ms | 3.47ms | 1ns | -8.4% |
    | [ArrayAppendInt8](#arrayappendint8) | 2.39ms | 2.38ms | 1ns | -0.1% |
    | [ArrayAppendNulls](#arrayappendnulls) | 12.05ms | 12.04ms | 1ns | -0.1% |
    | [ArrayAppendString](#arrayappendstring) | 8.96ms | 8.67ms | 1ns | -3.2% |
    | [ArrayViewGetInt16](#arrayviewgetint16) | 628.79µs | 627.1µs | 1ns | -0.3% |
    | [ArrayViewGetInt32](#arrayviewgetint32) | 634.21µs | 625.86µs | 1ns | -1.3% |
    | [ArrayViewGetInt64](#arrayviewgetint64) | 672.81µs | 676.99µs | 4.18µs | 0.6% |
    | [ArrayViewGetInt8](#arrayviewgetint8) | 783.55µs | 784.61µs | 1.05µs | 0.1% |
    | [ArrayViewGetString](#arrayviewgetstring) | 1.26ms | 1.25ms | 1ns | -0.4% |
    | [ArrayViewIsNull](#arrayviewisnull) | 1.21ms | 1.19ms | 1ns | -1.8% |
    | [ArrayViewIsNullNonNullable](#arrayviewisnullnonnullable) | 938.36µs | 940.65µs | 2.28µs | 0.2% |
    | [SchemaInitWideStruct](#schemainitwidestruct) | 1.02ms | 1.02ms | 1ns | -0.2% |
    | [SchemaViewInitWideStruct](#schemaviewinitwidestruct) | 103.62µs | 103.53µs | 1ns | -0.1% |
    
    ## ArrowArray-related benchmarks
    
    Benchmarks for producing ArrowArrays using the ArrowArrayXXX()
    functions.
    
    ### ArrayAppendString
    
    Use ArrowArrayAppendString() to build a string array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L288-L315)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |         83 |    8.67ms |   8.64ms |      115,712,019 |
    | v0.4.0      |         77 |    8.96ms |   8.81ms |      113,455,364 |
    
    ### ArrayAppendInt8
    
    Use ArrowArrayAppendInt() to build an int8 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L339-L341)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        292 |    2.38ms |   2.38ms |      420,186,810 |
    | v0.4.0      |        296 |    2.39ms |   2.38ms |      419,740,272 |
    
    ### ArrayAppendInt16
    
    Use ArrowArrayAppendInt() to build an int16 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L344-L346)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        264 |    2.66ms |   2.66ms |      376,369,150 |
    | v0.4.0      |        261 |    2.68ms |   2.68ms |      373,079,925 |
    
    ### ArrayAppendInt32
    
    Use ArrowArrayAppendInt() to build an int32 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L349-L351)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        228 |    3.08ms |   3.08ms |      324,738,215 |
    | v0.4.0      |        225 |    3.12ms |   3.12ms |      320,760,473 |
    
    ### ArrayAppendInt64
    
    Use ArrowArrayAppendInt() to build an int64 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L354-L356)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        206 |    3.47ms |   3.46ms |      289,089,536 |
    | v0.4.0      |        186 |    3.79ms |   3.77ms |      265,070,543 |
    
    ### ArrayAppendNulls
    
    Use ArrowArrayAppendNulls() to build an int32 array that contains 80%
    null values.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L379-L401)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |         59 |      12ms |     12ms |       83,199,603 |
    | v0.4.0      |         58 |      12ms |     12ms |       83,135,409 |
    
    ## ArrowArrayView-related benchmarks
    
    Benchmarks for consuming ArrowArrays using the ArrowArrayViewXXX()
    functions.
    
    ### ArrayViewGetInt8
    
    Use ArrowArrayViewGet() to consume an int8 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L118-L120)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        893 |     785µs |    784µs |    1,276,321,450 |
    | v0.4.0      |        894 |     784µs |    782µs |    1,278,021,040 |
    
    ### ArrayViewGetInt16
    
    Use ArrowArrayViewGet() to consume an int16 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L123-L125)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |       1114 |     627µs |    626µs |    1,597,100,560 |
    | v0.4.0      |       1115 |     629µs |    628µs |    1,593,178,054 |
    
    ### ArrayViewGetInt32
    
    Use ArrowArrayViewGet() to consume an int32 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L128-L130)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |       1115 |     626µs |    625µs |    1,600,061,993 |
    | v0.4.0      |       1114 |     634µs |    633µs |    1,580,536,418 |
    
    ### ArrayViewGetInt64
    
    Use ArrowArrayViewGet() to consume an int64 array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L133-L135)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |       1023 |     677µs |    676µs |    1,480,375,260 |
    | v0.4.0      |       1018 |     673µs |    671µs |    1,490,177,709 |
    
    ### ArrayViewIsNullNonNullable
    
    Use ArrowArrayViewIsNull() to check for nulls while consuming an int32
    array that does not contain a validity buffer.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L139-L168)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        746 |     941µs |    940µs |    1,064,112,037 |
    | v0.4.0      |        745 |     938µs |    937µs |    1,066,931,705 |
    
    ### ArrayViewIsNull
    
    Use ArrowArrayViewIsNull() to check for nulls while consuming an int32
    array that contains 20% nulls.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L172-L211)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        588 |    1.19ms |   1.19ms |      842,447,913 |
    | v0.4.0      |        588 |    1.21ms |    1.2ms |      830,223,525 |
    
    ### ArrayViewGetString
    
    Use ArrowArrayViewGetStringUnsafe() to consume a string array.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/array_benchmark.cc#L214-L245)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        557 |    1.25ms |   1.25ms |      800,060,902 |
    | v0.4.0      |        546 |    1.26ms |   1.25ms |      797,048,875 |
    
    ## Schema-related benchmarks
    
    Benchmarks for producing and consuming ArrowSchema.
    
    ### SchemaInitWideStruct
    
    Benchmark ArrowSchema creation for very wide tables.
    
    Simulates part of the process of creating a very wide table with a
    simple column type (integer).
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/schema_benchmark.cc#L45-L56)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |        690 |    1.02ms |   1.02ms |        9,843,783 |
    | v0.4.0      |        683 |    1.02ms |   1.02ms |        9,831,837 |
    
    ### SchemaViewInitWideStruct
    
    Benchmark ArrowSchema parsing for very wide tables.
    
    Simulates part of the process of consuming a very wide table. Typically
    the ArrowSchemaViewInit() is done by ArrowArrayViewInit() but uses a
    similar pattern.
    
    [View Source](https://github.com/paleolimbot/arrow-nanoarrow/blob/benchmarks-read-create/dev/benchmarks/c/schema_benchmark.cc#L78-L91)
    
    | preset_name | iterations | real_time | cpu_time | items_per_second |
    |:------------|-----------:|----------:|---------:|-----------------:|
    | local       |       6772 |     104µs |    103µs |       96,669,664 |
    | v0.4.0      |       6749 |     104µs |    103µs |       96,625,343 |
    
    
    </details>
    
    ---------
    
    Co-authored-by: Benjamin Kietzman <[email protected]>
---
 dev/benchmarks/CMakeLists.txt       |   2 +-
 dev/benchmarks/benchmark-report.qmd |   4 +-
 dev/benchmarks/c/array_benchmark.cc | 401 ++++++++++++++++++++++++++++++------
 3 files changed, 345 insertions(+), 62 deletions(-)

diff --git a/dev/benchmarks/CMakeLists.txt b/dev/benchmarks/CMakeLists.txt
index 0d628aec..9064903d 100644
--- a/dev/benchmarks/CMakeLists.txt
+++ b/dev/benchmarks/CMakeLists.txt
@@ -55,7 +55,7 @@ fetchcontent_makeavailable(benchmark)
 if(IS_DIRECTORY "${NANOARROW_BENCHMARK_SOURCE_URL}")
   fetchcontent_declare(nanoarrow SOURCE_DIR 
"${NANOARROW_BENCHMARK_SOURCE_URL}")
   fetchcontent_makeavailable(nanoarrow)
-elseif(NOT NANOARROW_BENCHMARK_SOURCE_URL STREQUAL "")
+elseif(NOT "${NANOARROW_BENCHMARK_SOURCE_URL}" STREQUAL "")
   fetchcontent_declare(nanoarrow URL "${NANOARROW_BENCHMARK_SOURCE_URL}")
   fetchcontent_makeavailable(nanoarrow)
 endif()
diff --git a/dev/benchmarks/benchmark-report.qmd 
b/dev/benchmarks/benchmark-report.qmd
index 23fd8edc..f33be3cd 100644
--- a/dev/benchmarks/benchmark-report.qmd
+++ b/dev/benchmarks/benchmark-report.qmd
@@ -17,7 +17,9 @@
 # under the License.
 
 title: "Benchmark Report"
-format: gfm
+format:
+  gfm:
+    wrap: none
 ---
 
 ```{r setup, include=FALSE}
diff --git a/dev/benchmarks/c/array_benchmark.cc 
b/dev/benchmarks/c/array_benchmark.cc
index 33d9b26b..d81b80d4 100644
--- a/dev/benchmarks/c/array_benchmark.cc
+++ b/dev/benchmarks/c/array_benchmark.cc
@@ -19,23 +19,38 @@
 
 #include <nanoarrow/nanoarrow.hpp>
 
+// The length of most arrays used in these benchmarks. Just big enough so
+// that the benchmark takes a non-trivial amount of time to run.
+static const int64_t kNumItemsPrettyBig = 1000000;
+
+// Used to generate string/binary arrays
+static const std::string kAlphabet =
+    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
 /// \defgroup nanoarrow-benchmark-array-view ArrowArrayView-related benchmarks
 ///
-/// Benchmarks for consuming ArrowArrays using the ArrowArrayViewXXX() 
functions.
+/// Benchmarks for consuming ArrowArrays using the `ArrowArrayViewXXX()` 
functions.
 ///
 /// @{
 
-// Utility for building primitive arrays
-template <typename CType, ArrowType type>
-ArrowErrorCode InitSchemaAndArrayPrimitive(ArrowSchema* schema, ArrowArray* 
array,
-                                           std::vector<CType> values,
-                                           std::vector<int8_t> validity = {}) {
-  NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, type));
-  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array, schema, nullptr));
+// Helper to initialize an ArrowArrayView from buffers. The ArrowArray isn't 
used
+// by the benchmark but is needed to hold the memory.
+template <typename Buffer1T, typename Buffer2T = int8_t>
+ArrowErrorCode InitArrayViewFromBuffers(ArrowType type, ArrowArray* array,
+                                        ArrowArrayView* array_view,
+                                        std::vector<int8_t> validity,
+                                        std::vector<Buffer1T> buffer1,
+                                        std::vector<Buffer2T> buffer2 = {}) {
+  // Initialize arrays
+  nanoarrow::UniqueSchema schema;
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema.get(), type));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array, schema.get(), 
nullptr));
 
-  // Set the data buffer
-  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(ArrowArrayBuffer(array, 1), 
values.data(),
-                                            values.size() * sizeof(CType)));
+  // Initialize buffers
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(ArrowArrayBuffer(array, 1), 
buffer1.data(),
+                                            buffer1.size() * 
sizeof(Buffer1T)));
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(ArrowArrayBuffer(array, 2), 
buffer2.data(),
+                                            buffer2.size() * 
sizeof(Buffer2T)));
 
   // Pack the validity bitmap
   if (validity.size() > 0) {
@@ -44,93 +59,359 @@ ArrowErrorCode InitSchemaAndArrayPrimitive(ArrowSchema* 
schema, ArrowArray* arra
     ArrowBitmapAppendInt8Unsafe(validity_bitmap, validity.data(), 
validity.size());
   }
 
+  // Set the length
+  switch (type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_BINARY:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (buffer1.size() > 0) {
+        array->length = buffer1.size() - 1;
+      } else {
+        array->length = 0;
+      }
+      break;
+
+    default:
+      array->length = buffer1.size();
+      break;
+  }
+
+  // Set the null count
+  if (validity.size() > 0) {
+    array->null_count = array->length - 
ArrowBitCountSet(ArrowArrayBuffer(array, 0)->data,
+                                                         0, array->length);
+  } else {
+    array->null_count = 0;
+  }
+
   NANOARROW_RETURN_NOT_OK(ArrowArrayFinishBuildingDefault(array, nullptr));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowArrayViewInitFromSchema(array_view, schema.get(), nullptr));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(array_view, array, nullptr));
   return NANOARROW_OK;
 }
 
 template <typename CType, ArrowType type>
-static void BaseArrayViewGetIntUnsafe(benchmark::State& state, double 
prop_null = 0.0) {
-  nanoarrow::UniqueSchema schema;
+static void BaseArrayViewGetInt(benchmark::State& state) {
   nanoarrow::UniqueArray array;
   nanoarrow::UniqueArrayView array_view;
 
-  int64_t n_values = 1000000;
+  int64_t n_values = kNumItemsPrettyBig;
 
   std::vector<CType> values(n_values);
   for (int64_t i = 0; i < n_values; i++) {
     values[i] = i % std::numeric_limits<CType>::max();
   }
 
-  std::vector<int8_t> validity;
+  NANOARROW_THROW_NOT_OK(
+      InitArrayViewFromBuffers(type, array.get(), array_view.get(), {}, 
values));
 
-  if (prop_null > 0) {
-    int64_t num_nulls = n_values * prop_null;
-    int64_t null_spacing = n_values / num_nulls;
-    validity.resize(n_values);
+  std::vector<CType> values_out(n_values);
+  for (auto _ : state) {
     for (int64_t i = 0; i < n_values; i++) {
-      validity[i] = i % null_spacing != 0;
+      values_out[i] = ArrowArrayViewGetIntUnsafe(array_view.get(), i);
     }
+    benchmark::DoNotOptimize(values_out);
   }
 
-  int code = InitSchemaAndArrayPrimitive<CType, type>(
-      schema.get(), array.get(), std::move(values), std::move(validity));
-  NANOARROW_THROW_NOT_OK(code);
-  NANOARROW_THROW_NOT_OK(
-      ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
-  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(array_view.get(), array.get(), 
nullptr));
+  state.SetItemsProcessed(n_values * state.iterations());
+}
 
-  std::vector<CType> values_out(n_values);
+/// \brief Use ArrowArrayViewGet() to consume an int8 array
+static void BenchmarkArrayViewGetInt8(benchmark::State& state) {
+  BaseArrayViewGetInt<int8_t, NANOARROW_TYPE_INT8>(state);
+}
+
+/// \brief Use ArrowArrayViewGet() to consume an int16 array
+static void BenchmarkArrayViewGetInt16(benchmark::State& state) {
+  BaseArrayViewGetInt<int16_t, NANOARROW_TYPE_INT16>(state);
+}
+
+/// \brief Use ArrowArrayViewGet() to consume an int32 array
+static void BenchmarkArrayViewGetInt32(benchmark::State& state) {
+  BaseArrayViewGetInt<int32_t, NANOARROW_TYPE_INT32>(state);
+}
+
+/// \brief Use ArrowArrayViewGet() to consume an int64 array
+static void BenchmarkArrayViewGetInt64(benchmark::State& state) {
+  BaseArrayViewGetInt<int64_t, NANOARROW_TYPE_INT64>(state);
+}
+
+/// \brief Use ArrowArrayViewIsNull() to check for nulls while consuming an 
int32 array
+/// that does not contain a validity buffer.
+static void BenchmarkArrayViewIsNullNonNullable(benchmark::State& state) {
+  nanoarrow::UniqueArray array;
+  nanoarrow::UniqueArrayView array_view;
+
+  int64_t n_values = kNumItemsPrettyBig;
+
+  // Create values
+  std::vector<int32_t> values(n_values);
+  for (int64_t i = 0; i < n_values; i++) {
+    values[i] = i % 1000;
+  }
+
+  NANOARROW_THROW_NOT_OK(InitArrayViewFromBuffers(NANOARROW_TYPE_INT32, 
array.get(),
+                                                  array_view.get(), {}, 
values));
 
-  if (prop_null > 0) {
-    for (auto _ : state) {
-      for (int64_t i = 0; i < n_values; i++) {
-        if (ArrowArrayViewIsNull(array_view.get(), i)) {
-          values_out[i] = 0;
-        } else {
-          values_out[i] = ArrowArrayViewGetIntUnsafe(array_view.get(), i);
-        }
+  // Read the array
+  std::vector<int32_t> values_out(n_values);
+  for (auto _ : state) {
+    for (int64_t i = 0; i < n_values; i++) {
+      if (ArrowArrayViewIsNull(array_view.get(), i)) {
+        values_out[i] = 0;
+      } else {
+        values_out[i] = ArrowArrayViewGetIntUnsafe(array_view.get(), i);
       }
-      benchmark::DoNotOptimize(values_out);
     }
-  } else {
-    for (auto _ : state) {
-      for (int64_t i = 0; i < n_values; i++) {
+    benchmark::DoNotOptimize(values_out);
+  }
+
+  state.SetItemsProcessed(n_values * state.iterations());
+}
+
+/// \brief Use ArrowArrayViewIsNull() to check for nulls while consuming an 
int32 array
+/// that contains 20% nulls.
+static void BenchmarkArrayViewIsNull(benchmark::State& state) {
+  nanoarrow::UniqueArray array;
+  nanoarrow::UniqueArrayView array_view;
+
+  int64_t n_values = kNumItemsPrettyBig;
+
+  // Create values
+  std::vector<int32_t> values(n_values);
+  for (int64_t i = 0; i < n_values; i++) {
+    values[i] = i % 1000;
+  }
+
+  // Create validity buffer
+  double prop_null = 0.2;
+  int64_t num_nulls = n_values * prop_null;
+  int64_t null_spacing = n_values / num_nulls;
+
+  std::vector<int8_t> validity(n_values);
+  for (int64_t i = 0; i < n_values; i++) {
+    validity[i] = i % null_spacing != 0;
+  }
+
+  NANOARROW_THROW_NOT_OK(InitArrayViewFromBuffers(NANOARROW_TYPE_INT32, 
array.get(),
+                                                  array_view.get(), validity, 
values));
+
+  // Read the array
+  std::vector<int32_t> values_out(n_values);
+  for (auto _ : state) {
+    for (int64_t i = 0; i < n_values; i++) {
+      if (ArrowArrayViewIsNull(array_view.get(), i)) {
+        values_out[i] = 0;
+      } else {
         values_out[i] = ArrowArrayViewGetIntUnsafe(array_view.get(), i);
       }
-      benchmark::DoNotOptimize(values_out);
     }
+    benchmark::DoNotOptimize(values_out);
+  }
+
+  state.SetItemsProcessed(n_values * state.iterations());
+}
+
+/// \brief Use ArrowArrayViewGetStringUnsafe() to consume a string array
+static void BenchmarkArrayViewGetString(benchmark::State& state) {
+  nanoarrow::UniqueArray array;
+  nanoarrow::UniqueArrayView array_view;
+
+  // Create an array of relatively small strings
+  int64_t n_values = kNumItemsPrettyBig;
+  int64_t value_size = 7;
+  std::string alphabet = 
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+  std::vector<int32_t> offsets(n_values + 1);
+  for (int64_t i = 0; i < n_values; i++) {
+    offsets[i + 1] = i * value_size;
+  }
+
+  int64_t n_alphabets = n_values / alphabet.size() + 1;
+  std::vector<char> data(alphabet.size() * n_alphabets);
+  for (int64_t data_pos = 0; data_pos < data.size(); data_pos += 
alphabet.size()) {
+    memcpy(data.data() + data_pos, alphabet.data(), alphabet.size());
+  }
+
+  // Read the array as non-copying views. Possibly less realistic than
+  // what somebody might actually do, but also is a more direct benchmark
+  // of the overhead associated with calling it.
+  std::vector<ArrowStringView> values_out(n_values);
+  for (auto _ : state) {
+    for (int64_t i = 0; i < n_values; i++) {
+      values_out[i] = ArrowArrayViewGetStringUnsafe(array_view.get(), i);
+    }
+    benchmark::DoNotOptimize(values_out);
+  }
+  state.SetItemsProcessed(n_values * state.iterations());
+}
+
+/// @}
+
+/// \defgroup nanoarrow-benchmark-array ArrowArray-related benchmarks
+///
+/// Benchmarks for producing ArrowArrays using the `ArrowArrayXXX()` functions.
+///
+/// @{
+
+template <typename CType, ArrowType type>
+static ArrowErrorCode CreateAndAppendToArrayInt(ArrowArray* array,
+                                                const std::vector<CType>& 
values) {
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(array, type));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array));
+
+  for (int64_t i = 0; i < values.size(); i++) {
+    NANOARROW_RETURN_NOT_OK(ArrowArrayAppendInt(array, values[i]));
+  }
+
+  NANOARROW_RETURN_NOT_OK(ArrowArrayFinishBuildingDefault(array, nullptr));
+  return NANOARROW_OK;
+}
+
+template <ArrowType type>
+static ArrowErrorCode CreateAndAppendToArrayString(
+    ArrowArray* array, const std::vector<std::string>& values) {
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(array, type));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array));
+
+  for (const std::string& s : values) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayAppendString(array, {s.data(), 
static_cast<int64_t>(s.size())}));
+  }
+
+  NANOARROW_RETURN_NOT_OK(ArrowArrayFinishBuildingDefault(array, nullptr));
+  return NANOARROW_OK;
+}
+
+/// \brief Use ArrowArrayAppendString() to build a string array
+static void BenchmarkArrayAppendString(benchmark::State& state) {
+  nanoarrow::UniqueArray array;
+
+  int64_t n_values = kNumItemsPrettyBig;
+  int64_t value_size = 7;
+
+  std::vector<std::string> values(n_values);
+  int64_t alphabet_pos = 0;
+  for (std::string& value : values) {
+    if ((alphabet_pos + value_size) >= kAlphabet.size()) {
+      alphabet_pos = 0;
+    }
+
+    value.assign(kAlphabet.data() + alphabet_pos, value_size);
+    alphabet_pos += value_size;
+  }
+
+  for (auto _ : state) {
+    array.reset();
+    NANOARROW_THROW_NOT_OK(
+        CreateAndAppendToArrayString<NANOARROW_TYPE_STRING>(array.get(), 
values));
+    benchmark::DoNotOptimize(array);
   }
 
   state.SetItemsProcessed(n_values * state.iterations());
 }
 
-/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int8 array
-static void BenchmarkArrayViewGetIntUnsafeInt8(benchmark::State& state) {
-  BaseArrayViewGetIntUnsafe<int8_t, NANOARROW_TYPE_INT8>(state);
+template <typename CType, ArrowType type>
+static void BaseBenchmarkArrayAppendInt(benchmark::State& state) {
+  nanoarrow::UniqueArray array;
+
+  int64_t n_values = kNumItemsPrettyBig;
+
+  std::vector<CType> values(n_values);
+  for (int64_t i = 0; i < n_values; i++) {
+    values[i] = i % std::numeric_limits<CType>::max();
+  }
+
+  for (auto _ : state) {
+    array.reset();
+    int code = CreateAndAppendToArrayInt<CType, type>(array.get(), values);
+    NANOARROW_THROW_NOT_OK(code);
+    benchmark::DoNotOptimize(array);
+  }
+
+  state.SetItemsProcessed(n_values * state.iterations());
+}
+
+/// \brief Use ArrowArrayAppendInt() to build an int8 array
+static void BenchmarkArrayAppendInt8(benchmark::State& state) {
+  BaseBenchmarkArrayAppendInt<int8_t, NANOARROW_TYPE_INT8>(state);
+}
+
+/// \brief Use ArrowArrayAppendInt() to build an int16 array
+static void BenchmarkArrayAppendInt16(benchmark::State& state) {
+  BaseBenchmarkArrayAppendInt<int16_t, NANOARROW_TYPE_INT16>(state);
 }
 
-/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int16 array
-static void BenchmarkArrayViewGetIntUnsafeInt16(benchmark::State& state) {
-  BaseArrayViewGetIntUnsafe<int16_t, NANOARROW_TYPE_INT16>(state);
+/// \brief Use ArrowArrayAppendInt() to build an int32 array
+static void BenchmarkArrayAppendInt32(benchmark::State& state) {
+  BaseBenchmarkArrayAppendInt<int32_t, NANOARROW_TYPE_INT32>(state);
 }
 
-/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int32 array
-static void BenchmarkArrayViewGetIntUnsafeInt32(benchmark::State& state) {
-  BaseArrayViewGetIntUnsafe<int32_t, NANOARROW_TYPE_INT32>(state);
+/// \brief Use ArrowArrayAppendInt() to build an int64 array
+static void BenchmarkArrayAppendInt64(benchmark::State& state) {
+  BaseBenchmarkArrayAppendInt<int64_t, NANOARROW_TYPE_INT64>(state);
 }
 
-/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int64 array
-static void BenchmarkArrayViewGetIntUnsafeInt64(benchmark::State& state) {
-  BaseArrayViewGetIntUnsafe<int64_t, NANOARROW_TYPE_INT64>(state);
+template <typename CType, ArrowType type>
+static ArrowErrorCode CreateAndAppendIntWithNulls(ArrowArray* array,
+                                                  const std::vector<int8_t>& 
validity) {
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(array, type));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array));
+  CType non_null_value = std::numeric_limits<CType>::max() / 2;
+
+  for (int64_t i = 0; i < validity.size(); i++) {
+    if (validity[i]) {
+      NANOARROW_RETURN_NOT_OK(ArrowArrayAppendInt(array, non_null_value));
+    } else {
+      NANOARROW_RETURN_NOT_OK(ArrowArrayAppendNull(array, 1));
+    }
+  }
+
+  NANOARROW_RETURN_NOT_OK(ArrowArrayFinishBuildingDefault(array, nullptr));
+  return NANOARROW_OK;
 }
 
-/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int64 array 
(checking for nulls)
-static void BenchmarkArrayViewGetIntUnsafeInt64CheckNull(benchmark::State& 
state) {
-  BaseArrayViewGetIntUnsafe<int64_t, NANOARROW_TYPE_INT64>(state, 0.2);
+/// \brief Use ArrowArrayAppendNulls() to build an int32 array that contains 
80%
+/// null values
+static void BenchmarkArrayAppendNulls(benchmark::State& state) {
+  nanoarrow::UniqueArray array;
+
+  int64_t n_values = kNumItemsPrettyBig;
+  double prop_null = 0.8;
+  int64_t num_nulls = n_values * prop_null;
+  int64_t null_spacing = n_values / num_nulls;
+
+  std::vector<int8_t> validity(n_values);
+  for (int64_t i = 0; i < n_values; i++) {
+    validity[i] = i % null_spacing != 0;
+  }
+
+  for (auto _ : state) {
+    array.reset();
+    int code =
+        CreateAndAppendIntWithNulls<int32_t, 
NANOARROW_TYPE_INT32>(array.get(), validity);
+    NANOARROW_THROW_NOT_OK(code);
+    benchmark::DoNotOptimize(array);
+  }
+
+  state.SetItemsProcessed(n_values * state.iterations());
 }
 
-BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt8);
-BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt16);
-BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt32);
-BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt64);
-BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt64CheckNull);
+/// @}
+
+BENCHMARK(BenchmarkArrayViewGetInt8);
+BENCHMARK(BenchmarkArrayViewGetInt16);
+BENCHMARK(BenchmarkArrayViewGetInt32);
+BENCHMARK(BenchmarkArrayViewGetInt64);
+BENCHMARK(BenchmarkArrayViewGetString);
+BENCHMARK(BenchmarkArrayViewIsNullNonNullable);
+BENCHMARK(BenchmarkArrayViewIsNull);
+
+BENCHMARK(BenchmarkArrayAppendString);
+BENCHMARK(BenchmarkArrayAppendInt8);
+BENCHMARK(BenchmarkArrayAppendInt16);
+BENCHMARK(BenchmarkArrayAppendInt32);
+BENCHMARK(BenchmarkArrayAppendInt64);
+BENCHMARK(BenchmarkArrayAppendNulls);

Reply via email to