This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 5756b766 chore: Add basic benchmark suite to C library (#393)
5756b766 is described below
commit 5756b76697d47820675da196916ca89c0ab499d3
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Mar 7 14:46:48 2024 -0400
chore: Add basic benchmark suite to C library (#393)
This PR adds an initial set of benchmarks covering some realistic usage
patterns. The general approach is to use doxygen comments to document
the benchmarks, which will run against the released version and the
previous version. I'm not sure exactly what the output format will be
but I'd like the benchmarks to be written in such a way that there's a
path to programmatically generating a report (maybe using conbench, maybe
just a Quarto document).
Work in progress!
---
CMakeLists.txt | 8 +++
CMakePresets.json | 18 +++--
src/nanoarrow/array_benchmark.cc | 135 ++++++++++++++++++++++++++++++++++++++
src/nanoarrow/nanoarrow_types.h | 2 +-
src/nanoarrow/schema_benchmark.cc | 74 +++++++++++++++++++--
5 files changed, 227 insertions(+), 10 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 327ba449..d45a57cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -260,9 +260,17 @@ if(NANOARROW_BUILD_TESTS)
endif()
if(NANOARROW_BUILD_BENCHMARKS)
+ # benchmark requires at least C++11
+ if(NOT DEFINED CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 11)
+ endif()
+
add_subdirectory("thirdparty/benchmark")
add_executable(schema_benchmark src/nanoarrow/schema_benchmark.cc)
+ add_executable(array_benchmark src/nanoarrow/array_benchmark.cc)
+
target_link_libraries(schema_benchmark PRIVATE nanoarrow
benchmark::benchmark_main)
+ target_link_libraries(array_benchmark PRIVATE nanoarrow
benchmark::benchmark_main)
endif()
diff --git a/CMakePresets.json b/CMakePresets.json
index 28917d7b..d5ff3ea0 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -9,18 +9,28 @@
{
"name": "default",
"displayName": "Default Config",
- "cacheVariables": {}
+ "binaryDir": "${sourceDir}/build",
+ "cacheVariables": {
+ "CMAKE_EXPORT_COMPILE_COMMANDS": "ON"
+ }
},
{
"name": "default-with-tests",
- "inherits": [
- "default"
- ],
+ "inherits": ["default"],
"displayName": "Default with tests",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"NANOARROW_BUILD_TESTS": "ON"
}
+ },
+ {
+ "name": "default-with-benchmarks",
+ "inherits": ["default"],
+ "displayName": "Default with benchmarks",
+ "cacheVariables": {
+ "CMAKE_BUILD_TYPE": "Release",
+ "NANOARROW_BUILD_BENCHMARKS": "ON"
+ }
}
]
}
diff --git a/src/nanoarrow/array_benchmark.cc b/src/nanoarrow/array_benchmark.cc
new file mode 100644
index 00000000..93c4f7a0
--- /dev/null
+++ b/src/nanoarrow/array_benchmark.cc
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <benchmark/benchmark.h>
+
+#include "nanoarrow.hpp"
+
+/// \defgroup nanoarrow-benchmark-array-view ArrowArrayView-related benchmarks
+///
+/// Benchmarks for consuming ArrowArrays using the ArrowArrayViewXXX() functions.
+///
+/// @{
+
+// Utility for building primitive arrays
+//
+// Initializes `schema` from `type` and `array` from that schema, moves `values`
+// into the array's data buffer (buffer 1) and, when `validity` is non-empty,
+// packs it into the array's validity bitmap (one int8 flag per element; the
+// callers in this file use 0 for null and 1 for valid).
+//
+// Returns NANOARROW_OK, or the first non-OK code reported by a nanoarrow call.
+// NOTE(review): `validity`, when supplied, is expected to have one entry per
+// element of `values`; that is not checked here.
+template <typename CType, ArrowType type>
+ArrowErrorCode InitSchemaAndArrayPrimitive(ArrowSchema* schema, ArrowArray* array,
+                                           std::vector<CType> values,
+                                           std::vector<int8_t> validity = {}) {
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema, type));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array, schema, nullptr));
+
+  // Record the logical length before `values` is moved from: nothing below
+  // derives it from the buffers, and without it the finished array would not
+  // describe the elements it actually holds.
+  array->length = static_cast<int64_t>(values.size());
+
+  // Set the data buffer
+  nanoarrow::BufferInitSequence(ArrowArrayBuffer(array, 1), std::move(values));
+
+  // Pack the validity bitmap
+  if (!validity.empty()) {
+    ArrowBitmap* validity_bitmap = ArrowArrayValidityBitmap(array);
+    NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(validity_bitmap, validity.size()));
+    ArrowBitmapAppendInt8Unsafe(validity_bitmap, validity.data(), validity.size());
+
+    // Keep null_count in agreement with the bitmap that was just packed
+    int64_t null_count = 0;
+    for (int8_t is_valid : validity) {
+      null_count += (is_valid == 0);
+    }
+    array->null_count = null_count;
+  }
+
+  NANOARROW_RETURN_NOT_OK(ArrowArrayFinishBuildingDefault(array, nullptr));
+  return NANOARROW_OK;
+}
+
+// Shared implementation for the ArrowArrayViewGetIntUnsafe() benchmarks.
+//
+// Builds a 1,000,000-element array of CType (values cycle through
+// [0, numeric_limits<CType>::max())), wraps it in an ArrowArrayView and times
+// copying every element into a std::vector<CType>. When prop_null > 0, roughly
+// that proportion of elements is marked null at a regular spacing and the timed
+// loop additionally checks ArrowArrayViewIsNull() for every element.
+//
+// Throws (via NANOARROW_THROW_NOT_OK) if any setup step fails.
+template <typename CType, ArrowType type>
+static void BaseArrayViewGetIntUnsafe(benchmark::State& state, double prop_null = 0.0) {
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArray array;
+  nanoarrow::UniqueArrayView array_view;
+
+  const int64_t n_values = 1000000;
+
+  std::vector<CType> values(n_values);
+  for (int64_t i = 0; i < n_values; i++) {
+    // The modulo keeps the value within CType's range; make the narrowing explicit
+    values[i] = static_cast<CType>(i % std::numeric_limits<CType>::max());
+  }
+
+  std::vector<int8_t> validity;
+
+  if (prop_null > 0) {
+    // A proportion above 1 is treated as "every element is null"
+    if (prop_null > 1.0) {
+      prop_null = 1.0;
+    }
+
+    // Always place at least one null so the spacing below never divides by zero
+    // when prop_null is too small to yield a whole element.
+    int64_t num_nulls = static_cast<int64_t>(n_values * prop_null);
+    if (num_nulls < 1) {
+      num_nulls = 1;
+    }
+
+    const int64_t null_spacing = n_values / num_nulls;
+    validity.resize(n_values);
+    for (int64_t i = 0; i < n_values; i++) {
+      // 0 == null at every null_spacing-th position, 1 == valid elsewhere
+      validity[i] = i % null_spacing != 0;
+    }
+  }
+
+  int code = InitSchemaAndArrayPrimitive<CType, type>(
+      schema.get(), array.get(), std::move(values), std::move(validity));
+  NANOARROW_THROW_NOT_OK(code);
+  NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+
+  std::vector<CType> values_out(n_values);
+
+  // The null-checking and non-null-checking loops are kept separate so that the
+  // branch on prop_null is not part of the timed region.
+  if (prop_null > 0) {
+    for (auto _ : state) {
+      for (int64_t i = 0; i < n_values; i++) {
+        if (ArrowArrayViewIsNull(array_view.get(), i)) {
+          values_out[i] = 0;
+        } else {
+          values_out[i] = ArrowArrayViewGetIntUnsafe(array_view.get(), i);
+        }
+      }
+      benchmark::DoNotOptimize(values_out);
+    }
+  } else {
+    for (auto _ : state) {
+      for (int64_t i = 0; i < n_values; i++) {
+        values_out[i] = ArrowArrayViewGetIntUnsafe(array_view.get(), i);
+      }
+      benchmark::DoNotOptimize(values_out);
+    }
+  }
+
+  state.SetItemsProcessed(n_values * state.iterations());
+}
+
+/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int8 array
+static void BenchmarkArrayViewGetIntUnsafeInt8(benchmark::State& state) {
+  BaseArrayViewGetIntUnsafe<int8_t, NANOARROW_TYPE_INT8>(state);
+}
+
+/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int16 array
+static void BenchmarkArrayViewGetIntUnsafeInt16(benchmark::State& state) {
+  BaseArrayViewGetIntUnsafe<int16_t, NANOARROW_TYPE_INT16>(state);
+}
+
+/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int32 array
+static void BenchmarkArrayViewGetIntUnsafeInt32(benchmark::State& state) {
+  BaseArrayViewGetIntUnsafe<int32_t, NANOARROW_TYPE_INT32>(state);
+}
+
+/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int64 array
+static void BenchmarkArrayViewGetIntUnsafeInt64(benchmark::State& state) {
+  BaseArrayViewGetIntUnsafe<int64_t, NANOARROW_TYPE_INT64>(state);
+}
+
+/// \brief Use ArrowArrayViewGetIntUnsafe() to consume an int64 array
+/// (checking for nulls)
+///
+/// 20% of the elements are marked null and every element is tested with
+/// ArrowArrayViewIsNull() before it is read.
+static void BenchmarkArrayViewGetIntUnsafeInt64CheckNull(benchmark::State& state) {
+  BaseArrayViewGetIntUnsafe<int64_t, NANOARROW_TYPE_INT64>(state, 0.2);
+}
+
+BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt8);
+BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt16);
+BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt32);
+BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt64);
+BENCHMARK(BenchmarkArrayViewGetIntUnsafeInt64CheckNull);
+
+/// @}
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index 03f1bc01..d814a056 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -313,7 +313,7 @@ static inline void ArrowErrorSetString(struct ArrowError* error, const char* src
#define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR)
#else
-#define NANOARROW_ASSERT_OK(EXPR) EXPR
+#define NANOARROW_ASSERT_OK(EXPR) (void)(EXPR)
#define NANOARROW_DCHECK(EXPR)
#endif
diff --git a/src/nanoarrow/schema_benchmark.cc b/src/nanoarrow/schema_benchmark.cc
index 10b45841..2e6b330e 100644
--- a/src/nanoarrow/schema_benchmark.cc
+++ b/src/nanoarrow/schema_benchmark.cc
@@ -17,15 +17,79 @@
#include <benchmark/benchmark.h>
-#include "nanoarrow.h"
+#include "nanoarrow.hpp"
-static void BM_SchemaInit(benchmark::State& state) {
+/// \defgroup nanoarrow-benchmark-schema Schema-related benchmarks
+///
+/// Benchmarks for producing and consuming ArrowSchema.
+///
+/// @{
+
+// Helper: populate `schema` as a struct with `n_columns` int32 children.
+//
+// Returns NANOARROW_OK, or the first non-OK code reported while setting the
+// struct type or any child type. The schema is not released here on failure.
+static ArrowErrorCode SchemaInitStruct(struct ArrowSchema* schema, int64_t n_columns) {
+  ArrowSchemaInit(schema);
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(schema, n_columns));
+
+  for (int64_t column = 0; column < n_columns; ++column) {
+    struct ArrowSchema* child = schema->children[column];
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_INT32));
+  }
+
+  return NANOARROW_OK;
+}
+
+/// \brief Benchmark ArrowSchema creation for very wide tables
+///
+/// Simulates part of the process of creating a very wide table with a
+/// simple column type (integer).
+static void BenchmarkSchemaInitWideStruct(benchmark::State& state);
+
+// Each timed iteration builds an n_columns-wide struct schema and releases it
+// again. Throws (via NANOARROW_THROW_NOT_OK) if the schema cannot be built.
+static void BenchmarkSchemaInitWideStruct(benchmark::State& state) {
+  struct ArrowSchema schema;
+
+  const int64_t n_columns = 10000;
+
+  for (auto _ : state) {
+    // Build with n_columns rather than a repeated literal so the item count
+    // reported below always matches the work actually done.
+    NANOARROW_THROW_NOT_OK(SchemaInitStruct(&schema, n_columns));
+    ArrowSchemaRelease(&schema);
+  }
+
+  state.SetItemsProcessed(n_columns * state.iterations());
+}
+
+BENCHMARK(BenchmarkSchemaInitWideStruct);
+
+/// \brief Benchmark ArrowSchema parsing for very wide tables
+///
+/// Simulates part of the process of consuming a very wide table. Typically
+/// the ArrowSchemaViewInit() is done by ArrowArrayViewInit() but uses a
+/// similar pattern.
+static void BenchmarkSchemaViewInitWideStruct(benchmark::State& state);
+
+// Helper: run ArrowSchemaViewInit() on every child of `schema`, in order.
+//
+// The resulting views are discarded; only the parsing work matters here.
+// Stops at, and returns, the first non-OK code; otherwise NANOARROW_OK.
+static ArrowErrorCode SchemaViewInitChildren(struct ArrowSchema* schema,
+                                             struct ArrowError* error) {
+  struct ArrowSchemaView child_view;
+
+  for (int64_t child = 0; child < schema->n_children; ++child) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowSchemaViewInit(&child_view, schema->children[child], error));
+  }
+
+  return NANOARROW_OK;
+}
+
+// Builds one n_columns-wide struct schema up front (untimed), then times
+// parsing all of its children with ArrowSchemaViewInit() on every iteration.
+static void BenchmarkSchemaViewInitWideStruct(benchmark::State& state) {
   struct ArrowSchema schema;
+  struct ArrowError error;
+
+  const int64_t n_columns = 10000;
+  // Fail loudly if setup fails instead of benchmarking a half-built schema
+  NANOARROW_THROW_NOT_OK(SchemaInitStruct(&schema, n_columns));
   for (auto _ : state) {
-    ArrowSchemaInit(&schema);
+    // Where NANOARROW_ASSERT_OK is defined as `(void)(EXPR)`, this only
+    // evaluates the call and adds no result check to the timed loop.
+    NANOARROW_ASSERT_OK(SchemaViewInitChildren(&schema, &error));
   }
+  state.SetItemsProcessed(n_columns * state.iterations());
+
+  ArrowSchemaRelease(&schema);
 }
-// Register the function as a benchmark
-BENCHMARK(BM_SchemaInit);
+BENCHMARK(BenchmarkSchemaViewInitWideStruct);
+
+/// @}