This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-cookbook.git
The following commit(s) were added to refs/heads/main by this push:
new 72f9c04 [C++] Add TypeVisitor example (#166)
72f9c04 is described below
commit 72f9c042029aeb28b252c26bb77ca7daf897a05d
Author: Will Jones <[email protected]>
AuthorDate: Mon Mar 28 10:14:24 2022 -0700
[C++] Add TypeVisitor example (#166)
* Start visitor example
* Get a working visitor pattern
* Add random data generator example
* Revert makefile change
* Apply suggestions from code review
Co-authored-by: Antoine Pitrou <[email protected]>
* Merge
* Reorganize examples and add prose
* PR feedback
* Update cpp/code/creating_arrow_objects.cc
Co-authored-by: Antoine Pitrou <[email protected]>
* Adjust types to be more specific
* Update cpp/code/basic_arrow.cc
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
---
cpp/code/basic_arrow.cc | 69 +++++++++++++++++++++++++++++
cpp/code/creating_arrow_objects.cc | 89 ++++++++++++++++++++++++++++++++++++++
cpp/source/basic.rst | 42 ++++++++++++++++++
cpp/source/create.rst | 24 +++++++++-
4 files changed, 223 insertions(+), 1 deletion(-)
diff --git a/cpp/code/basic_arrow.cc b/cpp/code/basic_arrow.cc
index c8f7070..759e22f 100644
--- a/cpp/code/basic_arrow.cc
+++ b/cpp/code/basic_arrow.cc
@@ -16,6 +16,7 @@
// under the License.
#include <arrow/api.h>
+#include <arrow/visit_array_inline.h>
#include <gtest/gtest.h>
#include "common.h"
@@ -63,3 +64,71 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+/// \brief Sum numeric values across columns
+///
+/// Visitor meant to be used with arrow::VisitArrayInline: each column of a
+/// record batch is dispatched to the Visit overload that matches its concrete
+/// array type, and the values are accumulated into a single double.
+///
+/// Only supports floating point and integral types. Does not support decimals.
+class TableSummation {
+  // Running total for the batch currently being processed by Compute().
+  double partial = 0.0;
+
+ public:
+  /// \brief Sum the values of every column in a record batch
+  ///
+  /// \param[in] batch the record batch whose columns are summed
+  /// \return the total as a double, or the NotImplemented status produced by
+  /// the default Visit overload when a column is not of a numeric type
+  arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) {
+    // Start from zero on every call; otherwise reusing the same TableSummation
+    // for a second batch would silently add the previous total.
+    partial = 0.0;
+    // Bind by const reference: a fresh shared_ptr copy per column is not needed.
+    for (const std::shared_ptr<arrow::Array>& array : batch->columns()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this));
+    }
+    return partial;
+  }
+
+  // Default implementation: any array type not matched by the numeric overload
+  // below ends up here and is reported as unsupported.
+  arrow::Status Visit(const arrow::Array& array) {
+    return arrow::Status::NotImplemented("Can not compute sum for array of type ",
+                                         array.type()->ToString());
+  }
+
+  // Numeric implementation: enable_if_number restricts this overload to array
+  // types whose TypeClass is a number type. Elements for which has_value() is
+  // false (presumably null slots) are skipped.
+  template <typename ArrayType, typename T = typename ArrayType::TypeClass>
+  arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) {
+    for (arrow::util::optional<typename T::c_type> value : array) {
+      if (value.has_value()) {
+        partial += static_cast<double>(value.value());
+      }
+    }
+    return arrow::Status::OK();
+  }
+}; // TableSummation
+
+/// Recipe: build a small record batch with an int32 column and a float64
+/// column, then sum every value in it with TableSummation.
+arrow::Status VisitorSummationExample() {
+  StartRecipe("VisitorSummationExample");
+  auto schema = arrow::schema({
+      arrow::field("a", arrow::int32()),
+      arrow::field("b", arrow::float64()),
+  });
+
+  // Column "a": three 32-bit integers.
+  const std::vector<int32_t> int_values{1, 2, 3};
+  arrow::Int32Builder int_builder;
+  ARROW_RETURN_NOT_OK(int_builder.AppendValues(int_values));
+  std::shared_ptr<arrow::Array> int_column;
+  ARROW_ASSIGN_OR_RAISE(int_column, int_builder.Finish());
+
+  // Column "b": three doubles.
+  const std::vector<double> double_values{4.0, 5.0, 6.0};
+  arrow::DoubleBuilder double_builder;
+  ARROW_RETURN_NOT_OK(double_builder.AppendValues(double_values));
+  std::shared_ptr<arrow::Array> double_column;
+  ARROW_ASSIGN_OR_RAISE(double_column, double_builder.Finish());
+
+  // Assemble the batch; the column order has to follow the schema.
+  const int32_t row_count = 3;
+  std::vector<std::shared_ptr<arrow::Array>> batch_columns{int_column, double_column};
+  auto batch = arrow::RecordBatch::Make(schema, row_count, batch_columns);
+
+  // Run the visitor over every column of the batch.
+  TableSummation summation;
+  double total = 0.0;
+  ARROW_ASSIGN_OR_RAISE(total, summation.Compute(batch));
+
+  rout << "Total is " << total;
+
+  EndRecipe("VisitorSummationExample");
+
+  // 1 + 2 + 3 + 4.0 + 5.0 + 6.0
+  EXPECT_EQ(total, 21.0);
+  return arrow::Status::OK();
+}
+
+// Runs the summation recipe as a unit test and checks its status with ASSERT_OK.
+TEST(BasicArrow, VisitorSummationExample) {
+  ASSERT_OK(VisitorSummationExample());
+}
diff --git a/cpp/code/creating_arrow_objects.cc b/cpp/code/creating_arrow_objects.cc
index 8d4085f..be607a5 100644
--- a/cpp/code/creating_arrow_objects.cc
+++ b/cpp/code/creating_arrow_objects.cc
@@ -18,6 +18,8 @@
#include <arrow/api.h>
#include <gtest/gtest.h>
+#include <random>
+
#include "common.h"
arrow::Status CreatingArrays() {
@@ -58,5 +60,92 @@ arrow::Status CreatingArraysPtr() {
return arrow::Status::OK();
}
+/// \brief Generate random record batches for a given schema
+///
+/// Type visitor meant to be used with arrow::VisitTypeInline: Generate() walks
+/// the schema fields in order and each Visit overload appends one randomly
+/// filled column to the pending batch.
+///
+/// For demonstration purposes, this only covers DoubleType and ListType
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : schema(schema) {}
+
+  /// \brief Create a record batch of random data matching the schema
+  ///
+  /// \param[in] num_rows number of rows to generate; must not be negative
+  /// \return the generated batch; Invalid for a negative row count; or the
+  /// NotImplemented status of the default Visit overload when a field has a
+  /// type without a dedicated overload
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t num_rows) {
+    if (num_rows < 0) {
+      return arrow::Status::Invalid("Cannot generate a negative number of rows: ",
+                                    num_rows);
+    }
+    num_rows_ = num_rows;
+    // Drop columns left over from an earlier (possibly failed) call; otherwise
+    // the number of columns would no longer match the schema.
+    arrays_.clear();
+    for (const std::shared_ptr<arrow::Field>& field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation: reached for every type without an overload below.
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for ", type.ToString());
+  }
+
+  // Appends a column of num_rows_ doubles drawn from a normal distribution.
+  arrow::Status Visit(const arrow::DoubleType&) {
+    arrow::DoubleBuilder builder;
+    std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
+    for (int32_t i = 0; i < num_rows_; ++i) {
+      // Append returns a Status; propagate a failure instead of dropping it.
+      ARROW_RETURN_NOT_OK(builder.Append(d(gen_)));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  // Appends a list column with num_rows_ entries whose lengths are drawn from
+  // a Poisson distribution; the child values come from a nested generator.
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{/*mean=*/4};
+    arrow::Int32Builder builder;
+    ARROW_RETURN_NOT_OK(builder.Append(0));
+    int32_t last_val = 0;
+    for (int32_t i = 0; i < num_rows_; ++i) {
+      last_val += d(gen_);
+      ARROW_RETURN_NOT_OK(builder.Append(last_val));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+    // Since children of list has a new length, will use a new generator
+    RandomBatchGenerator value_gen(
+        arrow::schema({arrow::field("x", type.value_type())}));
+    // Last index from the offsets array becomes the length of the sub-array
+    ARROW_ASSIGN_OR_RAISE(auto inner_batch, value_gen.Generate(last_val));
+    std::shared_ptr<arrow::Array> values = inner_batch->column(0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array,
+                          arrow::ListArray::FromArrays(*offsets, *values));
+    arrays_.push_back(array);
+
+    return arrow::Status::OK();
+  }
+
+ protected:
+  std::random_device rd_{};
+  // Mersenne Twister engine seeded once from the random device.
+  std::mt19937 gen_{rd_()};
+  // Columns produced so far by the current Generate() call, in schema order.
+  std::vector<std::shared_ptr<arrow::Array>> arrays_;
+  // Row count requested by the current Generate() call.
+  int32_t num_rows_ = 0;
+}; // RandomBatchGenerator
+
+/// Recipe: describe a schema with a double column and a list-of-double
+/// column, generate five random rows for it and print the resulting batch.
+arrow::Status GenerateRandomData() {
+  StartRecipe("GenerateRandomData");
+  auto x_field = arrow::field("x", arrow::float64());
+  auto y_field = arrow::field("y", arrow::list(arrow::float64()));
+  std::shared_ptr<arrow::Schema> schema = arrow::schema({x_field, y_field});
+
+  RandomBatchGenerator generator(schema);
+  std::shared_ptr<arrow::RecordBatch> batch;
+  ARROW_ASSIGN_OR_RAISE(batch, generator.Generate(5));
+
+  rout << "Created batch: \n" << batch->ToString();
+
+  // Consider using ValidateFull to check correctness
+  ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+  EndRecipe("GenerateRandomData");
+  EXPECT_EQ(batch->num_rows(), 5);
+
+  return arrow::Status::OK();
+}
+
TEST(CreatingArrowObjects, CreatingArraysTest) { ASSERT_OK(CreatingArrays()); }
TEST(CreatingArrowObjects, CreatingArraysPtrTest) {
ASSERT_OK(CreatingArraysPtr()); }
+// Runs the random-data recipe as a unit test and checks its status with ASSERT_OK.
+TEST(CreatingArrowObjects, GeneratingRandomData) {
+  ASSERT_OK(GenerateRandomData());
+}
diff --git a/cpp/source/basic.rst b/cpp/source/basic.rst
index b91cc0e..287fd39 100644
--- a/cpp/source/basic.rst
+++ b/cpp/source/basic.rst
@@ -48,3 +48,45 @@ boilerplate for you. It will run the contained expression
and check the resulti
.. recipe:: ../code/basic_arrow.cc ReturnNotOk
:caption: Using ARROW_RETURN_NOT_OK to check the status
:dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes :cpp:class:`arrow::DataType`, :cpp:class:`arrow::Scalar`, and
+:cpp:class:`arrow::Array` have specialized subclasses for each Arrow type. In
+order to specialize logic for each subclass, you can use the visitor pattern.
+Arrow provides inline template functions that allow you to call visitors
+efficiently:
+
+ * :cpp:func:`arrow::VisitTypeInline`
+ * :cpp:func:`arrow::VisitScalarInline`
+ * :cpp:func:`arrow::VisitArrayInline`
+
+Generate Random Data
+--------------------
+
+See example at :ref:`Generate Random Data Example`.
+
+
+Generalize Computations Across Arrow Types
+------------------------------------------
+
+Array visitors can be useful when writing functions that can handle multiple
+array types. However, implementing a visitor for each type individually can be
+excessively verbose. Fortunately, Arrow provides type traits that allow you to
+write templated functions to handle subsets of types. The example below
+demonstrates a table sum function that can handle any integer or
+floating-point array with only a single visitor implementation by leveraging
+:cpp:type:`arrow::enable_if_number`.
+
+.. literalinclude:: ../code/basic_arrow.cc
+ :language: cpp
+ :linenos:
+ :start-at: class TableSummation
+ :end-at: }; // TableSummation
+ :caption: Using the visitor pattern to compute the sum of a table with any numeric type
+
+
+.. recipe:: ../code/basic_arrow.cc VisitorSummationExample
+ :dedent: 2
diff --git a/cpp/source/create.rst b/cpp/source/create.rst
index 393747b..305f733 100644
--- a/cpp/source/create.rst
+++ b/cpp/source/create.rst
@@ -47,4 +47,26 @@ Builders can also consume standard C++ containers:
.. note::
Builders will not take ownership of data in containers and will make a
- copy of the underlying data.
\ No newline at end of file
+ copy of the underlying data.
+
+.. _Generate Random Data Example:
+
+Generate Random Data for a Given Schema
+=======================================
+
+A type visitor is a convenient way to generate random data for a given schema:
+each ``Visit`` overload produces one column for the Arrow type it handles. The
+following example only implements double arrays and list arrays, but could be
+easily extended to all types.
+
+
+.. literalinclude:: ../code/creating_arrow_objects.cc
+ :language: cpp
+ :linenos:
+ :start-at: class RandomBatchGenerator
+ :end-at: }; // RandomBatchGenerator
+ :caption: Using visitor pattern to generate random record batches
+
+Given such a generator, you can create random test data for any supported
+schema:
+
+.. recipe:: ../code/creating_arrow_objects.cc GenerateRandomData
+ :dedent: 2