pitrou commented on a change in pull request #166:
URL: https://github.com/apache/arrow-cookbook/pull/166#discussion_r831992785



##########
File path: cpp/code/creating_arrow_objects.cc
##########
@@ -58,5 +60,92 @@ arrow::Status CreatingArraysPtr() {
   return arrow::Status::OK();
 }
 
+/// \brief Generate random record batches for a given schema
+///
+/// For demonstration purposes, this only covers DoubleType and ListType
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
+    for (int64_t i = 0; i < num_rows_; ++i) {
+      builder.Append(d(gen_));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{/*mean=*/4};
+    auto builder = arrow::Int32Builder();
+    builder.Append(0);
+    int32_t last_val = 0;
+    for (int64_t i = 0; i < num_rows_; ++i) {
+      last_val += d(gen_);
+      builder.Append(last_val);
+    }
+    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+    // The list's child array has a different length, so use a separate generator
+    RandomBatchGenerator value_gen(arrow::schema({arrow::field("x", 
type.value_type())}));
+    // Last index from the offsets array becomes the length of the sub-array
+    ARROW_ASSIGN_OR_RAISE(auto inner_batch, value_gen.Generate(last_val));
+    std::shared_ptr<arrow::Array> values = inner_batch->column(0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array,
+                          arrow::ListArray::FromArrays(*offsets.get(), 
*values.get()));
+    arrays_.push_back(array);
+
+    return arrow::Status::OK();
+  }
+
+ protected:
+  std::random_device rd_{};
+  std::mt19937 gen_{rd_()};
+  std::vector<std::shared_ptr<arrow::Array>> arrays_;
+  int64_t num_rows_;
+};  // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+  StartRecipe("GenerateRandomData");
+  std::shared_ptr<arrow::Schema> schema =
+      arrow::schema({arrow::field("x", arrow::float64()),
+                     arrow::field("y", arrow::list(arrow::float64()))});
+
+  RandomBatchGenerator generator(schema);
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, 
generator.Generate(5));
+
+  rout << "Created batch: \n" << batch->ToString();
+
+  // Consider using ValidateFull to check correctness
+  ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+  EndRecipe("GenerateRandomData");
+  EXPECT_EQ(batch->num_rows(), 5);
+
+  return arrow::Status::OK();
+}
+
 TEST(CreatingArrowObjects, CreatingArraysTest) { ASSERT_OK(CreatingArrays()); }
 TEST(CreatingArrowObjects, CreatingArraysPtrTest) { 
ASSERT_OK(CreatingArraysPtr()); }
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }

Review comment:
       Nit:
   ```suggestion
   TEST(CreatingArrowObjects, GeneratingRandomData) { 
ASSERT_OK(GenerateRandomData()); }
   ```

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -18,6 +18,12 @@
 #include <arrow/api.h>
 #include <gtest/gtest.h>
 
+#include <random>
+
+#include "arrow/array.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/visit_array_inline.h"

Review comment:
       Nit: can we gather related includes together and use the same quoting 
convention? (angle brackets apparently)

##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,45 @@ boilerplate for you.  It will run the contained expression and check the resulti
 .. recipe:: ../code/basic_arrow.cc ReturnNotOk
   :caption: Using ARROW_RETURN_NOT_OK to check the status
   :dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes :cpp:class:`arrow::DataType`, :cpp:class:`arrow::Scalar`, and
+:cpp:class:`arrow::Array` have specialized subclasses for each Arrow type. In 
+order to specialize logic for each subclass, you can use the visitor pattern. 
+Arrow provides inline template functions that allow you to call visitors 
+efficiently:
+
+ * :cpp:func:`arrow::VisitTypeInline`
+ * :cpp:func:`arrow::VisitScalarInline`
+ * :cpp:func:`arrow::VisitArrayInline`
+
+Generate Random Data
+--------------------
+
+See example at :ref:`Generate Random Data Example`.
+
+
+Generalize Computations Across Arrow Types
+------------------------------------------
+
+Array visitors can be useful when writing functions that can handle multiple
+array types. However, implementing a visitor for each type individually can be
+excessively verbose. Fortunately, Arrow provides type traits that allow you to
+write templated functions to handle subsets of types. The example below
+demonstrates a table sum function that can handle any integer or floating point
+array with only a single visitor implementation by leveraging
+:cpp:type:`arrow::enable_if_number`.

Review comment:
       A pity this reference doesn't work, but that needs a separate PR to 
solve on the Arrow side I suppose.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to