pitrou commented on a change in pull request #166:
URL: https://github.com/apache/arrow-cookbook/pull/166#discussion_r831134267



##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};
+    for (int i = 0; i < num_rows_; ++i) {

Review comment:
       We should decide to always use either fixed-sized ints (`int32_t`) or 
flexibly-sized ones (`int`). Mixing the two is going to confuse the user IMHO.
   @westonpace What do you think?

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};

Review comment:
       ```suggestion
       std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
   ```

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};
+    for (int i = 0; i < num_rows_; ++i) {
+      builder.Append(d(gen_));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{4};
+    auto builder = arrow::Int32Builder();
+    builder.Append(0);
+    int32_t last_val = 0;
+    for (int i = 0; i < num_rows_; ++i) {
+      last_val += d(gen_);
+      builder.Append(last_val);
+    }
+    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+    // Generate values as if we had that number of values
+    int64_t previous_num_rows = num_rows_;
+    num_rows_ = last_val;
+    ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+    auto values = arrays_.back();
+    arrays_.pop_back();
+    num_rows_ = previous_num_rows;
+
+    ARROW_ASSIGN_OR_RAISE(auto array,
+                          arrow::ListArray::FromArrays(*offsets.get(), 
*values.get()));
+    arrays_.push_back(array);
+
+    return arrow::Status::OK();
+  }
+
+ protected:
+  std::random_device rd_{};
+  std::mt19937 gen_{rd_()};
+  std::vector<std::shared_ptr<arrow::Array>> arrays_;
+  int64_t num_rows_;
+};  // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+  StartRecipe("GenerateRandomData");
+  std::shared_ptr<arrow::Schema> schema =
+      arrow::schema({arrow::field("x", arrow::float64()),
+                     arrow::field("y", arrow::list(arrow::float64()))});
+
+  RandomBatchGenerator generator(schema);
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, 
generator.Generate(5));
+
+  rout << "Created batch: \n" << batch;
+
+  // Consider using ValidateFull to check correctness
+  ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+  EndRecipe("GenerateRandomData");
+  EXPECT_EQ(batch->num_rows(), 5);
+
+  return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {
+ public:
+  double partial;
+
+  arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) {
+    for (auto array : batch->columns()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this));
+    }
+    return partial;
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::Array& array) {
+    return arrow::Status::NotImplemented("Can not compute sum for array of 
type ",
+                                         array.type()->ToString());
+  }
+
+  template <typename ArrayType, typename T = typename ArrayType::TypeClass>
+  arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) {
+    for (auto value : array) {
+      if (value.has_value()) {
+        partial += (double)value.value();

Review comment:
       ```suggestion
           partial += static_cast<double>(value.value());
   ```

##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you.  It will run the contained expression 
and check the resulti
 .. recipe:: ../code/basic_arrow.cc ReturnNotOk
   :caption: Using ARROW_RETURN_NOT_OK to check the status
   :dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the 
visitor
+pattern. Arrow provides the macros:
+
+ * `arrow::VisitTypeInline`

Review comment:
       Should this use more proper reST markup to create a cross-reference?

##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you.  It will run the contained expression 
and check the resulti
 .. recipe:: ../code/basic_arrow.cc ReturnNotOk
   :caption: Using ARROW_RETURN_NOT_OK to check the status
   :dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for

Review comment:
       Can we cross reference these classes?

##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you.  It will run the contained expression 
and check the resulti
 .. recipe:: ../code/basic_arrow.cc ReturnNotOk
   :caption: Using ARROW_RETURN_NOT_OK to check the status
   :dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the 
visitor
+pattern. Arrow provides the macros:
+
+ * `arrow::VisitTypeInline`
+ * `arrow::VisitScalarInline`
+ * `arrow::VisitArrayInline`
+
+To implement a TypeVisitor we have to implement a Visit method for every 
possible
+DataType we wish to handle. Fortunately, we can often use templates and type
+traits to make this less verbose.
+
+Generate Random Data for Given Schema
+-------------------------------------
+
+To generate random data for a given schema, a type visitor is helpful.
+
+
+.. literalinclude:: ../code/basic_arrow.cc
+   :language: cpp
+   :linenos:
+   :start-at: class RandomBatchGenerator
+   :end-at: };  // RandomBatchGenerator
+   :caption: Using visitor pattern to generate random record batches
+  
+
+.. recipe:: ../code/basic_arrow.cc GenerateRandomData
+   :dedent: 2
+
+
+Convert Arbitrary Scalars to Variants
+-------------------------------------
+
+.. TODO: Implement converter from rows to values

Review comment:
       Is this a recipe you actually want to write soon (in this PR perhaps)? 
Otherwise, a JIRA is better suited than a TODO comment directly in the 
source tree, IMHO.

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};
+    for (int i = 0; i < num_rows_; ++i) {
+      builder.Append(d(gen_));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{4};

Review comment:
       ```suggestion
       std::poisson_distribution<> d{/*mean=*/4};
   ```

##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you.  It will run the contained expression 
and check the resulti
 .. recipe:: ../code/basic_arrow.cc ReturnNotOk
   :caption: Using ARROW_RETURN_NOT_OK to check the status
   :dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the 
visitor

Review comment:
       Missing word here ("you can use" perhaps?).

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};
+    for (int i = 0; i < num_rows_; ++i) {
+      builder.Append(d(gen_));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{4};
+    auto builder = arrow::Int32Builder();
+    builder.Append(0);
+    int32_t last_val = 0;
+    for (int i = 0; i < num_rows_; ++i) {
+      last_val += d(gen_);
+      builder.Append(last_val);
+    }
+    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+    // Generate values as if we had that number of values
+    int64_t previous_num_rows = num_rows_;
+    num_rows_ = last_val;
+    ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+    auto values = arrays_.back();
+    arrays_.pop_back();
+    num_rows_ = previous_num_rows;
+
+    ARROW_ASSIGN_OR_RAISE(auto array,
+                          arrow::ListArray::FromArrays(*offsets.get(), 
*values.get()));
+    arrays_.push_back(array);
+
+    return arrow::Status::OK();
+  }
+
+ protected:
+  std::random_device rd_{};
+  std::mt19937 gen_{rd_()};
+  std::vector<std::shared_ptr<arrow::Array>> arrays_;
+  int64_t num_rows_;
+};  // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+  StartRecipe("GenerateRandomData");
+  std::shared_ptr<arrow::Schema> schema =
+      arrow::schema({arrow::field("x", arrow::float64()),
+                     arrow::field("y", arrow::list(arrow::float64()))});
+
+  RandomBatchGenerator generator(schema);
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, 
generator.Generate(5));
+
+  rout << "Created batch: \n" << batch;
+
+  // Consider using ValidateFull to check correctness
+  ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+  EndRecipe("GenerateRandomData");
+  EXPECT_EQ(batch->num_rows(), 5);
+
+  return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {
+ public:
+  double partial;
+
+  arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) {
+    for (auto array : batch->columns()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this));
+    }
+    return partial;
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::Array& array) {
+    return arrow::Status::NotImplemented("Can not compute sum for array of 
type ",
+                                         array.type()->ToString());
+  }
+
+  template <typename ArrayType, typename T = typename ArrayType::TypeClass>
+  arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) {
+    for (auto value : array) {

Review comment:
       Perhaps it would be informative to spell out the value type explicitly 
here (i.e. `arrow::util::optional<double>` IIUC).

##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you.  It will run the contained expression 
and check the resulti
 .. recipe:: ../code/basic_arrow.cc ReturnNotOk
   :caption: Using ARROW_RETURN_NOT_OK to check the status
   :dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the 
visitor
+pattern. Arrow provides the macros:
+
+ * `arrow::VisitTypeInline`
+ * `arrow::VisitScalarInline`
+ * `arrow::VisitArrayInline`
+
+To implement a TypeVisitor we have to implement a Visit method for every 
possible
+DataType we wish to handle. Fortunately, we can often use templates and type
+traits to make this less verbose.
+
+Generate Random Data for Given Schema

Review comment:
       "For a given Schema", perhaps?

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};
+    for (int i = 0; i < num_rows_; ++i) {
+      builder.Append(d(gen_));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{4};
+    auto builder = arrow::Int32Builder();
+    builder.Append(0);
+    int32_t last_val = 0;
+    for (int i = 0; i < num_rows_; ++i) {
+      last_val += d(gen_);
+      builder.Append(last_val);
+    }
+    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+    // Generate values as if we had that number of values
+    int64_t previous_num_rows = num_rows_;
+    num_rows_ = last_val;
+    ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+    auto values = arrays_.back();
+    arrays_.pop_back();
+    num_rows_ = previous_num_rows;
+
+    ARROW_ASSIGN_OR_RAISE(auto array,
+                          arrow::ListArray::FromArrays(*offsets.get(), 
*values.get()));
+    arrays_.push_back(array);
+
+    return arrow::Status::OK();
+  }
+
+ protected:
+  std::random_device rd_{};
+  std::mt19937 gen_{rd_()};
+  std::vector<std::shared_ptr<arrow::Array>> arrays_;
+  int64_t num_rows_;
+};  // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+  StartRecipe("GenerateRandomData");
+  std::shared_ptr<arrow::Schema> schema =
+      arrow::schema({arrow::field("x", arrow::float64()),
+                     arrow::field("y", arrow::list(arrow::float64()))});
+
+  RandomBatchGenerator generator(schema);
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, 
generator.Generate(5));
+
+  rout << "Created batch: \n" << batch;
+
+  // Consider using ValidateFull to check correctness
+  ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+  EndRecipe("GenerateRandomData");
+  EXPECT_EQ(batch->num_rows(), 5);
+
+  return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {
+ public:
+  double partial;
+
+  arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) {
+    for (auto array : batch->columns()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this));
+    }
+    return partial;
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::Array& array) {
+    return arrow::Status::NotImplemented("Can not compute sum for array of 
type ",
+                                         array.type()->ToString());
+  }
+
+  template <typename ArrayType, typename T = typename ArrayType::TypeClass>
+  arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) {
+    for (auto value : array) {
+      if (value.has_value()) {
+        partial += (double)value.value();
+      }
+    }
+    return arrow::Status::OK();
+  }
+};  // TableSummation
+
+arrow::Status VisitorSummationExample() {
+  StartRecipe("VisitorSummationExample");
+  std::shared_ptr<arrow::Schema> schema = arrow::schema({
+      arrow::field("a", arrow::int32()),
+      arrow::field("b", arrow::int64()),
+      arrow::field("c", arrow::float64()),
+  });
+  int64_t num_rows = 3;
+  std::vector<std::shared_ptr<arrow::Array>> columns;
+
+  arrow::Int32Builder a_builder = arrow::Int32Builder();
+  std::vector<int32_t> a_vals = {1, 2, 3};
+  ARROW_RETURN_NOT_OK(a_builder.AppendValues(a_vals));
+  ARROW_ASSIGN_OR_RAISE(auto a_arr, a_builder.Finish());
+  columns.push_back(a_arr);
+
+  arrow::Int64Builder b_builder = arrow::Int64Builder();
+  std::vector<int64_t> b_vals = {4, 5, 6};
+  ARROW_RETURN_NOT_OK(b_builder.AppendValues(b_vals));
+  ARROW_ASSIGN_OR_RAISE(auto b_arr, b_builder.Finish());
+  columns.push_back(b_arr);

Review comment:
       Perhaps we can have just two columns in order to make the example 
shorter? I'm not sure showing two different integer sizes is really important.

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {

Review comment:
       Add a brief comment explaining what this class does, and how it only 
implements a couple of types?

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};
+    for (int i = 0; i < num_rows_; ++i) {
+      builder.Append(d(gen_));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{4};
+    auto builder = arrow::Int32Builder();
+    builder.Append(0);
+    int32_t last_val = 0;
+    for (int i = 0; i < num_rows_; ++i) {
+      last_val += d(gen_);
+      builder.Append(last_val);
+    }
+    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+    // Generate values as if we had that number of values
+    int64_t previous_num_rows = num_rows_;
+    num_rows_ = last_val;
+    ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+    auto values = arrays_.back();
+    arrays_.pop_back();
+    num_rows_ = previous_num_rows;

Review comment:
       I find the `previous_num_rows` dance a bit ugly. Should we instead 
instantiate a local `RandomBatchGenerator` for the values array?

##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
 TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
 
 TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : 
schema(schema){};
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t 
num_rows) {
+    num_rows_ = num_rows;
+    for (auto field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType& type) {
+    return arrow::Status::NotImplemented("Generating data for", 
type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::DoubleType&) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{5, 2};
+    for (int i = 0; i < num_rows_; ++i) {
+      builder.Append(d(gen_));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::ListType& type) {
+    // Generate offsets first, which determines number of values in sub-array
+    std::poisson_distribution<> d{4};
+    auto builder = arrow::Int32Builder();
+    builder.Append(0);
+    int32_t last_val = 0;
+    for (int i = 0; i < num_rows_; ++i) {
+      last_val += d(gen_);
+      builder.Append(last_val);
+    }
+    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+    // Generate values as if we had that number of values
+    int64_t previous_num_rows = num_rows_;
+    num_rows_ = last_val;
+    ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+    auto values = arrays_.back();
+    arrays_.pop_back();
+    num_rows_ = previous_num_rows;
+
+    ARROW_ASSIGN_OR_RAISE(auto array,
+                          arrow::ListArray::FromArrays(*offsets.get(), 
*values.get()));
+    arrays_.push_back(array);
+
+    return arrow::Status::OK();
+  }
+
+ protected:
+  std::random_device rd_{};
+  std::mt19937 gen_{rd_()};
+  std::vector<std::shared_ptr<arrow::Array>> arrays_;
+  int64_t num_rows_;
+};  // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+  StartRecipe("GenerateRandomData");
+  std::shared_ptr<arrow::Schema> schema =
+      arrow::schema({arrow::field("x", arrow::float64()),
+                     arrow::field("y", arrow::list(arrow::float64()))});
+
+  RandomBatchGenerator generator(schema);
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, 
generator.Generate(5));
+
+  rout << "Created batch: \n" << batch;
+
+  // Consider using ValidateFull to check correctness
+  ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+  EndRecipe("GenerateRandomData");
+  EXPECT_EQ(batch->num_rows(), 5);
+
+  return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {

Review comment:
       
   
   Add a brief comment explaining what this class does, and how it only 
implements a couple of types?
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to