pitrou commented on a change in pull request #166:
URL: https://github.com/apache/arrow-cookbook/pull/166#discussion_r831134267
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
+ for (int i = 0; i < num_rows_; ++i) {
Review comment:
We should decide to always use either fixed-sized ints (`int32_t`) or
flexibly-sized ones (`int`). Mixing the two is going to confuse the user IMHO.
@westonpace What do you think?
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
Review comment:
```suggestion
std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
```
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
+ for (int i = 0; i < num_rows_; ++i) {
+ builder.Append(d(gen_));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+ arrays_.push_back(array);
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::ListType& type) {
+ // Generate offsets first, which determines number of values in sub-array
+ std::poisson_distribution<> d{4};
+ auto builder = arrow::Int32Builder();
+ builder.Append(0);
+ int32_t last_val = 0;
+ for (int i = 0; i < num_rows_; ++i) {
+ last_val += d(gen_);
+ builder.Append(last_val);
+ }
+ ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+ // Generate values as if we had that number of values
+ int64_t previous_num_rows = num_rows_;
+ num_rows_ = last_val;
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+ auto values = arrays_.back();
+ arrays_.pop_back();
+ num_rows_ = previous_num_rows;
+
+ ARROW_ASSIGN_OR_RAISE(auto array,
+ arrow::ListArray::FromArrays(*offsets.get(),
*values.get()));
+ arrays_.push_back(array);
+
+ return arrow::Status::OK();
+ }
+
+ protected:
+ std::random_device rd_{};
+ std::mt19937 gen_{rd_()};
+ std::vector<std::shared_ptr<arrow::Array>> arrays_;
+ int64_t num_rows_;
+}; // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+ StartRecipe("GenerateRandomData");
+ std::shared_ptr<arrow::Schema> schema =
+ arrow::schema({arrow::field("x", arrow::float64()),
+ arrow::field("y", arrow::list(arrow::float64()))});
+
+ RandomBatchGenerator generator(schema);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch,
generator.Generate(5));
+
+ rout << "Created batch: \n" << batch;
+
+ // Consider using ValidateFull to check correctness
+ ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+ EndRecipe("GenerateRandomData");
+ EXPECT_EQ(batch->num_rows(), 5);
+
+ return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {
+ public:
+ double partial;
+
+ arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) {
+ for (auto array : batch->columns()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this));
+ }
+ return partial;
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::Array& array) {
+ return arrow::Status::NotImplemented("Can not compute sum for array of
type ",
+ array.type()->ToString());
+ }
+
+ template <typename ArrayType, typename T = typename ArrayType::TypeClass>
+ arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) {
+ for (auto value : array) {
+ if (value.has_value()) {
+ partial += (double)value.value();
Review comment:
```suggestion
partial += static_cast<double>(value.value());
```
##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you. It will run the contained expression
and check the resulti
.. recipe:: ../code/basic_arrow.cc ReturnNotOk
:caption: Using ARROW_RETURN_NOT_OK to check the status
:dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the
visitor
+pattern. Arrow provides the macros:
+
+ * `arrow::VisitTypeInline`
Review comment:
Should this use more proper reST markup to create a cross-reference?
##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you. It will run the contained expression
and check the resulti
.. recipe:: ../code/basic_arrow.cc ReturnNotOk
:caption: Using ARROW_RETURN_NOT_OK to check the status
:dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
Review comment:
Can we cross reference these classes?
##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you. It will run the contained expression
and check the resulti
.. recipe:: ../code/basic_arrow.cc ReturnNotOk
:caption: Using ARROW_RETURN_NOT_OK to check the status
:dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the
visitor
+pattern. Arrow provides the macros:
+
+ * `arrow::VisitTypeInline`
+ * `arrow::VisitScalarInline`
+ * `arrow::VisitArrayInline`
+
+To implement a TypeVisitor we have to implement a Visit method for every
possible
+DataType we wish to handle. Fortunately, we can often use templates and type
+traits to make this less verbose.
+
+Generate Random Data for Given Schema
+-------------------------------------
+
+To generate random data for a given schema, a type visitor is helpful.
+
+
+.. literalinclude:: ../code/basic_arrow.cc
+ :language: cpp
+ :linenos:
+ :start-at: class RandomBatchGenerator
+ :end-at: }; // RandomBatchGenerator
+ :caption: Using visitor pattern to generate random record batches
+
+
+.. recipe:: ../code/basic_arrow.cc GenerateRandomData
+ :dedent: 2
+
+
+Convert Arbitrary Scalars to Variants
+-------------------------------------
+
+.. TODO: Implement converter from rows to values
Review comment:
Is this a recipe you actually want to write soon (in this PR perhaps)?
Otherwise, a JIRA issue is better suited than a TODO comment directly in the
source tree, IMHO.
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
+ for (int i = 0; i < num_rows_; ++i) {
+ builder.Append(d(gen_));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+ arrays_.push_back(array);
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::ListType& type) {
+ // Generate offsets first, which determines number of values in sub-array
+ std::poisson_distribution<> d{4};
Review comment:
```suggestion
std::poisson_distribution<> d{/*mean=*/4};
```
##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you. It will run the contained expression
and check the resulti
.. recipe:: ../code/basic_arrow.cc ReturnNotOk
:caption: Using ARROW_RETURN_NOT_OK to check the status
:dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the
visitor
Review comment:
Missing word here ("you can use" perhaps?).
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
+ for (int i = 0; i < num_rows_; ++i) {
+ builder.Append(d(gen_));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+ arrays_.push_back(array);
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::ListType& type) {
+ // Generate offsets first, which determines number of values in sub-array
+ std::poisson_distribution<> d{4};
+ auto builder = arrow::Int32Builder();
+ builder.Append(0);
+ int32_t last_val = 0;
+ for (int i = 0; i < num_rows_; ++i) {
+ last_val += d(gen_);
+ builder.Append(last_val);
+ }
+ ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+ // Generate values as if we had that number of values
+ int64_t previous_num_rows = num_rows_;
+ num_rows_ = last_val;
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+ auto values = arrays_.back();
+ arrays_.pop_back();
+ num_rows_ = previous_num_rows;
+
+ ARROW_ASSIGN_OR_RAISE(auto array,
+ arrow::ListArray::FromArrays(*offsets.get(),
*values.get()));
+ arrays_.push_back(array);
+
+ return arrow::Status::OK();
+ }
+
+ protected:
+ std::random_device rd_{};
+ std::mt19937 gen_{rd_()};
+ std::vector<std::shared_ptr<arrow::Array>> arrays_;
+ int64_t num_rows_;
+}; // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+ StartRecipe("GenerateRandomData");
+ std::shared_ptr<arrow::Schema> schema =
+ arrow::schema({arrow::field("x", arrow::float64()),
+ arrow::field("y", arrow::list(arrow::float64()))});
+
+ RandomBatchGenerator generator(schema);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch,
generator.Generate(5));
+
+ rout << "Created batch: \n" << batch;
+
+ // Consider using ValidateFull to check correctness
+ ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+ EndRecipe("GenerateRandomData");
+ EXPECT_EQ(batch->num_rows(), 5);
+
+ return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {
+ public:
+ double partial;
+
+ arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) {
+ for (auto array : batch->columns()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this));
+ }
+ return partial;
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::Array& array) {
+ return arrow::Status::NotImplemented("Can not compute sum for array of
type ",
+ array.type()->ToString());
+ }
+
+ template <typename ArrayType, typename T = typename ArrayType::TypeClass>
+ arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) {
+ for (auto value : array) {
Review comment:
Perhaps it would be informative to spell out the value type explicitly
here (i.e. `arrow::util::optional<double>` IIUC).
##########
File path: cpp/source/basic.rst
##########
@@ -48,3 +48,58 @@ boilerplate for you. It will run the contained expression
and check the resulti
.. recipe:: ../code/basic_arrow.cc ReturnNotOk
:caption: Using ARROW_RETURN_NOT_OK to check the status
:dedent: 2
+
+
+Using the Visitor Pattern
+=========================
+
+Arrow classes DataType, Scalar, and Array have specialized subclasses for
+each Arrow type. In order to specialize logic for each subclass, you the
visitor
+pattern. Arrow provides the macros:
+
+ * `arrow::VisitTypeInline`
+ * `arrow::VisitScalarInline`
+ * `arrow::VisitArrayInline`
+
+To implement a TypeVisitor we have to implement a Visit method for every
possible
+DataType we wish to handle. Fortunately, we can often use templates and type
+traits to make this less verbose.
+
+Generate Random Data for Given Schema
Review comment:
"For a given Schema", perhaps?
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
+ for (int i = 0; i < num_rows_; ++i) {
+ builder.Append(d(gen_));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+ arrays_.push_back(array);
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::ListType& type) {
+ // Generate offsets first, which determines number of values in sub-array
+ std::poisson_distribution<> d{4};
+ auto builder = arrow::Int32Builder();
+ builder.Append(0);
+ int32_t last_val = 0;
+ for (int i = 0; i < num_rows_; ++i) {
+ last_val += d(gen_);
+ builder.Append(last_val);
+ }
+ ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+ // Generate values as if we had that number of values
+ int64_t previous_num_rows = num_rows_;
+ num_rows_ = last_val;
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+ auto values = arrays_.back();
+ arrays_.pop_back();
+ num_rows_ = previous_num_rows;
+
+ ARROW_ASSIGN_OR_RAISE(auto array,
+ arrow::ListArray::FromArrays(*offsets.get(),
*values.get()));
+ arrays_.push_back(array);
+
+ return arrow::Status::OK();
+ }
+
+ protected:
+ std::random_device rd_{};
+ std::mt19937 gen_{rd_()};
+ std::vector<std::shared_ptr<arrow::Array>> arrays_;
+ int64_t num_rows_;
+}; // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+ StartRecipe("GenerateRandomData");
+ std::shared_ptr<arrow::Schema> schema =
+ arrow::schema({arrow::field("x", arrow::float64()),
+ arrow::field("y", arrow::list(arrow::float64()))});
+
+ RandomBatchGenerator generator(schema);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch,
generator.Generate(5));
+
+ rout << "Created batch: \n" << batch;
+
+ // Consider using ValidateFull to check correctness
+ ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+ EndRecipe("GenerateRandomData");
+ EXPECT_EQ(batch->num_rows(), 5);
+
+ return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {
+ public:
+ double partial;
+
+ arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) {
+ for (auto array : batch->columns()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this));
+ }
+ return partial;
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::Array& array) {
+ return arrow::Status::NotImplemented("Can not compute sum for array of
type ",
+ array.type()->ToString());
+ }
+
+ template <typename ArrayType, typename T = typename ArrayType::TypeClass>
+ arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) {
+ for (auto value : array) {
+ if (value.has_value()) {
+ partial += (double)value.value();
+ }
+ }
+ return arrow::Status::OK();
+ }
+}; // TableSummation
+
+arrow::Status VisitorSummationExample() {
+ StartRecipe("VisitorSummationExample");
+ std::shared_ptr<arrow::Schema> schema = arrow::schema({
+ arrow::field("a", arrow::int32()),
+ arrow::field("b", arrow::int64()),
+ arrow::field("c", arrow::float64()),
+ });
+ int64_t num_rows = 3;
+ std::vector<std::shared_ptr<arrow::Array>> columns;
+
+ arrow::Int32Builder a_builder = arrow::Int32Builder();
+ std::vector<int32_t> a_vals = {1, 2, 3};
+ ARROW_RETURN_NOT_OK(a_builder.AppendValues(a_vals));
+ ARROW_ASSIGN_OR_RAISE(auto a_arr, a_builder.Finish());
+ columns.push_back(a_arr);
+
+ arrow::Int64Builder b_builder = arrow::Int64Builder();
+ std::vector<int64_t> b_vals = {4, 5, 6};
+ ARROW_RETURN_NOT_OK(b_builder.AppendValues(b_vals));
+ ARROW_ASSIGN_OR_RAISE(auto b_arr, b_builder.Finish());
+ columns.push_back(b_arr);
Review comment:
Perhaps we can have just two columns in order to make the example
shorter? I'm not sure showing two different integer sizes is really important.
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
Review comment:
Add a brief comment explaining what this class does, and how it only
implements a couple of types?
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
+ for (int i = 0; i < num_rows_; ++i) {
+ builder.Append(d(gen_));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+ arrays_.push_back(array);
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::ListType& type) {
+ // Generate offsets first, which determines number of values in sub-array
+ std::poisson_distribution<> d{4};
+ auto builder = arrow::Int32Builder();
+ builder.Append(0);
+ int32_t last_val = 0;
+ for (int i = 0; i < num_rows_; ++i) {
+ last_val += d(gen_);
+ builder.Append(last_val);
+ }
+ ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+ // Generate values as if we had that number of values
+ int64_t previous_num_rows = num_rows_;
+ num_rows_ = last_val;
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+ auto values = arrays_.back();
+ arrays_.pop_back();
+ num_rows_ = previous_num_rows;
Review comment:
I find the `previous_num_rows` dance a bit ugly. Should we instead
instantiate a local `RandomBatchGenerator` for the values array?
##########
File path: cpp/code/basic_arrow.cc
##########
@@ -63,3 +69,162 @@ arrow::Status ReturnNotOk() {
TEST(BasicArrow, ReturnNotOkNoMacro) { ASSERT_OK(ReturnNotOkMacro()); }
TEST(BasicArrow, ReturnNotOk) { ASSERT_OK(ReturnNotOk()); }
+
+class RandomBatchGenerator {
+ public:
+ std::shared_ptr<arrow::Schema> schema;
+
+ RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) :
schema(schema){};
+
+ arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t
num_rows) {
+ num_rows_ = num_rows;
+ for (auto field : schema->fields()) {
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+ }
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+ }
+
+ // Default implementation
+ arrow::Status Visit(const arrow::DataType& type) {
+ return arrow::Status::NotImplemented("Generating data for",
type.ToString());
+ }
+
+ arrow::Status Visit(const arrow::DoubleType&) {
+ auto builder = arrow::DoubleBuilder();
+ std::normal_distribution<> d{5, 2};
+ for (int i = 0; i < num_rows_; ++i) {
+ builder.Append(d(gen_));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+ arrays_.push_back(array);
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::ListType& type) {
+ // Generate offsets first, which determines number of values in sub-array
+ std::poisson_distribution<> d{4};
+ auto builder = arrow::Int32Builder();
+ builder.Append(0);
+ int32_t last_val = 0;
+ for (int i = 0; i < num_rows_; ++i) {
+ last_val += d(gen_);
+ builder.Append(last_val);
+ }
+ ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
+
+ // Generate values as if we had that number of values
+ int64_t previous_num_rows = num_rows_;
+ num_rows_ = last_val;
+ ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*type.value_type(), this));
+ auto values = arrays_.back();
+ arrays_.pop_back();
+ num_rows_ = previous_num_rows;
+
+ ARROW_ASSIGN_OR_RAISE(auto array,
+ arrow::ListArray::FromArrays(*offsets.get(),
*values.get()));
+ arrays_.push_back(array);
+
+ return arrow::Status::OK();
+ }
+
+ protected:
+ std::random_device rd_{};
+ std::mt19937 gen_{rd_()};
+ std::vector<std::shared_ptr<arrow::Array>> arrays_;
+ int64_t num_rows_;
+}; // RandomBatchGenerator
+
+arrow::Status GenerateRandomData() {
+ StartRecipe("GenerateRandomData");
+ std::shared_ptr<arrow::Schema> schema =
+ arrow::schema({arrow::field("x", arrow::float64()),
+ arrow::field("y", arrow::list(arrow::float64()))});
+
+ RandomBatchGenerator generator(schema);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch,
generator.Generate(5));
+
+ rout << "Created batch: \n" << batch;
+
+ // Consider using ValidateFull to check correctness
+ ARROW_RETURN_NOT_OK(batch->ValidateFull());
+
+ EndRecipe("GenerateRandomData");
+ EXPECT_EQ(batch->num_rows(), 5);
+
+ return arrow::Status::OK();
+}
+
+TEST(BasicArrow, GenerateRandomData) { ASSERT_OK(GenerateRandomData()); }
+
+class TableSummation {
Review comment:
Add a brief comment explaining what this class does, and how it only
implements a couple of types?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]