js8544 commented on code in PR #37896:
URL: https://github.com/apache/arrow/pull/37896#discussion_r1358074430


##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -555,4 +555,33 @@ TEST_F(TestRecordBatch, ReplaceSchema) {
   ASSERT_RAISES(Invalid, b1->ReplaceSchema(schema));
 }
 
+TEST_F(TestRecordBatch, ConcatenateRecordBatches) {
+  int length = 10;
+
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", uint8());
+
+  auto schema = ::arrow::schema({f0, f1});
+
+  random::RandomArrayGenerator gen(42);
+
+  auto b1 = gen.BatchOf(schema->fields(), length);
+
+  length = 5;
+
+  auto b2 = gen.BatchOf(schema->fields(), length);
+
+  ASSERT_OK_AND_ASSIGN(auto batch, ConcatenateRecordBatches({b1, b2}));
+  ASSERT_EQ(batch->num_rows(), b1->num_rows() + b2->num_rows());
+
+  f0 = field("fd0", int32());
+  f1 = field("fd1", uint8());
+
+  schema = ::arrow::schema({f0, f1});
+
+  auto b3 = gen.BatchOf(schema->fields(), length);
+
+  ASSERT_RAISES(Invalid, ConcatenateRecordBatches({b1, b3}));

Review Comment:
   It's desirable to have tests on edge cases such as zero-length RecordBatch 
and zero-column ones.



##########
cpp/src/arrow/record_batch.cc:
##########
@@ -432,4 +433,43 @@ RecordBatchReader::~RecordBatchReader() {
   ARROW_WARN_NOT_OK(this->Close(), "Implicitly called RecordBatchReader::Close 
failed");
 }
 
+Result<std::shared_ptr<RecordBatch>> ConcatenateRecordBatches(
+    const RecordBatchVector& batches, MemoryPool* pool) {
+  int64_t length = 0;
+  size_t n = batches.size();
+  if (n == 0) {
+    return Status::Invalid("Must pass at least one recordbatch");
+  }
+  if (n == 1) {
+    return batches[0];
+  }
+  int cols = batches[0]->num_columns();
+  auto schema = batches[0]->schema();
+  std::vector<std::shared_ptr<Array>> columns;
+  if (cols == 0) {
+    // special case: null batch, no data, just length
+    for (size_t i = 0; i < batches.size(); ++i) {
+      length += batches[i]->num_rows();
+    }
+  } else {

Review Comment:
   Will the following change make any difference to the result?
   ```cpp
   /// \brief Concatenate a vector of RecordBatches into a single RecordBatch.
   ///
   /// All batches must share the schema of the first batch; otherwise
   /// Status::Invalid is returned. The result length is the sum of the
   /// per-batch row counts, which is correct even for zero-column batches.
   ///
   /// \param[in] batches the batches to concatenate (must be non-empty)
   /// \param[in] pool memory pool used for the concatenated column buffers
   /// \return the concatenated RecordBatch, or Status::Invalid on empty input
   ///         or schema mismatch
   Result<std::shared_ptr<RecordBatch>> ConcatenateRecordBatches(
       const RecordBatchVector& batches, MemoryPool* pool) {
     // An empty input is ambiguous: there is no schema for the result.
     if (batches.empty()) {
       return Status::Invalid("Must pass at least one recordbatch");
     }
     int64_t length = 0;
     auto schema = batches[0]->schema();
     // Validate schemas and accumulate the total row count in one pass.
     // Use size_t for the index to avoid a signed/unsigned comparison.
     for (size_t i = 0; i < batches.size(); ++i) {
       length += batches[i]->num_rows();
       if (!schema->Equals(batches[i]->schema())) {
         return Status::Invalid(
             "Schema of RecordBatch index ", i, " is ", batches[i]->schema()->ToString(),
             ", which does not match index 0 recordbatch schema: ", schema->ToString());
       }
     }

     // Concatenate column-by-column. Each RecordBatch guarantees all of its
     // columns have the same length, so no per-column length check is needed.
     const int num_columns = batches[0]->num_columns();
     std::vector<std::shared_ptr<Array>> concatenated_columns;
     concatenated_columns.reserve(num_columns);
     for (int col = 0; col < num_columns; ++col) {
       ArrayVector column_arrays;
       column_arrays.reserve(batches.size());
       for (const auto& batch : batches) {
         // column() returns a shared_ptr by value, so no std::move is needed.
         column_arrays.push_back(batch->column(col));
       }
       ARROW_ASSIGN_OR_RAISE(auto concatenated_column, Concatenate(column_arrays, pool));
       concatenated_columns.push_back(std::move(concatenated_column));
     }
     return RecordBatch::Make(std::move(schema), length, std::move(concatenated_columns));
   }
   ```
   I assumed:
   1. Schema checking can be moved out of the nested loop.
   2. The result length is just the sum of all batch lengths, regardless of how
many columns there are (including zero).
   3. Each column has the same length in a RecordBatch, so no checking of 
column length is needed here.
   4. Variable names like `column_data`, `data`, `columns` are ambiguous, so I
changed them to more descriptive names.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to