jorisvandenbossche commented on code in PR #40359:
URL: https://github.com/apache/arrow/pull/40359#discussion_r1514430991
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
+
+ // Copy data of each column
+ memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
+ out_values += batch.num_rows();
+ } // End loop through columns
+
+ } else { // If columns have mixed data type
Review Comment:
We might want to keep the simpler memcpy for those columns where the column
type matches the result type?
Another way to organize the code is to first loop over the columns, and only
then switch between memcpy or the loop, depending on the column type.
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
+
+ // Copy data of each column
+ memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
+ out_values += batch.num_rows();
+ } // End loop through columns
+
+ } else { // If columns have mixed data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+
+ // Copy data of each column
+ for (int i = 0; i < arr.length(); ++i) {
Review Comment:
```suggestion
for (int64_t i = 0; i < arr.length(); ++i) {
```
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
Review Comment:
```suggestion
const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
```
(to keep the suggestion from the previous pr)
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -269,28 +320,46 @@ Result<std::shared_ptr<Tensor>>
RecordBatch::ToTensor(MemoryPool* pool) const {
"Conversion to Tensor for RecordBatches without columns/schema is not "
"supported.");
}
- const auto& type = column(0)->type();
- // Check for supported data types
- if (!is_integer(type->id()) && !is_floating(type->id())) {
- return Status::TypeError("DataType is not supported: ", type->ToString());
- }
- // Check for uniform data type
// Check for no validity bitmap of each field
for (int i = 0; i < num_columns(); ++i) {
if (column(i)->null_count() > 0) {
return Status::TypeError("Can only convert a RecordBatch with no
nulls.");
}
- if (column(i)->type() != type) {
- return Status::TypeError("Can only convert a RecordBatch with uniform
data type.");
+ }
+
+ // Check for supported data types and merge fields
+ // to get the resulting uniform data type
+ if (!is_integer(column(0)->type()->id()) &&
!is_floating(column(0)->type()->id())) {
+ return Status::TypeError("DataType is not supported: ",
+ column(0)->type()->ToString());
+ }
+ std::shared_ptr<Field> result_field = schema_->field(0);
+ std::shared_ptr<DataType> result_type = result_field->type();
+
+ if (num_columns() > 1) {
+ Field::MergeOptions options;
+ options.promote_integer_to_float = true;
+ options.promote_integer_sign = true;
+ options.promote_numeric_width = true;
+
+ for (int i = 1; i < num_columns(); ++i) {
+ if (!is_integer(column(i)->type()->id()) &&
!is_floating(column(i)->type()->id())) {
+ return Status::TypeError("DataType is not supported: ",
+ column(i)->type()->ToString());
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ result_field, result_field->MergeWith(
Review Comment:
I would think you can move `MergeTypes` out of the internal namespace (e.g.,
just before the `Field::MergeWith` impl) and add it to type.h. It shouldn't be
a problem that the other functions used inside `MergeTypes` are still left as
internal.
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
+
+ // Copy data of each column
+ memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
+ out_values += batch.num_rows();
+ } // End loop through columns
+
+ } else { // If columns have mixed data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+
+ // Copy data of each column
+ for (int i = 0; i < arr.length(); ++i) {
+ switch (arr.type_id()) {
+ case Type::UINT8:
+ *out_values++ = static_cast<CType>(data->GetValues<uint8_t>(1)[i]);
Review Comment:
It would be good to check the overhead of calling
`data->GetValues<uint8_t>(1)` inside the loop, i.e. once for each value.
If it is significant, you can instead get those values once before the loop,
and then use them similarly to `out_values`:
```
const uint8_t* in_values = data->GetValues<uint8_t>(1);
for (int64_t i = 0; i < arr.length(); ++i) {
*out_values++ = static_cast<CType>(*in_values++);
}
```
This fits less nicely in the switch case (it requires moving the loop inside
each case), but maybe you can avoid the repetition with a macro.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]