jorisvandenbossche commented on code in PR #40359:
URL: https://github.com/apache/arrow/pull/40359#discussion_r1514430991
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
+
+ // Copy data of each column
+ memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
+ out_values += batch.num_rows();
+ } // End loop through columns
+
+ } else { // If columns have mixed data type
Review Comment:
We might want to keep the simpler memcpy for those columns where the column
type matches the result type?
Another way to organize the code is to first loop over the columns, and only
then switch between memcpy or the loop, depending on the column type.
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
+
+ // Copy data of each column
+ memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
+ out_values += batch.num_rows();
+ } // End loop through columns
+
+ } else { // If columns have mixed data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+
+ // Copy data of each column
+ for (int i = 0; i < arr.length(); ++i) {
Review Comment:
```suggestion
for (int64_t i = 0; i < arr.length(); ++i) {
```
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
Review Comment:
```suggestion
const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
```
(to keep the suggestion from the previous pr)
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -269,28 +320,46 @@ Result<std::shared_ptr<Tensor>>
RecordBatch::ToTensor(MemoryPool* pool) const {
"Conversion to Tensor for RecordBatches without columns/schema is not "
"supported.");
}
- const auto& type = column(0)->type();
- // Check for supported data types
- if (!is_integer(type->id()) && !is_floating(type->id())) {
- return Status::TypeError("DataType is not supported: ", type->ToString());
- }
- // Check for uniform data type
// Check for no validity bitmap of each field
for (int i = 0; i < num_columns(); ++i) {
if (column(i)->null_count() > 0) {
return Status::TypeError("Can only convert a RecordBatch with no
nulls.");
}
- if (column(i)->type() != type) {
- return Status::TypeError("Can only convert a RecordBatch with uniform
data type.");
+ }
+
+ // Check for supported data types and merge fields
+ // to get the resulting uniform data type
+ if (!is_integer(column(0)->type()->id()) &&
!is_floating(column(0)->type()->id())) {
+ return Status::TypeError("DataType is not supported: ",
+ column(0)->type()->ToString());
+ }
+ std::shared_ptr<Field> result_field = schema_->field(0);
+ std::shared_ptr<DataType> result_type = result_field->type();
+
+ if (num_columns() > 1) {
+ Field::MergeOptions options;
+ options.promote_integer_to_float = true;
+ options.promote_integer_sign = true;
+ options.promote_numeric_width = true;
+
+ for (int i = 1; i < num_columns(); ++i) {
+ if (!is_integer(column(i)->type()->id()) &&
!is_floating(column(i)->type()->id())) {
+ return Status::TypeError("DataType is not supported: ",
+ column(i)->type()->ToString());
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ result_field, result_field->MergeWith(
Review Comment:
I would think you can move `MergeTypes` out of the internal namespace (e.g.,
just before the `Field::MergeWith` impl) and add it to type.h. It shouldn't be
a problem that the other functions used inside `MergeTypes` are still left as
internal.
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -253,14 +253,65 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
using CType = typename arrow::TypeTraits<DataType>::CType;
auto* out_values = reinterpret_cast<CType*>(out);
- // Loop through all of the columns
- for (int i = 0; i < batch.num_columns(); ++i) {
- const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
-
- // Copy data of each column
- memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
- out_values += batch.num_rows();
- } // End loop through columns
+ if (TypeTraits<DataType>::type_singleton() ==
+ batch.column(0)->type()) { // If all columns are of same data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
+
+ // Copy data of each column
+ memcpy(out_values, in_values, sizeof(CType) * batch.num_rows());
+ out_values += batch.num_rows();
+ } // End loop through columns
+
+ } else { // If columns have mixed data type
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+
+ // Copy data of each column
+ for (int i = 0; i < arr.length(); ++i) {
+ switch (arr.type_id()) {
+ case Type::UINT8:
+ *out_values++ = static_cast<CType>(data->GetValues<uint8_t>(1)[i]);
Review Comment:
It would be good to check the overhead of calling
`data->GetValues<uint8_t>(1)` inside the loop, i.e. once for each value.
If it is significant, you can instead get those values once before the loop,
and then use them similarly to `out_values`:
```
const uint8_t* in_values = data->GetValues<uint8_t>(1);
for (int64_t i = 0; i < arr.length(); ++i) {
*out_values++ = static_cast<CType>(*in_values++);
}
```
This fits less nicely in the switch case (it requires moving the loop inside
each case), but maybe you can avoid the repetition with a macro.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]