jorisvandenbossche commented on code in PR #40803:
URL: https://github.com/apache/arrow/pull/40803#discussion_r1540593215
##########
cpp/src/arrow/record_batch.h:
##########
@@ -86,7 +86,7 @@ class ARROW_EXPORT RecordBatch {
/// strides (type size in bytes, type size in bytes * number of rows).
/// Generated Tensor will have column-major layout.
Result<std::shared_ptr<Tensor>> ToTensor(
- MemoryPool* pool = default_memory_pool()) const;
+ bool null_to_nan = false, MemoryPool* pool = default_memory_pool())
const;
Review Comment:
Can you add an explanation of that parameter in the doc comment just above?
##########
python/pyarrow/includes/libarrow.pxd:
##########
@@ -984,7 +984,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CRecordBatch] Slice(int64_t offset)
shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length)
- CResult[shared_ptr[CTensor]] ToTensor() const
+ CResult[shared_ptr[CTensor]] ToTensor(c_bool nul_to_nan, CMemoryPool*
pool) const
Review Comment:
```suggestion
CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan,
CMemoryPool* pool) const
```
##########
python/pyarrow/table.pxi:
##########
@@ -3389,21 +3389,64 @@ cdef class RecordBatch(_Tabular):
<CResult[shared_ptr[CArray]]>deref(c_record_batch).ToStructArray())
return pyarrow_wrap_array(c_array)
- def to_tensor(self):
+ def to_tensor(self, c_bool null_to_nan=False, MemoryPool memory_pool=None):
"""
Convert to a :class:`~pyarrow.Tensor`.
RecordBatches that can be converted have fields of type signed or
unsigned
- integer or float, including all bit-widths, with no validity bitmask.
+ integer or float, including all bit-widths. RecordBatches with
validity bitmask
+ for any of the arrays can be converted with ``null_to_nan``turned to
``True``.
+ In this case null values are converted to NaN and signed or unsigned
integer
+ type arrays are promoted to appropriate float instance.
+
+ Parameters
+ ----------
+ null_to_nan : bool, default False
+ Whether to overwrite null values in the data with ``NaN``.
Review Comment:
```suggestion
Whether to write null values in the result as ``NaN``.
```
##########
python/pyarrow/table.pxi:
##########
@@ -3389,21 +3389,64 @@ cdef class RecordBatch(_Tabular):
<CResult[shared_ptr[CArray]]>deref(c_record_batch).ToStructArray())
return pyarrow_wrap_array(c_array)
- def to_tensor(self):
+ def to_tensor(self, c_bool null_to_nan=False, MemoryPool memory_pool=None):
"""
Convert to a :class:`~pyarrow.Tensor`.
RecordBatches that can be converted have fields of type signed or
unsigned
- integer or float, including all bit-widths, with no validity bitmask.
+ integer or float, including all bit-widths. RecordBatches with
validity bitmask
+ for any of the arrays can be converted with ``null_to_nan``turned to
``True``.
+ In this case null values are converted to NaN and signed or unsigned
integer
+ type arrays are promoted to appropriate float instance.
Review Comment:
```suggestion
type arrays are promoted to appropriate float type.
```
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -276,7 +284,8 @@ struct ConvertColumnsToTensorVisitor {
};
template <typename DataType>
-inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) {
+inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out,
+ bool null_to_nan) {
Review Comment:
The `null_to_nan` is not used here?
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -334,42 +347,51 @@ Result<std::shared_ptr<Tensor>>
RecordBatch::ToTensor(MemoryPool* pool) const {
result_type = result_field->type();
}
+ // Check if result_type is signed or unsigned integer and null_to_nan is set
to true
+ // Then all columns should be promoted to float type
+ if (is_integer(result_type->id()) && null_to_nan) {
+ ARROW_ASSIGN_OR_RAISE(
+ result_field,
+ result_field->MergeWith(field(result_field->name(), float16()),
options));
Review Comment:
One problem with using float16 here is that for a combo of (int8, float16)
the merged type is still float16 (I think, didn't check), and we can't cast
(yet) to float16.
So maybe use float32 here, so the minimum resulting float type is float32
(and also add a test case for it)
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -286,16 +295,20 @@ inline void ConvertColumnsToTensor(const RecordBatch&
batch, uint8_t* out) {
}
}
-Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(MemoryPool* pool) const {
+Result<std::shared_ptr<Tensor>> RecordBatch::ToTensor(bool null_to_nan,
+ MemoryPool* pool) const {
if (num_columns() == 0) {
return Status::TypeError(
"Conversion to Tensor for RecordBatches without columns/schema is not "
"supported.");
}
// Check for no validity bitmap of each field
+ // if null_to_nan conversion is set to false
for (int i = 0; i < num_columns(); ++i) {
- if (column(i)->null_count() > 0) {
- return Status::TypeError("Can only convert a RecordBatch with no
nulls.");
+ if (column(i)->null_count() > 0 && !null_to_nan) {
+ return Status::TypeError(
+ "Can only convert a RecordBatch with no nulls. Set null_to_nan to
true to "
+ "convert nulls to nan");
Review Comment:
```suggestion
"convert nulls to NaN");
```
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -667,7 +667,8 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) {
auto batch = RecordBatch::Make(schema, length, {a0, a1});
ASSERT_RAISES_WITH_MESSAGE(TypeError,
- "Type error: Can only convert a RecordBatch with
no nulls.",
+ "Type error: Can only convert a RecordBatch with
no nulls. "
+ "Set null_to_nan to true to convert nulls to nan",
Review Comment:
```suggestion
"Set null_to_nan to true to convert nulls to
NaN",
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]