bkietz commented on code in PR #40064:
URL: https://github.com/apache/arrow/pull/40064#discussion_r1507886826
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -247,6 +248,99 @@ Result<std::shared_ptr<StructArray>> RecordBatch::ToStructArray() const {
/*offset=*/0);
}
+template <typename DataType>
+inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) {
+ using CType = typename arrow::TypeTraits<DataType>::CType;
+ auto* out_values = reinterpret_cast<CType*>(out);
+
+ // Loop through all of the columns
+ for (int i = 0; i < batch.num_columns(); ++i) {
+ const auto& arr = *batch.column(i);
+ auto data = arr.data();
+ const auto& in_values = data->GetValues<CType>(1);
Review Comment:
```suggestion
const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
```
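`GetValues<CType>(1)` returns a raw pointer, so `const auto*` makes that explicit instead of binding a reference to a temporary pointer, and the two intermediate locals become unnecessary. Purely for illustration, a sketch of how the loop might read with the suggestion folded in; the rest of the body is not shown in this hunk, so the column-major copy below is only an assumed continuation:

```cpp
// Sketch only (needs <cstring>): the loop with the suggested one-liner applied.
// The memcpy is an assumed continuation of the elided body, copying each
// fixed-width, null-free column into its column-major slab of the output.
for (int i = 0; i < batch.num_columns(); ++i) {
  const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
  std::memcpy(out_values + i * batch.num_rows(), in_values,
              batch.num_rows() * sizeof(CType));
}
```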
##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -592,4 +593,232 @@ TEST_F(TestRecordBatch, ConcatenateRecordBatches) {
ASSERT_BATCHES_EQUAL(*batch, *null_batch);
}
+TEST_F(TestRecordBatch, ToTensorUnsupported) {
+ const int length = 9;
+
+ // Mixed data type
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", int64());
+
+ std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+ auto schema = ::arrow::schema(fields);
+
+ auto a0 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]");
+ auto a1 = ArrayFromJSON(int64(), "[10, 20, 30, 40, 50, 60, 70, 80, 90]");
+
+ auto batch = RecordBatch::Make(schema, length, {a0, a1});
+
+ ASSERT_RAISES_WITH_MESSAGE(
+ TypeError, "Type error: Can only convert a RecordBatch with uniform data
type.",
+ batch->ToTensor());
+
+ // Unsupported data type
+ auto f2 = field("f2", utf8());
+
+ std::vector<std::shared_ptr<Field>> fields_1 = {f2};
+ auto schema_2 = ::arrow::schema(fields_1);
+
+ auto a2 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "a", "b", "c", "a", "b", "c"])");
+ auto batch_2 = RecordBatch::Make(schema_2, length, {a2});
+
+ ASSERT_RAISES_WITH_MESSAGE(
+ TypeError, "Type error: DataType is not supported: " +
a2->type()->ToString(),
+ batch_2->ToTensor());
+}
+
+TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) {
+ const int length = 9;
+
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", int32());
+
+ std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+ auto schema = ::arrow::schema(fields);
+
+ auto a0 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]");
+ auto a1 = ArrayFromJSON(int32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]");
+
+ auto batch = RecordBatch::Make(schema, length, {a0, a1});
+
+ ASSERT_RAISES_WITH_MESSAGE(TypeError,
+ "Type error: Can only convert a RecordBatch with
no nulls.",
+ batch->ToTensor());
+}
+
+TEST_F(TestRecordBatch, ToTensorEmptyBatch) {
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", int32());
+
+ std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+ auto schema = ::arrow::schema(fields);
+
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<RecordBatch> empty,
+ RecordBatch::MakeEmpty(schema));
+
+ ASSERT_OK_AND_ASSIGN(auto tensor, empty->ToTensor());
+ ASSERT_OK(tensor->Validate());
+
+ const std::vector<int64_t> strides = {4, 4};
+ const std::vector<int64_t> shape = {0, 2};
+
+ EXPECT_EQ(strides, tensor->strides());
+ EXPECT_EQ(shape, tensor->shape());
+
+ auto batch_no_columns =
+ RecordBatch::Make(::arrow::schema({}), 10, std::vector<std::shared_ptr<Array>>{});
+
+ ASSERT_RAISES_WITH_MESSAGE(TypeError,
+ "Type error: Conversion to Tensor for
RecordBatches without "
+ "columns/schema is not supported.",
+ batch_no_columns->ToTensor());
+}
+
+template <typename DataType>
+void CheckTensor(const std::shared_ptr<Tensor>& tensor, const int size,
+ const std::vector<int64_t> shape, const std::vector<int64_t> f_strides) {
+ EXPECT_EQ(size, tensor->size());
+ EXPECT_EQ(TypeTraits<DataType>::type_singleton(), tensor->type());
+ EXPECT_EQ(shape, tensor->shape());
+ EXPECT_EQ(f_strides, tensor->strides());
+ EXPECT_FALSE(tensor->is_row_major());
+ EXPECT_TRUE(tensor->is_column_major());
+ EXPECT_TRUE(tensor->is_contiguous());
+}
+
+TEST_F(TestRecordBatch, ToTensorSupportedNaN) {
+ const int length = 9;
+
+ auto f0 = field("f0", float32());
+ auto f1 = field("f1", float32());
+
+ std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+ auto schema = ::arrow::schema(fields);
+
+ auto a0 = ArrayFromJSON(float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9]");
+ auto a1 = ArrayFromJSON(float32(), "[10, 20, 30, 40, NaN, 60, 70, 80, 90]");
+
+ auto batch = RecordBatch::Make(schema, length, {a0, a1});
+
+ ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor());
+ ASSERT_OK(tensor->Validate());
+
+ std::vector<int64_t> shape = {9, 2};
+ const int64_t f32_size = sizeof(float);
+ std::vector<int64_t> f_strides = {f32_size, f32_size * shape[0]};
+ std::vector<float> f_values = {
+ static_cast<float>(NAN), 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40,
+ static_cast<float>(NAN), 60, 70, 80, 90};
+ auto data = Buffer::Wrap(f_values);
+
+ std::shared_ptr<Tensor> tensor_expected;
+ ASSERT_OK_AND_ASSIGN(tensor_expected, Tensor::Make(float32(), data, shape, f_strides));
Review Comment:
Would you mind writing TensorFromJSON(), or filing an issue for it? I think it could make these tests easier to read and write.
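For reference, a rough sketch of what such a helper could look like. TensorFromJSON() does not exist in Arrow today; the name, placement (next to ArrayFromJSON in the testing utilities), and signature below are all hypothetical and would be settled in that issue. It assumes a fixed-width type, no nulls, and a zero array offset so the values buffer can be wrapped directly:

```cpp
// Hypothetical test helper, not part of Arrow: parse a primitive array from a
// JSON literal and view its values buffer as a Tensor with the given layout.
Result<std::shared_ptr<Tensor>> TensorFromJSON(const std::shared_ptr<DataType>& type,
                                               std::string_view json,
                                               const std::vector<int64_t>& shape,
                                               const std::vector<int64_t>& strides) {
  std::shared_ptr<Array> array = ArrayFromJSON(type, json);
  // buffers[1] is the values buffer of a primitive array
  return Tensor::Make(type, array->data()->buffers[1], shape, strides);
}
```

With something like that, the expected tensor in ToTensorSupportedNaN could be built in one step, e.g.:

```cpp
ASSERT_OK_AND_ASSIGN(
    auto tensor_expected,
    TensorFromJSON(float32(),
                   "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]",
                   shape, f_strides));
```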