bkietz commented on code in PR #40064:
URL: https://github.com/apache/arrow/pull/40064#discussion_r1507886826


##########
cpp/src/arrow/record_batch.cc:
##########
@@ -247,6 +248,99 @@ Result<std::shared_ptr<StructArray>> 
RecordBatch::ToStructArray() const {
                                        /*offset=*/0);
 }
 
+template <typename DataType>
+inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) {
+  using CType = typename arrow::TypeTraits<DataType>::CType;
+  auto* out_values = reinterpret_cast<CType*>(out);
+
+  // Loop through all of the columns
+  for (int i = 0; i < batch.num_columns(); ++i) {
+    const auto& arr = *batch.column(i);
+    auto data = arr.data();
+    const auto& in_values = data->GetValues<CType>(1);

Review Comment:
   ```suggestion
       const auto* in_values = batch.column(i)->data()->GetValues<CType>(1);
   ```



##########
cpp/src/arrow/record_batch_test.cc:
##########
@@ -592,4 +593,232 @@ TEST_F(TestRecordBatch, ConcatenateRecordBatches) {
   ASSERT_BATCHES_EQUAL(*batch, *null_batch);
 }
 
+TEST_F(TestRecordBatch, ToTensorUnsupported) {
+  const int length = 9;
+
+  // Mixed data type
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", int64());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]");
+  auto a1 = ArrayFromJSON(int64(), "[10, 20, 30, 40, 50, 60, 70, 80, 90]");
+
+  auto batch = RecordBatch::Make(schema, length, {a0, a1});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      TypeError, "Type error: Can only convert a RecordBatch with uniform data 
type.",
+      batch->ToTensor());
+
+  // Unsupported data type
+  auto f2 = field("f2", utf8());
+
+  std::vector<std::shared_ptr<Field>> fields_1 = {f2};
+  auto schema_2 = ::arrow::schema(fields_1);
+
+  auto a2 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "a", "b", "c", "a", "b", 
"c"])");
+  auto batch_2 = RecordBatch::Make(schema_2, length, {a2});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      TypeError, "Type error: DataType is not supported: " + 
a2->type()->ToString(),
+      batch_2->ToTensor());
+}
+
+TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) {
+  const int length = 9;
+
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", int32());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]");
+  auto a1 = ArrayFromJSON(int32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]");
+
+  auto batch = RecordBatch::Make(schema, length, {a0, a1});
+
+  ASSERT_RAISES_WITH_MESSAGE(TypeError,
+                             "Type error: Can only convert a RecordBatch with 
no nulls.",
+                             batch->ToTensor());
+}
+
+TEST_F(TestRecordBatch, ToTensorEmptyBatch) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", int32());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<RecordBatch> empty,
+                       RecordBatch::MakeEmpty(schema));
+
+  ASSERT_OK_AND_ASSIGN(auto tensor, empty->ToTensor());
+  ASSERT_OK(tensor->Validate());
+
+  const std::vector<int64_t> strides = {4, 4};
+  const std::vector<int64_t> shape = {0, 2};
+
+  EXPECT_EQ(strides, tensor->strides());
+  EXPECT_EQ(shape, tensor->shape());
+
+  auto batch_no_columns =
+      RecordBatch::Make(::arrow::schema({}), 10, 
std::vector<std::shared_ptr<Array>>{});
+
+  ASSERT_RAISES_WITH_MESSAGE(TypeError,
+                             "Type error: Conversion to Tensor for 
RecordBatches without "
+                             "columns/schema is not supported.",
+                             batch_no_columns->ToTensor());
+}
+
+template <typename DataType>
+void CheckTensor(const std::shared_ptr<Tensor>& tensor, const int size,
+                 const std::vector<int64_t> shape, const std::vector<int64_t> 
f_strides) {
+  EXPECT_EQ(size, tensor->size());
+  EXPECT_EQ(TypeTraits<DataType>::type_singleton(), tensor->type());
+  EXPECT_EQ(shape, tensor->shape());
+  EXPECT_EQ(f_strides, tensor->strides());
+  EXPECT_FALSE(tensor->is_row_major());
+  EXPECT_TRUE(tensor->is_column_major());
+  EXPECT_TRUE(tensor->is_contiguous());
+}
+
+TEST_F(TestRecordBatch, ToTensorSupportedNaN) {
+  const int length = 9;
+
+  auto f0 = field("f0", float32());
+  auto f1 = field("f1", float32());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ArrayFromJSON(float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9]");
+  auto a1 = ArrayFromJSON(float32(), "[10, 20, 30, 40, NaN, 60, 70, 80, 90]");
+
+  auto batch = RecordBatch::Make(schema, length, {a0, a1});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor());
+  ASSERT_OK(tensor->Validate());
+
+  std::vector<int64_t> shape = {9, 2};
+  const int64_t f32_size = sizeof(float);
+  std::vector<int64_t> f_strides = {f32_size, f32_size * shape[0]};
+  std::vector<float> f_values = {
+      static_cast<float>(NAN), 2,  3,  4,  5, 6, 7, 8, 9, 10, 20, 30, 40,
+      static_cast<float>(NAN), 60, 70, 80, 90};
+  auto data = Buffer::Wrap(f_values);
+
+  std::shared_ptr<Tensor> tensor_expected;
+  ASSERT_OK_AND_ASSIGN(tensor_expected, Tensor::Make(float32(), data, shape, 
f_strides));

Review Comment:
   Would you mind writing or filing an issue for TensorFromJSON()? I think it 
could make these tests easier to read and write



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to