AlenkaF commented on code in PR #41870:
URL: https://github.com/apache/arrow/pull/41870#discussion_r3165694570


##########
cpp/src/arrow/table_test.cc:
##########
@@ -520,6 +521,541 @@ TEST_F(TestTable, ConcatenateTables) {
   ASSERT_RAISES(Invalid, ConcatenateTables({t1, t3}));
 }
 
+TEST_F(TestTable, ToTensorUnsupportedType) {
+  auto f0 = field("f0", int32());
+  // Unsupported data type
+  auto f1 = field("f1", utf8());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a1 = ChunkedArrayFromJSON(
+      utf8(), {R"(["a", "b", "c", "a", "b"])", R"(["c", "a", "b", "c"])"});
+
+  auto table = Table::Make(schema, {a0, a1});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      TypeError, "Type error: DataType is not supported: " + a1->type()->ToString(),
+      table->ToTensor());
+
+  // Unsupported boolean data type
+  auto f2 = field("f2", boolean());
+
+  std::vector<std::shared_ptr<Field>> fields2 = {f0, f2};
+  auto schema2 = ::arrow::schema(fields2);
+  auto a2 = ChunkedArrayFromJSON(
+      boolean(), {"[true, false, true, true, false, true, false, true, true]"});
+  auto table2 = Table::Make(schema2, {a0, a2});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      TypeError, "Type error: DataType is not supported: " + a2->type()->ToString(),
+      table2->ToTensor());
+}
+
+TEST_F(TestTable, ToTensorUnsupportedMissing) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", int32());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a1 = ChunkedArrayFromJSON(int32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"});
+
+  auto table = Table::Make(schema, {a0, a1});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      TypeError,
+      "Type error: Can only convert a Table or RecordBatch with no "
+      "nulls. Set null_to_nan to true to convert nulls to NaN",
+      table->ToTensor());
+}
+
+TEST_F(TestTable, ToTensorEmptyTable) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", int32());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> empty, Table::MakeEmpty(schema));
+
+  ASSERT_OK_AND_ASSIGN(auto tensor_column,
+                       empty->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
+  ASSERT_OK(tensor_column->Validate());
+
+  ASSERT_OK_AND_ASSIGN(auto tensor_row, empty->ToTensor());
+  ASSERT_OK(tensor_row->Validate());
+
+  const std::vector<int64_t> strides = {4, 4};
+  const std::vector<int64_t> shape = {0, 2};
+
+  EXPECT_EQ(strides, tensor_column->strides());
+  EXPECT_EQ(shape, tensor_column->shape());
+  EXPECT_EQ(strides, tensor_row->strides());
+  EXPECT_EQ(shape, tensor_row->shape());
+
+  std::vector<std::shared_ptr<Array>> columns;
+  auto t2 = Table::Make(::arrow::schema({}), columns);
+  auto table_no_columns =
+      Table::Make(::arrow::schema({}), std::vector<std::shared_ptr<Array>>{});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      TypeError,
+      "Type error: Conversion to Tensor for Tables or RecordBatches "
+      "without columns/schema is not supported.",
+      table_no_columns->ToTensor());
+}
+
+template <typename DataType>
+void CheckTableToTensor(const std::shared_ptr<Tensor>& tensor, const int size,
+                        const std::vector<int64_t> shape,
+                        const std::vector<int64_t> f_strides) {
+  EXPECT_EQ(size, tensor->size());
+  EXPECT_EQ(TypeTraits<DataType>::type_singleton(), tensor->type());
+  EXPECT_EQ(shape, tensor->shape());
+  EXPECT_EQ(f_strides, tensor->strides());
+  EXPECT_FALSE(tensor->is_row_major());
+  EXPECT_TRUE(tensor->is_column_major());
+  EXPECT_TRUE(tensor->is_contiguous());
+}
+
+template <typename DataType>
+void CheckTableToTensorRowMajor(const std::shared_ptr<Tensor>& tensor, const int size,
+                                const std::vector<int64_t> shape,
+                                const std::vector<int64_t> strides) {
+  EXPECT_EQ(size, tensor->size());
+  EXPECT_EQ(TypeTraits<DataType>::type_singleton(), tensor->type());
+  EXPECT_EQ(shape, tensor->shape());
+  EXPECT_EQ(strides, tensor->strides());
+  EXPECT_TRUE(tensor->is_row_major());
+  EXPECT_FALSE(tensor->is_column_major());
+  EXPECT_TRUE(tensor->is_contiguous());
+}
+
+TEST_F(TestTable, ToTensorSupportedNaN) {
+  auto f0 = field("f0", float32());
+  auto f1 = field("f1", float32());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ChunkedArrayFromJSON(float32(), {"[NaN, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a1 =
+      ChunkedArrayFromJSON(float32(), {"[10, 20]", "[30, 40, NaN, 60, 70, 80, 90]"});
+
+  auto table = Table::Make(schema, {a0, a1});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
+  ASSERT_OK(tensor->Validate());
+
+  std::vector<int64_t> shape = {9, 2};
+  const int64_t f32_size = sizeof(float);
+  std::vector<int64_t> f_strides = {f32_size, f32_size * shape[0]};
+  std::shared_ptr<Tensor> tensor_expected = TensorFromJSON(
+      float32(), "[NaN, 2,  3,  4,  5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]",
+      shape, f_strides);
+
+  EXPECT_FALSE(tensor_expected->Equals(*tensor));
+  EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true)));
+  CheckTableToTensor<FloatType>(tensor, 18, shape, f_strides);
+}
+
+TEST_F(TestTable, ToTensorSupportedNullToNan) {
+  // int32 + float32 = float64
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", float32());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ChunkedArrayFromJSON(int32(), {"[null, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a1 =
+      ChunkedArrayFromJSON(float32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"});
+
+  auto table = Table::Make(schema, {a0, a1});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       table->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
+  ASSERT_OK(tensor->Validate());
+
+  std::vector<int64_t> shape = {9, 2};
+  const int64_t f64_size = sizeof(double);
+  std::vector<int64_t> f_strides = {f64_size, f64_size * shape[0]};
+  std::shared_ptr<Tensor> tensor_expected = TensorFromJSON(
+      float64(), "[NaN, 2,  3,  4,  5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]",
+      shape, f_strides);
+
+  EXPECT_FALSE(tensor_expected->Equals(*tensor));
+  EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true)));
+
+  CheckTableToTensor<DoubleType>(tensor, 18, shape, f_strides);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor_row, table->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor_row->Validate());
+
+  std::vector<int64_t> strides = {f64_size * shape[1], f64_size};
+  std::shared_ptr<Tensor> tensor_expected_row = TensorFromJSON(
+      float64(), "[NaN, 10, 2,  20, 3, 30,  4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]",
+      shape, strides);
+
+  EXPECT_FALSE(tensor_expected_row->Equals(*tensor_row));
+  EXPECT_TRUE(tensor_expected_row->Equals(*tensor_row, EqualOptions().nans_equal(true)));
+
+  CheckTableToTensorRowMajor<DoubleType>(tensor_row, 18, shape, strides);
+
+  // int32 -> float64
+  auto f2 = field("f2", int32());
+
+  std::vector<std::shared_ptr<Field>> fields1 = {f0, f2};
+  auto schema1 = ::arrow::schema(fields1);
+
+  auto a2 = ChunkedArrayFromJSON(int32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"});
+  auto table1 = Table::Make(schema1, {a0, a2});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor1,
+                       table1->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
+  ASSERT_OK(tensor1->Validate());
+
+  EXPECT_FALSE(tensor_expected->Equals(*tensor1));
+  EXPECT_TRUE(tensor_expected->Equals(*tensor1, EqualOptions().nans_equal(true)));
+
+  CheckTableToTensor<DoubleType>(tensor1, 18, shape, f_strides);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor1_row, table1->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor1_row->Validate());
+
+  EXPECT_FALSE(tensor_expected_row->Equals(*tensor1_row));
+  EXPECT_TRUE(tensor_expected_row->Equals(*tensor1_row, EqualOptions().nans_equal(true)));
+
+  CheckTableToTensorRowMajor<DoubleType>(tensor1_row, 18, shape, strides);
+
+  // int8 -> float32
+  auto f3 = field("f3", int8());
+  auto f4 = field("f4", int8());
+
+  std::vector<std::shared_ptr<Field>> fields2 = {f3, f4};
+  auto schema2 = ::arrow::schema(fields2);
+
+  auto a3 = ChunkedArrayFromJSON(int8(), {"[null, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a4 = ChunkedArrayFromJSON(int8(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"});
+  auto table2 = Table::Make(schema2, {a3, a4});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor2,
+                       table2->ToTensor(/*null_to_nan=*/true, /*row_major=*/false));
+  ASSERT_OK(tensor2->Validate());
+
+  const int64_t f32_size = sizeof(float);
+  std::vector<int64_t> f_strides_2 = {f32_size, f32_size * shape[0]};
+  std::shared_ptr<Tensor> tensor_expected_2 = TensorFromJSON(
+      float32(), "[NaN, 2,  3,  4,  5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]",
+      shape, f_strides_2);
+
+  EXPECT_FALSE(tensor_expected_2->Equals(*tensor2));
+  EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true)));
+
+  CheckTableToTensor<FloatType>(tensor2, 18, shape, f_strides_2);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor2_row, table2->ToTensor(/*null_to_nan=*/true));
+  ASSERT_OK(tensor2_row->Validate());
+
+  std::vector<int64_t> strides_2 = {f32_size * shape[1], f32_size};
+  std::shared_ptr<Tensor> tensor2_expected_row = TensorFromJSON(
+      float32(), "[NaN, 10, 2,  20, 3, 30,  4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]",
+      shape, strides_2);
+
+  EXPECT_FALSE(tensor2_expected_row->Equals(*tensor2_row));
+  EXPECT_TRUE(
+      tensor2_expected_row->Equals(*tensor2_row, EqualOptions().nans_equal(true)));
+
+  CheckTableToTensorRowMajor<FloatType>(tensor2_row, 18, shape, strides_2);
+}
+
+TEST_F(TestTable, ToTensorSupportedTypesMixed) {
+  auto f0 = field("f0", uint16());
+  auto f1 = field("f1", int16());
+  auto f2 = field("f2", float32());
+
+  auto a0 = ChunkedArrayFromJSON(uint16(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a1 = ChunkedArrayFromJSON(int16(), {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"});
+  auto a2 = ChunkedArrayFromJSON(float32(),
+                                 {"[100, 200, 300, NaN, 500, 600]", "[700, 800, 900]"});
+
+  // Single column
+  std::vector<std::shared_ptr<Field>> fields = {f0};
+  auto schema = ::arrow::schema(fields);
+  auto table = Table::Make(schema, {a0});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
+  ASSERT_OK(tensor->Validate());
+
+  std::vector<int64_t> shape = {9, 1};
+  const int64_t uint16_size = sizeof(uint16_t);
+  std::vector<int64_t> f_strides = {uint16_size, uint16_size * shape[0]};
+  std::shared_ptr<Tensor> tensor_expected =
+      TensorFromJSON(uint16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]", shape, f_strides);
+
+  EXPECT_TRUE(tensor_expected->Equals(*tensor));
+  CheckTableToTensor<UInt16Type>(tensor, 9, shape, f_strides);
+
+  // uint16 + int16 = int32
+  std::vector<std::shared_ptr<Field>> fields1 = {f0, f1};
+  auto schema1 = ::arrow::schema(fields1);
+  auto table1 = Table::Make(schema1, {a0, a1});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor1,
+                       table1->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
+  ASSERT_OK(tensor1->Validate());
+
+  std::vector<int64_t> shape1 = {9, 2};
+  const int64_t int32_size = sizeof(int32_t);
+  std::vector<int64_t> f_strides_1 = {int32_size, int32_size * shape1[0]};
+  std::shared_ptr<Tensor> tensor_expected_1 = TensorFromJSON(
+      int32(), "[1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
+      shape1, f_strides_1);
+
+  EXPECT_TRUE(tensor_expected_1->Equals(*tensor1));
+
+  CheckTableToTensor<Int32Type>(tensor1, 18, shape1, f_strides_1);
+
+  ASSERT_EQ(tensor1->type()->bit_width(), tensor_expected_1->type()->bit_width());
+
+  ASSERT_EQ(1, tensor_expected_1->Value<Int32Type>({0, 0}));
+  ASSERT_EQ(2, tensor_expected_1->Value<Int32Type>({1, 0}));
+  ASSERT_EQ(10, tensor_expected_1->Value<Int32Type>({0, 1}));
+
+  // uint16 + int16 + float32 = float64
+  std::vector<std::shared_ptr<Field>> fields2 = {f0, f1, f2};
+  auto schema2 = ::arrow::schema(fields2);
+  auto table2 = Table::Make(schema2, {a0, a1, a2});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor2,
+                       table2->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
+  ASSERT_OK(tensor2->Validate());
+
+  std::vector<int64_t> shape2 = {9, 3};
+  const int64_t f64_size = sizeof(double);
+  std::vector<int64_t> f_strides_2 = {f64_size, f64_size * shape2[0]};
+  std::shared_ptr<Tensor> tensor_expected_2 =
+      TensorFromJSON(float64(),
+                     "[1,   2,   3,   4,   5,  6,  7,  8,   9,   10,  20, 30,  40,  50,"
+                     "60,  70, 80, 90, 100, 200, 300, NaN, 500, 600, 700, 800, 900]",
+                     shape2, f_strides_2);
+
+  EXPECT_FALSE(tensor_expected_2->Equals(*tensor2));
+  EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true)));
+
+  CheckTableToTensor<DoubleType>(tensor2, 27, shape2, f_strides_2);
+}
+
+TEST_F(TestTable, ToTensorUnsupportedMixedFloat16) {
+  auto f0 = field("f0", float16());
+  auto f1 = field("f1", float64());
+
+  auto a0 = ChunkedArrayFromJSON(float16(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a1 = ChunkedArrayFromJSON(float64(), {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"});
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1};
+  auto schema = ::arrow::schema(fields);
+  auto table = Table::Make(schema, {a0, a1});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.",
+      table->ToTensor());
+
+  std::vector<std::shared_ptr<Field>> fields1 = {f1, f0};
+  auto schema1 = ::arrow::schema(fields1);
+  auto table1 = Table::Make(schema1, {a1, a0});
+
+  ASSERT_RAISES_WITH_MESSAGE(
+      NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.",
+      table1->ToTensor());
+}
+
+template <typename DataType>
+class TestTableToTensorColumnMajor : public ::testing::Test {};
+
+TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor);
+
+TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) {
+  using DataType = TypeParam;
+  using c_data_type = typename DataType::c_type;
+  const int unit_size = sizeof(c_data_type);
+
+  auto f0 = field("f0", TypeTraits<DataType>::type_singleton());
+  auto f1 = field("f1", TypeTraits<DataType>::type_singleton());
+  auto f2 = field("f2", TypeTraits<DataType>::type_singleton());
+
+  std::vector<std::shared_ptr<Field>> fields = {f0, f1, f2};
+  auto schema = ::arrow::schema(fields);
+
+  auto a0 = ChunkedArrayFromJSON(TypeTraits<DataType>::type_singleton(),
+                                 {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"});
+  auto a1 = ChunkedArrayFromJSON(TypeTraits<DataType>::type_singleton(),
+                                 {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"});
+  auto a2 = ChunkedArrayFromJSON(TypeTraits<DataType>::type_singleton(),
+                                 {"[100, 100, 100, 100, 100, 100]", "[100, 100, 100]"});
+
+  auto table = Table::Make(schema, {a0, a1, a2});
+
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false));
+  ASSERT_OK(tensor->Validate());
+
+  std::vector<int64_t> shape = {9, 3};
+  std::vector<int64_t> f_strides = {unit_size, unit_size * shape[0]};
+  std::shared_ptr<Tensor> tensor_expected = TensorFromJSON(
+      TypeTraits<DataType>::type_singleton(),
+      "[1,   2,   3,   4,   5,   6,   7,   8,   9, 10,  20,  30,  40,  50,  60,  70,  "
+      "80,  90, 100, 100, 100, 100, 100, 100, 100, 100, 100]",
+      shape, f_strides);
+
+  EXPECT_TRUE(tensor_expected->Equals(*tensor));
+  CheckTableToTensor<DataType>(tensor, 27, shape, f_strides);
+
+  // Test offsets
+  auto table_slice = table->Slice(1);
+
+  ASSERT_OK_AND_ASSIGN(auto tensor_sliced, table_slice->ToTensor(/*null_to_nan=*/false,
+                                                                 /*row_major=*/false));
+  ASSERT_OK(tensor_sliced->Validate());
+
+  std::vector<int64_t> shape_sliced = {8, 3};
+  std::vector<int64_t> f_strides_sliced = {unit_size, unit_size * shape_sliced[0]};
+  std::shared_ptr<Tensor> tensor_expected_sliced =
+      TensorFromJSON(TypeTraits<DataType>::type_singleton(),
+                     "[2,   3,   4,   5,   6,   7,   8,   9, 20,  30,  40,  50,  60,  "
+                     "70,  80,  90, 100, 100, 100, 100, 100, 100, 100, 100]",
+                     shape_sliced, f_strides_sliced);
+
+  EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced));
+  CheckTableToTensor<DataType>(tensor_expected_sliced, 24, shape_sliced,
+                               f_strides_sliced);

Review Comment:
   Thank you so much for the kind words and review @tadeja! Yes, the suggestion 
makes perfect sense, will commit.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to