benibus commented on code in PR #34537:
URL: https://github.com/apache/arrow/pull/34537#discussion_r1149609861
##########
cpp/src/arrow/type_test.cc:
##########
@@ -379,6 +382,168 @@ TEST(TestFieldPath, Basics) {
FieldPath({s.num_fields() * 2}).Get(s));
}
+TEST(TestFieldPath, GetForTable) {
+ using testing::HasSubstr;
+
+ const int length = 4;
+ auto f0 = field("a", int32());
+ auto f1 = field("b", int32());
+ auto f2 = field("c", struct_({f1}));
+ auto f3 = field("d", struct_({f0, f2}));
+ auto table_schema = schema({f0, f1, f2, f3});
+
+ // Each column has a different chunking
+ ChunkedArrayVector columns(4);
+ columns[0] = ChunkedArrayFromJSON(f0->type(), {"[0,1,2,3]"});
+ columns[1] = ChunkedArrayFromJSON(f1->type(), {"[3,2,1]", "[0]"});
+ columns[2] =
+ ChunkedArrayFromJSON(f2->type(), {R"([{"b":3},{"b":2}])",
R"([{"b":1},{"b":0}])"});
+ columns[3] = ChunkedArrayFromJSON(
+ f3->type(), {R"([{"a":0,"c":{"b":3}},{"a":1,"c":{"b":2}}])",
+ R"([{"a":2,"c":{"b":1}}])", R"([{"a":3,"c":{"b":0}}])"});
+ auto table = Table::Make(table_schema, columns, length);
+ ASSERT_OK(table->ValidateFull());
+
+ ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*table));
+ ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*table));
+ ASSERT_OK_AND_ASSIGN(auto v2, FieldPath({2}).Get(*table));
+ ASSERT_OK_AND_ASSIGN(auto v2_0, FieldPath({2, 0}).Get(*table));
+ ASSERT_OK_AND_ASSIGN(auto v3, FieldPath({3}).Get(*table));
+ ASSERT_OK_AND_ASSIGN(auto v3_0, FieldPath({3, 0}).Get(*table));
+ ASSERT_OK_AND_ASSIGN(auto v3_1, FieldPath({3, 1}).Get(*table));
+ ASSERT_OK_AND_ASSIGN(auto v3_1_0, FieldPath({3, 1, 0}).Get(*table));
+
+ EXPECT_EQ(v0->num_chunks(), columns[0]->num_chunks());
+ EXPECT_EQ(v1->num_chunks(), columns[1]->num_chunks());
+ EXPECT_EQ(v2->num_chunks(), columns[2]->num_chunks());
+ EXPECT_EQ(v2_0->num_chunks(), columns[2]->num_chunks());
+ EXPECT_EQ(v3->num_chunks(), columns[3]->num_chunks());
+ EXPECT_EQ(v3_0->num_chunks(), columns[3]->num_chunks());
+ EXPECT_EQ(v3_1->num_chunks(), columns[3]->num_chunks());
+ EXPECT_EQ(v3_1_0->num_chunks(), columns[3]->num_chunks());
+
+ EXPECT_TRUE(columns[0]->Equals(v0));
+ EXPECT_TRUE(columns[0]->Equals(v3_0));
+
+ EXPECT_TRUE(columns[1]->Equals(v1));
+ EXPECT_TRUE(columns[1]->Equals(v2_0));
+ EXPECT_TRUE(columns[1]->Equals(v3_1_0));
+
+ EXPECT_TRUE(columns[2]->Equals(v2));
+ EXPECT_TRUE(columns[2]->Equals(v3_1));
+
+ EXPECT_TRUE(columns[3]->Equals(v3));
+
+ for (const auto& path :
+ {FieldPath({4, 1, 0}), FieldPath({3, 2, 0}), FieldPath{3, 1, 1}}) {
+ EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, HasSubstr("index out of
range"),
+ path.Get(*table));
+ }
+ EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("empty indices cannot be
traversed"),
+ FieldPath().Get(*table));
+}
+
+TEST(TestFieldPath, GetForChunkedArray) {
+ using testing::HasSubstr;
+
+ auto f0 = field("a", int32());
+ auto f1 = field("b", int32());
+ auto f2 = field("c", struct_({f1}));
+ auto f3 = field("d", struct_({f0, f2}));
+ auto type = struct_({f0, f1, f3});
+
+ auto column0 = ChunkedArrayFromJSON(f0->type(), {"[0,1,2,3]"});
+ auto column1 = ChunkedArrayFromJSON(f1->type(), {"[3,2,1,0]"});
+ auto column2_1 =
+ ChunkedArrayFromJSON(f2->type(),
{R"([{"b":3},{"b":2},{"b":1},{"b":0}])"});
+ auto chunked_array = ChunkedArrayFromJSON(
+ type,
+ {
+ R"([{"a":0,"b":3,"d":{"a":0,"c":{"b":3}}}])",
+
R"([{"a":1,"b":2,"d":{"a":1,"c":{"b":2}}},{"a":2,"b":1,"d":{"a":2,"c":{"b":1}}}])",
+ R"([{"a":3,"b":0,"d":{"a":3,"c":{"b":0}}}])",
+ });
+ ASSERT_OK(chunked_array->ValidateFull());
+
+ ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*chunked_array));
+ ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*chunked_array));
+ ASSERT_OK_AND_ASSIGN(auto v2_0, FieldPath({2, 0}).Get(*chunked_array));
+ ASSERT_OK_AND_ASSIGN(auto v2_1, FieldPath({2, 1}).Get(*chunked_array));
+ ASSERT_OK_AND_ASSIGN(auto v2_1_0, FieldPath({2, 1, 0}).Get(*chunked_array));
+
+ for (const auto& v : {v0, v1, v2_0, v2_1, v2_1_0}) {
+ EXPECT_EQ(v->num_chunks(), chunked_array->num_chunks());
+ }
+
+ EXPECT_TRUE(column0->Equals(v0));
+ EXPECT_TRUE(column0->Equals(v2_0));
+
+ EXPECT_TRUE(column1->Equals(v1));
+ EXPECT_TRUE(column1->Equals(v2_1_0));
+ EXPECT_FALSE(column1->Equals(v2_1));
+
+ EXPECT_TRUE(column2_1->Equals(v2_1));
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
+ HasSubstr("Get child data of non-struct
chunked array"),
+ FieldPath({0}).Get(*column0));
+}
+
+TEST(TestFieldPath, GetForChunkedArrayWithNulls) {
+ auto int_field = field("i", int32());
+ auto int_chunked_array =
+ ChunkedArrayFromJSON(int_field->type(), {"[0,1]", "[2,null]", "[3,4]"});
+
+ ASSERT_OK_AND_ASSIGN(auto null_bitmap, AllocateEmptyBitmap(2));
+ ArrayVector struct_chunks;
+ for (const auto& int_chunk : int_chunked_array->chunks()) {
+ ASSERT_OK_AND_ASSIGN(auto chunk,
+ StructArray::Make({int_chunk}, {int_field},
null_bitmap, 2));
+ struct_chunks.push_back(chunk);
+ }
+
+ ASSERT_OK_AND_ASSIGN(auto struct_chunked_array,
ChunkedArray::Make(struct_chunks));
+ ASSERT_OK(struct_chunked_array->ValidateFull());
+ // The top-level null bitmap shouldn't affect the validity of the returned
child field.
+ ASSERT_OK_AND_ASSIGN(auto int_child,
FieldPath({0}).Get(*struct_chunked_array));
+ ASSERT_TRUE(int_chunked_array->Equals(int_child));
+}
+
+TEST(TestFieldPath, GetForRecordBatch) {
Review Comment:
I agree that adding more tests for the other `FieldPath::Get` overloads
would be a good idea - but maybe in a different PR. We'd probably want to
improve coverage across the board (i.e. including `RecordBatch`, `Array`,
`ArrayData`) in a way that takes the zero-copy requirements into account. Doing
this without duplicating the test logic 3-5 times would be ideal, but more
involved.
##########
cpp/src/arrow/type_test.cc:
##########
@@ -398,6 +563,71 @@ TEST(TestFieldRef, Basics) {
EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(FieldPath{1},
FieldPath{3}));
}
+TEST(TestFieldRef, FindAllForTable) {
Review Comment:
I don't think the extra `FieldRef` tests need to be included here (again,
maybe in a separate PR).
In any case, you wouldn't need to go through all this trouble to create the
`Table`/`RecordBatch` since `FieldRef::FindAll` doesn't actually care about the
values (eventually, all the overloads just redirect to the overload for
`FieldVector`). The `Find*` methods simply return `FieldPath`s which can _then_
be used to extract any data.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]