jorisvandenbossche commented on code in PR #14697:
URL: https://github.com/apache/arrow/pull/14697#discussion_r1041161242
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -460,17 +469,21 @@ struct StructFieldFunctor {
static Status CheckIndex(int index, const DataType& type) {
if (!ValidParentType(type)) {
return Status::TypeError("struct_field: cannot subscript field of type
", type);
- } else if (index < 0 || index >= type.num_fields()) {
+ } else if (!IsBaseListType(type) && (index < 0 || index >=
type.num_fields())) {
Review Comment:
In theory, we could check the index for fixed sized list arrays as well (not
for variable size list arrays, though, so not sure that is worth it)
##########
cpp/src/arrow/type.cc:
##########
@@ -1060,19 +1060,40 @@ struct FieldPathGetImpl {
}
template <typename T, typename GetChildren>
- static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
- GetChildren&& get_children, int* out_of_range_depth) {
+ static Result<T> Get(const FieldPath* path, const DataType* parent,
+ const std::vector<T>* children, GetChildren&&
get_children,
+ int* out_of_range_depth) {
if (path->indices().empty()) {
return Status::Invalid("empty indices cannot be traversed");
}
int depth = 0;
- const T* out;
+ const T* out = nullptr;
for (int index : path->indices()) {
if (children == nullptr) {
return Status::NotImplemented("Get child data of non-struct array");
}
+ if constexpr (std::is_same_v<T, std::shared_ptr<arrow::Field>>) {
+ auto IsBaseListType = [](const DataType& type) {
+ return dynamic_cast<const BaseListType*>(&type) != nullptr;
+ };
+
+ // For lists, we don't care about the index, jump right into the list
value type.
+ // The index here is used in the kernel to grab the specific list
element.
+ // Children then are fields from list type, and out will be the
children from
+ // that field, thus index 0.
+ if (out != nullptr && IsBaseListType(*(*out)->type())) {
+ children = get_children(*out);
Review Comment:
I don't fully understand why we need the `get_children` here (because then
we call `get_children` twice in a single iteration? since there is another call
a bit below)
Can you add a clarifying comment about that?
##########
cpp/src/arrow/type_test.cc:
##########
@@ -434,6 +434,64 @@ TEST(TestFieldRef, DotPathRoundTrip) {
check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef()));
}
+TEST(TestFieldRef, NestedWithList) {
+ // bit of indirection to satisfy list / large_list variants
+ auto gen_type = [](std::shared_ptr<DataType> (*f)(const
std::shared_ptr<DataType>&)) {
+ return f;
+ };
+
+ for (auto list_type : {gen_type(list), gen_type(large_list)}) {
+ // Single list type
+ auto type = struct_(
+ {field("c", list_type(struct_({field("d", int32()), field("e",
int8())})))});
+
+ // Note numeric values here go unused, outside of indicating a further
step, during
+ // FindAll, only used by kernels for which list element to take. When
FindAll
+ // encounters a list, it assumes the list value type. Not b/c they are
numeric, but
+ // they are the position referencing a specific list element.
+ EXPECT_THAT(FieldRef("c", 0, "d").FindAll(*type), ElementsAre(FieldPath{0,
0, 0}));
Review Comment:
Can you also add variants of those tests that pass a schema instead of type
to `FindAll`?
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -460,17 +469,21 @@ struct StructFieldFunctor {
static Status CheckIndex(int index, const DataType& type) {
if (!ValidParentType(type)) {
return Status::TypeError("struct_field: cannot subscript field of type
", type);
- } else if (index < 0 || index >= type.num_fields()) {
+ } else if (!IsBaseListType(type) && (index < 0 || index >=
type.num_fields())) {
return Status::Invalid("struct_field: out-of-bounds field reference to
field ",
index, " in type ", type, " with ",
type.num_fields(),
" fields");
}
return Status::OK();
}
+ static bool IsBaseListType(const DataType& type) {
+ return dynamic_cast<const BaseListType*>(&type) != nullptr;
+ }
Review Comment:
Also, if we keep this, using a combination of a few `type.id() == ..`
checks (like `ValidParentType` just below) might be more readable to know which
types are included)
##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -460,17 +469,21 @@ struct StructFieldFunctor {
static Status CheckIndex(int index, const DataType& type) {
if (!ValidParentType(type)) {
return Status::TypeError("struct_field: cannot subscript field of type
", type);
- } else if (index < 0 || index >= type.num_fields()) {
+ } else if (!IsBaseListType(type) && (index < 0 || index >=
type.num_fields())) {
return Status::Invalid("struct_field: out-of-bounds field reference to
field ",
index, " in type ", type, " with ",
type.num_fields(),
" fields");
}
return Status::OK();
}
+ static bool IsBaseListType(const DataType& type) {
+ return dynamic_cast<const BaseListType*>(&type) != nullptr;
+ }
Review Comment:
We have `is_list_like` helper defined (although that also includes Map type,
not sure if the list_element kernel supports that)
##########
cpp/src/arrow/type.cc:
##########
@@ -1060,19 +1060,40 @@ struct FieldPathGetImpl {
}
template <typename T, typename GetChildren>
- static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
- GetChildren&& get_children, int* out_of_range_depth) {
+ static Result<T> Get(const FieldPath* path, const DataType* parent,
+ const std::vector<T>* children, GetChildren&&
get_children,
+ int* out_of_range_depth) {
if (path->indices().empty()) {
return Status::Invalid("empty indices cannot be traversed");
}
int depth = 0;
- const T* out;
+ const T* out = nullptr;
for (int index : path->indices()) {
if (children == nullptr) {
return Status::NotImplemented("Get child data of non-struct array");
}
+ if constexpr (std::is_same_v<T, std::shared_ptr<arrow::Field>>) {
+ auto IsBaseListType = [](const DataType& type) {
+ return dynamic_cast<const BaseListType*>(&type) != nullptr;
+ };
+
+ // For lists, we don't care about the index, jump right into the list
value type.
+ // The index here is used in the kernel to grab the specific list
element.
+ // Children then are fields from list type, and out will be the
children from
+ // that field, thus index 0.
+ if (out != nullptr && IsBaseListType(*(*out)->type())) {
+ children = get_children(*out);
Review Comment:
(or to put it differently: why isn't this initial get_children needed when
parent is a struct type? What's the difference between both?)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]