lidavidm commented on a change in pull request #12162:
URL: https://github.com/apache/arrow/pull/12162#discussion_r794532289
##########
File path: docs/source/python/api/compute.rst
##########
@@ -504,6 +505,7 @@ Compute Options
IndexOptions
JoinOptions
MakeStructOptions
+ MapArrayLookupOptions
Review comment:
Just a reminder to revisit and implement this.
To do so, you'll need to:
1. Add the options class definition to the Cython pxd:
https://github.com/apache/arrow/blob/39367db2dab321dbbf4d12d2229020614b049dde/python/pyarrow/includes/libarrow.pxd#L2147-L2150
2. Add a Python options class wrapper to the Cython code:
https://github.com/apache/arrow/blob/39367db2dab321dbbf4d12d2229020614b049dde/python/pyarrow/_compute.pyx#L1317-L1333
3. Add an import so it's in the public API:
https://github.com/apache/arrow/blob/39367db2dab321dbbf4d12d2229020614b049dde/python/pyarrow/compute.py#L18
4. Add the options class to the serialization test:
https://github.com/apache/arrow/blob/39367db2dab321dbbf4d12d2229020614b049dde/python/pyarrow/tests/test_compute.py#L138
(just add it to the list there, no need to add additional assertions to the
test)
5. Add a brief test to make sure everything works (no need to add detailed
tests, we just want to make sure the bindings were set up):
https://github.com/apache/arrow/blob/39367db2dab321dbbf4d12d2229020614b049dde/python/pyarrow/tests/test_compute.py#L2081-L2092
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -428,6 +428,271 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
"specified through MakeStructOptions."),
{"*args"},
"MakeStructOptions"};
+template <typename KeyType>
+struct MapArrayLookupFunctor {
+ static Result<int64_t> GetOneMatchingIndex(const Array& keys,
+ const Scalar& query_key_scalar,
+ const bool* from_back) {
+ int64_t match_index = -1;
+ RETURN_NOT_OK(
+ FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) ->
Status {
+ match_index = index;
+ if (*from_back) {
+ return Status::OK();
+ } else {
+ return Status::Cancelled("Found key match for FIRST");
+ }
+ }));
+
+ return match_index;
+ }
+
+ static Status BuildItemsArray(const Array& keys, const Array& items,
+ const Scalar& query_key_scalar,
+ bool* found_at_least_one_key, ArrayBuilder*
builder) {
+ RETURN_NOT_OK(
+ FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) ->
Status {
+ *found_at_least_one_key = true;
+ RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1));
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ template <typename FoundItem>
+ static Status FindMatchingIndices(const Array& keys, const Scalar&
query_key_scalar,
+ FoundItem callback) {
+ const auto query_key = UnboxScalar<KeyType>::Unbox(query_key_scalar);
+ int64_t index = 0;
+ ARROW_UNUSED(VisitArrayValuesInline<KeyType>(
Review comment:
This can't be ARROW_UNUSED anymore since AppendArraySlice above may
return an actual error. Instead we should check if the status is Cancelled:
```
auto status = Visit...;
if (!status.ok() && !status.IsCancelled()) {
return status;
}
return Status::OK();
```
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -428,6 +428,271 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
"specified through MakeStructOptions."),
{"*args"},
"MakeStructOptions"};
+template <typename KeyType>
+struct MapArrayLookupFunctor {
+ static Result<int64_t> GetOneMatchingIndex(const Array& keys,
+ const Scalar& query_key_scalar,
+ const bool* from_back) {
+ int64_t match_index = -1;
+ RETURN_NOT_OK(
+ FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) ->
Status {
+ match_index = index;
+ if (*from_back) {
+ return Status::OK();
+ } else {
+ return Status::Cancelled("Found key match for FIRST");
+ }
+ }));
+
+ return match_index;
+ }
+
+ static Status BuildItemsArray(const Array& keys, const Array& items,
+ const Scalar& query_key_scalar,
+ bool* found_at_least_one_key, ArrayBuilder*
builder) {
+ RETURN_NOT_OK(
+ FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) ->
Status {
+ *found_at_least_one_key = true;
+ RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1));
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ template <typename FoundItem>
+ static Status FindMatchingIndices(const Array& keys, const Scalar&
query_key_scalar,
+ FoundItem callback) {
+ const auto query_key = UnboxScalar<KeyType>::Unbox(query_key_scalar);
+ int64_t index = 0;
+ ARROW_UNUSED(VisitArrayValuesInline<KeyType>(
+ *keys.data(),
+ [&](decltype(query_key) key) -> Status {
+ if (key == query_key) {
+ return callback(index++);
+ }
+ ++index;
+ return Status::OK();
+ },
+ [&]() -> Status {
+ ++index;
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch,
Datum* out) {
+ const auto& options = OptionsWrapper<MapArrayLookupOptions>::Get(ctx);
+ const auto& query_key = options.query_key;
+ const auto& occurrence = options.occurrence;
+ const MapArray map_array(batch[0].array());
+
+ std::unique_ptr<ArrayBuilder> builder;
+ if (occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
+ list(map_array.map_type()->item_type()),
&builder));
+
+ for (int64_t map_array_idx = 0; map_array_idx < map_array.length();
+ ++map_array_idx) {
+ if (!map_array.IsValid(map_array_idx)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ }
+
+ auto map = map_array.value_slice(map_array_idx);
+ auto keys = checked_cast<const StructArray&>(*map).field(0);
+ auto items = checked_cast<const StructArray&>(*map).field(1);
+ bool found_at_least_one_key = false;
+ std::unique_ptr<ArrayBuilder> list_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
map_array.map_type()->item_type(),
+ &list_builder));
+
+ RETURN_NOT_OK(BuildItemsArray(*keys, *items, *query_key,
&found_at_least_one_key,
+ list_builder.get()));
+ if (!found_at_least_one_key) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish());
+ RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result)));
+ }
+ list_builder->Reset();
+ }
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ out->value = result->data();
+ } else {
+ RETURN_NOT_OK(
+ MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(),
&builder));
+
Review comment:
Here we should `builder->Reserve(batch.length)`
##########
File path: docs/source/cpp/compute.rst
##########
@@ -1639,17 +1639,19 @@ in the respective option classes.
Structural transforms
~~~~~~~~~~~~~~~~~~~~~
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| Function name | Arity | Input types |
Output type | Options class | Notes |
-+=====================+============+=====================================+==================+==============================+========+
-| list_element | Binary | List-like (Arg 0), Integral (Arg 1) |
List value type | | \(1) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| list_flatten | Unary | List-like |
List value type | | \(2) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| list_parent_indices | Unary | List-like |
Int64 | | \(3) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| struct_field | Unary | Struct or Union |
Computed | :struct:`StructFieldOptions` | \(4) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| Function name | Arity | Input types |
Output type | Options class | Notes |
++=====================+============+=====================================+==================+=================================+========+
+| list_element | Binary | List-like (Arg 0), Integral (Arg 1) |
List value type | | \(1) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| list_flatten | Unary | List-like |
List value type | | \(2) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| list_parent_indices | Unary | List-like |
Int64 | | \(3) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| struct_field | Unary | Struct or Union |
Computed | :struct:`StructFieldOptions` | \(4) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| map_array_lookup | Unary | MapArray |
Computed | :struct:`MapArrayLookupOptions` | \(5) |
Review comment:
Please keep these tables alphabetically sorted (sorry, you may have to
renumber some items)
##########
File path: docs/source/cpp/compute.rst
##########
@@ -1639,17 +1639,19 @@ in the respective option classes.
Structural transforms
~~~~~~~~~~~~~~~~~~~~~
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| Function name | Arity | Input types |
Output type | Options class | Notes |
-+=====================+============+=====================================+==================+==============================+========+
-| list_element | Binary | List-like (Arg 0), Integral (Arg 1) |
List value type | | \(1) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| list_flatten | Unary | List-like |
List value type | | \(2) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| list_parent_indices | Unary | List-like |
Int64 | | \(3) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
-| struct_field | Unary | Struct or Union |
Computed | :struct:`StructFieldOptions` | \(4) |
-+---------------------+------------+-------------------------------------+------------------+------------------------------+--------+
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| Function name | Arity | Input types |
Output type | Options class | Notes |
++=====================+============+=====================================+==================+=================================+========+
+| list_element | Binary | List-like (Arg 0), Integral (Arg 1) |
List value type | | \(1) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| list_flatten | Unary | List-like |
List value type | | \(2) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| list_parent_indices | Unary | List-like |
Int64 | | \(3) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| struct_field | Unary | Struct or Union |
Computed | :struct:`StructFieldOptions` | \(4) |
++---------------------+------------+-------------------------------------+------------------+---------------------------------+--------+
+| map_array_lookup | Unary | MapArray |
Computed | :struct:`MapArrayLookupOptions` | \(5) |
Review comment:
```suggestion
| map_array_lookup | Unary | Map |
Computed | :struct:`MapArrayLookupOptions` | \(5) |
```
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -428,6 +428,271 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
"specified through MakeStructOptions."),
{"*args"},
"MakeStructOptions"};
+template <typename KeyType>
+struct MapArrayLookupFunctor {
+ static Result<int64_t> GetOneMatchingIndex(const Array& keys,
+ const Scalar& query_key_scalar,
+ const bool* from_back) {
+ int64_t match_index = -1;
+ RETURN_NOT_OK(
+ FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) ->
Status {
+ match_index = index;
+ if (*from_back) {
+ return Status::OK();
+ } else {
+ return Status::Cancelled("Found key match for FIRST");
+ }
+ }));
+
+ return match_index;
+ }
+
+ static Status BuildItemsArray(const Array& keys, const Array& items,
+ const Scalar& query_key_scalar,
+ bool* found_at_least_one_key, ArrayBuilder*
builder) {
+ RETURN_NOT_OK(
+ FindMatchingIndices(keys, query_key_scalar, [&](int64_t index) ->
Status {
+ *found_at_least_one_key = true;
+ RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), index, 1));
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ template <typename FoundItem>
+ static Status FindMatchingIndices(const Array& keys, const Scalar&
query_key_scalar,
+ FoundItem callback) {
+ const auto query_key = UnboxScalar<KeyType>::Unbox(query_key_scalar);
+ int64_t index = 0;
+ ARROW_UNUSED(VisitArrayValuesInline<KeyType>(
+ *keys.data(),
+ [&](decltype(query_key) key) -> Status {
+ if (key == query_key) {
+ return callback(index++);
+ }
+ ++index;
+ return Status::OK();
+ },
+ [&]() -> Status {
+ ++index;
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch,
Datum* out) {
+ const auto& options = OptionsWrapper<MapArrayLookupOptions>::Get(ctx);
+ const auto& query_key = options.query_key;
+ const auto& occurrence = options.occurrence;
+ const MapArray map_array(batch[0].array());
+
+ std::unique_ptr<ArrayBuilder> builder;
+ if (occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
+ list(map_array.map_type()->item_type()),
&builder));
+
+ for (int64_t map_array_idx = 0; map_array_idx < map_array.length();
+ ++map_array_idx) {
+ if (!map_array.IsValid(map_array_idx)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ }
+
+ auto map = map_array.value_slice(map_array_idx);
+ auto keys = checked_cast<const StructArray&>(*map).field(0);
+ auto items = checked_cast<const StructArray&>(*map).field(1);
+ bool found_at_least_one_key = false;
+ std::unique_ptr<ArrayBuilder> list_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
map_array.map_type()->item_type(),
+ &list_builder));
+
+ RETURN_NOT_OK(BuildItemsArray(*keys, *items, *query_key,
&found_at_least_one_key,
+ list_builder.get()));
+ if (!found_at_least_one_key) {
+ RETURN_NOT_OK(builder->AppendNull());
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto list_result, list_builder->Finish());
+ RETURN_NOT_OK(builder->AppendScalar(ListScalar(list_result)));
+ }
+ list_builder->Reset();
Review comment:
This is inefficient because we're building an Array, allocating a
Scalar, and then copying the data to the builder. However, a ListBuilder lets
you directly append to the child array.
This should do something like
```
auto* values_builder =
checked_cast<ListBuilder*>(builder.get())->values_builder();
builder->Append(true); // start a new list item
FindMatchingIndices(..., [&](int64_t index) {
values_builder->AppendArraySlice(...);
});
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]