lidavidm commented on a change in pull request #12162:
URL: https://github.com/apache/arrow/pull/12162#discussion_r791702912
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -429,6 +429,169 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
{"*args"},
"MakeStructOptions"};
+struct MapArrayLookupFunctor {
+ static Result<int64_t> FindOneMapValueIndex(const Array& keys, const Scalar&
query_key,
+ const int64_t start, const
int64_t end,
+ const bool from_back = false) {
+ if (!from_back) {
+ for (int64_t idx = start; idx < end; ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ } else {
+ for (int64_t idx = end - 1; idx >= start; --idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ }
+ return -1;
+ }
+
+ static Result<std::shared_ptr<Scalar>> GetScalarOutput(KernelContext* ctx,
+ const MapScalar
map_scalar) {
Review comment:
For LAST, you could keep track of the index outside the "loop" and
simply overwrite it each time (though that is perhaps not as efficient),
returning the final index. We could later define a visitor that goes in
reverse.
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -429,6 +429,167 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
{"*args"},
"MakeStructOptions"};
+struct MapArrayLookupFunctor {
+ static Result<int64_t> FindOneMapValueIndex(const Array& keys, const Scalar&
query_key,
+ const int64_t start, const
int64_t end,
+ const bool from_back = false) {
+ if (!from_back) {
+ for (int64_t idx = start; idx < end; ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ } else {
+ for (int64_t idx = end - 1; idx >= start; --idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ }
+ return -1;
+ }
+
+ static Status SetScalarOutput(const Array& keys, const Array& items,
KernelContext* ctx,
+ const MapArrayLookupOptions& options,
+ std::shared_ptr<Scalar>& out) {
+ const Scalar& query_key = *options.query_key;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder));
+
+ bool found_at_least_one_key = false;
+ for (int64_t idx = 0; idx < keys.length(); ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) {
+ found_at_least_one_key = true;
+ RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), idx, 1));
+ }
+ }
+ if (!found_at_least_one_key) {
+ out = MakeNullScalar(list(items.type()));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ ARROW_ASSIGN_OR_RAISE(out, MakeScalar(list(items.type()), result));
+ }
+ } else { /* occurrence == FIRST || LAST */
+ bool from_back = (options.occurrence == MapArrayLookupOptions::LAST);
+
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t key_match_idx,
+ FindOneMapValueIndex(keys, query_key, 0, keys.length(), from_back));
+ if (key_match_idx != -1) {
+ ARROW_ASSIGN_OR_RAISE(out, items.GetScalar(key_match_idx));
+ } else {
+ out = MakeNullScalar(items.type());
+ }
+ }
+ return Status::OK();
+ }
+
+ static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch,
Datum* out) {
+ const auto& options = OptionsWrapper<MapArrayLookupOptions>::Get(ctx);
+
+ const MapArray map_array(batch[0].array());
+
+ std::unique_ptr<ArrayBuilder> builder;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
+ list(map_array.map_type()->item_type()),
&builder));
+ } else {
+ RETURN_NOT_OK(
+ MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(),
&builder));
+ }
+
+ for (int64_t map_array_idx = 0; map_array_idx < map_array.length();
++map_array_idx) {
+ if (!map_array.IsValid(map_array_idx)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ } else {
+ auto map = map_array.value_slice(map_array_idx);
+ auto keys = checked_cast<const StructArray&>(*map).field(0);
+ auto items = checked_cast<const StructArray&>(*map).field(1);
+ std::shared_ptr<Scalar> output;
+ RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output));
Review comment:
I would say, don't use SetScalarOutput and stick to AppendArraySlice.
Allocating a Scalar for each item will be expensive, especially for things like
strings, where you have to copy the data into a Scalar only to copy it again
into an Array.
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -429,6 +429,167 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
{"*args"},
"MakeStructOptions"};
+struct MapArrayLookupFunctor {
+ static Result<int64_t> FindOneMapValueIndex(const Array& keys, const Scalar&
query_key,
+ const int64_t start, const
int64_t end,
+ const bool from_back = false) {
+ if (!from_back) {
+ for (int64_t idx = start; idx < end; ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ } else {
+ for (int64_t idx = end - 1; idx >= start; --idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ }
+ return -1;
+ }
+
+ static Status SetScalarOutput(const Array& keys, const Array& items,
KernelContext* ctx,
+ const MapArrayLookupOptions& options,
+ std::shared_ptr<Scalar>& out) {
+ const Scalar& query_key = *options.query_key;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder));
+
+ bool found_at_least_one_key = false;
+ for (int64_t idx = 0; idx < keys.length(); ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) {
+ found_at_least_one_key = true;
+ RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), idx, 1));
+ }
+ }
+ if (!found_at_least_one_key) {
+ out = MakeNullScalar(list(items.type()));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ ARROW_ASSIGN_OR_RAISE(out, MakeScalar(list(items.type()), result));
+ }
+ } else { /* occurrence == FIRST || LAST */
+ bool from_back = (options.occurrence == MapArrayLookupOptions::LAST);
+
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t key_match_idx,
+ FindOneMapValueIndex(keys, query_key, 0, keys.length(), from_back));
+ if (key_match_idx != -1) {
+ ARROW_ASSIGN_OR_RAISE(out, items.GetScalar(key_match_idx));
+ } else {
+ out = MakeNullScalar(items.type());
+ }
+ }
+ return Status::OK();
+ }
+
+ static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch,
Datum* out) {
+ const auto& options = OptionsWrapper<MapArrayLookupOptions>::Get(ctx);
+
+ const MapArray map_array(batch[0].array());
+
+ std::unique_ptr<ArrayBuilder> builder;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
+ list(map_array.map_type()->item_type()),
&builder));
+ } else {
+ RETURN_NOT_OK(
+ MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(),
&builder));
+ }
+
+ for (int64_t map_array_idx = 0; map_array_idx < map_array.length();
++map_array_idx) {
+ if (!map_array.IsValid(map_array_idx)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ } else {
+ auto map = map_array.value_slice(map_array_idx);
+ auto keys = checked_cast<const StructArray&>(*map).field(0);
+ auto items = checked_cast<const StructArray&>(*map).field(1);
+ std::shared_ptr<Scalar> output;
+ RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output));
Review comment:
In other words - you don't need SetScalarOutput. I realize it shares
more code between the two cases, but that sharing now comes at the cost of
performance. FindOneMapValueIndex is the important part to refactor, since that
will change when this kernel is templated, but using Scalars in the Array case
is strictly overhead.
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -429,6 +429,169 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
{"*args"},
"MakeStructOptions"};
+struct MapArrayLookupFunctor {
+ static Result<int64_t> FindOneMapValueIndex(const Array& keys, const Scalar&
query_key,
+ const int64_t start, const
int64_t end,
+ const bool from_back = false) {
+ if (!from_back) {
+ for (int64_t idx = start; idx < end; ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ } else {
+ for (int64_t idx = end - 1; idx >= start; --idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ }
+ return -1;
+ }
+
+ static Result<std::shared_ptr<Scalar>> GetScalarOutput(KernelContext* ctx,
+ const MapScalar
map_scalar) {
Review comment:
And yes, templating is for the type of the key.
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -429,6 +429,167 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
{"*args"},
"MakeStructOptions"};
+struct MapArrayLookupFunctor {
+ static Result<int64_t> FindOneMapValueIndex(const Array& keys, const Scalar&
query_key,
+ const int64_t start, const
int64_t end,
+ const bool from_back = false) {
+ if (!from_back) {
+ for (int64_t idx = start; idx < end; ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ } else {
+ for (int64_t idx = end - 1; idx >= start; --idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ }
+ return -1;
+ }
+
+ static Status SetScalarOutput(const Array& keys, const Array& items,
KernelContext* ctx,
+ const MapArrayLookupOptions& options,
+ std::shared_ptr<Scalar>& out) {
+ const Scalar& query_key = *options.query_key;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder));
+
+ bool found_at_least_one_key = false;
+ for (int64_t idx = 0; idx < keys.length(); ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) {
+ found_at_least_one_key = true;
+ RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), idx, 1));
+ }
+ }
+ if (!found_at_least_one_key) {
+ out = MakeNullScalar(list(items.type()));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ ARROW_ASSIGN_OR_RAISE(out, MakeScalar(list(items.type()), result));
+ }
+ } else { /* occurrence == FIRST || LAST */
+ bool from_back = (options.occurrence == MapArrayLookupOptions::LAST);
+
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t key_match_idx,
+ FindOneMapValueIndex(keys, query_key, 0, keys.length(), from_back));
+ if (key_match_idx != -1) {
+ ARROW_ASSIGN_OR_RAISE(out, items.GetScalar(key_match_idx));
+ } else {
+ out = MakeNullScalar(items.type());
+ }
+ }
+ return Status::OK();
+ }
+
+ static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch,
Datum* out) {
+ const auto& options = OptionsWrapper<MapArrayLookupOptions>::Get(ctx);
+
+ const MapArray map_array(batch[0].array());
+
+ std::unique_ptr<ArrayBuilder> builder;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
+ list(map_array.map_type()->item_type()),
&builder));
+ } else {
+ RETURN_NOT_OK(
+ MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(),
&builder));
+ }
+
+ for (int64_t map_array_idx = 0; map_array_idx < map_array.length();
++map_array_idx) {
+ if (!map_array.IsValid(map_array_idx)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ } else {
+ auto map = map_array.value_slice(map_array_idx);
+ auto keys = checked_cast<const StructArray&>(*map).field(0);
+ auto items = checked_cast<const StructArray&>(*map).field(1);
+ std::shared_ptr<Scalar> output;
+ RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output));
Review comment:
Yes, I was just suggesting that calculating the offset could be factored
out.
##########
File path: cpp/src/arrow/compute/kernels/scalar_nested.cc
##########
@@ -429,6 +429,167 @@ const FunctionDoc make_struct_doc{"Wrap Arrays into a
StructArray",
{"*args"},
"MakeStructOptions"};
+struct MapArrayLookupFunctor {
+ static Result<int64_t> FindOneMapValueIndex(const Array& keys, const Scalar&
query_key,
+ const int64_t start, const
int64_t end,
+ const bool from_back = false) {
+ if (!from_back) {
+ for (int64_t idx = start; idx < end; ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ } else {
+ for (int64_t idx = end - 1; idx >= start; --idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) return idx;
+ }
+ }
+ return -1;
+ }
+
+ static Status SetScalarOutput(const Array& keys, const Array& items,
KernelContext* ctx,
+ const MapArrayLookupOptions& options,
+ std::shared_ptr<Scalar>& out) {
+ const Scalar& query_key = *options.query_key;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), items.type(), &builder));
+
+ bool found_at_least_one_key = false;
+ for (int64_t idx = 0; idx < keys.length(); ++idx) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> key,
keys.GetScalar(idx));
+
+ if (key->Equals(query_key)) {
+ found_at_least_one_key = true;
+ RETURN_NOT_OK(builder->AppendArraySlice(*items.data(), idx, 1));
+ }
+ }
+ if (!found_at_least_one_key) {
+ out = MakeNullScalar(list(items.type()));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+ ARROW_ASSIGN_OR_RAISE(out, MakeScalar(list(items.type()), result));
+ }
+ } else { /* occurrence == FIRST || LAST */
+ bool from_back = (options.occurrence == MapArrayLookupOptions::LAST);
+
+ ARROW_ASSIGN_OR_RAISE(
+ int64_t key_match_idx,
+ FindOneMapValueIndex(keys, query_key, 0, keys.length(), from_back));
+ if (key_match_idx != -1) {
+ ARROW_ASSIGN_OR_RAISE(out, items.GetScalar(key_match_idx));
+ } else {
+ out = MakeNullScalar(items.type());
+ }
+ }
+ return Status::OK();
+ }
+
+ static Status ExecMapArray(KernelContext* ctx, const ExecBatch& batch,
Datum* out) {
+ const auto& options = OptionsWrapper<MapArrayLookupOptions>::Get(ctx);
+
+ const MapArray map_array(batch[0].array());
+
+ std::unique_ptr<ArrayBuilder> builder;
+ if (options.occurrence == MapArrayLookupOptions::Occurrence::ALL) {
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(),
+ list(map_array.map_type()->item_type()),
&builder));
+ } else {
+ RETURN_NOT_OK(
+ MakeBuilder(ctx->memory_pool(), map_array.map_type()->item_type(),
&builder));
+ }
+
+ for (int64_t map_array_idx = 0; map_array_idx < map_array.length();
++map_array_idx) {
+ if (!map_array.IsValid(map_array_idx)) {
+ RETURN_NOT_OK(builder->AppendNull());
+ continue;
+ } else {
+ auto map = map_array.value_slice(map_array_idx);
+ auto keys = checked_cast<const StructArray&>(*map).field(0);
+ auto items = checked_cast<const StructArray&>(*map).field(1);
+ std::shared_ptr<Scalar> output;
+ RETURN_NOT_OK(SetScalarOutput(*keys, *items, ctx, options, output));
Review comment:
Sure, but that case can just have offset=0. I'm suggesting this
because this is the code that will change the most when the kernel gets
templated, so refactoring it now will save some effort then.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]