[GitHub] [arrow] lidavidm commented on a change in pull request #11446: ARROW-14181: [C++][Compute] Support for dictionaries in hash join

GitBox Wed, 03 Nov 2021 18:27:35 -0700


lidavidm commented on a change in pull request #11446:
URL: https://github.com/apache/arrow/pull/11446#discussion_r741977708




##########
File path: cpp/src/arrow/compute/exec/hash_join_dict.cc
##########
@@ -0,0 +1,667 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/hash_join_dict.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+
+// Returns true when the probe-side and build-side key types are equal once
+// any dictionary type has been replaced by its value type.
+bool HashJoinDictUtil::KeyDataTypesValid(
+    const std::shared_ptr<DataType>& probe_data_type,
+    const std::shared_ptr<DataType>& build_data_type) {
+  // Strips one level of dictionary encoding; any other type is returned as is.
+  auto underlying_type = [](const std::shared_ptr<DataType>& type) -> DataType* {
+    if (type->id() != Type::DICTIONARY) {
+      return type.get();
+    }
+    return checked_cast<const DictionaryType&>(*type).value_type().get();
+  };
+
+  const DataType* probe_type = underlying_type(probe_data_type);
+  const DataType* build_type = underlying_type(build_data_type);
+  return probe_type->Equals(*build_type);
+}
+
+// Translates dictionary indices through a lookup table.
+//
+// `indices` (an array or a scalar) is converted with ConvertToInt32, using the
+// index type of the dictionary type `data_type`. Each valid index is then
+// replaced by the matching entry of `map_array`. A slot whose index is null,
+// or whose entry in `map_array` is null, ends up null and holds kNullId.
+Result<std::shared_ptr<ArrayData>> HashJoinDictUtil::IndexRemapUsingLUT(
+    ExecContext* ctx, const Datum& indices, int64_t batch_length,
+    const std::shared_ptr<ArrayData>& map_array,
+    const std::shared_ptr<DataType>& data_type) {
+  ARROW_DCHECK(indices.is_array() || indices.is_scalar());
+
+  // Lookup table: validity bitmap followed by the int32 payload.
+  const uint8_t* lut_validity = map_array->buffers[0]->data();
+  const int32_t* lut_values =
+      reinterpret_cast<const int32_t*>(map_array->buffers[1]->data());
+
+  ARROW_DCHECK(data_type->id() == Type::DICTIONARY);
+  const auto& dict_type = checked_cast<const DictionaryType&>(*data_type);
+
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<ArrayData> result,
+      ConvertToInt32(dict_type.index_type(), indices, batch_length, ctx));
+
+  // The converted buffers are rewritten in place.
+  uint8_t* out_validity = result->buffers[0]->mutable_data();
+  int32_t* out_ids = reinterpret_cast<int32_t*>(result->buffers[1]->mutable_data());
+
+  for (int64_t row = 0; row < batch_length; ++row) {
+    if (BitUtil::GetBit(out_validity, row)) {
+      const int32_t id = out_ids[row];
+      ARROW_DCHECK(id >= 0 && id < map_array->length);
+      if (BitUtil::GetBit(lut_validity, id)) {
+        out_ids[row] = lut_values[id];
+        continue;
+      }
+      // The index itself is valid but its lookup table entry is null.
+      BitUtil::ClearBit(out_validity, row);
+    }
+    out_ids[row] = kNullId;
+  }
+
+  return result;
+}
+
+namespace HashJoinDictUtilImp {
+// Converts `input` (an array or a scalar holding values of C type FROM) into a
+// newly allocated array of C type TO tagged with `to_type`.
+//
+// The result always carries its own validity bitmap. For an array input a bit
+// is cleared for every null input slot; a scalar input is broadcast to
+// `batch_length` slots that are all valid or all null. Errors from the buffer
+// allocations are returned to the caller.
+template <typename FROM, typename TO>
+static Result<std::shared_ptr<ArrayData>> ConvertImp(
+    const std::shared_ptr<DataType>& to_type, const Datum& input, int64_t batch_length,
+    ExecContext* ctx) {
+  ARROW_DCHECK(input.is_array() || input.is_scalar());
+  bool is_scalar = input.is_scalar();
+
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> to_buf,
+                        AllocateBuffer(batch_length * sizeof(TO), ctx->memory_pool()));
+  TO* to = reinterpret_cast<TO*>(to_buf->mutable_data());
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> to_nn_buf,
+                        AllocateBitmap(batch_length, ctx->memory_pool()));
+  uint8_t* to_nn = to_nn_buf->mutable_data();
+  // Start from "every slot valid"; bits are cleared below for null inputs.
+  memset(to_nn, 0xff, BitUtil::BytesForBits(batch_length));
+
+  if (!is_scalar) {
+    const ArrayData& arr = *input.array();
+    const FROM* from = arr.GetValues<FROM>(1);
+    DCHECK_EQ(arr.length, batch_length);
+
+    // An absent validity buffer is treated as "no nulls".
+    const uint8_t* from_nn =
+        (arr.buffers[0] != NULLPTR) ? arr.buffers[0]->data() : NULLPTR;
+
+    for (int64_t i = 0; i < arr.length; ++i) {
+      to[i] = static_cast<TO>(from[i]);
+
+      bool is_null = (from_nn != NULLPTR) && !BitUtil::GetBit(from_nn, arr.offset + i);
+      if (is_null) {
+        BitUtil::ClearBit(to_nn, i);
+      } else {
+        // Make sure we did not lose information during cast. Only valid slots
+        // are checked: the value stored under a null is unspecified and may
+        // not be representable in TO.
+        ARROW_DCHECK(static_cast<FROM>(to[i]) == from[i]);
+      }
+    }
+
+    // The output bitmap is a fresh copy of the input validity, without offset.
+    return ArrayData::Make(to_type, arr.length,
+                           {std::move(to_nn_buf), std::move(to_buf)});
+  } else {
+    const auto& scalar = input.scalar_as<arrow::internal::PrimitiveScalarBase>();
+    if (scalar.is_valid) {
+      const util::string_view data = scalar.view();
+      DCHECK_EQ(data.size(), sizeof(FROM));
+      const FROM from = *reinterpret_cast<const FROM*>(data.data());
+      const TO to_value = static_cast<TO>(from);
+      // Make sure we did not lose information during cast
+      ARROW_DCHECK(static_cast<FROM>(to_value) == from);
+
+      // Broadcast the value; the bitmap is already all ones.
+      for (int64_t i = 0; i < batch_length; ++i) {
+        to[i] = to_value;
+      }
+    } else {
+      // Null scalar: every output slot is null, values are left unset.
+      memset(to_nn, 0, BitUtil::BytesForBits(batch_length));
+    }
+    return ArrayData::Make(to_type, batch_length,
+                           {std::move(to_nn_buf), std::move(to_buf)});
+  }
+}
+}  // namespace HashJoinDictUtilImp
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictUtil::ConvertToInt32(
+    const std::shared_ptr<DataType>& from_type, const Datum& input, int64_t 
batch_length,
+    ExecContext* ctx) {
+  using namespace HashJoinDictUtilImp;

Review comment:
       This leads to a linter error. Maybe an anonymous namespace above would 
be more convenient?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] lidavidm commented on a change in pull request #11446: ARROW-14181: [C++][Compute] Support for dictionaries in hash join

Reply via email to