westonpace commented on a change in pull request #8984:
URL: https://github.com/apache/arrow/pull/8984#discussion_r554217402
##########
File path: cpp/src/arrow/array/array_dict.cc
##########
@@ -44,6 +44,29 @@ namespace arrow {
using internal::checked_cast;
using internal::CopyBitmap;
+struct MaxIndexAccessor {
Review comment:
Even better, it turns out `util/int_util.h` already had what I needed.
I switched to using `internal::IntegersCanFit`.
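For context, a minimal sketch of that check, assuming the `Status IntegersCanFit(const Datum&, const DataType&)` overload that `util/int_util.h` exposed at the time (the helper and its argument names are hypothetical):

```cpp
#include "arrow/datum.h"
#include "arrow/util/int_util.h"

// Hypothetical helper: returns Status::Invalid if any value in `indices`
// would overflow the candidate dictionary index type.
Status CheckIndicesFit(const std::shared_ptr<Array>& indices,
                       const DataType& index_type) {
  return arrow::internal::IntegersCanFit(Datum(indices), index_type);
}
```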
##########
File path: cpp/src/arrow/array/concatenate.cc
##########
@@ -163,6 +163,46 @@ static Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset
return Status::OK();
}
+struct DictionaryConcatenate {
Review comment:
Done
##########
File path: cpp/src/arrow/type.h
##########
@@ -1367,6 +1367,12 @@ class ARROW_EXPORT DictionaryUnifier {
/// after this is called
virtual Status GetResult(std::shared_ptr<DataType>* out_type,
std::shared_ptr<Array>* out_dict) = 0;
+
+  /// \brief Return a result DictionaryType with the given index type. If
+  /// the index type is not large enough then an invalid status will be returned.
+  /// The unifier cannot be used after this is called
+ virtual Status GetResultWithIndexType(std::shared_ptr<DataType> index_type,
Review comment:
Correct, fixed.
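For illustration, a hedged sketch of the intended call pattern, assuming `DictionaryUnifier::Make` from `type.h` and an `out_dict` out-parameter mirroring `GetResult` (the signature above is truncated, and `dict_one`/`dict_two` are hypothetical):

```cpp
// Sketch only, not verbatim from the PR: unify two string dictionaries,
// then request the result with a caller-chosen index type.
ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(utf8()));
RETURN_NOT_OK(unifier->Unify(*dict_one));
RETURN_NOT_OK(unifier->Unify(*dict_two));
std::shared_ptr<Array> out_dict;
// Expected to return Status::Invalid if uint8 cannot hold every unified index.
RETURN_NOT_OK(unifier->GetResultWithIndexType(uint8(), &out_dict));
```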
##########
File path: cpp/src/arrow/array/concatenate_test.cc
##########
@@ -225,6 +225,86 @@ TEST_F(ConcatenateTest, DictionaryType) {
});
}
+TEST_F(ConcatenateTest, DictionaryTypeDifferentDictionaries) {
+ auto dict_type = dictionary(uint8(), utf8());
+ auto dict_one = DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0]",
+ "[\"A0\", \"A1\", \"A2\", \"A3\"]");
+ auto dict_two = DictArrayFromJSON(dict_type, "[null, 4, 2, 1]",
+                                    "[\"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]");
+ auto concat_expected = DictArrayFromJSON(
+ dict_type, "[1, 2, null, 3, 0, null, 8, 6, 5]",
+      "[\"A0\", \"A1\", \"A2\", \"A3\", \"B0\", \"B1\", \"B2\", \"B3\", \"B4\"]");
+ ASSERT_OK_AND_ASSIGN(auto concat_actual, Concatenate({dict_one, dict_two}));
+ AssertArraysEqual(*concat_expected, *concat_actual);
+}
+
+TEST_F(ConcatenateTest, DictionaryTypePartialOverlapDictionaries) {
+ auto dict_type = dictionary(uint8(), utf8());
+ auto dict_one = DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0]",
+ "[\"A0\", \"A1\", \"C2\", \"C3\"]");
+ auto dict_two = DictArrayFromJSON(dict_type, "[null, 4, 2, 1]",
+                                    "[\"B0\", \"B1\", \"C2\", \"C3\", \"B4\"]");
+ auto concat_expected =
+ DictArrayFromJSON(dict_type, "[1, 2, null, 3, 0, null, 6, 2, 5]",
+                      "[\"A0\", \"A1\", \"C2\", \"C3\", \"B0\", \"B1\", \"B4\"]");
+ ASSERT_OK_AND_ASSIGN(auto concat_actual, Concatenate({dict_one, dict_two}));
+ AssertArraysEqual(*concat_expected, *concat_actual);
+}
+
+TEST_F(ConcatenateTest, DictionaryTypeDifferentSizeIndex) {
+ auto dict_type = dictionary(uint8(), utf8());
+ auto bigger_dict_type = dictionary(uint16(), utf8());
+ auto dict_one = DictArrayFromJSON(dict_type, "[0]", "[\"A0\"]");
+ auto dict_two = DictArrayFromJSON(bigger_dict_type, "[0]", "[\"B0\"]");
+ ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status());
+}
+
+TEST_F(ConcatenateTest, DictionaryTypeCantUnifyNullInDictionary) {
+ auto dict_type = dictionary(uint8(), utf8());
+ auto dict_one = DictArrayFromJSON(dict_type, "[0, 1]", "[null, \"A\"]");
+ auto dict_two = DictArrayFromJSON(dict_type, "[0, 1]", "[null, \"B\"]");
+ ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status());
+}
+
+TEST_F(ConcatenateTest, DictionaryTypeEnlargedIndices) {
+ auto size = std::numeric_limits<uint8_t>::max() + 1;
+ auto dict_type = dictionary(uint8(), uint16());
+
+ UInt8Builder index_builder;
+ ASSERT_OK(index_builder.Reserve(size));
+ for (auto i = 0; i < size; i++) {
+ index_builder.UnsafeAppend(i);
+ }
+ ASSERT_OK_AND_ASSIGN(auto indices, index_builder.Finish());
+
+ UInt16Builder values_builder;
+ ASSERT_OK(values_builder.Reserve(size));
+ for (auto i = 0; i < size; i++) {
+ values_builder.UnsafeAppend(i);
+ }
+ ASSERT_OK_AND_ASSIGN(auto dictionary_one, values_builder.Finish());
+
+ UInt16Builder values_builder_two;
+ ASSERT_OK(values_builder_two.Reserve(size));
+ for (auto i = size; i < 2 * size; i++) {
Review comment:
I don't think `i*2` would work: this builds two arrays, the first with values [0, size) and the second with values [size, 2*size). I changed it a little so it is a single loop, if that is clearer (see the sketch below).
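For reference, a sketch of that single-loop shape (assumed, not verbatim from the updated PR):

```cpp
// One pass fills both dictionaries: the first gets values [0, size) and
// the second gets values [size, 2 * size), so the dictionaries are disjoint.
UInt16Builder values_one, values_two;
ASSERT_OK(values_one.Reserve(size));
ASSERT_OK(values_two.Reserve(size));
for (auto i = 0; i < size; i++) {
  values_one.UnsafeAppend(static_cast<uint16_t>(i));
  values_two.UnsafeAppend(static_cast<uint16_t>(size + i));
}
ASSERT_OK_AND_ASSIGN(auto dictionary_one, values_one.Finish());
ASSERT_OK_AND_ASSIGN(auto dictionary_two, values_two.Finish());
```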
##########
File path: cpp/src/arrow/array/array_dict.cc
##########
@@ -44,6 +44,29 @@ namespace arrow {
using internal::checked_cast;
using internal::CopyBitmap;
+struct MaxIndexAccessor {
+ MaxIndexAccessor() {}
+
+ template <typename T>
+ enable_if_t<!is_integer_type<T>::value, Status> Visit(const T&) {
+ return Status::Invalid("Dictionary index types must be integer types");
+ }
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ max_index_value_ = std::numeric_limits<typename T::c_type>::max();
+ return Status::OK();
+ }
+
+ int64_t max_index_value_ = 0;
+};
+
+Result<int64_t> DictionaryIndexMaxValue(std::shared_ptr<DataType> index_type) {
Review comment:
Good catch. N/A anymore since using int_util.
##########
File path: cpp/src/arrow/array/concatenate.cc
##########
@@ -163,6 +163,46 @@ static Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset
return Status::OK();
}
+struct DictionaryConcatenate {
+ DictionaryConcatenate(BufferVector& index_buffers,
+ std::vector<std::shared_ptr<Buffer>>& index_lookup,
+ MemoryPool* pool)
+ : out_(nullptr),
+ index_buffers_(index_buffers),
+ index_lookup_(index_lookup),
+ pool_(pool) {}
+
+ template <typename T>
+ enable_if_t<!is_integer_type<T>::value, Status> Visit(const T& t) {
+ return Status::Invalid("Dictionary indices must be integral types");
+ }
+
+ template <typename T, typename CType = typename T::c_type>
+ enable_if_integer<T, Status> Visit(const T& index_type) {
+ int64_t out_length = 0;
+ for (const auto& buffer : index_buffers_) {
+ out_length += buffer->size();
+ }
+ ARROW_ASSIGN_OR_RAISE(out_, AllocateBuffer(out_length, pool_));
+ auto out_data = out_->mutable_data();
+ for (size_t i = 0; i < index_buffers_.size(); i++) {
+ auto buffer = index_buffers_[i];
+ auto old_indices = reinterpret_cast<const CType*>(buffer->data());
+      auto indices_map =
+          reinterpret_cast<int32_t*>(index_lookup_[i]->mutable_data());
Review comment:
Fixed.
##########
File path: cpp/src/arrow/array/concatenate.cc
##########
@@ -274,7 +329,12 @@ class ConcatenateImpl {
ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed));
return ConcatenateBuffers(index_buffers, pool_).Value(&out_->buffers[1]);
} else {
- return Status::NotImplemented("Concat with dictionary unification NYI");
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed));
Review comment:
Fixed.
##########
File path: cpp/src/arrow/array/concatenate.cc
##########
@@ -163,6 +163,46 @@ static Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset
return Status::OK();
}
+struct DictionaryConcatenate {
+ DictionaryConcatenate(BufferVector& index_buffers,
+ std::vector<std::shared_ptr<Buffer>>& index_lookup,
+ MemoryPool* pool)
+ : out_(nullptr),
+ index_buffers_(index_buffers),
+ index_lookup_(index_lookup),
+ pool_(pool) {}
+
+ template <typename T>
+ enable_if_t<!is_integer_type<T>::value, Status> Visit(const T& t) {
+ return Status::Invalid("Dictionary indices must be integral types");
+ }
+
+ template <typename T, typename CType = typename T::c_type>
+ enable_if_integer<T, Status> Visit(const T& index_type) {
+ int64_t out_length = 0;
+ for (const auto& buffer : index_buffers_) {
+ out_length += buffer->size();
+ }
+ ARROW_ASSIGN_OR_RAISE(out_, AllocateBuffer(out_length, pool_));
+ auto out_data = out_->mutable_data();
Review comment:
Yes, as I think you noticed, the missing `reinterpret_cast` was a bug for index types wider than one byte. Fixed (see the sketch below).
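To make the fix concrete, a hedged sketch of the corrected remapping loop (the `CType*` cast on `out_data` and the bytes-to-elements division are the two fixes; names follow the visitor above, and null slots are assumed to carry in-range index values):

```cpp
// Sketch of the fixed Visit body, not the exact merged code.
auto out_data = reinterpret_cast<CType*>(out_->mutable_data());
for (size_t i = 0; i < index_buffers_.size(); i++) {
  auto buffer = index_buffers_[i];
  auto old_indices = reinterpret_cast<const CType*>(buffer->data());
  auto indices_map = reinterpret_cast<const int32_t*>(index_lookup_[i]->data());
  // buffer->size() is in bytes; divide by sizeof(CType) for the element count.
  int64_t num_indices = buffer->size() / static_cast<int64_t>(sizeof(CType));
  for (int64_t j = 0; j < num_indices; j++) {
    *out_data++ = static_cast<CType>(indices_map[old_indices[j]]);
  }
}
```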
##########
File path: cpp/src/arrow/array/concatenate.cc
##########
@@ -163,6 +163,46 @@ static Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset
return Status::OK();
}
+struct DictionaryConcatenate {
+ DictionaryConcatenate(BufferVector& index_buffers,
+ std::vector<std::shared_ptr<Buffer>>& index_lookup,
+ MemoryPool* pool)
+ : out_(nullptr),
+ index_buffers_(index_buffers),
+ index_lookup_(index_lookup),
+ pool_(pool) {}
+
+ template <typename T>
+ enable_if_t<!is_integer_type<T>::value, Status> Visit(const T& t) {
+ return Status::Invalid("Dictionary indices must be integral types");
+ }
+
+ template <typename T, typename CType = typename T::c_type>
+ enable_if_integer<T, Status> Visit(const T& index_type) {
+ int64_t out_length = 0;
+ for (const auto& buffer : index_buffers_) {
+ out_length += buffer->size();
+ }
+ ARROW_ASSIGN_OR_RAISE(out_, AllocateBuffer(out_length, pool_));
+ auto out_data = out_->mutable_data();
+ for (size_t i = 0; i < index_buffers_.size(); i++) {
+ auto buffer = index_buffers_[i];
Review comment:
Fixed anyway, for consistency.
##########
File path: cpp/src/arrow/array/concatenate_test.cc
##########
@@ -225,6 +225,86 @@ TEST_F(ConcatenateTest, DictionaryType) {
});
}
+TEST_F(ConcatenateTest, DictionaryTypeDifferentDictionaries) {
+ auto dict_type = dictionary(uint8(), utf8());
Review comment:
I added a test and it exposed two issues: the missing `reinterpret_cast` and the way I was computing the size (# bytes vs. # elements).