js8544 commented on code in PR #38502:
URL: https://github.com/apache/arrow/pull/38502#discussion_r1388920322


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -118,13 +115,8 @@ class ChunkedArraySorter : public TypeVisitor {
       };
       auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
                                  uint64_t* range_end, uint64_t* temp_indices) {
-        if (is_dictionary(*physical_type_)) {
-          MergeNonNulls<DictionaryType>(range_begin, range_middle, range_end, 
arrays,
-                                        temp_indices);
-        } else {
-          MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, 
arrays,
-                                   temp_indices);
-        }
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,

Review Comment:
   I'm sorry my intentions were not clear. I agree with your 
`ChunkedArraySorter::MergeNonNulls<DictionaryType>` approach, but you don't 
need to have a `if` clause to dispatch it, because `MergeNonNulls<ArrayType>` 
should be enough to dispatch it (here ArrayType==DictionaryType).



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -236,67 +190,6 @@ class ChunkedArraySorter : public TypeVisitor {
   NullPartitionResult* output_;
 };
 
-template <>
-void ChunkedArraySorter::MergeNonNulls<DictionaryType>(

Review Comment:
   Here's an example of my intended idea (may not work as is, and may not even 
compile):
   ```cpp
   class DictionaryMergeSorter {
     template <typename T>
     std::enable_if_t<has_string_view<T>::value || has_c_type<T>::value, 
Status> Visit(
         const T& type, const SortOrder order, uint64_t* range_begin, uint64_t* 
range_middle,
         uint64_t* range_end, const std::vector<const Array*>& arrays,
         uint64_t* temp_indices) {
       using ArrayType = typename TypeTraits<T>::ArrayType;
   
       // This lambda functions should probably be refactored to 
codegen_internal.h or
       // chunked_internal.h
       auto GetView = [](const Array& array, uint64_t index) {
         const auto& dictionary_array = checked_cast<const 
DictionaryArray&>(array);
         // GetValueIndex is expensive and can be further optimized
         int64_t value_index = dictionary_array.GetValueIndex(index);
         return checked_cast<const ArrayType&>(*dictionary_array.dictionary())
             .GetView(value_index);
       };
   
       const ChunkedArrayResolver left_resolver(arrays);
       const ChunkedArrayResolver right_resolver(arrays);
       if (order == SortOrder::Ascending) {
         std::merge(range_begin, range_middle, range_middle, range_end, 
temp_indices,
                    [&](uint64_t left, uint64_t right) {
                      const auto chunk_left = 
left_resolver.Resolve<Array>(left);
                      const auto chunk_right = 
right_resolver.Resolve<Array>(right);
                      auto left_value = GetView(*chunk_left.array, 
chunk_left.index);
                      auto right_value = GetView(*chunk_right.array, 
chunk_right.index);
                      return left_value < right_value;
                    });
       } else {
         std::merge(range_begin, range_middle, range_middle, range_end, 
temp_indices,
                    [&](uint64_t left, uint64_t right) {
                      const auto chunk_left = 
left_resolver.Resolve<Array>(left);
                      const auto chunk_right = 
right_resolver.Resolve<Array>(right);
                      auto left_value = GetView(*chunk_left.array, 
chunk_left.index);
                      auto right_value = GetView(*chunk_right.array, 
chunk_right.index);
                      return left_value > right_value;
                    });
       }
     }
   
     Status Visit(const DataType& type, const SortOrder order, uint64_t* 
range_begin,
                  uint64_t* range_middle, uint64_t* range_end,
                  const std::vector<const Array*>& arrays, uint64_t* 
temp_indices) {
       return Status::TypeError("Unsupported type for Dictionary sorting: ",
                                type.ToString());
     }
   
    public:
     Status Merge(const SortOrder order, uint64_t* range_begin, uint64_t* 
range_middle,
                  uint64_t* range_end, const std::vector<const Array*>& arrays,
                  uint64_t* temp_indices) {
       auto dict_value_type =
           checked_cast<const DictionaryArray&>(arrays[0]).dictionary()->type();
       return VisitTypeInline(*dict_value_type, this, range_begin, 
range_middle, range_end,
                              arrays, temp_indices);
     }
   };
   
   template <>
   Status ChunkedArraySorter::MergeNonNulls<DictionaryType>(
       uint64_t* range_begin, uint64_t* range_middle, uint64_t* range_end,
       const std::vector<const Array*>& arrays, uint64_t* temp_indices) {
     DictionaryMergeSorter sorter;
     return sorter.Merge(order_, range_begin, range_middle, range_end, arrays, 
temp_indices);
   }
   ```
   In this way there is one type dispatch per chunk, rather than one per 
element when using a variant. The dispatching can even be moved up further, but 
I'm not sure how to do it elegantly.



##########
cpp/src/arrow/array/array_dict.cc:
##########
@@ -78,6 +78,13 @@ int64_t DictionaryArray::GetValueIndex(int64_t i) const {
   }
 }
 
+ViewType DictionaryArray::GetView(int64_t i) const {

Review Comment:
   I'm sorry that my suggestions were not clear. I meant to suggest a visitor 
pattern in the sort kernel implementation, not in DictionaryArray definition. 
   
   The dictionary value can be of any arrow type and not all types have a view. 
For example, `dictionary<list<string>>` can't have a proper view type because 
lists don't have view types. 
   
   And having a view of a variant type may be slow. Because variant's 
comparison is a dynamic dispatching process and it will happen for each pair of 
values. We can move the dispatching higher up to make it faster. I'll give an 
example below.



##########
cpp/src/arrow/array/array_dict.h:
##########
@@ -20,17 +20,23 @@
 #include <cstdint>
 #include <memory>
 
+#include "arrow/array.h"
 #include "arrow/array/array_base.h"
+#include "arrow/array/array_primitive.h"
 #include "arrow/array/data.h"
 #include "arrow/result.h"
 #include "arrow/scalar.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
 
+using ViewType = std::variant<std::string_view, int32_t>;

Review Comment:
   If we use a variant then it's going to have so many variants that it's 
essentially the `Scalar` class



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to