pitrou commented on a change in pull request #11022:
URL: https://github.com/apache/arrow/pull/11022#discussion_r712154851
##########
File path: ci/scripts/PKGBUILD
##########
@@ -80,9 +80,13 @@ build() {
export LIBS="-L${MINGW_PREFIX}/libs"
export ARROW_S3=OFF
export ARROW_WITH_RE2=OFF
+ # Without this, some dataset functionality segfaults
+ export CMAKE_UNITY_BUILD=ON
else
export ARROW_S3=ON
export ARROW_WITH_RE2=ON
+ # Without this, some compute functionality segfaults
+ export CMAKE_UNITY_BUILD=OFF
Review comment:
You mean it segfaults during compilation?
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -376,6 +455,37 @@ class DictionaryBuilderBase : public ArrayBuilder {
}
protected:
+  template <typename c_type>
+  Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
+                              const ArrayData& array, int64_t offset, int64_t length) {
+    const c_type* values = array.GetValues<c_type>(1) + offset;
+    return VisitBitBlocks(
+        array.buffers[0], array.offset + offset, length,
+        [&](const int64_t position) {
+          const int64_t index = static_cast<int64_t>(values[position]);
+          if (dict.IsValid(index)) {
+            return Append(dict.GetView(index));
+          }
+          return AppendNull();
+        },
+        [&]() { return AppendNull(); });
+  }
+
+  template <typename IndexType>
+  Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
+                          const Scalar& index_scalar, int64_t n_repeats) {
+    using ScalarType = typename TypeTraits<IndexType>::ScalarType;
+    const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
+    if (index_scalar.is_valid && dict.IsValid(index)) {
+      const auto& value = dict.GetView(index);
+      for (int64_t i = 0; i < n_repeats; i++) {
+        ARROW_RETURN_NOT_OK(Append(value));
Review comment:
Not for this PR, but it sounds like offering a two-step API on
DictionaryBuilder would allow for performance improvements:
```c++
/// Ensure `value` is in the dict and return its index, but don't append it
Result<int64_t> Encode(c_type value);
/// Append the given dictionary index
Status AppendIndex(int64_t index);
Status AppendIndices(int64_t index, int64_t nrepeats);
```
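For illustration only: assuming such an API existed (`Encode` and `AppendIndices` are the hypothetical methods proposed above, not current `DictionaryBuilder` API), `AppendScalarImpl` could intern the value once and then append the index `n_repeats` times, something like:
```c++
// Sketch only: Encode()/AppendIndices() are the proposed (hypothetical) methods.
template <typename IndexType>
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
                        const Scalar& index_scalar, int64_t n_repeats) {
  using ScalarType = typename TypeTraits<IndexType>::ScalarType;
  const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
  if (index_scalar.is_valid && dict.IsValid(index)) {
    // Hash the value once instead of doing a hash-table lookup per repetition.
    ARROW_ASSIGN_OR_RAISE(const int64_t encoded, Encode(dict.GetView(index)));
    return AppendIndices(encoded, n_repeats);
  }
  return AppendNulls(n_repeats);
}
```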
##########
File path: cpp/src/arrow/compute/kernels/test_util.cc
##########
@@ -170,6 +171,74 @@ void CheckScalar(std::string func_name, const DatumVector& inputs, Datum expecte
}
}
+Datum CheckDictionaryNonRecursive(const std::string& func_name, const DatumVector& args) {
+  EXPECT_OK_AND_ASSIGN(Datum actual, CallFunction(func_name, args));
+  ValidateOutput(actual);
+
+  DatumVector decoded_args;
+  decoded_args.reserve(args.size());
+  for (const auto& arg : args) {
+    if (arg.type()->id() == Type::DICTIONARY) {
+      const auto& to_type = checked_cast<const DictionaryType&>(*arg.type()).value_type();
+      EXPECT_OK_AND_ASSIGN(auto decoded, Cast(arg, to_type));
+      decoded_args.push_back(decoded);
+    } else {
+      decoded_args.push_back(arg);
+    }
+  }
+  EXPECT_OK_AND_ASSIGN(Datum expected, CallFunction(func_name, decoded_args));
+
+  if (actual.type()->id() == Type::DICTIONARY) {
Review comment:
Hmm, it would be nice if the caller actually said whether the output is
supposed to be dictionary-encoded or not. Otherwise there could be silent
regressions where the output type of a kernel changes from one version to
another.
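For instance, the helper could take the expectation from the caller (sketch only; the `expect_dictionary_result` parameter is hypothetical):
```c++
// Sketch: have the caller state whether the kernel's output should be
// dictionary-encoded, so a change in output type fails the test.
Datum CheckDictionaryNonRecursive(const std::string& func_name, const DatumVector& args,
                                  bool expect_dictionary_result) {
  EXPECT_OK_AND_ASSIGN(Datum actual, CallFunction(func_name, args));
  ValidateOutput(actual);
  EXPECT_EQ(expect_dictionary_result, actual.type()->id() == Type::DICTIONARY)
      << "unexpected output type: " << actual.type()->ToString();
  // ... decode the dictionary arguments and compare against the decoded call as above ...
  return actual;
}
```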
##########
File path: cpp/src/arrow/ipc/json_simple_test.cc
##########
@@ -1385,6 +1385,29 @@ TEST(TestScalarFromJSON, Errors) {
ASSERT_RAISES(Invalid, ScalarFromJSON(boolean(), "\"true\"", &scalar));
}
+TEST(TestDictScalarFromJSON, Basics) {
+  auto type = dictionary(int32(), utf8());
+  auto dict = R"(["whiskey", "tango", "foxtrot"])";
+  auto expected_dictionary = ArrayFromJSON(utf8(), dict);
+
+  for (auto index : {"null", "2", "1", "0"}) {
+    auto scalar = DictScalarFromJSON(type, index, dict);
+    auto expected_index = ScalarFromJSON(int32(), index);
+    AssertScalarsEqual(*DictionaryScalar::Make(expected_index, expected_dictionary),
Review comment:
Also call `scalar->ValidateFull()`?
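E.g. inside the loop (a sketch, reusing the test's existing helpers and the `ASSERT_OK` macro):
```c++
    auto scalar = DictScalarFromJSON(type, index, dict);
    ASSERT_OK(scalar->ValidateFull());  // also validate the constructed scalar
```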