pitrou commented on a change in pull request #11022:
URL: https://github.com/apache/arrow/pull/11022#discussion_r712154851
##########
File path: ci/scripts/PKGBUILD
##########
@@ -80,9 +80,13 @@ build() {
export LIBS="-L${MINGW_PREFIX}/libs"
export ARROW_S3=OFF
export ARROW_WITH_RE2=OFF
+ # Without this, some dataset functionality segfaults
+ export CMAKE_UNITY_BUILD=ON
else
export ARROW_S3=ON
export ARROW_WITH_RE2=ON
+ # Without this, some compute functionality segfaults
+ export CMAKE_UNITY_BUILD=OFF
Review comment:
You mean it segfaults during compilation?
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -376,6 +455,37 @@ class DictionaryBuilderBase : public ArrayBuilder {
}
protected:
+  template <typename c_type>
+  Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
+                              const ArrayData& array, int64_t offset, int64_t length) {
+    const c_type* values = array.GetValues<c_type>(1) + offset;
+    return VisitBitBlocks(
+        array.buffers[0], array.offset + offset, length,
+        [&](const int64_t position) {
+          const int64_t index = static_cast<int64_t>(values[position]);
+          if (dict.IsValid(index)) {
+            return Append(dict.GetView(index));
+          }
+          return AppendNull();
+        },
+        [&]() { return AppendNull(); });
+  }
+
+  template <typename IndexType>
+  Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
+                          const Scalar& index_scalar, int64_t n_repeats) {
+    using ScalarType = typename TypeTraits<IndexType>::ScalarType;
+    const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
+    if (index_scalar.is_valid && dict.IsValid(index)) {
+      const auto& value = dict.GetView(index);
+      for (int64_t i = 0; i < n_repeats; i++) {
+        ARROW_RETURN_NOT_OK(Append(value));
Review comment:
Not for this PR, but it sounds like offering a two-step API on
DictionaryBuilder would allow for performance improvements:
```c++
/// Ensure `value` is in the dict and return its index, but don't append it
Result<int64_t> Encode(c_type value);
/// Append the given dictionary index
Status AppendIndex(int64_t index);
Status AppendIndices(int64_t index, int64_t nrepeats);
```
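For illustration only: assuming such an API existed (`Encode` and `AppendIndices` are the hypothetical methods proposed above, not current `DictionaryBuilder` API), `AppendScalarImpl` could intern the value once and then append the index `n_repeats` times, something like:
```c++
// Sketch only: Encode()/AppendIndices() are the proposed (hypothetical) methods.
template <typename IndexType>
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
                        const Scalar& index_scalar, int64_t n_repeats) {
  using ScalarType = typename TypeTraits<IndexType>::ScalarType;
  const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
  if (index_scalar.is_valid && dict.IsValid(index)) {
    // Hash the value once instead of doing a hash-table lookup per repetition.
    ARROW_ASSIGN_OR_RAISE(const int64_t encoded, Encode(dict.GetView(index)));
    return AppendIndices(encoded, n_repeats);
  }
  return AppendNulls(n_repeats);
}
```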
##########
File path: cpp/src/arrow/compute/kernels/test_util.cc
##########
@@ -170,6 +171,74 @@ void CheckScalar(std::string func_name, const DatumVector& inputs, Datum expecte
}
}
+Datum CheckDictionaryNonRecursive(const std::string& func_name, const DatumVector& args) {
+  EXPECT_OK_AND_ASSIGN(Datum actual, CallFunction(func_name, args));
+  ValidateOutput(actual);
+
+  DatumVector decoded_args;
+  decoded_args.reserve(args.size());
+  for (const auto& arg : args) {
+    if (arg.type()->id() == Type::DICTIONARY) {
+      const auto& to_type = checked_cast<const DictionaryType&>(*arg.type()).value_type();
+      EXPECT_OK_AND_ASSIGN(auto decoded, Cast(arg, to_type));
+      decoded_args.push_back(decoded);
+    } else {
+      decoded_args.push_back(arg);
+    }
+  }
+  EXPECT_OK_AND_ASSIGN(Datum expected, CallFunction(func_name, decoded_args));
+
+  if (actual.type()->id() == Type::DICTIONARY) {
Review comment:
Hmm, it would be nice if the caller actually said whether the output is
supposed to be dictionary-encoded or not. Otherwise there could be silent
regressions where the output type of a kernel changes from one version to
another.
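For instance, the helper could take the expectation from the caller (sketch only; the `expect_dictionary_result` parameter is hypothetical):
```c++
// Sketch: have the caller state whether the kernel's output should be
// dictionary-encoded, so a change in output type fails the test.
Datum CheckDictionaryNonRecursive(const std::string& func_name, const DatumVector& args,
                                  bool expect_dictionary_result) {
  EXPECT_OK_AND_ASSIGN(Datum actual, CallFunction(func_name, args));
  ValidateOutput(actual);
  EXPECT_EQ(expect_dictionary_result, actual.type()->id() == Type::DICTIONARY)
      << "unexpected output type: " << actual.type()->ToString();
  // ... decode the dictionary arguments and compare against the decoded call as above ...
  return actual;
}
```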
##########
File path: cpp/src/arrow/ipc/json_simple_test.cc
##########
@@ -1385,6 +1385,29 @@ TEST(TestScalarFromJSON, Errors) {
ASSERT_RAISES(Invalid, ScalarFromJSON(boolean(), "\"true\"", &scalar));
}
+TEST(TestDictScalarFromJSON, Basics) {
+  auto type = dictionary(int32(), utf8());
+  auto dict = R"(["whiskey", "tango", "foxtrot"])";
+  auto expected_dictionary = ArrayFromJSON(utf8(), dict);
+
+  for (auto index : {"null", "2", "1", "0"}) {
+    auto scalar = DictScalarFromJSON(type, index, dict);
+    auto expected_index = ScalarFromJSON(int32(), index);
+    AssertScalarsEqual(*DictionaryScalar::Make(expected_index, expected_dictionary),
Review comment:
Also call `scalar->ValidateFull()`?
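E.g. inside the loop (a sketch, reusing the test's existing helpers and the `ASSERT_OK` macro):
```c++
    auto scalar = DictScalarFromJSON(type, index, dict);
    ASSERT_OK(scalar->ValidateFull());  // also validate the constructed scalar
```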