pitrou commented on a change in pull request #11022:
URL: https://github.com/apache/arrow/pull/11022#discussion_r702935784
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -282,6 +294,163 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendEmptyValues(length);
}
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ",
scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ if (!scalar.is_valid) return AppendNulls(n_repeats);
+
+ const auto& dict_ty = internal::checked_cast<const
DictionaryType&>(*scalar.type);
+ const DictionaryScalar& dict_scalar =
+ internal::checked_cast<const DictionaryScalar&>(scalar);
+ const auto& dict = internal::checked_cast<const typename
TypeTraits<T>::ArrayType&>(
+ *dict_scalar.value.dictionary);
+ switch (dict_ty.index_type()->id()) {
+ case Type::UINT8: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt8Scalar&>(*dict_scalar.value.index).value);
Review comment:
What happens if `dict` has a null at this index?
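For illustration, a minimal sketch of the kind of guard I mean, reusing the names from the diff above (`dict`, `dict_scalar`, `Append`, `AppendNulls`); whether a null dictionary entry should become a null value or an error is the open question:
```c++
// Hypothetical guard: look up the index once, and treat a null dictionary
// entry as appending nulls instead of calling GetView on it.
const auto index =
    internal::checked_cast<const UInt8Scalar&>(*dict_scalar.value.index).value;
if (dict.IsNull(index)) {
  return AppendNulls(n_repeats);
}
const auto& value = dict.GetView(index);
for (int64_t i = 0; i < n_repeats; i++) {
  ARROW_RETURN_NOT_OK(Append(value));
}
```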
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -282,6 +294,163 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendEmptyValues(length);
}
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ",
scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ if (!scalar.is_valid) return AppendNulls(n_repeats);
+
+ const auto& dict_ty = internal::checked_cast<const
DictionaryType&>(*scalar.type);
+ const DictionaryScalar& dict_scalar =
+ internal::checked_cast<const DictionaryScalar&>(scalar);
+ const auto& dict = internal::checked_cast<const typename
TypeTraits<T>::ArrayType&>(
+ *dict_scalar.value.dictionary);
+ switch (dict_ty.index_type()->id()) {
+ case Type::UINT8: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt8Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT8: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int8Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT16: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt16Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT16: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int16Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT32: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt32Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT32: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int32Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT64: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt64Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT64: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int64Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ default:
+ return Status::TypeError("Invalid index type: ", dict_ty);
+ }
+ return Status::OK();
+ }
+
+ Status AppendScalars(const ScalarVector& scalars) override {
+ for (const auto& scalar : scalars) {
+ ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
+ }
+ return Status::OK();
+ }
+
+ Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
+ // Visit the indices and insert the unpacked values.
+ const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
+ const typename TypeTraits<T>::ArrayType dict(array.dictionary);
+ switch (dict_ty.index_type()->id()) {
+ case Type::UINT8: {
+ const uint8_t* values = array.GetValues<uint8_t>(1) + offset;
+ return VisitBitBlocks(
+ array.buffers[0], array.offset + offset, std::min(array.length, length),
Review comment:
Other `AppendArraySlice` implementations don't check that `length` is in
bounds, so `std::min` doesn't seem necessary here.
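For comparison, a sketch of the same call with the `std::min` dropped, just passing the caller's `length` straight through (nothing here beyond the lines quoted above):
```c++
// Same VisitBitBlocks call as in the diff, trusting `length` like the other
// AppendArraySlice implementations do.
return VisitBitBlocks(
    array.buffers[0], array.offset + offset, length,
    [&](int64_t position) { return Append(dict.GetView(values[position])); },
    [&]() { return AppendNull(); });
```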
##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1058,6 +1062,109 @@ void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_funct
DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
}
+// Given a reference dictionary, computes indices to map dictionary values from a
+// comparison dictionary to the reference.
+class DictionaryRemapper {
Review comment:
Don't we already have `DictionaryUnifier` for this? Or am I
misunderstanding?
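For reference, a rough sketch of what leaning on `DictionaryUnifier` (from `arrow/array/array_dict.h`) could look like here; the calls below are written from memory, so treat the exact signatures and the `dict_type`/`dictionary1`/`dictionary2` names as assumptions rather than what this PR does:
```c++
// Sketch: unify the argument dictionaries into one common dictionary and get,
// per input, a transposition buffer mapping its old indices to the unified ones.
ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(dict_type->value_type()));
std::shared_ptr<Buffer> transpose1, transpose2;
ARROW_RETURN_NOT_OK(unifier->Unify(*dictionary1, &transpose1));
ARROW_RETURN_NOT_OK(unifier->Unify(*dictionary2, &transpose2));
std::shared_ptr<DataType> unified_type;
std::shared_ptr<Array> unified_dict;
ARROW_RETURN_NOT_OK(unifier->GetResult(&unified_type, &unified_dict));
// The int32_t entries in transpose1/transpose2 would then play the role of the
// index mapping that DictionaryRemapper computes by value lookup.
```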
##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -624,6 +624,250 @@ TYPED_TEST(TestCaseWhenNumeric, ListOfType) {
ArrayFromJSON(type, R"([null, null, null, [6, null]])"));
}
+template <typename Type>
+class TestCaseWhenDict : public ::testing::Test {};
+
+struct JsonDict {
+ std::shared_ptr<DataType> type;
+ std::string value;
+};
+
+TYPED_TEST_SUITE(TestCaseWhenDict, IntegralArrowTypes);
+
+TYPED_TEST(TestCaseWhenDict, Simple) {
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ for (const auto& dict :
+ {JsonDict{utf8(), R"(["a", null, "bc", "def"])"},
+ JsonDict{int64(), "[1, null, 2, 3]"},
+ JsonDict{decimal256(3, 2), R"(["1.23", null, "3.45", "6.78"])"}}) {
+ auto type = dictionary(default_type_instance<TypeParam>(), dict.type);
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict.value);
+ auto values1 = DictArrayFromJSON(type, "[0, null, 3, 1]", dict.value);
+ auto values2 = DictArrayFromJSON(type, "[2, 1, null, 0]", dict.value);
+
+ // Easy case: all arguments have the same dictionary
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2,
values1},
+ DictArrayFromJSON(type, "[0, null, null, 1]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null,
values2, values1},
+ DictArrayFromJSON(type, "[null, null, null, 1]", dict.value));
+ }
+}
+
+TYPED_TEST(TestCaseWhenDict, Mixed) {
+ auto type = dictionary(default_type_instance<TypeParam>(), utf8());
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", null, "bc", "def"])";
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict);
+ auto values1_dict = DictArrayFromJSON(type, "[0, null, 3, 1]", dict);
+ auto values1_decoded = ArrayFromJSON(utf8(), R"(["a", null, "def", null])");
+ auto values2_dict = DictArrayFromJSON(type, "[2, 1, null, 0]", dict);
+ auto values2_decoded = ArrayFromJSON(utf8(), R"(["bc", null, null, "a"])");
+
+ // If we have mixed dictionary/non-dictionary arguments, we decode dictionaries
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_dict, values2_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_decoded, values2_dict},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values1_dict, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values_null, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"([null, null, null, null])"));
+}
+
+TYPED_TEST(TestCaseWhenDict, NestedSimple) {
+ auto make_list = [](const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& backing_array) {
+ EXPECT_OK_AND_ASSIGN(auto result, ListArray::FromArrays(*indices, *backing_array));
+ return result;
+ };
+ auto index_type = default_type_instance<TypeParam>();
+ auto inner_type = dictionary(index_type, utf8());
+ auto type = list(inner_type);
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", "b", "bc", "def"])";
+ auto values_null = make_list(ArrayFromJSON(int32(), "[null, null, null, null, 0]"),
+ DictArrayFromJSON(inner_type, "[]", dict));
+ auto values1_backing = DictArrayFromJSON(inner_type, "[0, null, 3, 1]", dict);
+ auto values2_backing = DictArrayFromJSON(inner_type, "[2, 1, null, 0]", dict);
+ auto values1 = make_list(ArrayFromJSON(int32(), "[0, 2, 2, 3, 4]"), values1_backing);
+ auto values2 = make_list(ArrayFromJSON(int32(), "[0, 1, 2, 2, 4]"), values2_backing);
+
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ make_list(ArrayFromJSON(int32(), "[0, 2, 2, null, 2]"),
+ DictArrayFromJSON(inner_type, "[0, null]", R"(["a"])")));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({cond1, cond2}), values1,
+ make_list(ArrayFromJSON(int32(), "[0, 1, null, 2, 4]"),
values2_backing)},
+ make_list(ArrayFromJSON(int32(), "[0, 2, null, null, 2]"),
+ DictArrayFromJSON(inner_type, "[0, null]", R"(["a"])")));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({cond1, cond2}), values1,
+ make_list(ArrayFromJSON(int32(), "[0, 1, null, 2, 4]"),
values2_backing), values1},
+ make_list(ArrayFromJSON(int32(), "[0, 2, null, 2, 3]"),
+ DictArrayFromJSON(inner_type, "[0, null, 1]", R"(["a",
"b"])")));
+
+ CheckScalarNonRecursive(
+ "case_when",
+ {
+ Datum(MakeStruct({cond1, cond2})),
+ Datum(std::make_shared<ListScalar>(
+ DictArrayFromJSON(inner_type, "[0, 1]", dict))),
+ Datum(std::make_shared<ListScalar>(
+ DictArrayFromJSON(inner_type, "[2, 3]", dict))),
+ },
+ make_list(ArrayFromJSON(int32(), "[0, 2, 4, null, 6]"),
+ DictArrayFromJSON(inner_type, "[0, 1, 0, 1, 2, 3]", dict)));
+
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({Datum(true), Datum(false)}), values1,
values2}, values1);
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({Datum(false), Datum(true)}), values1,
values2}, values2);
+ CheckScalarNonRecursive("case_when", {MakeStruct({Datum(false)}), values1,
values2},
+ values2);
+ CheckScalarNonRecursive("case_when",
+ {MakeStruct({Datum(false), Datum(false)}), values1,
values2},
+ values_null);
+}
+
+TYPED_TEST(TestCaseWhenDict, DifferentDictionaries) {
+ auto type = dictionary(default_type_instance<TypeParam>(), utf8());
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict1 = R"(["a", null, "bc", "def"])";
+ auto dict2 = R"(["bc", "foo", null, "a"])";
+ auto dict3 = R"(["def", "a", "a", "bc"])";
+ auto values1_null = DictArrayFromJSON(type, "[null, null, null, null]", dict1);
+ auto values2_null = DictArrayFromJSON(type, "[null, null, null, null]", dict2);
+ auto values1 = DictArrayFromJSON(type, "[0, null, 3, 1]", dict1);
+ auto values2 = DictArrayFromJSON(type, "[2, 1, null, 0]", dict2);
+ auto values3 = DictArrayFromJSON(type, "[0, 1, 2, 3]", dict3);
+
+ // For scalar conditions, we borrow the dictionary of the chosen output (or the first
+ // input when outputting null)
+ CheckScalar("case_when", {MakeStruct({Datum(true), Datum(false)}), values1, values2},
+ values1);
+ CheckScalar("case_when", {MakeStruct({Datum(false), Datum(true)}), values1, values2},
+ values2);
+ CheckScalar("case_when", {MakeStruct({Datum(false), Datum(false)}), values1, values2},
+ values1_null);
+ CheckScalar("case_when", {MakeStruct({Datum(false), Datum(false)}), values2, values1},
+ values2_null);
+
+ // For array conditions, we always borrow the dictionary of the first input
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict1));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2,
values1},
+ DictArrayFromJSON(type, "[0, null, null, 1]", dict1));
+
+ // When mixing dictionaries, we try to map other dictionaries onto the first one
+ // Don't check the scalar cases since we don't remap dictionaries in that case
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]")}), values1,
+ values2},
+ DictArrayFromJSON(type, "[0, null, null, 2]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]"),
+ ArrayFromJSON(boolean(), "[true, false, true, false]")}),
+ values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[false, false, false, false]"),
+ ArrayFromJSON(boolean(), "[true, true, true, true]")}),
+ values1, values3},
+ DictArrayFromJSON(type, "[3, 0, 0, 2]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[null, null, null, true]"),
+ ArrayFromJSON(boolean(), "[true, true, true, true]")}),
+ values1, values3},
+ DictArrayFromJSON(type, "[3, 0, 0, 1]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {
+ MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]")}),
+ DictScalarFromJSON(type, "0", dict1),
+ DictScalarFromJSON(type, "0", dict2),
+ },
+ DictArrayFromJSON(type, "[0, 0, 2, 2]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {
+ MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]"),
+ ArrayFromJSON(boolean(), "[false, false, true, true]")}),
+ DictScalarFromJSON(type, "0", dict1),
+ DictScalarFromJSON(type, "0", dict2),
+ },
+ DictArrayFromJSON(type, "[0, 0, 2, 2]", dict1));
+
+ // If we can't map values from a dictionary, then raise an error
+ // Unmappable value is in the else clause
Review comment:
I'm curious: why don't we unify dictionaries instead? That would seem more useful
to me. I don't see any reason for the first input to have a particular status, is
there?
##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -624,6 +624,250 @@ TYPED_TEST(TestCaseWhenNumeric, ListOfType) {
ArrayFromJSON(type, R"([null, null, null, [6, null]])"));
}
+template <typename Type>
+class TestCaseWhenDict : public ::testing::Test {};
+
+struct JsonDict {
+ std::shared_ptr<DataType> type;
+ std::string value;
+};
+
+TYPED_TEST_SUITE(TestCaseWhenDict, IntegralArrowTypes);
+
+TYPED_TEST(TestCaseWhenDict, Simple) {
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ for (const auto& dict :
+ {JsonDict{utf8(), R"(["a", null, "bc", "def"])"},
+ JsonDict{int64(), "[1, null, 2, 3]"},
+ JsonDict{decimal256(3, 2), R"(["1.23", null, "3.45", "6.78"])"}}) {
+ auto type = dictionary(default_type_instance<TypeParam>(), dict.type);
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict.value);
+ auto values1 = DictArrayFromJSON(type, "[0, null, 3, 1]", dict.value);
+ auto values2 = DictArrayFromJSON(type, "[2, 1, null, 0]", dict.value);
+
+ // Easy case: all arguments have the same dictionary
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2,
values1},
+ DictArrayFromJSON(type, "[0, null, null, 1]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null,
values2, values1},
+ DictArrayFromJSON(type, "[null, null, null, 1]", dict.value));
+ }
+}
+
+TYPED_TEST(TestCaseWhenDict, Mixed) {
+ auto type = dictionary(default_type_instance<TypeParam>(), utf8());
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", null, "bc", "def"])";
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict);
+ auto values1_dict = DictArrayFromJSON(type, "[0, null, 3, 1]", dict);
+ auto values1_decoded = ArrayFromJSON(utf8(), R"(["a", null, "def", null])");
+ auto values2_dict = DictArrayFromJSON(type, "[2, 1, null, 0]", dict);
+ auto values2_decoded = ArrayFromJSON(utf8(), R"(["bc", null, null, "a"])");
+
+ // If we have mixed dictionary/non-dictionary arguments, we decode dictionaries
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_dict, values2_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_decoded, values2_dict},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values1_dict, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values_null, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"([null, null, null, null])"));
+}
+
+TYPED_TEST(TestCaseWhenDict, NestedSimple) {
+ auto make_list = [](const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& backing_array) {
+ EXPECT_OK_AND_ASSIGN(auto result, ListArray::FromArrays(*indices, *backing_array));
+ return result;
+ };
+ auto index_type = default_type_instance<TypeParam>();
+ auto inner_type = dictionary(index_type, utf8());
+ auto type = list(inner_type);
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", "b", "bc", "def"])";
+ auto values_null = make_list(ArrayFromJSON(int32(), "[null, null, null, null, 0]"),
+ DictArrayFromJSON(inner_type, "[]", dict));
+ auto values1_backing = DictArrayFromJSON(inner_type, "[0, null, 3, 1]", dict);
+ auto values2_backing = DictArrayFromJSON(inner_type, "[2, 1, null, 0]", dict);
+ auto values1 = make_list(ArrayFromJSON(int32(), "[0, 2, 2, 3, 4]"), values1_backing);
+ auto values2 = make_list(ArrayFromJSON(int32(), "[0, 1, 2, 2, 4]"), values2_backing);
+
+ CheckScalarNonRecursive(
Review comment:
Why is this calling `CheckScalarNonRecursive` and not `CheckScalar`?
Leave a comment?
##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -568,6 +568,9 @@ struct ValidateArrayFullImpl {
}
Status Visit(const DictionaryType& type) {
+ if (!data.dictionary) {
+ return Status::Invalid("Dictionary array has no dictionary");
+ }
Review comment:
Normally this is already checked in the top-level `ValidateArray`:
https://github.com/apache/arrow/blob/a93ce9907c6f7ae70591b786f040d64e0f7d7109/cpp/src/arrow/array/validate.cc#L375-L382
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -282,6 +294,163 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendEmptyValues(length);
}
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ",
scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ if (!scalar.is_valid) return AppendNulls(n_repeats);
+
+ const auto& dict_ty = internal::checked_cast<const
DictionaryType&>(*scalar.type);
+ const DictionaryScalar& dict_scalar =
+ internal::checked_cast<const DictionaryScalar&>(scalar);
+ const auto& dict = internal::checked_cast<const typename
TypeTraits<T>::ArrayType&>(
+ *dict_scalar.value.dictionary);
+ switch (dict_ty.index_type()->id()) {
+ case Type::UINT8: {
Review comment:
Perhaps try to factor this out as proposed below for `AppendScalar`.
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -282,6 +294,163 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendEmptyValues(length);
}
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ",
scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ if (!scalar.is_valid) return AppendNulls(n_repeats);
+
+ const auto& dict_ty = internal::checked_cast<const
DictionaryType&>(*scalar.type);
+ const DictionaryScalar& dict_scalar =
+ internal::checked_cast<const DictionaryScalar&>(scalar);
+ const auto& dict = internal::checked_cast<const typename
TypeTraits<T>::ArrayType&>(
+ *dict_scalar.value.dictionary);
+ switch (dict_ty.index_type()->id()) {
+ case Type::UINT8: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt8Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT8: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int8Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT16: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt16Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT16: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int16Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT32: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt32Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT32: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int32Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT64: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt64Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT64: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int64Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ default:
+ return Status::TypeError("Invalid index type: ", dict_ty);
+ }
+ return Status::OK();
+ }
+
+ Status AppendScalars(const ScalarVector& scalars) override {
+ for (const auto& scalar : scalars) {
+ ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
+ }
+ return Status::OK();
+ }
+
+ Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
+ // Visit the indices and insert the unpacked values.
+ const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
+ const typename TypeTraits<T>::ArrayType dict(array.dictionary);
+ switch (dict_ty.index_type()->id()) {
Review comment:
Call `Reserve(length)` before this?
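Something along these lines at the top of the method (a sketch; `Reserve` is the existing `ArrayBuilder` capacity hint):
```c++
Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
  // Reserve room for the incoming values up front so the per-element Appends in
  // the switch below don't have to grow the builder repeatedly.
  ARROW_RETURN_NOT_OK(Reserve(length));
  // ... existing cast of array.type and switch on the index type ...
}
```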
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -282,6 +294,163 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendEmptyValues(length);
}
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
+ if (!scalar.type->Equals(type())) {
Review comment:
Do we really want to do this check on every append, or should this be left
to callers?
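For example (a sketch of the alternative, assuming the per-call cost is the concern; `DCHECK` is the debug-only assertion from `arrow/util/logging.h`):
```c++
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
  // Validate only in debug builds; release builds trust the caller to pass a
  // scalar of the builder's exact dictionary type.
  DCHECK(scalar.type->Equals(type()));
  if (!scalar.is_valid) return AppendNulls(n_repeats);
  // ... rest unchanged ...
}
```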
##########
File path: cpp/src/arrow/array/builder_dict.h
##########
@@ -282,6 +294,163 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendEmptyValues(length);
}
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ",
scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ if (!scalar.is_valid) return AppendNulls(n_repeats);
+
+ const auto& dict_ty = internal::checked_cast<const
DictionaryType&>(*scalar.type);
+ const DictionaryScalar& dict_scalar =
+ internal::checked_cast<const DictionaryScalar&>(scalar);
+ const auto& dict = internal::checked_cast<const typename
TypeTraits<T>::ArrayType&>(
+ *dict_scalar.value.dictionary);
+ switch (dict_ty.index_type()->id()) {
+ case Type::UINT8: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt8Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT8: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int8Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT16: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt16Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT16: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int16Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT32: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt32Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT32: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int32Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::UINT64: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const UInt64Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ case Type::INT64: {
+ const auto& value = dict.GetView(
+ internal::checked_cast<const Int64Scalar&>(*dict_scalar.value.index).value);
+ for (int64_t i = 0; i < n_repeats; i++) {
+ ARROW_RETURN_NOT_OK(Append(value));
+ }
+ break;
+ }
+ default:
+ return Status::TypeError("Invalid index type: ", dict_ty);
+ }
+ return Status::OK();
+ }
+
+ Status AppendScalars(const ScalarVector& scalars) override {
+ for (const auto& scalar : scalars) {
+ ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
+ }
+ return Status::OK();
+ }
+
+ Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
+ // Visit the indices and insert the unpacked values.
+ const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
+ const typename TypeTraits<T>::ArrayType dict(array.dictionary);
+ switch (dict_ty.index_type()->id()) {
+ case Type::UINT8: {
+ const uint8_t* values = array.GetValues<uint8_t>(1) + offset;
+ return VisitBitBlocks(
+ array.buffers[0], array.offset + offset, std::min(array.length, length),
+ [&](int64_t position) { return Append(dict.GetView(values[position])); },
+ [&]() { return AppendNull(); });
+ }
Review comment:
Is it possible to factor this out to avoid repetition? For example:
```c++
template <typename IndexType>
struct SliceAppender {
  const IndexType* values;
  Status operator()(const ArrayData& array, int64_t offset, int64_t length) {
    return VisitBitBlocks(
        array.buffers[0], array.offset + offset, length,
        [&](int64_t position) {
          if (dict.IsNull(values[position])) return AppendNull();
          return Append(dict.GetView(values[position]));
        },
        [&]() { return AppendNull(); });
  }
};

case Type::UINT8:
  return SliceAppender<uint8_t>{array.GetValues<uint8_t>(1) + offset}(array, offset, length);
// ...
```
##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -624,6 +624,250 @@ TYPED_TEST(TestCaseWhenNumeric, ListOfType) {
ArrayFromJSON(type, R"([null, null, null, [6, null]])"));
}
+template <typename Type>
+class TestCaseWhenDict : public ::testing::Test {};
+
+struct JsonDict {
+ std::shared_ptr<DataType> type;
+ std::string value;
+};
+
+TYPED_TEST_SUITE(TestCaseWhenDict, IntegralArrowTypes);
+
+TYPED_TEST(TestCaseWhenDict, Simple) {
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ for (const auto& dict :
+ {JsonDict{utf8(), R"(["a", null, "bc", "def"])"},
+ JsonDict{int64(), "[1, null, 2, 3]"},
+ JsonDict{decimal256(3, 2), R"(["1.23", null, "3.45", "6.78"])"}}) {
+ auto type = dictionary(default_type_instance<TypeParam>(), dict.type);
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict.value);
+ auto values1 = DictArrayFromJSON(type, "[0, null, 3, 1]", dict.value);
+ auto values2 = DictArrayFromJSON(type, "[2, 1, null, 0]", dict.value);
+
+ // Easy case: all arguments have the same dictionary
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2,
values1},
+ DictArrayFromJSON(type, "[0, null, null, 1]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null,
values2, values1},
+ DictArrayFromJSON(type, "[null, null, null, 1]", dict.value));
+ }
+}
+
+TYPED_TEST(TestCaseWhenDict, Mixed) {
+ auto type = dictionary(default_type_instance<TypeParam>(), utf8());
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", null, "bc", "def"])";
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict);
+ auto values1_dict = DictArrayFromJSON(type, "[0, null, 3, 1]", dict);
+ auto values1_decoded = ArrayFromJSON(utf8(), R"(["a", null, "def", null])");
+ auto values2_dict = DictArrayFromJSON(type, "[2, 1, null, 0]", dict);
+ auto values2_decoded = ArrayFromJSON(utf8(), R"(["bc", null, null, "a"])");
+
+ // If we have mixed dictionary/non-dictionary arguments, we decode dictionaries
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_dict, values2_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_decoded, values2_dict},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values1_dict, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values_null, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"([null, null, null, null])"));
+}
+
+TYPED_TEST(TestCaseWhenDict, NestedSimple) {
+ auto make_list = [](const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& backing_array) {
+ EXPECT_OK_AND_ASSIGN(auto result, ListArray::FromArrays(*indices, *backing_array));
+ return result;
+ };
+ auto index_type = default_type_instance<TypeParam>();
+ auto inner_type = dictionary(index_type, utf8());
+ auto type = list(inner_type);
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", "b", "bc", "def"])";
+ auto values_null = make_list(ArrayFromJSON(int32(), "[null, null, null, null, 0]"),
+ DictArrayFromJSON(inner_type, "[]", dict));
+ auto values1_backing = DictArrayFromJSON(inner_type, "[0, null, 3, 1]", dict);
+ auto values2_backing = DictArrayFromJSON(inner_type, "[2, 1, null, 0]", dict);
+ auto values1 = make_list(ArrayFromJSON(int32(), "[0, 2, 2, 3, 4]"), values1_backing);
+ auto values2 = make_list(ArrayFromJSON(int32(), "[0, 1, 2, 2, 4]"), values2_backing);
+
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ make_list(ArrayFromJSON(int32(), "[0, 2, 2, null, 2]"),
+ DictArrayFromJSON(inner_type, "[0, null]", R"(["a"])")));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({cond1, cond2}), values1,
+ make_list(ArrayFromJSON(int32(), "[0, 1, null, 2, 4]"),
values2_backing)},
+ make_list(ArrayFromJSON(int32(), "[0, 2, null, null, 2]"),
+ DictArrayFromJSON(inner_type, "[0, null]", R"(["a"])")));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({cond1, cond2}), values1,
+ make_list(ArrayFromJSON(int32(), "[0, 1, null, 2, 4]"),
values2_backing), values1},
+ make_list(ArrayFromJSON(int32(), "[0, 2, null, 2, 3]"),
+ DictArrayFromJSON(inner_type, "[0, null, 1]", R"(["a",
"b"])")));
+
+ CheckScalarNonRecursive(
+ "case_when",
+ {
+ Datum(MakeStruct({cond1, cond2})),
+ Datum(std::make_shared<ListScalar>(
+ DictArrayFromJSON(inner_type, "[0, 1]", dict))),
+ Datum(std::make_shared<ListScalar>(
+ DictArrayFromJSON(inner_type, "[2, 3]", dict))),
+ },
+ make_list(ArrayFromJSON(int32(), "[0, 2, 4, null, 6]"),
+ DictArrayFromJSON(inner_type, "[0, 1, 0, 1, 2, 3]", dict)));
+
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({Datum(true), Datum(false)}), values1,
values2}, values1);
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({Datum(false), Datum(true)}), values1,
values2}, values2);
+ CheckScalarNonRecursive("case_when", {MakeStruct({Datum(false)}), values1,
values2},
+ values2);
+ CheckScalarNonRecursive("case_when",
+ {MakeStruct({Datum(false), Datum(false)}), values1,
values2},
+ values_null);
+}
+
+TYPED_TEST(TestCaseWhenDict, DifferentDictionaries) {
+ auto type = dictionary(default_type_instance<TypeParam>(), utf8());
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict1 = R"(["a", null, "bc", "def"])";
+ auto dict2 = R"(["bc", "foo", null, "a"])";
+ auto dict3 = R"(["def", "a", "a", "bc"])";
+ auto values1_null = DictArrayFromJSON(type, "[null, null, null, null]", dict1);
+ auto values2_null = DictArrayFromJSON(type, "[null, null, null, null]", dict2);
+ auto values1 = DictArrayFromJSON(type, "[0, null, 3, 1]", dict1);
+ auto values2 = DictArrayFromJSON(type, "[2, 1, null, 0]", dict2);
Review comment:
For some reason, it looks like the nulls in the indices are placed at
the same indices as the nulls in the respective dictionaries.
##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -624,6 +624,250 @@ TYPED_TEST(TestCaseWhenNumeric, ListOfType) {
ArrayFromJSON(type, R"([null, null, null, [6, null]])"));
}
+template <typename Type>
+class TestCaseWhenDict : public ::testing::Test {};
+
+struct JsonDict {
+ std::shared_ptr<DataType> type;
+ std::string value;
+};
+
+TYPED_TEST_SUITE(TestCaseWhenDict, IntegralArrowTypes);
+
+TYPED_TEST(TestCaseWhenDict, Simple) {
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ for (const auto& dict :
+ {JsonDict{utf8(), R"(["a", null, "bc", "def"])"},
+ JsonDict{int64(), "[1, null, 2, 3]"},
+ JsonDict{decimal256(3, 2), R"(["1.23", null, "3.45", "6.78"])"}}) {
+ auto type = dictionary(default_type_instance<TypeParam>(), dict.type);
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict.value);
+ auto values1 = DictArrayFromJSON(type, "[0, null, 3, 1]", dict.value);
+ auto values2 = DictArrayFromJSON(type, "[2, 1, null, 0]", dict.value);
+
+ // Easy case: all arguments have the same dictionary
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2,
values1},
+ DictArrayFromJSON(type, "[0, null, null, 1]", dict.value));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null,
values2, values1},
+ DictArrayFromJSON(type, "[null, null, null, 1]", dict.value));
+ }
+}
+
+TYPED_TEST(TestCaseWhenDict, Mixed) {
+ auto type = dictionary(default_type_instance<TypeParam>(), utf8());
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", null, "bc", "def"])";
+ auto values_null = DictArrayFromJSON(type, "[null, null, null, null]", dict);
+ auto values1_dict = DictArrayFromJSON(type, "[0, null, 3, 1]", dict);
+ auto values1_decoded = ArrayFromJSON(utf8(), R"(["a", null, "def", null])");
+ auto values2_dict = DictArrayFromJSON(type, "[2, 1, null, 0]", dict);
+ auto values2_decoded = ArrayFromJSON(utf8(), R"(["bc", null, null, "a"])");
+
+ // If we have mixed dictionary/non-dictionary arguments, we decode dictionaries
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_dict, values2_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1_decoded, values2_dict},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values1_dict, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"(["a", null, null, null])"));
+ CheckScalar("case_when",
+ {MakeStruct({cond1, cond2}), values_null, values2_dict, values1_decoded},
+ ArrayFromJSON(utf8(), R"([null, null, null, null])"));
+}
+
+TYPED_TEST(TestCaseWhenDict, NestedSimple) {
+ auto make_list = [](const std::shared_ptr<Array>& indices,
+ const std::shared_ptr<Array>& backing_array) {
+ EXPECT_OK_AND_ASSIGN(auto result, ListArray::FromArrays(*indices, *backing_array));
+ return result;
+ };
+ auto index_type = default_type_instance<TypeParam>();
+ auto inner_type = dictionary(index_type, utf8());
+ auto type = list(inner_type);
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict = R"(["a", "b", "bc", "def"])";
+ auto values_null = make_list(ArrayFromJSON(int32(), "[null, null, null, null, 0]"),
+ DictArrayFromJSON(inner_type, "[]", dict));
+ auto values1_backing = DictArrayFromJSON(inner_type, "[0, null, 3, 1]", dict);
+ auto values2_backing = DictArrayFromJSON(inner_type, "[2, 1, null, 0]", dict);
+ auto values1 = make_list(ArrayFromJSON(int32(), "[0, 2, 2, 3, 4]"), values1_backing);
+ auto values2 = make_list(ArrayFromJSON(int32(), "[0, 1, 2, 2, 4]"), values2_backing);
+
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ make_list(ArrayFromJSON(int32(), "[0, 2, 2, null, 2]"),
+ DictArrayFromJSON(inner_type, "[0, null]", R"(["a"])")));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({cond1, cond2}), values1,
+ make_list(ArrayFromJSON(int32(), "[0, 1, null, 2, 4]"),
values2_backing)},
+ make_list(ArrayFromJSON(int32(), "[0, 2, null, null, 2]"),
+ DictArrayFromJSON(inner_type, "[0, null]", R"(["a"])")));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({cond1, cond2}), values1,
+ make_list(ArrayFromJSON(int32(), "[0, 1, null, 2, 4]"),
values2_backing), values1},
+ make_list(ArrayFromJSON(int32(), "[0, 2, null, 2, 3]"),
+ DictArrayFromJSON(inner_type, "[0, null, 1]", R"(["a",
"b"])")));
+
+ CheckScalarNonRecursive(
+ "case_when",
+ {
+ Datum(MakeStruct({cond1, cond2})),
+ Datum(std::make_shared<ListScalar>(
+ DictArrayFromJSON(inner_type, "[0, 1]", dict))),
+ Datum(std::make_shared<ListScalar>(
+ DictArrayFromJSON(inner_type, "[2, 3]", dict))),
+ },
+ make_list(ArrayFromJSON(int32(), "[0, 2, 4, null, 6]"),
+ DictArrayFromJSON(inner_type, "[0, 1, 0, 1, 2, 3]", dict)));
+
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({Datum(true), Datum(false)}), values1,
values2}, values1);
+ CheckScalarNonRecursive(
+ "case_when", {MakeStruct({Datum(false), Datum(true)}), values1,
values2}, values2);
+ CheckScalarNonRecursive("case_when", {MakeStruct({Datum(false)}), values1,
values2},
+ values2);
+ CheckScalarNonRecursive("case_when",
+ {MakeStruct({Datum(false), Datum(false)}), values1,
values2},
+ values_null);
+}
+
+TYPED_TEST(TestCaseWhenDict, DifferentDictionaries) {
+ auto type = dictionary(default_type_instance<TypeParam>(), utf8());
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto dict1 = R"(["a", null, "bc", "def"])";
+ auto dict2 = R"(["bc", "foo", null, "a"])";
+ auto dict3 = R"(["def", "a", "a", "bc"])";
+ auto values1_null = DictArrayFromJSON(type, "[null, null, null, null]", dict1);
+ auto values2_null = DictArrayFromJSON(type, "[null, null, null, null]", dict2);
+ auto values1 = DictArrayFromJSON(type, "[0, null, 3, 1]", dict1);
+ auto values2 = DictArrayFromJSON(type, "[2, 1, null, 0]", dict2);
+ auto values3 = DictArrayFromJSON(type, "[0, 1, 2, 3]", dict3);
+
+ // For scalar conditions, we borrow the dictionary of the chosen output (or the first
+ // input when outputting null)
+ CheckScalar("case_when", {MakeStruct({Datum(true), Datum(false)}), values1, values2},
+ values1);
+ CheckScalar("case_when", {MakeStruct({Datum(false), Datum(true)}), values1, values2},
+ values2);
+ CheckScalar("case_when", {MakeStruct({Datum(false), Datum(false)}), values1, values2},
+ values1_null);
+ CheckScalar("case_when", {MakeStruct({Datum(false), Datum(false)}), values2, values1},
+ values2_null);
+
+ // For array conditions, we always borrow the dictionary of the first input
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict1));
+ CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2,
values1},
+ DictArrayFromJSON(type, "[0, null, null, 1]", dict1));
+
+ // When mixing dictionaries, we try to map other dictionaries onto the first one
+ // Don't check the scalar cases since we don't remap dictionaries in that case
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]")}), values1,
+ values2},
+ DictArrayFromJSON(type, "[0, null, null, 2]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]"),
+ ArrayFromJSON(boolean(), "[true, false, true, false]")}),
+ values1, values2},
+ DictArrayFromJSON(type, "[0, null, null, null]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[false, false, false, false]"),
+ ArrayFromJSON(boolean(), "[true, true, true, true]")}),
+ values1, values3},
+ DictArrayFromJSON(type, "[3, 0, 0, 2]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[null, null, null, true]"),
+ ArrayFromJSON(boolean(), "[true, true, true, true]")}),
+ values1, values3},
+ DictArrayFromJSON(type, "[3, 0, 0, 1]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {
+ MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]")}),
+ DictScalarFromJSON(type, "0", dict1),
+ DictScalarFromJSON(type, "0", dict2),
+ },
+ DictArrayFromJSON(type, "[0, 0, 2, 2]", dict1));
+ CheckScalarNonRecursive(
+ "case_when",
+ {
+ MakeStruct({ArrayFromJSON(boolean(), "[true, true, false, false]"),
+ ArrayFromJSON(boolean(), "[false, false, true, true]")}),
+ DictScalarFromJSON(type, "0", dict1),
+ DictScalarFromJSON(type, "0", dict2),
+ },
+ DictArrayFromJSON(type, "[0, 0, 2, 2]", dict1));
+
+ // If we can't map values from a dictionary, then raise an error
+ // Unmappable value is in the else clause
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "Cannot map dictionary index 1 at position 1 to the common
dictionary"),
+ CallFunction(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[false, false, false,
false]")}),
+ values1, values2}));
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr("Cannot map dictionary index 1 to the common
dictionary"),
+ CallFunction(
+ "case_when",
+ {MakeStruct({ArrayFromJSON(boolean(), "[false, false, false,
false]")}),
+ values1, DictScalarFromJSON(type, "1", dict2)}));
+ // Unmappable value is in a branch (test multiple times to ensure coverage
of branches
+ // in impl)
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "Cannot map dictionary index 1 at position 1 to the common
dictionary"),
+ CallFunction("case_when",
+ {MakeStruct({Datum(false),
+ ArrayFromJSON(boolean(), "[true, true, true,
true]")}),
+ values1, values2}));
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "Cannot map dictionary index 1 at position 1 to the common
dictionary"),
+ CallFunction("case_when",
+ {MakeStruct({Datum(false),
+ ArrayFromJSON(boolean(), "[false, true, false,
false]")}),
+ values1, values2}));
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "Cannot map dictionary index 1 at position 1 to the common
dictionary"),
+ CallFunction("case_when",
+ {MakeStruct({Datum(false),
+ ArrayFromJSON(boolean(), "[null, true, null,
null]")}),
+ values1, values2}));
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr("Cannot map dictionary index 1 to the common
dictionary"),
+ CallFunction("case_when",
+ {MakeStruct({Datum(false),
+ ArrayFromJSON(boolean(), "[true, true, true,
null]")}),
+ values1, DictScalarFromJSON(type, "1", dict2)}));
+
+ // ...or optionally, emit null
+
+ // TODO: this is not implemented yet
Review comment:
I'm not sure I understand what this TODO is for. Emitting a null when
some option is enabled?