bkietz commented on a change in pull request #9323: URL: https://github.com/apache/arrow/pull/9323#discussion_r579306832
########## File path: cpp/src/arrow/compute/kernels/vector_hash.cc ########## @@ -529,6 +578,7 @@ std::unique_ptr<KernelState> DictionaryHashInit(KernelContext* ctx, DCHECK(false) << "Unsupported dictionary index type"; break; } + DictionaryEncodeOptions options = DictionaryEncodeOptions::Defaults(); Review comment: Seems unused: ```suggestion ``` ########## File path: cpp/src/arrow/compute/kernels/vector_hash.cc ########## @@ -147,6 +152,8 @@ class ValueCountsAction final : ActionBase { } } + bool ShouldEncodeNulls() { return true; } Review comment: ```suggestion constexpr bool ShouldEncodeNulls() const { return true; } ``` ########## File path: cpp/src/arrow/dataset/partition.cc ########## @@ -625,8 +654,27 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr<Int32Array>* fused_indices) { + if (column.type()->id() == Type::DICTIONARY) { + if (column.null_count() != 0) { + // TODO Optimize this by allowign DictionaryEncode to transfer a null-masked Review comment: please include a username or (preferably) a JIRA in TODO comments ```suggestion // TODO($FOLLOW_UP) Optimize this by allowing DictionaryEncode to transfer a null-masked ``` ########## File path: cpp/src/arrow/dataset/partition.cc ########## @@ -410,21 +434,26 @@ std::shared_ptr<PartitioningFactory> DirectoryPartitioning::MakeFactory( } util::optional<KeyValuePartitioning::Key> HivePartitioning::ParseKey( - const std::string& segment) { + const std::string& segment, const std::string& null_fallback) { auto name_end = string_view(segment).find_first_of('='); + // Not round-trippable if (name_end == string_view::npos) { return util::nullopt; } - return Key{segment.substr(0, name_end), segment.substr(name_end + 1)}; + auto value = segment.substr(name_end + 1); + if (value == null_fallback) { + return Key{segment.substr(0, name_end), "", true}; + } + return Key{segment.substr(0, name_end), segment.substr(name_end + 1), false}; Review comment: ```suggestion auto name = segment.substr(0, name_end); auto value = segment.substr(name_end + 1); if (value == null_fallback) { return Key{name, "", true}; } return Key{name, value, false}; ``` ########## File path: cpp/src/arrow/dataset/partition.h ########## @@ -120,6 +125,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { /// of a scalar value struct Key { std::string name, value; + bool null; Review comment: nit: ```suggestion std::string name; std::string util::optional<value>; ``` ########## File path: cpp/src/arrow/compute/kernels/vector_hash_test.cc ########## @@ -305,6 +305,11 @@ TEST_F(TestHashKernel, ValueCountsBoolean) { ArrayFromJSON(boolean(), "[false]"), ArrayFromJSON(int64(), "[2]")); } +TEST_F(TestHashKernel, ValueCountsNull) { + CheckValueCounts<NullType, std::nullptr_t>( + null(), {nullptr, nullptr, nullptr}, {true, false, true}, {nullptr}, {false}, {3}); Review comment: For improved readability, please prefer JSON test arrays ```suggestion CheckValueCounts(ArrayFromJSON(null(), "[null, null, null]"), ArrayFromJSON(null(), "[null]"), ArrayFromJSON(int64(), "[3]")); ``` In particular, note that the from-vector case attempts to construct a null array with only the middle slot null ########## File path: cpp/src/arrow/dataset/partition_test.cc ########## @@ -322,8 +397,9 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { } Review comment: Please add a test asserting that `{"/alpha=1", "/alpha=_FALLBACK_STRING_"}` is inferred as integer ########## File path: cpp/src/arrow/dataset/partition_test.cc ########## @@ -103,6 +137,32 @@ class TestPartitioning : public ::testing::Test { std::shared_ptr<Schema> written_schema_; }; +TEST_F(TestPartitioning, Partition) { + auto partition_schema = schema({field("a", int32()), field("b", utf8())}); + auto schema_ = schema({field("a", int32()), field("b", utf8()), field("c", uint32())}); + auto remaining_schema = schema({field("c", uint32())}); + auto partitioning = std::make_shared<DirectoryPartitioning>(partition_schema); + std::string json = R"([{"a": 3, "b": "x", "c": 0}, + {"a": 3, "b": "x", "c": 1}, + {"a": 1, "b": null, "c": 2}, + {"a": null, "b": null, "c": 3}, + {"a": null, "b": "z", "c": 4}, + {"a": null, "b": null, "c": 5} + ])"; + std::vector<std::string> expected_batches = {R"([{"c": 0}, {"c": 1}])", R"([{"c": 2}])", + R"([{"c": 3}, {"c": 5}])", + R"([{"c": 4}])"}; + std::vector<Expression> expected_expressions = { + and_(equal(field_ref("a"), literal(3)), equal(field_ref("b"), literal("x"))), + and_(equal(field_ref("a"), literal(1)), is_null(field_ref("b"))), + and_(is_null(field_ref("a")), is_null(field_ref("b"))), + and_(is_null(field_ref("a")), equal(field_ref("b"), literal("z")))}; + AssertPartition(partitioning, schema_, json, remaining_schema, expected_batches, + expected_expressions); +} + +TEST_F(TestPartitioning, StructDictionaryNull) {} + Review comment: ```suggestion ``` ########## File path: cpp/src/arrow/compute/kernels/vector_hash.cc ########## @@ -687,11 +738,14 @@ void RegisterVectorHash(FunctionRegistry* registry) { // ---------------------------------------------------------------------- // dictionary_encode + const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults(); Review comment: This needs to have static storage duration; please move it into the anonymous namespace next to `dictionary_encode_doc` ########## File path: python/pyarrow/_dataset.pyx ########## @@ -1580,6 +1584,8 @@ cdef class HivePartitioning(Partitioning): corresponding entry of `dictionaries` must be an array containing every value which may be taken by the corresponding column or an error will be raised in parsing. + null_fallback : str Review comment: ```suggestion null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" ``` ########## File path: cpp/src/arrow/dataset/expression.h ########## @@ -159,10 +162,27 @@ Expression call(std::string function, std::vector<Expression> arguments, ARROW_DS_EXPORT std::vector<FieldRef> FieldsInExpression(const Expression&); +/// Represents either a concrete value or a hint that a field is valid/invalid +struct KnownFieldValue { Review comment: Instead, let's use a null scalar to indicate that a field is known to be null. In particular, this allows us to maintain division of responsibilities between known value replacement and constant folding, and minimizes special cases which need to be recognized and handled in simplified expressions. This is a significant change so I've assembled a PR summarizing it: https://github.com/westonpace/arrow/pull/5 ########## File path: cpp/src/arrow/dataset/expression.h ########## @@ -135,6 +135,9 @@ inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equ ARROW_DS_EXPORT Expression literal(Datum lit); +ARROW_DS_EXPORT +Expression null_literal(const std::shared_ptr<DataType>& type); Review comment: Since this is only used in tests, please move it to `expression_test.cc` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org