bkietz commented on a change in pull request #9323: URL: https://github.com/apache/arrow/pull/9323#discussion_r581371432
########## File path: cpp/src/arrow/dataset/partition_test.cc ########## @@ -77,6 +78,39 @@ class TestPartitioning : public ::testing::Test { ASSERT_OK_AND_ASSIGN(partitioning_, factory_->Finish(actual)); } + void AssertPartition(const std::shared_ptr<Partitioning> partitioning, + const std::shared_ptr<RecordBatch> full_batch, + const RecordBatchVector& expected_batches, + const std::vector<Expression>& expected_expressions) { + ASSERT_OK_AND_ASSIGN(auto partition_results, partitioning->Partition(full_batch)); + std::shared_ptr<RecordBatch> rest = full_batch; Review comment: Unused: ```suggestion ``` ########## File path: cpp/src/arrow/dataset/partition.cc ########## @@ -147,7 +151,9 @@ Result<Expression> KeyValuePartitioning::ConvertKey(const Key& key) const { std::shared_ptr<Scalar> converted; - if (field->type()->id() == Type::DICTIONARY) { + if (!key.value.has_value()) { + return is_null(field_ref(field->name())); + } else if (field->type()->id() == Type::DICTIONARY) { Review comment: nit: ```suggestion } if (field->type()->id() == Type::DICTIONARY) { ``` ########## File path: cpp/src/arrow/compute/kernels/vector_hash_test.cc ########## @@ -542,6 +547,12 @@ TEST_F(TestHashKernel, UniqueDecimal) { {true, false, true, true}, expected, {1, 0, 1}); } +TEST_F(TestHashKernel, UniqueNull) { + CheckUnique<NullType, std::nullptr_t>(null(), {nullptr, nullptr}, {false, true}, + {nullptr}, {false}); + CheckUnique<NullType, std::nullptr_t>(null(), {}, {}, {}, {}); Review comment: ```suggestion CheckUnique(ArrayFromJSON(null(), "[null, null, null]"), ArrayFromJSON(null(), "[null]")); CheckUnique(ArrayFromJSON(null(), "[]"), ArrayFromJSON(null(), "[]")); ``` ########## File path: cpp/src/arrow/dataset/partition_test.cc ########## @@ -282,9 +348,16 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { equal(field_ref("alpha"), literal(0))), "alpha=0/beta=3.25"); AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0"); Review comment: Since a null valued partition key is semantically equivalent to an absent one, we should ensure they format identically. I've created ARROW-11762 for follow up ########## File path: cpp/src/arrow/dataset/partition.cc ########## @@ -625,8 +650,27 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr<Int32Array>* fused_indices) { + if (column.type()->id() == Type::DICTIONARY) { + if (column.null_count() != 0) { + // TODO(ARROW-11732) Optimize this by allowign DictionaryEncode to transfer a Review comment: ```suggestion // TODO(ARROW-11732) Optimize this by allowing DictionaryEncode to transfer a ``` ########## File path: cpp/src/arrow/dataset/partition.cc ########## @@ -287,8 +303,13 @@ class KeyValuePartitioningFactory : public PartitioningFactory { return it_inserted.first->second; } - Status InsertRepr(const std::string& name, util::string_view repr) { - return InsertRepr(GetOrInsertField(name), repr); + Status InsertRepr(const std::string& name, util::optional<string_view> repr) { + auto field_index = GetOrInsertField(name); + if (repr.has_value()) { + return InsertRepr(field_index, *repr); + } else { + return Status::OK(); + } Review comment: ```suggestion if (repr.has_value()) { auto field_index = GetOrInsertField(name); return InsertRepr(field_index, *repr); } return Status::OK(); ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org