bkietz commented on a change in pull request #9323:
URL: https://github.com/apache/arrow/pull/9323#discussion_r579306832
##########
File path: cpp/src/arrow/compute/kernels/vector_hash.cc
##########
@@ -529,6 +578,7 @@ std::unique_ptr<KernelState>
DictionaryHashInit(KernelContext* ctx,
DCHECK(false) << "Unsupported dictionary index type";
break;
}
+ DictionaryEncodeOptions options = DictionaryEncodeOptions::Defaults();
Review comment:
Seems unused:
```suggestion
```
##########
File path: cpp/src/arrow/compute/kernels/vector_hash.cc
##########
@@ -147,6 +152,8 @@ class ValueCountsAction final : ActionBase {
}
}
+ bool ShouldEncodeNulls() { return true; }
Review comment:
```suggestion
constexpr bool ShouldEncodeNulls() const { return true; }
```
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -625,8 +654,27 @@ class StructDictionary {
private:
Status AddOne(Datum column, std::shared_ptr<Int32Array>* fused_indices) {
+ if (column.type()->id() == Type::DICTIONARY) {
+ if (column.null_count() != 0) {
+ // TODO Optimize this by allowign DictionaryEncode to transfer a
null-masked
Review comment:
please include a username or (preferably) a JIRA in TODO comments
```suggestion
// TODO($FOLLOW_UP) Optimize this by allowing DictionaryEncode to
transfer a null-masked
```
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -410,21 +434,26 @@ std::shared_ptr<PartitioningFactory>
DirectoryPartitioning::MakeFactory(
}
util::optional<KeyValuePartitioning::Key> HivePartitioning::ParseKey(
- const std::string& segment) {
+ const std::string& segment, const std::string& null_fallback) {
auto name_end = string_view(segment).find_first_of('=');
+ // Not round-trippable
if (name_end == string_view::npos) {
return util::nullopt;
}
- return Key{segment.substr(0, name_end), segment.substr(name_end + 1)};
+ auto value = segment.substr(name_end + 1);
+ if (value == null_fallback) {
+ return Key{segment.substr(0, name_end), "", true};
+ }
+ return Key{segment.substr(0, name_end), segment.substr(name_end + 1), false};
Review comment:
```suggestion
auto name = segment.substr(0, name_end);
auto value = segment.substr(name_end + 1);
if (value == null_fallback) {
return Key{name, "", true};
}
return Key{name, value, false};
```
##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -120,6 +125,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public
Partitioning {
/// of a scalar value
struct Key {
std::string name, value;
+ bool null;
Review comment:
nit:
```suggestion
std::string name;
std::string util::optional<value>;
```
##########
File path: cpp/src/arrow/compute/kernels/vector_hash_test.cc
##########
@@ -305,6 +305,11 @@ TEST_F(TestHashKernel, ValueCountsBoolean) {
ArrayFromJSON(boolean(), "[false]"), ArrayFromJSON(int64(),
"[2]"));
}
+TEST_F(TestHashKernel, ValueCountsNull) {
+ CheckValueCounts<NullType, std::nullptr_t>(
+ null(), {nullptr, nullptr, nullptr}, {true, false, true}, {nullptr},
{false}, {3});
Review comment:
For improved readability, please prefer JSON test arrays
```suggestion
CheckValueCounts(ArrayFromJSON(null(), "[null, null, null]"),
ArrayFromJSON(null(), "[null]"), ArrayFromJSON(int64(),
"[3]"));
```
In particular, note that the from-vector case attempts to construct a null
array with only the middle slot null
##########
File path: cpp/src/arrow/dataset/partition_test.cc
##########
@@ -322,8 +397,9 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) {
}
Review comment:
Please add a test asserting that `{"/alpha=1",
"/alpha=_FALLBACK_STRING_"}` is inferred as integer
##########
File path: cpp/src/arrow/dataset/partition_test.cc
##########
@@ -103,6 +137,32 @@ class TestPartitioning : public ::testing::Test {
std::shared_ptr<Schema> written_schema_;
};
+TEST_F(TestPartitioning, Partition) {
+ auto partition_schema = schema({field("a", int32()), field("b", utf8())});
+ auto schema_ = schema({field("a", int32()), field("b", utf8()), field("c",
uint32())});
+ auto remaining_schema = schema({field("c", uint32())});
+ auto partitioning =
std::make_shared<DirectoryPartitioning>(partition_schema);
+ std::string json = R"([{"a": 3, "b": "x", "c": 0},
+ {"a": 3, "b": "x", "c": 1},
+ {"a": 1, "b": null, "c": 2},
+ {"a": null, "b": null, "c": 3},
+ {"a": null, "b": "z", "c": 4},
+ {"a": null, "b": null, "c": 5}
+ ])";
+ std::vector<std::string> expected_batches = {R"([{"c": 0}, {"c": 1}])",
R"([{"c": 2}])",
+ R"([{"c": 3}, {"c": 5}])",
+ R"([{"c": 4}])"};
+ std::vector<Expression> expected_expressions = {
+ and_(equal(field_ref("a"), literal(3)), equal(field_ref("b"),
literal("x"))),
+ and_(equal(field_ref("a"), literal(1)), is_null(field_ref("b"))),
+ and_(is_null(field_ref("a")), is_null(field_ref("b"))),
+ and_(is_null(field_ref("a")), equal(field_ref("b"), literal("z")))};
+ AssertPartition(partitioning, schema_, json, remaining_schema,
expected_batches,
+ expected_expressions);
+}
+
+TEST_F(TestPartitioning, StructDictionaryNull) {}
+
Review comment:
```suggestion
```
##########
File path: cpp/src/arrow/compute/kernels/vector_hash.cc
##########
@@ -687,11 +738,14 @@ void RegisterVectorHash(FunctionRegistry* registry) {
// ----------------------------------------------------------------------
// dictionary_encode
+ const auto kDefaultDictionaryEncodeOptions =
DictionaryEncodeOptions::Defaults();
Review comment:
This needs to have static storage duration; please move it into the
anonymous namespace next to `dictionary_encode_doc`
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1580,6 +1584,8 @@ cdef class HivePartitioning(Partitioning):
corresponding entry of `dictionaries` must be an array containing
every value which may be taken by the corresponding column or an
error will be raised in parsing.
+ null_fallback : str
Review comment:
```suggestion
null_fallback : str, default "__HIVE_DEFAULT_PARTITION__"
```
##########
File path: cpp/src/arrow/dataset/expression.h
##########
@@ -159,10 +162,27 @@ Expression call(std::string function,
std::vector<Expression> arguments,
ARROW_DS_EXPORT
std::vector<FieldRef> FieldsInExpression(const Expression&);
+/// Represents either a concrete value or a hint that a field is valid/invalid
+struct KnownFieldValue {
Review comment:
Instead, let's use a null scalar to indicate that a field is known to be
null. In particular, this allows us to maintain division of responsibilities
between known value replacement and constant folding, and minimizes special
cases which need to be recognized and handled in simplified expressions. This
is a significant change so I've assembled a PR summarizing it:
https://github.com/westonpace/arrow/pull/5
##########
File path: cpp/src/arrow/dataset/expression.h
##########
@@ -135,6 +135,9 @@ inline bool operator!=(const Expression& l, const
Expression& r) { return !l.Equ
ARROW_DS_EXPORT
Expression literal(Datum lit);
+ARROW_DS_EXPORT
+Expression null_literal(const std::shared_ptr<DataType>& type);
Review comment:
Since this is only used in tests, please move it to `expression_test.cc`
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]