[GitHub] [arrow] bkietz commented on a change in pull request #9323: ARROW-10438: [C++][Dataset] Partitioning::Format on nulls

GitBox Fri, 19 Feb 2021 10:15:48 -0800


bkietz commented on a change in pull request #9323:
URL: https://github.com/apache/arrow/pull/9323#discussion_r579306832




##########
File path: cpp/src/arrow/compute/kernels/vector_hash.cc
##########
@@ -529,6 +578,7 @@ std::unique_ptr<KernelState> 
DictionaryHashInit(KernelContext* ctx,
       DCHECK(false) << "Unsupported dictionary index type";
       break;
   }
+  DictionaryEncodeOptions options = DictionaryEncodeOptions::Defaults();

Review comment:
       Seems unused:
   ```suggestion
   ```

##########
File path: cpp/src/arrow/compute/kernels/vector_hash.cc
##########
@@ -147,6 +152,8 @@ class ValueCountsAction final : ActionBase {
     }
   }
 
+  bool ShouldEncodeNulls() { return true; }

Review comment:
       ```suggestion
     constexpr bool ShouldEncodeNulls() const { return true; }
   ```

##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -625,8 +654,27 @@ class StructDictionary {
 
  private:
   Status AddOne(Datum column, std::shared_ptr<Int32Array>* fused_indices) {
+    if (column.type()->id() == Type::DICTIONARY) {
+      if (column.null_count() != 0) {
+        // TODO Optimize this by allowign DictionaryEncode to transfer a 
null-masked

Review comment:
       please include a username or (preferably) a JIRA in TODO comments
   ```suggestion
           // TODO($FOLLOW_UP) Optimize this by allowing DictionaryEncode to 
transfer a null-masked
   ```

##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -410,21 +434,26 @@ std::shared_ptr<PartitioningFactory> 
DirectoryPartitioning::MakeFactory(
 }
 
 util::optional<KeyValuePartitioning::Key> HivePartitioning::ParseKey(
-    const std::string& segment) {
+    const std::string& segment, const std::string& null_fallback) {
   auto name_end = string_view(segment).find_first_of('=');
+  // Not round-trippable
   if (name_end == string_view::npos) {
     return util::nullopt;
   }
 
-  return Key{segment.substr(0, name_end), segment.substr(name_end + 1)};
+  auto value = segment.substr(name_end + 1);
+  if (value == null_fallback) {
+    return Key{segment.substr(0, name_end), "", true};
+  }
+  return Key{segment.substr(0, name_end), segment.substr(name_end + 1), false};

Review comment:
       ```suggestion
     auto name = segment.substr(0, name_end);
     auto value = segment.substr(name_end + 1);
     if (value == null_fallback) {
       return Key{name, "", true};
     }
     return Key{name, value, false};
   ```

##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -120,6 +125,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public 
Partitioning {
   /// of a scalar value
   struct Key {
     std::string name, value;
+    bool null;

Review comment:
       nit:
   ```suggestion
       std::string name;
       std::string util::optional<value>;
   ```

##########
File path: cpp/src/arrow/compute/kernels/vector_hash_test.cc
##########
@@ -305,6 +305,11 @@ TEST_F(TestHashKernel, ValueCountsBoolean) {
                    ArrayFromJSON(boolean(), "[false]"), ArrayFromJSON(int64(), 
"[2]"));
 }
 
+TEST_F(TestHashKernel, ValueCountsNull) {
+  CheckValueCounts<NullType, std::nullptr_t>(
+      null(), {nullptr, nullptr, nullptr}, {true, false, true}, {nullptr}, 
{false}, {3});

Review comment:
       For improved readability, please prefer JSON test arrays
   ```suggestion
     CheckValueCounts(ArrayFromJSON(null(), "[null, null, null]"),
                      ArrayFromJSON(null(), "[null]"), ArrayFromJSON(int64(), 
"[3]"));
   ```
   In particular, note that the from-vector case attempts to construct a null 
array with only the middle slot null

##########
File path: cpp/src/arrow/dataset/partition_test.cc
##########
@@ -322,8 +397,9 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) {
 }
 

Review comment:
       Please add a test asserting that `{"/alpha=1", 
"/alpha=_FALLBACK_STRING_"}` is inferred as integer

##########
File path: cpp/src/arrow/dataset/partition_test.cc
##########
@@ -103,6 +137,32 @@ class TestPartitioning : public ::testing::Test {
   std::shared_ptr<Schema> written_schema_;
 };
 
+TEST_F(TestPartitioning, Partition) {
+  auto partition_schema = schema({field("a", int32()), field("b", utf8())});
+  auto schema_ = schema({field("a", int32()), field("b", utf8()), field("c", 
uint32())});
+  auto remaining_schema = schema({field("c", uint32())});
+  auto partitioning = 
std::make_shared<DirectoryPartitioning>(partition_schema);
+  std::string json = R"([{"a": 3,    "b": "x",  "c": 0},
+                         {"a": 3,    "b": "x",  "c": 1},
+                         {"a": 1,    "b": null, "c": 2},
+                         {"a": null, "b": null, "c": 3},
+                         {"a": null, "b": "z",  "c": 4},
+                         {"a": null, "b": null, "c": 5}
+                       ])";
+  std::vector<std::string> expected_batches = {R"([{"c": 0}, {"c": 1}])", 
R"([{"c": 2}])",
+                                               R"([{"c": 3}, {"c": 5}])",
+                                               R"([{"c": 4}])"};
+  std::vector<Expression> expected_expressions = {
+      and_(equal(field_ref("a"), literal(3)), equal(field_ref("b"), 
literal("x"))),
+      and_(equal(field_ref("a"), literal(1)), is_null(field_ref("b"))),
+      and_(is_null(field_ref("a")), is_null(field_ref("b"))),
+      and_(is_null(field_ref("a")), equal(field_ref("b"), literal("z")))};
+  AssertPartition(partitioning, schema_, json, remaining_schema, 
expected_batches,
+                  expected_expressions);
+}
+
+TEST_F(TestPartitioning, StructDictionaryNull) {}
+

Review comment:
       ```suggestion
   ```

##########
File path: cpp/src/arrow/compute/kernels/vector_hash.cc
##########
@@ -687,11 +738,14 @@ void RegisterVectorHash(FunctionRegistry* registry) {
   // ----------------------------------------------------------------------
   // dictionary_encode
 
+  const auto kDefaultDictionaryEncodeOptions = 
DictionaryEncodeOptions::Defaults();

Review comment:
       This needs to have static storage duration; please move it into the 
anonymous namespace next to `dictionary_encode_doc`

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1580,6 +1584,8 @@ cdef class HivePartitioning(Partitioning):
         corresponding entry of `dictionaries` must be an array containing
         every value which may be taken by the corresponding column or an
         error will be raised in parsing.
+    null_fallback : str

Review comment:
       ```suggestion
       null_fallback : str, default "__HIVE_DEFAULT_PARTITION__"
   ```

##########
File path: cpp/src/arrow/dataset/expression.h
##########
@@ -159,10 +162,27 @@ Expression call(std::string function, 
std::vector<Expression> arguments,
 ARROW_DS_EXPORT
 std::vector<FieldRef> FieldsInExpression(const Expression&);
 
+/// Represents either a concrete value or a hint that a field is valid/invalid
+struct KnownFieldValue {

Review comment:
       Instead, let's use a null scalar to indicate that a field is known to be 
null. In particular, this allows us to maintain division of responsibilities 
between known value replacement and constant folding, and minimizes special 
cases which need to be recognized and handled in simplified expressions. This 
is a significant change so I've assembled a PR summarizing it: 
https://github.com/westonpace/arrow/pull/5

##########
File path: cpp/src/arrow/dataset/expression.h
##########
@@ -135,6 +135,9 @@ inline bool operator!=(const Expression& l, const 
Expression& r) { return !l.Equ
 ARROW_DS_EXPORT
 Expression literal(Datum lit);
 
+ARROW_DS_EXPORT
+Expression null_literal(const std::shared_ptr<DataType>& type);

Review comment:
       Since this is only used in tests, please move it to `expression_test.cc`




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] bkietz commented on a change in pull request #9323: ARROW-10438: [C++][Dataset] Partitioning::Format on nulls

Reply via email to