bkietz commented on a change in pull request #9677:
URL: https://github.com/apache/arrow/pull/9677#discussion_r592576498



##########
File path: cpp/src/arrow/dataset/expression.cc
##########
@@ -33,6 +33,7 @@
 #include "arrow/util/optional.h"
 #include "arrow/util/string.h"
 #include "arrow/util/value_parsing.h"
+#include "arrow/visitor_inline.h"

Review comment:
       Is this used?

##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -300,10 +302,18 @@ class KeyValuePartitioningFactory : public 
PartitioningFactory {
     return repr_memos_[index]->GetOrInsert<StringType>(repr, &dummy);
   }
 
-  Result<std::shared_ptr<Schema>> DoInpsect() {
+  Result<std::shared_ptr<Schema>> DoInspect() {

Review comment:
       :facepalm: 

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1566,7 +1571,14 @@ cdef class DirectoryPartitioning(Partitioning):
         if infer_dictionary:
             c_options.infer_dictionary = True
 
-        c_field_names = [tobytes(s) for s in field_names]
+        if schema:
+            c_options.schema = pyarrow_unwrap_schema(schema)
+            c_field_names = [tobytes(f.name) for f in schema]
+        elif not field_names:
+            raise TypeError(
+                "field_names must be passed if schema is not given")

Review comment:
       ```suggestion
               raise ValueError(
                   "Neither field_names nor schema was passed; cannot infer 
field_names")
   ```
   ```

##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -90,6 +90,10 @@ struct PartitioningFactoryOptions {
   /// columns, and Expressions parsed by the finished Partitioning will include
   /// dictionaries of all unique inspected values for each field.
   bool infer_dictionary = false;
+  /// Optionally, an expected schema can be provided, in which case inference
+  /// will only check discovered fields against the schema and update internal
+  /// state (such as dictionaries).
+  std::shared_ptr<Schema> schema = nullptr;

Review comment:
       ```suggestion
     std::shared_ptr<Schema> schema;
   ```
   `nullptr` may not be used in headers and is the default for smart ptrs anyway

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1547,6 +1549,9 @@ cdef class DirectoryPartitioning(Partitioning):
             Synonymous with infer_dictionary for backwards compatibility with
             1.0: setting this to -1 or None is equivalent to passing
             infer_dictionary=True.
+        schema : Schema, default None
+            Do not infer the schema, but confirm that partition values match
+            this schema and infer dictionary values as appropriate.

Review comment:
       ```suggestion
               Use this schema instead of inferring a schema from partition 
values.
               Partition values will be validated against this schema before 
accumulation
               into the Partitioning's dictionary.
   ```

##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -317,15 +327,34 @@ class KeyValuePartitioningFactory : public 
PartitioningFactory {
                                "'; couldn't infer type");
       }
 
-      // try casting to int32, otherwise bail and just use the string reprs
-      auto dict = compute::Cast(reprs, int32()).ValueOr(reprs).make_array();
-      auto type = dict->type();
-      if (options_.infer_dictionary) {
-        // wrap the inferred type in dictionary()
-        type = dictionary(int32(), std::move(type));
+      std::shared_ptr<Field> current_field;
+      std::shared_ptr<Array> dict;
+      if (options_.schema) {
+        // if we have a schema, use the schema type.
+        current_field = options_.schema->field(index);
+        auto cast_target = current_field->type();
+        if (is_dictionary(cast_target->id())) {
+          cast_target = 
checked_pointer_cast<DictionaryType>(cast_target)->value_type();
+        }
+        auto maybe_dict = compute::Cast(reprs, cast_target);
+        if (!maybe_dict.ok()) {
+          return Status::Invalid("Could not cast segments for partition field 
",
+                                 current_field->name(), " to requested type ",
+                                 current_field->type()->ToString(),
+                                 " because: ", maybe_dict.status());
+        }
+        dict = maybe_dict.ValueOrDie().make_array();
+      } else {
+        // try casting to int32, otherwise bail and just use the string reprs
+        dict = compute::Cast(reprs, int32()).ValueOr(reprs).make_array();
+        auto type = dict->type();
+        if (options_.infer_dictionary) {
+          // wrap the inferred type in dictionary()
+          type = dictionary(int32(), std::move(type));
+        }
+        current_field = field(name, std::move(type));
       }
-
-      fields[index] = field(name, std::move(type));
+      fields[index] = current_field;

Review comment:
       ```suggestion
         fields[index] = std::move(current_field);
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to