bkietz commented on a change in pull request #9677:
URL: https://github.com/apache/arrow/pull/9677#discussion_r592576498
##########
File path: cpp/src/arrow/dataset/expression.cc
##########
@@ -33,6 +33,7 @@
#include "arrow/util/optional.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"
+#include "arrow/visitor_inline.h"
Review comment:
Is this used?
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -300,10 +302,18 @@ class KeyValuePartitioningFactory : public
PartitioningFactory {
return repr_memos_[index]->GetOrInsert<StringType>(repr, &dummy);
}
- Result<std::shared_ptr<Schema>> DoInpsect() {
+ Result<std::shared_ptr<Schema>> DoInspect() {
Review comment:
:facepalm:
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1566,7 +1571,14 @@ cdef class DirectoryPartitioning(Partitioning):
if infer_dictionary:
c_options.infer_dictionary = True
- c_field_names = [tobytes(s) for s in field_names]
+ if schema:
+ c_options.schema = pyarrow_unwrap_schema(schema)
+ c_field_names = [tobytes(f.name) for f in schema]
+ elif not field_names:
+ raise TypeError(
+ "field_names must be passed if schema is not given")
Review comment:
```suggestion
raise ValueError(
"Neither field_names nor schema was passed; cannot infer
field_names")
```
```
##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -90,6 +90,10 @@ struct PartitioningFactoryOptions {
/// columns, and Expressions parsed by the finished Partitioning will include
/// dictionaries of all unique inspected values for each field.
bool infer_dictionary = false;
+ /// Optionally, an expected schema can be provided, in which case inference
+ /// will only check discovered fields against the schema and update internal
+ /// state (such as dictionaries).
+ std::shared_ptr<Schema> schema = nullptr;
Review comment:
```suggestion
std::shared_ptr<Schema> schema;
```
`nullptr` may not be used in headers and is the default for smart ptrs anyway
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1547,6 +1549,9 @@ cdef class DirectoryPartitioning(Partitioning):
Synonymous with infer_dictionary for backwards compatibility with
1.0: setting this to -1 or None is equivalent to passing
infer_dictionary=True.
+ schema : Schema, default None
+ Do not infer the schema, but confirm that partition values match
+ this schema and infer dictionary values as appropriate.
Review comment:
```suggestion
Use this schema instead of inferring a schema from partition
values.
Partition values will be validated against this schema before
accumulation
into the Partitioning's dictionary.
```
##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -317,15 +327,34 @@ class KeyValuePartitioningFactory : public
PartitioningFactory {
"'; couldn't infer type");
}
- // try casting to int32, otherwise bail and just use the string reprs
- auto dict = compute::Cast(reprs, int32()).ValueOr(reprs).make_array();
- auto type = dict->type();
- if (options_.infer_dictionary) {
- // wrap the inferred type in dictionary()
- type = dictionary(int32(), std::move(type));
+ std::shared_ptr<Field> current_field;
+ std::shared_ptr<Array> dict;
+ if (options_.schema) {
+ // if we have a schema, use the schema type.
+ current_field = options_.schema->field(index);
+ auto cast_target = current_field->type();
+ if (is_dictionary(cast_target->id())) {
+ cast_target =
checked_pointer_cast<DictionaryType>(cast_target)->value_type();
+ }
+ auto maybe_dict = compute::Cast(reprs, cast_target);
+ if (!maybe_dict.ok()) {
+ return Status::Invalid("Could not cast segments for partition field
",
+ current_field->name(), " to requested type ",
+ current_field->type()->ToString(),
+ " because: ", maybe_dict.status());
+ }
+ dict = maybe_dict.ValueOrDie().make_array();
+ } else {
+ // try casting to int32, otherwise bail and just use the string reprs
+ dict = compute::Cast(reprs, int32()).ValueOr(reprs).make_array();
+ auto type = dict->type();
+ if (options_.infer_dictionary) {
+ // wrap the inferred type in dictionary()
+ type = dictionary(int32(), std::move(type));
+ }
+ current_field = field(name, std::move(type));
}
-
- fields[index] = field(name, std::move(type));
+ fields[index] = current_field;
Review comment:
```suggestion
fields[index] = std::move(current_field);
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]