nealrichardson commented on a change in pull request #12133:
URL: https://github.com/apache/arrow/pull/12133#discussion_r784016948
##########
File path: r/R/dataset-factory.R
##########
@@ -60,16 +61,71 @@ DatasetFactory$create <- function(x,
return(FileSystemDatasetFactory$create(path_and_fs$fs, NULL,
path_and_fs$path, format))
}
- if (!is.null(partitioning)) {
- if (inherits(partitioning, "Schema")) {
- partitioning <- DirectoryPartitioning$create(partitioning)
- } else if (is.character(partitioning)) {
- # These are the column/field names, and we should autodetect their types
- partitioning <- DirectoryPartitioningFactory$create(partitioning)
+ # Handle partitioning arg in cases where it is "character" or "Schema"
+ if (!is.null(partitioning) && !inherits(partitioning, c("Partitioning", "PartitioningFactory"))) {
+ if (!is_false(hive_style)) {
+ # Default is NA, which means check to see if the paths could be hive_style
+ hive_factory <- HivePartitioningFactory$create()
+ paths <- path_and_fs$fs$ls(
+ path_and_fs$path,
+ allow_not_found = FALSE,
+ recursive = TRUE
+ )
+ hive_schema <- hive_factory$Inspect(paths)
+ # This is length-0 if there are no hive segments
+ if (is.na(hive_style)) {
+ hive_style <- length(hive_schema) > 0
+ }
+ }
+
+ if (hive_style) {
+ if (is.character(partitioning)) {
+ # These are not needed, the user probably provided them because they
+ # thought they needed to. Just make sure they aren't invalid.
+ if (!identical(names(hive_schema), partitioning)) {
+ abort(c(
+ paste(
+ '"partitioning" does not match the detected Hive-style partitions:',
+ deparse1(names(hive_schema))
+ ),
+ i = 'Omit "partitioning" to use the Hive partitions',
+ i = "Set `hive_style = FALSE` to override what was detected",
+ i = "Or, to rename partition columns, call `select()` or `rename()` after opening the dataset"
+ ))
+ }
+ partitioning <- hive_factory$Finish(hive_schema)
+ } else if (inherits(partitioning, "Schema")) {
+ # This means we want to set the types of the hive-style partitions
+ # to be exactly what we want them to be
+ if (!identical(names(hive_schema), names(partitioning))) {
+ abort(c(
+ paste(
+ '"partitioning" does not match the detected Hive-style partitions:',
+ deparse1(names(hive_schema))
+ ),
+ i = 'Omit "partitioning" to use the Hive partitions',
+ i = "Set `hive_style = FALSE` to override what was detected",
+ i = "Or, to rename partition columns, call `select()` or `rename()` after opening the dataset"
+ ))
+ }
Review comment:
Not really IMO
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]