jonkeane commented on a change in pull request #12133: URL: https://github.com/apache/arrow/pull/12133#discussion_r784317194
########## File path: r/R/dataset-factory.R ########## @@ -60,16 +61,71 @@ DatasetFactory$create <- function(x, return(FileSystemDatasetFactory$create(path_and_fs$fs, NULL, path_and_fs$path, format)) } - if (!is.null(partitioning)) { - if (inherits(partitioning, "Schema")) { - partitioning <- DirectoryPartitioning$create(partitioning) - } else if (is.character(partitioning)) { - # These are the column/field names, and we should autodetect their types - partitioning <- DirectoryPartitioningFactory$create(partitioning) + # Handle partitioning arg in cases where it is "character" or "Schema" + if (!is.null(partitioning) && !inherits(partitioning, c("Partitioning", "PartitioningFactory"))) { + if (!is_false(hive_style)) { + # Default is NA, which means check to see if the paths could be hive_style + hive_factory <- HivePartitioningFactory$create() + paths <- path_and_fs$fs$ls( + path_and_fs$path, + allow_not_found = FALSE, + recursive = TRUE + ) + hive_schema <- hive_factory$Inspect(paths) + # This is length-0 if there are no hive segments + if (is.na(hive_style)) { + hive_style <- length(hive_schema) > 0 + } + } + + if (hive_style) { + if (is.character(partitioning)) { + # These are not needed, the user probably provided them because they + # thought they needed to. Just make sure they aren't invalid. + if (!identical(names(hive_schema), partitioning)) { + abort(c( + paste( + '"partitioning" does not match the detected Hive-style partitions:', + deparse1(names(hive_schema)) + ), + i = 'Omit "partitioning" to use the Hive partitions', + i = "Set `hive_style = FALSE` to override what was detected", + i = "Or, to rename partition columns, call `select()` or `rename()` after opening the dataset" Review comment: I might be very liberal in my interpretation of the must/should approach, but I think this message is fantastic and gives folks a few clear ideas of where they might be going wrong / how to fix that, so to me it satisfies that brief. 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org