lidavidm commented on a change in pull request #9589: URL: https://github.com/apache/arrow/pull/9589#discussion_r609108354
##########
File path: r/R/dataset-scan.R
##########
@@ -142,17 +146,13 @@ map_batches <- function(X, FUN, ..., .data.frame = TRUE) {
   scanner <- Scanner$create(ensure_group_vars(X))
   FUN <- as_mapper(FUN)
   # message("Making ScanTasks")
-  lapply(scanner$Scan(), function(scan_task) {
-    # This outer lapply could be parallelized
-    # message("Making Batches")
-    lapply(scan_task$Execute(), function(batch) {
-      # message("Processing Batch")
-      # This inner lapply cannot be parallelized
-      # TODO: wrap batch in arrow_dplyr_query with X$selected_columns,
-      # X$temp_columns, and X$group_by_vars
-      # if X is arrow_dplyr_query, if some other arg (.dplyr?) == TRUE
-      FUN(batch, ...)
-    })
+  lapply(scanner$ToBatches(), function(batch) {
+    # message("Processing Batch")
+    # This inner lapply cannot be parallelized

Review comment:
Nope! :)

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org