nealrichardson commented on code in PR #12751:
URL: https://github.com/apache/arrow/pull/12751#discussion_r853057491
##########
r/R/array.R:
##########
@@ -256,7 +255,11 @@ concat_arrays <- function(..., type = NULL) {
#' @rdname concat_arrays
#' @export
c.Array <- function(...) {
- concat_arrays(...)
+ abort(c(
Review Comment:
Nice helpful message
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+ sizes <- map(inputs, function(input) {
+ # TODO: can we somehow implement vec_size for Arrow structures?
+ if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+ nrow(input)
+ } else {
+ length(input)
+ }
+ })
+ wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+ if (wrong_idx != 1) {
+ abort(
+ c("Non-scalar inputs must have an equal number of rows.",
+ i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx,
sizes[[wrong_idx]])),
+ call = call
+ )
+ }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+ call <- sys.call()
+ inputs <- list(...)
+ arg_names <- if (is.null(names(inputs))) {
+ rep("", length(inputs))
+ } else {
+ names(inputs)
+ }
+
+ cbind_check_length(inputs, call)
+
+ columns <- flatten(map(seq_along(inputs), function(i) {
+ input <- inputs[[i]]
+ name <- arg_names[i]
+
+ if (inherits(input, "RecordBatch")) {
+ set_names(input$columns, names(input))
Review Comment:
RecordBatch$columns returns an unnamed list? 🤔 should we change that (in a
followup)?
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
Review Comment:
Is this better (more explicit) or worse (confusing because it's telling me
I'm not going to get a RecordBatch)?
Is there or should there be a ConcatenateBatches function, and/or some way
to turn a Table into a RecordBatch (concat_arrays each of the ChunkedArrays in
the Table)?
```suggestion
abort("Use `Table$create()` to combine RecordBatches into a Table")
```
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+ sizes <- map(inputs, function(input) {
+ # TODO: can we somehow implement vec_size for Arrow structures?
+ if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+ nrow(input)
+ } else {
+ length(input)
+ }
+ })
+ wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+ if (wrong_idx != 1) {
+ abort(
+ c("Non-scalar inputs must have an equal number of rows.",
+ i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx,
sizes[[wrong_idx]])),
+ call = call
+ )
+ }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+ call <- sys.call()
+ inputs <- list(...)
+ arg_names <- if (is.null(names(inputs))) {
+ rep("", length(inputs))
+ } else {
+ names(inputs)
+ }
+
+ cbind_check_length(inputs, call)
+
+ columns <- flatten(map(seq_along(inputs), function(i) {
+ input <- inputs[[i]]
+ name <- arg_names[i]
+
+ if (inherits(input, "RecordBatch")) {
+ set_names(input$columns, names(input))
+ } else if (inherits(input, "data.frame")) {
+ as.list(input)
+ } else {
+ if (is.na(name) || name == "") {
Review Comment:
why NA or ""? (as in, why not set to NA on L222 and only check for NA here?)
##########
r/R/table.R:
##########
@@ -149,6 +149,77 @@ Table$create <- function(..., schema = NULL) {
#' @export
names.Table <- function(x) x$ColumnNames()
+#' @export
+rbind.Table <- function(...) {
+ tables <- list(...)
+
+ # assert they have same schema
+ schema <- tables[[1]]$schema
+ unequal_schema_idx <- which.min(lapply(tables, function(x) x$schema ==
schema))
+ if (unequal_schema_idx != 1) {
+ abort(c(
+ sprintf("Schema at index %i does not match the first schema",
unequal_schema_idx),
+ i = paste0("Schema 1:\n", schema$ToString()),
+ i = paste0(sprintf("Schema %d:\n", unequal_schema_idx),
+ tables[[unequal_schema_idx]]$schema$ToString())
+ ))
+ }
+
+ # create chunked array for each column
+ columns <- map(seq_len(tables[[1]]$num_columns), function(i) {
+ do.call(c, map(tables, ~ .[[i]]))
+ })
+
+ Table$create(!!!set_names(columns, names(schema)), schema = schema)
+}
+
+#' @export
+cbind.Table <- function(...) {
Review Comment:
> Yes, except in `cbind.Table`, chunked arrays and tables are allowed.
Two thoughts here, feel free to ignore or defer to a followup:
1. Given how you've simplified, I wonder if that's a problem. If you passed
through chunked arrays to RecordBatch$create, it should error, and hopefully
with an informative message. In fact if I read the latest version correctly,
cbind.RecordBatch isn't asserting that it doesn't get chunked arrays, so you're
already relying on that behavior.
2. Maybe @paleolimbot's as_* S3 generics PR would help? cbind.RecordBatch
would `as_arrow_array` its input columns, and cbind.Table would
`as_chunked_array` its inputs.
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+ sizes <- map(inputs, function(input) {
+ # TODO: can we somehow implement vec_size for Arrow structures?
+ if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+ nrow(input)
+ } else {
+ length(input)
+ }
+ })
+ wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+ if (wrong_idx != 1) {
+ abort(
+ c("Non-scalar inputs must have an equal number of rows.",
+ i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx,
sizes[[wrong_idx]])),
+ call = call
+ )
+ }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+ call <- sys.call()
+ inputs <- list(...)
+ arg_names <- if (is.null(names(inputs))) {
+ rep("", length(inputs))
+ } else {
+ names(inputs)
+ }
+
+ cbind_check_length(inputs, call)
+
+ columns <- flatten(map(seq_along(inputs), function(i) {
Review Comment:
Since we aren't using `i` anymore, can we use `imap()` now?
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+ sizes <- map(inputs, function(input) {
+ # TODO: can we somehow implement vec_size for Arrow structures?
+ if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+ nrow(input)
+ } else {
+ length(input)
+ }
+ })
+ wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+ if (wrong_idx != 1) {
+ abort(
+ c("Non-scalar inputs must have an equal number of rows.",
+ i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx,
sizes[[wrong_idx]])),
+ call = call
+ )
+ }
+}
Review Comment:
I think the meaning here is a little clearer. I used `head()` in the first
line because it is robust to length-0 input (whereas `[[1]]` would error)
though you also use it in your error message below. Not sure if that's a
problem here or not, maybe you can never get in here with `inputs` being empty.
```suggestion
ok_lengths <- sizes %in% c(head(sizes, 1), 1L)
if (!all(ok_lengths)) {
first_bad_one <- which.min(!ok.lengths)
abort(
c("Non-scalar inputs must have an equal number of rows.",
i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], first_bad_one,
sizes[[first_bad_one]])),
call = call
)
}
}
```
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+ sizes <- map(inputs, function(input) {
+ # TODO: can we somehow implement vec_size for Arrow structures?
+ if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+ nrow(input)
+ } else {
+ length(input)
+ }
+ })
Review Comment:
This is why I suggested using `NROW()`, it does this for you:
```
> NROW
function (x)
if (length(d <- dim(x))) d[1L] else length(x)
<bytecode: 0x14dfdaf28>
<environment: namespace:base>
```
```suggestion
sizes <- map_int(inputs, NROW)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]