nealrichardson commented on code in PR #12751:
URL: https://github.com/apache/arrow/pull/12751#discussion_r853057491


##########
r/R/array.R:
##########
@@ -256,7 +255,11 @@ concat_arrays <- function(..., type = NULL) {
 #' @rdname concat_arrays
 #' @export
 c.Array <- function(...) {
-  concat_arrays(...)
+  abort(c(

Review Comment:
   Nice helpful message



##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+  sizes <- map(inputs, function(input) {
+    # TODO: can we somehow implement vec_size for Arrow structures?
+    if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+      nrow(input)
+    } else {
+      length(input)
+    }
+  })
+  wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+  if (wrong_idx != 1) {
+    abort(
+      c("Non-scalar inputs must have an equal number of rows.",
+        i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx, 
sizes[[wrong_idx]])),
+      call = call
+    )
+  }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+  call <- sys.call()
+  inputs <- list(...)
+  arg_names <- if (is.null(names(inputs))) {
+    rep("", length(inputs))
+  } else {
+    names(inputs)
+  }
+
+  cbind_check_length(inputs, call)
+
+  columns <- flatten(map(seq_along(inputs), function(i) {
+    input <- inputs[[i]]
+    name <- arg_names[i]
+
+    if (inherits(input, "RecordBatch")) {
+      set_names(input$columns, names(input))

Review Comment:
   RecordBatch$columns returns an unnamed list? 🤔 should we change that (in a 
followup)?



##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")

Review Comment:
   Is this better (more explicit) or worse (confusing because it's telling me 
I'm not going to get a RecordBatch)?
   
   Is there or should there be a ConcatenateBatches function, and/or some way 
to turn a Table into a RecordBatch (concat_arrays each of the ChunkedArrays in 
the Table)? 
   
   ```suggestion
     abort("Use `Table$create()` to combine RecordBatches into a Table")
   ```



##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+  sizes <- map(inputs, function(input) {
+    # TODO: can we somehow implement vec_size for Arrow structures?
+    if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+      nrow(input)
+    } else {
+      length(input)
+    }
+  })
+  wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+  if (wrong_idx != 1) {
+    abort(
+      c("Non-scalar inputs must have an equal number of rows.",
+        i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx, 
sizes[[wrong_idx]])),
+      call = call
+    )
+  }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+  call <- sys.call()
+  inputs <- list(...)
+  arg_names <- if (is.null(names(inputs))) {
+    rep("", length(inputs))
+  } else {
+    names(inputs)
+  }
+
+  cbind_check_length(inputs, call)
+
+  columns <- flatten(map(seq_along(inputs), function(i) {
+    input <- inputs[[i]]
+    name <- arg_names[i]
+
+    if (inherits(input, "RecordBatch")) {
+      set_names(input$columns, names(input))
+    } else if (inherits(input, "data.frame")) {
+      as.list(input)
+    } else {
+      if (is.na(name) || name == "") {

Review Comment:
   why NA or ""? (as in, why not set to NA on L222 and only check for NA here?)



##########
r/R/table.R:
##########
@@ -149,6 +149,77 @@ Table$create <- function(..., schema = NULL) {
 #' @export
 names.Table <- function(x) x$ColumnNames()
 
+#' @export
+rbind.Table <- function(...) {
+  tables <- list(...)
+
+  # assert they have same schema
+  schema <- tables[[1]]$schema
+  unequal_schema_idx <- which.min(lapply(tables, function(x) x$schema == 
schema))
+  if (unequal_schema_idx != 1) {
+    abort(c(
+      sprintf("Schema at index %i does not match the first schema", 
unequal_schema_idx),
+      i = paste0("Schema 1:\n", schema$ToString()),
+      i = paste0(sprintf("Schema %d:\n", unequal_schema_idx),
+                 tables[[unequal_schema_idx]]$schema$ToString())
+    ))
+  }
+
+  # create chunked array for each column
+  columns <- map(seq_len(tables[[1]]$num_columns), function(i) {
+    do.call(c, map(tables, ~ .[[i]]))
+  })
+
+  Table$create(!!!set_names(columns, names(schema)), schema = schema)
+}
+
+#' @export
+cbind.Table <- function(...) {

Review Comment:
   > Yes, except in `cbind.Table`, chunked arrays and tables are allowed.
   
   Two thoughts here, feel free to ignore or defer to a followup:
   
   1. Given how you've simplified, I wonder if that's a problem. If you passed 
through chunked arrays to RecordBatch$create, it should error, and hopefully 
with an informative message. In fact if I read the latest version correctly, 
cbind.RecordBatch isn't asserting that it doesn't get chunked arrays, so you're 
already relying on that behavior. 
   2. Maybe @paleolimbot's as_* S3 generics PR would help? cbind.RecordBatch 
would `as_arrow_array` its input columns, and cbind.Table would 
`as_chunked_array` its inputs.



##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+  sizes <- map(inputs, function(input) {
+    # TODO: can we somehow implement vec_size for Arrow structures?
+    if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+      nrow(input)
+    } else {
+      length(input)
+    }
+  })
+  wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+  if (wrong_idx != 1) {
+    abort(
+      c("Non-scalar inputs must have an equal number of rows.",
+        i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx, 
sizes[[wrong_idx]])),
+      call = call
+    )
+  }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+  call <- sys.call()
+  inputs <- list(...)
+  arg_names <- if (is.null(names(inputs))) {
+    rep("", length(inputs))
+  } else {
+    names(inputs)
+  }
+
+  cbind_check_length(inputs, call)
+
+  columns <- flatten(map(seq_along(inputs), function(i) {

Review Comment:
   Since we aren't using `i` anymore, can we use `imap()` now?



##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+  sizes <- map(inputs, function(input) {
+    # TODO: can we somehow implement vec_size for Arrow structures?
+    if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+      nrow(input)
+    } else {
+      length(input)
+    }
+  })
+  wrong_idx <- which.min(sizes == sizes[[1]] | sizes == 1)
+  if (wrong_idx != 1) {
+    abort(
+      c("Non-scalar inputs must have an equal number of rows.",
+        i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], wrong_idx, 
sizes[[wrong_idx]])),
+      call = call
+    )
+  }
+}

Review Comment:
   I think the meaning here is a little clearer. I used `head()` in the first 
line because it is robust to length-0 input (whereas `[[1]]` would error) 
though you also use it in your error message below. Not sure if that's a 
problem here or not, maybe you can never get in here with `inputs` being empty.
   
   ```suggestion
    ok_lengths <- sizes %in% c(head(sizes, 1), 1L)
     if (!all(ok_lengths)) {
       first_bad_one <- which.min(!ok.lengths)
       abort(
         c("Non-scalar inputs must have an equal number of rows.",
           i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], first_bad_one, 
sizes[[first_bad_one]])),
         call = call
       )
     }
   }
   ```



##########
r/R/record-batch.R:
##########
@@ -189,3 +189,59 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+  sizes <- map(inputs, function(input) {
+    # TODO: can we somehow implement vec_size for Arrow structures?
+    if (inherits(input, "ArrowTabular") || is.data.frame(input)) {
+      nrow(input)
+    } else {
+      length(input)
+    }
+  })

Review Comment:
   This is why I suggested using `NROW()`, it does this for you: 
   
   ```
   > NROW
   function (x) 
   if (length(d <- dim(x))) d[1L] else length(x)
   <bytecode: 0x14dfdaf28>
   <environment: namespace:base>
   ```
   
   ```suggestion
     sizes <- map_int(inputs, NROW)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to