paleolimbot commented on issue #822:
URL: 
https://github.com/apache/arrow-nanoarrow/issues/822#issuecomment-3504268075

   Ok, I took some time to try to reproduce this...it might take some larger 
numbers to properly observe the non-linearity. Feel free to modify this example 
to better reproduce! I am seeing a few erratic results and I think that putting 
millions of strings into the session starts to make things difficult to 
predict, possibly because garbage collection starts to take a long time.
   
   ```r
   library(nanoarrow)
   
   # Raw bytes of the 26 lowercase ASCII letters; the alphabet the random
   # strings below are drawn from.
   ascii_bytes <- charToRaw(paste(letters, collapse = ""))
   
   # Assemble a string array of `n` random values, each exactly `n_chars`
   # bytes drawn from `ascii_bytes`, by filling in the buffers by hand:
   # (NULL, offsets, data) with null_count = 0.
   random_string_array <- function(n = 1, n_chars = 16) {
     total_bytes <- n_chars * n
     string_buffers <- list(
       NULL,
       # n + 1 evenly spaced offsets: 0, n_chars, 2 * n_chars, ..., total_bytes
       as.integer(seq(0, total_bytes, length.out = n + 1)),
       sample(ascii_bytes, total_bytes, replace = TRUE)
     )
     empty_array <- nanoarrow_array_init(na_string())
     nanoarrow_array_modify(
       empty_array,
       list(length = n, null_count = 0, buffers = string_buffers)
     )
   }
   
   # Build one struct array ("batch") with `n_cols` string columns named
   # col001, col002, ..., each holding `n_rows` random strings of `n_chars`
   # bytes generated by random_string_array().
   random_string_struct <- function(n_rows = 1024, n_cols = 1, n_chars = 16) {
     field_names <- sprintf("col%03d", seq_len(n_cols))
     field_types <- setNames(rep(list(na_string()), n_cols), field_names)
     struct_schema <- na_struct(field_types)
     # One independently generated child array per column.
     child_arrays <- lapply(
       seq_len(n_cols),
       function(i) random_string_array(n_rows, n_chars = n_chars)
     )
     empty_struct <- nanoarrow_array_init(struct_schema)
     nanoarrow_array_modify(
       empty_struct,
       list(length = n_rows, null_count = 0, children = child_arrays)
     )
   }
   
   # Wrap `n_batches` independently generated random struct arrays (see
   # random_string_struct()) in a basic_array_stream().
   random_string_stream <- function(n_batches = 1, n_rows = 1024, n_cols = 1,
                                    n_chars = 16) {
     batches <- lapply(
       seq_len(n_batches),
       function(i) random_string_struct(n_rows, n_cols, n_chars)
     )
     basic_array_stream(batches)
   }
   # Time one conversion of a freshly generated stream of random string
   # batches into a data frame.
   #
   # n_batches, n_cols, n_rows: shape of the generated stream, passed on to
   #   random_string_stream().
   # method: one of "convert_array_stream_explicit_size",
   #   "convert_array_stream_default" or "convert_arrow_table_no_altrep";
   #   any other value raises "Unknown method".
   #
   # Returns the system.time() result for the conversion. The shape of the
   # resulting data frame is verified after the clock stops, so the check is
   # not part of the measurement.
   time_convert <- function(n_batches, n_cols, n_rows, method) {
     # The template is passed as two pieces (glue concatenates them) so that
     # no line break ends up inside the message text.
     message(glue::glue(
       "generating n_batches = {n_batches}, n_cols = {n_cols}, ",
       "method = {method}"
     ))
     stream <- random_string_stream(
       n_batches = n_batches,
       n_cols = n_cols,
       n_rows = n_rows
     )
     # `expr` is a lazily evaluated argument: it is first evaluated inside
     # system.time(), so only the conversion itself is timed.
     timed <- function(expr) {
       timing <- system.time(df <- expr)
       stopifnot(
         nrow(df) == (n_batches * n_rows),
         ncol(df) == n_cols
       )
       timing
     }
     message("timing...")
     switch(method,
       convert_array_stream_explicit_size = timed(
         convert_array_stream(stream, size = n_batches * n_rows)
       ),
       convert_array_stream_default = timed(
         convert_array_stream(stream)
       ),
       # The option is set outside the timed region, as before.
       convert_arrow_table_no_altrep = withr::with_options(
         list(arrow.use_altrep = FALSE),
         timed(as.data.frame(arrow::as_arrow_table(stream)))
       ),
       stop("Unknown method")
     )
   }
   
   # One row per (n_batches, n_cols, method) combination to benchmark.
   times <- expand.grid(
     n_batches = c(10, 20, 30, 40, 50, 60),
     n_cols = c(10, 20, 30, 40, 50, 60),
     method = c(
       "convert_array_stream_explicit_size",
       "convert_array_stream_default",
       "convert_arrow_table_no_altrep"
     ),
     stringsAsFactors = FALSE
   )
   # Run every combination with 1024 rows per batch and keep the full
   # system.time() result of each run.
   times$system_time <- Map(
     time_convert,
     times$n_batches,
     times$n_cols,
     1024,
     times$method
   )
   # A system.time() result is a named vector (user.self, sys.self, elapsed,
   # ...). Extract the wall-clock entry by name: position 1 is user CPU time,
   # not elapsed time.
   times$elapsed <- vapply(times$system_time, "[[", double(1), "elapsed")
   ```
   
   ```r
   library(ggplot2)
   
   # Rows left out of the plots: the explicit-size runs at n_cols = 40 with
   # 50 or 60 batches. They fall in the middle of the grid rather than at the
   # largest sizes, so they are presumed to be unlucky runs where the gc ran.
   is_explicit_size <- times[["method"]] == "convert_array_stream_explicit_size"
   weird_gc <- is_explicit_size &
     times[["n_cols"]] == 40 &
     times[["n_batches"]] %in% c(50, 60)
   # One panel per n_cols: shows the effect of increasing the number of
   # batches.
   ggplot(
     times[!weird_gc, ],
     aes(x = n_batches, y = elapsed, fill = method)
   ) +
     geom_col(position = position_dodge()) +
     facet_wrap(vars(n_cols)) +
     theme_bw() +
     theme(legend.position = "bottom")
   ```
   
   <img width="1102" height="898" alt="Image" 
src="https://github.com/user-attachments/assets/47a0570a-f46a-482e-835f-e2b001d41350"
 />
   
   ```r
   # One panel per n_batches: shows the effect of increasing the number of
   # columns. Uses the same row filter (`weird_gc`) as the previous plot.
   ggplot(
     times[!weird_gc, ],
     aes(x = n_cols, y = elapsed, fill = method)
   ) +
     geom_col(position = position_dodge()) +
     facet_wrap(vars(n_batches)) +
     theme_bw() +
     theme(legend.position = "bottom")
   ```
   
   <img width="1102" height="898" alt="Image" 
src="https://github.com/user-attachments/assets/c66d761e-9d31-4de2-9561-77380df89e8a"
 />


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to