romainfrancois commented on pull request #11225:
URL: https://github.com/apache/arrow/pull/11225#issuecomment-928981161
@jonkeane Getting this on master:
``` r
library(arrow, warn.conflicts = FALSE)
#> See arrow_info() for available features
library(dplyr, warn.conflicts = FALSE)
inject_na <- function(x) {
x[sample(seq_along(x), length(x) / 10)] <- NA
x
}
df <- data.frame(
x = runif(1e6),
y = sample(1:100, 1e6, replace = TRUE),
z = sample(month.abb, 1e6, replace = TRUE)
) %>%
mutate(
across(everything(), inject_na, .names = "{.col}_na")
)
a <- Array$create(df)
a$type
#> StructType
#> struct<x: double, y: int32, z: string, x_na: double, y_na: int32, z_na: string>
bench::workout({
v <- a$as_vector()
v$x
v$y
v$z
v$x_na
v$y_na
v$z_na
})
#> # A tibble: 7 × 3
#> exprs process real
#> <bch:expr> <bch:tm> <bch:tm>
#> 1 v <- a$as_vector() 148ms 148.53ms
#> 2 v$x 49µs 50.43µs
#> 3 v$y 3µs 4.9µs
#> 4 v$z 4µs 4.24µs
#> 5 v$x_na 4µs 3.89µs
#> 6 v$y_na 3µs 3.63µs
#> 7 v$z_na 3µs 3.97µs
# get few values
bench::workout({
v$x[1:10]
v$y[1:10]
v$z[1:10]
v$x_na[1:10]
v$y_na[1:10]
v$z_na[1:10]
})
#> # A tibble: 6 × 3
#> exprs process real
#> <bch:expr> <bch:tm> <bch:tm>
#> 1 v$x[1:10] 11µs 12.71µs
#> 2 v$y[1:10] 5µs 5.96µs
#> 3 v$z[1:10] 4µs 5.16µs
#> 4 v$x_na[1:10] 4µs 4.79µs
#> 5 v$y_na[1:10] 4µs 4.48µs
#> 6 v$z_na[1:10] 4µs 4.82µs
# duplicate -> materialize
bench::workout({
v$x[]
v$y[]
v$z[]
v$x_na[]
v$y_na[]
v$z_na[]
})
#> # A tibble: 6 × 3
#> exprs process real
#> <bch:expr> <bch:tm> <bch:tm>
#> 1 v$x[] 3.71ms 3.71ms
#> 2 v$y[] 8.95ms 8.95ms
#> 3 v$z[] 9.43ms 9.43ms
#> 4 v$x_na[] 67.23ms 67.32ms
#> 5 v$y_na[] 505µs 508.25µs
#> 6 v$z_na[] 4.55ms 4.56ms
```
<sup>Created on 2021-09-28 by the [reprex
package](https://reprex.tidyverse.org) (v2.0.0)</sup>
versus this, on this pull request:
``` r
library(arrow, warn.conflicts = FALSE)
#> See arrow_info() for available features
library(dplyr, warn.conflicts = FALSE)
inject_na <- function(x) {
x[sample(seq_along(x), length(x) / 10)] <- NA
x
}
df <- data.frame(
x = runif(1e6),
y = sample(1:100, 1e6, replace = TRUE),
z = sample(month.abb, 1e6, replace = TRUE)
) %>%
mutate(
across(everything(), inject_na, .names = "{.col}_na")
)
a <- Array$create(df)
a$type
#> StructType
#> struct<x: double, y: int32, z: string, x_na: double, y_na: int32, z_na: string>
bench::workout({
v <- a$as_vector()
v$x
v$y
v$z
v$x_na
v$y_na
v$z_na
})
#> # A tibble: 7 × 3
#> exprs process real
#> <bch:expr> <bch:tm> <bch:tm>
#> 1 v <- a$as_vector() 236µs 237.54µs
#> 2 v$x 43µs 45.07µs
#> 3 v$y 3µs 4.86µs
#> 4 v$z 3µs 3.95µs
#> 5 v$x_na 3µs 3.81µs
#> 6 v$y_na 4µs 3.71µs
#> 7 v$z_na 3µs 3.51µs
# get few values
bench::workout({
v$x[1:10]
v$y[1:10]
v$z[1:10]
v$x_na[1:10]
v$y_na[1:10]
v$z_na[1:10]
})
#> # A tibble: 6 × 3
#> exprs process real
#> <bch:expr> <bch:tm> <bch:tm>
#> 1 v$x[1:10] 14µs 16.55µs
#> 2 v$y[1:10] 13µs 13.92µs
#> 3 v$z[1:10] 40µs 41.81µs
#> 4 v$x_na[1:10] 6µs 7.38µs
#> 5 v$y_na[1:10] 6µs 6.56µs
#> 6 v$z_na[1:10] 30µs 30.25µs
# duplicate -> materialize
bench::workout({
v$x[]
v$y[]
v$z[]
v$x_na[]
v$y_na[]
v$z_na[]
})
#> # A tibble: 6 × 3
#> exprs process real
#> <bch:expr> <bch:tm> <bch:tm>
#> 1 v$x[] 4.17ms 4.17ms
#> 2 v$y[] 2.08ms 2.08ms
#> 3 v$z[] 111.07ms 111.13ms
#> 4 v$x_na[] 6.39ms 6.39ms
#> 5 v$y_na[] 4.12ms 4.12ms
#> 6 v$z_na[] 31.86ms 31.89ms
```
<sup>Created on 2021-09-28 by the [reprex
package](https://reprex.tidyverse.org) (v2.0.0)</sup>
I guess it's a tradeoff: this PR is lazier, whereas the previous code
was greedy (materializing copies immediately) but also did so in parallel.
I think it's fine to only pay for making standard R vectors when it becomes
necessary.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]