paleolimbot commented on issue #15271:
URL: https://github.com/apache/arrow/issues/15271#issuecomment-1377495190
Make sure you're writing using `compression = "uncompressed"`! It's not
perfect, but is about 2x faster. I'll look into it to see if there's any way to
skip some R code here to more directly call the C++ writer...even the level of
overhead with no compression that you've highlighted is confusing to me.
Using no compression:
``` r
# Benchmark: read time of arrow::read_feather() relative to base R load()
# over a grid of data frame shapes. The Feather files in this variant are
# written with compression = "uncompressed".
tmpdir <- tempfile()
dir.create(tmpdir)
colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
rownums <- c(
  1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500, 1000,
  2000, 3000, 4000, 5000, 10000
)

# Generate files: one .RData-style file and one Feather file per shape.
for (colnum in colnums) {
  for (rownum in rownums) {
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))
    dat <- as.data.frame(
      matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
    )
    save(dat, file = fn.robj)
    arrow::write_feather(
      x = dat,
      sink = fn.arrow,
      compression = "uncompressed"
    )
  }
}

# Result matrices (rows = row counts, columns = column counts), in seconds.
times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
rownames(times.robj) <- paste(rownums, "rows")
colnames(times.robj) <- paste(colnums, "cols")
times.arrow <- times.robj

for (i in seq_along(rownums)) {
  for (j in seq_along(colnums)) {
    rownum <- rownums[i]
    colnum <- colnums[j]
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))

    # measure 2nd load to account for cold caches
    load(fn.robj)
    start <- Sys.time()
    load(fn.robj)
    # Pin the unit to seconds: a bare `Sys.time() - start` picks its unit
    # automatically, and that unit is lost when stored in a numeric matrix.
    times.robj[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))

    tst <- arrow::read_feather(fn.arrow)
    start <- Sys.time()
    tst <- arrow::read_feather(fn.arrow)
    times.arrow[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))
  }
}

# Ratio > 1 means read_feather() was slower than load() for that shape.
times.arrow / times.robj
#> 10 cols 20 cols 30 cols 100 cols 150 cols
200 cols
#> 1 rows 14.4725275 17.9795082 18.4140625 21.90818859 47.65606362
22.84116694
#> 2 rows 15.1983806 16.2460317 16.9053030 18.67129630 20.76380952
37.10859729
#> 3 rows 21.7117117 15.6601562 15.0646259 17.03752759 17.77000000
19.34379458
#> 4 rows 15.7056277 16.7242798 14.8692810 16.17453799 16.61224490
18.86018642
#> 5 rows 13.1034483 14.4306050 14.9470199 14.90576923 17.99046105
18.01030928
#> 10 rows 12.5816327 12.9710611 13.5114943 12.35703002 28.33454988
13.22032289
#> 20 rows 12.0430464 10.7642276 10.1307339 9.10829493 8.45411765
9.29576547
#> 30 rows 11.1220238 9.6205251 8.8284024 6.56949960 6.90670927
7.49974529
#> 40 rows 10.7088235 9.0176600 8.0673953 6.57269790 6.01518560
6.51640071
#> 50 rows 8.8784119 8.7257384 7.2162162 5.68754448 5.36519115
5.89375727
#> 60 rows 9.7962963 8.1595960 6.8823529 5.16987179 10.22431958
4.99090247
#> 70 rows 8.4882075 8.1819961 6.6296296 5.04599761 4.74102564
4.54345654
#> 100 rows 8.2778993 6.3507692 5.5512821 3.87919776 3.18816885
3.65419847
#> 200 rows 6.9781818 4.6319149 11.3175395 2.39477680 2.22712351
2.23399873
#> 300 rows 5.9528875 3.4087948 2.8162523 2.28367392 1.53755051
1.65800866
#> 400 rows 4.7578419 3.0028986 2.2602876 2.15348917 1.26760074
1.21309890
#> 500 rows 4.1558308 2.5225768 2.2711656 1.41115560 1.05550257
1.02989052
#> 1000 rows 2.2786585 1.3790087 3.0056259 0.60250798 0.53179530
0.53369967
#> 2000 rows 1.3539916 1.5805147 0.5737926 0.30327838 0.27820840
0.27057028
#> 3000 rows 1.1347815 0.5374048 0.3965298 0.20412111 0.19350023
0.45714431
#> 4000 rows 0.7417894 0.4128671 3.5819726 0.24726677 0.14699569
0.14043276
#> 5000 rows 0.6041413 0.3378337 0.8593773 0.19491538 0.12437216
0.11456206
#> 10000 rows 0.3014837 0.1828018 0.1201612 0.02665133 0.05724913
0.05461478
#> 300 cols 500 cols
#> 1 rows 27.20939086 48.20383912
#> 2 rows 25.13126492 34.15562914
#> 3 rows 24.11811024 30.89401968
#> 4 rows 21.79393939 26.18478261
#> 5 rows 20.94679803 26.48522653
#> 10 rows 14.96833216 25.12523191
#> 20 rows 10.51369216 15.84330318
#> 30 rows 7.43155288 11.73603952
#> 40 rows 6.62136223 10.43135770
#> 50 rows 5.99006711 9.25798485
#> 60 rows 5.04369274 6.14095785
#> 70 rows 4.75809650 5.70886076
#> 100 rows 5.00190311 4.54890153
#> 200 rows 4.50396996 2.68490953
#> 300 rows 2.99969424 1.89673687
#> 400 rows 2.34352282 1.48038762
#> 500 rows 2.03165384 1.20663080
#> 1000 rows 0.70601711 0.63683243
#> 2000 rows 0.27909992 0.43289769
#> 3000 rows 0.18386126 0.20415949
#> 4000 rows 0.29411463 0.16423265
#> 5000 rows 0.11312960 0.12045428
#> 10000 rows 0.05825836 0.06443037
```
<sup>Created on 2023-01-10 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
Using default compression:
``` r
# Benchmark: read time of arrow::read_feather() relative to base R load()
# over a grid of data frame shapes. The Feather files in this variant are
# written with write_feather()'s default compression setting.
tmpdir <- tempfile()
dir.create(tmpdir)
colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
rownums <- c(
  1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500, 1000,
  2000, 3000, 4000, 5000, 10000
)

# Generate files: one .RData-style file and one Feather file per shape.
for (colnum in colnums) {
  for (rownum in rownums) {
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))
    dat <- as.data.frame(
      matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
    )
    save(dat, file = fn.robj)
    arrow::write_feather(x = dat, sink = fn.arrow)
  }
}

# Result matrices (rows = row counts, columns = column counts), in seconds.
times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
rownames(times.robj) <- paste(rownums, "rows")
colnames(times.robj) <- paste(colnums, "cols")
times.arrow <- times.robj

for (i in seq_along(rownums)) {
  for (j in seq_along(colnums)) {
    rownum <- rownums[i]
    colnum <- colnums[j]
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))

    # measure 2nd load to account for cold caches
    load(fn.robj)
    start <- Sys.time()
    load(fn.robj)
    # Pin the unit to seconds: a bare `Sys.time() - start` picks its unit
    # automatically, and that unit is lost when stored in a numeric matrix.
    times.robj[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))

    tst <- arrow::read_feather(fn.arrow)
    start <- Sys.time()
    tst <- arrow::read_feather(fn.arrow)
    times.arrow[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))
  }
}

# Ratio > 1 means read_feather() was slower than load() for that shape.
times.arrow / times.robj
#> 10 cols 20 cols 30 cols 100 cols 150 cols 200
cols
#> 1 rows 16.9572954 19.6031746 19.4701754 26.5231144 56.01642710
33.39605735
#> 2 rows 19.0990991 20.9177489 20.8730769 23.7868481 26.38644689
45.43119266
#> 3 rows 21.1547619 19.2469136 21.4253731 21.8924051 24.21588946
25.88827586
#> 4 rows 18.4112554 18.8007663 18.4275862 21.3195021 22.97166667
26.45885635
#> 5 rows 15.8395522 17.8750000 16.3880597 22.0901804 22.89716841
26.51943005
#> 10 rows 15.3061224 13.0547945 14.9222520 16.8244767 34.23970944
17.79330709
#> 20 rows 14.3840830 13.6781609 12.3011236 11.2735528 11.33975904
11.87954111
#> 30 rows 13.5421687 11.0495283 10.1816514 9.3316370 9.57760314
9.62248996
#> 40 rows 12.6453488 9.8964059 9.9819168 7.7601744 8.33240067
8.37088608
#> 50 rows 11.8975069 10.1530612 10.3616000 7.4708579 7.31219272
7.15629522
#> 60 rows 11.3643836 8.9316081 8.3958991 7.0183366 12.72128146
6.76300578
#> 70 rows 11.0265252 9.6686869 8.1184408 6.6577017 6.38455080
6.92413793
#> 100 rows 10.3680556 8.0369748 6.4965116 5.1863354 4.83441670
5.06206362
#> 200 rows 12.3647059 6.8830275 4.9482612 3.2896631 3.24210312
3.27877754
#> 300 rows 5.7400000 4.5351986 3.5697161 2.3988402 2.14011906
2.06634286
#> 400 rows 5.0799087 2.9543702 2.8629648 1.7690058 1.72880966
1.76503533
#> 500 rows 4.4447884 2.8496770 2.3769231 1.4735886 1.35359428
1.52543420
#> 1000 rows 2.7072555 1.5854657 1.3616873 0.7840171 0.76427293
0.72445101
#> 2000 rows 1.5208333 0.8911792 0.6701459 0.4350788 0.37124991
0.37946588
#> 3000 rows 1.0453862 0.6643997 0.5169999 0.2656266 0.24755968
0.25853659
#> 4000 rows 0.8616682 0.4784442 0.4127477 0.2119238 0.19982264
0.19844568
#> 5000 rows 0.8958047 0.3799294 0.3235682 0.1832789 0.16097686
0.16914301
#> 10000 rows 0.3733628 0.2193108 0.1665289 0.1076588 0.09350925
0.08932051
#> 300 cols 500 cols
#> 1 rows 35.87483176 62.1506196
#> 2 rows 32.28801843 44.1924342
#> 3 rows 31.48050459 39.4118098
#> 4 rows 29.49416755 36.0374823
#> 5 rows 28.25379171 34.6821192
#> 10 rows 20.96233383 29.6552511
#> 20 rows 12.59460738 21.9988169
#> 30 rows 10.30442541 15.0805057
#> 40 rows 9.17821473 13.3024585
#> 50 rows 7.90048940 10.9834538
#> 60 rows 7.22199747 8.0121655
#> 70 rows 7.09084699 7.5827408
#> 100 rows 5.27838565 5.9264278
#> 200 rows 5.55643482 3.2979336
#> 300 rows 3.63902649 2.3820292
#> 400 rows 3.04591480 1.9261239
#> 500 rows 2.45318492 1.5959291
#> 1000 rows 1.27772319 0.8132839
#> 2000 rows 0.70657236 0.4209621
#> 3000 rows 0.50213646 0.2835666
#> 4000 rows 0.20044236 0.2147253
#> 5000 rows 0.14406603 0.1745972
#> 10000 rows 0.08071889 0.1012044
```
<sup>Created on 2023-01-10 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]