paleolimbot commented on issue #15271:
URL: https://github.com/apache/arrow/issues/15271#issuecomment-1377495190

   Make sure you're writing using `compression = "uncompressed"`! It's not 
perfect, but is about 2x faster. I'll look into it to see if there's any way to 
skip some R code here to more directly call the C++ writer...even the level of 
overhead with no compression that you've highlighted is confusing to me.
   
   Using no compression:
   
   ``` r
   # Benchmark: for a grid of data frame sizes, write the same random data as
   # an .RData file (save) and as a Feather file written with
   # compression = "uncompressed", then compare how long each takes to read.
   tmpdir <- tempfile()
   dir.create(tmpdir)
   
   colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
   rownums <- c(
     1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
     1000, 2000, 3000, 4000, 5000, 10000
   )
   
   # Generate files: one .RData and one Feather file per (rows, cols) pair
   for (colnum in colnums) {
     for (rownum in rownums) {
       fn.robj <- paste0(tmpdir, "/robj.", rownum, "x", colnum)
       fn.arrow <- paste0(tmpdir, "/arrow.", rownum, "x", colnum)
       
       dat <- as.data.frame(
         matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
       )
       save(dat, file = fn.robj)
       arrow::write_feather(
         x = dat, sink = fn.arrow, compression = "uncompressed"
       )
     }
   }
   
   # Elapsed read time in seconds; rows follow `rownums`, columns `colnums`
   times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
   rownames(times.robj) <- paste(rownums, "rows")
   colnames(times.robj) <- paste(colnums, "cols")
   times.arrow <- times.robj
   
   # seq_along() instead of 1:length(): the latter iterates over c(1, 0)
   # when the vector is empty
   for (i in seq_along(rownums)) {
     for (j in seq_along(colnums)) {
       rownum <- rownums[i]
       colnum <- colnums[j]
       fn.robj <- paste0(tmpdir, "/robj.", rownum, "x", colnum)
       fn.arrow <- paste0(tmpdir, "/arrow.", rownum, "x", colnum)
       
       # measure 2nd load to account for cold caches
       load(fn.robj)
       start <- Sys.time()
       load(fn.robj)
       finish <- Sys.time()
       # Pin the unit to seconds: a bare `finish - start` chooses its unit
       # automatically and the unit is lost when stored in a numeric matrix,
       # so a slow read could silently be recorded in minutes.
       times.robj[i, j] <- as.numeric(difftime(finish, start, units = "secs"))
       
       tst <- arrow::read_feather(fn.arrow)
       start <- Sys.time()
       tst <- arrow::read_feather(fn.arrow)
       finish <- Sys.time()
       times.arrow[i, j] <- as.numeric(difftime(finish, start, units = "secs"))
     }
   }
   
   # Ratio > 1 means read_feather() took longer than load() for that size
   times.arrow / times.robj
   #>               10 cols    20 cols    30 cols    100 cols    150 cols    
200 cols
   #> 1 rows     14.4725275 17.9795082 18.4140625 21.90818859 47.65606362 
22.84116694
   #> 2 rows     15.1983806 16.2460317 16.9053030 18.67129630 20.76380952 
37.10859729
   #> 3 rows     21.7117117 15.6601562 15.0646259 17.03752759 17.77000000 
19.34379458
   #> 4 rows     15.7056277 16.7242798 14.8692810 16.17453799 16.61224490 
18.86018642
   #> 5 rows     13.1034483 14.4306050 14.9470199 14.90576923 17.99046105 
18.01030928
   #> 10 rows    12.5816327 12.9710611 13.5114943 12.35703002 28.33454988 
13.22032289
   #> 20 rows    12.0430464 10.7642276 10.1307339  9.10829493  8.45411765  
9.29576547
   #> 30 rows    11.1220238  9.6205251  8.8284024  6.56949960  6.90670927  
7.49974529
   #> 40 rows    10.7088235  9.0176600  8.0673953  6.57269790  6.01518560  
6.51640071
   #> 50 rows     8.8784119  8.7257384  7.2162162  5.68754448  5.36519115  
5.89375727
   #> 60 rows     9.7962963  8.1595960  6.8823529  5.16987179 10.22431958  
4.99090247
   #> 70 rows     8.4882075  8.1819961  6.6296296  5.04599761  4.74102564  
4.54345654
   #> 100 rows    8.2778993  6.3507692  5.5512821  3.87919776  3.18816885  
3.65419847
   #> 200 rows    6.9781818  4.6319149 11.3175395  2.39477680  2.22712351  
2.23399873
   #> 300 rows    5.9528875  3.4087948  2.8162523  2.28367392  1.53755051  
1.65800866
   #> 400 rows    4.7578419  3.0028986  2.2602876  2.15348917  1.26760074  
1.21309890
   #> 500 rows    4.1558308  2.5225768  2.2711656  1.41115560  1.05550257  
1.02989052
   #> 1000 rows   2.2786585  1.3790087  3.0056259  0.60250798  0.53179530  
0.53369967
   #> 2000 rows   1.3539916  1.5805147  0.5737926  0.30327838  0.27820840  
0.27057028
   #> 3000 rows   1.1347815  0.5374048  0.3965298  0.20412111  0.19350023  
0.45714431
   #> 4000 rows   0.7417894  0.4128671  3.5819726  0.24726677  0.14699569  
0.14043276
   #> 5000 rows   0.6041413  0.3378337  0.8593773  0.19491538  0.12437216  
0.11456206
   #> 10000 rows  0.3014837  0.1828018  0.1201612  0.02665133  0.05724913  
0.05461478
   #>               300 cols    500 cols
   #> 1 rows     27.20939086 48.20383912
   #> 2 rows     25.13126492 34.15562914
   #> 3 rows     24.11811024 30.89401968
   #> 4 rows     21.79393939 26.18478261
   #> 5 rows     20.94679803 26.48522653
   #> 10 rows    14.96833216 25.12523191
   #> 20 rows    10.51369216 15.84330318
   #> 30 rows     7.43155288 11.73603952
   #> 40 rows     6.62136223 10.43135770
   #> 50 rows     5.99006711  9.25798485
   #> 60 rows     5.04369274  6.14095785
   #> 70 rows     4.75809650  5.70886076
   #> 100 rows    5.00190311  4.54890153
   #> 200 rows    4.50396996  2.68490953
   #> 300 rows    2.99969424  1.89673687
   #> 400 rows    2.34352282  1.48038762
   #> 500 rows    2.03165384  1.20663080
   #> 1000 rows   0.70601711  0.63683243
   #> 2000 rows   0.27909992  0.43289769
   #> 3000 rows   0.18386126  0.20415949
   #> 4000 rows   0.29411463  0.16423265
   #> 5000 rows   0.11312960  0.12045428
   #> 10000 rows  0.05825836  0.06443037
   ```
   
   <sup>Created on 2023-01-10 with [reprex 
v2.0.2](https://reprex.tidyverse.org)</sup>
   
   Using default compression:
   
   ``` r
   # Benchmark: for a grid of data frame sizes, write the same random data as
   # an .RData file (save) and as a Feather file written with write_feather()'s
   # default compression, then compare how long each takes to read.
   tmpdir <- tempfile()
   dir.create(tmpdir)
   
   colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
   rownums <- c(
     1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
     1000, 2000, 3000, 4000, 5000, 10000
   )
   
   # Generate files: one .RData and one Feather file per (rows, cols) pair
   for (colnum in colnums) {
     for (rownum in rownums) {
       fn.robj <- paste0(tmpdir, "/robj.", rownum, "x", colnum)
       fn.arrow <- paste0(tmpdir, "/arrow.", rownum, "x", colnum)
       
       dat <- as.data.frame(
         matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
       )
       save(dat, file = fn.robj)
       arrow::write_feather(x = dat, sink = fn.arrow)
     }
   }
   
   # Elapsed read time in seconds; rows follow `rownums`, columns `colnums`
   times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
   rownames(times.robj) <- paste(rownums, "rows")
   colnames(times.robj) <- paste(colnums, "cols")
   times.arrow <- times.robj
   
   # seq_along() instead of 1:length(): the latter iterates over c(1, 0)
   # when the vector is empty
   for (i in seq_along(rownums)) {
     for (j in seq_along(colnums)) {
       rownum <- rownums[i]
       colnum <- colnums[j]
       fn.robj <- paste0(tmpdir, "/robj.", rownum, "x", colnum)
       fn.arrow <- paste0(tmpdir, "/arrow.", rownum, "x", colnum)
       
       # measure 2nd load to account for cold caches
       load(fn.robj)
       start <- Sys.time()
       load(fn.robj)
       finish <- Sys.time()
       # Pin the unit to seconds: a bare `finish - start` chooses its unit
       # automatically and the unit is lost when stored in a numeric matrix,
       # so a slow read could silently be recorded in minutes.
       times.robj[i, j] <- as.numeric(difftime(finish, start, units = "secs"))
       
       tst <- arrow::read_feather(fn.arrow)
       start <- Sys.time()
       tst <- arrow::read_feather(fn.arrow)
       finish <- Sys.time()
       times.arrow[i, j] <- as.numeric(difftime(finish, start, units = "secs"))
     }
   }
   
   # Ratio > 1 means read_feather() took longer than load() for that size
   times.arrow / times.robj
   #>               10 cols    20 cols    30 cols   100 cols    150 cols    200 
cols
   #> 1 rows     16.9572954 19.6031746 19.4701754 26.5231144 56.01642710 
33.39605735
   #> 2 rows     19.0990991 20.9177489 20.8730769 23.7868481 26.38644689 
45.43119266
   #> 3 rows     21.1547619 19.2469136 21.4253731 21.8924051 24.21588946 
25.88827586
   #> 4 rows     18.4112554 18.8007663 18.4275862 21.3195021 22.97166667 
26.45885635
   #> 5 rows     15.8395522 17.8750000 16.3880597 22.0901804 22.89716841 
26.51943005
   #> 10 rows    15.3061224 13.0547945 14.9222520 16.8244767 34.23970944 
17.79330709
   #> 20 rows    14.3840830 13.6781609 12.3011236 11.2735528 11.33975904 
11.87954111
   #> 30 rows    13.5421687 11.0495283 10.1816514  9.3316370  9.57760314  
9.62248996
   #> 40 rows    12.6453488  9.8964059  9.9819168  7.7601744  8.33240067  
8.37088608
   #> 50 rows    11.8975069 10.1530612 10.3616000  7.4708579  7.31219272  
7.15629522
   #> 60 rows    11.3643836  8.9316081  8.3958991  7.0183366 12.72128146  
6.76300578
   #> 70 rows    11.0265252  9.6686869  8.1184408  6.6577017  6.38455080  
6.92413793
   #> 100 rows   10.3680556  8.0369748  6.4965116  5.1863354  4.83441670  
5.06206362
   #> 200 rows   12.3647059  6.8830275  4.9482612  3.2896631  3.24210312  
3.27877754
   #> 300 rows    5.7400000  4.5351986  3.5697161  2.3988402  2.14011906  
2.06634286
   #> 400 rows    5.0799087  2.9543702  2.8629648  1.7690058  1.72880966  
1.76503533
   #> 500 rows    4.4447884  2.8496770  2.3769231  1.4735886  1.35359428  
1.52543420
   #> 1000 rows   2.7072555  1.5854657  1.3616873  0.7840171  0.76427293  
0.72445101
   #> 2000 rows   1.5208333  0.8911792  0.6701459  0.4350788  0.37124991  
0.37946588
   #> 3000 rows   1.0453862  0.6643997  0.5169999  0.2656266  0.24755968  
0.25853659
   #> 4000 rows   0.8616682  0.4784442  0.4127477  0.2119238  0.19982264  
0.19844568
   #> 5000 rows   0.8958047  0.3799294  0.3235682  0.1832789  0.16097686  
0.16914301
   #> 10000 rows  0.3733628  0.2193108  0.1665289  0.1076588  0.09350925  
0.08932051
   #>               300 cols   500 cols
   #> 1 rows     35.87483176 62.1506196
   #> 2 rows     32.28801843 44.1924342
   #> 3 rows     31.48050459 39.4118098
   #> 4 rows     29.49416755 36.0374823
   #> 5 rows     28.25379171 34.6821192
   #> 10 rows    20.96233383 29.6552511
   #> 20 rows    12.59460738 21.9988169
   #> 30 rows    10.30442541 15.0805057
   #> 40 rows     9.17821473 13.3024585
   #> 50 rows     7.90048940 10.9834538
   #> 60 rows     7.22199747  8.0121655
   #> 70 rows     7.09084699  7.5827408
   #> 100 rows    5.27838565  5.9264278
   #> 200 rows    5.55643482  3.2979336
   #> 300 rows    3.63902649  2.3820292
   #> 400 rows    3.04591480  1.9261239
   #> 500 rows    2.45318492  1.5959291
   #> 1000 rows   1.27772319  0.8132839
   #> 2000 rows   0.70657236  0.4209621
   #> 3000 rows   0.50213646  0.2835666
   #> 4000 rows   0.20044236  0.2147253
   #> 5000 rows   0.14406603  0.1745972
   #> 10000 rows  0.08071889  0.1012044
   ```
   
   <sup>Created on 2023-01-10 with [reprex 
v2.0.2](https://reprex.tidyverse.org)</sup>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to