paleolimbot commented on issue #15271:
URL: https://github.com/apache/arrow/issues/15271#issuecomment-1377666643

   Ok! It seems like the problem is metadata. On write, we store some R 
metadata in the schema and use it to restore R-specific details when we 
recreate the data frame on read. Most of the time that metadata is unused, but 
because applying it involves an R-level loop, we see performance issues.
   
   If you write the file without the R metadata, it looks like reading it is 
much faster (but definitely test locally to confirm!).
   
   If this works for you, we could add a flag to disable writing the R metadata 
(or disable loading it).
   
   ``` r
   # Scratch directory that holds every benchmark file.
   tmpdir <- tempfile()
   dir.create(tmpdir)
   
   # Grid of data frame shapes to benchmark.
   colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
   rownums <- c(
     1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
     1000, 2000, 3000, 4000, 5000, 10000
   )
   
   # Write every shape twice: once with save() and once as an uncompressed
   # Feather file whose schema metadata has been removed.
   for (colnum in colnums) {
     for (rownum in rownums) {
       shape <- paste0(rownum, "x", colnum)
       fn.robj <- paste0(tmpdir, "/robj.", shape)
       fn.arrow <- paste0(tmpdir, "/arrow.", shape)
       
       dat <- as.data.frame(
         matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
       )
       save(dat, file = fn.robj)
       
       # Build the table by hand so the schema metadata can be cleared
       # before the file is written.
       with_meta <- arrow::as_arrow_table(dat)
       bare_schema <- with_meta$schema
       bare_schema$metadata <- NULL
       
       arrow::write_feather(
         x = with_meta$cast(bare_schema),
         sink = fn.arrow,
         compression = "uncompressed"
       )
     }
   }
   
   # Timing matrices, in seconds: one row per row count, one column per
   # column count.
   times.robj <- matrix(
     0,
     nrow = length(rownums),
     ncol = length(colnums),
     dimnames = list(paste(rownums, "rows"), paste(colnums, "cols"))
   )
   times.arrow <- times.robj
   
   # seq_along() (rather than 1:length()) iterates zero times if either
   # grid is empty.
   for (i in seq_along(rownums)) {
     for (j in seq_along(colnums)) {
       rownum <- rownums[i]
       colnum <- colnums[j]
       fn.robj <- paste0(tmpdir, "/robj.", rownum, "x", colnum)
       fn.arrow <- paste0(tmpdir, "/arrow.", rownum, "x", colnum)
       
       # measure 2nd load to account for cold caches
       load(fn.robj)
       start <- Sys.time()
       load(fn.robj)
       # Fix the unit explicitly: `Sys.time() - start` picks its unit
       # automatically, and the unit is lost once the value is stored in a
       # numeric matrix, so a slow read could be recorded in minutes.
       times.robj[i, j] <- as.numeric(
         difftime(Sys.time(), start, units = "secs")
       )
       
       # Same warm-up read, then the timed read, for the Feather file.
       tst <- arrow::read_feather(fn.arrow)
       start <- Sys.time()
       tst <- arrow::read_feather(fn.arrow, as_data_frame = TRUE, mmap = TRUE)
       times.arrow[i, j] <- as.numeric(
         difftime(Sys.time(), start, units = "secs")
       )
     }
   }
   
   # Ratio of Feather read time to load() time; values below 1 mean the
   # Feather read was faster.
   times.arrow / times.robj
   #>               10 cols    20 cols     30 cols    100 cols   150 cols    
200 cols
   #> 1 rows     14.0952381 12.2730769 14.97500000  9.20437956 8.75479744  
8.13718412
   #> 2 rows     16.0696517 14.9234234 14.28278689  9.90533981 8.56250000  
8.56160000
   #> 3 rows     13.7713004 13.4891775 11.53790614  8.57407407 8.35842294  
7.07703704
   #> 4 rows     14.7380952 95.9319149 11.25517241  8.21645022 6.92554992  
6.31000000
   #> 5 rows     14.4626168 12.9609375 11.72664360  7.66060606 6.76986755  
5.90463576
   #> 10 rows    12.3790323 10.7172414  9.49712644  6.16776316 5.58681876  
4.66462793
   #> 20 rows    11.2867647 62.1293103  7.77804296  4.57604790 3.61700263  
3.06232877
   #> 30 rows    10.3590604  8.4000000  7.05376344  3.62488129 2.55404571 
11.87710970
   #> 40 rows     9.9206349  7.3310185  6.19379845  3.17002417 2.39525463  
1.98561465
   #> 50 rows    11.6686567  6.9299781  5.71708185  2.71903751 2.14587738  
1.81172220
   #> 60 rows     8.8262032  6.7301255  5.47731092 16.19293478 1.97486961  
1.62612613
   #> 70 rows     8.8347339  6.4109312  5.19554849  2.44055069 1.78809932  
1.56611431
   #> 100 rows    7.7412935  5.3079526  4.49799197  2.03780242 1.44817927  
1.29230357
   #> 200 rows    6.7373358  3.8204819  3.00714286  1.16359795 0.87507926  
0.72829531
   #> 300 rows    4.9736842  2.9963603  2.74172185  0.85562541 0.63074822  
0.51833064
   #> 400 rows    3.9795134  2.4449307  1.77052632  0.82852432 0.54286035  
0.40358784
   #> 500 rows    3.4116356  2.0236613  1.46481876  0.55421516 0.40433317  
1.58486533
   #> 1000 rows   1.9754717  1.1283404  0.85135779  0.28743853 0.21571464  
0.17574634
   #> 2000 rows   1.1457113  0.5982890  0.41544440  0.15338311 0.10889462  
0.09171419
   #> 3000 rows   0.7994512  0.4206546  0.28310156  0.10914713 0.07384605  
0.06096175
   #> 4000 rows   0.6511236  0.3175360  0.23418670  0.07628486 0.05940748  
0.04918424
   #> 5000 rows   0.4762331  0.2692693  0.17628306  0.07026943 0.04680908  
0.03953978
   #> 10000 rows  0.2431953  0.1263146  0.08880676  0.03294864 0.02410858  
0.02036180
   #>              300 cols   500 cols
   #> 1 rows     7.25616438 6.53256705
   #> 2 rows     6.45868263 5.33515199
   #> 3 rows     6.25084364 5.28482972
   #> 4 rows     5.76898396 4.93076374
   #> 5 rows     5.70020121 4.40679095
   #> 10 rows    3.84901532 3.18668529
   #> 20 rows    2.55096154 2.00625000
   #> 30 rows    1.87665830 1.49675397
   #> 40 rows    1.68254466 1.29062263
   #> 50 rows    1.72490914 1.25016578
   #> 60 rows    1.38690327 1.04475309
   #> 70 rows    1.43247588 0.94965370
   #> 100 rows   0.96589063 0.79135421
   #> 200 rows   0.58168371 0.45303118
   #> 300 rows   0.42742552 0.32061561
   #> 400 rows   0.31737134 0.24586616
   #> 500 rows   0.25752199 0.20255645
   #> 1000 rows  0.55231620 0.10501935
   #> 2000 rows  0.07068172 0.05902058
   #> 3000 rows  0.04852037 0.13285303
   #> 4000 rows  0.03674992 0.02832831
   #> 5000 rows  0.01554990 0.02325099
   #> 10000 rows 0.01585784 0.01236076
   ```
   
   <sup>Created on 2023-01-10 with [reprex 
v2.0.2](https://reprex.tidyverse.org)</sup>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to