OfekShilon commented on issue #15271:
URL: https://github.com/apache/arrow/issues/15271#issuecomment-1382862116

   @paleolimbot  Thanks for the suggestion. I'm aware of the overhead of 
metadata from [this 
discussion](https://github.com/apache/arrow/pull/15252#issuecomment-1375760926),
 but there is no metadata to speak of in the files in this example (not even 
row names) - and indeed I don't see any definite win by dropping it:
   
   ```
   # Scratch directory that holds every benchmark input file.
   tmpdir <- tempfile()
   dir.create(tmpdir)
   
   # Table shapes to benchmark: every (rows, cols) combination of these.
   colnums <- c(10, 20, 30, 100, 150, 200)
   rownums <- c(
     1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
     1000, 2000, 3000, 4000, 5000, 10000
   )
   
   # For each shape write three files holding the same random data frame:
   #   robj.<rows>x<cols>          - R save() image
   #   arrow.<rows>x<cols>         - uncompressed feather, schema as converted
   #   arrow.nometa.<rows>x<cols>  - uncompressed feather, schema metadata cleared
   for (colnum in colnums) {
     for (rownum in rownums) {
       shape <- paste0(rownum, "x", colnum)
       fn.robj <- file.path(tmpdir, paste0("robj.", shape))
       fn.arrow <- file.path(tmpdir, paste0("arrow.", shape))
       fn.arrow.nometa <- file.path(tmpdir, paste0("arrow.nometa.", shape))
   
       # Uniform random numbers, reshaped to rownum x colnum.
       vals <- runif(rownum * colnum)
       dat <- as.data.frame(matrix(vals, nrow = rownum, ncol = colnum))
       save(dat, file = fn.robj)
   
       # First feather file: the table exactly as as_arrow_table() builds it.
       dat_table <- arrow::as_arrow_table(dat)
       arrow::write_feather(
         x = dat_table,
         sink = fn.arrow,
         compression = "uncompressed"
       )
   
       # Second feather file: same table, cast to a copy of its schema whose
       # metadata has been set to NULL.
       schema <- dat_table$schema
       schema$metadata <- NULL
       dat_table <- dat_table$cast(schema)
       arrow::write_feather(
         x = dat_table,
         sink = fn.arrow.nometa,
         compression = "uncompressed"
       )
     }
   }
   
   # Result matrices, one cell per (rows, cols) shape. Each cell holds the
   # wall-clock time, in seconds, of a single read of the matching file.
   times.robj <- matrix(
     0,
     nrow = length(rownums),
     ncol = length(colnums),
     dimnames = list(paste(rownums, "rows"), paste(colnums, "cols"))
   )
   times.arrow <- times.robj
   times.arrow.nometa <- times.robj
   
   # seq_along() instead of 1:length(): the latter iterates over c(1, 0) when
   # the vector is empty.
   for (i in seq_along(rownums)) {
     for (j in seq_along(colnums)) {
       rownum <- rownums[i]
       colnum <- colnums[j]
       fn.robj <- paste0(tmpdir, "/robj.", rownum, "x", colnum)
       fn.arrow <- paste0(tmpdir, "/arrow.", rownum, "x", colnum)
       fn.arrow.nometa <- paste0(tmpdir, "/arrow.nometa.", rownum, "x", colnum)
   
       # Every file is read once untimed, then the second read is measured, to
       # account for cold caches.
       #
       # Elapsed time is taken with difftime(..., units = "secs") rather than a
       # bare `Sys.time() - start`: the subtraction picks its unit automatically
       # (secs, mins, hours, ...) and that unit is dropped when the value is
       # stored in a numeric matrix, so cells could silently mix units.
       load(fn.robj)
       start <- Sys.time()
       load(fn.robj)
       elapsed <- difftime(Sys.time(), start, units = "secs")
       times.robj[i, j] <- as.numeric(elapsed)
   
       tst <- arrow::read_feather(fn.arrow)
       start <- Sys.time()
       tst <- arrow::read_feather(fn.arrow, as_data_frame = TRUE, mmap = TRUE)
       elapsed <- difftime(Sys.time(), start, units = "secs")
       times.arrow[i, j] <- as.numeric(elapsed)
   
       tst <- arrow::read_feather(fn.arrow.nometa)
       start <- Sys.time()
       tst <- arrow::read_feather(
         fn.arrow.nometa,
         as_data_frame = TRUE,
         mmap = TRUE
       )
       elapsed <- difftime(Sys.time(), start, units = "secs")
       times.arrow.nometa[i, j] <- as.numeric(elapsed)
     }
   }
   ```
   Gives -
   ``` 
   > times.arrow.nometa / times.robj
                  10 cols    20 cols    30 cols    100 cols    150 cols    200 
cols
   1 rows      70.0114504 58.2468085 46.1319444 21.80683403 11.17302053 
17.57530120
   2 rows      43.8119658 35.5418327 44.6910569 14.93035480 92.98226950 
15.06676136
   3 rows      59.6066351 35.9829060 17.2069672 20.61194030 16.86906710 
15.61495845
   4 rows     236.3318182 44.0948905 31.1320755 16.19062500 18.24731183  
8.38811445
   5 rows      38.0276498 30.7560976 17.5539419 15.84103512 13.33577713 
11.82111801
   10 rows     29.7992278 25.0996785 13.4232082 13.74528302 10.35152838  
8.64968153
   20 rows     25.3423729 19.8398950 17.2170022  9.75327511  7.33414833  
8.22432262
   30 rows     16.1743697 11.8511628 19.6003824  7.47927032  5.79861111  
5.35154017
   40 rows     30.6280992 24.8726236 20.2042105 19.87437811  6.08097028  
4.42742382
   50 rows     29.8060000 22.1587838 18.2661499  3.84229508  6.51073729  
3.17507246
   60 rows     20.1960298 16.0766610 12.6747851  5.38434983  3.86645595 
16.07189542
   70 rows     19.9536585 13.5328597 15.0110345  4.48984526  3.51769231  
2.88766452
   100 rows    17.3659091 11.7341577  7.4166054  4.57267189  3.18088012  
2.27087242
   200 rows    16.2354892 10.2235047  6.3983116  4.09790752  1.63446432  
1.69634703
   300 rows     7.3573854  7.1700787  4.5906849  1.63513514  5.10005897  
1.15804737
   400 rows     6.7309689  4.7252280  4.1573647  1.17293525  0.92025293  
0.95345718
   500 rows     7.0257590  4.1061644  2.8501041  1.03354651  0.70476702  
0.59290404
   1000 rows    5.9288681  3.4319209  1.4538153  0.51121076  0.37171409  
0.34528247
   2000 rows    2.2429879  1.1519025  0.7536303  0.26461660  0.18316701  
0.13906527
   3000 rows    1.4939711  0.8129300  0.5481969  0.17433917  0.13261505  
0.11275994
   4000 rows    1.3325031  0.6623176  0.5343539  0.13499381  0.09493322  
0.07435454
   5000 rows    0.8804653  3.8379121  0.3498308  0.13787353  0.07957967  
0.06604075
   10000 rows   0.7404644  0.2288824  0.1536465  0.06676482  0.04172911  
0.03254246
   ```
   
   My measurements differ from yours, but even yours show robj wins by a wide 
margin for fewer than 2000 rows.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to