arnaud-feldmann commented on issue #35180:
URL: https://github.com/apache/arrow/issues/35180#issuecomment-1520854304

   Right now I do that ugly thing to compute weighted medians :
   ``` r
   library(arrow)
   #> Some features are not enabled in this build of Arrow. Run `arrow_info()` 
for more information.
   #> 
   #> Attachement du package : 'arrow'
   #> L'objet suivant est masqué depuis 'package:utils':
   #> 
   #>     timestamp
   library(dplyr)
   #> 
   #> Attachement du package : 'dplyr'
   #> Les objets suivants sont masqués depuis 'package:stats':
   #> 
   #>     filter, lag
   #> Les objets suivants sont masqués depuis 'package:base':
   #> 
   #>     intersect, setdiff, setequal, union
   tf <- tempfile()
   dir.create(tf)
   write_dataset(mtcars, tf, partitioning = "cyl")
   ds <- open_dataset(tf)
   
   # weighted median of gross horspower by weight
   get_median <- function(arrow, var, wt) {
     var <- ensym(var)
     wt <- ensym(wt)
     
     sel <-
       arrow %>%
       filter(! is.na(!! var) & ! is.na(!! wt)) %>%
       mutate(!! var := as.double(!! var),
              !! wt := as.double(!! wt)) %>%
       arrange(!! var)
     sel_wt <- sel %>% pull(!! wt, as_vector = FALSE)
     sel_val <- sel %>% pull(!! var, as_vector = FALSE)
     sel_cum_poids <- call_function("cumulative_sum_checked",sel_wt)
     sel_sum_wt <- Scalar$create(sel_cum_poids[sel_cum_poids$length()])
     sel_val[sel_cum_poids>  sel_sum_wt/2][1L]$as_vector()
   }
   get_median(ds, hp, wt)
   #> [1] 175
   ```
   
   This isn't very pretty. That'd why I searched for a direct binding.
   
   ```
   ds %>%
     mutate(cs = cumsum(wt)) %>% collect()
   #>     mpg  disp  hp drat    wt  qsec vs am gear carb cyl     cs
   #> 1  21.0 160.0 110 3.90 2.620 16.46  0  1    4    4   6  2.620
   #> 2  21.0 160.0 110 3.90 2.875 17.02  0  1    4    4   6  5.495
   #> 3  21.4 258.0 110 3.08 3.215 19.44  1  0    3    1   6  8.710
   #> 4  18.1 225.0 105 2.76 3.460 20.22  1  0    3    1   6 12.170
   #> 5  19.2 167.6 123 3.92 3.440 18.30  1  0    4    4   6 15.610
   #> 6  17.8 167.6 123 3.92 3.440 18.90  1  0    4    4   6 19.050
   #> 7  19.7 145.0 175 3.62 2.770 15.50  0  1    5    6   6 21.820
   #> 8  22.8 108.0  93 3.85 2.320 18.61  1  1    4    1   4  2.320
   #> 9  24.4 146.7  62 3.69 3.190 20.00  1  0    4    2   4  5.510
   #> 10 22.8 140.8  95 3.92 3.150 22.90  1  0    4    2   4  8.660
   #> 11 32.4  78.7  66 4.08 2.200 19.47  1  1    4    1   4 10.860
   #> 12 30.4  75.7  52 4.93 1.615 18.52  1  1    4    2   4 12.475
   #> 13 33.9  71.1  65 4.22 1.835 19.90  1  1    4    1   4 14.310
   #> 14 21.5 120.1  97 3.70 2.465 20.01  1  0    3    1   4 16.775
   #> 15 27.3  79.0  66 4.08 1.935 18.90  1  1    4    1   4 18.710
   #> 16 26.0 120.3  91 4.43 2.140 16.70  0  1    5    2   4 20.850
   #> 17 30.4  95.1 113 3.77 1.513 16.90  1  1    5    2   4 22.363
   #> 18 21.4 121.0 109 4.11 2.780 18.60  1  1    4    2   4 25.143
   #> 19 18.7 360.0 175 3.15 3.440 17.02  0  0    3    2   8  3.440
   #> 20 14.3 360.0 245 3.21 3.570 15.84  0  0    3    4   8  7.010
   #> 21 16.4 275.8 180 3.07 4.070 17.40  0  0    3    3   8 11.080
   #> 22 17.3 275.8 180 3.07 3.730 17.60  0  0    3    3   8 14.810
   #> 23 15.2 275.8 180 3.07 3.780 18.00  0  0    3    3   8 18.590
   #> 24 10.4 472.0 205 2.93 5.250 17.98  0  0    3    4   8 23.840
   #> 25 10.4 460.0 215 3.00 5.424 17.82  0  0    3    4   8 29.264
   #> 26 14.7 440.0 230 3.23 5.345 17.42  0  0    3    4   8 34.609
   #> 27 15.5 318.0 150 2.76 3.520 16.87  0  0    3    2   8 38.129
   #> 28 15.2 304.0 150 3.15 3.435 17.30  0  0    3    2   8 41.564
   #> 29 13.3 350.0 245 3.73 3.840 15.41  0  0    3    4   8 45.404
   #> 30 19.2 400.0 175 3.08 3.845 17.05  0  0    3    2   8 49.249
   #> 31 15.8 351.0 264 4.22 3.170 14.50  0  1    5    4   8 52.419
   #> 32 15.0 301.0 335 3.54 3.570 14.60  0  1    5    8   8 55.989
   ```
   
   So I tried binding cumulative_sum_checked to cumsum, but I realise that, 
even with the UDF helper, things starts again at each chunk, so I guess, as 
cumsum is a array function (not a scalar one), things are more complicated.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to