paleolimbot commented on pull request #12467:
URL: https://github.com/apache/arrow/pull/12467#issuecomment-1081896525


   ...and I implemented the geoarrow side of this to make sure it will work. It 
does (except for a bit in one of the compute kernels where Concatenate doesn't 
work for extension types). See the details below and 
https://github.com/paleolimbot/geoarrow/pull/7
   
   <details>
   
   ``` r
   # remotes::install_github("apache/arrow#12467")
   # remotes::install_github("paleolimbot/geoarrow@arrow-ext-type")
   library(arrow, warn.conflicts = FALSE)
   library(dplyr, warn.conflicts = FALSE)
   library(geoarrow)
   
   places_folder <- system.file("example_dataset/osm_places", package = 
"geoarrow")
   places <- open_dataset(places_folder)
   places$schema$geometry$type
   #> GeoArrowType
   #> point GEOGCS["WGS 84",DATUM["WGS_...
   places$schema$geometry$type$crs
   #> [1] "GEOGCS[\"WGS 84\",DATUM[\"WGS_1984\",SPHEROID[\"WGS 84\",6378137,298.257223563],AUTHORITY[\"EPSG\",\"6326\"]],PRIMEM[\"Greenwich\",0,AUTHORITY[\"EPSG\",\"8901\"]],UNIT[\"degree\",0.0174532925199433,AUTHORITY[\"EPSG\",\"9122\"]],AXIS[\"Longitude\",EAST],AXIS[\"Latitude\",NORTH]]"
   
   # works!
   Scanner$create(places)$ToTable()
   #> Table
   #> 7255 rows x 6 columns
   #> $osm_id <string>
   #> $code <int32>
   #> $population <double>
   #> $name <string>
   #> $geometry <point GEOGCS["WGS 84",DATUM["WGS_...>
   #> $fclass <string>
   #> 
   #> See $metadata for additional Schema metadata
   
   # works!
   as.data.frame(Scanner$create(places)$ToTable())
   #> # A tibble: 7,255 × 6
   #>    osm_id      code population name           geometry                    
fclass
   #>    <chr>      <int>      <dbl> <chr>          <wk_wkb>                    
<chr> 
   #>  1 21040334    1001      50781 Roskilde       <POINT (12.08192 55.64335)> 
city  
   #>  2 21040360    1001      72398 Esbjerg        <POINT (8.452075 55.46649)> 
city  
   #>  3 26559154    1001      62687 Randers        <POINT (10.03715 56.46175)> 
city  
   #>  4 26559170    1001      60508 Kolding        <POINT (9.47905 55.4895)>   
city  
   #>  5 26559198    1001      56567 Vejle          <POINT (9.533324 55.70001)> 
city  
   #>  6 26559213    1001     273077 Aarhus         <POINT (10.2134 56.14963)>  
city  
   #>  7 26559274    1001     178210 Odense         <POINT (10.38521 55.39972)> 
city  
   #>  8 1368129781  1001      58646 Horsens        <POINT (9.844477 55.86117)> 
city  
   #>  9 2247730880  1001     114194 Aalborg        <POINT (9.921526 57.04626)> 
city  
   #> 10 393558713   1030          0 Englebjerggård <POINT (11.77737 55.2004)>  
farm  
   #> # … with 7,245 more rows
   
   # unfortunately, this fails...
   places %>% 
     filter(population > 100000) %>% 
     select(name, population, fclass, geometry) %>% 
     arrange(desc(population)) %>% 
     collect()
   #> Error in `handle_csv_read_error()` at r/R/dplyr-collect.R:33:6:
   #> ! NotImplemented: concatenation of extension<geoarrow.point>
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/array/concatenate.cc:195  VisitTypeInline(*out_->type, this)
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/array/concatenate.cc:590  ConcatenateImpl(data, pool).Concatenate(&out_data)
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc:2025  Concatenate(values.chunks(), ctx->memory_pool())
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc:2084  TakeCA(*table.column(j), indices, options, ctx)
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/exec/sink_node.cc:375  impl_->DoFinish()
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/exec/exec_plan.cc:484  iterator_.Next()
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:337  ReadNext(&batch)
   #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:351  ToRecordBatches()
   
   # ...unless we unregister the extension type and use geoarrow_collect()
   arrow::unregister_extension_type("geoarrow.point")
   open_dataset(places_folder) %>% 
     filter(population > 100000) %>% 
     select(name, population, fclass, geometry) %>% 
     arrange(desc(population)) %>% 
     geoarrow_collect()
   #> # A tibble: 5 × 4
   #>   name          population fclass           geometry                   
   #>   <chr>              <dbl> <chr>            <wk_wkb>                   
   #> 1 København         613288 national_capital <POINT (12.57007 55.68672)>
   #> 2 Aarhus            273077 city             <POINT (10.2134 56.14963)> 
   #> 3 Odense            178210 city             <POINT (10.38521 55.39972)>
   #> 4 Aalborg           114194 city             <POINT (9.921526 57.04626)>
   #> 5 Frederiksberg     102029 suburb           <POINT (12.53262 55.67802)>
   ```
   
   <sup>Created on 2022-03-29 by the [reprex 
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
   
   </details>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to