thisisnic opened a new issue, #39541:
URL: https://github.com/apache/arrow/issues/39541
### Describe the bug, including details regarding any error messages,
version, and platform.
#37658 added the capability for the partition variable to be part of the
schema of a CSV dataset, but it only works when `partitioning` is supplied as
a schema object; it fails when `partitioning` is supplied as a column name.
``` r
library(arrow)
library(dplyr)
tf <- tempfile()
dir.create(tf)
csv_dir <- file.path(tf, "csv_dir")
parquet_dir <- file.path(tf, "parquet_dir")
dir.create(csv_dir)
dir.create(parquet_dir)
write_dataset(group_by(mtcars[1:16,], cyl), csv_dir, format = "csv")
write_dataset(group_by(mtcars[17:32,], cyl), parquet_dir, format = "parquet")
csv_dataset <- open_dataset(csv_dir, format = "csv")
parquet_dataset <- open_dataset(parquet_dir, format = "parquet")
# fails with a CSV parse error: partitioning supplied as a column name
open_dataset(
csv_dir,
format = "csv",
schema = schema(parquet_dataset),
skip_rows = 1,
partitioning = "cyl"
) %>%
collect()
#> Error in `compute.Dataset()`:
#> ! Invalid: Could not open CSV input source
#> '/tmp/RtmpoQQt2q/file25f3a27580770/csv_dir/cyl=4/part-0.csv': Invalid: CSV
#> parse error: Row #2: Expected 11 columns, got 10:
#> 22.8,108,93,3.85,2.32,18.61,1,1,4,1
#> Backtrace:
#> ▆
#> 1. ├─... %>% collect()
#> 2. ├─dplyr::collect(.)
#> 3. └─arrow:::collect.Dataset(.)
#> 4. ├─arrow:::collect.ArrowTabular(compute.Dataset(x), as_data_frame)
#> 5. └─arrow:::compute.Dataset(x)
#> 6. └─base::tryCatch(...)
#> 7. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 8. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 9. └─value[[3L]](cond)
#> 10. └─arrow:::augment_io_error_msg(e, call, schema = schema())
#> 11. └─rlang::abort(msg, call = call)
# this works fine: partitioning supplied as a schema object
open_dataset(
csv_dir,
format = "csv",
schema = schema(parquet_dataset),
skip_rows = 1,
partitioning = schema("cyl" = int32())
) %>%
collect()
#> # A tibble: 16 × 11
#> mpg disp hp drat wt qsec vs am gear carb cyl
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 22.8 108 93 3.85 2.32 18.6 1 1 4 1 4
#> 2 24.4 147. 62 3.69 3.19 20 1 0 4 2 4
#> 3 22.8 141. 95 3.92 3.15 22.9 1 0 4 2 4
#> 4 21 160 110 3.9 2.62 16.5 0 1 4 4 6
#> 5 21 160 110 3.9 2.88 17.0 0 1 4 4 6
#> 6 21.4 258 110 3.08 3.22 19.4 1 0 3 1 6
#> 7 18.1 225 105 2.76 3.46 20.2 1 0 3 1 6
#> 8 19.2 168. 123 3.92 3.44 18.3 1 0 4 4 6
#> 9 17.8 168. 123 3.92 3.44 18.9 1 0 4 4 6
#> 10 18.7 360 175 3.15 3.44 17.0 0 0 3 2 8
#> 11 14.3 360 245 3.21 3.57 15.8 0 0 3 4 8
#> 12 16.4 276. 180 3.07 4.07 17.4 0 0 3 3 8
#> 13 17.3 276. 180 3.07 3.73 17.6 0 0 3 3 8
#> 14 15.2 276. 180 3.07 3.78 18 0 0 3 3 8
#> 15 10.4 472 205 2.93 5.25 18.0 0 0 3 4 8
#> 16 10.4 460 215 3 5.42 17.8 0 0 3 4 8
# with Parquet, both forms of partitioning work
open_dataset(
parquet_dir,
schema = schema(parquet_dataset),
partitioning = "cyl"
) %>%
collect()
#> # A tibble: 16 × 11
#> mpg disp hp drat wt qsec vs am gear carb cyl
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 32.4 78.7 66 4.08 2.2 19.5 1 1 4 1 4
#> 2 30.4 75.7 52 4.93 1.62 18.5 1 1 4 2 4
#> 3 33.9 71.1 65 4.22 1.84 19.9 1 1 4 1 4
#> 4 21.5 120. 97 3.7 2.46 20.0 1 0 3 1 4
#> 5 27.3 79 66 4.08 1.94 18.9 1 1 4 1 4
#> 6 26 120. 91 4.43 2.14 16.7 0 1 5 2 4
#> 7 30.4 95.1 113 3.77 1.51 16.9 1 1 5 2 4
#> 8 21.4 121 109 4.11 2.78 18.6 1 1 4 2 4
#> 9 19.7 145 175 3.62 2.77 15.5 0 1 5 6 6
#> 10 14.7 440 230 3.23 5.34 17.4 0 0 3 4 8
#> 11 15.5 318 150 2.76 3.52 16.9 0 0 3 2 8
#> 12 15.2 304 150 3.15 3.44 17.3 0 0 3 2 8
#> 13 13.3 350 245 3.73 3.84 15.4 0 0 3 4 8
#> 14 19.2 400 175 3.08 3.84 17.0 0 0 3 2 8
#> 15 15.8 351 264 4.22 3.17 14.5 0 1 5 4 8
#> 16 15 301 335 3.54 3.57 14.6 0 1 5 8 8
open_dataset(
parquet_dir,
schema = schema(parquet_dataset),
partitioning = schema("cyl" = int32())
) %>%
collect()
#> # A tibble: 16 × 11
#> mpg disp hp drat wt qsec vs am gear carb cyl
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 32.4 78.7 66 4.08 2.2 19.5 1 1 4 1 4
#> 2 30.4 75.7 52 4.93 1.62 18.5 1 1 4 2 4
#> 3 33.9 71.1 65 4.22 1.84 19.9 1 1 4 1 4
#> 4 21.5 120. 97 3.7 2.46 20.0 1 0 3 1 4
#> 5 27.3 79 66 4.08 1.94 18.9 1 1 4 1 4
#> 6 26 120. 91 4.43 2.14 16.7 0 1 5 2 4
#> 7 30.4 95.1 113 3.77 1.51 16.9 1 1 5 2 4
#> 8 21.4 121 109 4.11 2.78 18.6 1 1 4 2 4
#> 9 19.7 145 175 3.62 2.77 15.5 0 1 5 6 6
#> 10 14.7 440 230 3.23 5.34 17.4 0 0 3 4 8
#> 11 15.5 318 150 2.76 3.52 16.9 0 0 3 2 8
#> 12 15.2 304 150 3.15 3.44 17.3 0 0 3 2 8
#> 13 13.3 350 245 3.73 3.84 15.4 0 0 3 4 8
#> 14 19.2 400 175 3.08 3.84 17.0 0 0 3 2 8
#> 15 15.8 351 264 4.22 3.17 14.5 0 1 5 4 8
#> 16 15 301 335 3.54 3.57 14.6 0 1 5 8 8
```
<sup>Created on 2024-01-09 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
### Component(s)
R
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]