[I] [R] write_dataset() - supplying partition name as character not schema results in it being missing from dataset [arrow]

via GitHub Tue, 09 Jan 2024 15:42:46 -0800


thisisnic opened a new issue, #39541:
URL: https://github.com/apache/arrow/issues/39541


   ### Describe the bug, including details regarding any error messages, version, and platform.
   
   #37658 added the capability for the partition variable to be part of the CSV
   dataset, but it only works when the partitioning is supplied as a schema
   object, not when it is supplied as a column name (a character string).
   
   ``` r
   library(arrow)
   library(dplyr)
   tf <- tempfile()
   dir.create(tf)
   csv_dir <- file.path(tf, "csv_dir")
   parquet_dir <- file.path(tf, "parquet_dir")
   dir.create(csv_dir)
   dir.create(parquet_dir)
   write_dataset(group_by(mtcars[1:16,], cyl), csv_dir, format = "csv")
   write_dataset(group_by(mtcars[17:32,], cyl), parquet_dir, format = "parquet")
   csv_dataset <- open_dataset(csv_dir, format = "csv")
   parquet_dataset <- open_dataset(parquet_dir, format = "parquet")
   
   # CSV parse error
   open_dataset(
     csv_dir,
     format = "csv",
     schema = schema(parquet_dataset),
     skip_rows = 1,
     partitioning = "cyl"
   ) %>%
     collect()
   #> Error in `compute.Dataset()`:
   #> ! Invalid: Could not open CSV input source '/tmp/RtmpoQQt2q/file25f3a27580770/csv_dir/cyl=4/part-0.csv': Invalid: CSV parse error: Row #2: Expected 11 columns, got 10: 22.8,108,93,3.85,2.32,18.61,1,1,4,1
   #> Backtrace:
   #>      ▆
   #>   1. ├─... %>% collect()
   #>   2. ├─dplyr::collect(.)
   #>   3. └─arrow:::collect.Dataset(.)
   #>   4.   ├─arrow:::collect.ArrowTabular(compute.Dataset(x), as_data_frame)
   #>   5.   └─arrow:::compute.Dataset(x)
   #>   6.     └─base::tryCatch(...)
   #>   7.       └─base (local) tryCatchList(expr, classes, parentenv, handlers)
   #>   8.         └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
   #>   9.           └─value[[3L]](cond)
   #>  10.             └─arrow:::augment_io_error_msg(e, call, schema = schema())
   #>  11.               └─rlang::abort(msg, call = call)
   
   # this works fine
   open_dataset(
     csv_dir,
     format = "csv",
     schema = schema(parquet_dataset),
     skip_rows = 1, 
     partitioning = schema("cyl" = int32())
   ) %>%
     collect()
   #> # A tibble: 16 × 11
   #>      mpg  disp    hp  drat    wt  qsec    vs    am  gear  carb   cyl
   #>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
   #>  1  22.8  108     93  3.85  2.32  18.6     1     1     4     1     4
   #>  2  24.4  147.    62  3.69  3.19  20       1     0     4     2     4
   #>  3  22.8  141.    95  3.92  3.15  22.9     1     0     4     2     4
   #>  4  21    160    110  3.9   2.62  16.5     0     1     4     4     6
   #>  5  21    160    110  3.9   2.88  17.0     0     1     4     4     6
   #>  6  21.4  258    110  3.08  3.22  19.4     1     0     3     1     6
   #>  7  18.1  225    105  2.76  3.46  20.2     1     0     3     1     6
   #>  8  19.2  168.   123  3.92  3.44  18.3     1     0     4     4     6
   #>  9  17.8  168.   123  3.92  3.44  18.9     1     0     4     4     6
   #> 10  18.7  360    175  3.15  3.44  17.0     0     0     3     2     8
   #> 11  14.3  360    245  3.21  3.57  15.8     0     0     3     4     8
   #> 12  16.4  276.   180  3.07  4.07  17.4     0     0     3     3     8
   #> 13  17.3  276.   180  3.07  3.73  17.6     0     0     3     3     8
   #> 14  15.2  276.   180  3.07  3.78  18       0     0     3     3     8
   #> 15  10.4  472    205  2.93  5.25  18.0     0     0     3     4     8
   #> 16  10.4  460    215  3     5.42  17.8     0     0     3     4     8
   
   # both of these work
   open_dataset(
     parquet_dir,
     schema = schema(parquet_dataset),
     partitioning = "cyl"
   ) %>%
     collect()
   #> # A tibble: 16 × 11
   #>      mpg  disp    hp  drat    wt  qsec    vs    am  gear  carb   cyl
   #>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
   #>  1  32.4  78.7    66  4.08  2.2   19.5     1     1     4     1     4
   #>  2  30.4  75.7    52  4.93  1.62  18.5     1     1     4     2     4
   #>  3  33.9  71.1    65  4.22  1.84  19.9     1     1     4     1     4
   #>  4  21.5 120.     97  3.7   2.46  20.0     1     0     3     1     4
   #>  5  27.3  79      66  4.08  1.94  18.9     1     1     4     1     4
   #>  6  26   120.     91  4.43  2.14  16.7     0     1     5     2     4
   #>  7  30.4  95.1   113  3.77  1.51  16.9     1     1     5     2     4
   #>  8  21.4 121     109  4.11  2.78  18.6     1     1     4     2     4
   #>  9  19.7 145     175  3.62  2.77  15.5     0     1     5     6     6
   #> 10  14.7 440     230  3.23  5.34  17.4     0     0     3     4     8
   #> 11  15.5 318     150  2.76  3.52  16.9     0     0     3     2     8
   #> 12  15.2 304     150  3.15  3.44  17.3     0     0     3     2     8
   #> 13  13.3 350     245  3.73  3.84  15.4     0     0     3     4     8
   #> 14  19.2 400     175  3.08  3.84  17.0     0     0     3     2     8
   #> 15  15.8 351     264  4.22  3.17  14.5     0     1     5     4     8
   #> 16  15   301     335  3.54  3.57  14.6     0     1     5     8     8
   
   open_dataset(
     parquet_dir,
     schema = schema(parquet_dataset),
     partitioning = schema("cyl" = int32())
   ) %>%
     collect()
   #> # A tibble: 16 × 11
   #>      mpg  disp    hp  drat    wt  qsec    vs    am  gear  carb   cyl
   #>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
   #>  1  32.4  78.7    66  4.08  2.2   19.5     1     1     4     1     4
   #>  2  30.4  75.7    52  4.93  1.62  18.5     1     1     4     2     4
   #>  3  33.9  71.1    65  4.22  1.84  19.9     1     1     4     1     4
   #>  4  21.5 120.     97  3.7   2.46  20.0     1     0     3     1     4
   #>  5  27.3  79      66  4.08  1.94  18.9     1     1     4     1     4
   #>  6  26   120.     91  4.43  2.14  16.7     0     1     5     2     4
   #>  7  30.4  95.1   113  3.77  1.51  16.9     1     1     5     2     4
   #>  8  21.4 121     109  4.11  2.78  18.6     1     1     4     2     4
   #>  9  19.7 145     175  3.62  2.77  15.5     0     1     5     6     6
   #> 10  14.7 440     230  3.23  5.34  17.4     0     0     3     4     8
   #> 11  15.5 318     150  2.76  3.52  16.9     0     0     3     2     8
   #> 12  15.2 304     150  3.15  3.44  17.3     0     0     3     2     8
   #> 13  13.3 350     245  3.73  3.84  15.4     0     0     3     4     8
   #> 14  19.2 400     175  3.08  3.84  17.0     0     0     3     2     8
   #> 15  15.8 351     264  4.22  3.17  14.5     0     1     5     4     8
   #> 16  15   301     335  3.54  3.57  14.6     0     1     5     8     8
   ```
   
   <sup>Created on 2024-01-09 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
   
   
   ### Component(s)
   
   R


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[I] [R] write_dataset() - supplying partition name as character not schema results in it being missing from dataset [arrow]

Reply via email to