[
https://issues.apache.org/jira/browse/ARROW-18219?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Antoine Pitrou updated ARROW-18219:
-----------------------------------
Description:
`read_csv_arrow()` incorrectly parses CSV files when a string value contains a
comma that appears after a backslash-escaped quote mark. Originally noted by
Thomas Klebel [https://scicomm.xyz/@tklebel/109270436511066953]
This is an example that throws the error:
{code:r}
x <- tempfile()
readr::write_lines(
'
id,text
1,"some text on \\"BLAH\\" and X, and Y also"
', x)
cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#>
#> id,text
#> 1,"some text on \"BLAH\" and X, and Y also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> Error:
#> ! Invalid: CSV parse error: Expected 2 columns, got 3: 1,"some text on
\"BLAH\" and X, and Y also"
#> Backtrace:
#> ▆
#> 1. └─arrow (local) `<fn>`(file = x, escape_backslash = TRUE, delim = ",")
#> 2. └─base::tryCatch(...) at r/R/csv.R:217:2
#> 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 5. └─value[[3L]](cond)
#> 6. └─arrow:::augment_io_error_msg(e, call, schema = schema) at
r/R/csv.R:222:6
#> 7. └─rlang::abort(msg, call = call) at r/R/util.R:251:2
{code}
<sup>Created on 2022-11-02 with [reprex v2.0.2|https://reprex.tidyverse.org/]</sup>
This version includes four lines that might potentially error but do not:
{code:r}
x <- tempfile()
readr::write_lines(
'
id,text
2,"some text on X and Y"
3,"some text on X, and Y"
4,"some text on \\"BLAH\\"
5,"some text on X and Y, and \\"BLAH\\" also"
', x)
cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#>
#> id,text
#> 2,"some text on X and Y"
#> 3,"some text on X, and Y"
#> 4,"some text on \"BLAH\"
#> 5,"some text on X and Y, and \"BLAH\" also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> # A tibble: 4 × 2
#> id text
#> <int> <chr>
#> 1 2 "some text on X and Y"
#> 2 3 "some text on X, and Y"
#> 3 4 "some text on \\BLAH\\\""
#> 4 5 "some text on X and Y, and \\BLAH\\\" also\""
{code}
<sup>Created on 2022-11-02 with [reprex v2.0.2|https://reprex.tidyverse.org/]</sup>
I'm not sure if the problem is R specific. I've partially reproduced the error
using reticulate and pyarrow as follows, but notice that this errors at a
different point: the pyarrow version appears to fail with the comma preceding
the backslash-escaped quote mark:
{code:r}
x <- tempfile()
readr::write_lines(
'
id,text
1,"some text on X and Y"
2,"some text on X, and Y"
3,"some text on \\"BLAH\\"
4,"some text on X and Y, and \\"BLAH\\" also"
5,"some text on \\"BLAH\\" and X, and Y also"
', x)
cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#>
#> id,text
#> 1,"some text on X and Y"
#> 2,"some text on X, and Y"
#> 3,"some text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
#> 5,"some text on \"BLAH\" and X, and Y also"
csv <- reticulate::import("pyarrow.csv")
opt <- csv$ParseOptions(escape_char='\\')
csv$read_csv(x, parse_options = opt)
#> Error in py_call_impl(callable, dots$args, dots$keywords):
pyarrow.lib.ArrowInvalid: CSV parse error: Expected 2 columns, got 3: 3,"some
text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
{code}
<sup>Created on 2022-11-02 with [reprex v2.0.2|https://reprex.tidyverse.org/]</sup>
was:
`read_csv_arrow()` incorrectly parses CSV files when a string value contains a
comma that appears after a backslash-escaped quote mark. Originally noted by
Thomas Klebel https://scicomm.xyz/@tklebel/109270436511066953
This is an example that throws the error:
``` r
x <- tempfile()
readr::write_lines(
'
id,text
1,"some text on \\"BLAH\\" and X, and Y also"
', x)
cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#>
#> id,text
#> 1,"some text on \"BLAH\" and X, and Y also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> Error:
#> ! Invalid: CSV parse error: Expected 2 columns, got 3: 1,"some text on
\"BLAH\" and X, and Y also"
#> Backtrace:
#> ▆
#> 1. └─arrow (local) `<fn>`(file = x, escape_backslash = TRUE, delim = ",")
#> 2. └─base::tryCatch(...) at r/R/csv.R:217:2
#> 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 5. └─value[[3L]](cond)
#> 6. └─arrow:::augment_io_error_msg(e, call, schema = schema) at
r/R/csv.R:222:6
#> 7. └─rlang::abort(msg, call = call) at r/R/util.R:251:2
```
<sup>Created on 2022-11-02 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
This version includes four lines that might potentially error but do not:
``` r
x <- tempfile()
readr::write_lines(
'
id,text
2,"some text on X and Y"
3,"some text on X, and Y"
4,"some text on \\"BLAH\\"
5,"some text on X and Y, and \\"BLAH\\" also"
', x)
cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#>
#> id,text
#> 2,"some text on X and Y"
#> 3,"some text on X, and Y"
#> 4,"some text on \"BLAH\"
#> 5,"some text on X and Y, and \"BLAH\" also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> # A tibble: 4 × 2
#> id text
#> <int> <chr>
#> 1 2 "some text on X and Y"
#> 2 3 "some text on X, and Y"
#> 3 4 "some text on \\BLAH\\\""
#> 4 5 "some text on X and Y, and \\BLAH\\\" also\""
```
<sup>Created on 2022-11-02 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
I'm not sure if the problem is R specific. I've partially reproduced the error
using reticulate and pyarrow as follows, but notice that this errors at a
different point: the pyarrow version appears to fail with the comma preceding
the backslash-escaped quote mark:
``` r
x <- tempfile()
readr::write_lines(
'
id,text
1,"some text on X and Y"
2,"some text on X, and Y"
3,"some text on \\"BLAH\\"
4,"some text on X and Y, and \\"BLAH\\" also"
5,"some text on \\"BLAH\\" and X, and Y also"
', x)
cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#>
#> id,text
#> 1,"some text on X and Y"
#> 2,"some text on X, and Y"
#> 3,"some text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
#> 5,"some text on \"BLAH\" and X, and Y also"
csv <- reticulate::import("pyarrow.csv")
opt <- csv$ParseOptions(escape_char='\\')
csv$read_csv(x, parse_options = opt)
#> Error in py_call_impl(callable, dots$args, dots$keywords):
pyarrow.lib.ArrowInvalid: CSV parse error: Expected 2 columns, got 3: 3,"some
text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
```
<sup>Created on 2022-11-02 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
> [R] read_csv_arrow fails when a string contains a backslash-escaped quote
> mark followed by a comma
> --------------------------------------------------------------------------------------------------
>
> Key: ARROW-18219
> URL: https://issues.apache.org/jira/browse/ARROW-18219
> Project: Apache Arrow
> Issue Type: Bug
> Components: R
> Affects Versions: 10.0.0
> Reporter: Danielle Navarro
> Priority: Major
>
> `read_csv_arrow()` incorrectly parses CSV files when a string value contains
> a comma that appears after a backslash-escaped quote mark. Originally noted
> by Thomas Klebel [https://scicomm.xyz/@tklebel/109270436511066953]
> This is an example that throws the error:
> {code:r}
> x <- tempfile()
> readr::write_lines(
> '
> id,text
> 1,"some text on \\"BLAH\\" and X, and Y also"
> ', x)
> cat(system(paste('cat', x), intern = TRUE), sep = "\n")
> #>
> #> id,text
> #> 1,"some text on \"BLAH\" and X, and Y also"
> arrow::read_csv_arrow(x, escape_backslash = TRUE)
> #> Error:
> #> ! Invalid: CSV parse error: Expected 2 columns, got 3: 1,"some text on
> \"BLAH\" and X, and Y also"
> #> Backtrace:
> #> ▆
> #> 1. └─arrow (local) `<fn>`(file = x, escape_backslash = TRUE, delim = ",")
> #> 2. └─base::tryCatch(...) at r/R/csv.R:217:2
> #> 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
> #> 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
> #> 5. └─value[[3L]](cond)
> #> 6. └─arrow:::augment_io_error_msg(e, call, schema = schema) at
> r/R/csv.R:222:6
> #> 7. └─rlang::abort(msg, call = call) at r/R/util.R:251:2
> {code}
> <sup>Created on 2022-11-02 with [reprex v2.0.2|https://reprex.tidyverse.org/]</sup>
> This version includes four lines that might potentially error but do not:
> {code:r}
> x <- tempfile()
> readr::write_lines(
> '
> id,text
> 2,"some text on X and Y"
> 3,"some text on X, and Y"
> 4,"some text on \\"BLAH\\"
> 5,"some text on X and Y, and \\"BLAH\\" also"
> ', x)
> cat(system(paste('cat', x), intern = TRUE), sep = "\n")
> #>
> #> id,text
> #> 2,"some text on X and Y"
> #> 3,"some text on X, and Y"
> #> 4,"some text on \"BLAH\"
> #> 5,"some text on X and Y, and \"BLAH\" also"
> arrow::read_csv_arrow(x, escape_backslash = TRUE)
> #> # A tibble: 4 × 2
> #> id text
> #> <int> <chr>
> #> 1 2 "some text on X and Y"
> #> 2 3 "some text on X, and Y"
> #> 3 4 "some text on \\BLAH\\\""
> #> 4 5 "some text on X and Y, and \\BLAH\\\" also\""
> {code}
> <sup>Created on 2022-11-02 with [reprex v2.0.2|https://reprex.tidyverse.org/]</sup>
> I'm not sure if the problem is R specific. I've partially reproduced the
> error using reticulate and pyarrow as follows, but notice that this errors at
> a different point: the pyarrow version appears to fail with the comma
> preceding the backslash-escaped quote mark:
> {code:r}
> x <- tempfile()
> readr::write_lines(
> '
> id,text
> 1,"some text on X and Y"
> 2,"some text on X, and Y"
> 3,"some text on \\"BLAH\\"
> 4,"some text on X and Y, and \\"BLAH\\" also"
> 5,"some text on \\"BLAH\\" and X, and Y also"
> ', x)
> cat(system(paste('cat', x), intern = TRUE), sep = "\n")
> #>
> #> id,text
> #> 1,"some text on X and Y"
> #> 2,"some text on X, and Y"
> #> 3,"some text on \"BLAH\"
> #> 4,"some text on X and Y, and \"BLAH\" also"
> #> 5,"some text on \"BLAH\" and X, and Y also"
> csv <- reticulate::import("pyarrow.csv")
> opt <- csv$ParseOptions(escape_char='\\')
> csv$read_csv(x, parse_options = opt)
> #> Error in py_call_impl(callable, dots$args, dots$keywords):
> pyarrow.lib.ArrowInvalid: CSV parse error: Expected 2 columns, got 3: 3,"some
> text on \"BLAH\"
> #> 4,"some text on X and Y, and \"BLAH\" also"
> {code}
> <sup>Created on 2022-11-02 with [reprex v2.0.2|https://reprex.tidyverse.org/]</sup>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)