[ 
https://issues.apache.org/jira/browse/ARROW-18219?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Antoine Pitrou updated ARROW-18219:
-----------------------------------
    Description: 
`read_csv_arrow()` incorrectly parses CSV files when a string value contains a 
comma that appears after a backslash-escaped quote mark. Originally noted by 
Thomas Klebel [https://scicomm.xyz/@tklebel/109270436511066953]

This is an example that throws the error:

{code:r}
x <- tempfile()
readr::write_lines(
'
id,text
1,"some text on \\"BLAH\\" and X, and Y also"
', x)

cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#> 
#> id,text
#> 1,"some text on \"BLAH\" and X, and Y also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> Error:
#> ! Invalid: CSV parse error: Expected 2 columns, got 3: 1,"some text on 
\"BLAH\" and X, and Y also"

#> Backtrace:
#> ▆
#> 1. └─arrow (local) `<fn>`(file = x, escape_backslash = TRUE, delim = ",")
#> 2. └─base::tryCatch(...) at r/R/csv.R:217:2
#> 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 5. └─value[[3L]](cond)
#> 6. └─arrow:::augment_io_error_msg(e, call, schema = schema) at 
r/R/csv.R:222:6
#> 7. └─rlang::abort(msg, call = call) at r/R/util.R:251:2
{code}

<sup>Created on 2022-11-02 with [reprex 
v2.0.2|https://reprex.tidyverse.org/]</sup>

This version includes four lines that might potentially error but do not:

{code:r}
x <- tempfile()
readr::write_lines(
'
id,text
2,"some text on X and Y"
3,"some text on X, and Y"
4,"some text on \\"BLAH\\"
5,"some text on X and Y, and \\"BLAH\\" also"
', x)

cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#> 
#> id,text
#> 2,"some text on X and Y"
#> 3,"some text on X, and Y"
#> 4,"some text on \"BLAH\"
#> 5,"some text on X and Y, and \"BLAH\" also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> # A tibble: 4 × 2
#> id text 
#> <int> <chr> 
#> 1 2 "some text on X and Y" 
#> 2 3 "some text on X, and Y" 
#> 3 4 "some text on \\BLAH\\\"" 
#> 4 5 "some text on X and Y, and \\BLAH\\\" also\""
{code}

<sup>Created on 2022-11-02 with [reprex 
v2.0.2|https://reprex.tidyverse.org/]</sup>

I'm not sure if the problem is R specific. I've partially reproduced the error 
using reticulate and pyarrow as follows, but notice that this errors at a 
different point: the pyarrow version appears to fail with the comma preceding 
the backslash-escaped quote mark:

{code:r}
x <- tempfile()
readr::write_lines(
'
id,text
1,"some text on X and Y"
2,"some text on X, and Y"
3,"some text on \\"BLAH\\"
4,"some text on X and Y, and \\"BLAH\\" also"
5,"some text on \\"BLAH\\" and X, and Y also"
', x)

cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#> 
#> id,text
#> 1,"some text on X and Y"
#> 2,"some text on X, and Y"
#> 3,"some text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
#> 5,"some text on \"BLAH\" and X, and Y also"

csv <- reticulate::import("pyarrow.csv")
opt <- csv$ParseOptions(escape_char='\\')
csv$read_csv(x, parse_options = opt)
#> Error in py_call_impl(callable, dots$args, dots$keywords): 
pyarrow.lib.ArrowInvalid: CSV parse error: Expected 2 columns, got 3: 3,"some 
text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
{code}

<sup>Created on 2022-11-02 with [reprex 
v2.0.2|https://reprex.tidyverse.org/]</sup>

  was:
`read_csv_arrow()` incorrectly parses CSV files when a string value contains a 
comma that appears after a backslash-escaped quote mark. Originally noted by 
Thomas Klebel https://scicomm.xyz/@tklebel/109270436511066953

This is an example that throws the error:

``` r
x <- tempfile()
readr::write_lines(
  '
id,text
1,"some text on \\"BLAH\\" and X, and Y also"
', x)

cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#> 
#> id,text
#> 1,"some text on \"BLAH\" and X, and Y also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> Error:
#> ! Invalid: CSV parse error: Expected 2 columns, got 3: 1,"some text on 
\"BLAH\" and X, and Y also"

#> Backtrace:
#>     ▆
#>  1. └─arrow (local) `<fn>`(file = x, escape_backslash = TRUE, delim = ",")
#>  2.   └─base::tryCatch(...) at r/R/csv.R:217:2
#>  3.     └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#>  4.       └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#>  5.         └─value[[3L]](cond)
#>  6.           └─arrow:::augment_io_error_msg(e, call, schema = schema) at 
r/R/csv.R:222:6
#>  7.             └─rlang::abort(msg, call = call) at r/R/util.R:251:2
```

<sup>Created on 2022-11-02 with [reprex 
v2.0.2](https://reprex.tidyverse.org)</sup>

This version includes four lines that might potentially error but do not:

``` r
x <- tempfile()
readr::write_lines(
  '
id,text
2,"some text on X and Y"
3,"some text on X, and Y"
4,"some text on \\"BLAH\\"
5,"some text on X and Y, and \\"BLAH\\" also"
', x)

cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#> 
#> id,text
#> 2,"some text on X and Y"
#> 3,"some text on X, and Y"
#> 4,"some text on \"BLAH\"
#> 5,"some text on X and Y, and \"BLAH\" also"
arrow::read_csv_arrow(x, escape_backslash = TRUE)
#> # A tibble: 4 × 2
#>      id text                                         
#>   <int> <chr>                                        
#> 1     2 "some text on X and Y"                       
#> 2     3 "some text on X, and Y"                      
#> 3     4 "some text on \\BLAH\\\""                    
#> 4     5 "some text on X and Y, and \\BLAH\\\" also\""
```

<sup>Created on 2022-11-02 with [reprex 
v2.0.2](https://reprex.tidyverse.org)</sup>

I'm not sure if the problem is R specific. I've partially reproduced the error 
using reticulate and pyarrow as follows, but notice that this errors at a 
different point: the pyarrow version appears to fail with the comma preceding 
the backslash-escaped quote mark:

``` r
x <- tempfile()
readr::write_lines(
  '
id,text
1,"some text on X and Y"
2,"some text on X, and Y"
3,"some text on \\"BLAH\\"
4,"some text on X and Y, and \\"BLAH\\" also"
5,"some text on \\"BLAH\\" and X, and Y also"
', x)

cat(system(paste('cat', x), intern = TRUE), sep = "\n")
#> 
#> id,text
#> 1,"some text on X and Y"
#> 2,"some text on X, and Y"
#> 3,"some text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
#> 5,"some text on \"BLAH\" and X, and Y also"

csv <- reticulate::import("pyarrow.csv")
opt <- csv$ParseOptions(escape_char='\\')
csv$read_csv(x, parse_options = opt)
#> Error in py_call_impl(callable, dots$args, dots$keywords): 
pyarrow.lib.ArrowInvalid: CSV parse error: Expected 2 columns, got 3: 3,"some 
text on \"BLAH\"
#> 4,"some text on X and Y, and \"BLAH\" also"
```

<sup>Created on 2022-11-02 with [reprex 
v2.0.2](https://reprex.tidyverse.org)</sup>





> [R] read_csv_arrow fails when a string contains a backslash-escaped quote 
> mark followed by a comma
> --------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-18219
>                 URL: https://issues.apache.org/jira/browse/ARROW-18219
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: R
>    Affects Versions: 10.0.0
>            Reporter: Danielle Navarro
>            Priority: Major
>
> `read_csv_arrow()` incorrectly parses CSV files when a string value contains 
> a comma that appears after a backslash-escaped quote mark. Originally noted 
> by Thomas Klebel [https://scicomm.xyz/@tklebel/109270436511066953]
> This is an example that throws the error:
> {code:r}
> x <- tempfile()
> readr::write_lines(
> '
> id,text
> 1,"some text on \\"BLAH
> " and X, and Y also"
> ', x)
> cat(system(paste('cat', x), intern = TRUE), sep = "\n")
> #> 
> #> id,text
> #> 1,"some text on \"BLAH\" and X, and Y also"
> arrow::read_csv_arrow(x, escape_backslash = TRUE)
> #> Error:
> #> ! Invalid: CSV parse error: Expected 2 columns, got 3: 1,"some text on 
> \"BLAH\" and X, and Y also"
> #> Backtrace:
> #> ▆
> #> 1. └─arrow (local) `<fn>`(file = x, escape_backslash = TRUE, delim = ",")
> #> 2. └─base::tryCatch(...) at r/R/csv.R:217:2
> #> 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
> #> 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
> #> 5. └─value[[3L]](cond)
> #> 6. └─arrow:::augment_io_error_msg(e, call, schema = schema) at 
> r/R/csv.R:222:6
> #> 7. └─rlang::abort(msg, call = call) at r/R/util.R:251:2
> {code}
> <sup>Created on 2022-11-02 with [reprex 
> v2.0.2]([https://reprex.tidyverse.org|https://reprex.tidyverse.org/])</sup>
> This version includes four lines that might potentially error but do not:
> {code:r}
> x <- tempfile()
> readr::write_lines(
> '
> id,text
> 2,"some text on X and Y"
> 3,"some text on X, and Y"
> 4,"some text on \\"BLAH
> "
> 5,"some text on X and Y, and \\"BLAH
> " also"
> ', x)
> cat(system(paste('cat', x), intern = TRUE), sep = "\n")
> #> 
> #> id,text
> #> 2,"some text on X and Y"
> #> 3,"some text on X, and Y"
> #> 4,"some text on \"BLAH\"
> #> 5,"some text on X and Y, and \"BLAH\" also"
> arrow::read_csv_arrow(x, escape_backslash = TRUE)
> #> # A tibble: 4 × 2
> #> id text 
> #> <int> <chr> 
> #> 1 2 "some text on X and Y" 
> #> 2 3 "some text on X, and Y" 
> #> 3 4 "some text on \\BLAH\\\"" 
> #> 4 5 "some text on X and Y, and \\BLAH\\\" also\""
> {code}
> <sup>Created on 2022-11-02 with [reprex 
> v2.0.2]([https://reprex.tidyverse.org|https://reprex.tidyverse.org/])</sup>
> I'm not sure if the problem is R specific. I've partially reproduced the 
> error using reticulate and pyarrow as follows, but notice that this errors at 
> a different point: the pyarrow version appears to fail with the comma 
> preceding the backslash-escaped quote mark:
> {code:r}
> x <- tempfile()
> readr::write_lines(
> '
> id,text
> 1,"some text on X and Y"
> 2,"some text on X, and Y"
> 3,"some text on \\"BLAH
> "
> 4,"some text on X and Y, and \\"BLAH
> " also"
> 5,"some text on \\"BLAH
> " and X, and Y also"
> ', x)
> cat(system(paste('cat', x), intern = TRUE), sep = "\n")
> #> 
> #> id,text
> #> 1,"some text on X and Y"
> #> 2,"some text on X, and Y"
> #> 3,"some text on \"BLAH\"
> #> 4,"some text on X and Y, and \"BLAH\" also"
> #> 5,"some text on \"BLAH\" and X, and Y also"
> csv <- reticulate::import("pyarrow.csv")
> opt <- csv$ParseOptions(escape_char='
> ')
> csv$read_csv(x, parse_options = opt)
> #> Error in py_call_impl(callable, dots$args, dots$keywords): 
> pyarrow.lib.ArrowInvalid: CSV parse error: Expected 2 columns, got 3: 3,"some 
> text on \"BLAH\"
> #> 4,"some text on X and Y, and \"BLAH\" also"
> {code}
> <sup>Created on 2022-11-02 with [reprex 
> v2.0.2]([https://reprex.tidyverse.org|https://reprex.tidyverse.org/])</sup>



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to