paleolimbot commented on pull request #12030:
URL: https://github.com/apache/arrow/pull/12030#issuecomment-1006058148


   A reprex that might help with testing (since my example is a little 
dinky). The probable use case is reading encodings like latin1, but I also 
checked with a bunch of random latin1 characters:
   
   <details>
   
   ``` r
   # generate a data frame with funky characters
   # Source alphabet: latin1 code points 38-126 and 161-255 without 44 (the
   # comma). Control characters (0-31, 127-159) are therefore skipped, as are
   # 32-37 and 160. The single raw vector is decoded from latin1 into one
   # UTF-8 string.
   latin1_chars <- iconv(
     list(as.raw(c(38:43, 45:126, 161:255))),
     from = "latin1",
     to = "UTF-8"
   )
   
   # Build `n_items` random strings from the characters found in `chars`.
   #
   # chars: character vector; it is split into individual characters that
   #   form the sampling pool.
   # chars_per_item_min, chars_per_item_max: bounds of the uniform draw
   #   that, once rounded, gives each string's length.
   # n_items: number of strings to generate.
   #
   # Returns a character vector of length `n_items`.
   make_text_col <- function(chars,
                             chars_per_item_min = 1, chars_per_item_max = 20,
                             n_items = 20) {
     pool <- unlist(strsplit(chars, ""))
     # One length draw followed by one sampling call per item, in item order.
     vapply(
       seq_len(n_items),
       function(i) {
         item_len <- round(runif(1, chars_per_item_min, chars_per_item_max))
         paste0(sample(pool, item_len, replace = TRUE), collapse = "")
       },
       character(1)
     )
   }
   
   # Fix the RNG seed so the generated text column is reproducible.
   set.seed(1843)
   n_items <- 1e6
   
   # One row per item: a running row number plus a random latin1-derived string.
   df_latin1 <- data.frame(
     n = seq_len(n_items),
     latin1_chars = make_text_col(latin1_chars, n_items = n_items)
   )
   
   # now check the CSV reader
   # Attach arrow; warn.conflicts = FALSE silences the masking warnings.
   library(arrow, warn.conflicts = FALSE)
   
   # make some files
   # Two temporary paths: one for the CSV as written by readr, one for a
   # latin1 re-encoding of that same file.
   tf_latin1_utf8 <- tempfile()
   tf_latin1_latin1 <- tempfile()
   
   readr::write_csv(df_latin1, tf_latin1_utf8)
   # Read the first file back as raw bytes, convert them from UTF-8 to latin1
   # (toRaw = TRUE makes iconv() return a list of raw vectors, hence [[1]]),
   # and write the converted bytes to the second path.
   readr::write_file(
     iconv(list(readr::read_file_raw(tf_latin1_utf8)), "UTF-8", "latin1", toRaw 
= TRUE)[[1]],
     tf_latin1_latin1
   )
   
   
   # Read each file through a CsvTableReader, passing the encoding the file was
   # written in via CsvReadOptions.
   # NOTE(review): `encoding` is presumably the read option this PR adds --
   # confirm against the PR diff.
   fs <- LocalFileSystem$create()
   reader <- CsvTableReader$create(
     fs$OpenInputStream(tf_latin1_utf8),
     read_options = CsvReadOptions$create(encoding = "UTF-8")
   )
   
   df_latin1_from_utf8 <- tibble::as_tibble(reader$Read())
   
   # `reader` is rebound here; the UTF-8 result was already materialised above.
   reader <- CsvTableReader$create(
     fs$OpenInputStream(tf_latin1_latin1),
     read_options = CsvReadOptions$create(encoding = "latin1")
   )
   df_latin1_from_latin1 <- tibble::as_tibble(reader$Read())
   
   # Both reads should yield the same tibble regardless of on-disk encoding.
   identical(df_latin1_from_utf8, df_latin1_from_latin1)
   #> [1] TRUE
   ```
   
   <sup>Created on 2022-01-05 by the [reprex 
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
   
   </details>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to