This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to branch maint-9.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 107163fec888e36a2d576d1f992f0e6f41ef7ad1 Author: Neal Richardson <[email protected]> AuthorDate: Wed Jul 27 07:02:05 2022 -0400 ARROW-16612: [R] Fix compression inference from filename (#13625) This is actually a much larger change than the original issue. * ~Infer compression from the file extension in `write_parquet()` and pass it to ParquetFileWriter rather than write to a CompressedOutputStream, and don't wrap the file in a CompressedInputStream in `read_parquet()` because that doesn't work (and isn't how compression works for Parquet). Previously, reading from a file with extension `.parquet.gz` etc. would error unless you opened an input stream yourself. This is the original report from ARROW-16612.~ Cut and moved to [ARROW-17221](http [...] * Likewise for `read_feather()` and `write_feather()`, which also support compression within the file itself and not around it. * Since the whole "detect compression and wrap in a compressed stream" feature seems limited to CSV and JSON, and in making the changes here I was having to hack around that feature, I refactored to pull it out of the internal functions `make_readable_file()` and `make_output_stream()` and do it only in the csv/json functions. * In the process of refactoring, I noticed and fixed two bugs: (1) no matter what compression extension you provided to `make_output_stream()`, you would get a gzip-compressed stream because we weren't actually passing the codec to `CompressedOutputStream$create()`; (2) `.lz4` actually needs to be mapped to the "lz4_frame" codec; attempting to write a CSV to a `CompressedOutputStream$create(codec = "lz4")` raises an error. Neither was caught because our tests for this feature only te [...] * The refactoring should also mean that ARROW-16619 (inferring compression from URL), as well as from SubTreeFileSystem (S3 buckets etc.), is also supported. 
Authored-by: Neal Richardson <[email protected]> Signed-off-by: Neal Richardson <[email protected]> --- r/R/csv.R | 40 +++++++++++--------- r/R/feather.R | 21 +++++++---- r/R/io.R | 76 ++++++++++++-------------------------- r/R/ipc-stream.R | 10 ----- r/R/json.R | 5 +++ r/R/parquet.R | 9 +++++ r/man/make_readable_file.Rd | 11 +----- r/man/read_feather.Rd | 6 +-- r/man/read_ipc_stream.Rd | 6 --- r/man/write_feather.Rd | 9 +++-- r/man/write_ipc_stream.Rd | 6 --- r/tests/testthat/test-compressed.R | 8 ++++ r/tests/testthat/test-csv.R | 25 ++++++++++++- r/tests/testthat/test-feather.R | 16 ++++++++ r/tests/testthat/test-parquet.R | 16 ++++++++ 15 files changed, 145 insertions(+), 119 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 32ed0e4bee..6adbb40219 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -188,7 +188,12 @@ read_delim_arrow <- function(file, } if (!inherits(file, "InputStream")) { + compression <- detect_compression(file) file <- make_readable_file(file) + if (compression != "uncompressed") { + # TODO: accept compression and compression_level as args + file <- CompressedInputStream$create(file, compression) + } on.exit(file$close()) } reader <- CsvTableReader$create( @@ -699,7 +704,6 @@ write_csv_arrow <- function(x, ) } - # default values are considered missing by base R if (missing(include_header) && !missing(col_names)) { include_header <- col_names } @@ -712,16 +716,27 @@ write_csv_arrow <- function(x, } x_out <- x - if (is.data.frame(x)) { - x <- Table$create(x) - } - - if (inherits(x, c("Dataset", "arrow_dplyr_query"))) { - x <- Scanner$create(x)$ToRecordBatchReader() + if (!inherits(x, "ArrowTabular")) { + tryCatch( + x <- as_record_batch_reader(x), + error = function(e) { + abort( + paste0( + "x must be an object of class 'data.frame', 'RecordBatch', ", + "'Dataset', 'Table', or 'RecordBatchReader' not '", class(x)[1], "'." 
+ ) + ) + } + ) } if (!inherits(sink, "OutputStream")) { + compression <- detect_compression(sink) sink <- make_output_stream(sink) + if (compression != "uncompressed") { + # TODO: accept compression and compression_level as args + sink <- CompressedOutputStream$create(sink, codec = compression) + } on.exit(sink$close()) } @@ -731,17 +746,6 @@ write_csv_arrow <- function(x, csv___WriteCSV__Table(x, write_options, sink) } else if (inherits(x, c("RecordBatchReader"))) { csv___WriteCSV__RecordBatchReader(x, write_options, sink) - } else { - abort( - c( - paste0( - paste( - "x must be an object of class 'data.frame', 'RecordBatch',", - "'Dataset', 'Table', or 'RecordBatchReader' not '" - ), class(x)[[1]], "'." - ) - ) - ) } invisible(x_out) diff --git a/r/R/feather.R b/r/R/feather.R index 03c8a7b5f0..4e2e9947cb 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -38,8 +38,9 @@ #' @param compression Name of compression codec to use, if any. Default is #' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise #' "uncompressed". "zstd" is the other available codec and generally has better -#' compression ratios in exchange for slower read and write performance -#' See [codec_is_available()]. This option is not supported for V1. +#' compression ratios in exchange for slower read and write performance. +#' "lz4" is shorthand for the "lz4_frame" codec. +#' See [codec_is_available()] for details. This option is not supported for V1. #' @param compression_level If `compression` is "zstd", you may #' specify an integer compression level. If omitted, the compression codec's #' default compression level is used. 
@@ -67,11 +68,13 @@ write_feather <- function(x, sink, version = 2, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL) { # Handle and validate options before touching data version <- as.integer(version) assert_that(version %in% 1:2) + + # TODO(ARROW-17221): if (missing(compression)), we could detect_compression(sink) here compression <- match.arg(compression) chunk_size <- as.integer(chunk_size) assert_that(chunk_size > 0) @@ -128,7 +131,7 @@ write_feather <- function(x, write_ipc_file <- function(x, sink, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL) { mc <- match.call() mc$version <- 2 @@ -147,7 +150,7 @@ write_ipc_file <- function(x, #' #' @inheritParams read_ipc_stream #' @inheritParams read_delim_arrow -#' @param ... additional parameters, passed to [make_readable_file()]. +#' @inheritParams make_readable_file #' #' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an #' Arrow [Table] otherwise @@ -163,9 +166,13 @@ write_ipc_file <- function(x, #' dim(df) #' # Can select columns #' df <- read_feather(tf, col_select = starts_with("d")) -read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { +read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) { if (!inherits(file, "RandomAccessFile")) { - file <- make_readable_file(file, ...) + # Compression is handled inside the IPC file format, so we don't need + # to detect from the file extension and wrap in a CompressedInputStream + # TODO: Why is this the only read_format() functions that allows passing + # mmap to make_readable_file? 
+ file <- make_readable_file(file, mmap) on.exit(file$close()) } reader <- FeatherReader$create(file) diff --git a/r/R/io.R b/r/R/io.R index 82e3847df5..fc664ed386 100644 --- a/r/R/io.R +++ b/r/R/io.R @@ -229,52 +229,31 @@ mmap_open <- function(path, mode = c("read", "write", "readwrite")) { #' Handle a range of possible input sources #' @param file A character file name, `raw` vector, or an Arrow input stream #' @param mmap Logical: whether to memory-map the file (default `TRUE`) -#' @param compression If the file is compressed, created a [CompressedInputStream] -#' with this compression codec, either a [Codec] or the string name of one. -#' If `NULL` (default) and `file` is a string file name, the function will try -#' to infer compression from the file extension. -#' @param filesystem If not `NULL`, `file` will be opened via the -#' `filesystem$OpenInputFile()` filesystem method, rather than the `io` module's -#' `MemoryMappedFile` or `ReadableFile` constructors. #' @return An `InputStream` or a subclass of one. #' @keywords internal -make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem = NULL) { +make_readable_file <- function(file, mmap = TRUE) { if (inherits(file, "SubTreeFileSystem")) { filesystem <- file$base_fs - # SubTreeFileSystem adds a slash to base_path, but filesystems will reject file names - # with trailing slashes, so we need to remove it here. - file <- sub("/$", "", file$base_path) - } - if (is.string(file)) { + # SubTreeFileSystem adds a slash to base_path, but filesystems will reject + # file names with trailing slashes, so we need to remove it here. 
+ path <- sub("/$", "", file$base_path) + file <- filesystem$OpenInputFile(path) + } else if (is.string(file)) { if (is_url(file)) { file <- tryCatch( { fs_and_path <- FileSystem$from_uri(file) - filesystem <- fs_and_path$fs - fs_and_path$path + fs_and_path$fs$OpenInputFile(fs_and_path$path) }, error = function(e) { MakeRConnectionInputStream(url(file, open = "rb")) } ) - } - - if (is.null(compression)) { - # Infer compression from the file path - compression <- detect_compression(file) - } - - if (!is.null(filesystem)) { - file <- filesystem$OpenInputFile(file) - } else if (is.string(file) && isTRUE(mmap)) { + } else if (isTRUE(mmap)) { file <- mmap_open(file) - } else if (is.string(file)) { + } else { file <- ReadableFile$create(file) } - - if (is_compressed(compression)) { - file <- CompressedInputStream$create(file, compression) - } } else if (inherits(file, c("raw", "Buffer"))) { file <- BufferReader$create(file) } else if (inherits(file, "connection")) { @@ -294,7 +273,7 @@ make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem file } -make_output_stream <- function(x, filesystem = NULL, compression = NULL) { +make_output_stream <- function(x) { if (inherits(x, "connection")) { if (!isOpen(x)) { open(x, "wb") @@ -305,45 +284,36 @@ make_output_stream <- function(x, filesystem = NULL, compression = NULL) { if (inherits(x, "SubTreeFileSystem")) { filesystem <- x$base_fs - # SubTreeFileSystem adds a slash to base_path, but filesystems will reject file names - # with trailing slashes, so we need to remove it here. - x <- sub("/$", "", x$base_path) + # SubTreeFileSystem adds a slash to base_path, but filesystems will reject + # file names with trailing slashes, so we need to remove it here. 
+ path <- sub("/$", "", x$base_path) + filesystem$OpenOutputStream(path) } else if (is_url(x)) { fs_and_path <- FileSystem$from_uri(x) - filesystem <- fs_and_path$fs - x <- fs_and_path$path - } - - if (is.null(compression)) { - # Infer compression from sink - compression <- detect_compression(x) - } - - assert_that(is.string(x)) - if (is.null(filesystem) && is_compressed(compression)) { - CompressedOutputStream$create(x) ## compressed local - } else if (is.null(filesystem) && !is_compressed(compression)) { - FileOutputStream$create(x) ## uncompressed local - } else if (!is.null(filesystem) && is_compressed(compression)) { - CompressedOutputStream$create(filesystem$OpenOutputStream(x)) ## compressed remote + fs_and_path$fs$OpenOutputStream(fs_and_path$path) } else { - filesystem$OpenOutputStream(x) ## uncompressed remote + assert_that(is.string(x)) + FileOutputStream$create(x) } } detect_compression <- function(path) { + if (inherits(path, "SubTreeFileSystem")) { + path <- path$base_path + } if (!is.string(path)) { return("uncompressed") } - # Remove any trailing slashes, which FileSystem$from_uri may add + # Remove any trailing slashes, which SubTreeFileSystem may add path <- sub("/$", "", path) switch(tools::file_ext(path), bz2 = "bz2", gz = "gzip", - lz4 = "lz4", + lz4 = "lz4_frame", zst = "zstd", + snappy = "snappy", "uncompressed" ) } diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index 9fea0f9e52..dd59d0f4df 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -23,11 +23,6 @@ #' a "stream" format and a "file" format, known as Feather. `write_ipc_stream()` #' and [write_feather()] write those formats, respectively. #' -#' `write_arrow()`, a wrapper around `write_ipc_stream()` and `write_feather()` -#' with some nonstandard behavior, is deprecated. You should explicitly choose -#' the function that will write the desired IPC format (stream or file) since -#' either can be written to a file or `OutputStream`. -#' #' @inheritParams write_feather #' @param ... 
extra parameters passed to `write_feather()`. #' @@ -87,11 +82,6 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' a "stream" format and a "file" format, known as Feather. `read_ipc_stream()` #' and [read_feather()] read those formats, respectively. #' -#' `read_arrow()`, a wrapper around `read_ipc_stream()` and `read_feather()`, -#' is deprecated. You should explicitly choose -#' the function that will read the desired IPC format (stream or file) since -#' a file or `InputStream` may contain either. -#' #' @param file A character file name or URI, `raw` vector, an Arrow input stream, #' or a `FileSystem` with path (`SubTreeFileSystem`). #' If a file name or URI, an Arrow [InputStream] will be opened and diff --git a/r/R/json.R b/r/R/json.R index 19cf6a9299..2b1f4916cb 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -44,7 +44,12 @@ read_json_arrow <- function(file, schema = NULL, ...) { if (!inherits(file, "InputStream")) { + compression <- detect_compression(file) file <- make_readable_file(file) + if (compression != "uncompressed") { + # TODO: accept compression and compression_level as args + file <- CompressedInputStream$create(file, compression) + } on.exit(file$close()) } tab <- JsonTableReader$create(file, schema = schema, ...)$Read() diff --git a/r/R/parquet.R b/r/R/parquet.R index 8cd9daa857..0b3f93b20e 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -36,9 +36,17 @@ read_parquet <- function(file, col_select = NULL, as_data_frame = TRUE, + # TODO: for consistency with other readers/writers, + # these properties should be enumerated as args here, + # and ParquetArrowReaderProperties$create() should + # accept them, as with ParquetWriterProperties. + # Assembling `props` yourself is something you do with + # ParquetFileReader but not here. props = ParquetArrowReaderProperties$create(), ...) 
{ if (!inherits(file, "RandomAccessFile")) { + # Compression is handled inside the parquet file format, so we don't need + # to detect from the file extension and wrap in a CompressedInputStream file <- make_readable_file(file) on.exit(file$close()) } @@ -156,6 +164,7 @@ write_parquet <- function(x, x <- as_writable_table(x) if (!inherits(sink, "OutputStream")) { + # TODO(ARROW-17221): if (missing(compression)), we could detect_compression(sink) here sink <- make_output_stream(sink) on.exit(sink$close()) } diff --git a/r/man/make_readable_file.Rd b/r/man/make_readable_file.Rd index fe2e298261..1544815211 100644 --- a/r/man/make_readable_file.Rd +++ b/r/man/make_readable_file.Rd @@ -4,21 +4,12 @@ \alias{make_readable_file} \title{Handle a range of possible input sources} \usage{ -make_readable_file(file, mmap = TRUE, compression = NULL, filesystem = NULL) +make_readable_file(file, mmap = TRUE) } \arguments{ \item{file}{A character file name, \code{raw} vector, or an Arrow input stream} \item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})} - -\item{compression}{If the file is compressed, created a \link{CompressedInputStream} -with this compression codec, either a \link{Codec} or the string name of one. -If \code{NULL} (default) and \code{file} is a string file name, the function will try -to infer compression from the file extension.} - -\item{filesystem}{If not \code{NULL}, \code{file} will be opened via the -\code{filesystem$OpenInputFile()} filesystem method, rather than the \code{io} module's -\code{MemoryMappedFile} or \code{ReadableFile} constructors.} } \value{ An \code{InputStream} or a subclass of one. diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index 07d20b8e01..218a163b99 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -5,9 +5,9 @@ \alias{read_ipc_file} \title{Read a Feather file (an Arrow IPC file)} \usage{ -read_feather(file, col_select = NULL, as_data_frame = TRUE, ...) 
+read_feather(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) -read_ipc_file(file, col_select = NULL, as_data_frame = TRUE, ...) +read_ipc_file(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) } \arguments{ \item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, @@ -24,7 +24,7 @@ of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} -\item{...}{additional parameters, passed to \code{\link[=make_readable_file]{make_readable_file()}}.} +\item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})} } \value{ A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an diff --git a/r/man/read_ipc_stream.Rd b/r/man/read_ipc_stream.Rd index 567ee9882b..63b50e7c1b 100644 --- a/r/man/read_ipc_stream.Rd +++ b/r/man/read_ipc_stream.Rd @@ -27,12 +27,6 @@ Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/ a "stream" format and a "file" format, known as Feather. \code{read_ipc_stream()} and \code{\link[=read_feather]{read_feather()}} read those formats, respectively. } -\details{ -\code{read_arrow()}, a wrapper around \code{read_ipc_stream()} and \code{read_feather()}, -is deprecated. You should explicitly choose -the function that will read the desired IPC format (stream or file) since -a file or \code{InputStream} may contain either. -} \seealso{ \code{\link[=write_feather]{write_feather()}} for writing IPC files. \link{RecordBatchReader} for a lower-level interface. 
diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 85c83ff04b..2d8a86f969 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -10,7 +10,7 @@ write_feather( sink, version = 2, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL ) @@ -18,7 +18,7 @@ write_ipc_file( x, sink, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL ) } @@ -37,8 +37,9 @@ random row access. Default is 64K. This option is not supported for V1.} \item{compression}{Name of compression codec to use, if any. Default is "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise "uncompressed". "zstd" is the other available codec and generally has better -compression ratios in exchange for slower read and write performance -See \code{\link[=codec_is_available]{codec_is_available()}}. This option is not supported for V1.} +compression ratios in exchange for slower read and write performance. +"lz4" is shorthand for the "lz4_frame" codec. +See \code{\link[=codec_is_available]{codec_is_available()}} for details. This option is not supported for V1.} \item{compression_level}{If \code{compression} is "zstd", you may specify an integer compression level. If omitted, the compression codec's diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd index 60c3197732..094e3ad11a 100644 --- a/r/man/write_ipc_stream.Rd +++ b/r/man/write_ipc_stream.Rd @@ -22,12 +22,6 @@ Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/ a "stream" format and a "file" format, known as Feather. \code{write_ipc_stream()} and \code{\link[=write_feather]{write_feather()}} write those formats, respectively. 
} -\details{ -\code{write_arrow()}, a wrapper around \code{write_ipc_stream()} and \code{write_feather()} -with some nonstandard behavior, is deprecated. You should explicitly choose -the function that will write the desired IPC format (stream or file) since -either can be written to a file or \code{OutputStream}. -} \examples{ tf <- tempfile() on.exit(unlink(tf)) diff --git a/r/tests/testthat/test-compressed.R b/r/tests/testthat/test-compressed.R index 485e16769f..7d1c1cfd39 100644 --- a/r/tests/testthat/test-compressed.R +++ b/r/tests/testthat/test-compressed.R @@ -40,6 +40,14 @@ test_that("Codec attributes", { expect_error(cod$level) }) +test_that("Default compression_level for zstd", { + skip_if_not_available("zstd") + cod <- Codec$create("zstd") + expect_equal(cod$name, "zstd") + # TODO: implement $level + expect_error(cod$level) +}) + test_that("can write Buffer to CompressedOutputStream and read back in CompressedInputStream", { skip_if_not_available("gzip") buf <- buffer(as.raw(sample(0:255, size = 1024, replace = TRUE))) diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index d4878e6d67..cd8da2625c 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -566,8 +566,6 @@ test_that("read/write compressed file successfully", { skip_if_not_available("gzip") tfgz <- tempfile(fileext = ".csv.gz") tf <- tempfile(fileext = ".csv") - on.exit(unlink(tf)) - on.exit(unlink(tfgz)) write_csv_arrow(tbl, tf) write_csv_arrow(tbl, tfgz) @@ -577,6 +575,29 @@ test_that("read/write compressed file successfully", { read_csv_arrow(tfgz), tbl ) + skip_if_not_available("lz4") + tflz4 <- tempfile(fileext = ".csv.lz4") + write_csv_arrow(tbl, tflz4) + expect_false(file.size(tfgz) == file.size(tflz4)) + expect_identical( + read_csv_arrow(tflz4), + tbl + ) +}) + +test_that("read/write compressed filesystem path", { + skip_if_not_available("zstd") + tfzst <- tempfile(fileext = ".csv.zst") + fs <- LocalFileSystem$create()$path(tfzst) + 
write_csv_arrow(tbl, fs) + + tf <- tempfile(fileext = ".csv") + write_csv_arrow(tbl, tf) + expect_lt(file.size(tfzst), file.size(tf)) + expect_identical( + read_csv_arrow(fs), + tbl + ) }) test_that("read_csv_arrow() can read sub-second timestamps with col_types T setting (ARROW-15599)", { diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 1ef2ecf3e9..8d7a43ad06 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -207,6 +207,22 @@ test_that("read_feather requires RandomAccessFile and errors nicely otherwise (A ) }) +test_that("write_feather() does not detect compression from filename", { + # TODO(ARROW-17221): should this be supported? + without <- tempfile(fileext = ".arrow") + with_zst <- tempfile(fileext = ".arrow.zst") + write_feather(mtcars, without) + write_feather(mtcars, with_zst) + expect_equal(file.size(without), file.size(with_zst)) +}) + +test_that("read_feather() handles (ignores) compression in filename", { + df <- tibble::tibble(x = 1:5) + f <- tempfile(fileext = ".parquet.zst") + write_feather(df, f) + expect_equal(read_feather(f), df) +}) + test_that("read_feather() and write_feather() accept connection objects", { skip_if_not(CanRunWithCapturedR()) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index b75892bc84..32170534a4 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -185,6 +185,22 @@ test_that("write_parquet() defaults to snappy compression", { expect_equal(file.size(tmp1), file.size(tmp2)) }) +test_that("write_parquet() does not detect compression from filename", { + # TODO(ARROW-17221): should this be supported? 
+ without <- tempfile(fileext = ".parquet") + with_gz <- tempfile(fileext = ".parquet.gz") + write_parquet(mtcars, without) + write_parquet(mtcars, with_gz) + expect_equal(file.size(with_gz), file.size(without)) +}) + +test_that("read_parquet() handles (ignores) compression in filename", { + df <- tibble::tibble(x = 1:5) + f <- tempfile(fileext = ".parquet.gz") + write_parquet(df, f) + expect_equal(read_parquet(f), df) +}) + test_that("Factors are preserved when writing/reading from Parquet", { fct <- factor(c("a", "b"), levels = c("c", "a", "b")) ord <- factor(c("a", "b"), levels = c("c", "a", "b"), ordered = TRUE)
