This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to branch maint-9.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 107163fec888e36a2d576d1f992f0e6f41ef7ad1 Author: Neal Richardson <[email protected]> AuthorDate: Wed Jul 27 07:02:05 2022 -0400 ARROW-16612: [R] Fix compression inference from filename (#13625) This is actually a much larger change than the original issue. * ~Infer compression from the file extension in `write_parquet()` and pass it to ParquetFileWriter rather than write to a CompressedOutputStream, and don't wrap the file in a CompressedInputStream in `read_parquet()` because that doesn't work (and isn't how compression works for Parquet). Previously, reading from a file with extension `.parquet.gz` etc. would error unless you opened an input stream yourself. This is the original report from ARROW-16612.~ Cut and moved to [ARROW-17221](http [...] * Likewise for `read_feather()` and `write_feather()`, which also support compression within the file itself and not around it. * Since the whole "detect compression and wrap in a compressed stream" feature seems limited to CSV and JSON, and in making the changes here I was having to hack around that feature, I refactored to pull it out of the internal functions `make_readable_file()` and `make_output_stream()` and do it only in the csv/json functions. * In the process of refactoring, I noticed and fixed two bugs: (1) no matter what compression extension you provided to `make_output_stream()`, you would get a gzip-compressed stream because we weren't actually passing the codec to `CompressedOutputStream$create()`; (2) `.lz4` actually needs to be mapped to the "lz4_frame" codec; attempting to write a CSV to a `CompressedOutputStream$create(codec = "lz4")` raises an error. Neither was caught because our tests for this feature only te [...] * The refactoring should also mean that ARROW-16619 (inferring compression from URL), as well as from SubTreeFileSystem (S3 buckets etc.), is also supported. 
Authored-by: Neal Richardson <[email protected]> Signed-off-by: Neal Richardson <[email protected]> --- r/R/csv.R | 40 +++++++++++--------- r/R/feather.R | 21 +++++++---- r/R/io.R | 76 ++++++++++++-------------------------- r/R/ipc-stream.R | 10 ----- r/R/json.R | 5 +++ r/R/parquet.R | 9 +++++ r/man/make_readable_file.Rd | 11 +----- r/man/read_feather.Rd | 6 +-- r/man/read_ipc_stream.Rd | 6 --- r/man/write_feather.Rd | 9 +++-- r/man/write_ipc_stream.Rd | 6 --- r/tests/testthat/test-compressed.R | 8 ++++ r/tests/testthat/test-csv.R | 25 ++++++++++++- r/tests/testthat/test-feather.R | 16 ++++++++ r/tests/testthat/test-parquet.R | 16 ++++++++ 15 files changed, 145 insertions(+), 119 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 32ed0e4bee..6adbb40219 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -188,7 +188,12 @@ read_delim_arrow <- function(file, } if (!inherits(file, "InputStream")) { + compression <- detect_compression(file) file <- make_readable_file(file) + if (compression != "uncompressed") { + # TODO: accept compression and compression_level as args + file <- CompressedInputStream$create(file, compression) + } on.exit(file$close()) } reader <- CsvTableReader$create( @@ -699,7 +704,6 @@ write_csv_arrow <- function(x, ) } - # default values are considered missing by base R if (missing(include_header) && !missing(col_names)) { include_header <- col_names } @@ -712,16 +716,27 @@ write_csv_arrow <- function(x, } x_out <- x - if (is.data.frame(x)) { - x <- Table$create(x) - } - - if (inherits(x, c("Dataset", "arrow_dplyr_query"))) { - x <- Scanner$create(x)$ToRecordBatchReader() + if (!inherits(x, "ArrowTabular")) { + tryCatch( + x <- as_record_batch_reader(x), + error = function(e) { + abort( + paste0( + "x must be an object of class 'data.frame', 'RecordBatch', ", + "'Dataset', 'Table', or 'RecordBatchReader' not '", class(x)[1], "'." 
+ ) + ) + } + ) } if (!inherits(sink, "OutputStream")) { + compression <- detect_compression(sink) sink <- make_output_stream(sink) + if (compression != "uncompressed") { + # TODO: accept compression and compression_level as args + sink <- CompressedOutputStream$create(sink, codec = compression) + } on.exit(sink$close()) } @@ -731,17 +746,6 @@ write_csv_arrow <- function(x, csv___WriteCSV__Table(x, write_options, sink) } else if (inherits(x, c("RecordBatchReader"))) { csv___WriteCSV__RecordBatchReader(x, write_options, sink) - } else { - abort( - c( - paste0( - paste( - "x must be an object of class 'data.frame', 'RecordBatch',", - "'Dataset', 'Table', or 'RecordBatchReader' not '" - ), class(x)[[1]], "'." - ) - ) - ) } invisible(x_out) diff --git a/r/R/feather.R b/r/R/feather.R index 03c8a7b5f0..4e2e9947cb 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -38,8 +38,9 @@ #' @param compression Name of compression codec to use, if any. Default is #' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise #' "uncompressed". "zstd" is the other available codec and generally has better -#' compression ratios in exchange for slower read and write performance -#' See [codec_is_available()]. This option is not supported for V1. +#' compression ratios in exchange for slower read and write performance. +#' "lz4" is shorthand for the "lz4_frame" codec. +#' See [codec_is_available()] for details. This option is not supported for V1. #' @param compression_level If `compression` is "zstd", you may #' specify an integer compression level. If omitted, the compression codec's #' default compression level is used. 
@@ -67,11 +68,13 @@ write_feather <- function(x, sink, version = 2, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL) { # Handle and validate options before touching data version <- as.integer(version) assert_that(version %in% 1:2) + + # TODO(ARROW-17221): if (missing(compression)), we could detect_compression(sink) here compression <- match.arg(compression) chunk_size <- as.integer(chunk_size) assert_that(chunk_size > 0) @@ -128,7 +131,7 @@ write_feather <- function(x, write_ipc_file <- function(x, sink, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL) { mc <- match.call() mc$version <- 2 @@ -147,7 +150,7 @@ write_ipc_file <- function(x, #' #' @inheritParams read_ipc_stream #' @inheritParams read_delim_arrow -#' @param ... additional parameters, passed to [make_readable_file()]. +#' @inheritParams make_readable_file #' #' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an #' Arrow [Table] otherwise @@ -163,9 +166,13 @@ write_ipc_file <- function(x, #' dim(df) #' # Can select columns #' df <- read_feather(tf, col_select = starts_with("d")) -read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { +read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) { if (!inherits(file, "RandomAccessFile")) { - file <- make_readable_file(file, ...) + # Compression is handled inside the IPC file format, so we don't need + # to detect from the file extension and wrap in a CompressedInputStream + # TODO: Why is this the only read_format() functions that allows passing + # mmap to make_readable_file? 
+ file <- make_readable_file(file, mmap) on.exit(file$close()) } reader <- FeatherReader$create(file) diff --git a/r/R/io.R b/r/R/io.R index 82e3847df5..fc664ed386 100644 --- a/r/R/io.R +++ b/r/R/io.R @@ -229,52 +229,31 @@ mmap_open <- function(path, mode = c("read", "write", "readwrite")) { #' Handle a range of possible input sources #' @param file A character file name, `raw` vector, or an Arrow input stream #' @param mmap Logical: whether to memory-map the file (default `TRUE`) -#' @param compression If the file is compressed, created a [CompressedInputStream] -#' with this compression codec, either a [Codec] or the string name of one. -#' If `NULL` (default) and `file` is a string file name, the function will try -#' to infer compression from the file extension. -#' @param filesystem If not `NULL`, `file` will be opened via the -#' `filesystem$OpenInputFile()` filesystem method, rather than the `io` module's -#' `MemoryMappedFile` or `ReadableFile` constructors. #' @return An `InputStream` or a subclass of one. #' @keywords internal -make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem = NULL) { +make_readable_file <- function(file, mmap = TRUE) { if (inherits(file, "SubTreeFileSystem")) { filesystem <- file$base_fs - # SubTreeFileSystem adds a slash to base_path, but filesystems will reject file names - # with trailing slashes, so we need to remove it here. - file <- sub("/$", "", file$base_path) - } - if (is.string(file)) { + # SubTreeFileSystem adds a slash to base_path, but filesystems will reject + # file names with trailing slashes, so we need to remove it here. 
+ path <- sub("/$", "", file$base_path) + file <- filesystem$OpenInputFile(path) + } else if (is.string(file)) { if (is_url(file)) { file <- tryCatch( { fs_and_path <- FileSystem$from_uri(file) - filesystem <- fs_and_path$fs - fs_and_path$path + fs_and_path$fs$OpenInputFile(fs_and_path$path) }, error = function(e) { MakeRConnectionInputStream(url(file, open = "rb")) } ) - } - - if (is.null(compression)) { - # Infer compression from the file path - compression <- detect_compression(file) - } - - if (!is.null(filesystem)) { - file <- filesystem$OpenInputFile(file) - } else if (is.string(file) && isTRUE(mmap)) { + } else if (isTRUE(mmap)) { file <- mmap_open(file) - } else if (is.string(file)) { + } else { file <- ReadableFile$create(file) } - - if (is_compressed(compression)) { - file <- CompressedInputStream$create(file, compression) - } } else if (inherits(file, c("raw", "Buffer"))) { file <- BufferReader$create(file) } else if (inherits(file, "connection")) { @@ -294,7 +273,7 @@ make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem file } -make_output_stream <- function(x, filesystem = NULL, compression = NULL) { +make_output_stream <- function(x) { if (inherits(x, "connection")) { if (!isOpen(x)) { open(x, "wb") @@ -305,45 +284,36 @@ make_output_stream <- function(x, filesystem = NULL, compression = NULL) { if (inherits(x, "SubTreeFileSystem")) { filesystem <- x$base_fs - # SubTreeFileSystem adds a slash to base_path, but filesystems will reject file names - # with trailing slashes, so we need to remove it here. - x <- sub("/$", "", x$base_path) + # SubTreeFileSystem adds a slash to base_path, but filesystems will reject + # file names with trailing slashes, so we need to remove it here. 
+ path <- sub("/$", "", x$base_path) + filesystem$OpenOutputStream(path) } else if (is_url(x)) { fs_and_path <- FileSystem$from_uri(x) - filesystem <- fs_and_path$fs - x <- fs_and_path$path - } - - if (is.null(compression)) { - # Infer compression from sink - compression <- detect_compression(x) - } - - assert_that(is.string(x)) - if (is.null(filesystem) && is_compressed(compression)) { - CompressedOutputStream$create(x) ## compressed local - } else if (is.null(filesystem) && !is_compressed(compression)) { - FileOutputStream$create(x) ## uncompressed local - } else if (!is.null(filesystem) && is_compressed(compression)) { - CompressedOutputStream$create(filesystem$OpenOutputStream(x)) ## compressed remote + fs_and_path$fs$OpenOutputStream(fs_and_path$path) } else { - filesystem$OpenOutputStream(x) ## uncompressed remote + assert_that(is.string(x)) + FileOutputStream$create(x) } } detect_compression <- function(path) { + if (inherits(path, "SubTreeFileSystem")) { + path <- path$base_path + } if (!is.string(path)) { return("uncompressed") } - # Remove any trailing slashes, which FileSystem$from_uri may add + # Remove any trailing slashes, which SubTreeFileSystem may add path <- sub("/$", "", path) switch(tools::file_ext(path), bz2 = "bz2", gz = "gzip", - lz4 = "lz4", + lz4 = "lz4_frame", zst = "zstd", + snappy = "snappy", "uncompressed" ) } diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index 9fea0f9e52..dd59d0f4df 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -23,11 +23,6 @@ #' a "stream" format and a "file" format, known as Feather. `write_ipc_stream()` #' and [write_feather()] write those formats, respectively. #' -#' `write_arrow()`, a wrapper around `write_ipc_stream()` and `write_feather()` -#' with some nonstandard behavior, is deprecated. You should explicitly choose -#' the function that will write the desired IPC format (stream or file) since -#' either can be written to a file or `OutputStream`. -#' #' @inheritParams write_feather #' @param ... 
extra parameters passed to `write_feather()`. #' @@ -87,11 +82,6 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' a "stream" format and a "file" format, known as Feather. `read_ipc_stream()` #' and [read_feather()] read those formats, respectively. #' -#' `read_arrow()`, a wrapper around `read_ipc_stream()` and `read_feather()`, -#' is deprecated. You should explicitly choose -#' the function that will read the desired IPC format (stream or file) since -#' a file or `InputStream` may contain either. -#' #' @param file A character file name or URI, `raw` vector, an Arrow input stream, #' or a `FileSystem` with path (`SubTreeFileSystem`). #' If a file name or URI, an Arrow [InputStream] will be opened and diff --git a/r/R/json.R b/r/R/json.R index 19cf6a9299..2b1f4916cb 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -44,7 +44,12 @@ read_json_arrow <- function(file, schema = NULL, ...) { if (!inherits(file, "InputStream")) { + compression <- detect_compression(file) file <- make_readable_file(file) + if (compression != "uncompressed") { + # TODO: accept compression and compression_level as args + file <- CompressedInputStream$create(file, compression) + } on.exit(file$close()) } tab <- JsonTableReader$create(file, schema = schema, ...)$Read() diff --git a/r/R/parquet.R b/r/R/parquet.R index 8cd9daa857..0b3f93b20e 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -36,9 +36,17 @@ read_parquet <- function(file, col_select = NULL, as_data_frame = TRUE, + # TODO: for consistency with other readers/writers, + # these properties should be enumerated as args here, + # and ParquetArrowReaderProperties$create() should + # accept them, as with ParquetWriterProperties. + # Assembling `props` yourself is something you do with + # ParquetFileReader but not here. props = ParquetArrowReaderProperties$create(), ...) 
{ if (!inherits(file, "RandomAccessFile")) { + # Compression is handled inside the parquet file format, so we don't need + # to detect from the file extension and wrap in a CompressedInputStream file <- make_readable_file(file) on.exit(file$close()) } @@ -156,6 +164,7 @@ write_parquet <- function(x, x <- as_writable_table(x) if (!inherits(sink, "OutputStream")) { + # TODO(ARROW-17221): if (missing(compression)), we could detect_compression(sink) here sink <- make_output_stream(sink) on.exit(sink$close()) } diff --git a/r/man/make_readable_file.Rd b/r/man/make_readable_file.Rd index fe2e298261..1544815211 100644 --- a/r/man/make_readable_file.Rd +++ b/r/man/make_readable_file.Rd @@ -4,21 +4,12 @@ \alias{make_readable_file} \title{Handle a range of possible input sources} \usage{ -make_readable_file(file, mmap = TRUE, compression = NULL, filesystem = NULL) +make_readable_file(file, mmap = TRUE) } \arguments{ \item{file}{A character file name, \code{raw} vector, or an Arrow input stream} \item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})} - -\item{compression}{If the file is compressed, created a \link{CompressedInputStream} -with this compression codec, either a \link{Codec} or the string name of one. -If \code{NULL} (default) and \code{file} is a string file name, the function will try -to infer compression from the file extension.} - -\item{filesystem}{If not \code{NULL}, \code{file} will be opened via the -\code{filesystem$OpenInputFile()} filesystem method, rather than the \code{io} module's -\code{MemoryMappedFile} or \code{ReadableFile} constructors.} } \value{ An \code{InputStream} or a subclass of one. diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index 07d20b8e01..218a163b99 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -5,9 +5,9 @@ \alias{read_ipc_file} \title{Read a Feather file (an Arrow IPC file)} \usage{ -read_feather(file, col_select = NULL, as_data_frame = TRUE, ...) 
+read_feather(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) -read_ipc_file(file, col_select = NULL, as_data_frame = TRUE, ...) +read_ipc_file(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) } \arguments{ \item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, @@ -24,7 +24,7 @@ of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} -\item{...}{additional parameters, passed to \code{\link[=make_readable_file]{make_readable_file()}}.} +\item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})} } \value{ A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an diff --git a/r/man/read_ipc_stream.Rd b/r/man/read_ipc_stream.Rd index 567ee9882b..63b50e7c1b 100644 --- a/r/man/read_ipc_stream.Rd +++ b/r/man/read_ipc_stream.Rd @@ -27,12 +27,6 @@ Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/ a "stream" format and a "file" format, known as Feather. \code{read_ipc_stream()} and \code{\link[=read_feather]{read_feather()}} read those formats, respectively. } -\details{ -\code{read_arrow()}, a wrapper around \code{read_ipc_stream()} and \code{read_feather()}, -is deprecated. You should explicitly choose -the function that will read the desired IPC format (stream or file) since -a file or \code{InputStream} may contain either. -} \seealso{ \code{\link[=write_feather]{write_feather()}} for writing IPC files. \link{RecordBatchReader} for a lower-level interface. 
diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 85c83ff04b..2d8a86f969 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -10,7 +10,7 @@ write_feather( sink, version = 2, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL ) @@ -18,7 +18,7 @@ write_ipc_file( x, sink, chunk_size = 65536L, - compression = c("default", "lz4", "uncompressed", "zstd"), + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL ) } @@ -37,8 +37,9 @@ random row access. Default is 64K. This option is not supported for V1.} \item{compression}{Name of compression codec to use, if any. Default is "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise "uncompressed". "zstd" is the other available codec and generally has better -compression ratios in exchange for slower read and write performance -See \code{\link[=codec_is_available]{codec_is_available()}}. This option is not supported for V1.} +compression ratios in exchange for slower read and write performance. +"lz4" is shorthand for the "lz4_frame" codec. +See \code{\link[=codec_is_available]{codec_is_available()}} for details. This option is not supported for V1.} \item{compression_level}{If \code{compression} is "zstd", you may specify an integer compression level. If omitted, the compression codec's diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd index 60c3197732..094e3ad11a 100644 --- a/r/man/write_ipc_stream.Rd +++ b/r/man/write_ipc_stream.Rd @@ -22,12 +22,6 @@ Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/ a "stream" format and a "file" format, known as Feather. \code{write_ipc_stream()} and \code{\link[=write_feather]{write_feather()}} write those formats, respectively. 
} -\details{ -\code{write_arrow()}, a wrapper around \code{write_ipc_stream()} and \code{write_feather()} -with some nonstandard behavior, is deprecated. You should explicitly choose -the function that will write the desired IPC format (stream or file) since -either can be written to a file or \code{OutputStream}. -} \examples{ tf <- tempfile() on.exit(unlink(tf)) diff --git a/r/tests/testthat/test-compressed.R b/r/tests/testthat/test-compressed.R index 485e16769f..7d1c1cfd39 100644 --- a/r/tests/testthat/test-compressed.R +++ b/r/tests/testthat/test-compressed.R @@ -40,6 +40,14 @@ test_that("Codec attributes", { expect_error(cod$level) }) +test_that("Default compression_level for zstd", { + skip_if_not_available("zstd") + cod <- Codec$create("zstd") + expect_equal(cod$name, "zstd") + # TODO: implement $level + expect_error(cod$level) +}) + test_that("can write Buffer to CompressedOutputStream and read back in CompressedInputStream", { skip_if_not_available("gzip") buf <- buffer(as.raw(sample(0:255, size = 1024, replace = TRUE))) diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index d4878e6d67..cd8da2625c 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -566,8 +566,6 @@ test_that("read/write compressed file successfully", { skip_if_not_available("gzip") tfgz <- tempfile(fileext = ".csv.gz") tf <- tempfile(fileext = ".csv") - on.exit(unlink(tf)) - on.exit(unlink(tfgz)) write_csv_arrow(tbl, tf) write_csv_arrow(tbl, tfgz) @@ -577,6 +575,29 @@ test_that("read/write compressed file successfully", { read_csv_arrow(tfgz), tbl ) + skip_if_not_available("lz4") + tflz4 <- tempfile(fileext = ".csv.lz4") + write_csv_arrow(tbl, tflz4) + expect_false(file.size(tfgz) == file.size(tflz4)) + expect_identical( + read_csv_arrow(tflz4), + tbl + ) +}) + +test_that("read/write compressed filesystem path", { + skip_if_not_available("zstd") + tfzst <- tempfile(fileext = ".csv.zst") + fs <- LocalFileSystem$create()$path(tfzst) + 
write_csv_arrow(tbl, fs) + + tf <- tempfile(fileext = ".csv") + write_csv_arrow(tbl, tf) + expect_lt(file.size(tfzst), file.size(tf)) + expect_identical( + read_csv_arrow(fs), + tbl + ) }) test_that("read_csv_arrow() can read sub-second timestamps with col_types T setting (ARROW-15599)", { diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 1ef2ecf3e9..8d7a43ad06 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -207,6 +207,22 @@ test_that("read_feather requires RandomAccessFile and errors nicely otherwise (A ) }) +test_that("write_feather() does not detect compression from filename", { + # TODO(ARROW-17221): should this be supported? + without <- tempfile(fileext = ".arrow") + with_zst <- tempfile(fileext = ".arrow.zst") + write_feather(mtcars, without) + write_feather(mtcars, with_zst) + expect_equal(file.size(without), file.size(with_zst)) +}) + +test_that("read_feather() handles (ignores) compression in filename", { + df <- tibble::tibble(x = 1:5) + f <- tempfile(fileext = ".parquet.zst") + write_feather(df, f) + expect_equal(read_feather(f), df) +}) + test_that("read_feather() and write_feather() accept connection objects", { skip_if_not(CanRunWithCapturedR()) diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R index b75892bc84..32170534a4 100644 --- a/r/tests/testthat/test-parquet.R +++ b/r/tests/testthat/test-parquet.R @@ -185,6 +185,22 @@ test_that("write_parquet() defaults to snappy compression", { expect_equal(file.size(tmp1), file.size(tmp2)) }) +test_that("write_parquet() does not detect compression from filename", { + # TODO(ARROW-17221): should this be supported? 
+ without <- tempfile(fileext = ".parquet") + with_gz <- tempfile(fileext = ".parquet.gz") + write_parquet(mtcars, without) + write_parquet(mtcars, with_gz) + expect_equal(file.size(with_gz), file.size(without)) +}) + +test_that("read_parquet() handles (ignores) compression in filename", { + df <- tibble::tibble(x = 1:5) + f <- tempfile(fileext = ".parquet.gz") + write_parquet(df, f) + expect_equal(read_parquet(f), df) +}) + test_that("Factors are preserved when writing/reading from Parquet", { fct <- factor(c("a", "b"), levels = c("c", "a", "b")) ord <- factor(c("a", "b"), levels = c("c", "a", "b"), ordered = TRUE)
