nealrichardson commented on code in PR #13183:
URL: https://github.com/apache/arrow/pull/13183#discussion_r875991641
##########
r/R/io.R:
##########
@@ -322,6 +333,10 @@ detect_compression <- function(path) {
return("uncompressed")
}
+ ## Remove any trailing slashes
Review Comment:
```suggestion
# Remove any trailing slashes, which FileSystem$from_uri may add
```
##########
r/R/io.R:
##########
@@ -309,11 +310,21 @@ make_output_stream <- function(x, filesystem = NULL) {
filesystem <- fs_and_path$fs
x <- fs_and_path$path
}
+
+ if (is.null(compression)) {
+ # Infer compression from sink
+ compression <- detect_compression(x)
+ }
+
assert_that(is.string(x))
- if (is.null(filesystem)) {
- FileOutputStream$create(x)
+ if (is.null(filesystem) && is_compressed(compression)) {
+ CompressedOutputStream$create(x) ##compressed local
+ } else if (is.null(filesystem) && !is_compressed(compression)) {
+ FileOutputStream$create(x) ## uncompressed local
+ } else if (!is.null(filesystem) && is_compressed(compression)) {
+ CompressedOutputStream$create(filesystem$OpenOutputStream(x)) ##
compressed s3
} else {
- filesystem$OpenOutputStream(x)
+ filesystem$OpenOutputStream(x) ## uncompressed s3
Review Comment:
Not only S3, any remote file system (GCS, Azure are coming soon, for example)
##########
r/tests/testthat/test-csv.R:
##########
@@ -564,6 +564,29 @@ test_that("write_csv_arrow can write from
RecordBatchReader objects", {
expect_equal(nrow(tbl_in), 3)
})
+test_that("write_csv_arrow() compresses by file extension", {
+ skip_if_not_available("gzip")
+ tfgz <- tempfile(fileext = ".csv.gz")
+ tf <- tempfile(fileext = ".csv")
+ on.exit(unlink(tf))
+ on.exit(unlink(tfgz))
+
+ write_csv_arrow(tbl, tf)
+ write_csv_arrow(tbl, tfgz)
+ expect_lt(file.size(tfgz), file.size(tf))
+})
+
+test_that("read/write compressed file", {
Review Comment:
Why is this a separate test? You could read in the file(s) in the previous
test
##########
r/tests/testthat/test-s3-minio.R:
##########
@@ -54,6 +54,24 @@ if (arrow_with_s3() && process_is_running("minio server")) {
)
})
+ test_that("read/write compressed csv by filesystem", {
+ dat <- tibble(x = seq(1, 10, by = 0.2))
Review Comment:
Does this need to be skipped if gzip is not present or have we already
skipped by this point?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]