This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new ad115be121 GH-33287: [R] Cannot read_parquet on http URL (#34708)
ad115be121 is described below

commit ad115be1214b13ce393537bdb9c34ae919e4997f
Author: Nic Crane <[email protected]>
AuthorDate: Wed Apr 5 09:48:18 2023 +0100

    GH-33287: [R] Cannot read_parquet on http URL (#34708)
    
    Before:
    
    ``` r
    library(arrow)
    parquet_url <- "https://raw.githubusercontent.com/apache/arrow/master/r/inst/v0.7.1.parquet"
    read_parquet(parquet_url)
    #> Error: file must be a "RandomAccessFile"
    ```
    
    After:
    
    ``` r
    library(arrow)
    parquet_url <- "https://raw.githubusercontent.com/apache/arrow/master/r/inst/v0.7.1.parquet"
    read_parquet(parquet_url)
    #>    carat       cut color clarity depth table price    x    y    z
    #> 1   0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
    #> 2   0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
    #> 3   0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
    #> 4   0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
    #> 5   0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
    #> 6   0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
    #> 7   0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47
    #> 8   0.26 Very Good     H     SI1  61.9    55   337 4.07 4.11 2.53
    #> 9   0.22      Fair     E     VS2  65.1    61   337 3.87 3.78 2.49
    #> 10  0.23 Very Good     H     VS1  59.4    61   338 4.00 4.05 2.39
    #>    __index_level_0__
    #> 1                  0
    #> 2                  1
    #> 3                  2
    #> 4                  3
    #> 5                  4
    #> 6                  5
    #> 7                  6
    #> 8                  7
    #> 9                  8
    #> 10                 9
    ```
    * Closes: #33287
    
    Authored-by: Nic Crane <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 r/R/csv.R                       |  2 +-
 r/R/filesystem.R                |  1 +
 r/R/io.R                        | 11 ++++++++++-
 r/R/ipc-stream.R                |  2 +-
 r/R/parquet.R                   |  1 -
 r/man/make_readable_file.Rd     |  4 +++-
 r/tests/testthat/test-feather.R |  9 +++++++++
 r/tests/testthat/test-io.R      |  1 +
 r/tests/testthat/test-parquet.R |  9 +++++++++
 9 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/r/R/csv.R b/r/R/csv.R
index 01ef3e4c6a..8224323866 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -216,7 +216,7 @@ read_delim_arrow <- function(file,
 
   if (!inherits(file, "InputStream")) {
     compression <- detect_compression(file)
-    file <- make_readable_file(file)
+    file <- make_readable_file(file, random_access = FALSE)
     if (compression != "uncompressed") {
       # TODO: accept compression and compression_level as args
       file <- CompressedInputStream$create(file, compression)
diff --git a/r/R/filesystem.R b/r/R/filesystem.R
index 5a1e0eab60..ee5e5a62e2 100644
--- a/r/R/filesystem.R
+++ b/r/R/filesystem.R
@@ -361,6 +361,7 @@ get_path_and_filesystem <- function(x, filesystem = NULL) {
 }
 
 is_url <- function(x) is.string(x) && grepl("://", x)
+is_http_url <- function(x) is_url(x) && grepl("^http", x)
 are_urls <- function(x) if (!is.character(x)) FALSE else grepl("://", x)
 
 #' @usage NULL
diff --git a/r/R/io.R b/r/R/io.R
index fc664ed386..b2989de78a 100644
--- a/r/R/io.R
+++ b/r/R/io.R
@@ -229,9 +229,10 @@ mmap_open <- function(path, mode = c("read", "write", "readwrite")) {
 #' Handle a range of possible input sources
 #' @param file A character file name, `raw` vector, or an Arrow input stream
 #' @param mmap Logical: whether to memory-map the file (default `TRUE`)
+#' @param random_access Logical: whether the result must be a RandomAccessFile
 #' @return An `InputStream` or a subclass of one.
 #' @keywords internal
-make_readable_file <- function(file, mmap = TRUE) {
+make_readable_file <- function(file, mmap = TRUE, random_access = TRUE) {
   if (inherits(file, "SubTreeFileSystem")) {
     filesystem <- file$base_fs
     # SubTreeFileSystem adds a slash to base_path, but filesystems will reject
@@ -239,6 +240,14 @@ make_readable_file <- function(file, mmap = TRUE) {
     path <- sub("/$", "", file$base_path)
     file <- filesystem$OpenInputFile(path)
   } else if (is.string(file)) {
+    # if this is a HTTP URL, we need a local copy to pass to FileSystem$from_uri
+    if (random_access && is_http_url(file)) {
+      tf <- tempfile()
+      download.file(file, tf, quiet = TRUE, mode = "wb")
+      file <- tf
+      on.exit(unlink(tf))
+    }
+
     if (is_url(file)) {
       file <- tryCatch(
         {
diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R
index dd59d0f4df..f0b4a6aae0 100644
--- a/r/R/ipc-stream.R
+++ b/r/R/ipc-stream.R
@@ -98,7 +98,7 @@ write_to_raw <- function(x, format = c("stream", "file")) {
 #' @export
 read_ipc_stream <- function(file, as_data_frame = TRUE, ...) {
   if (!inherits(file, "InputStream")) {
-    file <- make_readable_file(file)
+    file <- make_readable_file(file, random_access = FALSE)
     on.exit(file$close())
   }
 
diff --git a/r/R/parquet.R b/r/R/parquet.R
index a6f8058f4a..f3d384e8c2 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -606,7 +606,6 @@ ParquetArrowReaderProperties$create <- function(use_threads = option_use_threads
 calculate_chunk_size <- function(rows, columns,
                                  target_cells_per_group = getOption("arrow.parquet_cells_per_group", 2.5e8),
                                  max_chunks = getOption("arrow.parquet_max_chunks", 200)) {
-
   # Ensure is a float to prevent integer overflow issues
   num_cells <- as.numeric(rows) * as.numeric(columns)
 
diff --git a/r/man/make_readable_file.Rd b/r/man/make_readable_file.Rd
index 1544815211..ad9a1c3374 100644
--- a/r/man/make_readable_file.Rd
+++ b/r/man/make_readable_file.Rd
@@ -4,12 +4,14 @@
 \alias{make_readable_file}
 \title{Handle a range of possible input sources}
 \usage{
-make_readable_file(file, mmap = TRUE)
+make_readable_file(file, mmap = TRUE, random_access = TRUE)
 }
 \arguments{
 \item{file}{A character file name, \code{raw} vector, or an Arrow input stream}
 
 \item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})}
+
+\item{random_access}{Logical: whether the result must be a RandomAccessFile}
 }
 \value{
 An \code{InputStream} or a subclass of one.
diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R
index 9f42a00d85..4caadc27c4 100644
--- a/r/tests/testthat/test-feather.R
+++ b/r/tests/testthat/test-feather.R
@@ -327,3 +327,12 @@ test_that("Error is created when feather reads a parquet file", {
 test_that("The read_ipc_file function is an alias of read_feather", {
   expect_identical(read_ipc_file, read_feather)
 })
+
+test_that("Can read Feather files from a URL", {
+  skip_if_offline()
+  skip_on_cran()
+  feather_url <- "https://github.com/apache/arrow-testing/raw/master/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_datetime.arrow_file" # nolint
+  fu <- read_feather(feather_url)
+  expect_true(tibble::is_tibble(fu))
+  expect_identical(dim(fu), c(17L, 15L))
+})
diff --git a/r/tests/testthat/test-io.R b/r/tests/testthat/test-io.R
index b4780af75d..8698250d47 100644
--- a/r/tests/testthat/test-io.R
+++ b/r/tests/testthat/test-io.R
@@ -244,3 +244,4 @@ test_that("reencoding input stream errors for invalid characters", {
 
   unlink(temp_utf8)
 })
+
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index e1e54a5139..12711521cc 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -472,3 +472,12 @@ test_that("Can read parquet with nested lists and maps", {
   pq <- read_parquet(paste0(parquet_test_data, "/nested_maps.snappy.parquet"), as_data_frame = FALSE)
   expect_true(pq$a$type == map_of(utf8(), map_of(int32(), field("value", boolean(), nullable = FALSE))))
 })
+
+test_that("Can read Parquet files from a URL", {
+  skip_if_offline()
+  skip_on_cran()
+  parquet_url <- "https://github.com/apache/arrow/blob/64f2cc7986ce672dd1a8cb268d193617a80a1653/r/inst/v0.7.1.parquet?raw=true" # nolint
+  pu <- read_parquet(parquet_url)
+  expect_true(tibble::is_tibble(pu))
+  expect_identical(dim(pu), c(10L, 11L))
+})

Reply via email to