This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ad115be121 GH-33287: [R] Cannot read_parquet on http URL (#34708)
ad115be121 is described below
commit ad115be1214b13ce393537bdb9c34ae919e4997f
Author: Nic Crane <[email protected]>
AuthorDate: Wed Apr 5 09:48:18 2023 +0100
GH-33287: [R] Cannot read_parquet on http URL (#34708)
Before:
``` r
library(arrow)
parquet_url <-
"https://raw.githubusercontent.com/apache/arrow/master/r/inst/v0.7.1.parquet"
read_parquet(parquet_url)
#> Error: file must be a "RandomAccessFile"
```
After:
``` r
library(arrow)
parquet_url <-
"https://raw.githubusercontent.com/apache/arrow/master/r/inst/v0.7.1.parquet"
read_parquet(parquet_url)
#> carat cut color clarity depth table price x y z
#> 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
#> 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
#> 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
#> 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
#> 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
#> 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
#> 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
#> 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
#> 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
#> 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
#> __index_level_0__
#> 1 0
#> 2 1
#> 3 2
#> 4 3
#> 5 4
#> 6 5
#> 7 6
#> 8 7
#> 9 8
#> 10 9
```
* Closes: #33287
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/csv.R | 2 +-
r/R/filesystem.R | 1 +
r/R/io.R | 11 ++++++++++-
r/R/ipc-stream.R | 2 +-
r/R/parquet.R | 1 -
r/man/make_readable_file.Rd | 4 +++-
r/tests/testthat/test-feather.R | 9 +++++++++
r/tests/testthat/test-io.R | 1 +
r/tests/testthat/test-parquet.R | 9 +++++++++
9 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/r/R/csv.R b/r/R/csv.R
index 01ef3e4c6a..8224323866 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -216,7 +216,7 @@ read_delim_arrow <- function(file,
if (!inherits(file, "InputStream")) {
compression <- detect_compression(file)
- file <- make_readable_file(file)
+ file <- make_readable_file(file, random_access = FALSE)
if (compression != "uncompressed") {
# TODO: accept compression and compression_level as args
file <- CompressedInputStream$create(file, compression)
diff --git a/r/R/filesystem.R b/r/R/filesystem.R
index 5a1e0eab60..ee5e5a62e2 100644
--- a/r/R/filesystem.R
+++ b/r/R/filesystem.R
@@ -361,6 +361,7 @@ get_path_and_filesystem <- function(x, filesystem = NULL) {
}
is_url <- function(x) is.string(x) && grepl("://", x)
+is_http_url <- function(x) is_url(x) && grepl("^http", x)
are_urls <- function(x) if (!is.character(x)) FALSE else grepl("://", x)
#' @usage NULL
diff --git a/r/R/io.R b/r/R/io.R
index fc664ed386..b2989de78a 100644
--- a/r/R/io.R
+++ b/r/R/io.R
@@ -229,9 +229,10 @@ mmap_open <- function(path, mode = c("read", "write", "readwrite")) {
#' Handle a range of possible input sources
#' @param file A character file name, `raw` vector, or an Arrow input stream
#' @param mmap Logical: whether to memory-map the file (default `TRUE`)
+#' @param random_access Logical: whether the result must be a RandomAccessFile
#' @return An `InputStream` or a subclass of one.
#' @keywords internal
-make_readable_file <- function(file, mmap = TRUE) {
+make_readable_file <- function(file, mmap = TRUE, random_access = TRUE) {
if (inherits(file, "SubTreeFileSystem")) {
filesystem <- file$base_fs
# SubTreeFileSystem adds a slash to base_path, but filesystems will reject
@@ -239,6 +240,14 @@ make_readable_file <- function(file, mmap = TRUE) {
path <- sub("/$", "", file$base_path)
file <- filesystem$OpenInputFile(path)
} else if (is.string(file)) {
+ # if this is a HTTP URL, we need a local copy to pass to FileSystem$from_uri
+ if (random_access && is_http_url(file)) {
+ tf <- tempfile()
+ download.file(file, tf, quiet = TRUE, mode = "wb")
+ file <- tf
+ on.exit(unlink(tf))
+ }
+
if (is_url(file)) {
file <- tryCatch(
{
diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R
index dd59d0f4df..f0b4a6aae0 100644
--- a/r/R/ipc-stream.R
+++ b/r/R/ipc-stream.R
@@ -98,7 +98,7 @@ write_to_raw <- function(x, format = c("stream", "file")) {
#' @export
read_ipc_stream <- function(file, as_data_frame = TRUE, ...) {
if (!inherits(file, "InputStream")) {
- file <- make_readable_file(file)
+ file <- make_readable_file(file, random_access = FALSE)
on.exit(file$close())
}
diff --git a/r/R/parquet.R b/r/R/parquet.R
index a6f8058f4a..f3d384e8c2 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -606,7 +606,6 @@ ParquetArrowReaderProperties$create <- function(use_threads = option_use_threads
calculate_chunk_size <- function(rows, columns,
target_cells_per_group = getOption("arrow.parquet_cells_per_group", 2.5e8),
max_chunks = getOption("arrow.parquet_max_chunks", 200)) {
-
# Ensure is a float to prevent integer overflow issues
num_cells <- as.numeric(rows) * as.numeric(columns)
diff --git a/r/man/make_readable_file.Rd b/r/man/make_readable_file.Rd
index 1544815211..ad9a1c3374 100644
--- a/r/man/make_readable_file.Rd
+++ b/r/man/make_readable_file.Rd
@@ -4,12 +4,14 @@
\alias{make_readable_file}
\title{Handle a range of possible input sources}
\usage{
-make_readable_file(file, mmap = TRUE)
+make_readable_file(file, mmap = TRUE, random_access = TRUE)
}
\arguments{
\item{file}{A character file name, \code{raw} vector, or an Arrow input stream}
\item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})}
+
+\item{random_access}{Logical: whether the result must be a RandomAccessFile}
}
\value{
An \code{InputStream} or a subclass of one.
diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R
index 9f42a00d85..4caadc27c4 100644
--- a/r/tests/testthat/test-feather.R
+++ b/r/tests/testthat/test-feather.R
@@ -327,3 +327,12 @@ test_that("Error is created when feather reads a parquet file", {
test_that("The read_ipc_file function is an alias of read_feather", {
expect_identical(read_ipc_file, read_feather)
})
+
+test_that("Can read Feather files from a URL", {
+ skip_if_offline()
+ skip_on_cran()
+ feather_url <- "https://github.com/apache/arrow-testing/raw/master/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_datetime.arrow_file" # nolint
+ fu <- read_feather(feather_url)
+ expect_true(tibble::is_tibble(fu))
+ expect_identical(dim(fu), c(17L, 15L))
+})
diff --git a/r/tests/testthat/test-io.R b/r/tests/testthat/test-io.R
index b4780af75d..8698250d47 100644
--- a/r/tests/testthat/test-io.R
+++ b/r/tests/testthat/test-io.R
@@ -244,3 +244,4 @@ test_that("reencoding input stream errors for invalid characters", {
unlink(temp_utf8)
})
+
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index e1e54a5139..12711521cc 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -472,3 +472,12 @@ test_that("Can read parquet with nested lists and maps", {
pq <- read_parquet(paste0(parquet_test_data, "/nested_maps.snappy.parquet"),
as_data_frame = FALSE)
expect_true(pq$a$type == map_of(utf8(), map_of(int32(), field("value", boolean(), nullable = FALSE))))
})
+
+test_that("Can read Parquet files from a URL", {
+ skip_if_offline()
+ skip_on_cran()
+ parquet_url <- "https://github.com/apache/arrow/blob/64f2cc7986ce672dd1a8cb268d193617a80a1653/r/inst/v0.7.1.parquet?raw=true" # nolint
+ pu <- read_parquet(parquet_url)
+ expect_true(tibble::is_tibble(pu))
+ expect_identical(dim(pu), c(10L, 11L))
+})