This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c72f84a48b ARROW-15906: [C++][Python][R] By default, don't create or
delete S3 buckets (#13206)
c72f84a48b is described below
commit c72f84a48b4952796ab78a0c33b84a9fc8f893db
Author: Will Jones <[email protected]>
AuthorDate: Wed Jun 15 12:05:44 2022 +0200
ARROW-15906: [C++][Python][R] By default, don't create or delete S3 buckets
(#13206)
BREAKING CHANGE: modifies `S3FileSystem` to not allow creating or deleting
buckets by default. Two new options are added: `allow_bucket_creation`, which
enables creating buckets, and `allow_bucket_deletion`, which enables deleting
buckets.
Outside of tests, most use cases will not want to create or delete buckets,
and doing so accidentally is not desirable either. Buckets have governance
controls like permissions and cost-tracking tags.
To make it easy for users to transition, the `FromUri` method also supports
these arguments:
```python
from pyarrow.fs import FileSystem
uri =
"s3://minioadmin:minioadmin@?scheme=http&endpoint_override=localhost%3A9000"
fs, path = FileSystem.from_uri(uri)
fs.create_dir("test")
# Traceback (most recent call last):
# File "<stdin>", line 1, in <module>
# File "pyarrow/_fs.pyx", line 463, in pyarrow._fs.FileSystem.create_dir
# check_status(self.fs.CreateDir(directory, recursive=recursive))
# File "pyarrow/error.pxi", line 115, in pyarrow.lib.check_status
# raise IOError(message)
# OSError: Bucket 'test' not found. To create buckets, enable the
allow_bucket_creation option.
uri = uri + "&allow_bucket_creation=True&allow_bucket_deletion=True"
fs, path = FileSystem.from_uri(uri)
fs.create_dir("test")
fs.delete_dir("test")
```
```r
fs <-
FileSystem$from_uri("s3://minioadmin:minioadmin@?scheme=http&endpoint_override=localhost%3A9000")$fs
fs$CreateDir("test")
#> Error: IOError: Bucket 'test' not found. To create buckets, enable the
allow_bucket_creation option.
fs <- FileSystem$from_uri(
paste0("s3://minioadmin:minioadmin@?scheme=http&endpoint_override=localhost%3A9000",
"&allow_bucket_creation=TRUE&allow_bucket_deletion=TRUE")
)$fs
fs$CreateDir("test")
fs$DeleteDir("test")
```
Authored-by: Will Jones <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/filesystem/s3fs.cc | 35 ++++++++++++++++++++++++++++-
cpp/src/arrow/filesystem/s3fs.h | 11 ++++++++++
cpp/src/arrow/filesystem/s3fs_test.cc | 22 +++++++++++++++++++
cpp/src/arrow/util/string.cc | 10 +++++++++
cpp/src/arrow/util/string.h | 8 +++++++
python/pyarrow/_s3fs.pyx | 22 +++++++++++++++++--
python/pyarrow/includes/libarrow_fs.pxd | 2 ++
python/pyarrow/tests/test_dataset.py | 39 +++++++++++++++++++++++++++++----
python/pyarrow/tests/test_fs.py | 19 +++++++++++++---
r/R/arrowExports.R | 4 ++--
r/R/filesystem.R | 19 ++++++++++++++--
r/man/FileSystem.Rd | 15 +++++++++++++
r/src/arrowExports.cpp | 12 +++++-----
r/src/filesystem.cpp | 6 ++++-
r/tests/testthat/test-s3-minio.R | 20 ++++++++++++++++-
15 files changed, 223 insertions(+), 21 deletions(-)
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 4336b78330..70b66938fe 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -91,6 +91,7 @@
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
#include "arrow/util/task_group.h"
#include "arrow/util/thread_pool.h"
@@ -353,6 +354,12 @@ Result<S3Options> S3Options::FromUri(const Uri& uri,
std::string* out_path) {
options.scheme = kv.second;
} else if (kv.first == "endpoint_override") {
options.endpoint_override = kv.second;
+ } else if (kv.first == "allow_bucket_creation") {
+ ARROW_ASSIGN_OR_RAISE(options.allow_bucket_creation,
+ ::arrow::internal::ParseBoolean(kv.second));
+ } else if (kv.first == "allow_bucket_deletion") {
+ ARROW_ASSIGN_OR_RAISE(options.allow_bucket_deletion,
+ ::arrow::internal::ParseBoolean(kv.second));
} else {
return Status::Invalid("Unexpected query parameter in S3 URI: '",
kv.first, "'");
}
@@ -376,6 +383,8 @@ Result<S3Options> S3Options::FromUri(const std::string&
uri_string,
bool S3Options::Equals(const S3Options& other) const {
return (region == other.region && endpoint_override ==
other.endpoint_override &&
scheme == other.scheme && background_writes ==
other.background_writes &&
+ allow_bucket_creation == other.allow_bucket_creation &&
+ allow_bucket_deletion == other.allow_bucket_deletion &&
credentials_kind == other.credentials_kind &&
proxy_options.Equals(other.proxy_options) &&
GetAccessKey() == other.GetAccessKey() &&
@@ -1677,6 +1686,27 @@ class S3FileSystem::Impl : public
std::enable_shared_from_this<S3FileSystem::Imp
// Create a bucket. Successful if bucket already exists.
Status CreateBucket(const std::string& bucket) {
+ // Check bucket exists first.
+ {
+ S3Model::HeadBucketRequest req;
+ req.SetBucket(ToAwsString(bucket));
+ auto outcome = client_->HeadBucket(req);
+
+ if (outcome.IsSuccess()) {
+ return Status::OK();
+ } else if (!IsNotFound(outcome.GetError())) {
+ return ErrorToStatus(
+ std::forward_as_tuple("When creating bucket '", bucket, "': "),
+ outcome.GetError());
+ }
+
+ if (!options().allow_bucket_creation) {
+ return Status::IOError(
+ "Bucket '", bucket, "' not found. ",
+ "To create buckets, enable the allow_bucket_creation option.");
+ }
+ }
+
S3Model::CreateBucketConfiguration config;
S3Model::CreateBucketRequest req;
auto _region = region();
@@ -2373,13 +2403,16 @@ Status S3FileSystem::DeleteDir(const std::string& s) {
return Status::NotImplemented("Cannot delete all S3 buckets");
}
RETURN_NOT_OK(impl_->DeleteDirContentsAsync(path.bucket, path.key).status());
- if (path.key.empty()) {
+ if (path.key.empty() && options().allow_bucket_deletion) {
// Delete bucket
S3Model::DeleteBucketRequest req;
req.SetBucket(ToAwsString(path.bucket));
return OutcomeToStatus(
std::forward_as_tuple("When deleting bucket '", path.bucket, "': "),
impl_->client_->DeleteBucket(req));
+ } else if (path.key.empty()) {
+ return Status::IOError("Would delete bucket '", path.bucket, "'. ",
+ "To delete buckets, enable the
allow_bucket_deletion option.");
} else {
// Delete "directory"
RETURN_NOT_OK(impl_->DeleteObject(path.bucket, path.key + kSep));
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index c2c99e964e..fa1fdb9671 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -130,6 +130,17 @@ struct ARROW_EXPORT S3Options {
/// Whether OutputStream writes will be issued in the background, without
blocking.
bool background_writes = true;
+ /// Whether to allow creation of buckets
+ ///
+ /// When S3FileSystem creates new buckets, it does not pass any non-default
settings.
+ /// In AWS S3, the bucket and all objects will be not publicly visible, and
there
+ /// will be no bucket policies and no resource tags. To have more control
over how
+ /// buckets are created, use a different API to create them.
+ bool allow_bucket_creation = false;
+
+ /// Whether to allow deletion of buckets
+ bool allow_bucket_deletion = false;
+
/// \brief Default metadata for OpenOutputStream.
///
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc
b/cpp/src/arrow/filesystem/s3fs_test.cc
index b44bc7f50c..7216af297a 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -405,6 +405,9 @@ class TestS3FS : public S3TestMixin {
public:
void SetUp() override {
S3TestMixin::SetUp();
+ // Most tests will create buckets
+ options_.allow_bucket_creation = true;
+ options_.allow_bucket_deletion = true;
MakeFileSystem();
// Set up test bucket
{
@@ -1124,6 +1127,25 @@ TEST_F(TestS3FS, FileSystemFromUri) {
AssertFileInfo(fs.get(), path, FileType::File, 8);
}
+TEST_F(TestS3FS, NoCreateDeleteBucket) {
+ // Create a bucket to try deleting
+ ASSERT_OK(fs_->CreateDir("test-no-delete"));
+
+ options_.allow_bucket_creation = false;
+ options_.allow_bucket_deletion = false;
+ MakeFileSystem();
+
+ auto maybe_create_dir = fs_->CreateDir("test-no-create");
+ ASSERT_RAISES(IOError, maybe_create_dir);
+ ASSERT_THAT(maybe_create_dir.message(),
+ ::testing::HasSubstr("Bucket 'test-no-create' not found"));
+
+ auto maybe_delete_dir = fs_->DeleteDir("test-no-delete");
+ ASSERT_RAISES(IOError, maybe_delete_dir);
+ ASSERT_THAT(maybe_delete_dir.message(),
+ ::testing::HasSubstr("Would delete bucket 'test-no-delete'"));
+}
+
// Simple retry strategy that records errors encountered and its emitted retry
delays
class TestRetryStrategy : public S3RetryStrategy {
public:
diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc
index d922311df1..3a15860055 100644
--- a/cpp/src/arrow/util/string.cc
+++ b/cpp/src/arrow/util/string.cc
@@ -187,5 +187,15 @@ util::optional<std::string> Replace(util::string_view s,
util::string_view token
s.substr(token_start + token.size()).to_string();
}
+Result<bool> ParseBoolean(util::string_view value) {
+ if (AsciiEqualsCaseInsensitive(value, "true") || value == "1") {
+ return true;
+ } else if (AsciiEqualsCaseInsensitive(value, "false") || value == "0") {
+ return false;
+ } else {
+ return Status::Invalid("String is not a valid boolean value: '", value,
"'.");
+ }
+}
+
} // namespace internal
} // namespace arrow
diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h
index 68b8a54e31..d2c8ac38ee 100644
--- a/cpp/src/arrow/util/string.h
+++ b/cpp/src/arrow/util/string.h
@@ -20,6 +20,7 @@
#include <string>
#include <vector>
+#include "arrow/result.h"
#include "arrow/util/optional.h"
#include "arrow/util/string_view.h"
#include "arrow/util/visibility.h"
@@ -75,5 +76,12 @@ ARROW_EXPORT
util::optional<std::string> Replace(util::string_view s, util::string_view
token,
util::string_view replacement);
+/// \brief Get boolean value from string
+///
+/// If "1", "true" (case-insensitive), returns true
+/// If "0", "false" (case-insensitive), returns false
+/// Otherwise, returns Status::Invalid
+ARROW_EXPORT
+arrow::Result<bool> ParseBoolean(util::string_view value);
} // namespace internal
} // namespace arrow
diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx
index 4d919c8b9d..c71acc4784 100644
--- a/python/pyarrow/_s3fs.pyx
+++ b/python/pyarrow/_s3fs.pyx
@@ -99,6 +99,12 @@ cdef class S3FileSystem(FileSystem):
Note: S3 buckets are special and the operations available on them may be
limited or more expensive than desired.
+ When S3FileSystem creates new buckets (assuming allow_bucket_creation is
+ True), it does not pass any non-default settings. In AWS S3, the bucket
and
+ all objects will be not publicly visible, and will have no bucket policies
+ and no resource tags. To have more control over how buckets are created,
+ use a different API to create them.
+
Parameters
----------
access_key : str, default None
@@ -150,6 +156,12 @@ cdef class S3FileSystem(FileSystem):
S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost',
'port': 8020, 'username': 'username',
'password': 'password'})
+ allow_bucket_creation : bool, default False
+ Whether to allow CreateDir at the bucket-level. This option may also
be
+ passed in a URI query parameter.
+ allow_bucket_deletion : bool, default False
+ Whether to allow DeleteDir at the bucket-level. This option may also
be
+ passed in a URI query parameter.
"""
cdef:
@@ -159,7 +171,8 @@ cdef class S3FileSystem(FileSystem):
bint anonymous=False, region=None, scheme=None,
endpoint_override=None, bint background_writes=True,
default_metadata=None, role_arn=None, session_name=None,
- external_id=None, load_frequency=900, proxy_options=None):
+ external_id=None, load_frequency=900, proxy_options=None,
+ allow_bucket_creation=False, allow_bucket_deletion=False):
cdef:
CS3Options options
shared_ptr[CS3FileSystem] wrapped
@@ -253,6 +266,9 @@ cdef class S3FileSystem(FileSystem):
"'proxy_options': expected 'dict' or 'str', "
f"got {type(proxy_options)} instead.")
+ options.allow_bucket_creation = allow_bucket_creation
+ options.allow_bucket_deletion = allow_bucket_deletion
+
with nogil:
wrapped = GetResultValue(CS3FileSystem.Make(options))
@@ -295,6 +311,8 @@ cdef class S3FileSystem(FileSystem):
external_id=frombytes(opts.external_id),
load_frequency=opts.load_frequency,
background_writes=opts.background_writes,
+ allow_bucket_creation=opts.allow_bucket_creation,
+ allow_bucket_deletion=opts.allow_bucket_deletion,
default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
proxy_options={'scheme': frombytes(opts.proxy_options.scheme),
'host': frombytes(opts.proxy_options.host),
@@ -302,7 +320,7 @@ cdef class S3FileSystem(FileSystem):
'username': frombytes(
opts.proxy_options.username),
'password': frombytes(
- opts.proxy_options.password)}
+ opts.proxy_options.password)},
),)
)
diff --git a/python/pyarrow/includes/libarrow_fs.pxd
b/python/pyarrow/includes/libarrow_fs.pxd
index d0e1501003..d47b462aa5 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -155,6 +155,8 @@ cdef extern from "arrow/filesystem/api.h" namespace
"arrow::fs" nogil:
c_string endpoint_override
c_string scheme
c_bool background_writes
+ c_bool allow_bucket_creation
+ c_bool allow_bucket_deletion
shared_ptr[const CKeyValueMetadata] default_metadata
c_string role_arn
c_string session_name
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index ad98c83e66..b35d8f3178 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -2506,6 +2506,7 @@ def s3_example_simple(s3_server):
host, port, access_key, secret_key = s3_server['connection']
uri = (
"s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
+ "&allow_bucket_creation=True"
.format(access_key, secret_key, host, port)
)
@@ -2568,9 +2569,10 @@ def
test_open_dataset_from_s3_with_filesystem_uri(s3_server):
host, port, access_key, secret_key = s3_server['connection']
bucket = 'theirbucket'
path = 'nested/folder/data.parquet'
- uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
- access_key, secret_key, bucket, path, host, port
- )
+ uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}"\
+ "&allow_bucket_creation=true".format(
+ access_key, secret_key, bucket, path, host, port
+ )
fs, path = FileSystem.from_uri(uri)
assert path == 'theirbucket/nested/folder/data.parquet'
@@ -4544,9 +4546,38 @@ def test_write_dataset_s3_put_only(s3_server):
).to_table()
assert result.equals(table)
+ # Passing create_dir is fine if the bucket already exists
+ ds.write_dataset(
+ table, "existing-bucket", filesystem=fs,
+ format="feather", create_dir=True, partitioning=part,
+ existing_data_behavior='overwrite_or_ignore'
+ )
+ # check roundtrip
+ result = ds.dataset(
+ "existing-bucket", filesystem=fs, format="ipc", partitioning="hive"
+ ).to_table()
+ assert result.equals(table)
+
+ # Error enforced by filesystem
+ with pytest.raises(OSError,
+ match="Bucket 'non-existing-bucket' not found"):
+ ds.write_dataset(
+ table, "non-existing-bucket", filesystem=fs,
+ format="feather", create_dir=True,
+ existing_data_behavior='overwrite_or_ignore'
+ )
+
+ # Error enforced by minio / S3 service
+ fs = S3FileSystem(
+ access_key='limited',
+ secret_key='limited123',
+ endpoint_override='{}:{}'.format(host, port),
+ scheme='http',
+ allow_bucket_creation=True,
+ )
with pytest.raises(OSError, match="Access Denied"):
ds.write_dataset(
- table, "existing-bucket", filesystem=fs,
+ table, "non-existing-bucket", filesystem=fs,
format="feather", create_dir=True,
existing_data_behavior='overwrite_or_ignore'
)
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 4bd532525a..1e43579a2b 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -240,7 +240,9 @@ def s3fs(request, s3_server):
access_key=access_key,
secret_key=secret_key,
endpoint_override='{}:{}'.format(host, port),
- scheme='http'
+ scheme='http',
+ allow_bucket_creation=True,
+ allow_bucket_deletion=True
)
fs.create_dir(bucket)
@@ -476,6 +478,12 @@ def test_s3fs_limited_permissions_create_bucket(s3_server):
)
fs.create_dir('existing-bucket/test')
+ with pytest.raises(pa.ArrowIOError, match="Bucket 'new-bucket' not found"):
+ fs.create_dir('new-bucket')
+
+ with pytest.raises(pa.ArrowIOError, match="Would delete bucket"):
+ fs.delete_dir('existing-bucket')
+
def test_file_info_constructor():
dt = datetime.fromtimestamp(1568799826, timezone.utc)
@@ -1109,6 +1117,10 @@ def test_s3_options():
assert isinstance(fs, S3FileSystem)
assert pickle.loads(pickle.dumps(fs)) == fs
+ fs = S3FileSystem(allow_bucket_creation=True, allow_bucket_deletion=True)
+ assert isinstance(fs, S3FileSystem)
+ assert pickle.loads(pickle.dumps(fs)) == fs
+
with pytest.raises(ValueError):
S3FileSystem(access_key='access')
with pytest.raises(ValueError):
@@ -1381,8 +1393,9 @@ def test_filesystem_from_uri_s3(s3_server):
host, port, access_key, secret_key = s3_server['connection']
- uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}" \
- .format(access_key, secret_key, host, port)
+ uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}"\
+ "&allow_bucket_creation=True" \
+ .format(access_key, secret_key, host, port)
fs, path = FileSystem.from_uri(uri)
assert isinstance(fs, S3FileSystem)
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 8ad56f227f..4c579840e4 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1280,8 +1280,8 @@ fs___CopyFiles <- function(source_fs, source_sel,
destination_fs, destination_ba
invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel,
destination_fs, destination_base_dir, chunk_size, use_threads))
}
-fs___S3FileSystem__create <- function(anonymous, access_key, secret_key,
session_token, role_arn, session_name, external_id, load_frequency, region,
endpoint_override, scheme, proxy_options, background_writes) {
- .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key,
session_token, role_arn, session_name, external_id, load_frequency, region,
endpoint_override, scheme, proxy_options, background_writes)
+fs___S3FileSystem__create <- function(anonymous, access_key, secret_key,
session_token, role_arn, session_name, external_id, load_frequency, region,
endpoint_override, scheme, proxy_options, background_writes,
allow_bucket_creation, allow_bucket_deletion) {
+ .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key,
session_token, role_arn, session_name, external_id, load_frequency, region,
endpoint_override, scheme, proxy_options, background_writes,
allow_bucket_creation, allow_bucket_deletion)
}
fs___S3FileSystem__region <- function(fs) {
diff --git a/r/R/filesystem.R b/r/R/filesystem.R
index a6d845d4c9..b035430ff6 100644
--- a/r/R/filesystem.R
+++ b/r/R/filesystem.R
@@ -150,6 +150,10 @@ FileSelector$create <- function(base_dir, allow_not_found
= FALSE, recursive = F
#' - `scheme`: S3 connection transport (default "https")
#' - `background_writes`: logical, whether `OutputStream` writes will be issued
#' in the background, without blocking (default `TRUE`)
+#' - `allow_bucket_creation`: logical, if TRUE, the filesystem will create
+#' buckets if `$CreateDir()` is called on the bucket level (default
`FALSE`).
+#' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete
+#' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`).
#'
#' @section Methods:
#'
@@ -189,6 +193,15 @@ FileSelector$create <- function(base_dir, allow_not_found
= FALSE, recursive = F
#' - `$base_path`: for `SubTreeFileSystem`, the path in `$base_fs` which is
considered
#' root in this `SubTreeFileSystem`.
#'
+#' @section Notes:
+#'
+#' On S3FileSystem, `$CreateDir()` on a top-level directory creates a new
bucket.
+#' When S3FileSystem creates new buckets (assuming allow_bucket_creation is
TRUE),
+#' it does not pass any non-default settings. In AWS S3, the bucket and all
+#' objects will be not publicly visible, and will have no bucket policies
+#' and no resource tags. To have more control over how buckets are created,
+#' use a different API to create them.
+#'
#' @usage NULL
#' @format NULL
#' @docType class
@@ -338,7 +351,7 @@ S3FileSystem$create <- function(anonymous = FALSE, ...) {
invalid_args <- intersect(
c(
"access_key", "secret_key", "session_token", "role_arn",
"session_name",
- "external_id", "load_frequency"
+ "external_id", "load_frequency", "allow_bucket_creation",
"allow_bucket_deletion"
),
names(args)
)
@@ -383,7 +396,9 @@ default_s3_options <- list(
endpoint_override = "",
scheme = "",
proxy_options = "",
- background_writes = TRUE
+ background_writes = TRUE,
+ allow_bucket_creation = FALSE,
+ allow_bucket_deletion = FALSE
)
#' Connect to an AWS S3 bucket
diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd
index 2f3dcff670..1ed0164465 100644
--- a/r/man/FileSystem.Rd
+++ b/r/man/FileSystem.Rd
@@ -50,6 +50,10 @@ that emulate S3.
\item \code{scheme}: S3 connection transport (default "https")
\item \code{background_writes}: logical, whether \code{OutputStream} writes
will be issued
in the background, without blocking (default \code{TRUE})
+\item \code{allow_bucket_creation}: logical, if TRUE, the filesystem will
create
+buckets if \verb{$CreateDir()} is called on the bucket level (default
\code{FALSE}).
+\item \code{allow_bucket_deletion}: logical, if TRUE, the filesystem will
delete
+buckets if\verb{$DeleteDir()} is called on the bucket level (default
\code{FALSE}).
}
}
@@ -97,3 +101,14 @@ root in this \code{SubTreeFileSystem}.
}
}
+\section{Notes}{
+
+
+On S3FileSystem, \verb{$CreateDir()} on a top-level directory creates a new
bucket.
+When S3FileSystem creates new buckets (assuming allow_bucket_creation is TRUE),
+it does not pass any non-default settings. In AWS S3, the bucket and all
+objects will be not publicly visible, and will have no bucket policies
+and no resource tags. To have more control over how buckets are created,
+use a different API to create them.
+}
+
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 2e3cf0f605..887327d48f 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -3180,8 +3180,8 @@ END_CPP11
}
// filesystem.cpp
#if defined(ARROW_R_WITH_S3)
-std::shared_ptr<fs::S3FileSystem> fs___S3FileSystem__create(bool anonymous,
std::string access_key, std::string secret_key, std::string session_token,
std::string role_arn, std::string session_name, std::string external_id, int
load_frequency, std::string region, std::string endpoint_override, std::string
scheme, std::string proxy_options, bool background_writes);
-extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP
access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP
role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP
load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP
scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp){
+std::shared_ptr<fs::S3FileSystem> fs___S3FileSystem__create(bool anonymous,
std::string access_key, std::string secret_key, std::string session_token,
std::string role_arn, std::string session_name, std::string external_id, int
load_frequency, std::string region, std::string endpoint_override, std::string
scheme, std::string proxy_options, bool background_writes, bool
allow_bucket_creation, bool allow_bucket_deletion);
+extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP
access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP
role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP
load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP
scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP
allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp){
BEGIN_CPP11
arrow::r::Input<bool>::type anonymous(anonymous_sexp);
arrow::r::Input<std::string>::type access_key(access_key_sexp);
@@ -3196,11 +3196,13 @@ BEGIN_CPP11
arrow::r::Input<std::string>::type scheme(scheme_sexp);
arrow::r::Input<std::string>::type proxy_options(proxy_options_sexp);
arrow::r::Input<bool>::type background_writes(background_writes_sexp);
- return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key,
secret_key, session_token, role_arn, session_name, external_id, load_frequency,
region, endpoint_override, scheme, proxy_options, background_writes));
+ arrow::r::Input<bool>::type
allow_bucket_creation(allow_bucket_creation_sexp);
+ arrow::r::Input<bool>::type
allow_bucket_deletion(allow_bucket_deletion_sexp);
+ return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key,
secret_key, session_token, role_arn, session_name, external_id, load_frequency,
region, endpoint_override, scheme, proxy_options, background_writes,
allow_bucket_creation, allow_bucket_deletion));
END_CPP11
}
#else
-extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP
access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP
role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP
load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP
scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp){
+extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP
access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP
role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP
load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP
scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP
allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp){
Rf_error("Cannot call fs___S3FileSystem__create(). See
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow
C++ libraries. ");
}
#endif
@@ -5432,7 +5434,7 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC)
&_arrow_fs___SubTreeFileSystem__base_path, 1},
{ "_arrow_fs___FileSystemFromUri", (DL_FUNC)
&_arrow_fs___FileSystemFromUri, 1},
{ "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles,
6},
- { "_arrow_fs___S3FileSystem__create", (DL_FUNC)
&_arrow_fs___S3FileSystem__create, 13},
+ { "_arrow_fs___S3FileSystem__create", (DL_FUNC)
&_arrow_fs___S3FileSystem__create, 15},
{ "_arrow_fs___S3FileSystem__region", (DL_FUNC)
&_arrow_fs___S3FileSystem__region, 1},
{ "_arrow_io___Readable__Read", (DL_FUNC)
&_arrow_io___Readable__Read, 2},
{ "_arrow_io___InputStream__Close", (DL_FUNC)
&_arrow_io___InputStream__Close, 1},
diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp
index c28f52631a..bcafef34e4 100644
--- a/r/src/filesystem.cpp
+++ b/r/src/filesystem.cpp
@@ -282,7 +282,8 @@ std::shared_ptr<fs::S3FileSystem> fs___S3FileSystem__create(
std::string session_token = "", std::string role_arn = "",
std::string session_name = "", std::string external_id = "", int
load_frequency = 900,
std::string region = "", std::string endpoint_override = "", std::string
scheme = "",
- std::string proxy_options = "", bool background_writes = true) {
+ std::string proxy_options = "", bool background_writes = true,
+ bool allow_bucket_creation = false, bool allow_bucket_deletion = false) {
// We need to ensure that S3 is initialized before we start messing with the
// options
StopIfNotOk(fs::EnsureS3Initialized());
@@ -321,6 +322,9 @@ std::shared_ptr<fs::S3FileSystem> fs___S3FileSystem__create(
/// default true
s3_opts.background_writes = background_writes;
+ s3_opts.allow_bucket_creation = allow_bucket_creation;
+ s3_opts.allow_bucket_deletion = allow_bucket_deletion;
+
auto io_context = arrow::io::IOContext(gc_memory_pool());
return ValueOrStop(fs::S3FileSystem::Make(s3_opts, io_context));
}
diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R
index 51db355783..ad11d04d5e 100644
--- a/r/tests/testthat/test-s3-minio.R
+++ b/r/tests/testthat/test-s3-minio.R
@@ -34,7 +34,17 @@ if (arrow_with_s3() && process_is_running("minio server")) {
access_key = minio_key,
secret_key = minio_secret,
scheme = "http",
- endpoint_override = paste0("localhost:", minio_port)
+ endpoint_override = paste0("localhost:", minio_port),
+ allow_bucket_creation = TRUE,
+ allow_bucket_deletion = TRUE
+ )
+ limited_fs <- S3FileSystem$create(
+ access_key = minio_key,
+ secret_key = minio_secret,
+ scheme = "http",
+ endpoint_override = paste0("localhost:", minio_port),
+ allow_bucket_creation = FALSE,
+ allow_bucket_deletion = FALSE
)
now <- as.character(as.numeric(Sys.time()))
fs$CreateDir(now)
@@ -180,6 +190,14 @@ if (arrow_with_s3() && process_is_running("minio server"))
{
expect_length(fs$ls(minio_path("new_dataset_dir")), 1)
})
+ test_that("CreateDir fails on bucket if allow_bucket_creation=False", {
+ now_tmp <- paste0(now, "-test-fail-delete")
+ fs$CreateDir(now_tmp)
+
+ expect_error(limited_fs$CreateDir("should-fail"))
+ expect_error(limited_fs$DeleteDir(now_tmp))
+ })
+
test_that("Let's test copy_files too", {
td <- make_temp_dir()
copy_files(minio_uri("hive_dir"), td)