This is an automated email from the ASF dual-hosted git repository.
felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 476c78fd6e GH-38597: [C++] Implement GetFileInfo(selector) for Azure
filesystem (#39009)
476c78fd6e is described below
commit 476c78fd6e535faacfc6a171529ef496abb30cd9
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Fri Dec 8 15:48:06 2023 -0300
GH-38597: [C++] Implement GetFileInfo(selector) for Azure filesystem
(#39009)
### Rationale for this change
Part of Azure FS implementation.
### What changes are included in this PR?
The version of `GetFileInfo` that takes a prefix and can optionally
recurse into directories.
### Are these changes tested?
By unit tests present in this PR. Separate from this PR, I'm thinking of
way to fuzz-test the FS API.
* Closes: #38597
---
cpp/src/arrow/filesystem/azurefs.cc | 212 +++++++++++++++++++++++++-
cpp/src/arrow/filesystem/azurefs.h | 2 +-
cpp/src/arrow/filesystem/azurefs_test.cc | 248 +++++++++++++++++++++++++++++--
cpp/src/arrow/filesystem/filesystem.cc | 3 +-
cpp/src/arrow/filesystem/path_util.cc | 31 ++--
cpp/src/arrow/filesystem/path_util.h | 12 +-
cpp/src/arrow/filesystem/test_util.cc | 6 +
cpp/src/arrow/filesystem/test_util.h | 4 +
8 files changed, 481 insertions(+), 37 deletions(-)
diff --git a/cpp/src/arrow/filesystem/azurefs.cc
b/cpp/src/arrow/filesystem/azurefs.cc
index 9bd2b0ae9d..daababb04c 100644
--- a/cpp/src/arrow/filesystem/azurefs.cc
+++ b/cpp/src/arrow/filesystem/azurefs.cc
@@ -39,7 +39,7 @@ namespace fs {
// -----------------------------------------------------------------------
// AzureOptions Implementation
-AzureOptions::AzureOptions() {}
+AzureOptions::AzureOptions() = default;
bool AzureOptions::Equals(const AzureOptions& other) const {
return (account_dfs_url == other.account_dfs_url &&
@@ -820,6 +820,209 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status VisitContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ auto path = internal::ConcatAbstractPath(container, blob.Name);
+ if (internal::HasTrailingSlash(blob.Name)) {
+ return DirectoryFileInfoFromPath(path);
+ }
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ DCHECK(!internal::HasTrailingSlash(s));
+ auto offset = s.find_last_of(internal::kSep);
+ auto result = (offset == std::string_view::npos) ? s : s.substr(offset);
+ DCHECK(!result.empty() && result.back() != internal::kSep);
+ return result;
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
+ Azure::Storage::Blobs::ListBlobsOptions options;
+ if (internal::IsEmptyPath(base_location.path)) {
+ // If the base_dir is the root of the container, then we want to list
all blobs in
+ // the container and the Prefix should be empty and not even include the
trailing
+ // slash because the container itself represents the `<container>/`
directory.
+ options.Prefix = {};
+ found = true; // Unless the container itself is not found later!
+ } else {
+ options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+ }
+ options.PageSizeHint = page_size_hint;
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+ auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+ if (select.recursive && select.max_recursion > 0) {
+ FileSelector sub_select;
+ sub_select.base_dir = internal::ConcatAbstractPath(
+ base_location.container,
internal::RemoveTrailingSlash(blob_prefix));
+ sub_select.allow_not_found = true;
+ sub_select.recursive = true;
+ sub_select.max_recursion = select.max_recursion - 1;
+ return GetFileInfoWithSelectorFromContainer(
+ container_client, context, page_size_hint, sub_select,
acc_results);
+ }
+ return Status::OK();
+ };
+
+ auto process_blob =
+ [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
+ // blob.Name has trailing slash only when Prefix is an empty
+ // directory marker blob for the directory we're listing
+ // from, and we should skip it.
+ if (!internal::HasTrailingSlash(blob.Name)) {
+ acc_results->push_back(FileInfoFromBlob(base_location.container,
blob));
+ }
+ };
+ auto process_prefix = [&](const std::string& prefix) noexcept -> Status {
+ const auto path = internal::ConcatAbstractPath(base_location.container,
prefix);
+ acc_results->push_back(DirectoryFileInfoFromPath(path));
+ return recurse(prefix);
+ };
+
+ try {
+ auto list_response =
+ container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options,
context);
+ for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
+ if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty())
{
+ continue;
+ }
+ found = true;
+ // Blob and BlobPrefixes are sorted by name, so we can merge-iterate
+ // them to ensure returned results are all sorted.
+ size_t blob_index = 0;
+ size_t blob_prefix_index = 0;
+ while (blob_index < list_response.Blobs.size() &&
+ blob_prefix_index < list_response.BlobPrefixes.size()) {
+ const auto& blob = list_response.Blobs[blob_index];
+ const auto& prefix = list_response.BlobPrefixes[blob_prefix_index];
+ const int cmp = blob.Name.compare(prefix);
+ if (cmp < 0) {
+ process_blob(blob);
+ blob_index += 1;
+ } else if (cmp > 0) {
+ RETURN_NOT_OK(process_prefix(prefix));
+ blob_prefix_index += 1;
+ } else {
+ DCHECK_EQ(blob.Name, prefix);
+ RETURN_NOT_OK(process_prefix(prefix));
+ blob_index += 1;
+ blob_prefix_index += 1;
+ // If the container has an empty dir marker blob and another blob
starting
+ // with this blob name as a prefix, the blob doesn't appear in the
listing
+ // that also contains the prefix, so AFAICT this branch in
unreachable. The
+ // code above is kept just in case, but if this DCHECK(false) is
ever reached,
+ // we should refactor this loop to ensure no duplicate entries are
ever
+ // reported.
+ DCHECK(false)
+ << "Unexpected blob/prefix name collision on the same listing
request";
+ }
+ }
+ for (; blob_index < list_response.Blobs.size(); blob_index++) {
+ process_blob(list_response.Blobs[blob_index]);
+ }
+ for (; blob_prefix_index < list_response.BlobPrefixes.size();
+ blob_prefix_index++) {
+
RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index]));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ if (exception.ErrorCode == "ContainerNotFound") {
+ found = false;
+ } else {
+ return internal::ExceptionToStatus(
+ "Failed to list blobs in a directory: " + select.base_dir + ": " +
+ container_client.GetUrl(),
+ exception);
+ }
+ }
+
+ return found || select.allow_not_found
+ ? Status::OK()
+ : ::arrow::fs::internal::PathNotFound(select.base_dir);
+ }
+
+ public:
+ Status GetFileInfoWithSelector(const Azure::Core::Context& context,
+ Azure::Nullable<int32_t> page_size_hint,
+ const FileSelector& select,
+ FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ if (base_location.container.empty()) {
+ // Without a container, the base_location is equivalent to the filesystem
+ // root -- `/`. FileSelector::allow_not_found doesn't matter in this case
+ // because the root always exists.
+ auto on_container =
+ [&](const Azure::Storage::Blobs::Models::BlobContainerItem&
container) {
+ // Deleted containers are not listed by ListContainers.
+ DCHECK(!container.IsDeleted);
+
+ // Every container is considered a directory.
+ FileInfo info{container.Name, FileType::Directory};
+ info.set_mtime(
+
std::chrono::system_clock::time_point{container.Details.LastModified});
+ acc_results->push_back(std::move(info));
+
+ // Recurse into containers (subdirectories) if requested.
+ if (select.recursive && select.max_recursion > 0) {
+ FileSelector sub_select;
+ sub_select.base_dir = container.Name;
+ sub_select.allow_not_found = true;
+ sub_select.recursive = true;
+ sub_select.max_recursion = select.max_recursion - 1;
+ ARROW_RETURN_NOT_OK(GetFileInfoWithSelector(context,
page_size_hint,
+ sub_select,
acc_results));
+ }
+ return Status::OK();
+ };
+ return VisitContainers(context, std::move(on_container));
+ }
+
+ auto container_client =
+ blob_service_client_->GetBlobContainerClient(base_location.container);
+ return GetFileInfoWithSelectorFromContainer(container_client, context,
page_size_hint,
+ select, acc_results);
+ }
+
Result<std::shared_ptr<ObjectInputFile>> OpenInputFile(const AzureLocation&
location,
AzureFileSystem* fs) {
RETURN_NOT_OK(ValidateFileLocation(location));
@@ -1196,7 +1399,12 @@ Result<FileInfo> AzureFileSystem::GetFileInfo(const
std::string& path) {
}
Result<FileInfoVector> AzureFileSystem::GetFileInfo(const FileSelector&
select) {
- return Status::NotImplemented("The Azure FileSystem is not fully
implemented");
+ Azure::Core::Context context;
+ Azure::Nullable<int32_t> page_size_hint; // unspecified
+ FileInfoVector results;
+ RETURN_NOT_OK(
+ impl_->GetFileInfoWithSelector(context, page_size_hint, select,
&results));
+ return {std::move(results)};
}
Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) {
diff --git a/cpp/src/arrow/filesystem/azurefs.h
b/cpp/src/arrow/filesystem/azurefs.h
index 9f980ee8ba..b2865b059e 100644
--- a/cpp/src/arrow/filesystem/azurefs.h
+++ b/cpp/src/arrow/filesystem/azurefs.h
@@ -157,7 +157,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem {
const AzureOptions& options, const io::IOContext& =
io::default_io_context());
private:
- explicit AzureFileSystem(const AzureOptions& options, const io::IOContext&
io_context);
+ AzureFileSystem(const AzureOptions& options, const io::IOContext&
io_context);
class Impl;
std::unique_ptr<Impl> impl_;
diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc
b/cpp/src/arrow/filesystem/azurefs_test.cc
index 41f1663114..792c63b209 100644
--- a/cpp/src/arrow/filesystem/azurefs_test.cc
+++ b/cpp/src/arrow/filesystem/azurefs_test.cc
@@ -70,6 +70,9 @@ using ::testing::IsEmpty;
using ::testing::Not;
using ::testing::NotNull;
+namespace Blobs = Azure::Storage::Blobs;
+namespace Files = Azure::Storage::Files;
+
auto const* kLoremIpsum = R"""(
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
@@ -193,9 +196,8 @@ TEST(AzureFileSystem, OptionsCompare) {
class AzureFileSystemTest : public ::testing::Test {
public:
std::shared_ptr<FileSystem> fs_;
- std::unique_ptr<Azure::Storage::Blobs::BlobServiceClient>
blob_service_client_;
- std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>
- datalake_service_client_;
+ std::unique_ptr<Blobs::BlobServiceClient> blob_service_client_;
+ std::unique_ptr<Files::DataLake::DataLakeServiceClient>
datalake_service_client_;
AzureOptions options_;
std::mt19937_64 generator_;
std::string container_name_;
@@ -213,15 +215,14 @@ class AzureFileSystemTest : public ::testing::Test {
suite_skipped_ = true;
GTEST_SKIP() << options.status().message();
}
- container_name_ = RandomChars(32);
- blob_service_client_ =
std::make_unique<Azure::Storage::Blobs::BlobServiceClient>(
+ // Stop-gap solution before GH-39119 is fixed.
+ container_name_ = "z" + RandomChars(31);
+ blob_service_client_ = std::make_unique<Blobs::BlobServiceClient>(
options_.account_blob_url, options_.storage_credentials_provider);
- datalake_service_client_ =
-
std::make_unique<Azure::Storage::Files::DataLake::DataLakeServiceClient>(
- options_.account_dfs_url, options_.storage_credentials_provider);
+ datalake_service_client_ =
std::make_unique<Files::DataLake::DataLakeServiceClient>(
+ options_.account_dfs_url, options_.storage_credentials_provider);
ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_));
- auto container_client =
blob_service_client_->GetBlobContainerClient(container_name_);
- container_client.CreateIfNotExists();
+ auto container_client = CreateContainer(container_name_);
auto blob_client =
container_client.GetBlockBlobClient(PreexistingObjectName());
blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
@@ -239,6 +240,20 @@ class AzureFileSystemTest : public ::testing::Test {
}
}
+ Blobs::BlobContainerClient CreateContainer(const std::string& name) {
+ auto container_client = blob_service_client_->GetBlobContainerClient(name);
+ (void)container_client.CreateIfNotExists();
+ return container_client;
+ }
+
+ Blobs::BlobClient CreateBlob(Blobs::BlobContainerClient& container_client,
+ const std::string& name, const std::string&
data = "") {
+ auto blob_client = container_client.GetBlockBlobClient(name);
+ (void)blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(data.data()),
+ data.size());
+ return blob_client;
+ }
+
std::string PreexistingContainerName() const { return container_name_; }
std::string PreexistingContainerPath() const {
@@ -326,6 +341,45 @@ class AzureFileSystemTest : public ::testing::Test {
top_blob_path,
};
}
+
+ char const* kSubData = "sub data";
+ char const* kSomeData = "some data";
+ char const* kOtherData = "other data";
+
+ void SetUpSmallFileSystemTree() {
+ // Set up test containers
+ CreateContainer("empty-container");
+ auto container = CreateContainer("container");
+
+ CreateBlob(container, "emptydir/");
+ CreateBlob(container, "somedir/subdir/subfile", kSubData);
+ CreateBlob(container, "somefile", kSomeData);
+ // Add an explicit marker for a non-empty directory.
+ CreateBlob(container, "otherdir/1/2/");
+ // otherdir/{1/,2/,3/} are implicitly assumed to exist because of
+ // the otherdir/1/2/3/otherfile blob.
+ CreateBlob(container, "otherdir/1/2/3/otherfile", kOtherData);
+ }
+
+ void AssertInfoAllContainersRecursive(const std::vector<FileInfo>& infos) {
+ ASSERT_EQ(infos.size(), 14);
+ AssertFileInfo(infos[0], "container", FileType::Directory);
+ AssertFileInfo(infos[1], "container/emptydir", FileType::Directory);
+ AssertFileInfo(infos[2], "container/otherdir", FileType::Directory);
+ AssertFileInfo(infos[3], "container/otherdir/1", FileType::Directory);
+ AssertFileInfo(infos[4], "container/otherdir/1/2", FileType::Directory);
+ AssertFileInfo(infos[5], "container/otherdir/1/2/3", FileType::Directory);
+ AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile",
FileType::File,
+ strlen(kOtherData));
+ AssertFileInfo(infos[7], "container/somedir", FileType::Directory);
+ AssertFileInfo(infos[8], "container/somedir/subdir", FileType::Directory);
+ AssertFileInfo(infos[9], "container/somedir/subdir/subfile",
FileType::File,
+ strlen(kSubData));
+ AssertFileInfo(infos[10], "container/somefile", FileType::File,
strlen(kSomeData));
+ AssertFileInfo(infos[11], "empty-container", FileType::Directory);
+ AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory);
+ AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File);
+ }
};
class AzuriteFileSystemTest : public AzureFileSystemTest {
@@ -518,6 +572,180 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest,
GetFileInfoObject) {
RunGetFileInfoObjectTest();
}
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) {
+ SetUpSmallFileSystemTree();
+
+ FileSelector select;
+ std::vector<FileInfo> infos;
+
+ // Root dir
+ select.base_dir = "";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 3);
+ ASSERT_EQ(infos, SortedInfos(infos));
+ AssertFileInfo(infos[0], "container", FileType::Directory);
+ AssertFileInfo(infos[1], "empty-container", FileType::Directory);
+ AssertFileInfo(infos[2], container_name_, FileType::Directory);
+
+ // Empty container
+ select.base_dir = "empty-container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ // Nonexistent container
+ select.base_dir = "nonexistent-container";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.allow_not_found = true;
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.allow_not_found = false;
+ // Non-empty container
+ select.base_dir = "container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 4);
+ AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+ AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+ AssertFileInfo(infos[2], "container/somedir", FileType::Directory);
+ AssertFileInfo(infos[3], "container/somefile", FileType::File, 9);
+
+ // Empty "directory"
+ select.base_dir = "container/emptydir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ // Non-empty "directories"
+ select.base_dir = "container/somedir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+ select.base_dir = "container/somedir/subdir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File,
8);
+ // Nonexistent
+ select.base_dir = "container/nonexistent";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.allow_not_found = true;
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.allow_not_found = false;
+
+ // Trailing slashes
+ select.base_dir = "empty-container/";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.base_dir = "nonexistent-container/";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.base_dir = "container/";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 4);
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) {
+ SetUpSmallFileSystemTree();
+
+ FileSelector select;
+ select.recursive = true;
+
+ std::vector<FileInfo> infos;
+ // Root dir
+ select.base_dir = "";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 14);
+ ASSERT_EQ(infos, SortedInfos(infos));
+ AssertInfoAllContainersRecursive(infos);
+
+ // Empty container
+ select.base_dir = "empty-container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+
+ // Non-empty container
+ select.base_dir = "container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 10);
+ AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+ AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+ AssertFileInfo(infos[2], "container/otherdir/1", FileType::Directory);
+ AssertFileInfo(infos[3], "container/otherdir/1/2", FileType::Directory);
+ AssertFileInfo(infos[4], "container/otherdir/1/2/3", FileType::Directory);
+ AssertFileInfo(infos[5], "container/otherdir/1/2/3/otherfile",
FileType::File, 10);
+ AssertFileInfo(infos[6], "container/somedir", FileType::Directory);
+ AssertFileInfo(infos[7], "container/somedir/subdir", FileType::Directory);
+ AssertFileInfo(infos[8], "container/somedir/subdir/subfile", FileType::File,
8);
+ AssertFileInfo(infos[9], "container/somefile", FileType::File, 9);
+
+ // Empty "directory"
+ select.base_dir = "container/emptydir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+
+ // Non-empty "directories"
+ select.base_dir = "container/somedir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 2);
+ AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+ AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File,
8);
+
+ select.base_dir = "container/otherdir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 4);
+ AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory);
+ AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory);
+ AssertFileInfo(infos[2], "container/otherdir/1/2/3", FileType::Directory);
+ AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile",
FileType::File, 10);
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) {
+ {
+ auto container = CreateContainer("container");
+ CreateBlob(container, "mydir/emptydir1/");
+ CreateBlob(container, "mydir/emptydir2/");
+ CreateBlob(container, "mydir/nonemptydir1/"); // explicit dir marker
+ CreateBlob(container, "mydir/nonemptydir1/somefile", kSomeData);
+ CreateBlob(container, "mydir/nonemptydir2/somefile", kSomeData);
+ }
+ std::vector<FileInfo> infos;
+
+ FileSelector select; // non-recursive
+ select.base_dir = "container";
+
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ ASSERT_EQ(infos, SortedInfos(infos));
+ AssertFileInfo(infos[0], "container/mydir", FileType::Directory);
+
+ select.base_dir = "container/mydir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 4);
+ ASSERT_EQ(infos, SortedInfos(infos));
+ AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory);
+ AssertFileInfo(infos[1], "container/mydir/emptydir2", FileType::Directory);
+ AssertFileInfo(infos[2], "container/mydir/nonemptydir1",
FileType::Directory);
+ AssertFileInfo(infos[3], "container/mydir/nonemptydir2",
FileType::Directory);
+
+ select.base_dir = "container/mydir/emptydir1";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+
+ select.base_dir = "container/mydir/emptydir2";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+
+ select.base_dir = "container/mydir/nonemptydir1";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile",
FileType::File);
+
+ select.base_dir = "container/mydir/nonemptydir2";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile",
FileType::File);
+}
+
TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) {
ASSERT_RAISES(Invalid, fs_->CreateDir("", false));
}
diff --git a/cpp/src/arrow/filesystem/filesystem.cc
b/cpp/src/arrow/filesystem/filesystem.cc
index 9ecc4610f3..810e9c179b 100644
--- a/cpp/src/arrow/filesystem/filesystem.cc
+++ b/cpp/src/arrow/filesystem/filesystem.cc
@@ -654,8 +654,7 @@ Status CopyFiles(const std::shared_ptr<FileSystem>&
source_fs,
"', which is outside base dir '",
source_sel.base_dir, "'");
}
- auto destination_path =
- internal::ConcatAbstractPath(destination_base_dir,
std::string(*relative));
+ auto destination_path = internal::ConcatAbstractPath(destination_base_dir,
*relative);
if (source_info.IsDirectory()) {
dirs.push_back(destination_path);
diff --git a/cpp/src/arrow/filesystem/path_util.cc
b/cpp/src/arrow/filesystem/path_util.cc
index 46ea436a9f..9c895ae76c 100644
--- a/cpp/src/arrow/filesystem/path_util.cc
+++ b/cpp/src/arrow/filesystem/path_util.cc
@@ -52,7 +52,7 @@ std::vector<std::string> SplitAbstractPath(const std::string&
path, char sep) {
}
auto append_part = [&parts, &v](size_t start, size_t end) {
- parts.push_back(std::string(v.substr(start, end - start)));
+ parts.emplace_back(v.substr(start, end - start));
};
size_t start = 0;
@@ -72,15 +72,12 @@ std::string SliceAbstractPath(const std::string& s, int
offset, int length, char
return "";
}
std::vector<std::string> components = SplitAbstractPath(s, sep);
- std::stringstream combined;
if (offset >= static_cast<int>(components.size())) {
return "";
}
- int end = offset + length;
- if (end > static_cast<int>(components.size())) {
- end = static_cast<int>(components.size());
- }
- for (int i = offset; i < end; i++) {
+ const auto end = std::min(static_cast<size_t>(offset) + length,
components.size());
+ std::stringstream combined;
+ for (auto i = static_cast<size_t>(offset); i < end; i++) {
combined << components[i];
if (i < end - 1) {
combined << sep;
@@ -140,16 +137,20 @@ Status ValidateAbstractPathParts(const
std::vector<std::string>& parts) {
return Status::OK();
}
-std::string ConcatAbstractPath(const std::string& base, const std::string&
stem) {
+std::string ConcatAbstractPath(std::string_view base, std::string_view stem) {
DCHECK(!stem.empty());
if (base.empty()) {
- return stem;
+ return std::string{stem};
}
- return EnsureTrailingSlash(base) + std::string(RemoveLeadingSlash(stem));
+ std::string result;
+ result.reserve(base.length() + stem.length() + 1); // extra 1 is for
potential kSep
+ result += EnsureTrailingSlash(base);
+ result += RemoveLeadingSlash(stem);
+ return result;
}
std::string EnsureTrailingSlash(std::string_view v) {
- if (v.length() > 0 && v.back() != kSep) {
+ if (!v.empty() && !HasTrailingSlash(v)) {
// XXX How about "C:" on Windows? We probably don't want to turn it into
"C:/"...
// Unless the local filesystem always uses absolute paths
return std::string(v) + kSep;
@@ -159,7 +160,7 @@ std::string EnsureTrailingSlash(std::string_view v) {
}
std::string EnsureLeadingSlash(std::string_view v) {
- if (v.length() == 0 || v.front() != kSep) {
+ if (!HasLeadingSlash(v)) {
// XXX How about "C:" on Windows? We probably don't want to turn it into
"/C:"...
return kSep + std::string(v);
} else {
@@ -197,10 +198,6 @@ Status AssertNoTrailingSlash(std::string_view key) {
return Status::OK();
}
-bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; }
-
-bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; }
-
Result<std::string> MakeAbstractPathRelative(const std::string& base,
const std::string& path) {
if (base.empty() || base.front() != kSep) {
@@ -383,7 +380,7 @@ struct Globber::Impl {
Globber::Globber(std::string pattern) : impl_(new Impl(pattern)) {}
-Globber::~Globber() {}
+Globber::~Globber() = default;
bool Globber::Matches(const std::string& path) {
return regex_match(path, impl_->pattern_);
diff --git a/cpp/src/arrow/filesystem/path_util.h
b/cpp/src/arrow/filesystem/path_util.h
index 2c8c123e77..1da7afd3f9 100644
--- a/cpp/src/arrow/filesystem/path_util.h
+++ b/cpp/src/arrow/filesystem/path_util.h
@@ -69,7 +69,7 @@ Status ValidateAbstractPathParts(const
std::vector<std::string>& parts);
// Append a non-empty stem to an abstract path.
ARROW_EXPORT
-std::string ConcatAbstractPath(const std::string& base, const std::string&
stem);
+std::string ConcatAbstractPath(std::string_view base, std::string_view stem);
// Make path relative to base, if it starts with base. Otherwise error out.
ARROW_EXPORT
@@ -94,11 +94,13 @@ std::string_view RemoveTrailingSlash(std::string_view s,
bool preserve_root = fa
ARROW_EXPORT
Status AssertNoTrailingSlash(std::string_view s);
-ARROW_EXPORT
-bool HasTrailingSlash(std::string_view s);
+inline bool HasTrailingSlash(std::string_view s) {
+ return !s.empty() && s.back() == kSep;
+}
-ARROW_EXPORT
-bool HasLeadingSlash(std::string_view s);
+inline bool HasLeadingSlash(std::string_view s) {
+ return !s.empty() && s.front() == kSep;
+}
ARROW_EXPORT
bool IsAncestorOf(std::string_view ancestor, std::string_view descendant);
diff --git a/cpp/src/arrow/filesystem/test_util.cc
b/cpp/src/arrow/filesystem/test_util.cc
index 6c5dda8e65..040917dcd2 100644
--- a/cpp/src/arrow/filesystem/test_util.cc
+++ b/cpp/src/arrow/filesystem/test_util.cc
@@ -126,6 +126,12 @@ void SortInfos(std::vector<FileInfo>* infos) {
std::sort(infos->begin(), infos->end(), FileInfo::ByPath{});
}
+std::vector<FileInfo> SortedInfos(const std::vector<FileInfo>& infos) {
+ auto sorted = infos;
+ SortInfos(&sorted);
+ return sorted;
+}
+
void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector*
out_infos) {
auto fut = CollectAsyncGenerator(gen);
ASSERT_FINISHES_OK_AND_ASSIGN(auto nested_infos, fut);
diff --git a/cpp/src/arrow/filesystem/test_util.h
b/cpp/src/arrow/filesystem/test_util.h
index c4d846fd31..62b488e159 100644
--- a/cpp/src/arrow/filesystem/test_util.h
+++ b/cpp/src/arrow/filesystem/test_util.h
@@ -74,6 +74,10 @@ void CreateFile(FileSystem* fs, const std::string& path,
const std::string& data
ARROW_TESTING_EXPORT
void SortInfos(FileInfoVector* infos);
+// Create a copy of a FileInfo vector sorted by lexicographic path order
+ARROW_TESTING_EXPORT
+FileInfoVector SortedInfos(const FileInfoVector& infos);
+
ARROW_TESTING_EXPORT
void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector*
out_infos);