This is an automated email from the ASF dual-hosted git repository.

felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 476c78fd6e GH-38597: [C++] Implement GetFileInfo(selector) for Azure 
filesystem (#39009)
476c78fd6e is described below

commit 476c78fd6e535faacfc6a171529ef496abb30cd9
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Fri Dec 8 15:48:06 2023 -0300

    GH-38597: [C++] Implement GetFileInfo(selector) for Azure filesystem 
(#39009)
    
    ### Rationale for this change
    
    Part of Azure FS implementation.
    
    ### What changes are included in this PR?
    
    The version of `GetFileInfo` that takes a prefix and can optionally
    recurse into directories.
    
    ### Are these changes tested?
    
    By unit tests present in this PR. Separate from this PR, I'm thinking of
    way to fuzz-test the FS API.
    * Closes: #38597
---
 cpp/src/arrow/filesystem/azurefs.cc      | 212 +++++++++++++++++++++++++-
 cpp/src/arrow/filesystem/azurefs.h       |   2 +-
 cpp/src/arrow/filesystem/azurefs_test.cc | 248 +++++++++++++++++++++++++++++--
 cpp/src/arrow/filesystem/filesystem.cc   |   3 +-
 cpp/src/arrow/filesystem/path_util.cc    |  31 ++--
 cpp/src/arrow/filesystem/path_util.h     |  12 +-
 cpp/src/arrow/filesystem/test_util.cc    |   6 +
 cpp/src/arrow/filesystem/test_util.h     |   4 +
 8 files changed, 481 insertions(+), 37 deletions(-)

diff --git a/cpp/src/arrow/filesystem/azurefs.cc 
b/cpp/src/arrow/filesystem/azurefs.cc
index 9bd2b0ae9d..daababb04c 100644
--- a/cpp/src/arrow/filesystem/azurefs.cc
+++ b/cpp/src/arrow/filesystem/azurefs.cc
@@ -39,7 +39,7 @@ namespace fs {
 // -----------------------------------------------------------------------
 // AzureOptions Implementation
 
-AzureOptions::AzureOptions() {}
+AzureOptions::AzureOptions() = default;
 
 bool AzureOptions::Equals(const AzureOptions& other) const {
   return (account_dfs_url == other.account_dfs_url &&
@@ -820,6 +820,209 @@ class AzureFileSystem::Impl {
     }
   }
 
+ private:
+  template <typename OnContainer>
+  Status VisitContainers(const Azure::Core::Context& context,
+                         OnContainer&& on_container) const {
+    Azure::Storage::Blobs::ListBlobContainersOptions options;
+    try {
+      auto container_list_response =
+          blob_service_client_->ListBlobContainers(options, context);
+      for (; container_list_response.HasPage();
+           container_list_response.MoveToNextPage(context)) {
+        for (const auto& container : container_list_response.BlobContainers) {
+          RETURN_NOT_OK(on_container(container));
+        }
+      }
+    } catch (const Azure::Storage::StorageException& exception) {
+      return internal::ExceptionToStatus("Failed to list account containers.", 
exception);
+    }
+    return Status::OK();
+  }
+
+  static FileInfo FileInfoFromBlob(const std::string& container,
+                                   const 
Azure::Storage::Blobs::Models::BlobItem& blob) {
+    auto path = internal::ConcatAbstractPath(container, blob.Name);
+    if (internal::HasTrailingSlash(blob.Name)) {
+      return DirectoryFileInfoFromPath(path);
+    }
+    FileInfo info{std::move(path), FileType::File};
+    info.set_size(blob.BlobSize);
+    
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+    return info;
+  }
+
+  static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+    return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+                    FileType::Directory};
+  }
+
+  static std::string_view BasenameView(std::string_view s) {
+    DCHECK(!internal::HasTrailingSlash(s));
+    auto offset = s.find_last_of(internal::kSep);
+    auto result = (offset == std::string_view::npos) ? s : s.substr(offset);
+    DCHECK(!result.empty() && result.back() != internal::kSep);
+    return result;
+  }
+
+  /// \brief List the blobs at the root of a container or some dir in a 
container.
+  ///
+  /// \pre container_client is the client for the container named like the 
first
+  /// segment of select.base_dir.
+  Status GetFileInfoWithSelectorFromContainer(
+      const Azure::Storage::Blobs::BlobContainerClient& container_client,
+      const Azure::Core::Context& context, Azure::Nullable<int32_t> 
page_size_hint,
+      const FileSelector& select, FileInfoVector* acc_results) {
+    ARROW_ASSIGN_OR_RAISE(auto base_location, 
AzureLocation::FromString(select.base_dir));
+
+    bool found = false;
+    Azure::Storage::Blobs::ListBlobsOptions options;
+    if (internal::IsEmptyPath(base_location.path)) {
+      // If the base_dir is the root of the container, then we want to list 
all blobs in
+      // the container and the Prefix should be empty and not even include the 
trailing
+      // slash because the container itself represents the `<container>/` 
directory.
+      options.Prefix = {};
+      found = true;  // Unless the container itself is not found later!
+    } else {
+      options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+    }
+    options.PageSizeHint = page_size_hint;
+    options.Include = 
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+    auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+      if (select.recursive && select.max_recursion > 0) {
+        FileSelector sub_select;
+        sub_select.base_dir = internal::ConcatAbstractPath(
+            base_location.container, 
internal::RemoveTrailingSlash(blob_prefix));
+        sub_select.allow_not_found = true;
+        sub_select.recursive = true;
+        sub_select.max_recursion = select.max_recursion - 1;
+        return GetFileInfoWithSelectorFromContainer(
+            container_client, context, page_size_hint, sub_select, 
acc_results);
+      }
+      return Status::OK();
+    };
+
+    auto process_blob =
+        [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
+          // blob.Name has trailing slash only when Prefix is an empty
+          // directory marker blob for the directory we're listing
+          // from, and we should skip it.
+          if (!internal::HasTrailingSlash(blob.Name)) {
+            acc_results->push_back(FileInfoFromBlob(base_location.container, 
blob));
+          }
+        };
+    auto process_prefix = [&](const std::string& prefix) noexcept -> Status {
+      const auto path = internal::ConcatAbstractPath(base_location.container, 
prefix);
+      acc_results->push_back(DirectoryFileInfoFromPath(path));
+      return recurse(prefix);
+    };
+
+    try {
+      auto list_response =
+          container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, 
context);
+      for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
+        if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) 
{
+          continue;
+        }
+        found = true;
+        // Blob and BlobPrefixes are sorted by name, so we can merge-iterate
+        // them to ensure returned results are all sorted.
+        size_t blob_index = 0;
+        size_t blob_prefix_index = 0;
+        while (blob_index < list_response.Blobs.size() &&
+               blob_prefix_index < list_response.BlobPrefixes.size()) {
+          const auto& blob = list_response.Blobs[blob_index];
+          const auto& prefix = list_response.BlobPrefixes[blob_prefix_index];
+          const int cmp = blob.Name.compare(prefix);
+          if (cmp < 0) {
+            process_blob(blob);
+            blob_index += 1;
+          } else if (cmp > 0) {
+            RETURN_NOT_OK(process_prefix(prefix));
+            blob_prefix_index += 1;
+          } else {
+            DCHECK_EQ(blob.Name, prefix);
+            RETURN_NOT_OK(process_prefix(prefix));
+            blob_index += 1;
+            blob_prefix_index += 1;
+            // If the container has an empty dir marker blob and another blob 
starting
+            // with this blob name as a prefix, the blob doesn't appear in the 
listing
+            // that also contains the prefix, so AFAICT this branch in 
unreachable. The
+            // code above is kept just in case, but if this DCHECK(false) is 
ever reached,
+            // we should refactor this loop to ensure no duplicate entries are 
ever
+            // reported.
+            DCHECK(false)
+                << "Unexpected blob/prefix name collision on the same listing 
request";
+          }
+        }
+        for (; blob_index < list_response.Blobs.size(); blob_index++) {
+          process_blob(list_response.Blobs[blob_index]);
+        }
+        for (; blob_prefix_index < list_response.BlobPrefixes.size();
+             blob_prefix_index++) {
+          
RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index]));
+        }
+      }
+    } catch (const Azure::Storage::StorageException& exception) {
+      if (exception.ErrorCode == "ContainerNotFound") {
+        found = false;
+      } else {
+        return internal::ExceptionToStatus(
+            "Failed to list blobs in a directory: " + select.base_dir + ": " +
+                container_client.GetUrl(),
+            exception);
+      }
+    }
+
+    return found || select.allow_not_found
+               ? Status::OK()
+               : ::arrow::fs::internal::PathNotFound(select.base_dir);
+  }
+
+ public:
+  Status GetFileInfoWithSelector(const Azure::Core::Context& context,
+                                 Azure::Nullable<int32_t> page_size_hint,
+                                 const FileSelector& select,
+                                 FileInfoVector* acc_results) {
+    ARROW_ASSIGN_OR_RAISE(auto base_location, 
AzureLocation::FromString(select.base_dir));
+
+    if (base_location.container.empty()) {
+      // Without a container, the base_location is equivalent to the filesystem
+      // root -- `/`. FileSelector::allow_not_found doesn't matter in this case
+      // because the root always exists.
+      auto on_container =
+          [&](const Azure::Storage::Blobs::Models::BlobContainerItem& 
container) {
+            // Deleted containers are not listed by ListContainers.
+            DCHECK(!container.IsDeleted);
+
+            // Every container is considered a directory.
+            FileInfo info{container.Name, FileType::Directory};
+            info.set_mtime(
+                
std::chrono::system_clock::time_point{container.Details.LastModified});
+            acc_results->push_back(std::move(info));
+
+            // Recurse into containers (subdirectories) if requested.
+            if (select.recursive && select.max_recursion > 0) {
+              FileSelector sub_select;
+              sub_select.base_dir = container.Name;
+              sub_select.allow_not_found = true;
+              sub_select.recursive = true;
+              sub_select.max_recursion = select.max_recursion - 1;
+              ARROW_RETURN_NOT_OK(GetFileInfoWithSelector(context, 
page_size_hint,
+                                                          sub_select, 
acc_results));
+            }
+            return Status::OK();
+          };
+      return VisitContainers(context, std::move(on_container));
+    }
+
+    auto container_client =
+        blob_service_client_->GetBlobContainerClient(base_location.container);
+    return GetFileInfoWithSelectorFromContainer(container_client, context, 
page_size_hint,
+                                                select, acc_results);
+  }
+
   Result<std::shared_ptr<ObjectInputFile>> OpenInputFile(const AzureLocation& 
location,
                                                          AzureFileSystem* fs) {
     RETURN_NOT_OK(ValidateFileLocation(location));
@@ -1196,7 +1399,12 @@ Result<FileInfo> AzureFileSystem::GetFileInfo(const 
std::string& path) {
 }
 
 Result<FileInfoVector> AzureFileSystem::GetFileInfo(const FileSelector& 
select) {
-  return Status::NotImplemented("The Azure FileSystem is not fully 
implemented");
+  Azure::Core::Context context;
+  Azure::Nullable<int32_t> page_size_hint;  // unspecified
+  FileInfoVector results;
+  RETURN_NOT_OK(
+      impl_->GetFileInfoWithSelector(context, page_size_hint, select, 
&results));
+  return {std::move(results)};
 }
 
 Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) {
diff --git a/cpp/src/arrow/filesystem/azurefs.h 
b/cpp/src/arrow/filesystem/azurefs.h
index 9f980ee8ba..b2865b059e 100644
--- a/cpp/src/arrow/filesystem/azurefs.h
+++ b/cpp/src/arrow/filesystem/azurefs.h
@@ -157,7 +157,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem {
       const AzureOptions& options, const io::IOContext& = 
io::default_io_context());
 
  private:
-  explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& 
io_context);
+  AzureFileSystem(const AzureOptions& options, const io::IOContext& 
io_context);
 
   class Impl;
   std::unique_ptr<Impl> impl_;
diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc 
b/cpp/src/arrow/filesystem/azurefs_test.cc
index 41f1663114..792c63b209 100644
--- a/cpp/src/arrow/filesystem/azurefs_test.cc
+++ b/cpp/src/arrow/filesystem/azurefs_test.cc
@@ -70,6 +70,9 @@ using ::testing::IsEmpty;
 using ::testing::Not;
 using ::testing::NotNull;
 
+namespace Blobs = Azure::Storage::Blobs;
+namespace Files = Azure::Storage::Files;
+
 auto const* kLoremIpsum = R"""(
 Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
 incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
@@ -193,9 +196,8 @@ TEST(AzureFileSystem, OptionsCompare) {
 class AzureFileSystemTest : public ::testing::Test {
  public:
   std::shared_ptr<FileSystem> fs_;
-  std::unique_ptr<Azure::Storage::Blobs::BlobServiceClient> 
blob_service_client_;
-  std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>
-      datalake_service_client_;
+  std::unique_ptr<Blobs::BlobServiceClient> blob_service_client_;
+  std::unique_ptr<Files::DataLake::DataLakeServiceClient> 
datalake_service_client_;
   AzureOptions options_;
   std::mt19937_64 generator_;
   std::string container_name_;
@@ -213,15 +215,14 @@ class AzureFileSystemTest : public ::testing::Test {
       suite_skipped_ = true;
       GTEST_SKIP() << options.status().message();
     }
-    container_name_ = RandomChars(32);
-    blob_service_client_ = 
std::make_unique<Azure::Storage::Blobs::BlobServiceClient>(
+    // Stop-gap solution before GH-39119 is fixed.
+    container_name_ = "z" + RandomChars(31);
+    blob_service_client_ = std::make_unique<Blobs::BlobServiceClient>(
         options_.account_blob_url, options_.storage_credentials_provider);
-    datalake_service_client_ =
-        
std::make_unique<Azure::Storage::Files::DataLake::DataLakeServiceClient>(
-            options_.account_dfs_url, options_.storage_credentials_provider);
+    datalake_service_client_ = 
std::make_unique<Files::DataLake::DataLakeServiceClient>(
+        options_.account_dfs_url, options_.storage_credentials_provider);
     ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_));
-    auto container_client = 
blob_service_client_->GetBlobContainerClient(container_name_);
-    container_client.CreateIfNotExists();
+    auto container_client = CreateContainer(container_name_);
 
     auto blob_client = 
container_client.GetBlockBlobClient(PreexistingObjectName());
     blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
@@ -239,6 +240,20 @@ class AzureFileSystemTest : public ::testing::Test {
     }
   }
 
+  Blobs::BlobContainerClient CreateContainer(const std::string& name) {
+    auto container_client = blob_service_client_->GetBlobContainerClient(name);
+    (void)container_client.CreateIfNotExists();
+    return container_client;
+  }
+
+  Blobs::BlobClient CreateBlob(Blobs::BlobContainerClient& container_client,
+                               const std::string& name, const std::string& 
data = "") {
+    auto blob_client = container_client.GetBlockBlobClient(name);
+    (void)blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(data.data()),
+                                 data.size());
+    return blob_client;
+  }
+
   std::string PreexistingContainerName() const { return container_name_; }
 
   std::string PreexistingContainerPath() const {
@@ -326,6 +341,45 @@ class AzureFileSystemTest : public ::testing::Test {
         top_blob_path,
     };
   }
+
+  char const* kSubData = "sub data";
+  char const* kSomeData = "some data";
+  char const* kOtherData = "other data";
+
+  void SetUpSmallFileSystemTree() {
+    // Set up test containers
+    CreateContainer("empty-container");
+    auto container = CreateContainer("container");
+
+    CreateBlob(container, "emptydir/");
+    CreateBlob(container, "somedir/subdir/subfile", kSubData);
+    CreateBlob(container, "somefile", kSomeData);
+    // Add an explicit marker for a non-empty directory.
+    CreateBlob(container, "otherdir/1/2/");
+    // otherdir/{1/,2/,3/} are implicitly assumed to exist because of
+    // the otherdir/1/2/3/otherfile blob.
+    CreateBlob(container, "otherdir/1/2/3/otherfile", kOtherData);
+  }
+
+  void AssertInfoAllContainersRecursive(const std::vector<FileInfo>& infos) {
+    ASSERT_EQ(infos.size(), 14);
+    AssertFileInfo(infos[0], "container", FileType::Directory);
+    AssertFileInfo(infos[1], "container/emptydir", FileType::Directory);
+    AssertFileInfo(infos[2], "container/otherdir", FileType::Directory);
+    AssertFileInfo(infos[3], "container/otherdir/1", FileType::Directory);
+    AssertFileInfo(infos[4], "container/otherdir/1/2", FileType::Directory);
+    AssertFileInfo(infos[5], "container/otherdir/1/2/3", FileType::Directory);
+    AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile", 
FileType::File,
+                   strlen(kOtherData));
+    AssertFileInfo(infos[7], "container/somedir", FileType::Directory);
+    AssertFileInfo(infos[8], "container/somedir/subdir", FileType::Directory);
+    AssertFileInfo(infos[9], "container/somedir/subdir/subfile", 
FileType::File,
+                   strlen(kSubData));
+    AssertFileInfo(infos[10], "container/somefile", FileType::File, 
strlen(kSomeData));
+    AssertFileInfo(infos[11], "empty-container", FileType::Directory);
+    AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory);
+    AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File);
+  }
 };
 
 class AzuriteFileSystemTest : public AzureFileSystemTest {
@@ -518,6 +572,180 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, 
GetFileInfoObject) {
   RunGetFileInfoObjectTest();
 }
 
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) {
+  SetUpSmallFileSystemTree();
+
+  FileSelector select;
+  std::vector<FileInfo> infos;
+
+  // Root dir
+  select.base_dir = "";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 3);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertFileInfo(infos[0], "container", FileType::Directory);
+  AssertFileInfo(infos[1], "empty-container", FileType::Directory);
+  AssertFileInfo(infos[2], container_name_, FileType::Directory);
+
+  // Empty container
+  select.base_dir = "empty-container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  // Nonexistent container
+  select.base_dir = "nonexistent-container";
+  ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+  select.allow_not_found = true;
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  select.allow_not_found = false;
+  // Non-empty container
+  select.base_dir = "container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 4);
+  AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+  AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+  AssertFileInfo(infos[2], "container/somedir", FileType::Directory);
+  AssertFileInfo(infos[3], "container/somefile", FileType::File, 9);
+
+  // Empty "directory"
+  select.base_dir = "container/emptydir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  // Non-empty "directories"
+  select.base_dir = "container/somedir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+  select.base_dir = "container/somedir/subdir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 
8);
+  // Nonexistent
+  select.base_dir = "container/nonexistent";
+  ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+  select.allow_not_found = true;
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  select.allow_not_found = false;
+
+  // Trailing slashes
+  select.base_dir = "empty-container/";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  select.base_dir = "nonexistent-container/";
+  ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+  select.base_dir = "container/";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 4);
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) {
+  SetUpSmallFileSystemTree();
+
+  FileSelector select;
+  select.recursive = true;
+
+  std::vector<FileInfo> infos;
+  // Root dir
+  select.base_dir = "";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 14);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertInfoAllContainersRecursive(infos);
+
+  // Empty container
+  select.base_dir = "empty-container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  // Non-empty container
+  select.base_dir = "container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 10);
+  AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+  AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+  AssertFileInfo(infos[2], "container/otherdir/1", FileType::Directory);
+  AssertFileInfo(infos[3], "container/otherdir/1/2", FileType::Directory);
+  AssertFileInfo(infos[4], "container/otherdir/1/2/3", FileType::Directory);
+  AssertFileInfo(infos[5], "container/otherdir/1/2/3/otherfile", 
FileType::File, 10);
+  AssertFileInfo(infos[6], "container/somedir", FileType::Directory);
+  AssertFileInfo(infos[7], "container/somedir/subdir", FileType::Directory);
+  AssertFileInfo(infos[8], "container/somedir/subdir/subfile", FileType::File, 
8);
+  AssertFileInfo(infos[9], "container/somefile", FileType::File, 9);
+
+  // Empty "directory"
+  select.base_dir = "container/emptydir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  // Non-empty "directories"
+  select.base_dir = "container/somedir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 2);
+  AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+  AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 
8);
+
+  select.base_dir = "container/otherdir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 4);
+  AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory);
+  AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory);
+  AssertFileInfo(infos[2], "container/otherdir/1/2/3", FileType::Directory);
+  AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", 
FileType::File, 10);
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) {
+  {
+    auto container = CreateContainer("container");
+    CreateBlob(container, "mydir/emptydir1/");
+    CreateBlob(container, "mydir/emptydir2/");
+    CreateBlob(container, "mydir/nonemptydir1/");  // explicit dir marker
+    CreateBlob(container, "mydir/nonemptydir1/somefile", kSomeData);
+    CreateBlob(container, "mydir/nonemptydir2/somefile", kSomeData);
+  }
+  std::vector<FileInfo> infos;
+
+  FileSelector select;  // non-recursive
+  select.base_dir = "container";
+
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertFileInfo(infos[0], "container/mydir", FileType::Directory);
+
+  select.base_dir = "container/mydir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 4);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory);
+  AssertFileInfo(infos[1], "container/mydir/emptydir2", FileType::Directory);
+  AssertFileInfo(infos[2], "container/mydir/nonemptydir1", 
FileType::Directory);
+  AssertFileInfo(infos[3], "container/mydir/nonemptydir2", 
FileType::Directory);
+
+  select.base_dir = "container/mydir/emptydir1";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  select.base_dir = "container/mydir/emptydir2";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  select.base_dir = "container/mydir/nonemptydir1";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", 
FileType::File);
+
+  select.base_dir = "container/mydir/nonemptydir2";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", 
FileType::File);
+}
+
 TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) {
   ASSERT_RAISES(Invalid, fs_->CreateDir("", false));
 }
diff --git a/cpp/src/arrow/filesystem/filesystem.cc 
b/cpp/src/arrow/filesystem/filesystem.cc
index 9ecc4610f3..810e9c179b 100644
--- a/cpp/src/arrow/filesystem/filesystem.cc
+++ b/cpp/src/arrow/filesystem/filesystem.cc
@@ -654,8 +654,7 @@ Status CopyFiles(const std::shared_ptr<FileSystem>& 
source_fs,
                              "', which is outside base dir '", 
source_sel.base_dir, "'");
     }
 
-    auto destination_path =
-        internal::ConcatAbstractPath(destination_base_dir, 
std::string(*relative));
+    auto destination_path = internal::ConcatAbstractPath(destination_base_dir, 
*relative);
 
     if (source_info.IsDirectory()) {
       dirs.push_back(destination_path);
diff --git a/cpp/src/arrow/filesystem/path_util.cc 
b/cpp/src/arrow/filesystem/path_util.cc
index 46ea436a9f..9c895ae76c 100644
--- a/cpp/src/arrow/filesystem/path_util.cc
+++ b/cpp/src/arrow/filesystem/path_util.cc
@@ -52,7 +52,7 @@ std::vector<std::string> SplitAbstractPath(const std::string& 
path, char sep) {
   }
 
   auto append_part = [&parts, &v](size_t start, size_t end) {
-    parts.push_back(std::string(v.substr(start, end - start)));
+    parts.emplace_back(v.substr(start, end - start));
   };
 
   size_t start = 0;
@@ -72,15 +72,12 @@ std::string SliceAbstractPath(const std::string& s, int 
offset, int length, char
     return "";
   }
   std::vector<std::string> components = SplitAbstractPath(s, sep);
-  std::stringstream combined;
   if (offset >= static_cast<int>(components.size())) {
     return "";
   }
-  int end = offset + length;
-  if (end > static_cast<int>(components.size())) {
-    end = static_cast<int>(components.size());
-  }
-  for (int i = offset; i < end; i++) {
+  const auto end = std::min(static_cast<size_t>(offset) + length, 
components.size());
+  std::stringstream combined;
+  for (auto i = static_cast<size_t>(offset); i < end; i++) {
     combined << components[i];
     if (i < end - 1) {
       combined << sep;
@@ -140,16 +137,20 @@ Status ValidateAbstractPathParts(const 
std::vector<std::string>& parts) {
   return Status::OK();
 }
 
-std::string ConcatAbstractPath(const std::string& base, const std::string& 
stem) {
+std::string ConcatAbstractPath(std::string_view base, std::string_view stem) {
   DCHECK(!stem.empty());
   if (base.empty()) {
-    return stem;
+    return std::string{stem};
   }
-  return EnsureTrailingSlash(base) + std::string(RemoveLeadingSlash(stem));
+  std::string result;
+  result.reserve(base.length() + stem.length() + 1);  // extra 1 is for 
potential kSep
+  result += EnsureTrailingSlash(base);
+  result += RemoveLeadingSlash(stem);
+  return result;
 }
 
 std::string EnsureTrailingSlash(std::string_view v) {
-  if (v.length() > 0 && v.back() != kSep) {
+  if (!v.empty() && !HasTrailingSlash(v)) {
     // XXX How about "C:" on Windows?  We probably don't want to turn it into 
"C:/"...
     // Unless the local filesystem always uses absolute paths
     return std::string(v) + kSep;
@@ -159,7 +160,7 @@ std::string EnsureTrailingSlash(std::string_view v) {
 }
 
 std::string EnsureLeadingSlash(std::string_view v) {
-  if (v.length() == 0 || v.front() != kSep) {
+  if (!HasLeadingSlash(v)) {
     // XXX How about "C:" on Windows?  We probably don't want to turn it into 
"/C:"...
     return kSep + std::string(v);
   } else {
@@ -197,10 +198,6 @@ Status AssertNoTrailingSlash(std::string_view key) {
   return Status::OK();
 }
 
-bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; }
-
-bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; }
-
 Result<std::string> MakeAbstractPathRelative(const std::string& base,
                                              const std::string& path) {
   if (base.empty() || base.front() != kSep) {
@@ -383,7 +380,7 @@ struct Globber::Impl {
 
 Globber::Globber(std::string pattern) : impl_(new Impl(pattern)) {}
 
-Globber::~Globber() {}
+Globber::~Globber() = default;
 
 bool Globber::Matches(const std::string& path) {
   return regex_match(path, impl_->pattern_);
diff --git a/cpp/src/arrow/filesystem/path_util.h 
b/cpp/src/arrow/filesystem/path_util.h
index 2c8c123e77..1da7afd3f9 100644
--- a/cpp/src/arrow/filesystem/path_util.h
+++ b/cpp/src/arrow/filesystem/path_util.h
@@ -69,7 +69,7 @@ Status ValidateAbstractPathParts(const 
std::vector<std::string>& parts);
 
 // Append a non-empty stem to an abstract path.
 ARROW_EXPORT
-std::string ConcatAbstractPath(const std::string& base, const std::string& 
stem);
+std::string ConcatAbstractPath(std::string_view base, std::string_view stem);
 
 // Make path relative to base, if it starts with base.  Otherwise error out.
 ARROW_EXPORT
@@ -94,11 +94,13 @@ std::string_view RemoveTrailingSlash(std::string_view s, 
bool preserve_root = fa
 ARROW_EXPORT
 Status AssertNoTrailingSlash(std::string_view s);
 
-ARROW_EXPORT
-bool HasTrailingSlash(std::string_view s);
+inline bool HasTrailingSlash(std::string_view s) {
+  return !s.empty() && s.back() == kSep;
+}
 
-ARROW_EXPORT
-bool HasLeadingSlash(std::string_view s);
+inline bool HasLeadingSlash(std::string_view s) {
+  return !s.empty() && s.front() == kSep;
+}
 
 ARROW_EXPORT
 bool IsAncestorOf(std::string_view ancestor, std::string_view descendant);
diff --git a/cpp/src/arrow/filesystem/test_util.cc 
b/cpp/src/arrow/filesystem/test_util.cc
index 6c5dda8e65..040917dcd2 100644
--- a/cpp/src/arrow/filesystem/test_util.cc
+++ b/cpp/src/arrow/filesystem/test_util.cc
@@ -126,6 +126,12 @@ void SortInfos(std::vector<FileInfo>* infos) {
   std::sort(infos->begin(), infos->end(), FileInfo::ByPath{});
 }
 
+std::vector<FileInfo> SortedInfos(const std::vector<FileInfo>& infos) {
+  auto sorted = infos;
+  SortInfos(&sorted);
+  return sorted;
+}
+
 void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* 
out_infos) {
   auto fut = CollectAsyncGenerator(gen);
   ASSERT_FINISHES_OK_AND_ASSIGN(auto nested_infos, fut);
diff --git a/cpp/src/arrow/filesystem/test_util.h 
b/cpp/src/arrow/filesystem/test_util.h
index c4d846fd31..62b488e159 100644
--- a/cpp/src/arrow/filesystem/test_util.h
+++ b/cpp/src/arrow/filesystem/test_util.h
@@ -74,6 +74,10 @@ void CreateFile(FileSystem* fs, const std::string& path, 
const std::string& data
 ARROW_TESTING_EXPORT
 void SortInfos(FileInfoVector* infos);
 
+// Create a copy of a FileInfo vector sorted by lexicographic path order
+ARROW_TESTING_EXPORT
+FileInfoVector SortedInfos(const FileInfoVector& infos);
+
 ARROW_TESTING_EXPORT
 void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* 
out_infos);
 

Reply via email to