emkornfield commented on code in PR #12763:
URL: https://github.com/apache/arrow/pull/12763#discussion_r852615786


##########
cpp/src/arrow/filesystem/gcsfs.cc:
##########
@@ -318,19 +326,43 @@ class GcsFileSystem::Impl {
       return GetFileInfoBucket(path, std::move(meta).status());
     }
     auto meta = client_.GetObjectMetadata(path.bucket, path.object);
-    return GetFileInfoObject(path, meta);
+    Result<FileInfo> info = GetFileInfoObject(path, meta);
+    if (!info.ok() || info->type() != FileType::NotFound) {
+      return info;
+    }
+    // Not found case.  It could be this was written to GCS with a different
+    // "Directory" convention.  So it if there are is at least one objec that
+    // matches the prefix we assume it is a directory.
+    std::string canonical = internal::EnsureTrailingSlash(path.object);
+    std::string end = canonical;
+    end.back() += 1;
+    auto list_result =
+        client_.ListObjects(path.bucket, gcs::Prefix(canonical), 
gcs::EndOffset(end));
+    if (list_result.begin() != list_result.end()) {
+      // If there is at least one result it indicates this is a directory (at
+      // least one object exists that starts with "path/"
+      return FileInfo(path.full_path, FileType::Directory);
+    }
+    // Return the original not-found info if there was no match.
+    return info;
   }
 
   Result<FileInfoVector> GetFileInfo(const FileSelector& select) {
     ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir));
-    // Adding the trailing '/' avoids problems with files named 'a', 'ab', 
'ac'  where GCS
-    // would return all of them if the prefix is 'a'.
+    // Adding the trailing '/' avoids problems with files named 'a', 'ab', 
'ac'  where
+    // GCS would return all of them if the prefix is 'a'.
     const auto canonical = internal::EnsureTrailingSlash(p.object);
-    const auto max_depth = internal::Depth(canonical) + select.max_recursion;
+    // Need to add one level when the object is not empty because all
+    // directories have an extra slash.
+    const auto max_depth =
+        internal::Depth(p.object) + select.max_recursion + !p.object.empty();
     auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical);
     auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/");
+    auto include_trailing = select.recursive ? 
gcs::IncludeTrailingDelimiter(false)
+                                             : 
gcs::IncludeTrailingDelimiter(true);

Review Comment:
   I added a TODO that I hope to resolve before submission.  there were a bunch 
of inconsistencies for some directory edge cases between python tests and C++ 
tests (at least slightly different behavior is tested with slightly different 
expected returns).



##########
cpp/src/arrow/filesystem/gcsfs.cc:
##########
@@ -340,11 +372,13 @@ class GcsFileSystem::Impl {
       }
       // Skip the directory itself from the results, and any result that is 
"too deep"
       // into the recursion.
-      if (o->name() == p.object || internal::Depth(o->name()) > max_depth) {
+      bool has_trailing_slash = !o->name().empty() && o->name().back() == '/';
+      if (o->name() == canonical || o->name() == p.object ||

Review Comment:
   part of TODO items for this PR>



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to