pitrou commented on code in PR #12625:
URL: https://github.com/apache/arrow/pull/12625#discussion_r864515004


##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,52 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& 
filesystem,
+                                        const std::string& glob) {
+  fs::FileInfoVector results, temp;
+  fs::FileSelector selector;
+  std::string cur;
+
+  auto split_path = fs::internal::SplitAbstractPath(glob, '/');
+  ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo("/"));
+  results.push_back(std::move(file));
+
+  for (size_t i = 0; i < split_path.size(); i++) {
+    if (split_path[i].find_first_of("*?") == std::string::npos) {
+      if (cur.empty())
+        cur = split_path[i];
+      else
+        cur = fs::internal::ConcatAbstractPath(cur, split_path[i]);
+      continue;
+    } else {
+      for (auto res : results) {
+        if (res.type() != fs::FileType::Directory) continue;
+        selector.base_dir = res.path() + cur;
+        ARROW_ASSIGN_OR_RAISE(auto entries, filesystem->GetFileInfo(selector));
+        fs::internal::Globber globber(
+            fs::internal::ConcatAbstractPath(selector.base_dir, 
split_path[i]));
+        for (auto entry : entries) {
+          if (globber.Matches(entry.path())) {
+            temp.push_back(std::move(entry));
+          }
+        }
+      }
+      results = std::move(temp);
+      cur.clear();
+    }
+  }
+
+  if (!cur.empty()) {
+    for (size_t i = 0; i < results.size(); i++) {
+      ARROW_ASSIGN_OR_RAISE(
+          results[i], filesystem->GetFileInfo(
+                          fs::internal::ConcatAbstractPath(results[i].path(), 
cur)));

Review Comment:
   Yes, we should: rather than issuing all the `GetFileInfo` calls serially, 
it may be able to issue them in parallel.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to