pitrou commented on code in PR #14071:
URL: https://github.com/apache/arrow/pull/14071#discussion_r976132329
##########
cpp/src/arrow/util/uri_test.cc:
##########
@@ -267,6 +268,7 @@ TEST(Uri, FileScheme) {
auto check_with_host = [&](std::string uri_string, std::string host,
std::string path) -> void {
ASSERT_OK(uri.Parse(uri_string));
+ ASSERT_TRUE(uri.is_file_scheme());
Review Comment:
For completeness, also add a test where this returns false?
##########
python/pyarrow/tests/test_substrait.py:
##########
@@ -40,9 +39,6 @@ def _write_dummy_data_to_disk(tmpdir, file_name, table):
return path
[email protected](sys.platform == 'win32',
- reason="ARROW-16392: file based URI is" +
- " not fully supported for Windows")
Review Comment:
Neat!
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -94,6 +94,20 @@ Status CheckRelCommon(const RelMessage& rel) {
return Status::OK();
}
+// Other helper functions
+Status DiscoverFilesFromDir(std::shared_ptr<fs::LocalFileSystem>& local_fs,
+ std::string dirpath, std::vector<fs::FileInfo>&
rel_fpaths) {
Review Comment:
As per the coding conventions, we want to avoid non-const references.
Immutable args should be passed by value or const-ref.
Mutable args should be passed by pointer.
```suggestion
Status DiscoverFilesFromDir(const std::shared_ptr<fs::LocalFileSystem>&
local_fs,
const std::string& dirpath,
std::vector<fs::FileInfo> *rel_fpaths) {
```
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -210,43 +197,93 @@ Result<DeclarationInfo> FromProto(const substrait::Rel&
rel, const ExtensionSet&
"non-default
substrait::ReadRel::LocalFiles::FileOrFiles::length");
}
- path = path.substr(7);
+ // Extract and parse the read relation's source URI
+ ::arrow::internal::Uri item_uri;
switch (item.path_type_case()) {
- case substrait::ReadRel::LocalFiles::FileOrFiles::kUriPath: {
- ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo(path));
- if (file.type() == fs::FileType::File) {
- files.push_back(std::move(file));
- } else if (file.type() == fs::FileType::Directory) {
- fs::FileSelector selector;
- selector.base_dir = path;
- selector.recursive = true;
- ARROW_ASSIGN_OR_RAISE(auto discovered_files,
- filesystem->GetFileInfo(selector));
- std::move(files.begin(), files.end(),
std::back_inserter(discovered_files));
- }
+ case substrait::ReadRel::LocalFiles::FileOrFiles::kUriPath:
+ RETURN_NOT_OK(item_uri.Parse(item.uri_path()));
break;
- }
+
+ case substrait::ReadRel::LocalFiles::FileOrFiles::kUriFile:
+ RETURN_NOT_OK(item_uri.Parse(item.uri_file()));
+ break;
+
+ case substrait::ReadRel::LocalFiles::FileOrFiles::kUriFolder:
+ RETURN_NOT_OK(item_uri.Parse(item.uri_folder()));
+ break;
+
+ default:
+ RETURN_NOT_OK(item_uri.Parse(item.uri_path_glob()));
+ break;
+ }
+
+ // Validate the URI before processing
+ if (!item_uri.is_file_scheme()) {
+ return Status::NotImplemented("substrait::ReadRel::LocalFiles item
(",
+ item_uri.ToString(),
+ ") with non-filesystem scheme
(file:///)");
+ }
+
+ if (item_uri.port() != -1) {
+ return Status::NotImplemented("substrait::ReadRel::LocalFiles item
(",
+ item_uri.ToString(),
+ ") should not have a port number in
path");
+ }
+
+ if (!item_uri.query_string().empty()) {
+ return Status::NotImplemented("substrait::ReadRel::LocalFiles item
(",
+ item_uri.ToString(),
+ ") should not have a query string in
path");
+ }
+
+ switch (item.file_format_case()) {
+ case substrait::ReadRel::LocalFiles::FileOrFiles::kParquet:
+ format = std::make_shared<dataset::ParquetFileFormat>();
+ break;
+ case substrait::ReadRel::LocalFiles::FileOrFiles::kArrow:
+ format = std::make_shared<dataset::IpcFileFormat>();
+ break;
+ default:
+ // TODO: maybe check for ".feather" or ".arrows"?
+ return Status::NotImplemented(
+ "unsupported file format ",
+ "(see
substrait::ReadRel::LocalFiles::FileOrFiles::file_format)");
+ }
+
+ // Handle the URI as appropriate
+ switch (item.path_type_case()) {
case substrait::ReadRel::LocalFiles::FileOrFiles::kUriFile: {
- files.emplace_back(path, fs::FileType::File);
+ files.emplace_back(item_uri.path(), fs::FileType::File);
break;
}
+
case substrait::ReadRel::LocalFiles::FileOrFiles::kUriFolder: {
- fs::FileSelector selector;
- selector.base_dir = path;
- selector.recursive = true;
- ARROW_ASSIGN_OR_RAISE(auto discovered_files,
- filesystem->GetFileInfo(selector));
- std::move(discovered_files.begin(), discovered_files.end(),
- std::back_inserter(files));
+ RETURN_NOT_OK(DiscoverFilesFromDir(filesystem, item_uri.path(),
files));
+ break;
+ }
+
+ case substrait::ReadRel::LocalFiles::FileOrFiles::kUriPath: {
+ // Let the filesystem API decide for us if the URI is a file or a
directory
+ ARROW_ASSIGN_OR_RAISE(auto file_info,
+ filesystem->GetFileInfo(item_uri.path()));
+
+ // push the FileInfo if it's a file; else, recurse into the
directory
+ if (file_info.type() == fs::FileType::File) {
+ files.push_back(std::move(file_info));
+ } else if (file_info.type() == fs::FileType::Directory) {
+ RETURN_NOT_OK(DiscoverFilesFromDir(filesystem, item_uri.path(),
files));
+ }
Review Comment:
Should probably also handle the `FileInfo::Unknown` and `FileInfo::NotFound`
cases (by erroring out perhaps?).
`FileInfo::Unknown` can be got for non-regular files such as pipes, sockets
etc.
`FileInfo::NotFound` you probably understand :-)
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -94,6 +94,20 @@ Status CheckRelCommon(const RelMessage& rel) {
return Status::OK();
}
+// Other helper functions
+Status DiscoverFilesFromDir(std::shared_ptr<fs::LocalFileSystem>& local_fs,
+ std::string dirpath, std::vector<fs::FileInfo>&
rel_fpaths) {
+ // Define a selector for a recursive descent
+ fs::FileSelector selector;
+ selector.base_dir = dirpath;
+ selector.recursive = true;
+
+ ARROW_ASSIGN_OR_RAISE(auto file_infos, local_fs->GetFileInfo(selector));
+ std::move(file_infos.begin(), file_infos.end(),
std::back_inserter(rel_fpaths));
Review Comment:
This will also give you subdirectories, is that what you want or should you
filter them out?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]