This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 8d79664d Support viewfs scheme alongside hdfs (#777)
8d79664d is described below

commit 8d79664d3a6010a92468bfbee1a55283591d7800
Author: Yothin M <[email protected]>
AuthorDate: Fri May 31 15:00:53 2024 +0700

    Support viewfs scheme alongside hdfs (#777)
---
 pyiceberg/io/__init__.py | 1 +
 pyiceberg/io/pyarrow.py  | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py
index 1a78f306..9143cf66 100644
--- a/pyiceberg/io/__init__.py
+++ b/pyiceberg/io/__init__.py
@@ -284,6 +284,7 @@ SCHEMA_TO_FILE_IO: Dict[str, List[str]] = {
     "gs": [ARROW_FILE_IO],
     "file": [ARROW_FILE_IO, FSSPEC_FILE_IO],
     "hdfs": [ARROW_FILE_IO],
+    "viewfs": [ARROW_FILE_IO],
     "abfs": [FSSPEC_FILE_IO],
     "abfss": [FSSPEC_FILE_IO],
 }
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 9216c37f..04f30ec6 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -332,7 +332,7 @@ class PyArrowFileIO(FileIO):
         uri = urlparse(location)
         if not uri.scheme:
             return "file", uri.netloc, os.path.abspath(location)
-        elif uri.scheme == "hdfs":
+        elif uri.scheme in ("hdfs", "viewfs"):
             return uri.scheme, uri.netloc, uri.path
         else:
             return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}"
@@ -356,12 +356,12 @@ class PyArrowFileIO(FileIO):
                 client_kwargs["connect_timeout"] = float(connect_timeout)
 
             return S3FileSystem(**client_kwargs)
-        elif scheme == "hdfs":
+        elif scheme in ("hdfs", "viewfs"):
             from pyarrow.fs import HadoopFileSystem
 
             hdfs_kwargs: Dict[str, Any] = {}
             if netloc:
-                return HadoopFileSystem.from_uri(f"hdfs://{netloc}")
+                return HadoopFileSystem.from_uri(f"{scheme}://{netloc}")
             if host := self.properties.get(HDFS_HOST):
                 hdfs_kwargs["host"] = host
             if port := self.properties.get(HDFS_PORT):

Reply via email to