Fokko commented on code in PR #2291:
URL: https://github.com/apache/iceberg-python/pull/2291#discussion_r2286101986


##########
pyiceberg/io/pyarrow.py:
##########
@@ -381,21 +381,38 @@ def to_input_file(self) -> PyArrowFile:
 
 class PyArrowFileIO(FileIO):
     fs_by_scheme: Callable[[str, Optional[str]], FileSystem]
+    config: Config
 
     def __init__(self, properties: Properties = EMPTY_DICT):
         self.fs_by_scheme: Callable[[str, Optional[str]], FileSystem] = lru_cache(self._initialize_fs)
+        self.config = Config()
         super().__init__(properties=properties)
 
     @staticmethod
-    def parse_location(location: str) -> Tuple[str, str, str]:
-        """Return the path without the scheme."""
+    def parse_location(location: str, config: Config) -> Tuple[str, str, str]:
+        """Return (scheme, netloc, path) for the given location.
+
+        Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC
+        if scheme/netloc are missing.
+        """
         uri = urlparse(location)
-        if not uri.scheme:
-            return "file", uri.netloc, os.path.abspath(location)
-        elif uri.scheme in ("hdfs", "viewfs"):
-            return uri.scheme, uri.netloc, uri.path
+
+        # Load defaults from environment
+        default_scheme = config.get_str("default-scheme") or "file"
+        default_netloc = config.get_str("default-netloc") or ""
+

Review Comment:
   Thanks for sticking with us here, @mccormickt12. I think there is some
   miscommunication.
   
   How are you opening up the catalog? `load_catalog` is the recommended way of 
doing this: https://py.iceberg.apache.org/api/
   
   Let's consider the following `parse_location`:
   
   ```python
       def parse_location(self, location: str) -> Tuple[str, str, str]:
           """Return (scheme, netloc, path) for the given location.
   
           Falls back to the `hdfs.default-scheme` and `hdfs.default-netloc`
           properties if scheme/netloc are missing from the location.
           """
           uri = urlparse(location)
   
           # Apply logic
           scheme = uri.scheme or self.properties.get("hdfs.default-scheme")
           netloc = uri.netloc or self.properties.get("hdfs.default-netloc")
   
           if scheme in ("hdfs", "viewfs"):
               return scheme, netloc, uri.path
           else:
               # For non-HDFS URIs, include netloc in the path if present
               path = uri.path if uri.scheme else os.path.abspath(location)
               if netloc and not path.startswith(netloc):
                   path = f"{netloc}{path}"
               return scheme, netloc, path
   ```
   
   You can inject the `properties` through:
   
   ```python
   load_catalog('default', **{
       'hdfs.default-scheme': 'hdfs',
       'hdfs.default-netloc': 'ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000',
   })
   ```
   
   If you use `load_catalog`, it will also pick up the configuration and the 
environment variables:
   
   ```yaml
   catalog:
     default:
       hdfs.default-scheme: hdfs
       hdfs.default-netloc: ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000
   ```
   
   ```sh
   export PYICEBERG_CATALOG__DEFAULT__HDFS__DEFAULT_SCHEME=hdfs
   ```
   
   Or use the `FileIO` directly:
   
   ```python
   PyArrowFileIO(properties={
       'hdfs.default-scheme': 'hdfs',
       'hdfs.default-netloc': 'ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000',
   })
   ```
   
   What do you think? Does this align with the way you're using PyIceberg?
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to