rambleraptor commented on code in PR #3241: URL: https://github.com/apache/iceberg-python/pull/3241#discussion_r3260987656
########## pyiceberg/catalog/rest/__init__.py: ########## @@ -387,6 +387,11 @@ class ListViewsResponse(IcebergBaseModel): _PLANNING_RESPONSE_ADAPTER = TypeAdapter(PlanningResponse) +def _is_hadoop_only_config(config: Properties) -> bool: Review Comment: Config is a dict[str, str] according to the [OpenAPI doc](https://github.com/apache/iceberg/blob/main/open-api/rest-catalog-open-api.yaml#L3489-L3492) ########## pyiceberg/catalog/rest/__init__.py: ########## @@ -453,22 +458,32 @@ def _create_session(self) -> Session: @staticmethod def _resolve_storage_credentials(storage_credentials: list[StorageCredential], location: str | None) -> Properties: - """Resolve the best-matching storage credential by longest prefix match. + """Pick the longest-prefix storage credential for ``location``. - Mirrors the Java implementation in S3FileIO.clientForStoragePath() which iterates - over storage credential prefixes and selects the one with the longest match. + Mirrors Java ``S3FileIO.clientForStoragePath``. Hadoop-only (``fs.*``) + credentials are filtered out since pyiceberg has no HadoopFileIO to + consume them — otherwise a catalog vending both ``fs.*`` and ``s3.*`` + bundles per location could strand the FileIO with unusable keys. See: https://github.com/apache/iceberg/blob/main/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java """ if not storage_credentials or not location: return {} + consumable = [c for c in storage_credentials if not _is_hadoop_only_config(c.config)] + best_match: StorageCredential | None = None - for cred in storage_credentials: + for cred in consumable: if location.startswith(cred.prefix): if best_match is None or len(cred.prefix) > len(best_match.prefix): best_match = cred + # Java S3FileIO falls back to the "s3" ROOT_PREFIX credential; scope it to + # schemes pyarrow's S3FileSystem handles so non-S3 schemes (gs://, abfs://, + # etc.) don't get handed s3.* keys. 
+ if best_match is None and location.startswith(("s3://", "s3a://", "s3n://", "oss://")): Review Comment: I understand that we want s3 prefixed credentials to get mapped to s3a + s3n. What's oss here? ########## pyiceberg/catalog/rest/__init__.py: ########## @@ -453,22 +458,32 @@ def _create_session(self) -> Session: @staticmethod def _resolve_storage_credentials(storage_credentials: list[StorageCredential], location: str | None) -> Properties: - """Resolve the best-matching storage credential by longest prefix match. + """Pick the longest-prefix storage credential for ``location``. - Mirrors the Java implementation in S3FileIO.clientForStoragePath() which iterates - over storage credential prefixes and selects the one with the longest match. + Mirrors Java ``S3FileIO.clientForStoragePath``. Hadoop-only (``fs.*``) + credentials are filtered out since pyiceberg has no HadoopFileIO to + consume them — otherwise a catalog vending both ``fs.*`` and ``s3.*`` + bundles per location could strand the FileIO with unusable keys. See: https://github.com/apache/iceberg/blob/main/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java """ if not storage_credentials or not location: return {} + consumable = [c for c in storage_credentials if not _is_hadoop_only_config(c.config)] + best_match: StorageCredential | None = None - for cred in storage_credentials: + for cred in consumable: if location.startswith(cred.prefix): if best_match is None or len(cred.prefix) > len(best_match.prefix): best_match = cred + # Java S3FileIO falls back to the "s3" ROOT_PREFIX credential; scope it to + # schemes pyarrow's S3FileSystem handles so non-S3 schemes (gs://, abfs://, + # etc.) don't get handed s3.* keys. + if best_match is None and location.startswith(("s3://", "s3a://", "s3n://", "oss://")): Review Comment: Wouldn't `s3://` get caught by line 477 already? -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For additional commands, e-mail: issues-help@iceberg.apache.org
