FANNG1 commented on code in PR #5997:
URL: https://github.com/apache/gravitino/pull/5997#discussion_r1905030532


##########
clients/client-python/gravitino/filesystem/gvfs.py:
##########
@@ -918,11 +992,41 @@ def _get_gcs_filesystem(self):
             raise GravitinoRuntimeException(
                 "Service account key is not found in the options."
             )
-        return importlib.import_module("gcsfs").GCSFileSystem(
-            token=service_account_key_path
+        return (
+            TIME_WITHOUT_EXPIRATION,
+            importlib.import_module("gcsfs").GCSFileSystem(
+                token=service_account_key_path
+            ),
         )
 
-    def _get_s3_filesystem(self):
+    def _get_s3_filesystem(self, fileset_catalog: Catalog, identifier: 
NameIdentifier):
+        try:
+            fileset: GenericFileset = 
fileset_catalog.as_fileset_catalog().load_fileset(
+                NameIdentifier.of(identifier.namespace().level(2), 
identifier.name())
+            )
+            credentials = fileset.support_credentials().get_credentials()
+        except (NoSuchCredentialException, CatalogNotInUseException) as e:
+            logger.warning("Failed to get credentials from fileset: %s", e)

Review Comment:
   How about throwing an exception here?



##########
clients/client-python/gravitino/filesystem/gvfs.py:
##########
@@ -1001,10 +1168,61 @@ def _get_abs_filesystem(self):
                 "ABS account key is not found in the options."
             )
 
-        return importlib.import_module("adlfs").AzureBlobFileSystem(
-            account_name=abs_account_name,
-            account_key=abs_account_key,
+        return (
+            TIME_WITHOUT_EXPIRATION,
+            importlib.import_module("adlfs").AzureBlobFileSystem(
+                account_name=abs_account_name,
+                account_key=abs_account_key,
+            ),
         )
 
+    def _get_most_suitable_s3_credential(self, credentials: List[Credential]):
+        for credential in credentials:
+            # Prefer to use the token credential, if it does not exist, use the
+            # secret key credential.
+            if isinstance(credential, S3TokenCredential):
+                return credential
+
+        for credential in credentials:
+            if isinstance(credential, S3SecretKeyCredential):
+                return credential
+        return None
+
+    def _get_most_suitable_oss_credential(self, credentials: List[Credential]):
+        for credential in credentials:
+            # Prefer to use the token credential, if it does not exist, use the
+            # secret key credential.
+            if isinstance(credential, OSSTokenCredential):
+                return credential
+
+        for credential in credentials:
+            if isinstance(credential, OSSSecretKeyCredential):
+                return credential
+        return None
+
+    def _get_most_suitable_gcs_credential(self, credentials: List[Credential]):
+        for credential in credentials:
+            # Prefer to use the token credential, if it does not exist, return 
None.
+            if isinstance(credential, GCSTokenCredential):
+                return credential
+        return None
+
+    def _get_most_suitable_abs_credential(self, credentials: List[Credential]):
+        for credential in credentials:
+            # Prefer to use the token credential, if it does not exist, use the
+            # account key credential
+            if isinstance(credential, ADLSTokenCredential):
+                return credential
+
+        for credential in credentials:
+            if isinstance(credential, AzureAccountKeyCredential):
+                return credential
+        return None
+
+    def _get_expire_time_by_ratio(self, expire_time: int):
+        if expire_time <= 0:
+            return TIME_WITHOUT_EXPIRATION
+        return time.time() * 1000 + (expire_time - time.time() * 1000) * 0.9

Review Comment:
   `0.9` seems too high, since the filesystem is used outside the control of 
GVFS. How about making it configurable, with a default of `0.5`?



##########
clients/client-python/gravitino/filesystem/gvfs.py:
##########
@@ -866,50 +896,94 @@ def _get_fileset_catalog(self, catalog_ident: 
NameIdentifier):
         finally:
             write_lock.release()
 
-    def _get_filesystem(self, actual_file_location: str):
+    def _file_system_is_not_expired(self, expire_time: int):

Review Comment:
   Is `_file_system_expired` clearer?



##########
clients/client-python/gravitino/filesystem/gvfs.py:
##########
@@ -946,13 +1048,54 @@ def _get_s3_filesystem(self):
                 "AWS endpoint url is not found in the options."
             )
 
-        return importlib.import_module("s3fs").S3FileSystem(
-            key=aws_access_key_id,
-            secret=aws_secret_access_key,
-            endpoint_url=aws_endpoint_url,
+        return (
+            sys.maxsize,
+            importlib.import_module("s3fs").S3FileSystem(
+                key=aws_access_key_id,
+                secret=aws_secret_access_key,
+                endpoint_url=aws_endpoint_url,

Review Comment:
   Why doesn't the following code need `aws_endpoint_url`?
   ```
              if isinstance(credential, S3SecretKeyCredential):
                   fs = importlib.import_module("s3fs").S3FileSystem(
                       key=credential.access_key_id(),
                       secret=credential.secret_access_key(),
                   )
                   return (expire_time, fs)
   ```



##########
clients/client-python/gravitino/filesystem/gvfs.py:
##########
@@ -946,13 +1048,54 @@ def _get_s3_filesystem(self):
                 "AWS endpoint url is not found in the options."
             )
 
-        return importlib.import_module("s3fs").S3FileSystem(
-            key=aws_access_key_id,
-            secret=aws_secret_access_key,
-            endpoint_url=aws_endpoint_url,
+        return (
+            sys.maxsize,
+            importlib.import_module("s3fs").S3FileSystem(
+                key=aws_access_key_id,
+                secret=aws_secret_access_key,
+                endpoint_url=aws_endpoint_url,
+            ),
         )
 
-    def _get_oss_filesystem(self):
+    def _get_oss_filesystem(self, fileset_catalog: Catalog, identifier: 
NameIdentifier):
+        # Can get credential from the fileset
+        try:
+            fileset: GenericFileset = 
fileset_catalog.as_fileset_catalog().load_fileset(
+                NameIdentifier.of(identifier.namespace().level(2), 
identifier.name())
+            )
+            credentials = fileset.support_credentials().get_credentials()
+        except (NoSuchCredentialException, CatalogNotInUseException) as e:
+            logger.warning("Failed to get credentials from fileset: %s", e)
+            credentials = []
+
+        credential = self._get_most_suitable_oss_credential(credentials)
+        if credential is not None:
+            oss_endpoint = fileset_catalog.properties()["oss-endpoint"]
+            expire_time = 
self._get_expire_time_by_ratio(credential.expire_time_in_ms())
+            if isinstance(credential, OSSTokenCredential):
+                fs = importlib.import_module("ossfs").OSSFileSystem(
+                    key=credential.access_key_id(),
+                    secret=credential.secret_access_key(),
+                    token=credential.security_token(),
+                    endpoint=oss_endpoint,
+                )
+                return (expire_time, fs)
+            if isinstance(credential, OSSSecretKeyCredential):
+                return (
+                    expire_time,
+                    importlib.import_module("ossfs").OSSFileSystem(
+                        key=credential.access_key_id(),
+                        secret=credential.secret_access_key(),
+                        endpoint=oss_endpoint,
+                    ),
+                )
+
+        oss_endpoint_url = 
self._options.get(GVFSConfig.GVFS_FILESYSTEM_OSS_ENDPOINT)

Review Comment:
   This behavior is a little weird: you use the OSS endpoint only when 
credentials are enabled on the server side. It seems the client side doesn't 
need to configure it.



##########
clients/client-python/tests/integration/test_gvfs_with_s3_credential.py:
##########
@@ -0,0 +1,151 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+import os
+from random import randint
+import unittest
+
+from s3fs import S3FileSystem
+
+from gravitino import (
+    gvfs,
+    GravitinoClient,
+    Catalog,
+    Fileset,
+)
+from gravitino.filesystem.gvfs_config import GVFSConfig
+from tests.integration.test_gvfs_with_s3 import TestGvfsWithS3
+
+logger = logging.getLogger(__name__)
+
+
+def s3_with_credential_is_configured():
+    return all(
+        [
+            os.environ.get("S3_STS_ACCESS_KEY_ID") is not None,
+            os.environ.get("S3_STS_SECRET_ACCESS_KEY") is not None,
+            os.environ.get("S3_STS_ENDPOINT") is not None,
+            os.environ.get("S3_STS_BUCKET_NAME") is not None,
+            os.environ.get("S3_STS_REGION") is not None,
+            os.environ.get("S3_STS_ROLE_ARN") is not None,
+        ]
+    )
+
+
+@unittest.skipUnless(s3_with_credential_is_configured(), "S3 is not 
configured.")
+class TestGvfsWithS3Credential(TestGvfsWithS3):
+    # Before running this test, please set the make sure aws-bundle-x.jar has 
been
+    # copy to the $GRAVITINO_HOME/catalogs/hadoop/libs/ directory
+    s3_access_key = os.environ.get("S3_STS_ACCESS_KEY_ID")
+    s3_secret_key = os.environ.get("S3_STS_SECRET_ACCESS_KEY")
+    s3_endpoint = os.environ.get("S3_STS_ENDPOINT")
+    bucket_name = os.environ.get("S3_STS_BUCKET_NAME")
+    s3_sts_region = os.environ.get("S3_STS_REGION")
+    s3_role_arn = os.environ.get("S3_STS_ROLE_ARN")
+
+    metalake_name: str = "TestGvfsWithS3Credential_metalake" + str(randint(1, 
10000))
+
+    def setUp(self):
+        self.options = {
+            f"{GVFSConfig.GVFS_FILESYSTEM_S3_ACCESS_KEY}": self.s3_access_key,
+            f"{GVFSConfig.GVFS_FILESYSTEM_S3_SECRET_KEY}": self.s3_secret_key,
+            f"{GVFSConfig.GVFS_FILESYSTEM_S3_ENDPOINT}": self.s3_endpoint,
+        }
+
+    @classmethod
+    def _init_test_entities(cls):
+        cls.gravitino_admin_client.create_metalake(
+            name=cls.metalake_name, comment="", properties={}
+        )
+        cls.gravitino_client = GravitinoClient(
+            uri="http://localhost:8090", metalake_name=cls.metalake_name
+        )
+
+        cls.config = {}
+        cls.conf = {}
+        catalog = cls.gravitino_client.create_catalog(
+            name=cls.catalog_name,
+            catalog_type=Catalog.Type.FILESET,
+            provider=cls.catalog_provider,
+            comment="",
+            properties={
+                "filesystem-providers": "s3",
+                "s3-access-key-id": cls.s3_access_key,
+                "s3-secret-access-key": cls.s3_secret_key,
+                "s3-endpoint": cls.s3_endpoint,
+                "s3-region": cls.s3_sts_region,
+                "s3-role-arn": cls.s3_role_arn,
+                "credential-providers": "s3-token",
+            },
+        )
+        catalog.as_schemas().create_schema(
+            schema_name=cls.schema_name, comment="", properties={}
+        )
+
+        cls.fileset_storage_location: str = (
+            
f"s3a://{cls.bucket_name}/{cls.catalog_name}/{cls.schema_name}/{cls.fileset_name}"
+        )
+        cls.fileset_gvfs_location = (
+            
f"gvfs://fileset/{cls.catalog_name}/{cls.schema_name}/{cls.fileset_name}"
+        )
+        catalog.as_fileset_catalog().create_fileset(
+            ident=cls.fileset_ident,
+            fileset_type=Fileset.Type.MANAGED,
+            comment=cls.fileset_comment,
+            storage_location=cls.fileset_storage_location,
+            properties=cls.fileset_properties,
+        )
+
+        cls.fs = S3FileSystem(
+            key=cls.s3_access_key,
+            secret=cls.s3_secret_key,
+            endpoint_url=cls.s3_endpoint,
+        )
+
+    # The following tests are copied from 
tests/integration/test_gvfs_with_s3.py, with some modifications as
+    # `mkdir` and `makedirs` have different behavior in the S3, other cloud 
storage like GCS, ABS, and OSS.
+    # are similar.
+    def test_mkdir(self):
+        mkdir_dir = self.fileset_gvfs_location + "/test_mkdir"
+        mkdir_actual_dir = self.fileset_storage_location + "/test_mkdir"
+        fs = gvfs.GravitinoVirtualFileSystem(
+            server_uri="http://localhost:8090",
+            metalake_name=self.metalake_name,
+            options=self.options,
+            **self.conf,
+        )
+
+        # it actually takes no effect.
+        self.check_mkdir(mkdir_dir, mkdir_actual_dir, fs)
+
+        with self.assertRaises(ValueError):
+            fs.mkdir(mkdir_dir, create_parents=True)
+        self.assertFalse(fs.exists(mkdir_dir))
+
+    def test_makedirs(self):

Review Comment:
   test_mkdirs?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to