HonahX commented on code in PR #8304:
URL: https://github.com/apache/iceberg/pull/8304#discussion_r1295358314


##########
python/pyiceberg/catalog/glue.py:
##########
@@ -56,124 +64,109 @@
 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
 from pyiceberg.typedef import EMPTY_DICT
 
-BOTO_SESSION_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key", 
"aws_session_token", "region_name", "profile_name"]
-
-GLUE_CLIENT = "glue"
-
-
-PROP_GLUE_TABLE = "Table"
-PROP_GLUE_TABLE_TYPE = "TableType"
-PROP_GLUE_TABLE_DESCRIPTION = "Description"
-PROP_GLUE_TABLE_PARAMETERS = "Parameters"
-PROP_GLUE_TABLE_DATABASE_NAME = "DatabaseName"
-PROP_GLUE_TABLE_NAME = "Name"
-PROP_GLUE_TABLE_OWNER = "Owner"
-PROP_GLUE_TABLE_STORAGE_DESCRIPTOR = "StorageDescriptor"
-
-PROP_GLUE_TABLELIST = "TableList"
-
-PROP_GLUE_DATABASE = "Database"
-PROP_GLUE_DATABASE_LIST = "DatabaseList"
-PROP_GLUE_DATABASE_NAME = "Name"
-PROP_GLUE_DATABASE_LOCATION = "LocationUri"
-PROP_GLUE_DATABASE_DESCRIPTION = "Description"
-PROP_GLUE_DATABASE_PARAMETERS = "Parameters"
-
-PROP_GLUE_NEXT_TOKEN = "NextToken"
-
-GLUE_DESCRIPTION_KEY = "comment"
-
 
 def _construct_parameters(metadata_location: str) -> Properties:
     return {TABLE_TYPE: ICEBERG.upper(), METADATA_LOCATION: metadata_location}
 
 
-def _construct_create_table_input(table_name: str, metadata_location: str, 
properties: Properties) -> Dict[str, Any]:
-    table_input = {
-        PROP_GLUE_TABLE_NAME: table_name,
-        PROP_GLUE_TABLE_TYPE: EXTERNAL_TABLE,
-        PROP_GLUE_TABLE_PARAMETERS: _construct_parameters(metadata_location),
+def _construct_create_table_input(table_name: str, metadata_location: str, 
properties: Properties) -> TableInputTypeDef:
+    table_input: TableInputTypeDef = {
+        "Name": table_name,
+        "TableType": EXTERNAL_TABLE,
+        "Parameters": _construct_parameters(metadata_location),
     }
 
-    if table_description := properties.get(GLUE_DESCRIPTION_KEY):
-        table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+    if "Description" in properties:
+        table_input["Description"] = properties["Description"]
 
     return table_input
 
 
-def _construct_rename_table_input(to_table_name: str, glue_table: Dict[str, 
Any]) -> Dict[str, Any]:
-    rename_table_input = {PROP_GLUE_TABLE_NAME: to_table_name}
+def _construct_rename_table_input(to_table_name: str, glue_table: 
TableTypeDef) -> TableInputTypeDef:
+    rename_table_input: TableInputTypeDef = {"Name": to_table_name}
     # use the same Glue info to create the new table, pointing to the old 
metadata
-    if table_type := glue_table.get(PROP_GLUE_TABLE_TYPE):
-        rename_table_input[PROP_GLUE_TABLE_TYPE] = table_type
-    if table_parameters := glue_table.get(PROP_GLUE_TABLE_PARAMETERS):
-        rename_table_input[PROP_GLUE_TABLE_PARAMETERS] = table_parameters
-    if table_owner := glue_table.get(PROP_GLUE_TABLE_OWNER):
-        rename_table_input[PROP_GLUE_TABLE_OWNER] = table_owner
-    if table_storage_descriptor := 
glue_table.get(PROP_GLUE_TABLE_STORAGE_DESCRIPTOR):
-        rename_table_input[PROP_GLUE_TABLE_STORAGE_DESCRIPTOR] = 
table_storage_descriptor
-    if table_description := glue_table.get(PROP_GLUE_TABLE_DESCRIPTION):
-        rename_table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+    rename_table_input["TableType"] = glue_table["TableType"]

Review Comment:
   I think we may still need `if "TableType" in glue_table` here? `TableType` 
is not required according to 
https://youtype.github.io/boto3_stubs_docs/mypy_boto3_glue/type_defs/#tabletypedef



##########
python/pyiceberg/catalog/glue.py:
##########
@@ -413,15 +412,18 @@ def list_namespaces(self, namespace: Union[str, 
Identifier] = ()) -> List[Identi
         if namespace:
             return []
 
-        database_list = []
+        database_list: List[DatabaseTypeDef] = []
         databases_response = self.glue.get_databases()
-        next_token = databases_response.get(PROP_GLUE_NEXT_TOKEN)
-        database_list += databases_response.get(PROP_GLUE_DATABASE_LIST, [])
-        while next_token:
-            databases_response = self.glue.get_databases(NextToken=next_token)
-            next_token = databases_response.get(PROP_GLUE_NEXT_TOKEN)
-            database_list += databases_response.get(PROP_GLUE_DATABASE_LIST, 
[])
-        return 
[self.identifier_to_tuple(database.get(PROP_GLUE_DATABASE_NAME)) for database 
in database_list]
+        next_token: Optional[str] = None
+
+        while True:
+            databases_response = self.glue.get_databases() if not next_token 
else self.glue.get_databases(NextToken=next_token)
+            database_list.extend(databases_response["DatabaseList"])
+            next_token = databases_response.get("NextToken")
+            if not next_token:
+                break

Review Comment:
   Thanks for the refactoring!



##########
python/pyiceberg/catalog/glue.py:
##########
@@ -56,124 +64,109 @@
 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
 from pyiceberg.typedef import EMPTY_DICT
 
-BOTO_SESSION_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key", 
"aws_session_token", "region_name", "profile_name"]
-
-GLUE_CLIENT = "glue"
-
-
-PROP_GLUE_TABLE = "Table"
-PROP_GLUE_TABLE_TYPE = "TableType"
-PROP_GLUE_TABLE_DESCRIPTION = "Description"
-PROP_GLUE_TABLE_PARAMETERS = "Parameters"
-PROP_GLUE_TABLE_DATABASE_NAME = "DatabaseName"
-PROP_GLUE_TABLE_NAME = "Name"
-PROP_GLUE_TABLE_OWNER = "Owner"
-PROP_GLUE_TABLE_STORAGE_DESCRIPTOR = "StorageDescriptor"

Review Comment:
   I see that with the incorporation of Boto3 types, we now have an enhanced 
verification mechanism for key names when defining table inputs and parameters. 
However, in my perspective, it's still beneficial to retain the use of string 
constants, both to minimize the chance of typos and to ensure consistency 
throughout our codebase. For context, we've employed this approach with string 
constants in both hive.py and rest.py:
   
https://github.com/apache/iceberg/blob/cf358a4d150a3c7dcef9d1bccb09986a938aa128/python/pyiceberg/catalog/hive.py#L149-L152
   
https://github.com/apache/iceberg/blob/cf358a4d150a3c7dcef9d1bccb09986a938aa128/python/pyiceberg/catalog/rest.py#L90-L98
   
   If the concern revolves around the clarity of the variable names, perhaps we 
can consider simplifying them? For example:
   ```
   TABLE = "Table"
   TABLE_TYPE = "TableType"
   ...
   ```
   
   I'd truly appreciate your perspective on this. Do you think this approach 
would be beneficial, or do you have alternative suggestions?



##########
python/pyiceberg/catalog/glue.py:
##########
@@ -56,124 +64,109 @@
 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
 from pyiceberg.typedef import EMPTY_DICT
 
-BOTO_SESSION_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key", 
"aws_session_token", "region_name", "profile_name"]
-
-GLUE_CLIENT = "glue"
-
-
-PROP_GLUE_TABLE = "Table"
-PROP_GLUE_TABLE_TYPE = "TableType"
-PROP_GLUE_TABLE_DESCRIPTION = "Description"
-PROP_GLUE_TABLE_PARAMETERS = "Parameters"
-PROP_GLUE_TABLE_DATABASE_NAME = "DatabaseName"
-PROP_GLUE_TABLE_NAME = "Name"
-PROP_GLUE_TABLE_OWNER = "Owner"
-PROP_GLUE_TABLE_STORAGE_DESCRIPTOR = "StorageDescriptor"
-
-PROP_GLUE_TABLELIST = "TableList"
-
-PROP_GLUE_DATABASE = "Database"
-PROP_GLUE_DATABASE_LIST = "DatabaseList"
-PROP_GLUE_DATABASE_NAME = "Name"
-PROP_GLUE_DATABASE_LOCATION = "LocationUri"
-PROP_GLUE_DATABASE_DESCRIPTION = "Description"
-PROP_GLUE_DATABASE_PARAMETERS = "Parameters"
-
-PROP_GLUE_NEXT_TOKEN = "NextToken"
-
-GLUE_DESCRIPTION_KEY = "comment"
-
 
 def _construct_parameters(metadata_location: str) -> Properties:
     return {TABLE_TYPE: ICEBERG.upper(), METADATA_LOCATION: metadata_location}
 
 
-def _construct_create_table_input(table_name: str, metadata_location: str, 
properties: Properties) -> Dict[str, Any]:
-    table_input = {
-        PROP_GLUE_TABLE_NAME: table_name,
-        PROP_GLUE_TABLE_TYPE: EXTERNAL_TABLE,
-        PROP_GLUE_TABLE_PARAMETERS: _construct_parameters(metadata_location),
+def _construct_create_table_input(table_name: str, metadata_location: str, 
properties: Properties) -> TableInputTypeDef:
+    table_input: TableInputTypeDef = {
+        "Name": table_name,
+        "TableType": EXTERNAL_TABLE,
+        "Parameters": _construct_parameters(metadata_location),
     }
 
-    if table_description := properties.get(GLUE_DESCRIPTION_KEY):
-        table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+    if "Description" in properties:
+        table_input["Description"] = properties["Description"]
 
     return table_input
 
 
-def _construct_rename_table_input(to_table_name: str, glue_table: Dict[str, 
Any]) -> Dict[str, Any]:
-    rename_table_input = {PROP_GLUE_TABLE_NAME: to_table_name}
+def _construct_rename_table_input(to_table_name: str, glue_table: 
TableTypeDef) -> TableInputTypeDef:
+    rename_table_input: TableInputTypeDef = {"Name": to_table_name}
     # use the same Glue info to create the new table, pointing to the old 
metadata
-    if table_type := glue_table.get(PROP_GLUE_TABLE_TYPE):
-        rename_table_input[PROP_GLUE_TABLE_TYPE] = table_type
-    if table_parameters := glue_table.get(PROP_GLUE_TABLE_PARAMETERS):
-        rename_table_input[PROP_GLUE_TABLE_PARAMETERS] = table_parameters
-    if table_owner := glue_table.get(PROP_GLUE_TABLE_OWNER):
-        rename_table_input[PROP_GLUE_TABLE_OWNER] = table_owner
-    if table_storage_descriptor := 
glue_table.get(PROP_GLUE_TABLE_STORAGE_DESCRIPTOR):
-        rename_table_input[PROP_GLUE_TABLE_STORAGE_DESCRIPTOR] = 
table_storage_descriptor
-    if table_description := glue_table.get(PROP_GLUE_TABLE_DESCRIPTION):
-        rename_table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+    rename_table_input["TableType"] = glue_table["TableType"]
+    if "Owner" in glue_table:
+        rename_table_input["Owner"] = glue_table["Owner"]
+
+    if "Parameters" in glue_table:
+        rename_table_input["Parameters"] = glue_table["Parameters"]
+
+    if "StorageDescriptor" in glue_table:
+        # It turns out the output of StorageDescriptor is not the same as the 
input type
+        # because the Column can have a different type, but for now it seems 
to work, so
+        # silence the type error.
+        rename_table_input["StorageDescriptor"] = 
cast(StorageDescriptorTypeDef, glue_table["StorageDescriptor"])
+
+    if "Description" in glue_table:
+        rename_table_input["Description"] = glue_table["Description"]
+
     return rename_table_input
 
 
-def _construct_database_input(database_name: str, properties: Properties) -> 
Dict[str, Any]:
-    database_input: Dict[str, Any] = {PROP_GLUE_DATABASE_NAME: database_name}
+def _construct_database_input(database_name: str, properties: Properties) -> 
DatabaseInputTypeDef:
+    database_input: DatabaseInputTypeDef = {"Name": database_name}
     parameters = {}
     for k, v in properties.items():
-        if k == GLUE_DESCRIPTION_KEY:
-            database_input[PROP_GLUE_DATABASE_DESCRIPTION] = v
+        if k == "Description":
+            database_input["Description"] = v
         elif k == LOCATION:
-            database_input[PROP_GLUE_DATABASE_LOCATION] = v
+            database_input["LocationUri"] = v
         else:
             parameters[k] = v
-    database_input[PROP_GLUE_DATABASE_PARAMETERS] = parameters
+    database_input["Parameters"] = parameters
     return database_input
 
 
 class GlueCatalog(Catalog):
-    def __init__(self, name: str, **properties: str):
+    def __init__(self, name: str, **properties: Any):
         super().__init__(name, **properties)
 
-        session_config = {k: v for k, v in properties.items() if k in 
BOTO_SESSION_CONFIG_KEYS}
-        session = boto3.Session(**session_config)
-        self.glue = session.client(GLUE_CLIENT)
+        session = boto3.Session(
+            profile_name=properties.get("profile_name"),
+            region_name=properties.get("region_name"),
+            botocore_session=properties.get("botocore_session"),
+            aws_access_key_id=properties.get("aws_access_key_id"),
+            aws_secret_access_key=properties.get("aws_secret_access_key"),
+            aws_session_token=properties.get("aws_session_token"),
+        )
+        self.glue: GlueClient = session.client("glue")
+
+    def _convert_glue_to_iceberg(self, glue_table: TableTypeDef) -> Table:
+        properties: Properties = glue_table["Parameters"]
 
-    def _convert_glue_to_iceberg(self, glue_table: Dict[str, Any]) -> Table:
-        properties: Properties = glue_table.get(PROP_GLUE_TABLE_PARAMETERS, {})
+        database_name = glue_table["DatabaseName"]

Review Comment:
   Do we want to add null check for "Parameters" and "DatabaseName" as they are 
specified as `NotRequired` according to 
https://youtype.github.io/boto3_stubs_docs/mypy_boto3_glue/type_defs/#tabletypedef
 ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to