HonahX commented on code in PR #8304:
URL: https://github.com/apache/iceberg/pull/8304#discussion_r1295358314
##########
python/pyiceberg/catalog/glue.py:
##########
@@ -56,124 +64,109 @@
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.typedef import EMPTY_DICT
-BOTO_SESSION_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key",
"aws_session_token", "region_name", "profile_name"]
-
-GLUE_CLIENT = "glue"
-
-
-PROP_GLUE_TABLE = "Table"
-PROP_GLUE_TABLE_TYPE = "TableType"
-PROP_GLUE_TABLE_DESCRIPTION = "Description"
-PROP_GLUE_TABLE_PARAMETERS = "Parameters"
-PROP_GLUE_TABLE_DATABASE_NAME = "DatabaseName"
-PROP_GLUE_TABLE_NAME = "Name"
-PROP_GLUE_TABLE_OWNER = "Owner"
-PROP_GLUE_TABLE_STORAGE_DESCRIPTOR = "StorageDescriptor"
-
-PROP_GLUE_TABLELIST = "TableList"
-
-PROP_GLUE_DATABASE = "Database"
-PROP_GLUE_DATABASE_LIST = "DatabaseList"
-PROP_GLUE_DATABASE_NAME = "Name"
-PROP_GLUE_DATABASE_LOCATION = "LocationUri"
-PROP_GLUE_DATABASE_DESCRIPTION = "Description"
-PROP_GLUE_DATABASE_PARAMETERS = "Parameters"
-
-PROP_GLUE_NEXT_TOKEN = "NextToken"
-
-GLUE_DESCRIPTION_KEY = "comment"
-
def _construct_parameters(metadata_location: str) -> Properties:
return {TABLE_TYPE: ICEBERG.upper(), METADATA_LOCATION: metadata_location}
-def _construct_create_table_input(table_name: str, metadata_location: str,
properties: Properties) -> Dict[str, Any]:
- table_input = {
- PROP_GLUE_TABLE_NAME: table_name,
- PROP_GLUE_TABLE_TYPE: EXTERNAL_TABLE,
- PROP_GLUE_TABLE_PARAMETERS: _construct_parameters(metadata_location),
+def _construct_create_table_input(table_name: str, metadata_location: str,
properties: Properties) -> TableInputTypeDef:
+ table_input: TableInputTypeDef = {
+ "Name": table_name,
+ "TableType": EXTERNAL_TABLE,
+ "Parameters": _construct_parameters(metadata_location),
}
- if table_description := properties.get(GLUE_DESCRIPTION_KEY):
- table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+ if "Description" in properties:
+ table_input["Description"] = properties["Description"]
return table_input
-def _construct_rename_table_input(to_table_name: str, glue_table: Dict[str,
Any]) -> Dict[str, Any]:
- rename_table_input = {PROP_GLUE_TABLE_NAME: to_table_name}
+def _construct_rename_table_input(to_table_name: str, glue_table:
TableTypeDef) -> TableInputTypeDef:
+ rename_table_input: TableInputTypeDef = {"Name": to_table_name}
# use the same Glue info to create the new table, pointing to the old
metadata
- if table_type := glue_table.get(PROP_GLUE_TABLE_TYPE):
- rename_table_input[PROP_GLUE_TABLE_TYPE] = table_type
- if table_parameters := glue_table.get(PROP_GLUE_TABLE_PARAMETERS):
- rename_table_input[PROP_GLUE_TABLE_PARAMETERS] = table_parameters
- if table_owner := glue_table.get(PROP_GLUE_TABLE_OWNER):
- rename_table_input[PROP_GLUE_TABLE_OWNER] = table_owner
- if table_storage_descriptor :=
glue_table.get(PROP_GLUE_TABLE_STORAGE_DESCRIPTOR):
- rename_table_input[PROP_GLUE_TABLE_STORAGE_DESCRIPTOR] =
table_storage_descriptor
- if table_description := glue_table.get(PROP_GLUE_TABLE_DESCRIPTION):
- rename_table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+ rename_table_input["TableType"] = glue_table["TableType"]
Review Comment:
I think we may still need `if "TableType" in glue_table` here? `TableType`
is not required according to
https://youtype.github.io/boto3_stubs_docs/mypy_boto3_glue/type_defs/#tabletypedef
##########
python/pyiceberg/catalog/glue.py:
##########
@@ -413,15 +412,18 @@ def list_namespaces(self, namespace: Union[str,
Identifier] = ()) -> List[Identi
if namespace:
return []
- database_list = []
+ database_list: List[DatabaseTypeDef] = []
databases_response = self.glue.get_databases()
- next_token = databases_response.get(PROP_GLUE_NEXT_TOKEN)
- database_list += databases_response.get(PROP_GLUE_DATABASE_LIST, [])
- while next_token:
- databases_response = self.glue.get_databases(NextToken=next_token)
- next_token = databases_response.get(PROP_GLUE_NEXT_TOKEN)
- database_list += databases_response.get(PROP_GLUE_DATABASE_LIST,
[])
- return
[self.identifier_to_tuple(database.get(PROP_GLUE_DATABASE_NAME)) for database
in database_list]
+ next_token: Optional[str] = None
+
+ while True:
+ databases_response = self.glue.get_databases() if not next_token
else self.glue.get_databases(NextToken=next_token)
+ database_list.extend(databases_response["DatabaseList"])
+ next_token = databases_response.get("NextToken")
+ if not next_token:
+ break
Review Comment:
Thanks for the refactoring!
##########
python/pyiceberg/catalog/glue.py:
##########
@@ -56,124 +64,109 @@
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.typedef import EMPTY_DICT
-BOTO_SESSION_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key",
"aws_session_token", "region_name", "profile_name"]
-
-GLUE_CLIENT = "glue"
-
-
-PROP_GLUE_TABLE = "Table"
-PROP_GLUE_TABLE_TYPE = "TableType"
-PROP_GLUE_TABLE_DESCRIPTION = "Description"
-PROP_GLUE_TABLE_PARAMETERS = "Parameters"
-PROP_GLUE_TABLE_DATABASE_NAME = "DatabaseName"
-PROP_GLUE_TABLE_NAME = "Name"
-PROP_GLUE_TABLE_OWNER = "Owner"
-PROP_GLUE_TABLE_STORAGE_DESCRIPTOR = "StorageDescriptor"
Review Comment:
I see that with the incorporation of Boto3 types, we now have an enhanced
verification mechanism for key names when defining table inputs and parameters.
However, in my perspective, it's still beneficial to retain the use of string
constants, both to minimize the chance of typos and to ensure consistency
throughout our codebase. For context, we've employed this approach with string
constants in both hive.py and rest.py:
https://github.com/apache/iceberg/blob/cf358a4d150a3c7dcef9d1bccb09986a938aa128/python/pyiceberg/catalog/hive.py#L149-L152
https://github.com/apache/iceberg/blob/cf358a4d150a3c7dcef9d1bccb09986a938aa128/python/pyiceberg/catalog/rest.py#L90-L98
If the concern revolves around the clarity of the variable names, perhaps we
can consider simplifying them? For example:
```
TABLE = "Table"
TABLE_TYPE = "TableType"
...
```
I'd truly appreciate your perspective on this. Do you think this approach
would be beneficial, or do you have alternative suggestions?
##########
python/pyiceberg/catalog/glue.py:
##########
@@ -56,124 +64,109 @@
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.typedef import EMPTY_DICT
-BOTO_SESSION_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key",
"aws_session_token", "region_name", "profile_name"]
-
-GLUE_CLIENT = "glue"
-
-
-PROP_GLUE_TABLE = "Table"
-PROP_GLUE_TABLE_TYPE = "TableType"
-PROP_GLUE_TABLE_DESCRIPTION = "Description"
-PROP_GLUE_TABLE_PARAMETERS = "Parameters"
-PROP_GLUE_TABLE_DATABASE_NAME = "DatabaseName"
-PROP_GLUE_TABLE_NAME = "Name"
-PROP_GLUE_TABLE_OWNER = "Owner"
-PROP_GLUE_TABLE_STORAGE_DESCRIPTOR = "StorageDescriptor"
-
-PROP_GLUE_TABLELIST = "TableList"
-
-PROP_GLUE_DATABASE = "Database"
-PROP_GLUE_DATABASE_LIST = "DatabaseList"
-PROP_GLUE_DATABASE_NAME = "Name"
-PROP_GLUE_DATABASE_LOCATION = "LocationUri"
-PROP_GLUE_DATABASE_DESCRIPTION = "Description"
-PROP_GLUE_DATABASE_PARAMETERS = "Parameters"
-
-PROP_GLUE_NEXT_TOKEN = "NextToken"
-
-GLUE_DESCRIPTION_KEY = "comment"
-
def _construct_parameters(metadata_location: str) -> Properties:
return {TABLE_TYPE: ICEBERG.upper(), METADATA_LOCATION: metadata_location}
-def _construct_create_table_input(table_name: str, metadata_location: str,
properties: Properties) -> Dict[str, Any]:
- table_input = {
- PROP_GLUE_TABLE_NAME: table_name,
- PROP_GLUE_TABLE_TYPE: EXTERNAL_TABLE,
- PROP_GLUE_TABLE_PARAMETERS: _construct_parameters(metadata_location),
+def _construct_create_table_input(table_name: str, metadata_location: str,
properties: Properties) -> TableInputTypeDef:
+ table_input: TableInputTypeDef = {
+ "Name": table_name,
+ "TableType": EXTERNAL_TABLE,
+ "Parameters": _construct_parameters(metadata_location),
}
- if table_description := properties.get(GLUE_DESCRIPTION_KEY):
- table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+ if "Description" in properties:
+ table_input["Description"] = properties["Description"]
return table_input
-def _construct_rename_table_input(to_table_name: str, glue_table: Dict[str,
Any]) -> Dict[str, Any]:
- rename_table_input = {PROP_GLUE_TABLE_NAME: to_table_name}
+def _construct_rename_table_input(to_table_name: str, glue_table:
TableTypeDef) -> TableInputTypeDef:
+ rename_table_input: TableInputTypeDef = {"Name": to_table_name}
# use the same Glue info to create the new table, pointing to the old
metadata
- if table_type := glue_table.get(PROP_GLUE_TABLE_TYPE):
- rename_table_input[PROP_GLUE_TABLE_TYPE] = table_type
- if table_parameters := glue_table.get(PROP_GLUE_TABLE_PARAMETERS):
- rename_table_input[PROP_GLUE_TABLE_PARAMETERS] = table_parameters
- if table_owner := glue_table.get(PROP_GLUE_TABLE_OWNER):
- rename_table_input[PROP_GLUE_TABLE_OWNER] = table_owner
- if table_storage_descriptor :=
glue_table.get(PROP_GLUE_TABLE_STORAGE_DESCRIPTOR):
- rename_table_input[PROP_GLUE_TABLE_STORAGE_DESCRIPTOR] =
table_storage_descriptor
- if table_description := glue_table.get(PROP_GLUE_TABLE_DESCRIPTION):
- rename_table_input[PROP_GLUE_TABLE_DESCRIPTION] = table_description
+ rename_table_input["TableType"] = glue_table["TableType"]
+ if "Owner" in glue_table:
+ rename_table_input["Owner"] = glue_table["Owner"]
+
+ if "Parameters" in glue_table:
+ rename_table_input["Parameters"] = glue_table["Parameters"]
+
+ if "StorageDescriptor" in glue_table:
+ # It turns out the output of StorageDescriptor is not the same as the
input type
+ # because the Column can have a different type, but for now it seems
to work, so
+ # silence the type error.
+ rename_table_input["StorageDescriptor"] =
cast(StorageDescriptorTypeDef, glue_table["StorageDescriptor"])
+
+ if "Description" in glue_table:
+ rename_table_input["Description"] = glue_table["Description"]
+
return rename_table_input
-def _construct_database_input(database_name: str, properties: Properties) ->
Dict[str, Any]:
- database_input: Dict[str, Any] = {PROP_GLUE_DATABASE_NAME: database_name}
+def _construct_database_input(database_name: str, properties: Properties) ->
DatabaseInputTypeDef:
+ database_input: DatabaseInputTypeDef = {"Name": database_name}
parameters = {}
for k, v in properties.items():
- if k == GLUE_DESCRIPTION_KEY:
- database_input[PROP_GLUE_DATABASE_DESCRIPTION] = v
+ if k == "Description":
+ database_input["Description"] = v
elif k == LOCATION:
- database_input[PROP_GLUE_DATABASE_LOCATION] = v
+ database_input["LocationUri"] = v
else:
parameters[k] = v
- database_input[PROP_GLUE_DATABASE_PARAMETERS] = parameters
+ database_input["Parameters"] = parameters
return database_input
class GlueCatalog(Catalog):
- def __init__(self, name: str, **properties: str):
+ def __init__(self, name: str, **properties: Any):
super().__init__(name, **properties)
- session_config = {k: v for k, v in properties.items() if k in
BOTO_SESSION_CONFIG_KEYS}
- session = boto3.Session(**session_config)
- self.glue = session.client(GLUE_CLIENT)
+ session = boto3.Session(
+ profile_name=properties.get("profile_name"),
+ region_name=properties.get("region_name"),
+ botocore_session=properties.get("botocore_session"),
+ aws_access_key_id=properties.get("aws_access_key_id"),
+ aws_secret_access_key=properties.get("aws_secret_access_key"),
+ aws_session_token=properties.get("aws_session_token"),
+ )
+ self.glue: GlueClient = session.client("glue")
+
+ def _convert_glue_to_iceberg(self, glue_table: TableTypeDef) -> Table:
+ properties: Properties = glue_table["Parameters"]
- def _convert_glue_to_iceberg(self, glue_table: Dict[str, Any]) -> Table:
- properties: Properties = glue_table.get(PROP_GLUE_TABLE_PARAMETERS, {})
+ database_name = glue_table["DatabaseName"]
Review Comment:
Do we want to add null check for "Parameters" and "DatabaseName" as they are
specified as `NotRequired` according to
https://youtype.github.io/boto3_stubs_docs/mypy_boto3_glue/type_defs/#tabletypedef
?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]