This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new eeb0253373 Python: Add catalog name to identifiers (#7946)
eeb0253373 is described below
commit eeb0253373322bf2e8eb195955e21ff9904b8e1b
Author: Jonas(Rushan) Jiang <[email protected]>
AuthorDate: Thu Jul 6 00:34:23 2023 -0700
Python: Add catalog name to identifiers (#7946)
* add catalog name to the identifier of table returned by glue catalog
* add catalog name to the identifier of table returned by hive catalog
---
python/pyiceberg/catalog/glue.py | 2 +-
python/pyiceberg/catalog/hive.py | 2 +-
python/tests/catalog/integration_test_glue.py | 15 +++++----
python/tests/catalog/test_glue.py | 47 ++++++++++++++++++---------
python/tests/catalog/test_hive.py | 2 +-
5 files changed, 42 insertions(+), 26 deletions(-)
diff --git a/python/pyiceberg/catalog/glue.py b/python/pyiceberg/catalog/glue.py
index 7e06f2e47f..1d5160dbaa 100644
--- a/python/pyiceberg/catalog/glue.py
+++ b/python/pyiceberg/catalog/glue.py
@@ -166,7 +166,7 @@ class GlueCatalog(Catalog):
file = io.new_input(metadata_location)
metadata = FromInputFile.table_metadata(file)
return Table(
- identifier=(glue_table[PROP_GLUE_TABLE_DATABASE_NAME], glue_table[PROP_GLUE_TABLE_NAME]),
+ identifier=(self.name, glue_table[PROP_GLUE_TABLE_DATABASE_NAME], glue_table[PROP_GLUE_TABLE_NAME]),
metadata=metadata,
metadata_location=metadata_location,
io=self._load_file_io(metadata.properties, metadata_location),
diff --git a/python/pyiceberg/catalog/hive.py b/python/pyiceberg/catalog/hive.py
index 839fb2a3d5..08655676de 100644
--- a/python/pyiceberg/catalog/hive.py
+++ b/python/pyiceberg/catalog/hive.py
@@ -239,7 +239,7 @@ class HiveCatalog(Catalog):
file = io.new_input(metadata_location)
metadata = FromInputFile.table_metadata(file)
return Table(
- identifier=(table.dbName, table.tableName),
+ identifier=(self.name, table.dbName, table.tableName),
metadata=metadata,
metadata_location=metadata_location,
io=self._load_file_io(metadata.properties, metadata_location),
diff --git a/python/tests/catalog/integration_test_glue.py b/python/tests/catalog/integration_test_glue.py
index 6f07720b0a..bd025f2a3d 100644
--- a/python/tests/catalog/integration_test_glue.py
+++ b/python/tests/catalog/integration_test_glue.py
@@ -35,6 +35,7 @@ from tests.conftest import clean_up, get_bucket_name, get_s3_path
# The number of tables/databases used in list_table/namespace test
LIST_TEST_NUMBER = 2
+CATALOG_NAME = "glue"
@pytest.fixture(name="glue", scope="module")
@@ -45,7 +46,7 @@ def fixture_glue_client() -> boto3.client:
@pytest.fixture(name="test_catalog", scope="module")
def fixture_test_catalog() -> Generator[Catalog, None, None]:
"""The pre- and post-setting of aws integration test."""
- test_catalog = GlueCatalog("glue", warehouse=get_s3_path(get_bucket_name()))
+ test_catalog = GlueCatalog(CATALOG_NAME, warehouse=get_s3_path(get_bucket_name()))
yield test_catalog
clean_up(test_catalog)
@@ -57,7 +58,7 @@ def test_create_table(
test_catalog.create_namespace(database_name)
test_catalog.create_table(identifier, table_schema_nested,
get_s3_path(get_bucket_name(), database_name, table_name))
table = test_catalog.load_table(identifier)
- assert table.identifier == identifier
+ assert table.identifier == (CATALOG_NAME,) + identifier
metadata_location = table.metadata_location.split(get_bucket_name())[1][1:]
s3.head_object(Bucket=get_bucket_name(), Key=metadata_location)
@@ -78,7 +79,7 @@ def test_create_table_with_default_location(
test_catalog.create_namespace(database_name)
test_catalog.create_table(identifier, table_schema_nested)
table = test_catalog.load_table(identifier)
- assert table.identifier == identifier
+ assert table.identifier == (CATALOG_NAME,) + identifier
metadata_location = table.metadata_location.split(get_bucket_name())[1][1:]
s3.head_object(Bucket=get_bucket_name(), Key=metadata_location)
@@ -125,11 +126,11 @@ def test_rename_table(
new_table_name = f"rename-{table_name}"
identifier = (database_name, table_name)
table = test_catalog.create_table(identifier, table_schema_nested)
- assert table.identifier == identifier
+ assert table.identifier == (CATALOG_NAME,) + identifier
new_identifier = (new_database_name, new_table_name)
test_catalog.rename_table(identifier, new_identifier)
new_table = test_catalog.load_table(new_identifier)
- assert new_table.identifier == new_identifier
+ assert new_table.identifier == (CATALOG_NAME,) + new_identifier
assert new_table.metadata_location == table.metadata_location
metadata_location = new_table.metadata_location.split(get_bucket_name())[1][1:]
s3.head_object(Bucket=get_bucket_name(), Key=metadata_location)
@@ -141,7 +142,7 @@ def test_drop_table(test_catalog: Catalog,
table_schema_nested: Schema, table_na
identifier = (database_name, table_name)
test_catalog.create_namespace(database_name)
table = test_catalog.create_table(identifier, table_schema_nested)
- assert table.identifier == identifier
+ assert table.identifier == (CATALOG_NAME,) + identifier
test_catalog.drop_table(identifier)
with pytest.raises(NoSuchTableError):
test_catalog.load_table(identifier)
@@ -154,7 +155,7 @@ def test_purge_table(
test_catalog.create_namespace(database_name)
test_catalog.create_table(identifier, table_schema_nested)
table = test_catalog.load_table(identifier)
- assert table.identifier == identifier
+ assert table.identifier == (CATALOG_NAME,) + identifier
metadata_location = table.metadata_location.split(get_bucket_name())[1][1:]
s3.head_object(Bucket=get_bucket_name(), Key=metadata_location)
test_catalog.purge_table(identifier)
diff --git a/python/tests/catalog/test_glue.py b/python/tests/catalog/test_glue.py
index cf2e75c7d3..d217416697 100644
--- a/python/tests/catalog/test_glue.py
+++ b/python/tests/catalog/test_glue.py
@@ -38,11 +38,12 @@ from tests.conftest import BUCKET_NAME, TABLE_METADATA_LOCATION_REGEX
def test_create_table_with_database_location(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
- test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
+ test_catalog = GlueCatalog(catalog_name, **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
test_catalog.create_namespace(namespace=database_name,
properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db"})
table = test_catalog.create_table(identifier, table_schema_nested)
- assert table.identifier == identifier
+ assert table.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
@@ -50,11 +51,14 @@ def test_create_table_with_database_location(
def test_create_table_with_default_warehouse(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
- test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO", "warehouse": f"s3://{BUCKET_NAME}"})
+ test_catalog = GlueCatalog(
+ catalog_name, **{"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
"warehouse": f"s3://{BUCKET_NAME}"}
+ )
test_catalog.create_namespace(namespace=database_name)
table = test_catalog.create_table(identifier, table_schema_nested)
- assert table.identifier == identifier
+ assert table.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
@@ -62,13 +66,14 @@ def test_create_table_with_default_warehouse(
def test_create_table_with_given_location(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
- test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
+ test_catalog = GlueCatalog(catalog_name, **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
test_catalog.create_namespace(namespace=database_name)
table = test_catalog.create_table(
identifier=identifier, schema=table_schema_nested,
location=f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}"
)
- assert table.identifier == identifier
+ assert table.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
@@ -76,8 +81,9 @@ def test_create_table_with_given_location(
def test_create_table_with_no_location(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
- test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
+ test_catalog = GlueCatalog(catalog_name, **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
test_catalog.create_namespace(namespace=database_name)
with pytest.raises(ValueError):
test_catalog.create_table(identifier=identifier,
schema=table_schema_nested)
@@ -87,11 +93,12 @@ def test_create_table_with_no_location(
def test_create_table_with_strips(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
- test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
+ test_catalog = GlueCatalog(catalog_name, **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO"})
test_catalog.create_namespace(namespace=database_name,
properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db/"})
table = test_catalog.create_table(identifier, table_schema_nested)
- assert table.identifier == identifier
+ assert table.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
@@ -99,11 +106,12 @@ def test_create_table_with_strips(
def test_create_table_with_strips_bucket_root(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO", "warehouse": f"s3://{BUCKET_NAME}/"})
test_catalog.create_namespace(namespace=database_name)
table_strip = test_catalog.create_table(identifier, table_schema_nested)
- assert table_strip.identifier == identifier
+ assert table_strip.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table_strip.metadata_location)
@@ -133,12 +141,15 @@ def test_create_duplicated_table(
def test_load_table(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
- test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO", "warehouse": f"s3://{BUCKET_NAME}/"})
+ test_catalog = GlueCatalog(
+ catalog_name, **{"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
"warehouse": f"s3://{BUCKET_NAME}/"}
+ )
test_catalog.create_namespace(namespace=database_name)
test_catalog.create_table(identifier, table_schema_nested)
table = test_catalog.load_table(identifier)
- assert table.identifier == identifier
+ assert table.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
@@ -155,12 +166,15 @@ def test_load_non_exist_table(_bucket_initialize: None,
_patch_aiobotocore: None
def test_drop_table(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
identifier = (database_name, table_name)
- test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO", "warehouse": f"s3://{BUCKET_NAME}/"})
+ test_catalog = GlueCatalog(
+ catalog_name, **{"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO",
"warehouse": f"s3://{BUCKET_NAME}/"}
+ )
test_catalog.create_namespace(namespace=database_name)
test_catalog.create_table(identifier, table_schema_nested)
table = test_catalog.load_table(identifier)
- assert table.identifier == identifier
+ assert table.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
test_catalog.drop_table(identifier)
with pytest.raises(NoSuchTableError):
@@ -179,17 +193,18 @@ def test_drop_non_exist_table(_bucket_initialize: None,
_patch_aiobotocore: None
def test_rename_table(
_bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested:
Schema, database_name: str, table_name: str
) -> None:
+ catalog_name = "glue"
new_table_name = f"{table_name}_new"
identifier = (database_name, table_name)
new_identifier = (database_name, new_table_name)
test_catalog = GlueCatalog("glue", **{"py-io-impl":
"pyiceberg.io.fsspec.FsspecFileIO", "warehouse": f"s3://{BUCKET_NAME}/"})
test_catalog.create_namespace(namespace=database_name)
table = test_catalog.create_table(identifier, table_schema_nested)
- assert table.identifier == identifier
+ assert table.identifier == (catalog_name,) + identifier
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
test_catalog.rename_table(identifier, new_identifier)
new_table = test_catalog.load_table(new_identifier)
- assert new_table.identifier == new_identifier
+ assert new_table.identifier == (catalog_name,) + new_identifier
# the metadata_location should not change
assert new_table.metadata_location == table.metadata_location
# old table should be dropped
diff --git a/python/tests/catalog/test_hive.py b/python/tests/catalog/test_hive.py
index fee3a4f731..23bc0208b6 100644
--- a/python/tests/catalog/test_hive.py
+++ b/python/tests/catalog/test_hive.py
@@ -390,7 +390,7 @@ def test_load_table(hive_table: HiveTable) -> None:
last_sequence_number=34,
)
- assert table.identifier == ("default", "new_tabl2e")
+ assert table.identifier == (HIVE_CATALOG_NAME, "default", "new_tabl2e")
assert expected == table.metadata