[GitHub] [iceberg] Fokko commented on a diff in pull request #4706: Python: Support iceberg base catalog in python library (#3245)

GitBox Tue, 24 May 2022 10:57:56 -0700


Fokko commented on code in PR #4706:
URL: https://github.com/apache/iceberg/pull/4706#discussion_r880799692



##########
python/tests/catalog/test_base.py:
##########
@@ -0,0 +1,401 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from typing import Dict, List, Optional, Set, Union
+
+import pytest
+
+from iceberg.catalog.base import Catalog, Identifier, Properties
+from iceberg.exceptions import (
+    AlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchNamespaceError,
+    NoSuchTableError,
+)
+from iceberg.schema import Schema
+from iceberg.table.base import PartitionSpec, Table
+
+
+class InMemoryCatalog(Catalog):
+    """An in-memory catalog implementation for testing purposes."""
+
+    __tables: Dict[Identifier, Table]
+    __namespaces: Dict[Identifier, Properties]
+
+    def __init__(self, name: str, properties: Properties):
+        super().__init__(name, properties)
+        self.__tables = {}
+        self.__namespaces = {}
+
+    def create_table(
+        self,
+        identifier: Union[str, Identifier],
+        schema: Schema,
+        location: Optional[str] = None,
+        partition_spec: Optional[PartitionSpec] = None,
+        properties: Optional[Properties] = None,
+    ) -> Table:
+
+        identifier = Catalog.identifier_to_tuple(identifier)
+        namespace = Catalog.namespace_from(identifier)
+
+        if identifier in self.__tables:
+            raise AlreadyExistsError(f"Table already exists: {identifier}")
+        else:
+            if namespace not in self.__namespaces:
+                self.__namespaces[namespace] = {}
+
+            table = Table()
+            table.identifier = identifier
+            self.__tables[identifier] = table
+            return table
+
+    def load_table(self, identifier: Union[str, Identifier]) -> Table:
+        identifier = Catalog.identifier_to_tuple(identifier)
+        try:
+            return self.__tables[identifier]
+        except KeyError as error:
+            raise NoSuchTableError(f"Table does not exist: {identifier}") from 
error
+
+    def drop_table(self, identifier: Union[str, Identifier]) -> None:
+        identifier = Catalog.identifier_to_tuple(identifier)
+        try:
+            self.__tables.pop(identifier)
+        except KeyError as error:
+            raise NoSuchTableError(f"Table does not exist: {identifier}") from 
error
+
+    def purge_table(self, identifier: Union[str, Identifier]) -> None:
+        self.drop_table(identifier)
+
+    def rename_table(self, from_identifier: Union[str, Identifier], 
to_identifier: Union[str, Identifier]) -> Table:
+        from_identifier = Catalog.identifier_to_tuple(from_identifier)
+        try:
+            table = self.__tables.pop(from_identifier)
+        except KeyError as error:
+            raise NoSuchTableError(f"Table does not exist: {from_identifier}") 
from error
+
+        to_identifier = Catalog.identifier_to_tuple(to_identifier)
+        to_namespace = Catalog.namespace_from(to_identifier)
+        if to_namespace not in self.__namespaces:
+            self.__namespaces[to_namespace] = {}
+
+        table.identifier = to_identifier
+        self.__tables[to_identifier] = table
+        return table
+
+    def create_namespace(self, namespace: Union[str, Identifier], properties: 
Optional[Properties] = None) -> None:
+        namespace = Catalog.identifier_to_tuple(namespace)
+        if namespace in self.__namespaces:
+            raise AlreadyExistsError(f"Namespace already exists: {namespace}")
+        else:
+            self.__namespaces[namespace] = properties if properties else {}

Review Comment:
   Less is more:
   ```suggestion
               self.__namespaces[namespace] = properties or {}
   ```



##########
python/tests/catalog/test_base.py:
##########
@@ -0,0 +1,401 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from typing import Dict, List, Optional, Set, Union
+
+import pytest
+
+from iceberg.catalog.base import Catalog, Identifier, Properties
+from iceberg.exceptions import (
+    AlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchNamespaceError,
+    NoSuchTableError,
+)
+from iceberg.schema import Schema
+from iceberg.table.base import PartitionSpec, Table
+
+
+class InMemoryCatalog(Catalog):
+    """An in-memory catalog implementation for testing purposes."""
+
+    __tables: Dict[Identifier, Table]

Review Comment:
   I think a single underscore is more appropriate here. It is an internal 
variable of the `InMemoryCatalog`.
   
   If we want to set this as a ClassVar, then we also have to annotation for it:
   ```suggestion
       _tables: ClassVar[Dict[Identifier, Table]]
   ```
   But I'm not sure if we want to do that since we would like to instantiate a 
fresh InMemoryCatalog for the different tests.



##########
python/src/iceberg/catalog/base.py:
##########
@@ -0,0 +1,247 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+from iceberg.schema import Schema
+from iceberg.table.base import PartitionSpec, Table
+
+Identifier = Tuple[str, ...]
+Properties = Dict[str, str]
+
+
+class Catalog(ABC):
+    """Base Catalog for table operations like - create, drop, load, list and 
others.
+
+    The catalog table APIs accept a table identifier, which is fully 
classified table name. The identifier can be a string or
+    tuple of strings. If the identifier is a string, it is split into a tuple 
on '.'. If it is a tuple, it is used as-is.
+
+    The catalog namespace APIs follow a similar convention wherein they also 
accept a namespace identifier that can be a string
+    or tuple of strings.
+
+    Attributes:
+        name(str): Name of the catalog
+        properties(Properties): Catalog properties
+    """
+
+    def __init__(self, name: str, properties: Properties):

Review Comment:
   I'll continue my quest for data classes here as well. Instead of creating 
the getters/setters, we could also let the dataclasses do this:
   ```python
   @dataclass
   class Catalog(ABC):
   
       name: str =  field()
       name: Properties =  field(default_factory=dict)
   ```
   This will automatically generate the getters and setters.



##########
python/src/iceberg/catalog/base.py:
##########
@@ -0,0 +1,247 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+from iceberg.schema import Schema
+from iceberg.table.base import PartitionSpec, Table
+
+Identifier = Tuple[str, ...]
+Properties = Dict[str, str]
+
+
+class Catalog(ABC):
+    """Base Catalog for table operations like - create, drop, load, list and 
others.
+
+    The catalog table APIs accept a table identifier, which is fully 
classified table name. The identifier can be a string or
+    tuple of strings. If the identifier is a string, it is split into a tuple 
on '.'. If it is a tuple, it is used as-is.
+
+    The catalog namespace APIs follow a similar convention wherein they also 
accept a namespace identifier that can be a string
+    or tuple of strings.
+
+    Attributes:
+        name(str): Name of the catalog
+        properties(Properties): Catalog properties
+    """
+
+    def __init__(self, name: str, properties: Properties):
+        self._name = name
+        self._properties = properties
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @property
+    def properties(self) -> Properties:
+        return self._properties
+
+    @abstractmethod
+    def create_table(
+        self,
+        identifier: Union[str, Identifier],
+        schema: Schema,
+        location: Optional[str] = None,
+        partition_spec: Optional[PartitionSpec] = None,
+        properties: Optional[Properties] = None,
+    ) -> Table:
+        """Create a table
+
+        Args:
+            identifier: Table identifier.
+            schema: Table's schema.
+            location: Location for the table. Optional Argument.
+            partition_spec: PartitionSpec for the table. Optional Argument.
+            properties: Table properties that can be a string based 
dictionary. Optional Argument.
+
+        Returns:
+            Table: the created table instance
+
+        Raises:
+            AlreadyExistsError: If a table with the name already exists
+        """
+
+    @abstractmethod
+    def load_table(self, identifier: Union[str, Identifier]) -> Table:
+        """Loads the table's metadata and returns the table instance.
+
+        You can also use this method to check for table existence using 'try 
catalog.table() except TableNotFoundError'
+        Note: This method doesn't scan data stored in the table.
+
+        Args:
+            identifier: Table identifier.
+
+        Returns:
+            Table: the table instance with its metadata
+
+        Raises:
+            TableNotFoundError: If a table with the name does not exist
+        """
+
+    @abstractmethod
+    def drop_table(self, identifier: Union[str, Identifier]) -> None:
+        """Drop a table.
+
+        Args:
+            identifier: Table identifier.
+
+        Raises:
+            TableNotFoundError: If a table with the name does not exist
+        """
+
+    @abstractmethod
+    def purge_table(self, identifier: Union[str, Identifier]) -> None:
+        """Drop a table and purge all data and metadata files.
+
+        Args:
+            identifier: Table identifier.
+
+        Raises:
+            TableNotFoundError: If a table with the name does not exist
+        """
+
+    @abstractmethod
+    def rename_table(self, from_identifier: Union[str, Identifier], 
to_identifier: Union[str, Identifier]) -> Table:
+        """Rename a fully classified table name
+
+        Args:
+            from_identifier: Existing table identifier.
+            to_identifier: New table identifier.
+
+        Returns:
+            Table: the updated table instance with its metadata
+
+        Raises:
+            TableNotFoundError: If a table with the name does not exist
+        """
+
+    @abstractmethod
+    def create_namespace(self, namespace: Union[str, Identifier], properties: 
Optional[Properties] = None) -> None:
+        """Create a namespace in the catalog.
+
+        Args:
+            namespace: Namespace identifier
+            properties: A string dictionary of properties for the given 
namespace
+
+        Raises:
+            AlreadyExistsError: If a namespace with the given name already 
exists
+        """
+
+    @abstractmethod
+    def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
+        """Drop a namespace.
+
+        Args:
+            namespace: Namespace identifier
+
+        Raises:
+            NamespaceNotFoundError: If a namespace with the given name does 
not exist
+            NamespaceNotEmptyError: If the namespace is not empty
+        """
+
+    @abstractmethod
+    def list_tables(self, namespace: Optional[Union[str, Identifier]] = None) 
-> List[Identifier]:
+        """List tables under the given namespace in the catalog.
+
+        If namespace not provided, will list all tables in the catalog.
+
+        Args:
+            namespace: Namespace identifier to search.
+
+        Returns:
+            List[Identifier]: list of table identifiers.
+
+        Raises:
+            NamespaceNotFoundError: If a namespace with the given name does 
not exist
+        """
+
+    @abstractmethod
+    def list_namespaces(self) -> List[Identifier]:
+        """List namespaces from the given namespace. If not given, list 
top-level namespaces from the catalog.
+
+        Returns:
+            List[Identifier]: a List of namespace identifiers
+        """
+
+    @abstractmethod
+    def load_namespace_properties(self, namespace: Union[str, Identifier]) -> 
Properties:
+        """Get properties for a namespace.
+
+        Args:
+            namespace: Namespace identifier
+
+        Returns:
+            Properties: Properties for the given namespace
+
+        Raises:
+            NamespaceNotFoundError: If a namespace with the given name does 
not exist
+        """
+
+    @abstractmethod
+    def update_namespace_properties(
+        self, namespace: Union[str, Identifier], removals: Optional[Set[str]] 
= None, updates: Optional[Properties] = None
+    ) -> None:
+        """Removes provided property keys and updates properties for a 
namespace.
+
+        Args:
+            namespace: Namespace identifier
+            removals: Set of property keys that need to be removed. Optional 
Argument.
+            updates: Properties to be updated for the given namespace. 
Optional Argument.
+
+        Raises:
+            NamespaceNotFoundError: If a namespace with the given name does 
not exist
+            ValueError: If removals and updates have overlapping keys.
+        """
+
+    @staticmethod
+    def identifier_to_tuple(identifier: Union[str, Identifier]) -> Identifier:

Review Comment:
   Naming things is one of the hardest things in Computer Science, but I find 
this one a bit awkward since:
   ```python
   Identifier = Tuple[str, ...]
   ```
   So it is actually already a tuple. How about:
   ```suggestion
       def to_identifier(identifier: Union[str, Identifier]) -> Identifier:
   ```



##########
python/src/iceberg/catalog/base.py:
##########
@@ -0,0 +1,247 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+from iceberg.schema import Schema
+from iceberg.table.base import PartitionSpec, Table
+
+Identifier = Tuple[str, ...]
+Properties = Dict[str, str]
+
+
+class Catalog(ABC):
+    """Base Catalog for table operations like - create, drop, load, list and 
others.
+
+    The catalog table APIs accept a table identifier, which is fully 
classified table name. The identifier can be a string or
+    tuple of strings. If the identifier is a string, it is split into a tuple 
on '.'. If it is a tuple, it is used as-is.
+
+    The catalog namespace APIs follow a similar convention wherein they also 
accept a namespace identifier that can be a string
+    or tuple of strings.
+
+    Attributes:
+        name(str): Name of the catalog
+        properties(Properties): Catalog properties
+    """
+
+    def __init__(self, name: str, properties: Properties):

Review Comment:
   If we want to go full blown pythonic in the future, we could also validate 
these using Pydantic validators: 
https://pydantic-docs.helpmanual.io/usage/validators/



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] Fokko commented on a diff in pull request #4706: Python: Support iceberg base catalog in python library (#3245)

Reply via email to