Fokko commented on code in PR #8374:
URL: https://github.com/apache/iceberg/pull/8374#discussion_r1306713188
##########
python/pyiceberg/table/__init__.py:
##########
@@ -934,206 +963,611 @@ def case_sensitive(self, case_sensitive: bool) ->
UpdateSchema:
return self
def add_column(
- self, name: str, type_var: IcebergType, doc: Optional[str] = None,
parent: Optional[str] = None, required: bool = False
+ self, path: Union[str, Tuple[str, ...]], field_type: IcebergType, doc:
Optional[str] = None, required: bool = False
) -> UpdateSchema:
"""Add a new column to a nested struct or Add a new top-level column.
Args:
- name: Name for the new column.
- type_var: Type for the new column.
+ path: Name for the new column.
+ field_type: Type for the new column.
doc: Documentation string for the new column.
- parent: Name of the parent struct to the column will be added to.
required: Whether the new column is required.
Returns:
- This for method chaining
+ This for method chaining.
"""
- if "." in name:
- raise ValueError(f"Cannot add column with ambiguous name: {name}")
+ path = (path,) if isinstance(path, str) else path
+
+ if "." in path[-1]:
+ raise ValueError(f"Cannot add column with ambiguous name:
{path[-1]}, provide a tuple instead")
if required and not self._allow_incompatible_changes:
# Table format version 1 and 2 cannot add required column because
there is no initial value
- raise ValueError(f"Incompatible change: cannot add required
column: {name}")
+ raise ValueError(f'Incompatible change: cannot add required
column: {".".join(path)}')
+
+ name = path[-1]
+ parent = path[:-1]
+
+ full_name = ".".join(path)
+ parent_id: int = TABLE_ROOT_ID
+
+ if len(parent) > 0:
+ parent_field = self._schema.find_field(".".join(parent),
self._case_sensitive)
+ parent_type = parent_field.field_type
+ if isinstance(parent_type, MapType):
+ parent_field = parent_type.value_field
+ elif isinstance(parent_type, ListType):
+ parent_field = parent_type.element_field
+
+ if not parent_field.field_type.is_struct:
+ raise ValueError(f"Cannot add column '{name}' to non-struct
type: {'.'.join(parent)}")
+
+ parent_id = parent_field.field_id
+
+ exists = False
+ try:
+ exists = self._schema.find_field(full_name, self._case_sensitive)
is not None
+ except ValueError:
+ pass
+
+ if exists:
+ raise ValueError(f"Cannot add column, name already exists:
{full_name}")
+
+ # assign new IDs in order
+ new_id = self.assign_new_column_id()
+
+ # update tracking for moves
+ self._added_name_to_id[full_name] = new_id
+
+ new_type = assign_fresh_schema_ids(field_type,
self.assign_new_column_id)
+ field = NestedField(field_id=new_id, name=name, field_type=new_type,
required=required, doc=doc)
+
+ self._adds[parent_id] = self._adds.get(parent_id, []) + [field]
Review Comment:
I'm open to making it more readable, but that doesn't work on my machine:
```python
➜ python git:(fd-followup) python3
Python 3.11.4 (main, Jul 25 2023, 17:36:13) [Clang 14.0.3
(clang-1403.0.22.14.1)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> tuple(*[1, 2], 3)
```
What does work is:
```python
self._adds[parent_id] = self._adds.get(parent_id, tuple()) + (field,)
```
But then we end up with the same pattern as before, only with a tuple instead
of a list. I would suggest refactoring it as:
```python
if parent_id in self._adds:
    self._adds[parent_id].append(field)
else:
    self._adds[parent_id] = [field]
```
Then we don't recreate the collections all the time (but this isn't the
hottest path, probably)
##########
python/pyiceberg/table/__init__.py:
##########
@@ -934,206 +963,611 @@ def case_sensitive(self, case_sensitive: bool) ->
UpdateSchema:
return self
def add_column(
- self, name: str, type_var: IcebergType, doc: Optional[str] = None,
parent: Optional[str] = None, required: bool = False
+ self, path: Union[str, Tuple[str, ...]], field_type: IcebergType, doc:
Optional[str] = None, required: bool = False
) -> UpdateSchema:
"""Add a new column to a nested struct or Add a new top-level column.
Args:
- name: Name for the new column.
- type_var: Type for the new column.
+ path: Name for the new column.
+ field_type: Type for the new column.
doc: Documentation string for the new column.
- parent: Name of the parent struct to the column will be added to.
required: Whether the new column is required.
Returns:
- This for method chaining
+ This for method chaining.
"""
- if "." in name:
- raise ValueError(f"Cannot add column with ambiguous name: {name}")
+ path = (path,) if isinstance(path, str) else path
+
+ if "." in path[-1]:
+ raise ValueError(f"Cannot add column with ambiguous name:
{path[-1]}, provide a tuple instead")
if required and not self._allow_incompatible_changes:
# Table format version 1 and 2 cannot add required column because
there is no initial value
- raise ValueError(f"Incompatible change: cannot add required
column: {name}")
+ raise ValueError(f'Incompatible change: cannot add required
column: {".".join(path)}')
+
+ name = path[-1]
+ parent = path[:-1]
+
+ full_name = ".".join(path)
+ parent_id: int = TABLE_ROOT_ID
+
+ if len(parent) > 0:
+ parent_field = self._schema.find_field(".".join(parent),
self._case_sensitive)
+ parent_type = parent_field.field_type
+ if isinstance(parent_type, MapType):
+ parent_field = parent_type.value_field
+ elif isinstance(parent_type, ListType):
+ parent_field = parent_type.element_field
+
+ if not parent_field.field_type.is_struct:
+ raise ValueError(f"Cannot add column '{name}' to non-struct
type: {'.'.join(parent)}")
+
+ parent_id = parent_field.field_id
+
+ exists = False
+ try:
+ exists = self._schema.find_field(full_name, self._case_sensitive)
is not None
+ except ValueError:
+ pass
+
+ if exists:
+ raise ValueError(f"Cannot add column, name already exists:
{full_name}")
+
+ # assign new IDs in order
+ new_id = self.assign_new_column_id()
+
+ # update tracking for moves
+ self._added_name_to_id[full_name] = new_id
+
+ new_type = assign_fresh_schema_ids(field_type,
self.assign_new_column_id)
+ field = NestedField(field_id=new_id, name=name, field_type=new_type,
required=required, doc=doc)
+
+ self._adds[parent_id] = self._adds.get(parent_id, []) + [field]
Review Comment:
I'm open to making it more readable, but that doesn't work on my machine:
```python
➜ python git:(fd-followup) python3
Python 3.11.4 (main, Jul 25 2023, 17:36:13) [Clang 14.0.3
(clang-1403.0.22.14.1)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> tuple(*[1, 2], 3)
```
What does work is:
```python
self._adds[parent_id] = self._adds.get(parent_id, tuple()) + (field,)
```
But then we end up with the same pattern as before, only with a tuple instead
of a list. I would suggest refactoring it as:
```python
if parent_id in self._adds:
    self._adds[parent_id].append(field)
else:
    self._adds[parent_id] = [field]
```
Then we don't recreate the collections all the time (but this isn't the
hottest path, probably)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]