rdblue commented on code in PR #4685:
URL: https://github.com/apache/iceberg/pull/4685#discussion_r871462282
##########
python/src/iceberg/schema.py:
##########
@@ -497,31 +530,78 @@ def index_name_by_id(schema_or_type) -> Dict[int, str]:
return indexer.by_id()
-class _BuildPositionAccessors(SchemaVisitor[Dict[int, "Accessor"]]):
- """A schema visitor for generating a field ID to accessor index"""
+Position = int
- def __init__(self) -> None:
- self._index: Dict[int, Accessor] = {}
- def schema(self, schema, result: Dict[int, Accessor]) -> Dict[int,
Accessor]:
- return self._index
+class _BuildPositionAccessors(SchemaVisitor[Dict[Position, Accessor]]):
+ """A schema visitor for generating a field ID to accessor index
- def struct(self, struct, result: List[Dict[int, Accessor]]) -> Dict[int,
Accessor]:
- # TODO: Populate the `self._index` dictionary where the key is the
field ID and the value is an accessor for that field.
- # The equivalent java logic can be found here:
https://github.com/apache/iceberg/blob/master/api/src/main/java/org/apache/iceberg/Accessors.java#L213-L230
- return self._index
+ Example:
+ >>> from iceberg.schema import Schema
+ >>> from iceberg.types import *
+ >>> schema = Schema(
+ ... NestedField(field_id=2, name="id", field_type=IntegerType(),
is_optional=False),
+ ... NestedField(field_id=1, name="data", field_type=StringType(),
is_optional=True),
+ ... NestedField(
+ ... field_id=3,
+ ... name="location",
+ ... field_type=StructType(
+ ... NestedField(field_id=5, name="latitude",
field_type=FloatType(), is_optional=False),
+ ... NestedField(field_id=6, name="longitude",
field_type=FloatType(), is_optional=False),
+ ... ),
+ ... is_optional=True,
+ ... ),
+ ... schema_id=1,
+ ... identifier_field_ids=[1],
+ ... )
+ >>> result = build_position_accessors(schema)
+ >>> expected = {
+ ... 2: Accessor(position=0, inner=None),
+ ... 1: Accessor(position=1, inner=None),
+ ... 5: Accessor(position=2, inner=Accessor(position=0,
inner=None)),
+ ... 6: Accessor(position=2, inner=Accessor(position=1,
inner=None)),
+ ... }
+ >>> result == expected
+ True
+ """
- def field(self, field: NestedField, result: Dict[int, Accessor]) ->
Dict[int, Accessor]:
- return self._index
+ @staticmethod
+ def _wrap_leaves(result: Dict[Position, Accessor], position: Position = 0)
-> Dict[Position, Accessor]:
+ return {field_id: Accessor(position, inner=inner) for field_id, inner
in result.items()}
- def list(self, list_type: ListType, result: Dict[int, Accessor]) ->
Dict[int, Accessor]:
- return self._index
+ def schema(self, schema: Schema, result: Dict[Position, Accessor]) ->
Dict[Position, Accessor]:
+ return result
- def map(self, map_type: MapType, key_result: Dict[int, Accessor],
value_result: Dict[int, Accessor]) -> Dict[int, Accessor]:
- return self._index
+ def struct(self, struct: StructType, field_results: List[Dict[Position,
Accessor]]) -> Dict[Position, Accessor]:
+ result = {}
- def primitive(self, primitive: PrimitiveType) -> Dict[int, Accessor]:
- return self._index
+ for position, field in enumerate(struct.fields):
+ if field_results[position]:
Review Comment:
Since `list`, `map`, and `primitive` all return a map, is the `else` case ever
used? In the JVM version it is used for all 3 cases, but I think here it is
probably only used for the `primitive` case, because the accessor map is empty
there, right?
I think that is correct (if an empty `{}` takes the `else` branch), but I don't
think there's a need to return accessors for fields inside maps or lists,
because we don't really know how to handle repeated elements. We could extend
this later so that a repeated layer returns a tuple of the values, but right
now we don't really need that.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]