[GitHub] [iceberg] HonahX commented on a diff in pull request #8622: Python: ManifestWriter and ManifestListWriter

via GitHub Sat, 23 Sep 2023 01:23:35 -0700


HonahX commented on code in PR #8622:
URL: https://github.com/apache/iceberg/pull/8622#discussion_r1334942457



##########
python/pyiceberg/manifest.py:
##########
@@ -79,88 +95,107 @@ def __repr__(self) -> str:
         return f"FileFormat.{self.name}"
 
 
-DATA_FILE_TYPE = StructType(
-    NestedField(
-        field_id=134,
-        name="content",
-        field_type=IntegerType(),
-        required=False,
-        doc="Contents of the file: 0=data, 1=position deletes, 2=equality 
deletes",
-        initial_default=DataFileContent.DATA,
-    ),
-    NestedField(field_id=100, name="file_path", field_type=StringType(), 
required=True, doc="Location URI with FS scheme"),
-    NestedField(
-        field_id=101, name="file_format", field_type=StringType(), 
required=True, doc="File format name: avro, orc, or parquet"
-    ),
-    NestedField(
-        field_id=102,
-        name="partition",
-        field_type=StructType(),
-        required=True,
-        doc="Partition data tuple, schema based on the partition spec",
-    ),
-    NestedField(field_id=103, name="record_count", field_type=LongType(), 
required=True, doc="Number of records in the file"),
-    NestedField(field_id=104, name="file_size_in_bytes", 
field_type=LongType(), required=True, doc="Total file size in bytes"),
-    NestedField(
-        field_id=108,
-        name="column_sizes",
-        field_type=MapType(key_id=117, key_type=IntegerType(), value_id=118, 
value_type=LongType()),
-        required=False,
-        doc="Map of column id to total size on disk",
-    ),
-    NestedField(
-        field_id=109,
-        name="value_counts",
-        field_type=MapType(key_id=119, key_type=IntegerType(), value_id=120, 
value_type=LongType()),
-        required=False,
-        doc="Map of column id to total count, including null and NaN",
-    ),
-    NestedField(
-        field_id=110,
-        name="null_value_counts",
-        field_type=MapType(key_id=121, key_type=IntegerType(), value_id=122, 
value_type=LongType()),
-        required=False,
-        doc="Map of column id to null value count",
-    ),
-    NestedField(
-        field_id=137,
-        name="nan_value_counts",
-        field_type=MapType(key_id=138, key_type=IntegerType(), value_id=139, 
value_type=LongType()),
-        required=False,
-        doc="Map of column id to number of NaN values in the column",
-    ),
-    NestedField(
-        field_id=125,
-        name="lower_bounds",
-        field_type=MapType(key_id=126, key_type=IntegerType(), value_id=127, 
value_type=BinaryType()),
-        required=False,
-        doc="Map of column id to lower bound",
-    ),
-    NestedField(
-        field_id=128,
-        name="upper_bounds",
-        field_type=MapType(key_id=129, key_type=IntegerType(), value_id=130, 
value_type=BinaryType()),
-        required=False,
-        doc="Map of column id to upper bound",
-    ),
-    NestedField(field_id=131, name="key_metadata", field_type=BinaryType(), 
required=False, doc="Encryption key metadata blob"),
-    NestedField(
-        field_id=132,
-        name="split_offsets",
-        field_type=ListType(element_id=133, element_type=LongType(), 
element_required=True),
-        required=False,
-        doc="Splittable offsets",
-    ),
-    NestedField(
-        field_id=135,
-        name="equality_ids",
-        field_type=ListType(element_id=136, element_type=LongType(), 
element_required=True),
-        required=False,
-        doc="Equality comparison field IDs",
-    ),
-    NestedField(field_id=140, name="sort_order_id", field_type=IntegerType(), 
required=False, doc="Sort order ID"),
-    NestedField(field_id=141, name="spec_id", field_type=IntegerType(), 
required=False, doc="Partition spec ID"),
-)
+def data_file_type(partition_type: StructType) -> StructType:

Review Comment:
   Migrated review:
   > I'm not a fan of this one, but I see why it is necessary. For reading, we 
can override certain field IDs:
   > 
   > 
https://github.com/apache/iceberg/blob/e389e4d139624a49729379acd330dd9c96187b04/python/pyiceberg/manifest.py#L319-L335
   > 
   > We could do the same when writing. We can override field-id `102` when 
constructing the writer. WDYT?
   
   



##########
python/pyiceberg/manifest.py:
##########
@@ -265,6 +306,64 @@ def __init__(self, *data: Any, **named_data: Any) -> None:
         super().__init__(*data, **{"struct": PARTITION_FIELD_SUMMARY_TYPE, 
**named_data})
 
 
+class PartitionFieldStats:
+    _type: PrimitiveType
+    _contains_null: bool
+    _contains_nan: bool
+    _min: Optional[Any]
+    _max: Optional[Any]
+
+    def __init__(self, iceberg_type: IcebergType) -> None:
+        if not isinstance(iceberg_type, PrimitiveType):
+            raise ValueError(f"Expected a primitive type for the partition 
field, got {iceberg_type}")
+        self._type = iceberg_type
+        self._contains_null = False
+        self._contains_nan = False
+        self._min = None
+        self._max = None
+
+    def to_summary(self) -> PartitionFieldSummary:
+        return PartitionFieldSummary(
+            contains_null=self._contains_null,
+            contains_nan=self._contains_nan,
+            lower_bound=to_bytes(self._type, self._min) if self._min is not 
None else None,
+            upper_bound=to_bytes(self._type, self._max) if self._max is not 
None else None,
+        )
+
+    def update(self, value: Any) -> None:
+        if value is None:
+            self._contains_null = True
+        elif math.isnan(value):
+            self._contains_nan = True
+        else:
+            if self._min is None:
+                self._min = value
+                self._max = value
+            else:
+                self._max = max(self._max, value)
+                self._min = min(self._min, value)
+
+
+class PartitionSummary:
+    _field_stats: List[PartitionFieldStats]
+    _types: List[IcebergType]
+
+    def __init__(self, spec: PartitionSpec, schema: Schema):
+        self._types = [field.field_type for field in 
spec.partition_type(schema).fields]
+        self._field_stats = [PartitionFieldStats(field_type) for field_type in 
self._types]
+
+    def summaries(self) -> List[PartitionFieldSummary]:
+        return [field.to_summary() for field in self._field_stats]
+
+    def update(self, partition_keys: Record) -> PartitionSummary:

Review Comment:
   Migrated Review
   > More on a meta-level. Instead of this, and the class above, I would 
probably write a function to convert `PartitionSpec`'s to PartitionSummaries. I 
think that's more Python (and for me also easier to follow, but that's super 
personal of course).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] HonahX commented on a diff in pull request #8622: Python: ManifestWriter and ManifestListWriter

Reply via email to