c-thiel commented on code in PR #2188:
URL: https://github.com/apache/iceberg-rust/pull/2188#discussion_r3363021140
##########
crates/iceberg/src/spec/schema/mod.rs:
##########
@@ -53,6 +54,9 @@ pub type SchemaRef = Arc<Schema>;
pub const DEFAULT_SCHEMA_ID: SchemaId = 0;
/// Delimiter for schema name, which denotes a nested struct.
pub const SCHEMA_NAME_DELIMITER: &str = ".";
+/// Minimum format version that allows non-null field default values.
+/// Mirrors Java's `Schema.DEFAULT_VALUES_MIN_FORMAT_VERSION`; enforced for `initial_default` values.
+pub const MIN_FORMAT_VERSION_DEFAULT_VALUES: FormatVersion = FormatVersion::V3;
Review Comment:
Applied. That said, I'd lean toward keeping spec-defined version floors like
this one `pub`. iceberg-rust is an SDK for empowered users — I don't think we
should be overly protective with visibility. Downstream catalogs/engines
(Lakekeeper, for us) that gate default-value writes on format version would
otherwise have to re-declare this constant locally.
##########
crates/iceberg/src/spec/schema/mod.rs:
##########
@@ -421,6 +425,85 @@ impl Schema {
pub fn field_id_to_fields(&self) -> &HashMap<i32, NestedFieldRef> {
&self.id_to_field
}
+
+ /// Minimum [`FormatVersion`] required to represent all *types* in this
schema.
+ ///
+ /// Types only; for initial-default version floors see
[`Schema::check_format_compatibility`].
+ pub fn min_format_version(&self) -> FormatVersion {
+ // `id_to_field` is flattened, so the max over all fields covers
nested ones too.
+ self.id_to_field
+ .values()
+ .map(|f| leaf_min_format_version(&f.field_type))
+ .max()
+ .unwrap_or(FormatVersion::V1)
+ }
+
+ /// Returns an error listing every field incompatible with
`format_version`.
+ /// Mirrors Java's `Schema.checkCompatibility()`. Two checks per field:
+ ///
+ /// - **Type** — per `leaf_min_format_version`.
+ /// - **Initial default** — a non-null `initial_default` backfills
pre-existing rows,
+ /// so it requires [`MIN_FORMAT_VERSION_DEFAULT_VALUES`];
`write_default` is not
+ /// checked, as it only affects newly written rows (read identically at
any version).
+ pub fn check_format_compatibility(&self, format_version: FormatVersion) ->
Result<()> {
+ // (field id, message); sorted by id below for a deterministic error.
+ let mut problems: Vec<(i32, String)> = Vec::new();
+
+ // `id_to_field` is flattened, so checking each field by its own type
keeps the
+ // blame on the offending leaf, not its container (mirrors Java's
`lazyIdToField`).
+ for field in self.id_to_field.values() {
+ let min_version = leaf_min_format_version(&field.field_type);
+ if format_version < min_version {
+ let name = self
+ .name_by_field_id(field.id)
+ .unwrap_or(field.name.as_str());
+ problems.push((field.id, format!(
+ "Invalid type for {name}: {} is not supported until
{min_version} but format version is {format_version}.",
+ field.field_type,
+ )));
+ }
+
+ if let Some(default) = &field.initial_default
+ && format_version < MIN_FORMAT_VERSION_DEFAULT_VALUES
+ {
+ let name = self
+ .name_by_field_id(field.id)
+ .unwrap_or(field.name.as_str());
+ problems.push((field.id, format!(
+ "Invalid initial default for {name}: non-null default
({default:?}) is not supported until {MIN_FORMAT_VERSION_DEFAULT_VALUES} but
format version is {format_version}."
+ )));
+ }
+ }
+
+ if problems.is_empty() {
+ return Ok(());
+ }
+
+ // Stable sort by id: HashMap order is nondeterministic, and stability
keeps a
+ // field's type problem before its default problem (matches Java's
TreeMap order).
+ let message = problems
+ .into_iter()
+ .sorted_by_key(|(id, _)| *id)
+ .map(|(_, msg)| msg)
+ .join("\n- ");
+ Err(Error::new(
+ ErrorKind::DataInvalid,
+ format!("Invalid schema for {format_version}:\n- {message}"),
+ ))
+ }
+}
+
+/// Minimum [`FormatVersion`] required by a type itself, ignoring nested fields.
+///
+/// `TimestampNs` / `TimestamptzNs` / `Variant` require v3; everything else (including
+/// nested types, validated per-leaf elsewhere) is valid from v1. Single source of truth
+/// for the type version rules, mirroring Java's `Schema.MIN_FORMAT_VERSIONS`.
+fn leaf_min_format_version(field_type: &Type) -> FormatVersion {
Review Comment:
Moved the shallow per-type rule onto `Type::min_format_version` — just like
it was originally, but with the recursion fix from the last review.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]