laskoviymishka commented on code in PR #984:
URL: https://github.com/apache/iceberg-go/pull/984#discussion_r3200057360


##########
schema.go:
##########
@@ -567,6 +567,8 @@ type SchemaVisitorPerPrimitiveType[T any] interface {
        VisitBinary() T
        VisitUUID() T
        VisitUnknown() T
+       VisitGeometry(GeometryType) T

Review Comment:
   These methods get added to the interface but `visitField` (~671–720) doesn't 
dispatch them. `GeometryType`/`GeographyType` are `PrimitiveType`s, so they 
fall through to `visitor.Primitive(...)`, which both visitors panic from. 
`SchemaToArrowSchema` on any geo column will panic at runtime today. I'd add 
the two cases in the dispatcher and a regression test that runs 
`iceberg.Visit(geoSchema, convertToArrow{})`.



##########
types.go:
##########
@@ -777,6 +824,145 @@ func (UnknownType) primitive()     {}
 func (UnknownType) Type() string   { return "unknown" }
 func (UnknownType) String() string { return "unknown" }
 
+type EdgeAlgorithm string
+
+const (
+       EdgeAlgorithmSpherical EdgeAlgorithm = "spherical"
+       EdgeAlgorithmVincenty  EdgeAlgorithm = "vincenty"
+       EdgeAlgorithmThomas    EdgeAlgorithm = "thomas"
+       EdgeAlgorithmAndoyer   EdgeAlgorithm = "andoyer"
+       EdgeAlgorithmKarney    EdgeAlgorithm = "karney"
+)
+
+func ParseEdgeAlgorithm(s string) (EdgeAlgorithm, error) {
+       switch strings.ToLower(s) {
+       case "spherical":
+               return EdgeAlgorithmSpherical, nil
+       case "vincenty":
+               return EdgeAlgorithmVincenty, nil
+       case "thomas":
+               return EdgeAlgorithmThomas, nil
+       case "andoyer":
+               return EdgeAlgorithmAndoyer, nil
+       case "karney":
+               return EdgeAlgorithmKarney, nil
+       default:
+               return "", fmt.Errorf("invalid edge interpolation algorithm: 
%s", s)
+       }
+}
+
+func (e EdgeAlgorithm) String() string {
+       return string(e)
+}
+
+const defaultGeoCRS = "OGC:CRS84"
+
+type GeometryType struct {
+       crs string
+}
+
+func GeometryTypeOf(crs string) (GeometryType, error) {
+       if crs == "" {
+               return GeometryType{}, errors.New("invalid CRS: (empty string)")
+       }
+       if strings.EqualFold(crs, defaultGeoCRS) {
+               return GeometryType{}, nil
+       }
+
+       return GeometryType{crs: crs}, nil
+}
+
+func (g GeometryType) CRS() string {
+       if g.crs == "" {
+               return defaultGeoCRS
+       }
+
+       return g.crs
+}
+
+func (g GeometryType) Equals(other Type) bool {
+       rhs, ok := other.(GeometryType)
+       if !ok {
+               return false
+       }
+
+       return g.crs == rhs.crs
+}
+
+func (GeometryType) primitive() {}
+func (g GeometryType) Type() string {
+       if g.crs == "" {
+               return "geometry"
+       }
+
+       return fmt.Sprintf("geometry(%s)", g.crs)
+}
+
+func (g GeometryType) String() string {
+       return g.Type()
+}
+
+type GeographyType struct {
+       crs       string
+       algorithm EdgeAlgorithm
+}
+
+func GeographyTypeOf(crs string, algorithm EdgeAlgorithm) (GeographyType, 
error) {
+       if crs == "" {
+               return GeographyType{}, errors.New("invalid CRS: (empty 
string)")
+       }
+
+       normalizedCRS := crs
+       if strings.EqualFold(crs, defaultGeoCRS) {
+               normalizedCRS = ""
+       }
+
+       return GeographyType{crs: normalizedCRS, algorithm: algorithm}, nil
+}
+
+func (g GeographyType) CRS() string {
+       if g.crs == "" {
+               return defaultGeoCRS
+       }
+
+       return g.crs
+}
+
+func (g GeographyType) Algorithm() EdgeAlgorithm {
+       return g.algorithm
+}
+
+func (g GeographyType) Equals(other Type) bool {
+       rhs, ok := other.(GeographyType)
+       if !ok {
+               return false
+       }
+
+       return g.crs == rhs.crs && g.algorithm == rhs.algorithm
+}
+
+func (GeographyType) primitive() {}
+func (g GeographyType) Type() string {

Review Comment:
   Spec default algorithm is `spherical`. Java treats it as null internally and 
emits canonical `"geography"` for `geography(OGC:CRS84, spherical)`. Here we 
re-emit the full string, so `GeographyTypeOf("OGC:CRS84", 
EdgeAlgorithmSpherical)` doesn't equal `GeographyType{}` and emits different 
JSON than Java for the same logical type — schema fingerprints will diverge. 
I'd canonicalize at construction (CRS84 + spherical → zero-value) and have 
`Algorithm()` return `EdgeAlgorithmSpherical` for the empty internal state, 
mirroring `CRS()`.



##########
types.go:
##########
@@ -777,6 +824,145 @@ func (UnknownType) primitive()     {}
 func (UnknownType) Type() string   { return "unknown" }
 func (UnknownType) String() string { return "unknown" }
 
+type EdgeAlgorithm string
+
+const (
+       EdgeAlgorithmSpherical EdgeAlgorithm = "spherical"
+       EdgeAlgorithmVincenty  EdgeAlgorithm = "vincenty"
+       EdgeAlgorithmThomas    EdgeAlgorithm = "thomas"
+       EdgeAlgorithmAndoyer   EdgeAlgorithm = "andoyer"
+       EdgeAlgorithmKarney    EdgeAlgorithm = "karney"
+)
+
+func ParseEdgeAlgorithm(s string) (EdgeAlgorithm, error) {
+       switch strings.ToLower(s) {
+       case "spherical":
+               return EdgeAlgorithmSpherical, nil
+       case "vincenty":
+               return EdgeAlgorithmVincenty, nil
+       case "thomas":
+               return EdgeAlgorithmThomas, nil
+       case "andoyer":
+               return EdgeAlgorithmAndoyer, nil
+       case "karney":
+               return EdgeAlgorithmKarney, nil
+       default:
+               return "", fmt.Errorf("invalid edge interpolation algorithm: 
%s", s)
+       }
+}
+
+func (e EdgeAlgorithm) String() string {
+       return string(e)
+}
+
+const defaultGeoCRS = "OGC:CRS84"
+
+type GeometryType struct {
+       crs string
+}
+
+func GeometryTypeOf(crs string) (GeometryType, error) {
+       if crs == "" {
+               return GeometryType{}, errors.New("invalid CRS: (empty string)")
+       }
+       if strings.EqualFold(crs, defaultGeoCRS) {
+               return GeometryType{}, nil
+       }
+
+       return GeometryType{crs: crs}, nil
+}
+
+func (g GeometryType) CRS() string {
+       if g.crs == "" {
+               return defaultGeoCRS
+       }
+
+       return g.crs
+}
+
+func (g GeometryType) Equals(other Type) bool {
+       rhs, ok := other.(GeometryType)
+       if !ok {
+               return false
+       }
+
+       return g.crs == rhs.crs
+}
+
+func (GeometryType) primitive() {}
+func (g GeometryType) Type() string {
+       if g.crs == "" {
+               return "geometry"
+       }
+
+       return fmt.Sprintf("geometry(%s)", g.crs)
+}
+
+func (g GeometryType) String() string {
+       return g.Type()
+}
+
+type GeographyType struct {
+       crs       string
+       algorithm EdgeAlgorithm
+}
+
+func GeographyTypeOf(crs string, algorithm EdgeAlgorithm) (GeographyType, 
error) {
+       if crs == "" {
+               return GeographyType{}, errors.New("invalid CRS: (empty 
string)")
+       }
+
+       normalizedCRS := crs
+       if strings.EqualFold(crs, defaultGeoCRS) {
+               normalizedCRS = ""
+       }
+
+       return GeographyType{crs: normalizedCRS, algorithm: algorithm}, nil

Review Comment:
   Doesn't validate `algorithm` — `GeographyTypeOf("srid:4326", 
EdgeAlgorithm("garbage"))` succeeds and produces metadata Java/PyIceberg will 
reject. JSON parse goes through `ParseEdgeAlgorithm`; the constructor doesn't. 
I'd run it here too when `algorithm != ""`.



##########
types.go:
##########
@@ -32,6 +33,8 @@ import (
 var (
        regexFromBrackets = regexp.MustCompile(`^\w+\[(\d+)\]$`)
        decimalRegex      = 
regexp.MustCompile(`decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)`)
+       geometryRegex     = 
regexp.MustCompile(`(?i)^geometry\s*(?:\(\s*([^)]+?)\s*\))?$`)
+       geographyRegex    = 
regexp.MustCompile(`(?i)^geography\s*(?:\(\s*([^,]+?)\s*(?:,\s*(\w+)\s*)?\))?$`)
 )

Review Comment:
   Both regexes over-accept: `geometry(srid:4326, extra)` parses with the comma 
absorbed into the CRS group; `geography(srid:4269 karney)` (missing comma) 
parses with the space absorbed. I'd tighten the CRS group (e.g. `[^),]+?` for 
geometry) and add a couple of negative tests.



##########
table/metadata_schema_compatibility.go:
##########
@@ -39,7 +39,17 @@ func (e ErrIncompatibleSchema) Error() string {
                        fmt.Fprintf(&problems, "\n- invalid type for %s: %s is 
not supported until v%d", f.ColName, f.Field.Type, 
f.UnsupportedType.MinFormatVersion)
                }
                if f.InvalidDefault != nil {
-                       fmt.Fprintf(&problems, "\n- invalid initial default for 
%s: non-null default (%v) is not supported until v%d", f.ColName, 
f.Field.InitialDefault, f.InvalidDefault.MinFormatVersion)
+                       switch f.Field.Type.(type) {
+                       case iceberg.GeometryType, iceberg.GeographyType:
+                               if f.Field.InitialDefault != nil {
+                                       fmt.Fprintf(&problems, "\n- invalid 
initial default for %s: %s columns must default to null", f.ColName, 
f.Field.Type)

Review Comment:
   When a field has both `InitialDefault` and `WriteDefault`, two 
`IncompatibleField` entries get appended (see ~line 132 below) and `Error()` 
re-prints both default kinds for each — four lines for two problems. Either 
collapse to one entry per field, or drive the printout from 
`f.InvalidDefault.WriteDefault` (the value captured at append time).



##########
table/metadata_builder_internal_test.go:
##########
@@ -1838,6 +1840,86 @@ func TestUnknownTypeValidation(t *testing.T) {
        })
 }
 
+func TestGeometryGeographyNullOnlyDefaults(t *testing.T) {

Review Comment:
   The `v2 with non-null initial default` subtest asserts `"is not supported 
until v3"`. That path actually fires both the v2-unsupported error and the 
must-default-to-null error, and `ErrorContains` happens to match the first. If 
geo ever becomes allowed in v2, this silently changes meaning. Worth splitting: 
one v2 subtest asserting the type-unsupported message, one v3 subtest with 
non-null default asserting must-default-to-null.



##########
table/metadata_schema_compatibility.go:
##########
@@ -113,12 +123,30 @@ func checkSchemaCompatibility(sc *iceberg.Schema, 
formatVersion int) error {
                        })
                }
 
-               if field.InitialDefault != nil && formatVersion < 
defaultValuesMinFormatVersion {
-                       problems = append(problems, IncompatibleField{
-                               Field:          field,
-                               ColName:        colName,
-                               InvalidDefault: 
&InvalidDefault{MinFormatVersion: defaultValuesMinFormatVersion, WriteDefault: 
field.InitialDefault},
-                       })
+               switch field.Type.(type) {
+               case iceberg.GeometryType, iceberg.GeographyType:
+                       if field.InitialDefault != nil {
+                               problems = append(problems, IncompatibleField{
+                                       Field:          field,
+                                       ColName:        colName,
+                                       InvalidDefault: 
&InvalidDefault{MinFormatVersion: formatVersion, WriteDefault: 
field.InitialDefault},

Review Comment:
   `MinFormatVersion = formatVersion` (current version) here, but the field's 
documented as the minimum *required* version where the feature became 
supported. `IncompatibleField` is exported, so a downstream consumer reads a 
misleading value. Cleaner to add a separate reason flag like 
`MustBeNullForType` and switch on that in `Error()` instead of re-checking 
`f.Field.Type`.



##########
types.go:
##########
@@ -172,6 +179,46 @@ func (t *typeIFace) UnmarshalJSON(b []byte) error {
                                prec, _ := strconv.Atoi(matches[1])
                                scale, _ := strconv.Atoi(matches[2])
                                t.Type = DecimalType{precision: prec, scale: 
scale}
+                       case strings.HasPrefix(strings.ToLower(typename), 
"geometry"):

Review Comment:
   Geo type names match case-insensitively but every other type in this switch 
is case-sensitive. Matches Java for geo specifically, but it's internally 
surprising — worth a comment noting the asymmetry is intentional.



##########
types.go:
##########
@@ -777,6 +824,145 @@ func (UnknownType) primitive()     {}
 func (UnknownType) Type() string   { return "unknown" }
 func (UnknownType) String() string { return "unknown" }
 
+type EdgeAlgorithm string
+
+const (
+       EdgeAlgorithmSpherical EdgeAlgorithm = "spherical"
+       EdgeAlgorithmVincenty  EdgeAlgorithm = "vincenty"
+       EdgeAlgorithmThomas    EdgeAlgorithm = "thomas"
+       EdgeAlgorithmAndoyer   EdgeAlgorithm = "andoyer"
+       EdgeAlgorithmKarney    EdgeAlgorithm = "karney"
+)
+
+func ParseEdgeAlgorithm(s string) (EdgeAlgorithm, error) {
+       switch strings.ToLower(s) {
+       case "spherical":
+               return EdgeAlgorithmSpherical, nil
+       case "vincenty":
+               return EdgeAlgorithmVincenty, nil
+       case "thomas":
+               return EdgeAlgorithmThomas, nil
+       case "andoyer":
+               return EdgeAlgorithmAndoyer, nil
+       case "karney":
+               return EdgeAlgorithmKarney, nil
+       default:
+               return "", fmt.Errorf("invalid edge interpolation algorithm: 
%s", s)
+       }
+}
+
+func (e EdgeAlgorithm) String() string {
+       return string(e)
+}
+
+const defaultGeoCRS = "OGC:CRS84"
+
+type GeometryType struct {
+       crs string
+}
+
+func GeometryTypeOf(crs string) (GeometryType, error) {
+       if crs == "" {
+               return GeometryType{}, errors.New("invalid CRS: (empty string)")

Review Comment:
   Plain `errors.New` here. Rest of this file wraps `ErrInvalidTypeString` 
(lines 161/169/198) so callers can `errors.Is`. Same applies to 
`GeographyTypeOf` (line 912) and `ParseEdgeAlgorithm` (line 850).



##########
table/arrow_utils.go:
##########
@@ -630,6 +630,24 @@ func (c convertToArrow) VisitUnknown() arrow.Field {
        }
 }
 
+func (c convertToArrow) VisitGeometry(iceberg.GeometryType) arrow.Field {

Review Comment:
   Pass-through binary is fine for now. Worth a one-liner about the GeoArrow 
migration plan (extension metadata over a binary buffer, no shape change for 
downstream readers) so we don't trap ourselves once #991 lands. Also note this 
method is currently unreachable due to the dispatch issue on `schema.go`.



##########
transforms.go:
##########
@@ -107,9 +107,14 @@ func (t IdentityTransform) MarshalText() ([]byte, error) {
 func (IdentityTransform) String() string { return "identity" }
 
 func (IdentityTransform) CanTransform(t Type) bool {
-       _, ok := t.(PrimitiveType)
+       switch t.(type) {

Review Comment:
   Style nit — the nested switch only adds a level to special-case geo. I'd 
flatten:
   
   ```go
   switch t.(type) {
   case GeometryType, GeographyType:
       return false
   }
   _, ok := t.(PrimitiveType)
   return ok
   ```



##########
table/substrait/substrait.go:
##########
@@ -169,6 +169,10 @@ func (convertToSubstrait) VisitUnknown() types.Type {
        // Returning nil indicates this type cannot be converted to Substrait
        return nil
 }
+func (convertToSubstrait) VisitGeometry(iceberg.GeometryType) types.Type { 
return &types.BinaryType{} }

Review Comment:
   Same dispatch issue as `schema.go` — these are dead code today because 
`visitField` doesn't route to them, and `convertToSubstrait.Primitive` panics 
on the fall-through. Minor: `VisitGeometry` is one-line but `VisitGeography` is 
multi-line; surrounding methods are uniformly one-liners.



##########
transforms_test.go:
##########
@@ -258,6 +258,8 @@ func TestCanTransform(t *testing.T) {
                        },
                        notAllowed: []iceberg.Type{
                                &iceberg.StructType{}, &iceberg.ListType{}, 
&iceberg.MapType{},
+                               iceberg.GeometryType{},

Review Comment:
   `Identity` and `Bucket` get geo coverage but 
`Void`/`Truncate`/`Year`/`Month`/`Day`/`Hour` aren't pinned. `Void` 
particularly — per spec it's the only transform geo is allowed for, so it's the 
one most likely to silently regress.



##########
table/update_schema_test.go:
##########
@@ -344,6 +344,31 @@ func TestAddColumn(t *testing.T) {
                        }},
                }, newSchema.Fields())
        })
+
+       t.Run("test update schema with add geometry and geography columns", 
func(t *testing.T) {
+               table := New([]string{"id"}, testMetadata, "", nil, nil)

Review Comment:
   This builds against `testMetadata` (v2) and only calls `Apply()` — that path 
skips `checkSchemaCompatibility`, so the test passes despite geo being illegal 
in v2. Reads as if v2 add works. Build a v3 metadata for this case (mirror the 
`geoMeta` construction in the error test below) and ideally run 
`Commit()`/`BuildUpdates()` so the realistic path is covered.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to