This is an automated email from the ASF dual-hosted git repository.
etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new 39b0197 feat(partitioning): Add partition specs/fields and basic
transform interface (#2)
39b0197 is described below
commit 39b019774631030c97a074816c63da4d38f0113d
Author: Matt Topol <[email protected]>
AuthorDate: Thu Sep 14 12:21:52 2023 -0400
feat(partitioning): Add partition specs/fields and basic transform
interface (#2)
---
dev/.rat-excludes | 1 +
errors.go | 1 +
go.mod | 4 +-
go.sum | 18 ++++-
partitions.go | 232 +++++++++++++++++++++++++++++++++++++++++++++++++++++
partitions_test.go | 143 +++++++++++++++++++++++++++++++++
schema.go | 2 +-
transforms.go | 173 +++++++++++++++++++++++++++++++++++++++
transforms_test.go | 89 ++++++++++++++++++++
utils.go | 5 +-
10 files changed, 661 insertions(+), 7 deletions(-)
diff --git a/dev/.rat-excludes b/dev/.rat-excludes
index 785e1c9..b968f68 100644
--- a/dev/.rat-excludes
+++ b/dev/.rat-excludes
@@ -4,3 +4,4 @@ LICENSE
NOTICE
go.sum
build
+rat-results.txt
diff --git a/errors.go b/errors.go
index 801f5c5..c5bd870 100644
--- a/errors.go
+++ b/errors.go
@@ -24,4 +24,5 @@ var (
ErrNotImplemented = errors.New("not implemented")
ErrInvalidArgument = errors.New("invalid argument")
ErrInvalidSchema = errors.New("invalid schema")
+ ErrInvalidTransform = errors.New("invalid transform syntax")
)
diff --git a/go.mod b/go.mod
index 030d65e..6bea61d 100644
--- a/go.mod
+++ b/go.mod
@@ -21,11 +21,13 @@ go 1.20
require (
github.com/stretchr/testify v1.8.4
- golang.org/x/exp v0.0.0-20230811145659-89c5cff77bcb
+ golang.org/x/exp v0.0.0-20230905200255-921286631fa9
)
require (
github.com/davecgh/go-spew v1.1.1 // indirect
+ github.com/kr/pretty v0.3.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
+ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
diff --git a/go.sum b/go.sum
index 172fe17..460287c 100644
--- a/go.sum
+++ b/go.sum
@@ -1,12 +1,24 @@
+github.com/creack/pty v1.1.9/go.mod
h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.1
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/kr/pretty v0.2.1/go.mod
h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod
h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0/go.mod
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod
h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod
h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pmezard/go-difflib v1.0.0
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/rogpeppe/go-internal v1.9.0
h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
+github.com/rogpeppe/go-internal v1.9.0/go.mod
h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/stretchr/testify v1.8.4
h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod
h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
-golang.org/x/exp v0.0.0-20230811145659-89c5cff77bcb
h1:mIKbk8weKhSeLH2GmUTrvx8CjkyJmnU1wFmg59CUjFA=
-golang.org/x/exp v0.0.0-20230811145659-89c5cff77bcb/go.mod
h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405
h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+golang.org/x/exp v0.0.0-20230905200255-921286631fa9
h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g=
+golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod
h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod
h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/partitions.go b/partitions.go
new file mode 100644
index 0000000..c24f082
--- /dev/null
+++ b/partitions.go
@@ -0,0 +1,232 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import (
+ "encoding/json"
+ "fmt"
+ "strings"
+
+ "golang.org/x/exp/slices"
+)
+
+const (
+ partitionDataIDStart = 1000
+ InitialPartitionSpecID = 0
+)
+
+// UnpartitionedSpec is the default unpartitioned spec which can
+// be used for comparisons or to just provide a convenience for referencing
+// the same unpartitioned spec object.
+var UnpartitionedSpec = &PartitionSpec{id: 0}
+
+// PartitionField represents how one partition value is derived from the
+// source column by transformation.
+type PartitionField struct {
+ // SourceID is the source column id of the table's schema
+ SourceID int `json:"source-id"`
+ // FieldID is the partition field id across all the table partition
specs
+ FieldID int `json:"field-id"`
+ // Name is the name of the partition field itself
+ Name string `json:"name"`
+ // Transform is the transform used to produce the partition value
+ Transform Transform `json:"transform"`
+}
+
+func (p *PartitionField) String() string {
+ return fmt.Sprintf("%d: %s: %s(%d)", p.FieldID, p.Name, p.Transform,
p.SourceID)
+}
+
+func (p *PartitionField) UnmarshalJSON(b []byte) error {
+ type Alias PartitionField
+ aux := struct {
+ TransformString string `json:"transform"`
+ *Alias
+ }{
+ Alias: (*Alias)(p),
+ }
+
+ err := json.Unmarshal(b, &aux)
+ if err != nil {
+ return err
+ }
+
+ if p.Transform, err = ParseTransform(aux.TransformString); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// PartitionSpec captures the transformation from table data to partition
values
+type PartitionSpec struct {
+ // any change to a PartitionSpec will produce a new spec id
+ id int
+ fields []PartitionField
+
+ // this is populated by initialize after creation
+ sourceIdToFields map[int][]PartitionField
+}
+
+func NewPartitionSpec(fields ...PartitionField) PartitionSpec {
+ return NewPartitionSpecID(InitialPartitionSpecID, fields...)
+}
+
+func NewPartitionSpecID(id int, fields ...PartitionField) PartitionSpec {
+ ret := PartitionSpec{id: id, fields: fields}
+ ret.initialize()
+ return ret
+}
+
+// CompatibleWith returns true if this partition spec is considered
+// compatible with the passed in partition spec. This means that the two
+// specs have equivalent field lists regardless of the spec id.
+func (ps *PartitionSpec) CompatibleWith(other *PartitionSpec) bool {
+ if ps == other {
+ return true
+ }
+
+ if len(ps.fields) != len(other.fields) {
+ return false
+ }
+
+ return slices.EqualFunc(ps.fields, other.fields, func(left, right
PartitionField) bool {
+ return left.SourceID == right.SourceID && left.Name ==
right.Name &&
+ left.Transform == right.Transform
+ })
+}
+
+// Equals returns true iff the field lists are the same AND the spec id
+// is the same between this partition spec and the provided one.
+func (ps *PartitionSpec) Equals(other PartitionSpec) bool {
+ return ps.id == other.id && slices.Equal(ps.fields, other.fields)
+}
+
+func (ps PartitionSpec) MarshalJSON() ([]byte, error) {
+ if ps.fields == nil {
+ ps.fields = []PartitionField{}
+ }
+ return json.Marshal(struct {
+ ID int `json:"spec-id"`
+ Fields []PartitionField `json:"fields"`
+ }{ps.id, ps.fields})
+}
+
+func (ps *PartitionSpec) UnmarshalJSON(b []byte) error {
+ aux := struct {
+ ID int `json:"spec-id"`
+ Fields []PartitionField `json:"fields"`
+ }{ID: ps.id, Fields: ps.fields}
+
+ if err := json.Unmarshal(b, &aux); err != nil {
+ return err
+ }
+
+ ps.id, ps.fields = aux.ID, aux.Fields
+ ps.initialize()
+ return nil
+}
+
+func (ps *PartitionSpec) initialize() {
+ ps.sourceIdToFields = make(map[int][]PartitionField)
+ for _, f := range ps.fields {
+ ps.sourceIdToFields[f.SourceID] =
+ append(ps.sourceIdToFields[f.SourceID], f)
+ }
+}
+
+func (ps *PartitionSpec) ID() int { return ps.id }
+func (ps *PartitionSpec) NumFields() int { return len(ps.fields) }
+func (ps *PartitionSpec) Field(i int) PartitionField { return ps.fields[i] }
+
+func (ps *PartitionSpec) IsUnpartitioned() bool {
+ if len(ps.fields) == 0 {
+ return true
+ }
+
+ for _, f := range ps.fields {
+ if _, ok := f.Transform.(VoidTransform); !ok {
+ return false
+ }
+ }
+
+ return true
+}
+
+func (ps *PartitionSpec) FieldsBySourceID(fieldID int) []PartitionField {
+ return slices.Clone(ps.sourceIdToFields[fieldID])
+}
+
+func (ps PartitionSpec) String() string {
+ var b strings.Builder
+ b.WriteByte('[')
+ for i, f := range ps.fields {
+ if i == 0 {
+ b.WriteString("\n")
+ }
+ b.WriteString("\t")
+ b.WriteString(f.String())
+ b.WriteString("\n")
+ }
+ b.WriteByte(']')
+
+ return b.String()
+}
+
+func (ps *PartitionSpec) LastAssignedFieldID() int {
+ if len(ps.fields) == 0 {
+ return partitionDataIDStart - 1
+ }
+
+ id := ps.fields[0].FieldID
+ for _, f := range ps.fields[1:] {
+ if f.FieldID > id {
+ id = f.FieldID
+ }
+ }
+ return id
+}
+
+// PartitionType produces a struct of the partition spec.
+//
+// The partition fields should be optional:
+// - All partition transforms are required to produce null if the input value
+// is null. This can happen when the source column is optional.
+// - Partition fields may be added later, in which case not all files would
+// have the result field and it may be null.
+//
+// There is a case where we can guarantee that a partition field in the first
+// and only partition spec that uses a required source column will never be
+// null, but it doesn't seem worth tracking this case.
+func (ps *PartitionSpec) PartitionType(schema *Schema) *StructType {
+ nestedFields := []NestedField{}
+ for _, field := range ps.fields {
+ sourceType, ok := schema.FindTypeByID(field.SourceID)
+ if !ok {
+ continue
+ }
+ resultType := field.Transform.ResultType(sourceType)
+ nestedFields = append(nestedFields, NestedField{
+ ID: field.FieldID,
+ Name: field.Name,
+ Type: resultType,
+ Required: false,
+ })
+ }
+ return &StructType{FieldList: nestedFields}
+}
diff --git a/partitions_test.go b/partitions_test.go
new file mode 100644
index 0000000..fd29190
--- /dev/null
+++ b/partitions_test.go
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg_test
+
+import (
+ "encoding/json"
+ "testing"
+
+ "github.com/apache/iceberg-go"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestPartitionSpec(t *testing.T) {
+ assert.Equal(t, 999, iceberg.UnpartitionedSpec.LastAssignedFieldID())
+
+ bucket := iceberg.BucketTransform{NumBuckets: 4}
+ idField1 := iceberg.PartitionField{
+ SourceID: 3, FieldID: 1001, Name: "id", Transform: bucket}
+ spec1 := iceberg.NewPartitionSpec(idField1)
+
+ assert.Zero(t, spec1.ID())
+ assert.Equal(t, 1, spec1.NumFields())
+ assert.Equal(t, idField1, spec1.Field(0))
+ assert.NotEqual(t, idField1, spec1)
+ assert.False(t, spec1.IsUnpartitioned())
+ assert.True(t, spec1.CompatibleWith(&spec1))
+ assert.True(t, spec1.Equals(spec1))
+ assert.Equal(t, 1001, spec1.LastAssignedFieldID())
+ assert.Equal(t, "[\n\t1001: id: bucket[4](3)\n]", spec1.String())
+
+ // only differs by PartitionField FieldID
+ idField2 := iceberg.PartitionField{
+ SourceID: 3, FieldID: 1002, Name: "id", Transform: bucket}
+ spec2 := iceberg.NewPartitionSpec(idField2)
+
+ assert.False(t, spec1.Equals(spec2))
+ assert.True(t, spec1.CompatibleWith(&spec2))
+ assert.Equal(t, []iceberg.PartitionField{idField1},
spec1.FieldsBySourceID(3))
+ assert.Empty(t, spec1.FieldsBySourceID(1925))
+
+ spec3 := iceberg.NewPartitionSpec(idField1, idField2)
+ assert.False(t, spec1.CompatibleWith(&spec3))
+ assert.Equal(t, 1002, spec3.LastAssignedFieldID())
+}
+
+func TestUnpartitionedWithVoidField(t *testing.T) {
+ spec := iceberg.NewPartitionSpec(iceberg.PartitionField{
+ SourceID: 3, FieldID: 1001, Name: "void", Transform:
iceberg.VoidTransform{},
+ })
+
+ assert.True(t, spec.IsUnpartitioned())
+
+ spec2 := iceberg.NewPartitionSpec(iceberg.PartitionField{
+ SourceID: 3, FieldID: 1001, Name: "void", Transform:
iceberg.VoidTransform{},
+ }, iceberg.PartitionField{
+ SourceID: 3, FieldID: 1002, Name: "bucket", Transform:
iceberg.BucketTransform{NumBuckets: 2},
+ })
+
+ assert.False(t, spec2.IsUnpartitioned())
+}
+
+func TestSerializeUnpartitionedSpec(t *testing.T) {
+ data, err := json.Marshal(iceberg.UnpartitionedSpec)
+ require.NoError(t, err)
+
+ assert.JSONEq(t, `{"spec-id": 0, "fields": []}`, string(data))
+ assert.True(t, iceberg.UnpartitionedSpec.IsUnpartitioned())
+}
+
+func TestSerializePartitionSpec(t *testing.T) {
+ spec := iceberg.NewPartitionSpecID(3,
+ iceberg.PartitionField{SourceID: 1, FieldID: 1000,
+ Transform: iceberg.TruncateTransform{Width: 19}, Name:
"str_truncate"},
+ iceberg.PartitionField{SourceID: 2, FieldID: 1001,
+ Transform: iceberg.BucketTransform{NumBuckets: 25},
Name: "int_bucket"},
+ )
+
+ data, err := json.Marshal(spec)
+ require.NoError(t, err)
+
+ assert.JSONEq(t, `{
+ "spec-id": 3,
+ "fields": [
+ {
+ "source-id": 1,
+ "field-id": 1000,
+ "transform": "truncate[19]",
+ "name": "str_truncate"
+ },
+ {
+ "source-id": 2,
+ "field-id": 1001,
+ "transform": "bucket[25]",
+ "name": "int_bucket"
+ }
+ ]
+ }`, string(data))
+
+ var outspec iceberg.PartitionSpec
+ require.NoError(t, json.Unmarshal(data, &outspec))
+
+ assert.True(t, spec.Equals(outspec))
+}
+
+func TestPartitionType(t *testing.T) {
+ spec := iceberg.NewPartitionSpecID(3,
+ iceberg.PartitionField{SourceID: 1, FieldID: 1000,
+ Transform: iceberg.TruncateTransform{Width: 19}, Name:
"str_truncate"},
+ iceberg.PartitionField{SourceID: 2, FieldID: 1001,
+ Transform: iceberg.BucketTransform{NumBuckets: 25},
Name: "int_bucket"},
+ iceberg.PartitionField{SourceID: 3, FieldID: 1002,
+ Transform: iceberg.IdentityTransform{}, Name:
"bool_identity"},
+ iceberg.PartitionField{SourceID: 1, FieldID: 1003,
+ Transform: iceberg.VoidTransform{}, Name: "str_void"},
+ )
+
+ expected := &iceberg.StructType{
+ FieldList: []iceberg.NestedField{
+ {ID: 1000, Name: "str_truncate", Type:
iceberg.PrimitiveTypes.String},
+ {ID: 1001, Name: "int_bucket", Type:
iceberg.PrimitiveTypes.Int32},
+ {ID: 1002, Name: "bool_identity", Type:
iceberg.PrimitiveTypes.Bool},
+ {ID: 1003, Name: "str_void", Type:
iceberg.PrimitiveTypes.String},
+ },
+ }
+ actual := spec.PartitionType(tableSchemaSimple)
+ assert.Truef(t, expected.Equals(actual), "expected: %s, got: %s",
expected, actual)
+}
diff --git a/schema.go b/schema.go
index e9e559b..8edce1f 100644
--- a/schema.go
+++ b/schema.go
@@ -214,7 +214,7 @@ func (s *Schema) FindFieldByNameCaseInsensitive(name
string) (NestedField, bool)
return s.FindFieldByID(id)
}
-// FindFieldByID is like [*Schema.FindColumnByName], but returns the whole
+// FindFieldByID is like [*Schema.FindColumnName], but returns the whole
// field rather than just the field name.
func (s *Schema) FindFieldByID(id int) (NestedField, bool) {
idx, _ := s.lazyIDToField()
diff --git a/transforms.go b/transforms.go
new file mode 100644
index 0000000..58d2011
--- /dev/null
+++ b/transforms.go
@@ -0,0 +1,173 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import (
+ "encoding"
+ "fmt"
+ "strconv"
+ "strings"
+)
+
+// ParseTransform takes the string representation of a transform as
+// defined in the iceberg spec, and produces the appropriate Transform
+// object or an error if the string is not a valid transform string.
+func ParseTransform(s string) (Transform, error) {
+ s = strings.ToLower(s)
+ switch {
+ case strings.HasPrefix(s, "bucket"):
+ matches := regexFromBrackets.FindStringSubmatch(s)
+ if len(matches) != 2 {
+ break
+ }
+
+ n, _ := strconv.Atoi(matches[1])
+ return BucketTransform{NumBuckets: n}, nil
+ case strings.HasPrefix(s, "truncate"):
+ matches := regexFromBrackets.FindStringSubmatch(s)
+ if len(matches) != 2 {
+ break
+ }
+
+ n, _ := strconv.Atoi(matches[1])
+ return TruncateTransform{Width: n}, nil
+ default:
+ switch s {
+ case "identity":
+ return IdentityTransform{}, nil
+ case "void":
+ return VoidTransform{}, nil
+ case "year":
+ return YearTransform{}, nil
+ case "month":
+ return MonthTransform{}, nil
+ case "day":
+ return DayTransform{}, nil
+ case "hour":
+ return HourTransform{}, nil
+ }
+ }
+
+ return nil, fmt.Errorf("%w: %s", ErrInvalidTransform, s)
+}
+
+// Transform is an interface for the various Transformation types
+// in partition specs. Currently, they do not yet provide actual
+// transformation functions or implementation. That will come later as
+// data reading gets implemented.
+type Transform interface {
+ fmt.Stringer
+ encoding.TextMarshaler
+ ResultType(t Type) Type
+}
+
+// IdentityTransform uses the identity function, performing no transformation
+// but instead partitioning on the value itself.
+type IdentityTransform struct{}
+
+func (t IdentityTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (IdentityTransform) String() string { return "identity" }
+
+func (IdentityTransform) ResultType(t Type) Type { return t }
+
+// VoidTransform is a transformation that always returns nil.
+type VoidTransform struct{}
+
+func (t VoidTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (VoidTransform) String() string { return "void" }
+
+func (VoidTransform) ResultType(t Type) Type { return t }
+
+// BucketTransform transforms values into a bucket partition value. It is
+// parameterized by a number of buckets. Bucket partition transforms use
+// a 32-bit hash of the source value to produce a positive value by mod
+// the bucket number.
+type BucketTransform struct {
+ NumBuckets int
+}
+
+func (t BucketTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (t BucketTransform) String() string { return fmt.Sprintf("bucket[%d]",
t.NumBuckets) }
+
+func (BucketTransform) ResultType(Type) Type { return PrimitiveTypes.Int32 }
+
+// TruncateTransform is a transformation for truncating a value to a specified
width.
+type TruncateTransform struct {
+ Width int
+}
+
+func (t TruncateTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (t TruncateTransform) String() string { return
fmt.Sprintf("truncate[%d]", t.Width) }
+
+func (TruncateTransform) ResultType(t Type) Type { return t }
+
+// YearTransform transforms a datetime value into a year value.
+type YearTransform struct{}
+
+func (t YearTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (YearTransform) String() string { return "year" }
+
+func (YearTransform) ResultType(Type) Type { return PrimitiveTypes.Int32 }
+
+// MonthTransform transforms a datetime value into a month value.
+type MonthTransform struct{}
+
+func (t MonthTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (MonthTransform) String() string { return "month" }
+
+func (MonthTransform) ResultType(Type) Type { return PrimitiveTypes.Int32 }
+
+// DayTransform transforms a datetime value into a date value.
+type DayTransform struct{}
+
+func (t DayTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (DayTransform) String() string { return "day" }
+
+func (DayTransform) ResultType(Type) Type { return PrimitiveTypes.Date }
+
+// HourTransform transforms a datetime value into an hour value.
+type HourTransform struct{}
+
+func (t HourTransform) MarshalText() ([]byte, error) {
+ return []byte(t.String()), nil
+}
+
+func (HourTransform) String() string { return "hour" }
+
+func (HourTransform) ResultType(Type) Type { return PrimitiveTypes.Int32 }
diff --git a/transforms_test.go b/transforms_test.go
new file mode 100644
index 0000000..a455ede
--- /dev/null
+++ b/transforms_test.go
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg_test
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/apache/iceberg-go"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestParseTransform(t *testing.T) {
+ tests := []struct {
+ toparse string
+ expected iceberg.Transform
+ }{
+ {"identity", iceberg.IdentityTransform{}},
+ {"IdEnTiTy", iceberg.IdentityTransform{}},
+ {"void", iceberg.VoidTransform{}},
+ {"VOId", iceberg.VoidTransform{}},
+ {"year", iceberg.YearTransform{}},
+ {"yEAr", iceberg.YearTransform{}},
+ {"month", iceberg.MonthTransform{}},
+ {"MONtH", iceberg.MonthTransform{}},
+ {"day", iceberg.DayTransform{}},
+ {"DaY", iceberg.DayTransform{}},
+ {"hour", iceberg.HourTransform{}},
+ {"hOuR", iceberg.HourTransform{}},
+ {"bucket[5]", iceberg.BucketTransform{NumBuckets: 5}},
+ {"bucket[100]", iceberg.BucketTransform{NumBuckets: 100}},
+ {"BUCKET[5]", iceberg.BucketTransform{NumBuckets: 5}},
+ {"bUCKeT[100]", iceberg.BucketTransform{NumBuckets: 100}},
+ {"truncate[10]", iceberg.TruncateTransform{Width: 10}},
+ {"truncate[255]", iceberg.TruncateTransform{Width: 255}},
+ {"TRUNCATE[10]", iceberg.TruncateTransform{Width: 10}},
+ {"tRuNCATe[255]", iceberg.TruncateTransform{Width: 255}},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.toparse, func(t *testing.T) {
+ transform, err := iceberg.ParseTransform(tt.toparse)
+ require.NoError(t, err)
+ assert.Equal(t, tt.expected, transform)
+
+ txt, err := transform.MarshalText()
+ assert.NoError(t, err)
+ assert.Equal(t, strings.ToLower(tt.toparse),
string(txt))
+ })
+ }
+
+ errorTests := []struct {
+ name string
+ toparse string
+ }{
+ {"foobar", "foobar"},
+ {"bucket no brackets", "bucket"},
+ {"truncate no brackets", "truncate"},
+ {"bucket no val", "bucket[]"},
+ {"truncate no val", "truncate[]"},
+ {"bucket neg", "bucket[-1]"},
+ {"truncate neg", "truncate[-1]"},
+ }
+
+ for _, tt := range errorTests {
+ t.Run(tt.name, func(t *testing.T) {
+ tr, err := iceberg.ParseTransform(tt.toparse)
+ assert.Nil(t, tr)
+ assert.ErrorIs(t, err, iceberg.ErrInvalidTransform)
+ assert.ErrorContains(t, err, tt.toparse)
+ })
+ }
+}
diff --git a/utils.go b/utils.go
index ccb6bbc..907a35f 100644
--- a/utils.go
+++ b/utils.go
@@ -19,6 +19,7 @@ package iceberg
import (
"runtime/debug"
+ "strings"
"golang.org/x/exp/constraints"
)
@@ -29,9 +30,9 @@ func init() {
version = "(unknown version)"
if info, ok := debug.ReadBuildInfo(); ok {
for _, dep := range info.Deps {
- switch {
- case dep.Path == "github.com/apache/iceberg/go/iceberg":
+ if strings.HasPrefix(dep.Path,
"github.com/apache/iceberg-go") {
version = dep.Version
+ break
}
}
}