rdblue commented on a change in pull request #1177: URL: https://github.com/apache/iceberg/pull/1177#discussion_r463874818
########## File path: core/src/test/java/org/apache/iceberg/TestSchemaUpdateSync.java ########## @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + +import com.google.common.collect.Lists; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.BinaryType; +import org.apache.iceberg.types.Types.BooleanType; +import org.apache.iceberg.types.Types.DateType; +import org.apache.iceberg.types.Types.DecimalType; +import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.FixedType; +import org.apache.iceberg.types.Types.FloatType; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.LongType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.types.Types.TimeType; +import org.apache.iceberg.types.Types.TimestampType; +import org.apache.iceberg.types.Types.UUIDType; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public class TestSchemaUpdateSync { + + @Rule + public ExpectedException thrown = ExpectedException.none(); + + private static List<? extends PrimitiveType> primitiveTypes() { + return Lists.newArrayList(StringType.get(), + TimeType.get(), + TimestampType.withoutZone(), + TimestampType.withZone(), + UUIDType.get(), + DateType.get(), + BooleanType.get(), + BinaryType.get(), + DoubleType.get(), + IntegerType.get(), + FixedType.ofLength(10), + DecimalType.of(10, 2), + LongType.get(), + FloatType.get() + ); + } + + private static NestedField[] primitiveFields(Integer initialValue, List<? extends PrimitiveType> primitiveTypes) { + AtomicInteger i = new AtomicInteger(initialValue); + return primitiveTypes.stream() + .map(type -> optional(i.incrementAndGet(), type.toString(), + Types.fromPrimitiveString(type.toString()))).toArray(NestedField[]::new); + } + + @Test + public void testAddTopLevelPrimitives() { + Schema newSchema = new Schema(primitiveFields(0, primitiveTypes())); + Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + + @Test + public void testAddTopLevelListOfPrimitives() { + for(PrimitiveType primitiveType : primitiveTypes()) { + Schema newSchema = new Schema(optional(1, "aList", Types.ListType.ofOptional(2, primitiveType))); + Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + } + + @Test + public void testAddTopLevelMapOfPrimitives() { + for(PrimitiveType primitiveType : primitiveTypes()) { + Schema newSchema = new Schema(optional(1, "aMap", Types.MapType.ofOptional(2, 3, primitiveType, primitiveType))); + Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + } + + @Test + public void testAddTopLevelStructOfPrimitives() { + for(PrimitiveType primitiveType : primitiveTypes()) { + Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of( + optional(2, "primitive", primitiveType)))); + Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(currentSchema).apply(); + Assert.assertEquals(currentSchema.asStruct(), applied.asStruct()); + } + } + + @Test + public void testAddNestedPrimitive() { + for(PrimitiveType primitiveType : primitiveTypes()) { + Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of())); + Schema newSchema = new Schema(optional(1, "aStruct", Types.StructType.of( + optional(2, "primitive", primitiveType)))); + Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + } + + @Test + public void testAddNestedPrimitives() { + Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of())); + Schema newSchema = new Schema(optional(1, "aStruct", Types.StructType.of( + primitiveFields(1, primitiveTypes())))); + Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + + @Test + public void testAddNestedLists() { + Schema newSchema = new Schema(optional(1, "aList", + Types.ListType.ofOptional(2, + Types.ListType.ofOptional(3, + Types.ListType.ofOptional(4, + Types.ListType.ofOptional(5, + Types.ListType.ofOptional(6, + Types.ListType.ofOptional(7, + Types.ListType.ofOptional(8, + Types.ListType.ofOptional(9, + Types.ListType.ofOptional(10, + DecimalType.of(11, 20)))))))))))); + Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + + @Test + public void testAddNestedStruct() { + Schema newSchema = new Schema(optional(1, "struct1", Types.StructType.of( + optional(2, "struct2", Types.StructType.of( + optional(3, "struct3", Types.StructType.of( + optional(4, "struct4", Types.StructType.of( + optional(5, "struct5", Types.StructType.of( + optional(6, "struct6", Types.StructType.of( + optional(7, "aString", StringType.get())))))))))))))); + Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + + @Test + public void testAddNestedMaps() { + Schema newSchema = new Schema(optional(1, "struct", Types.MapType.ofOptional( + 2, 3, StringType.get(), Types.MapType.ofOptional( + 4, 5, StringType.get(), Types.MapType.ofOptional( + 6, 7, StringType.get(), Types.MapType.ofOptional( + 8 ,9, StringType.get(), Types.MapType.ofOptional( + 10 ,11, StringType.get(), Types.MapType.ofOptional( + 12, 13, StringType.get(), StringType.get())))))))); + Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + + @Test + public void testDetectInvalidTopLevelList() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Cannot change column type: aList.element: string -> long"); + + Schema currentSchema = new Schema(optional(1, "aList", + Types.ListType.ofOptional(2, StringType.get()))); + Schema newSchema = new Schema(optional(1, "aList", + Types.ListType.ofOptional(2, LongType.get()))); + new SchemaUpdate(currentSchema, 2).upsertSchema(newSchema).apply(); + } + + @Test + public void testDetectInvalidTopLevelMapValue() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Cannot change column type: aMap.value: string -> long"); + + Schema currentSchema = new Schema(optional(1, "aMap", + Types.MapType.ofOptional(2,3, StringType.get(), StringType.get()))); + Schema newSchema = new Schema(optional(1, "aMap", + Types.MapType.ofOptional(2,3, StringType.get(), LongType.get()))); + Schema apply = new SchemaUpdate(currentSchema, 3).upsertSchema(newSchema).apply(); + System.out.println(apply.toString()); + } + + @Test + public void testDetectInvalidTopLevelMapKey() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Cannot change column type: aMap.key: string -> uuid"); + + Schema currentSchema = new Schema(optional(1, "aMap", + Types.MapType.ofOptional(2,3 , StringType.get(), StringType.get()))); + Schema newSchema = new Schema(optional(1, "aMap", + Types.MapType.ofOptional(2,3 , UUIDType.get(), StringType.get()))); + new SchemaUpdate(currentSchema, 3).upsertSchema(newSchema).apply(); + } + + @Test + // int 32-bit signed integers -> Can promote to long + public void testTypePromoteIntegerToLong() { + Schema currentSchema = new Schema(required(1, "aCol", IntegerType.get())); + Schema newSchema = new Schema(required(1, "aCol", LongType.get())); + + Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply(); + Assert.assertEquals(1, applied.asStruct().fields().size()); + Assert.assertEquals(LongType.get(), applied.asStruct().fields().get(0).type()); + } + + @Test + // float 32-bit IEEE 754 floating point -> Can promote to double + public void testTypePromoteFloatToDouble() { + Schema currentSchema = new Schema(required(1, "aCol", FloatType.get())); + Schema newSchema = new Schema(required(1, "aCol", DoubleType.get())); + + Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply(); + Assert.assertEquals(1, applied.asStruct().fields().size()); + Assert.assertEquals(DoubleType.get(), applied.asStruct().fields().get(0).type()); + // Doesn't work Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + // java.lang.AssertionError: + // Expected :struct<1: aCol: required double> + // Actual :struct<1: aCol: required double ()> + } + + @Test + public void testInvalidTypePromoteDoubleToFloat() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Cannot change column type: aCol: double -> float"); + + Schema currentSchema = new Schema(required(1, "aCol", DoubleType.get())); + Schema newSchema = new Schema(required(1, "aCol", FloatType.get())); + + new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply(); + } + + @Test + // decimal(P,S) Fixed-point decimal; precision P, scale S -> Scale is fixed [1], precision must be 38 or less + public void testTypePromoteDecimalToFixedScaleWithWiderPrecision() { + Schema currentSchema = new Schema(required(1, "aCol", DecimalType.of(20, 1))); + Schema newSchema = new Schema(required(1, "aCol", DecimalType.of(22, 1))); + + Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply(); + Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); + } + + @Test + public void testAddPrimitiveToNestedStruct() { + Schema schema = new Schema( + required(1, "struct1", Types.StructType.of( + optional(2, "struct2", Types.StructType.of( + optional(3, "list", Types.ListType.ofOptional( + 4, Types.StructType.of( + optional(5, "value", StringType.get()))))))))); + + Schema newSchema = new Schema( + required(1, "struct1", Types.StructType.of( + optional(2, "struct2", Types.StructType.of( + optional(3, "list", Types.ListType.ofOptional( + 4, Types.StructType.of( + optional(5, "time", TimeType.get()))))))))); Review comment: I just went back to the sync notes to refresh my memory and found that I hadn't posted them to the doc! I updated it. Sorry about that. Just catching up on this now. I think the main use case that we're trying to solve here is the union case, where you just need to produce a schema with a union of the fields from all of the observed schemas. I think focusing on that use case makes the choice of whether to support delete clear: just because a column is missing, doesn't mean it shouldn't be in the union. Renames are similar: if the field exists, do nothing and if it doesn't exist, add it. The only case where we could detect a rename in a union operation is if both schemas have IDs. But then I would say that we should avoid unnecessary modification of the union schema. That's because we don't know which name is correct (the existing name or the observed name) but using the observed name is more likely to break existing queries for the table. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
