RussellSpitzer commented on a change in pull request #3164:
URL: https://github.com/apache/iceberg/pull/3164#discussion_r714315864
########## File path: data/src/test/java/org/apache/iceberg/io/TestPartitioningWriters.java ##########
@@ -0,0 +1,535 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.List;
+import org.apache.iceberg.AssertHelpers;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.RowDelta;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.deletes.PositionDelete;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.util.StructLikeSet;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public abstract class TestPartitioningWriters<T> extends WriterTestBase<T> {
+
+  @Parameterized.Parameters(name = "FileFormat={0}")
+  public static Object[] parameters() {
+    return new Object[][] {
+        new Object[]{FileFormat.AVRO},
+        new Object[]{FileFormat.PARQUET},
+        new Object[]{FileFormat.ORC},
+    };
+  }
+
+  private static final int TABLE_FORMAT_VERSION = 2;
+  private static final long TARGET_FILE_SIZE = 128L * 1024 * 1024;
+
+  private final FileFormat fileFormat;
+  private OutputFileFactory fileFactory = null;
+
+  public TestPartitioningWriters(FileFormat fileFormat) {
+    super(TABLE_FORMAT_VERSION);
+    this.fileFormat = fileFormat;
+  }
+
+  protected abstract StructLikeSet toSet(Iterable<T> records);
+
+  protected FileFormat format() {
+    return fileFormat;
+  }
+
+  @Before
+  public void setupTable() throws Exception {
+    this.tableDir = temp.newFolder();
+    Assert.assertTrue(tableDir.delete()); // created during table creation
+
+    this.metadataDir = new File(tableDir, "metadata");
+    this.table = create(SCHEMA, PartitionSpec.unpartitioned());
+    this.fileFactory = OutputFileFactory.builderFor(table, 1, 1).format(fileFormat).build();
+  }
+
+  @Test
+  public void testClusteredDataWriterNoRecords() throws IOException {
+    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());
+    ClusteredDataWriter<T> writer = new ClusteredDataWriter<>(
+        writerFactory, fileFactory, table.io(),
+        fileFormat, TARGET_FILE_SIZE);
+
+    writer.close();
+    Assert.assertEquals("Must be no data files", 0, writer.result().dataFiles().size());
+
+    writer.close();
+    Assert.assertEquals("Must be no data files", 0, writer.result().dataFiles().size());
+  }
+
+  @Test
+  public void testClusteredDataWriterMultiplePartitions() throws IOException {
+    table.updateSpec()
+        .addField(Expressions.ref("data"))
+        .commit();
+
+    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());
+    ClusteredDataWriter<T> writer = new ClusteredDataWriter<>(
+        writerFactory, fileFactory, table.io(),
+        fileFormat, TARGET_FILE_SIZE);
+
+    PartitionSpec spec = table.spec();
+
+    writer.write(toRow(1, "aaa"), spec, partitionKey(spec, "aaa"));
+    writer.write(toRow(2, "aaa"), spec, partitionKey(spec, "aaa"));
+    writer.write(toRow(3, "bbb"), spec, partitionKey(spec, "bbb"));
+    writer.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb"));
+    writer.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc"));
+
+    writer.close();
+
+    DataWriteResult result = writer.result();
+    Assert.assertEquals("Must be 3 data files", 3, result.dataFiles().size());
+
+    RowDelta rowDelta = table.newRowDelta();
+    result.dataFiles().forEach(rowDelta::addRows);
+    rowDelta.commit();
+
+    List<T> expectedRows = ImmutableList.of(
+        toRow(1, "aaa"),
+        toRow(2, "aaa"),
+        toRow(3, "bbb"),
+        toRow(4, "bbb"),
+        toRow(5, "ccc")
+    );
+    Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
+  }
+
+  @Test
+  public void testClusteredDataWriterOutOfOrderPartitions() throws IOException {
+    table.updateSpec()
+        .addField(Expressions.ref("data"))
+        .commit();
+
+    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());
+    ClusteredDataWriter<T> writer = new ClusteredDataWriter<>(
+        writerFactory, fileFactory, table.io(),
+        fileFormat, TARGET_FILE_SIZE);
+
+    PartitionSpec spec = table.spec();
+
+    writer.write(toRow(1, "aaa"), spec, partitionKey(spec, "aaa"));
+    writer.write(toRow(2, "aaa"), spec, partitionKey(spec, "aaa"));
+    writer.write(toRow(3, "bbb"), spec, partitionKey(spec, "bbb"));
+    writer.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb"));
+    writer.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc"));
+
+    AssertHelpers.assertThrows("Should fail to write out of order partitions",
+        IllegalStateException.class, "Already closed files for partition",
+        () -> {
+          try {
+            writer.write(toRow(6, "aaa"), spec, partitionKey(spec, "aaa"));
+          } catch (IOException e) {
+            throw new UncheckedIOException(e);
+          }
+        });
+
+    writer.close();
+  }
+
+  @Test
+  public void testClusteredEqualityDeleteWriterNoRecords() throws IOException {
+    Assume.assumeFalse("ORC delete files are not supported", fileFormat == FileFormat.ORC);
+
+    List<Integer> equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId());
+    Schema equalityDeleteRowSchema = table.schema().select("id");
+    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema);
+    ClusteredEqualityDeleteWriter<T> writer = new ClusteredEqualityDeleteWriter<>(
+        writerFactory, fileFactory, table.io(),
+        fileFormat, TARGET_FILE_SIZE);
+
+    writer.close();
+    Assert.assertEquals(0, writer.result().deleteFiles().size());
+    Assert.assertEquals(0, writer.result().referencedDataFiles().size());
+    Assert.assertFalse(writer.result().referencesDataFiles());
+
+    writer.close();
+    Assert.assertEquals(0, writer.result().deleteFiles().size());
+    Assert.assertEquals(0, writer.result().referencedDataFiles().size());
+    Assert.assertFalse(writer.result().referencesDataFiles());
+  }
+
+  @Test
+  public void testClusteredEqualityDeleteWriterMultipleSpecs() throws IOException {
+    Assume.assumeFalse("ORC delete files are not supported", fileFormat == FileFormat.ORC);
+
+    List<Integer> equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId());
+    Schema equalityDeleteRowSchema = table.schema().select("id");
+    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema);
+
+    // add an unpartitioned data file
+    ImmutableList<T> rows1 = ImmutableList.of(
+        toRow(1, "aaa"),
+        toRow(2, "aaa"),
+        toRow(11, "aaa")
+    );
+    DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null);
+    table.newFastAppend()
+        .appendFile(dataFile1)
+        .commit();
+
+    // partition by bucket
+    table.updateSpec()
+        .addField(Expressions.bucket("data", 16))
+        .commit();
+
+    // add a data file partitioned by bucket
+    ImmutableList<T> rows2 = ImmutableList.of(
+        toRow(3, "bbb"),
+        toRow(4, "bbb"),
+        toRow(12, "bbb")
+    );
+    DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb"));
+    table.newFastAppend()
+        .appendFile(dataFile2)
+        .commit();
+
+    // partition by data
+    table.updateSpec()
+        .removeField(Expressions.bucket("data", 16))
+        .addField(Expressions.ref("data"))
+        .commit();
+
+    // add a data file partitioned by data
+    ImmutableList<T> rows3 = ImmutableList.of(
+        toRow(5, "ccc"),
+        toRow(13, "ccc")
+    );
+    DataFile dataFile3 = writeData(writerFactory, fileFactory, rows3, table.spec(), partitionKey(table.spec(), "ccc"));
+    table.newFastAppend()
+        .appendFile(dataFile3)
+        .commit();
+
+    ClusteredEqualityDeleteWriter<T> writer = new ClusteredEqualityDeleteWriter<>(
+        writerFactory, fileFactory, table.io(),
+        fileFormat, TARGET_FILE_SIZE);
+
+    PartitionSpec unpartitionedSpec = table.specs().get(0);
+    PartitionSpec bucketSpec = table.specs().get(1);
+    PartitionSpec identitySpec = table.specs().get(2);
+
+    writer.write(toRow(1, "aaa"), unpartitionedSpec, null);
+    writer.write(toRow(2, "aaa"), unpartitionedSpec, null);
+    writer.write(toRow(3, "bbb"), bucketSpec, partitionKey(bucketSpec, "bbb"));
+    writer.write(toRow(4, "bbb"), bucketSpec, partitionKey(bucketSpec, "bbb"));
+    writer.write(toRow(5, "ccc"), identitySpec, partitionKey(identitySpec, "ccc"));
+
+    writer.close();
+
+    DeleteWriteResult result = writer.result();
+    Assert.assertEquals("Must be 3 delete files", 3, result.deleteFiles().size());
+    Assert.assertEquals("Must not reference data files", 0, writer.result().referencedDataFiles().size());

Review comment:
   What's the difference between this check and the one beneath it?
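For context on the question: testClusteredEqualityDeleteWriterNoRecords earlier in this diff pairs the flagged size check with a boolean referencesDataFiles() assertion, so the check "beneath it" is presumably that boolean variant. Below is a minimal sketch of how the two accessors plausibly relate, assuming the boolean is derived from the referenced-file collection; DeleteWriteResultSketch and its field are hypothetical names, not Iceberg's actual DeleteWriteResult implementation, and only the two method names are taken from the diff above.

import java.util.List;

// Hypothetical sketch: illustrates why "referencedDataFiles().size() == 0"
// and "assertFalse(referencesDataFiles())" would exercise the same underlying
// state if one accessor is derived from the other.
class DeleteWriteResultSketch {
  private final List<CharSequence> referencedDataFiles;

  DeleteWriteResultSketch(List<CharSequence> referencedDataFiles) {
    this.referencedDataFiles = referencedDataFiles;
  }

  // paths of data files referenced by the delete files that were written;
  // equality deletes identify rows by value rather than by file position,
  // so this collection stays empty for them (hence the 0 in the test)
  List<CharSequence> referencedDataFiles() {
    return referencedDataFiles;
  }

  // convenience check derived from the collection above
  boolean referencesDataFiles() {
    return !referencedDataFiles.isEmpty();
  }
}

Under that reading, the two assertions are redundant for equality deletes, which appears to be the thrust of the reviewer's question.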
