stevenzwu commented on code in PR #12298:
URL: https://github.com/apache/iceberg/pull/12298#discussion_r2507603078
##########
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java:
##########
@@ -29,9 +30,10 @@
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
-abstract class FlinkSchemaVisitor<T> {
+@Internal
+public abstract class FlinkSchemaVisitor<T> {
Review Comment:
why is this class public now? I don't see any new usage of it in this PR.
##########
spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java:
##########
@@ -67,75 +63,36 @@ protected CloseableIterable<ColumnarBatch> newBatchIterable(
Expression residual,
Map<Integer, ?> idToConstant,
SparkDeleteFilter deleteFilter) {
- switch (format) {
- case PARQUET:
- return newParquetIterable(inputFile, start, length, residual, idToConstant, deleteFilter);
-
- case ORC:
- return newOrcIterable(inputFile, start, length, residual, idToConstant);
-
- default:
- throw new UnsupportedOperationException(
- "Format: " + format + " not supported for batched reads");
+ Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : expectedSchema();
+ ReadBuilder readBuilder =
+ FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile);
+ if (parquetConf != null) {
+ readBuilder =
+ readBuilder
+ .recordsPerBatch(parquetConf.batchSize())
+ .set(
+ VectorizedSparkParquetReaders.PARQUET_READER_TYPE,
+ parquetConf.readerType().name());
+ } else if (orcConf != null) {
+ readBuilder = readBuilder.recordsPerBatch(orcConf.batchSize());
}
- }
- private CloseableIterable<ColumnarBatch> newParquetIterable(
- InputFile inputFile,
- long start,
- long length,
- Expression residual,
- Map<Integer, ?> idToConstant,
- SparkDeleteFilter deleteFilter) {
- // get required schema if there are deletes
- Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : expectedSchema();
+ if (readBuilder instanceof ParquetFormatModel.SupportsDeleteFilter<?>) {
Review Comment:
it is a bit weird to use `ParquetFormatModel.SupportsDeleteFilter` here. If it is Parquet-specific, it should probably be used inside the `if` block on lines 69-76. But if ORC can support delete filters for the batch reader, maybe the `SupportsDeleteFilter` interface should be more generic and shared.
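For illustration, a shared version could live next to the generic builder APIs instead of inside `ParquetFormatModel`. A minimal sketch, with hypothetical names (not part of this PR):

```java
// Hypothetical format-agnostic capability interface: a ReadBuilder for any
// file format that can push down delete filters would implement it, so the
// Spark reader would not need a Parquet-specific instanceof check.
public interface SupportsDeleteFilter<F> {
  // attaches the engine-side delete filter to the reader being built
  void deleteFilter(F filter);
}
```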
##########
spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java:
##########
@@ -105,119 +124,106 @@ class SparkFileWriterFactory extends BaseFileWriterFactory<InternalRow> {
super(
table,
dataFileFormat,
+ InternalRow.class,
dataSchema,
dataSortOrder,
deleteFileFormat,
equalityFieldIds,
equalityDeleteRowSchema,
equalityDeleteSortOrder,
- ImmutableMap.of());
+ writeProperties,
+ calculateSparkType(dataSparkType, dataSchema),
+ calculateSparkType(equalityDeleteSparkType, equalityDeleteRowSchema));
- this.dataSparkType = dataSparkType;
- this.equalityDeleteSparkType = equalityDeleteSparkType;
- this.positionDeleteSparkType = null;
+ this.table = table;
+ this.format = dataFileFormat;
this.writeProperties = writeProperties != null ? writeProperties : ImmutableMap.of();
+ this.positionDeleteRowSchema = null;
+ this.useDeprecatedPositionDeleteWriter = false;
}
static Builder builderFor(Table table) {
return new Builder(table);
}
- @Override
- protected void configureDataWrite(Avro.DataWriteBuilder builder) {
- builder.createWriterFunc(ignored -> new SparkAvroWriter(dataSparkType()));
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) {
- builder.createWriterFunc(ignored -> new SparkAvroWriter(equalityDeleteSparkType()));
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) {
- boolean withRow =
- positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined();
- if (withRow) {
- // SparkAvroWriter accepts just the Spark type of the row ignoring the path and pos
- StructField rowField = positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME);
- StructType positionDeleteRowSparkType = (StructType) rowField.dataType();
- builder.createWriterFunc(ignored -> new SparkAvroWriter(positionDeleteRowSparkType));
- }
-
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configureDataWrite(Parquet.DataWriteBuilder builder) {
- builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(dataSparkType(), msgType));
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) {
- builder.createWriterFunc(
- msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType));
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) {
- builder.createWriterFunc(
- msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType));
- builder.transformPaths(path -> UTF8String.fromString(path.toString()));
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configureDataWrite(ORC.DataWriteBuilder builder) {
- builder.createWriterFunc(SparkOrcWriter::new);
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) {
- builder.createWriterFunc(SparkOrcWriter::new);
- builder.setAll(writeProperties);
- }
-
- @Override
- protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) {
- builder.createWriterFunc(SparkOrcWriter::new);
- builder.transformPaths(path -> UTF8String.fromString(path.toString()));
- builder.setAll(writeProperties);
- }
-
- private StructType dataSparkType() {
- if (dataSparkType == null) {
- Preconditions.checkNotNull(dataSchema(), "Data schema must not be null");
- this.dataSparkType = SparkSchemaUtil.convert(dataSchema());
- }
-
- return dataSparkType;
- }
-
- private StructType equalityDeleteSparkType() {
- if (equalityDeleteSparkType == null) {
- Preconditions.checkNotNull(
- equalityDeleteRowSchema(), "Equality delete schema must not be null");
- this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema());
- }
-
- return equalityDeleteSparkType;
- }
-
private StructType positionDeleteSparkType() {
if (positionDeleteSparkType == null) {
// wrap the optional row schema into the position delete schema containing path and position
- Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema());
+ Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema);
this.positionDeleteSparkType = SparkSchemaUtil.convert(positionDeleteSchema);
}
return positionDeleteSparkType;
}
+ @Override
+ public PositionDeleteWriter<InternalRow> newPositionDeleteWriter(
+ EncryptedOutputFile file, PartitionSpec spec, StructLike partition) {
+ if (!useDeprecatedPositionDeleteWriter) {
+ return super.newPositionDeleteWriter(file, spec, partition);
+ } else {
+ LOG.info(
+ "Deprecated feature used. Position delete row schema is used to
create the position delete writer.");
+ MetricsConfig metricsConfig =
+ table != null
+ ? MetricsConfig.forPositionDelete(table)
+ : MetricsConfig.fromProperties(ImmutableMap.of());
+
+ try {
+ switch (format) {
+ case AVRO:
+ StructType positionDeleteRowSparkType =
+ (StructType) positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME).dataType();
+
+ return Avro.writeDeletes(file)
+ .createWriterFunc(ignored -> new SparkAvroWriter(positionDeleteRowSparkType))
+ .withPartition(partition)
+ .overwrite()
+ .rowSchema(positionDeleteRowSchema)
+ .withSpec(spec)
+ .withKeyMetadata(file.keyMetadata())
+ .setAll(writeProperties)
+ .metricsConfig(metricsConfig)
+ .buildPositionWriter();
+
+ case ORC:
+ return ORC.writeDeletes(file)
+ .createWriterFunc(SparkOrcWriter::new)
+ .transformPaths(path -> UTF8String.fromString(path.toString()))
+ .withPartition(partition)
+ .overwrite()
+ .rowSchema(positionDeleteRowSchema)
+ .withSpec(spec)
+ .withKeyMetadata(file.keyMetadata())
+ .setAll(writeProperties)
+ .metricsConfig(metricsConfig)
+ .buildPositionWriter();
+
+ case PARQUET:
+ return Parquet.writeDeletes(file)
+ .createWriterFunc(
+ msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType))
+ .transformPaths(path -> UTF8String.fromString(path.toString()))
+ .withPartition(partition)
+ .overwrite()
+ .metricsConfig(metricsConfig)
+ .rowSchema(positionDeleteRowSchema)
+ .withSpec(spec)
+ .withKeyMetadata(file.keyMetadata())
+ .setAll(writeProperties)
+ .metricsConfig(metricsConfig)
+ .buildPositionWriter();
+
+ default:
+ throw new UnsupportedOperationException(
+ "Cannot write pos-deletes for unsupported file format: " +
format);
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
Review Comment:
nit: add an error message to the exception.
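For example, a minimal sketch (exact wording is up to you):

```java
} catch (IOException e) {
  throw new UncheckedIOException("Failed to create position delete writer", e);
}
```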
##########
core/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java:
##########
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.data;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.io.UncheckedIOException;
+import java.util.Map;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.MetricsConfig;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.SortOrder;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.deletes.EqualityDeleteWriter;
+import org.apache.iceberg.deletes.PositionDeleteWriter;
+import org.apache.iceberg.encryption.EncryptedOutputFile;
+import org.apache.iceberg.encryption.EncryptionKeyMetadata;
+import org.apache.iceberg.formats.DataWriteBuilder;
+import org.apache.iceberg.formats.EqualityDeleteWriteBuilder;
+import org.apache.iceberg.formats.FormatModelRegistry;
+import org.apache.iceberg.formats.PositionDeleteWriteBuilder;
+import org.apache.iceberg.io.DataWriter;
+import org.apache.iceberg.io.FileWriterFactory;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+
+/**
+ * A base writer factory to be extended by query engine integrations.
+ *
+ * @param <T> row type
+ * @param <S> engine-native schema type
+ */
+public abstract class RegistryBasedFileWriterFactory<T, S>
+ implements FileWriterFactory<T>, Serializable {
+ private final Table table;
+ private final FileFormat dataFileFormat;
+ private final Class<T> inputType;
+ private final Schema dataSchema;
+ private final SortOrder dataSortOrder;
+ private final FileFormat deleteFileFormat;
+ private final int[] equalityFieldIds;
+ private final Schema equalityDeleteRowSchema;
+ private final SortOrder equalityDeleteSortOrder;
+ private final Map<String, String> writerProperties;
+ private final S inputSchema;
+ private final S equalityDeleteInputSchema;
+
+ protected RegistryBasedFileWriterFactory(
+ Table table,
+ FileFormat dataFileFormat,
+ Class<T> inputType,
+ Schema dataSchema,
+ SortOrder dataSortOrder,
+ FileFormat deleteFileFormat,
+ int[] equalityFieldIds,
+ Schema equalityDeleteRowSchema,
+ SortOrder equalityDeleteSortOrder,
+ Map<String, String> writerProperties,
+ S inputSchema,
+ S equalityDeleteInputSchema) {
+ this.table = table;
+ this.dataFileFormat = dataFileFormat;
+ this.inputType = inputType;
+ this.dataSchema = dataSchema;
+ this.dataSortOrder = dataSortOrder;
+ this.deleteFileFormat = deleteFileFormat;
+ this.equalityFieldIds = equalityFieldIds;
+ this.equalityDeleteRowSchema = equalityDeleteRowSchema;
+ this.equalityDeleteSortOrder = equalityDeleteSortOrder;
+ this.writerProperties = writerProperties != null ? writerProperties : ImmutableMap.of();
+ this.inputSchema = inputSchema;
+ this.equalityDeleteInputSchema = equalityDeleteInputSchema;
+ }
+
+ protected S inputSchema() {
+ return inputSchema;
+ }
+
+ protected S equalityDeleteInputSchema() {
+ return equalityDeleteInputSchema;
+ }
+
+ @Override
+ public DataWriter<T> newDataWriter(
+ EncryptedOutputFile file, PartitionSpec spec, StructLike partition) {
+ Preconditions.checkNotNull(dataSchema, "Data schema must not be null");
+ EncryptionKeyMetadata keyMetadata = file.keyMetadata();
+ Map<String, String> properties = table != null ? table.properties() : ImmutableMap.of();
+ MetricsConfig metricsConfig =
+ table != null ? MetricsConfig.forTable(table) : MetricsConfig.getDefault();
+
+ try {
+ DataWriteBuilder<T, S> builder =
+ FormatModelRegistry.dataWriteBuilder(dataFileFormat, inputType, file);
+ return builder
+ .schema(dataSchema)
+ .inputSchema(inputSchema())
+ .setAll(properties)
+ .setAll(writerProperties)
+ .metricsConfig(metricsConfig)
+ .spec(spec)
+ .partition(partition)
+ .keyMetadata(keyMetadata)
+ .sortOrder(dataSortOrder)
+ .overwrite()
+ .build();
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
Review Comment:
nit: add an error message to the exception.
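Same pattern as above; the message could also name the output file. A sketch (using the existing `file.encryptingOutputFile().location()` accessor):

```java
} catch (IOException e) {
  throw new UncheckedIOException(
      "Failed to create data writer for file: " + file.encryptingOutputFile().location(), e);
}
```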
##########
data/src/main/java/org/apache/iceberg/data/GenericFileWriterFactory.java:
##########
@@ -107,62 +130,166 @@ public class GenericFileWriterFactory extends BaseFileWriterFactory<Record> {
super(
table,
dataFileFormat,
+ Record.class,
dataSchema,
dataSortOrder,
deleteFileFormat,
equalityFieldIds,
equalityDeleteRowSchema,
equalityDeleteSortOrder,
- positionDeleteRowSchema);
+ ImmutableMap.of(),
+ dataSchema,
+ equalityDeleteRowSchema);
+ this.table = table;
+ this.format = dataFileFormat;
+ this.positionDeleteRowSchema = positionDeleteRowSchema;
}
static Builder builderFor(Table table) {
return new Builder(table);
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configureDataWrite(Avro.DataWriteBuilder builder) {
- builder.createWriterFunc(DataWriter::create);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) {
- builder.createWriterFunc(DataWriter::create);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) {
- builder.createWriterFunc(DataWriter::create);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configureDataWrite(Parquet.DataWriteBuilder builder) {
- builder.createWriterFunc(GenericParquetWriter::create);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) {
- builder.createWriterFunc(GenericParquetWriter::create);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) {
- builder.createWriterFunc(GenericParquetWriter::create);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configureDataWrite(ORC.DataWriteBuilder builder) {
- builder.createWriterFunc(GenericOrcWriter::buildWriter);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) {
- builder.createWriterFunc(GenericOrcWriter::buildWriter);
+ throwUnsupportedOperationException();
}
- @Override
+ /**
+ * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the
+ * configuration is done by the {@link FormatModelRegistry}.
+ */
+ @Deprecated
protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) {
- builder.createWriterFunc(GenericOrcWriter::buildWriter);
+ throwUnsupportedOperationException();
+ }
+
+ private void throwUnsupportedOperationException() {
+ throw new UnsupportedOperationException(
+ "Method is deprecated and should not be called. "
+ + "Configuration is already done by the registry.");
+ }
+
+ @Override
+ public PositionDeleteWriter<Record> newPositionDeleteWriter(
+ EncryptedOutputFile file, PartitionSpec spec, StructLike partition) {
+ if (positionDeleteRowSchema == null) {
+ return super.newPositionDeleteWriter(file, spec, partition);
+ } else {
+ LOG.info(
+ "Deprecated feature used. Position delete row schema is used to
create the position delete writer.");
+ MetricsConfig metricsConfig =
+ table != null
+ ? MetricsConfig.forPositionDelete(table)
+ : MetricsConfig.fromProperties(ImmutableMap.of());
+
+ try {
+ switch (format) {
+ case AVRO:
+ return Avro.writeDeletes(file)
+ .createWriterFunc(DataWriter::create)
+ .withPartition(partition)
+ .overwrite()
+ .rowSchema(positionDeleteRowSchema)
+ .withSpec(spec)
+ .withKeyMetadata(file.keyMetadata())
+ .buildPositionWriter();
+
+ case ORC:
+ return ORC.writeDeletes(file)
+ .createWriterFunc(GenericOrcWriter::buildWriter)
+ .withPartition(partition)
+ .overwrite()
+ .rowSchema(positionDeleteRowSchema)
+ .withSpec(spec)
+ .withKeyMetadata(file.keyMetadata())
+ .buildPositionWriter();
+
+ case PARQUET:
+ return Parquet.writeDeletes(file)
+ .createWriterFunc(GenericParquetWriter::create)
+ .withPartition(partition)
+ .overwrite()
+ .metricsConfig(metricsConfig)
+ .rowSchema(positionDeleteRowSchema)
+ .withSpec(spec)
+ .withKeyMetadata(file.keyMetadata())
+ .buildPositionWriter();
+
+ default:
+ throw new UnsupportedOperationException(
+ "Cannot write pos-deletes for unsupported file format: " +
format);
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
Review Comment:
nit: add an error message to the exception (same as the other catch blocks).
##########
parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java:
##########
@@ -302,7 +302,7 @@ WriteBuilder withWriterVersion(WriterVersion version) {
}
// supposed to always be a private method used strictly by data and delete write builders
- private WriteBuilder createContextFunc(
+ protected WriteBuilder createContextFunc(
Review Comment:
what are the reasons for the two `protected` changes in this class?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]