SourabhBadhya commented on code in PR #5418:
URL: https://github.com/apache/hive/pull/5418#discussion_r1735981881


##########
iceberg/checkstyle/checkstyle.xml:
##########
@@ -119,6 +119,7 @@
                 java.util.Collections.*,
                 java.util.stream.Collectors.*,
                 org.apache.commons.lang3.Validate.*,
+                org.apache.hadoop.hive.ql.metadata.VirtualColumn.*,

Review Comment:
   Why this change?



##########
iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java:
##########
@@ -350,6 +351,10 @@ public static boolean isCopyOnWriteMode(Context.Operation operation, BinaryOpera
     return RowLevelOperationMode.COPY_ON_WRITE.modeName().equalsIgnoreCase(mode);
   }
 
+  public static boolean isFanoutEnabled(Map<String, String> props) {
+    return PropertyUtil.propertyAsBoolean(props, "write.fanout.enabled", true);

Review Comment:
   Is `write.fanout.enabled` a new property? Maybe consider creating a String constant.
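   A minimal sketch of what that could look like (constant names and placement are hypothetical; if Iceberg already exposes a constant for this key, reusing it would be better):
```java
import java.util.Map;

import org.apache.iceberg.util.PropertyUtil;

// Hypothetical shape only; the actual constant names are up to the PR author.
public final class FanoutProperty {
  static final String WRITE_FANOUT_ENABLED = "write.fanout.enabled";
  static final boolean WRITE_FANOUT_ENABLED_DEFAULT = true;

  private FanoutProperty() {
  }

  public static boolean isFanoutEnabled(Map<String, String> props) {
    return PropertyUtil.propertyAsBoolean(props, WRITE_FANOUT_ENABLED, WRITE_FANOUT_ENABLED_DEFAULT);
  }
}
```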



##########
iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/WriterBuilder.java:
##########
@@ -78,75 +75,58 @@ public WriterBuilder queryId(String newQueryId) {
     return this;
   }
 
-  public WriterBuilder poolSize(int newPoolSize) {
-    this.poolSize = newPoolSize;
-    return this;
-  }
-
   public WriterBuilder operation(Operation newOperation) {
     this.operation = newOperation;
     return this;
   }
 
-  public WriterBuilder isFanoutEnabled(boolean isFanoutEnabled) {
-    this.fanoutEnabled = isFanoutEnabled;
+  public WriterBuilder hasOrdering(boolean inputOrdered) {
+    context.inputOrdered = inputOrdered;
+    if (IcebergTableUtil.isFanoutEnabled(table.properties()) && !inputOrdered) {
+      context.useFanoutWriter = true;
+    }
     return this;
   }
 
-  public WriterBuilder isMergeTask(boolean mergeTaskEnabled) {
-    this.isMergeTask = mergeTaskEnabled;
+  public WriterBuilder isMergeTask(boolean isMergeTaskEnabled) {
+    context.isMergeTask = isMergeTaskEnabled;
     return this;
   }
 
   public HiveIcebergWriter build() {
-    Map<String, String> properties = table.properties();
-
-    String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT);
-    FileFormat dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH));
-
-    String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName);
-    FileFormat deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH));
-
-    long targetFileSize = PropertyUtil.propertyAsLong(table.properties(), TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
-        TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
-
-    boolean skipRowData =
-        Boolean.parseBoolean(properties.getOrDefault(ICEBERG_DELETE_SKIPROWDATA, ICEBERG_DELETE_SKIPROWDATA_DEFAULT));
-
-    Schema dataSchema = table.schema();
-    FileIO io = table.io();
-    Map<Integer, PartitionSpec> specs = table.specs();
-    int currentSpecId = table.spec().specId();
     int partitionId = attemptID.getTaskID().getId();
     int taskId = attemptID.getId();
     String operationId = queryId + "-" + attemptID.getJobID() + "-" + operationNum.incrementAndGet();
-    OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
-        .format(dataFileFormat)
-        .operationId("data-" + operationId)
+
+    OutputFileFactory dataFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
+        .format(context.dataFileFormat())
+        .operationId(operationId)
         .build();
 
-    OutputFileFactory deleteOutputFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
-        .format(deleteFileFormat)
-        .operationId("delete-" + operationId)
+    OutputFileFactory deleteFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
+        .format(context.deleteFileFormat())
+        .operationId(operationId)
+        .suffix("deletes")

Review Comment:
   What's this suffix? Is it used in the file name?
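   For illustration only, a hypothetical layout showing how a per-writer suffix could keep delete files distinguishable from data files written by the same task (the real naming is decided by `OutputFileFactory`, so treat the pattern below as an assumption):
```java
// Hypothetical file-name layout, not the actual OutputFileFactory implementation.
public class SuffixIllustration {
  static String fileName(int partitionId, long taskId, String operationId,
      int fileCount, String suffix, String format) {
    String base = String.format("%05d-%d-%s-%05d", partitionId, taskId, operationId, fileCount);
    return (suffix == null ? base : base + "-" + suffix) + "." + format;
  }

  public static void main(String[] args) {
    // Same task and operation id; the two files are told apart only by the suffix.
    System.out.println(fileName(0, 3, "query1-job1-1", 1, null, "parquet"));
    System.out.println(fileName(0, 3, "query1-job1-1", 1, "deletes", "parquet"));
  }
}
```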



##########
iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterBase.java:
##########
@@ -77,4 +86,39 @@ protected PartitionKey partition(Record row, int specId) {
     partitionKey.partition(wrapper.wrap(row));
     return partitionKey;
   }
+
+  // use a fanout writer only if enabled and the input is unordered and the table is partitioned
+  static PartitioningWriter<Record, DataWriteResult> newDataWriter(
+      Table table, HiveFileWriterFactory writers, OutputFileFactory files, Context context) {
+
+    FileIO io = table.io();
+    boolean useFanoutWriter = context.useFanoutWriter();
+    long targetFileSize = context.targetDataFileSize();
+
+    if (table.spec().isPartitioned() && useFanoutWriter) {

Review Comment:
   Fanout writers should be used only when the table is partitioned? Is this Iceberg/Spark-induced behaviour?
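   For reference, a hypothetical sketch of the selection rule being questioned (not the PR's code, just my reading of it): fanout only applies when the table has multiple partitions and the incoming rows are not clustered by partition; an unpartitioned table writes a single stream, so there is nothing to fan out over.
```java
// Hypothetical sketch of the rule under discussion, not the PR's implementation.
public class WriterSelectionSketch {
  enum WriterKind { FANOUT_PARTITIONED, CLUSTERED_PARTITIONED, UNPARTITIONED }

  static WriterKind choose(boolean isPartitioned, boolean fanoutEnabled, boolean inputOrdered) {
    if (!isPartitioned) {
      return WriterKind.UNPARTITIONED;        // single output stream, no fanout needed
    }
    if (fanoutEnabled && !inputOrdered) {
      return WriterKind.FANOUT_PARTITIONED;   // keeps an open writer per partition
    }
    return WriterKind.CLUSTERED_PARTITIONED;  // assumes rows arrive clustered by partition key
  }
}
```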



##########
iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java:
##########
@@ -41,10 +41,8 @@
 import org.apache.iceberg.mr.mapred.Container;
 import org.apache.parquet.hadoop.ParquetOutputFormat;
 
-public class HiveIcebergOutputFormat<T> implements OutputFormat<NullWritable, Container<Record>>,
+public class HiveIcebergOutputFormat implements OutputFormat<NullWritable, Container<Record>>,
     HiveOutputFormat<NullWritable, Container<Record>> {
-  private static final String DELETE_FILE_THREAD_POOL_SIZE = "iceberg.delete.file.thread.pool.size";
-  private static final int DELETE_FILE_THREAD_POOL_SIZE_DEFAULT = 10;

Review Comment:
   Are we still supporting this config? Removing these configs might result in backward incompatibility.
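   If backward compatibility is the concern, one option (a sketch only, assuming the Hadoop `Configuration` is still reachable wherever the pool gets sized; key and default copied from the removed lines) is to keep honoring the old key:
```java
import org.apache.hadoop.conf.Configuration;

// Sketch: keep reading the legacy key so jobs that already set it keep their behaviour.
public class LegacyDeletePoolSize {
  private static final String DELETE_FILE_THREAD_POOL_SIZE = "iceberg.delete.file.thread.pool.size";
  private static final int DELETE_FILE_THREAD_POOL_SIZE_DEFAULT = 10;

  static int deleteFileThreadPoolSize(Configuration conf) {
    return conf.getInt(DELETE_FILE_THREAD_POOL_SIZE, DELETE_FILE_THREAD_POOL_SIZE_DEFAULT);
  }
}
```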



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

