jackye1995 commented on code in PR #6449:
URL: https://github.com/apache/iceberg/pull/6449#discussion_r1064836123


##########
delta-lake/src/main/java/org/apache/iceberg/delta/BaseSnapshotDeltaLakeTableAction.java:
##########
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.delta;
+
+import io.delta.standalone.DeltaLog;
+import io.delta.standalone.VersionLog;
+import io.delta.standalone.actions.Action;
+import io.delta.standalone.actions.AddFile;
+import io.delta.standalone.actions.RemoveFile;
+import java.io.File;
+import java.net.URI;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.AppendFiles;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DataFiles;
+import org.apache.iceberg.DeleteFiles;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Metrics;
+import org.apache.iceberg.MetricsConfig;
+import org.apache.iceberg.OverwriteFiles;
+import org.apache.iceberg.PartitionField;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.SnapshotSummary;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.Transaction;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.catalog.Catalog;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.exceptions.ValidationException;
+import org.apache.iceberg.hadoop.HadoopFileIO;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.mapping.NameMapping;
+import org.apache.iceberg.mapping.NameMappingParser;
+import org.apache.iceberg.orc.OrcMetrics;
+import org.apache.iceberg.parquet.ParquetUtil;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.types.Type;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Takes a Delta Lake table's location and attempts to create an Iceberg table snapshot in an
+ * optional user-specified location (defaults to the Delta Lake table's location) with a different
+ * identifier.
+ */
+public class BaseSnapshotDeltaLakeTableAction implements 
SnapshotDeltaLakeTable {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(BaseSnapshotDeltaLakeTableAction.class);
+
+  private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source";
+  private static final String DELTA_SOURCE_VALUE = "delta";
+  private static final String ORIGINAL_LOCATION_PROP = "original_location";
+  private static final String PARQUET_SUFFIX = ".parquet";
+  private static final String AVRO_SUFFIX = ".avro";
+  private static final String ORC_SUFFIX = ".orc";
+  private final ImmutableMap.Builder<String, String> 
additionalPropertiesBuilder =
+      ImmutableMap.builder();
+  private final DeltaLog deltaLog;
+  private final Catalog icebergCatalog;
+  private final String deltaTableLocation;
+  private final TableIdentifier newTableIdentifier;
+  private String newTableLocation;
+  private final HadoopFileIO deltaLakeFileIO;
+
+  /**
+   * Snapshot a delta lake table to be an iceberg table. The action will read 
the delta lake table's
+   * log through the table's path, create a new iceberg table using the given 
icebergCatalog and
+   * newTableIdentifier, and commit all changes in one iceberg transaction.
+   *
+   * <p>The new table will only be created if the snapshot is successful.
+   *
+   * @param icebergCatalog the iceberg catalog to create the iceberg table
+   * @param deltaTableLocation the delta lake table's path
+   * @param newTableIdentifier the identifier of the new iceberg table
+   * @param deltaLakeConfiguration the hadoop configuration to access the 
delta lake table
+   */
+  public BaseSnapshotDeltaLakeTableAction(
+      Catalog icebergCatalog,
+      String deltaTableLocation,
+      TableIdentifier newTableIdentifier,
+      Configuration deltaLakeConfiguration) {
+    this.icebergCatalog = icebergCatalog;
+    this.deltaTableLocation = deltaTableLocation;
+    this.newTableIdentifier = newTableIdentifier;
+    this.newTableLocation = deltaTableLocation;
+    this.deltaLog = DeltaLog.forTable(deltaLakeConfiguration, 
deltaTableLocation);
+    this.deltaLakeFileIO = new HadoopFileIO(deltaLakeConfiguration);
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable tableProperties(Map<String, String> 
properties) {
+    additionalPropertiesBuilder.putAll(properties);
+    return this;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable tableProperty(String name, String value) {
+    additionalPropertiesBuilder.put(name, value);
+    return this;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable tableLocation(String location) {
+    this.newTableLocation = location;
+    return this;
+  }
+
+  @Override
+  public Result execute() {
+    io.delta.standalone.Snapshot updatedSnapshot = deltaLog.update();
+    Schema schema = 
convertDeltaLakeSchema(updatedSnapshot.getMetadata().getSchema());
+    PartitionSpec partitionSpec = getPartitionSpecFromDeltaSnapshot(schema);
+    Transaction icebergTransaction =
+        icebergCatalog.newCreateTableTransaction(
+            newTableIdentifier,
+            schema,
+            partitionSpec,
+            newTableLocation,
+            destTableProperties(updatedSnapshot, deltaTableLocation));
+
+    long totalDataFiles = copyFromDeltaLakeToIceberg(icebergTransaction);
+    icebergTransaction.commitTransaction();
+    LOG.info(
+        "Successfully loaded Iceberg metadata for {} files in {}",
+        totalDataFiles,
+        deltaTableLocation);
+    return new BaseSnapshotDeltaLakeTableActionResult(totalDataFiles);
+  }
+
+  private Schema convertDeltaLakeSchema(io.delta.standalone.types.StructType 
deltaSchema) {
+    Type converted =
+        DeltaLakeDataTypeVisitor.visit(deltaSchema, new 
DeltaLakeTypeToType(deltaSchema));
+    return new Schema(converted.asNestedType().asStructType().fields());
+  }
+
+  private PartitionSpec getPartitionSpecFromDeltaSnapshot(Schema schema) {
+    List<String> partitionNames = 
deltaLog.snapshot().getMetadata().getPartitionColumns();
+    if (partitionNames.isEmpty()) {
+      return PartitionSpec.unpartitioned();
+    }
+
+    PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);
+    for (String partitionName : partitionNames) {
+      builder.identity(partitionName);
+    }
+    return builder.build();
+  }
+
+  private long copyFromDeltaLakeToIceberg(Transaction transaction) {
+    Iterator<VersionLog> versionLogIterator =
+        deltaLog.getChanges(
+            0, // retrieve actions starting from the initial version
+            false); // do not throw an exception when data loss is detected
+
+    while (versionLogIterator.hasNext()) {
+      VersionLog versionLog = versionLogIterator.next();
+      commitDeltaVersionLogToIcebergTransaction(versionLog, transaction);
+    }
+
+    return Long.parseLong(
+        
transaction.table().currentSnapshot().summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
+  }
+
+  private void commitDeltaVersionLogToIcebergTransaction(
+      VersionLog versionLog, Transaction transaction) {
+    List<Action> actions = versionLog.getActions();
+
+    // We first need to iterate through to see what kind of transaction this 
was. There are 3

Review Comment:
   nit: we can move these comments into the Javadoc for this method instead of
   keeping them as inline comments within the method body



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to