[GitHub] [iceberg] RussellSpitzer commented on a change in pull request #1525: Provide API and Implementation for Creating Iceberg Tables from Spark

GitBox Mon, 12 Oct 2020 13:02:53 -0700


RussellSpitzer commented on a change in pull request #1525:
URL: https://github.com/apache/iceberg/pull/1525#discussion_r503506054




##########
File path: spark3/src/main/java/org/apache/iceberg/spark/MigrateAction.java
##########
@@ -0,0 +1,272 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark;
+
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import jline.internal.Log;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.SnapshotSummary;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.actions.Action;
+import org.apache.iceberg.actions.ExpireSnapshotsAction;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.spark.source.SparkTable;
+import org.apache.spark.sql.AnalysisException;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException;
+import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException;
+import org.apache.spark.sql.catalyst.catalog.CatalogTable;
+import org.apache.spark.sql.catalyst.catalog.CatalogUtils;
+import org.apache.spark.sql.connector.catalog.CatalogManager;
+import org.apache.spark.sql.connector.catalog.CatalogPlugin;
+import org.apache.spark.sql.connector.catalog.Identifier;
+import org.apache.spark.sql.connector.catalog.StagedTable;
+import org.apache.spark.sql.connector.catalog.TableCatalog;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.collection.JavaConverters;
+
+/**
+ * This action will migrate a known table in a Spark Catalog that is not an 
Iceberg table into an Iceberg table.
+ * The created new table will be able to interact with and modify files in the 
original table.
+ *
+ * There are two main code paths
+ *   - Creating a brand new iceberg table or replacing an existing Iceberg 
table
+ *   This pathway will use a staged table to stage the creation or 
replacement, only committing after
+ *   import has succeeded.
+ *
+ *   - Replacing a table in the Session Catalog with an Iceberg Table with the 
same name
+ *   This pathway will create a temporary table with a different name. This 
replacement table will
+ *   be committed upon a successful import. Then the original session catalog 
entry will be dropped
+ *   and the new replacement table renamed to take its place.
+ */
+public class MigrateAction implements Action<Long> {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ExpireSnapshotsAction.class);
+  private static final Set<String> ALLOWED_SOURCES = 
ImmutableSet.of("parquet", "avro", "orc", "hive");
+  private static final String ICEBERG_METADATA_FOLDER = "metadata";
+  private static final String REPLACEMENT_NAME = "_REPLACEMENT_";
+
+
+  private final SparkSession spark;
+
+  // Source Fields
+  private final CatalogTable sourceTable;
+  private final String sourceTableLocation;
+  private final Identifier sourceTableName;
+  private final PartitionSpec sourcePartitionSpec;
+
+  // Destination Fields
+  private final Boolean sessionCatalogReplacement;
+  private final SparkSessionCatalog destCatalog;
+  private final Identifier destTableName;
+
+  // Optional Parameters for destination
+  private String destDataLocation;
+  private String destMetadataLocation;
+  private Map<String, String> additionalProperties = Maps.newHashMap();
+
+  /**
+   * Creates the Iceberg table at a given location instead of using the 
location
+   * provided by the source table. New metadata and data files will be added 
to this
+   * new location and further operations will not affect the source table.
+   *
+   * Use this if you would like to experiment with Iceberg without changing
+   * your original files.
+   * @param newLocation the base directory for the new Iceberg Table
+   * @return this for chaining
+   */
+  public MigrateAction withNewTableLocation(String newLocation) {
+    this.destDataLocation = newLocation;
+    this.destMetadataLocation = newLocation + "/" + ICEBERG_METADATA_FOLDER;
+    return this;
+  }
+
+  /**
+   * Adds additional properties to the newly created Iceberg Table. Any 
properties with
+   * the same key name will be overwritten.
+   * @param properties a map of properties to be included
+   * @return this for chaining
+   */
+  public MigrateAction withAdditionalProperties(Map<String, String> 
properties) {
+    this.additionalProperties.putAll(properties);
+    return this;
+  }
+
+  /**
+   * Adds an additional property to the newly created Iceberg Table. Any 
properties
+   * with the same key name will be overwritten.
+   * @param key the key of the property to add
+   * @param value the value of the property to add
+   */
+  public MigrateAction withAdditionalProperty(String key, String value) {
+    this.additionalProperties.put(key, value);
+    return this;
+  }
+
+  public MigrateAction(SparkSession spark, Identifier destTableName, 
Identifier sourceTableName) {
+    this.spark = spark;
+    this.sourceTableName = sourceTableName;
+    this.destTableName = destTableName;
+    this.destCatalog = getSparkSessionCatalogOrFail(spark, destTableName);
+
+    try {
+      sourcePartitionSpec = SparkSchemaUtil.specForTable(spark, 
Spark3Util.toTableIdentifier(sourceTableName));
+    } catch (AnalysisException e) {
+      throw new IllegalArgumentException("Cannot determining partitioning of " 
+ sourceTableName.toString(), e);
+    }
+
+    try {
+      this.sourceTable =
+        
spark.sessionState().catalog().getTableMetadata(Spark3Util.toTableIdentifier(sourceTableName));
+    } catch (NoSuchTableException | NoSuchDatabaseException e) {
+      throw new IllegalArgumentException(String.format("Could not find source 
table %s", sourceTableName), e);
+    }
+    validateSourceTable(sourceTable, ALLOWED_SOURCES);
+
+    String sourceTableProvider = 
sourceTable.provider().get().toLowerCase(Locale.ROOT);
+    this.sessionCatalogReplacement = !sourceTableProvider.equals("iceberg") && 
sourceTableName.equals(destTableName);
+
+    this.sourceTableLocation = 
CatalogUtils.URIToString(sourceTable.storage().locationUri().get());
+    this.destDataLocation = sourceTableLocation;
+    this.destMetadataLocation = sourceTableLocation + "/" + 
ICEBERG_METADATA_FOLDER;
+  }
+
+  private static void validateSourceTable(CatalogTable sourceTable, 
Set<String> supportedSourceTableProviders) {
+    String sourceTableProvider = 
sourceTable.provider().get().toLowerCase(Locale.ROOT);

Review comment:
       Added a Hive test (at least I think so; it uses CREATE EXTERNAL TABLE ... 
LOCATION ...), which I thought triggered the Hive path. It works for this.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] RussellSpitzer commented on a change in pull request #1525: Provide API and Implementation for Creating Iceberg Tables from Spark

Reply via email to