[GitHub] [iceberg] aokolnychyi commented on a change in pull request #1525: Provide API and Implementation for Creating Iceberg Tables from Spark

GitBox Wed, 18 Nov 2020 15:23:00 -0800


aokolnychyi commented on a change in pull request #1525:
URL: https://github.com/apache/iceberg/pull/1525#discussion_r526479359




##########
File path: 
spark3/src/main/java/org/apache/iceberg/actions/Spark3CreateAction.java
##########
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.actions;
+
+import java.util.Arrays;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import jline.internal.Log;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.SnapshotSummary;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.SparkCatalog;
+import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.spark.SparkSessionCatalog;
+import org.apache.iceberg.spark.SparkTableUtil;
+import org.apache.iceberg.spark.source.SparkTable;
+import org.apache.spark.sql.AnalysisException;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException;
+import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException;
+import org.apache.spark.sql.catalyst.catalog.CatalogTable;
+import org.apache.spark.sql.catalyst.catalog.CatalogUtils;
+import org.apache.spark.sql.connector.catalog.CatalogPlugin;
+import org.apache.spark.sql.connector.catalog.Identifier;
+import org.apache.spark.sql.connector.catalog.StagedTable;
+import org.apache.spark.sql.connector.catalog.StagingTableCatalog;
+import org.apache.spark.sql.connector.catalog.TableCatalog;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.collection.JavaConverters;
+
+/**
+ * This action will migrate a known table in a Spark Catalog that is not an Iceberg table into an Iceberg table.
+ * The newly created table will be able to interact with and modify files in the original table.
+ *
+ * There are two main code paths:
+ *   - Creating a brand new Iceberg table or replacing an existing Iceberg table.
+ *   This pathway will use a staged table to stage the creation or replacement, only committing after
+ *   the import has succeeded.
+ *
+ *   - Replacing a table in the Session Catalog with an Iceberg table of the same name.
+ *   This pathway will first create a temporary table with a different name. This replacement table will
+ *   be committed upon a successful import. Then the original session catalog entry will be dropped
+ *   and the new replacement table renamed to take its place.
+ */
+class Spark3CreateAction implements CreateAction {
+  private static final Logger LOG = 
LoggerFactory.getLogger(Spark3CreateAction.class);
+  // NOTE(review): presumably the source table providers this action accepts; the check that
+  // consults this set is not visible in this hunk -- confirm.
+  private static final Set<String> ALLOWED_SOURCES = 
ImmutableSet.of("parquet", "avro", "orc", "hive");
+  // Folder name appended to the source table location to form the default metadata location.
+  private static final String ICEBERG_METADATA_FOLDER = "metadata";
+  // NOTE(review): presumably used to name the temporary replacement table described in the
+  // class Javadoc; its use is not visible in this hunk -- confirm.
+  private static final String REPLACEMENT_NAME = "_REPLACEMENT_";
+
+  private final SparkSession spark;
+
+  // Source Fields: all resolved once in the constructor.
+  // Session catalog metadata of the source table.
+  private final CatalogTable sourceTable;
+  // String form of the source table's storage location URI.
+  private final String sourceTableLocation;
+  private final CatalogPlugin sourceCatalog;
+  private final Identifier sourceTableName;
+  // Partition spec derived from the source table via SparkSchemaUtil.specForTable.
+  private final PartitionSpec sourcePartitionSpec;
+
+  // Destination Fields
+  // True when the source is not an Iceberg table and the destination is the same catalog
+  // instance and the same identifier as the source (see isSessionCatalogReplacement).
+  private final Boolean sessionCatalogReplacement;
+  private final CatalogPlugin destCatalog;
+  private final Identifier destTableName;
+
+  // Optional Parameters for destination
+  // Defaults to the source table location.
+  private String destDataLocation;
+  // Defaults to the "metadata" folder under the source table location.
+  private String destMetadataLocation;
+  private Map<String, String> additionalProperties = Maps.newHashMap();
+
+  /**
+   * Builds an action that creates an Iceberg table in the destination catalog from an existing Spark table.
+   * <p>
+   * The source table's partition spec and session catalog metadata are resolved eagerly, so a source table
+   * that cannot be analyzed or found is rejected here. The destination data location defaults to the source
+   * table location, and the destination metadata location to the "metadata" folder beneath it.
+   *
+   * @param spark the Spark session used to resolve the source table
+   * @param sourceCatalog the catalog holding the source table; passed through checkSourceCatalog
+   * @param sourceTableName identifier of the source table
+   * @param destCatalog the catalog in which the Iceberg table is created
+   * @param destTableName identifier of the table to create
+   * @throws IllegalArgumentException if the partitioning of the source table cannot be determined or the
+   *     source table cannot be found in the session catalog
+   */
+  Spark3CreateAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableName,
+                     CatalogPlugin destCatalog, Identifier destTableName) {
+
+    this.spark = spark;
+    this.sourceCatalog = checkSourceCatalog(sourceCatalog);
+    this.sourceTableName = sourceTableName;
+    this.destCatalog = destCatalog;
+    this.destTableName = destTableName;
+
+    try {
+      // specForTable takes a dotted table name, so flatten the namespace and name into one string
+      String sourceString = String.join(".", sourceTableName.namespace()) + "." + sourceTableName.name();
+      this.sourcePartitionSpec = SparkSchemaUtil.specForTable(spark, sourceString);
+    } catch (AnalysisException e) {
+      throw new IllegalArgumentException("Cannot determine partitioning of " + sourceTableName, e);
+    }
+
+    try {
+      this.sourceTable =
+          spark.sessionState().catalog().getTableMetadata(Spark3Util.toTableIdentifier(sourceTableName));
+    } catch (NoSuchTableException | NoSuchDatabaseException e) {
+      throw new IllegalArgumentException(String.format("Could not find source table %s", sourceTableName), e);
+    }
+    validateSourceTable(sourceTable);
+
+    // Must run after sourceTable is assigned: the check reads the source table's provider
+    this.sessionCatalogReplacement = isSessionCatalogReplacement();
+
+    this.sourceTableLocation = CatalogUtils.URIToString(sourceTable.storage().locationUri().get());
+    this.destDataLocation = sourceTableLocation;
+    this.destMetadataLocation = sourceTableLocation + "/" + ICEBERG_METADATA_FOLDER;
+  }
+
+  /**
+   * Reports whether this action replaces a non-Iceberg table with a table of the same identifier in the
+   * same catalog instance.
+   *
+   * @return true only if the source provider is not "iceberg", the source and destination catalogs are
+   *     the same object, and the table names and namespaces are equal
+   */
+  private boolean isSessionCatalogReplacement() {
+    String sourceProvider = sourceTable.provider().get().toLowerCase(Locale.ROOT);
+    boolean sourceIsIceberg = "iceberg".equals(sourceProvider);
+    // Reference comparison: both sides must be the very same catalog plugin instance
+    boolean catalogsMatch = sourceCatalog == destCatalog;
+    boolean identifiersMatch = sourceTableName.name().equals(destTableName.name()) &&
+        Arrays.equals(sourceTableName.namespace(), destTableName.namespace());
+
+    if (sourceIsIceberg) {
+      return false;
+    }
+    return catalogsMatch && identifiersMatch;
+  }
+
+
+  /**
+   * Creates the Iceberg data and metadata at a given location instead of the source table
+   * location. New metadata and data files will be added to this
+   * new location and further operations will not affect the source table.
+   *
+   * @param newLocation the base directory for the new Iceberg Table
+   * @return this for chaining
+   */
+  CreateAction asSnapshotAtLocation(String newLocation) {
+    Preconditions.checkArgument(!newLocation.equals(sourceTableLocation), 
"Cannot create a snapshot with the" +
+        "same data location as the source table. To place new files in the 
source table directory use the migrate " +
+        "command.");
+    this.destDataLocation = newLocation;

Review comment:
       I am not sure this is correct. Shouldn't we be assigning the table 
location, not data location?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] aokolnychyi commented on a change in pull request #1525: Provide API and Implementation for Creating Iceberg Tables from Spark

Reply via email to