RussellSpitzer commented on a change in pull request #1525:
URL: https://github.com/apache/iceberg/pull/1525#discussion_r536297083



##########
File path: spark3/src/test/java/org/apache/iceberg/actions/TestCreateActions.java
##########
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.actions;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.filefilter.TrueFileFilter;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.SparkCatalog;
+import org.apache.iceberg.spark.SparkCatalogTestBase;
+import org.apache.iceberg.spark.SparkSessionCatalog;
+import org.apache.iceberg.spark.source.SimpleRecord;
+import org.apache.iceberg.spark.source.SparkTable;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.catalyst.TableIdentifier;
+import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.catalog.CatalogTable;
+import org.apache.spark.sql.catalyst.parser.ParseException;
+import org.apache.spark.sql.connector.catalog.Identifier;
+import org.apache.spark.sql.connector.catalog.TableCatalog;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runners.Parameterized;
+import scala.Some;
+import scala.collection.Seq;
+
+public class TestCreateActions extends SparkCatalogTestBase {
+  private static final String CREATE_PARTITIONED_PARQUET = "CREATE TABLE %s (id INT, data STRING) " +
+      "using parquet PARTITIONED BY (id) LOCATION '%s'";
+  private static final String CREATE_PARQUET = "CREATE TABLE %s (id INT, data STRING) " +
+      "using parquet LOCATION '%s'";
+  private static final String CREATE_HIVE_EXTERNAL_PARQUET = "CREATE EXTERNAL TABLE %s (data STRING) " +
+      "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'";
+  private static final String CREATE_HIVE_PARQUET = "CREATE TABLE %s (data STRING) " +
+      "PARTITIONED BY (id INT) STORED AS parquet";
+
+  private static final String NAMESPACE = "default";
+
+  @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}")
+  public static Object[][] parameters() {
+    return new Object[][] {
+        new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), 
ImmutableMap.of(
+            "type", "hive",
+            "default-namespace", "default",
+            "parquet-enabled", "true",
+            "cache-enabled", "false" // Spark will delete tables using v1, 
leaving the cache out of sync
+        )},
+        new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), 
ImmutableMap.of(
+            "type", "hadoop",
+            "default-namespace", "default",
+            "parquet-enabled", "true",
+            "cache-enabled", "false" // Spark will delete tables using v1, 
leaving the cache out of sync
+        )},
+        new Object[] { "testhive", SparkCatalog.class.getName(), 
ImmutableMap.of(
+            "type", "hive",
+            "default-namespace", "default"
+        )},
+        new Object[] { "testhadoop", SparkCatalog.class.getName(), 
ImmutableMap.of(
+            "type", "hadoop",
+            "default-namespace", "default"
+        )}
+    };
+  }
+
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+
+  private String baseTableName = "baseTable";
+  private File tableDir;
+  private String tableLocation;
+  private final String type;
+  private final TableCatalog catalog;
+
+  public TestCreateActions(
+      String catalogName,
+      String implementation,
+      Map<String, String> config) {
+    super(catalogName, implementation, config);
+    this.catalog = (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName);
+    this.type = config.get("type");
+  }
+
+  @Before
+  public void before() {
+    try {
+      this.tableDir = temp.newFolder();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+    this.tableLocation = tableDir.toURI().toString();
+
+    spark.conf().set("hive.exec.dynamic.partition", "true");
+    spark.conf().set("hive.exec.dynamic.partition.mode", "nonstrict");
+    spark.sql(String.format("DROP TABLE IF EXISTS %s", baseTableName));
+
+    List<SimpleRecord> expected = Lists.newArrayList(
+        new SimpleRecord(1, "a"),
+        new SimpleRecord(2, "b"),
+        new SimpleRecord(3, "c")
+    );
+
+    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
+
+    df.select("id", "data").orderBy("data").write()
+        .mode("append")
+        .option("path", tableLocation)
+        .saveAsTable(baseTableName);
+  }
+
+  @After
+  public void after() throws IOException {
+    // Drop the hive table.
+    spark.sql(String.format("DROP TABLE IF EXISTS %s", baseTableName));
+  }
+
+  @Test
+  public void testMigratePartitioned() throws Exception {
+    Assume.assumeTrue("Cannot migrate to a hadoop based catalog", 
!type.equals("hadoop"));
+    Assume.assumeTrue("Can only migrate from Spark Session Catalog", 
catalog.name().equals("spark_catalog"));
+    String source = sourceName("test_migrate_partitioned_table");
+    String dest = source;
+    createSourceTable(CREATE_PARTITIONED_PARQUET, source);
+    assertMigratedFileCount(Actions.migrate(source), source, dest);
+  }
+
+  @Test
+  public void testMigrateUnpartitioned() throws Exception {
+    Assume.assumeTrue("Cannot migrate to a hadoop based catalog", 
!type.equals("hadoop"));
+    Assume.assumeTrue("Can only migrate from Spark Session Catalog", 
catalog.name().equals("spark_catalog"));
+    String source = sourceName("test_migrate_unpartitioned_table");
+    String dest = source;
+    createSourceTable(CREATE_PARQUET, source);
+    assertMigratedFileCount(Actions.migrate(source), source, dest);
+  }
+
+  @Test
+  public void testSnapshotPartitioned() throws Exception {
+    Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop 
based catalog",
+        !type.equals("hadoop"));
+    File location = temp.newFolder();
+    String source = sourceName("test_snapshot_partitioned_table");
+    String dest = destName("iceberg_snapshot_partitioned");
+    createSourceTable(CREATE_PARTITIONED_PARQUET, source);
+    assertMigratedFileCount(Actions.snapshot(source, dest, location.toString()), source, dest);
+    assertIsolatedSnapshot(source, dest);
+  }
+
+  @Test
+  public void testSnapshotUnpartitioned() throws Exception {
+    Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop 
based catalog",
+        !type.equals("hadoop"));
+    File location = temp.newFolder();
+    String source = sourceName("test_snapshot_unpartitioned_table");
+    String dest = destName("iceberg_snapshot_unpartitioned");
+    createSourceTable(CREATE_PARQUET, source);
+    assertMigratedFileCount(Actions.snapshot(source, dest, location.toString()), source, dest);
+    assertIsolatedSnapshot(source, dest);
+  }
+
+  @Test
+  public void testSnapshotHiveTable() throws Exception {
+    Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop 
based catalog",
+        !type.equals("hadoop"));
+    File location = temp.newFolder();
+    String source = sourceName("snapshot_hive_table");
+    String dest = destName("iceberg_snapshot_hive_table");
+    createSourceTable(CREATE_HIVE_EXTERNAL_PARQUET, source);
+    assertMigratedFileCount(Actions.snapshot(source, dest, location.toString()), source, dest);
+    assertIsolatedSnapshot(source, dest);
+  }
+
+  @Test
+  public void testMigrateHiveTable() throws Exception {
+    Assume.assumeTrue("Cannot migrate to a hadoop based catalog", 
!type.equals("hadoop"));
+    String source = sourceName("migrate_hive_table");
+    String dest = source;
+    createSourceTable(CREATE_HIVE_EXTERNAL_PARQUET, source);
+    assertMigratedFileCount(Actions.migrate(source), source, dest);
+  }
+
+  @Test
+  public void testSnapshotManagedHiveTable() throws Exception {
+    Assume.assumeTrue("Cannot migrate to a hadoop based catalog", 
!type.equals("hadoop"));
+    File location = temp.newFolder();
+    String source = sourceName("snapshot_managed_hive_table");
+    String dest = destName("iceberg_snapshot_managed_hive_table");
+    createSourceTable(CREATE_HIVE_PARQUET, source);
+    assertMigratedFileCount(Actions.snapshot(source, dest, location.toString()), source, dest);

Review comment:
       Changed the interfaces around so it is no longer part of the static method
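
       For illustration only, a minimal sketch of what the reworked call site might look like under that change, assuming the location moves off the static factory and onto the action as a builder-style setter. The names used here (SnapshotAction, withLocation, execute) are assumptions for the sketch, not taken from this diff:

           // Hedged sketch, not the actual PR code: the static factory no longer
           // takes a location; the location is configured on the action instead.
           // All method names below are illustrative assumptions.
           SnapshotAction action = Actions.snapshot(source, dest)
               .withLocation(location.toString());   // location supplied via the action
           action.execute();                          // runs the snapshot

       In the test above this would replace the three-argument Actions.snapshot(source, dest, location.toString()) call.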




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
