szehon-ho commented on a change in pull request #3056: URL: https://github.com/apache/iceberg/pull/3056#discussion_r813231019
########## File path: spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java ########## @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.spark.sql; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.spark.SparkCatalogTestBase; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestDropTable extends SparkCatalogTestBase { + + public TestDropTable(String catalogName, String implementation, Map<String, String> config) { + super(catalogName, implementation, config); + } + + @Before + public void createTable() { + sql("CREATE TABLE %s (id INT, name STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES (1, 
'test')", tableName); + } + + @After + public void removeTable() throws IOException { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + @Test + public void testDropTable() throws IOException { + dropTableInternal(); + } + + @Test + public void testDropTableGCDisabled() throws IOException { + sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); + dropTableInternal(); + } + + private void dropTableInternal() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); + + sql("DROP TABLE %s", tableName); + Assert.assertFalse("Table should not exist", validationCatalog.tableExists(tableIdent)); + + if (catalogName.equals("testhadoop")) { + // HadoopCatalog drop table without purge will delete the base table location. 
+ Assert.assertTrue("All files should be deleted", checkFilesExist(manifestAndFiles, false)); + } else { + Assert.assertTrue("All files should not be deleted", checkFilesExist(manifestAndFiles, true)); + } + } + + @Test + public void testPurgeTable() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); Review comment: Nit: "should be existed" => "should exist" ########## File path: spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java ########## @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.spark.sql; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.spark.SparkCatalogTestBase; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestDropTable extends SparkCatalogTestBase { + + public TestDropTable(String catalogName, String implementation, Map<String, String> config) { + super(catalogName, implementation, config); + } + + @Before + public void createTable() { + sql("CREATE TABLE %s (id INT, name STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES (1, 'test')", tableName); + } + + @After + public void removeTable() throws IOException { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + @Test + public void testDropTable() throws IOException { + dropTableInternal(); + } + + @Test + public void testDropTableGCDisabled() throws IOException { + sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); + dropTableInternal(); + } + + private void dropTableInternal() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); Review comment: Nit, "There totally should have" can be "there should be" ########## File path: spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java ########## @@ -0,0 +1,140 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.spark.sql; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.spark.SparkCatalogTestBase; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestDropTable extends SparkCatalogTestBase { + + public TestDropTable(String catalogName, String implementation, Map<String, String> config) { + super(catalogName, implementation, config); + } + + @Before + public void createTable() { + sql("CREATE TABLE %s (id INT, name STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES (1, 'test')", tableName); + } + + @After + public void removeTable() throws IOException { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + 
@Test + public void testDropTable() throws IOException { + dropTableInternal(); + } + + @Test + public void testDropTableGCDisabled() throws IOException { + sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); + dropTableInternal(); + } + + private void dropTableInternal() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); + + sql("DROP TABLE %s", tableName); + Assert.assertFalse("Table should not exist", validationCatalog.tableExists(tableIdent)); + + if (catalogName.equals("testhadoop")) { + // HadoopCatalog drop table without purge will delete the base table location. + Assert.assertTrue("All files should be deleted", checkFilesExist(manifestAndFiles, false)); + } else { + Assert.assertTrue("All files should not be deleted", checkFilesExist(manifestAndFiles, true)); + } + } + + @Test + public void testPurgeTable() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); + + sql("DROP TABLE %s PURGE", tableName); + Assert.assertFalse("Table should not exist", validationCatalog.tableExists(tableIdent)); + Assert.assertTrue("All files should be deleted", checkFilesExist(manifestAndFiles, false)); + } + + @Test + public void testPurgeTableGCDisabled() throws IOException { + sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); + + 
assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); Review comment: same as above ########## File path: spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java ########## @@ -244,15 +253,51 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No @Override public boolean dropTable(Identifier ident) { + return dropTableWithoutPurging(ident); + } + + @Override + public boolean purgeTable(Identifier ident) { try { - return isPathIdentifier(ident) ? - tables.dropTable(((PathIdentifier) ident).location()) : - icebergCatalog.dropTable(buildIdentifier(ident)); + Table table = load(ident).first(); + ValidationException.check( + PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), + "Cannot purge table: GC is disabled (deleting files may corrupt other tables)"); + String metadataFileLocation = ((HasTableOperations) table).operations().current().metadataFileLocation(); + + boolean dropped = dropTableWithoutPurging(ident); + + if (dropped) { + try { + // We should check whether the metadata file is existed. Because the HadoopCatalog/HadoopTables will drop the Review comment: Nit "is existed" => "exists" ########## File path: spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java ########## @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.spark.sql; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.spark.SparkCatalogTestBase; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestDropTable extends SparkCatalogTestBase { + + public TestDropTable(String catalogName, String implementation, Map<String, String> config) { + super(catalogName, implementation, config); + } + + @Before + public void createTable() { + sql("CREATE TABLE %s (id INT, name STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES (1, 'test')", tableName); + } + + @After + public void removeTable() throws IOException { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + @Test + public void testDropTable() throws IOException { + dropTableInternal(); + } + + @Test + public void testDropTableGCDisabled() throws IOException { + sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", 
tableName); + dropTableInternal(); + } + + private void dropTableInternal() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); + + sql("DROP TABLE %s", tableName); + Assert.assertFalse("Table should not exist", validationCatalog.tableExists(tableIdent)); + + if (catalogName.equals("testhadoop")) { + // HadoopCatalog drop table without purge will delete the base table location. + Assert.assertTrue("All files should be deleted", checkFilesExist(manifestAndFiles, false)); + } else { + Assert.assertTrue("All files should not be deleted", checkFilesExist(manifestAndFiles, true)); + } + } + + @Test + public void testPurgeTable() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); + + sql("DROP TABLE %s PURGE", tableName); + Assert.assertFalse("Table should not exist", validationCatalog.tableExists(tableIdent)); + Assert.assertTrue("All files should be deleted", checkFilesExist(manifestAndFiles, false)); + } + + @Test + public void testPurgeTableGCDisabled() throws IOException { + sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); + + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 
files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); + + AssertHelpers.assertThrows("Purge table is not allowed when GC is disabled", ValidationException.class, + "Cannot purge table: GC is disabled (deleting files may corrupt other tables", + () -> sql("DROP TABLE %s PURGE", tableName)); + + Assert.assertTrue("Table should not been dropped", validationCatalog.tableExists(tableIdent)); + Assert.assertTrue("All files should not be deleted", checkFilesExist(manifestAndFiles, true)); + } + + private List<String> getManifestsAndFiles() { Review comment: nit: in Iceberg code style, can remove get from method name: https://iceberg.apache.org/contribute/. (in "Method Naming". (I have received a few reviews on this myself :)) ########## File path: spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java ########## @@ -244,15 +253,51 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No @Override public boolean dropTable(Identifier ident) { + return dropTableWithoutPurging(ident); + } + + @Override + public boolean purgeTable(Identifier ident) { try { - return isPathIdentifier(ident) ? - tables.dropTable(((PathIdentifier) ident).location()) : - icebergCatalog.dropTable(buildIdentifier(ident)); + Table table = load(ident).first(); + ValidationException.check( + PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), + "Cannot purge table: GC is disabled (deleting files may corrupt other tables)"); + String metadataFileLocation = ((HasTableOperations) table).operations().current().metadataFileLocation(); + + boolean dropped = dropTableWithoutPurging(ident); + + if (dropped) { + try { + // We should check whether the metadata file is existed. Because the HadoopCatalog/HadoopTables will drop the + // warehouse directly and ignore the `purge` argument. 
+ table.io().newInputFile(metadataFileLocation).newStream().close(); Review comment: Can't we use InputFile.exists() instead of newStream and check for exception? ########## File path: spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java ########## @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.spark.sql; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.spark.SparkCatalogTestBase; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestDropTable extends SparkCatalogTestBase { + + public TestDropTable(String catalogName, String implementation, Map<String, String> config) { + super(catalogName, implementation, config); + } + + @Before + public void createTable() { + sql("CREATE TABLE %s (id INT, name STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES (1, 'test')", tableName); + } + + @After + public void removeTable() throws IOException { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + @Test + public void testDropTable() throws IOException { + dropTableInternal(); + } + + @Test + public void testDropTableGCDisabled() throws IOException { + sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); + dropTableInternal(); + } + + private void dropTableInternal() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); + + sql("DROP TABLE %s", tableName); + Assert.assertFalse("Table should not exist", 
validationCatalog.tableExists(tableIdent)); + + if (catalogName.equals("testhadoop")) { + // HadoopCatalog drop table without purge will delete the base table location. + Assert.assertTrue("All files should be deleted", checkFilesExist(manifestAndFiles, false)); + } else { + Assert.assertTrue("All files should not be deleted", checkFilesExist(manifestAndFiles, true)); + } + } + + @Test + public void testPurgeTable() throws IOException { + assertEquals("Should have expected rows", + ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + + List<String> manifestAndFiles = getManifestsAndFiles(); + Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); Review comment: same as above ########## File path: spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java ########## @@ -244,15 +253,51 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No @Override public boolean dropTable(Identifier ident) { + return dropTableWithoutPurging(ident); + } + + @Override + public boolean purgeTable(Identifier ident) { try { - return isPathIdentifier(ident) ? - tables.dropTable(((PathIdentifier) ident).location()) : - icebergCatalog.dropTable(buildIdentifier(ident)); + Table table = load(ident).first(); + ValidationException.check( + PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), + "Cannot purge table: GC is disabled (deleting files may corrupt other tables)"); + String metadataFileLocation = ((HasTableOperations) table).operations().current().metadataFileLocation(); + + boolean dropped = dropTableWithoutPurging(ident); + + if (dropped) { + try { + // We should check whether the metadata file is existed. Because the HadoopCatalog/HadoopTables will drop the + // warehouse directly and ignore the `purge` argument. 
+ table.io().newInputFile(metadataFileLocation).newStream().close(); + } catch (NotFoundException e) { + return true; + } catch (IOException e) { + throw new UncheckedIOException(e); + } + + SparkActions.get() + .deleteReachableFiles(metadataFileLocation) + .io(table.io()) + .execute(); + } + + return dropped; } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { return false; } } + private boolean dropTableWithoutPurging(Identifier ident) { + if (isPathIdentifier(ident)) { + return tables.dropTable(((PathIdentifier) ident).location(), false); Review comment: Nit: there also seem to be guidelines about adding a comment when passing booleans, in https://iceberg.apache.org/contribute/ under "Boolean Arguments" — about this very case, it seems! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
