This is an automated email from the ASF dual-hosted git repository.

etudenhoefner pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 16b6cefbb5 Nessie: Permit Iceberg GC via explicit property overrides 
(#8382)
16b6cefbb5 is described below

commit 16b6cefbb5c5992dccee086fae8912fe5b3fbd00
Author: Dmitri Bourlatchkov <[email protected]>
AuthorDate: Tue Sep 19 10:08:20 2023 -0400

    Nessie: Permit Iceberg GC via explicit property overrides (#8382)
---
 .../org/apache/iceberg/nessie/NessieCatalog.java   | 20 ++++++--
 .../iceberg/nessie/NessieTableOperations.java      |  2 +
 .../java/org/apache/iceberg/nessie/NessieUtil.java | 52 +++++++++++++--------
 .../org/apache/iceberg/nessie/BaseTestIceberg.java |  7 +++
 .../org/apache/iceberg/nessie/TestNessieTable.java | 54 ++++++++++++++++------
 5 files changed, 100 insertions(+), 35 deletions(-)

diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java 
b/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java
index ad5af4c3b4..b02df3d759 100644
--- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java
+++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java
@@ -28,6 +28,7 @@ import org.apache.iceberg.BaseMetastoreCatalog;
 import org.apache.iceberg.CatalogProperties;
 import org.apache.iceberg.CatalogUtil;
 import org.apache.iceberg.TableOperations;
+import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.catalog.Namespace;
 import org.apache.iceberg.catalog.SupportsNamespaces;
 import org.apache.iceberg.catalog.TableIdentifier;
@@ -58,12 +59,22 @@ public class NessieCatalog extends BaseMetastoreCatalog
   private static final Logger LOG = 
LoggerFactory.getLogger(NessieCatalog.class);
   private static final Joiner SLASH = Joiner.on("/");
   private static final String NAMESPACE_LOCATION_PROPS = "location";
+
+  private static final Map<String, String> DEFAULT_CATALOG_OPTIONS =
+      ImmutableMap.<String, String>builder()
+          .put(CatalogProperties.TABLE_DEFAULT_PREFIX + 
TableProperties.GC_ENABLED, "false")
+          .put(
+              CatalogProperties.TABLE_DEFAULT_PREFIX
+                  + TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
+              "false") // just in case 
METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT changes
+          .build();
+
   private NessieIcebergClient client;
   private String warehouseLocation;
   private Object config;
   private String name;
   private FileIO fileIO;
-  private Map<String, String> catalogOptions;
+  private Map<String, String> catalogOptions = DEFAULT_CATALOG_OPTIONS;
   private CloseableGroup closeableGroup;
 
   public NessieCatalog() {}
@@ -128,7 +139,10 @@ public class NessieCatalog extends BaseMetastoreCatalog
     this.client = Preconditions.checkNotNull(client, "client must be 
non-null");
     this.fileIO = Preconditions.checkNotNull(fileIO, "fileIO must be 
non-null");
     this.catalogOptions =
-        Preconditions.checkNotNull(catalogOptions, "catalogOptions must be 
non-null");
+        ImmutableMap.<String, String>builder()
+            .putAll(DEFAULT_CATALOG_OPTIONS)
+            .putAll(Preconditions.checkNotNull(catalogOptions, "catalogOptions 
must be non-null"))
+            .buildKeepingLast();
     this.warehouseLocation = validateWarehouseLocation(name, catalogOptions);
     this.closeableGroup = new CloseableGroup();
     closeableGroup.addCloseable(client);
@@ -346,6 +360,6 @@ public class NessieCatalog extends BaseMetastoreCatalog
 
   @Override
   protected Map<String, String> properties() {
-    return catalogOptions == null ? ImmutableMap.of() : catalogOptions;
+    return catalogOptions;
   }
 }
diff --git 
a/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java 
b/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java
index 4a55c73ec4..d999e10e91 100644
--- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java
+++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java
@@ -55,6 +55,8 @@ public class NessieTableOperations extends 
BaseMetastoreTableOperations {
    */
   public static final String NESSIE_COMMIT_ID_PROPERTY = "nessie.commit.id";
 
+  public static final String NESSIE_GC_NO_WARNING_PROPERTY = 
"nessie.gc.no-warning";
+
   private final NessieIcebergClient client;
   private final ContentKey key;
   private IcebergTable table;
diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java 
b/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java
index 4a7a73e408..8c69f844db 100644
--- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java
+++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java
@@ -102,6 +102,38 @@ public final class NessieUtil {
         .orElseGet(() -> System.getProperty("user.name"));
   }
 
+  private static void checkAndUpdateGCProperties(
+      TableMetadata tableMetadata, Map<String, String> updatedProperties, 
String identifier) {
+    if (tableMetadata.propertyAsBoolean(
+        NessieTableOperations.NESSIE_GC_NO_WARNING_PROPERTY, false)) {
+      return;
+    }
+
+    // To prevent accidental deletion of files that are still referenced by 
other branches/tags,
+    // setting GC_ENABLED to 'false' is recommended, so that all Iceberg's gc 
operations like
+    // expire_snapshots, remove_orphan_files, drop_table with purge will fail 
with an error.
+    // `nessie-gc` CLI provides a reference-aware GC functionality for the 
expired/unreferenced
+    // files.
+    // Advanced users may still want to use the simpler Iceberg GC tools iff 
their Nessie Server
+    // contains only one branch (in which case the full Nessie history will be 
reflected in the
+    // Iceberg sequence of snapshots).
+    if (tableMetadata.propertyAsBoolean(
+            TableProperties.GC_ENABLED, TableProperties.GC_ENABLED_DEFAULT)
+        || tableMetadata.propertyAsBoolean(
+            TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
+            TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT)) {
+      
updatedProperties.put(NessieTableOperations.NESSIE_GC_NO_WARNING_PROPERTY, 
"true");
+      LOG.warn(
+          "The Iceberg property '{}' and/or '{}' is enabled on table '{}' in 
NessieCatalog."
+              + " This will likely make data in other Nessie branches and tags 
and in earlier, historical Nessie"
+              + " commits inaccessible. The recommended setting for those 
properties is 'false'. Use the 'nessie-gc'"
+              + " tool for Nessie reference-aware garbage collection.",
+          TableProperties.GC_ENABLED,
+          TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
+          identifier);
+    }
+  }
+
   public static TableMetadata updateTableMetadataWithNessieSpecificProperties(
       TableMetadata tableMetadata,
       String metadataLocation,
@@ -111,24 +143,8 @@ public final class NessieUtil {
     // Update the TableMetadata with the Content of NessieTableState.
     Map<String, String> newProperties = 
Maps.newHashMap(tableMetadata.properties());
     newProperties.put(NessieTableOperations.NESSIE_COMMIT_ID_PROPERTY, 
reference.getHash());
-    // To prevent accidental deletion of files that are still referenced by 
other branches/tags,
-    // setting GC_ENABLED to false. So that all Iceberg's gc operations like 
expire_snapshots,
-    // remove_orphan_files, drop_table with purge will fail with an error.
-    // Nessie CLI will provide a reference aware GC functionality for the 
expired/unreferenced
-    // files.
-    newProperties.put(TableProperties.GC_ENABLED, "false");
-
-    boolean metadataCleanupEnabled =
-        newProperties
-            
.getOrDefault(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, "false")
-            .equalsIgnoreCase("true");
-    if (metadataCleanupEnabled) {
-      newProperties.put(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, 
"false");
-      LOG.warn(
-          "Automatic table metadata files cleanup was requested, but disabled 
because "
-              + "the Nessie catalog can use historical metadata files from 
other references. "
-              + "Use the 'nessie-gc' tool for history-aware GC");
-    }
+
+    checkAndUpdateGCProperties(tableMetadata, newProperties, identifier);
 
     TableMetadata.Builder builder =
         TableMetadata.buildFrom(tableMetadata)
diff --git 
a/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java 
b/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java
index f4ebfdc4be..69dae7c21f 100644
--- a/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java
+++ b/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java
@@ -23,7 +23,9 @@ import static 
org.apache.iceberg.types.Types.NestedField.required;
 import java.io.IOException;
 import java.net.URI;
 import java.nio.file.Path;
+import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import org.apache.avro.generic.GenericData.Record;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.iceberg.BaseTable;
@@ -136,10 +138,15 @@ public abstract class BaseTestIceberg {
   }
 
   NessieCatalog initCatalog(String ref, String hash) {
+    return initCatalog(ref, hash, Collections.emptyMap());
+  }
+
+  NessieCatalog initCatalog(String ref, String hash, Map<String, String> 
extraOptions) {
     NessieCatalog newCatalog = new NessieCatalog();
     newCatalog.setConf(hadoopConfig);
     ImmutableMap.Builder<String, String> options =
         ImmutableMap.<String, String>builder()
+            .putAll(extraOptions)
             .put("ref", ref)
             .put(CatalogProperties.URI, uri)
             .put("auth-type", "NONE")
diff --git 
a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java 
b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java
index f9ec85053e..028f178b2d 100644
--- a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java
+++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java
@@ -38,6 +38,7 @@ import java.util.stream.Stream;
 import org.apache.avro.generic.GenericData;
 import org.apache.avro.generic.GenericRecordBuilder;
 import org.apache.iceberg.BaseTable;
+import org.apache.iceberg.CatalogProperties;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.HasTableOperations;
 import org.apache.iceberg.ManifestFile;
@@ -53,6 +54,7 @@ import org.apache.iceberg.exceptions.AlreadyExistsException;
 import org.apache.iceberg.exceptions.CommitFailedException;
 import org.apache.iceberg.exceptions.NotFoundException;
 import org.apache.iceberg.exceptions.ValidationException;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.types.Types;
 import org.assertj.core.api.Assertions;
@@ -573,12 +575,11 @@ public class TestNessieTable extends BaseTestIceberg {
   }
 
   @Test
-  public void testGCEnabled() {
+  public void testGCDisabled() {
     Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
 
-    
Assertions.assertThat(icebergTable.properties().get(TableProperties.GC_ENABLED))
-        .isNotNull()
-        .isEqualTo("false");
+    Assertions.assertThat(icebergTable.properties())
+        .containsEntry(TableProperties.GC_ENABLED, "false");
 
     Assertions.assertThatThrownBy(
             () ->
@@ -589,18 +590,43 @@ public class TestNessieTable extends BaseTestIceberg {
   }
 
   @Test
-  public void testTableMetadataFilesCleanupDisable() throws 
NessieNotFoundException {
+  public void testGCEnabled() {
     Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
+    icebergTable.updateProperties().set(TableProperties.GC_ENABLED, 
"true").commit();
+    Assertions.assertThat(icebergTable.properties())
+        .containsEntry(TableProperties.GC_ENABLED, "true");
 
-    // Forceful setting of property also should get override with false
-    icebergTable
-        .updateProperties()
-        .set(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, "true")
-        .commit();
-    Assertions.assertThat(
-            
icebergTable.properties().get(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED))
-        .isNotNull()
-        .isEqualTo("false");
+    Assertions.assertThatCode(
+            () ->
+                
icebergTable.expireSnapshots().expireOlderThan(System.currentTimeMillis()).commit())
+        .doesNotThrowAnyException();
+  }
+
+  @Test
+  public void testGCEnabledViaTableDefaultCatalogProperty() {
+    catalog.dropTable(TABLE_IDENTIFIER, false); // pre-created in @BeforeEach
+
+    catalog =
+        initCatalog(
+            branch,
+            null,
+            ImmutableMap.<String, String>builder()
+                .put(CatalogProperties.TABLE_DEFAULT_PREFIX + 
TableProperties.GC_ENABLED, "true")
+                .build());
+
+    // Create the table again using updated config defaults.
+    tableLocation = createTable(TABLE_IDENTIFIER, 
schema).location().replaceFirst("file:", "");
+    Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
+
+    Assertions.assertThatCode(
+            () ->
+                
icebergTable.expireSnapshots().expireOlderThan(System.currentTimeMillis()).commit())
+        .doesNotThrowAnyException();
+  }
+
+  @Test
+  public void testTableMetadataFilesCleanupDisable() throws 
NessieNotFoundException {
+    Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
 
     icebergTable
         .updateProperties()

Reply via email to