This is an automated email from the ASF dual-hosted git repository.
etudenhoefner pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 16b6cefbb5 Nessie: Permit Iceberg GC via explicit property overrides
(#8382)
16b6cefbb5 is described below
commit 16b6cefbb5c5992dccee086fae8912fe5b3fbd00
Author: Dmitri Bourlatchkov <[email protected]>
AuthorDate: Tue Sep 19 10:08:20 2023 -0400
Nessie: Permit Iceberg GC via explicit property overrides (#8382)
---
.../org/apache/iceberg/nessie/NessieCatalog.java | 20 ++++++--
.../iceberg/nessie/NessieTableOperations.java | 2 +
.../java/org/apache/iceberg/nessie/NessieUtil.java | 52 +++++++++++++--------
.../org/apache/iceberg/nessie/BaseTestIceberg.java | 7 +++
.../org/apache/iceberg/nessie/TestNessieTable.java | 54 ++++++++++++++++------
5 files changed, 100 insertions(+), 35 deletions(-)
diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java
b/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java
index ad5af4c3b4..b02df3d759 100644
--- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java
+++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java
@@ -28,6 +28,7 @@ import org.apache.iceberg.BaseMetastoreCatalog;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.TableOperations;
+import org.apache.iceberg.TableProperties;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.SupportsNamespaces;
import org.apache.iceberg.catalog.TableIdentifier;
@@ -58,12 +59,22 @@ public class NessieCatalog extends BaseMetastoreCatalog
private static final Logger LOG =
LoggerFactory.getLogger(NessieCatalog.class);
private static final Joiner SLASH = Joiner.on("/");
private static final String NAMESPACE_LOCATION_PROPS = "location";
+
+ private static final Map<String, String> DEFAULT_CATALOG_OPTIONS =
+ ImmutableMap.<String, String>builder()
+ .put(CatalogProperties.TABLE_DEFAULT_PREFIX +
TableProperties.GC_ENABLED, "false")
+ .put(
+ CatalogProperties.TABLE_DEFAULT_PREFIX
+ + TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
+ "false") // just in case
METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT changes
+ .build();
+
private NessieIcebergClient client;
private String warehouseLocation;
private Object config;
private String name;
private FileIO fileIO;
- private Map<String, String> catalogOptions;
+ private Map<String, String> catalogOptions = DEFAULT_CATALOG_OPTIONS;
private CloseableGroup closeableGroup;
public NessieCatalog() {}
@@ -128,7 +139,10 @@ public class NessieCatalog extends BaseMetastoreCatalog
this.client = Preconditions.checkNotNull(client, "client must be
non-null");
this.fileIO = Preconditions.checkNotNull(fileIO, "fileIO must be
non-null");
this.catalogOptions =
- Preconditions.checkNotNull(catalogOptions, "catalogOptions must be
non-null");
+ ImmutableMap.<String, String>builder()
+ .putAll(DEFAULT_CATALOG_OPTIONS)
+ .putAll(Preconditions.checkNotNull(catalogOptions, "catalogOptions
must be non-null"))
+ .buildKeepingLast();
this.warehouseLocation = validateWarehouseLocation(name, catalogOptions);
this.closeableGroup = new CloseableGroup();
closeableGroup.addCloseable(client);
@@ -346,6 +360,6 @@ public class NessieCatalog extends BaseMetastoreCatalog
@Override
protected Map<String, String> properties() {
- return catalogOptions == null ? ImmutableMap.of() : catalogOptions;
+ return catalogOptions;
}
}
diff --git
a/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java
b/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java
index 4a55c73ec4..d999e10e91 100644
--- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java
+++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java
@@ -55,6 +55,8 @@ public class NessieTableOperations extends
BaseMetastoreTableOperations {
*/
public static final String NESSIE_COMMIT_ID_PROPERTY = "nessie.commit.id";
+ public static final String NESSIE_GC_NO_WARNING_PROPERTY =
"nessie.gc.no-warning";
+
private final NessieIcebergClient client;
private final ContentKey key;
private IcebergTable table;
diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java
b/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java
index 4a7a73e408..8c69f844db 100644
--- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java
+++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java
@@ -102,6 +102,38 @@ public final class NessieUtil {
.orElseGet(() -> System.getProperty("user.name"));
}
+ private static void checkAndUpdateGCProperties(
+ TableMetadata tableMetadata, Map<String, String> updatedProperties,
String identifier) {
+ if (tableMetadata.propertyAsBoolean(
+ NessieTableOperations.NESSIE_GC_NO_WARNING_PROPERTY, false)) {
+ return;
+ }
+
+ // To prevent accidental deletion of files that are still referenced by
other branches/tags,
+ // setting GC_ENABLED to 'false' is recommended, so that all Iceberg's gc
operations like
+ // expire_snapshots, remove_orphan_files, drop_table with purge will fail
with an error.
+ // `nessie-gc` CLI provides a reference-aware GC functionality for the
expired/unreferenced
+ // files.
+ // Advanced users may still want to use the simpler Iceberg GC tools iff
their Nessie Server
+ // contains only one branch (in which case the full Nessie history will be
reflected in the
+ // Iceberg sequence of snapshots).
+ if (tableMetadata.propertyAsBoolean(
+ TableProperties.GC_ENABLED, TableProperties.GC_ENABLED_DEFAULT)
+ || tableMetadata.propertyAsBoolean(
+ TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
+ TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT)) {
+
updatedProperties.put(NessieTableOperations.NESSIE_GC_NO_WARNING_PROPERTY,
"true");
+ LOG.warn(
+ "The Iceberg property '{}' and/or '{}' is enabled on table '{}' in
NessieCatalog."
+ + " This will likely make data in other Nessie branches and tags
and in earlier, historical Nessie"
+ + " commits inaccessible. The recommended setting for those
properties is 'false'. Use the 'nessie-gc'"
+ + " tool for Nessie reference-aware garbage collection.",
+ TableProperties.GC_ENABLED,
+ TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
+ identifier);
+ }
+ }
+
public static TableMetadata updateTableMetadataWithNessieSpecificProperties(
TableMetadata tableMetadata,
String metadataLocation,
@@ -111,24 +143,8 @@ public final class NessieUtil {
// Update the TableMetadata with the Content of NessieTableState.
Map<String, String> newProperties =
Maps.newHashMap(tableMetadata.properties());
newProperties.put(NessieTableOperations.NESSIE_COMMIT_ID_PROPERTY,
reference.getHash());
- // To prevent accidental deletion of files that are still referenced by
other branches/tags,
- // setting GC_ENABLED to false. So that all Iceberg's gc operations like
expire_snapshots,
- // remove_orphan_files, drop_table with purge will fail with an error.
- // Nessie CLI will provide a reference aware GC functionality for the
expired/unreferenced
- // files.
- newProperties.put(TableProperties.GC_ENABLED, "false");
-
- boolean metadataCleanupEnabled =
- newProperties
-
.getOrDefault(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, "false")
- .equalsIgnoreCase("true");
- if (metadataCleanupEnabled) {
- newProperties.put(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED,
"false");
- LOG.warn(
- "Automatic table metadata files cleanup was requested, but disabled
because "
- + "the Nessie catalog can use historical metadata files from
other references. "
- + "Use the 'nessie-gc' tool for history-aware GC");
- }
+
+ checkAndUpdateGCProperties(tableMetadata, newProperties, identifier);
TableMetadata.Builder builder =
TableMetadata.buildFrom(tableMetadata)
diff --git
a/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java
b/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java
index f4ebfdc4be..69dae7c21f 100644
--- a/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java
+++ b/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java
@@ -23,7 +23,9 @@ import static
org.apache.iceberg.types.Types.NestedField.required;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
+import java.util.Collections;
import java.util.List;
+import java.util.Map;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.BaseTable;
@@ -136,10 +138,15 @@ public abstract class BaseTestIceberg {
}
NessieCatalog initCatalog(String ref, String hash) {
+ return initCatalog(ref, hash, Collections.emptyMap());
+ }
+
+ NessieCatalog initCatalog(String ref, String hash, Map<String, String>
extraOptions) {
NessieCatalog newCatalog = new NessieCatalog();
newCatalog.setConf(hadoopConfig);
ImmutableMap.Builder<String, String> options =
ImmutableMap.<String, String>builder()
+ .putAll(extraOptions)
.put("ref", ref)
.put(CatalogProperties.URI, uri)
.put("auth-type", "NONE")
diff --git
a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java
b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java
index f9ec85053e..028f178b2d 100644
--- a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java
+++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java
@@ -38,6 +38,7 @@ import java.util.stream.Stream;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.iceberg.BaseTable;
+import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.HasTableOperations;
import org.apache.iceberg.ManifestFile;
@@ -53,6 +54,7 @@ import org.apache.iceberg.exceptions.AlreadyExistsException;
import org.apache.iceberg.exceptions.CommitFailedException;
import org.apache.iceberg.exceptions.NotFoundException;
import org.apache.iceberg.exceptions.ValidationException;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Types;
import org.assertj.core.api.Assertions;
@@ -573,12 +575,11 @@ public class TestNessieTable extends BaseTestIceberg {
}
@Test
- public void testGCEnabled() {
+ public void testGCDisabled() {
Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
-
Assertions.assertThat(icebergTable.properties().get(TableProperties.GC_ENABLED))
- .isNotNull()
- .isEqualTo("false");
+ Assertions.assertThat(icebergTable.properties())
+ .containsEntry(TableProperties.GC_ENABLED, "false");
Assertions.assertThatThrownBy(
() ->
@@ -589,18 +590,43 @@ public class TestNessieTable extends BaseTestIceberg {
}
@Test
- public void testTableMetadataFilesCleanupDisable() throws
NessieNotFoundException {
+ public void testGCEnabled() {
Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
+ icebergTable.updateProperties().set(TableProperties.GC_ENABLED,
"true").commit();
+ Assertions.assertThat(icebergTable.properties())
+ .containsEntry(TableProperties.GC_ENABLED, "true");
- // Forceful setting of property also should get override with false
- icebergTable
- .updateProperties()
- .set(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, "true")
- .commit();
- Assertions.assertThat(
-
icebergTable.properties().get(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED))
- .isNotNull()
- .isEqualTo("false");
+ Assertions.assertThatCode(
+ () ->
+
icebergTable.expireSnapshots().expireOlderThan(System.currentTimeMillis()).commit())
+ .doesNotThrowAnyException();
+ }
+
+ @Test
+ public void testGCEnabledViaTableDefaultCatalogProperty() {
+ catalog.dropTable(TABLE_IDENTIFIER, false); // pre-created in @BeforeEach
+
+ catalog =
+ initCatalog(
+ branch,
+ null,
+ ImmutableMap.<String, String>builder()
+ .put(CatalogProperties.TABLE_DEFAULT_PREFIX +
TableProperties.GC_ENABLED, "true")
+ .build());
+
+ // Create the table again using updated config defaults.
+ tableLocation = createTable(TABLE_IDENTIFIER,
schema).location().replaceFirst("file:", "");
+ Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
+
+ Assertions.assertThatCode(
+ () ->
+
icebergTable.expireSnapshots().expireOlderThan(System.currentTimeMillis()).commit())
+ .doesNotThrowAnyException();
+ }
+
+ @Test
+ public void testTableMetadataFilesCleanupDisable() throws
NessieNotFoundException {
+ Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
icebergTable
.updateProperties()