This is an automated email from the ASF dual-hosted git repository.
etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new e4bc593d48 Core: Add DataFileSet / DeleteFileSet (#11195)
e4bc593d48 is described below
commit e4bc593d48df08c66549536266ca960024642295
Author: Eduard Tudenhoefner <[email protected]>
AuthorDate: Tue Oct 1 08:14:34 2024 +0200
Core: Add DataFileSet / DeleteFileSet (#11195)
---
.../java/org/apache/iceberg/util/DataFileSet.java | 113 ++++++++
.../org/apache/iceberg/util/DeleteFileSet.java | 114 ++++++++
.../java/org/apache/iceberg/util/WrapperSet.java | 177 ++++++++++++
.../org/apache/iceberg/util/TestDataFileSet.java | 303 +++++++++++++++++++
.../org/apache/iceberg/util/TestDeleteFileSet.java | 321 +++++++++++++++++++++
5 files changed, 1028 insertions(+)
diff --git a/api/src/main/java/org/apache/iceberg/util/DataFileSet.java b/api/src/main/java/org/apache/iceberg/util/DataFileSet.java
new file mode 100644
index 0000000000..27cbee088a
--- /dev/null
+++ b/api/src/main/java/org/apache/iceberg/util/DataFileSet.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.util;
+
+import java.util.Objects;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+
+public class DataFileSet extends WrapperSet<DataFile> {
+ private static final ThreadLocal<DataFileWrapper> WRAPPERS =
+ ThreadLocal.withInitial(() -> DataFileWrapper.wrap(null));
+
+ private DataFileSet() {
+ // needed for serialization/deserialization
+ }
+
+ private DataFileSet(Iterable<Wrapper<DataFile>> wrappers) {
+ super(wrappers);
+ }
+
+ public static DataFileSet create() {
+ return new DataFileSet();
+ }
+
+ public static DataFileSet of(Iterable<? extends DataFile> iterable) {
+ return new DataFileSet(
+ Iterables.transform(
+ iterable,
+ obj -> {
+ Preconditions.checkNotNull(obj, "Invalid object: null");
+ return DataFileWrapper.wrap(obj);
+ }));
+ }
+
+ @Override
+ protected Wrapper<DataFile> wrapper() {
+ return WRAPPERS.get();
+ }
+
+ @Override
+ protected Wrapper<DataFile> wrap(DataFile dataFile) {
+ return DataFileWrapper.wrap(dataFile);
+ }
+
+ @Override
+ protected Class<DataFile> elementClass() {
+ return DataFile.class;
+ }
+
+ private static class DataFileWrapper implements Wrapper<DataFile> {
+ private DataFile file;
+
+ private DataFileWrapper(DataFile file) {
+ this.file = file;
+ }
+
+ private static DataFileWrapper wrap(DataFile dataFile) {
+ return new DataFileWrapper(dataFile);
+ }
+
+ @Override
+ public DataFile get() {
+ return file;
+ }
+
+ @Override
+ public Wrapper<DataFile> set(DataFile dataFile) {
+ this.file = dataFile;
+ return this;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+
+ if (!(o instanceof DataFileWrapper)) {
+ return false;
+ }
+
+ DataFileWrapper that = (DataFileWrapper) o;
+ return Objects.equals(file.location(), that.file.location());
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(file.location());
+ }
+
+ @Override
+ public String toString() {
+ return file.location();
+ }
+ }
+}
diff --git a/api/src/main/java/org/apache/iceberg/util/DeleteFileSet.java b/api/src/main/java/org/apache/iceberg/util/DeleteFileSet.java
new file mode 100644
index 0000000000..bbe9824963
--- /dev/null
+++ b/api/src/main/java/org/apache/iceberg/util/DeleteFileSet.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.util;
+
+import java.util.Objects;
+import org.apache.iceberg.DeleteFile;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+
+public class DeleteFileSet extends WrapperSet<DeleteFile> {
+ private static final ThreadLocal<DeleteFileWrapper> WRAPPERS =
+ ThreadLocal.withInitial(() -> DeleteFileWrapper.wrap(null));
+
+ private DeleteFileSet() {
+ // needed for serialization/deserialization
+ }
+
+ private DeleteFileSet(Iterable<Wrapper<DeleteFile>> wrappers) {
+ super(wrappers);
+ }
+
+ public static DeleteFileSet create() {
+ return new DeleteFileSet();
+ }
+
+ public static DeleteFileSet of(Iterable<? extends DeleteFile> iterable) {
+ return new DeleteFileSet(
+ Iterables.transform(
+ iterable,
+ obj -> {
+ Preconditions.checkNotNull(obj, "Invalid object: null");
+ return DeleteFileWrapper.wrap(obj);
+ }));
+ }
+
+ @Override
+ protected Wrapper<DeleteFile> wrapper() {
+ return WRAPPERS.get();
+ }
+
+ @Override
+ protected Wrapper<DeleteFile> wrap(DeleteFile deleteFile) {
+ return DeleteFileWrapper.wrap(deleteFile);
+ }
+
+ @Override
+ protected Class<DeleteFile> elementClass() {
+ return DeleteFile.class;
+ }
+
+ private static class DeleteFileWrapper implements Wrapper<DeleteFile> {
+ private DeleteFile file;
+
+ private DeleteFileWrapper(DeleteFile file) {
+ this.file = file;
+ }
+
+ private static DeleteFileWrapper wrap(DeleteFile deleteFile) {
+ return new DeleteFileWrapper(deleteFile);
+ }
+
+ @Override
+ public DeleteFile get() {
+ return file;
+ }
+
+ @Override
+ public Wrapper<DeleteFile> set(DeleteFile deleteFile) {
+ this.file = deleteFile;
+ return this;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+
+ if (!(o instanceof DeleteFileWrapper)) {
+ return false;
+ }
+
+ DeleteFileWrapper that = (DeleteFileWrapper) o;
+ // this needs to be updated once deletion vector support is added
+ return Objects.equals(file.location(), that.file.location());
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(file.location());
+ }
+
+ @Override
+ public String toString() {
+ return file.location();
+ }
+ }
+}
diff --git a/api/src/main/java/org/apache/iceberg/util/WrapperSet.java b/api/src/main/java/org/apache/iceberg/util/WrapperSet.java
new file mode 100644
index 0000000000..e589f435e1
--- /dev/null
+++ b/api/src/main/java/org/apache/iceberg/util/WrapperSet.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.util;
+
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterators;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.relocated.com.google.common.collect.Streams;
+
+/**
+ * A custom set for a {@link Wrapper} of the given type that maintains insertion order and does not
+ * allow null elements.
+ *
+ * @param <T> The type to wrap in a {@link Wrapper} instance.
+ */
+abstract class WrapperSet<T> implements Set<T>, Serializable {
+ private final Set<Wrapper<T>> set = Sets.newLinkedHashSet();
+
+ protected WrapperSet(Iterable<Wrapper<T>> wrappers) {
+ wrappers.forEach(set::add);
+ }
+
+ protected WrapperSet() {}
+
+ protected abstract Wrapper<T> wrapper();
+
+ protected abstract Wrapper<T> wrap(T file);
+
+ protected abstract Class<T> elementClass();
+
+ protected interface Wrapper<T> extends Serializable {
+ T get();
+
+ Wrapper<T> set(T object);
+ }
+
+ @Override
+ public int size() {
+ return set.size();
+ }
+
+ @Override
+ public boolean isEmpty() {
+ return set.isEmpty();
+ }
+
+ @Override
+ public boolean contains(Object obj) {
+ Preconditions.checkNotNull(obj, "Invalid object: null");
+ Wrapper<T> wrapper = wrapper();
+ boolean result = set.contains(wrapper.set(elementClass().cast(obj)));
+ wrapper.set(null); // don't hold a reference to the value
+ return result;
+ }
+
+ @Override
+ public Iterator<T> iterator() {
+ return Iterators.transform(set.iterator(), Wrapper::get);
+ }
+
+ @Override
+ public Object[] toArray() {
+ return Lists.newArrayList(iterator()).toArray();
+ }
+
+ @Override
+ public <X> X[] toArray(X[] destArray) {
+ return Lists.newArrayList(iterator()).toArray(destArray);
+ }
+
+ @Override
+ public boolean add(T obj) {
+ Preconditions.checkNotNull(obj, "Invalid object: null");
+ return set.add(wrap(obj));
+ }
+
+ @Override
+ public boolean remove(Object obj) {
+ Preconditions.checkNotNull(obj, "Invalid object: null");
+ Wrapper<T> wrapper = wrapper();
+ boolean result = set.remove(wrapper.set(elementClass().cast(obj)));
+ wrapper.set(null); // don't hold a reference to the value
+ return result;
+ }
+
+ @Override
+ public boolean containsAll(Collection<?> collection) {
+ Preconditions.checkNotNull(collection, "Invalid collection: null");
+ return Iterables.all(collection, this::contains);
+ }
+
+ @Override
+ public boolean addAll(Collection<? extends T> collection) {
+ Preconditions.checkNotNull(collection, "Invalid collection: null");
+ return collection.stream().filter(this::add).count() != 0;
+ }
+
+ @Override
+ public boolean retainAll(Collection<?> collection) {
+ Preconditions.checkNotNull(collection, "Invalid collection: null");
+ Set<Wrapper<T>> toRetain =
+ collection.stream()
+ .map(obj -> Preconditions.checkNotNull(obj, "Invalid object: null"))
+ .map(elementClass()::cast)
+ .map(this::wrap)
+ .collect(Collectors.toSet());
+
+ return Iterables.retainAll(set, toRetain);
+ }
+
+ @Override
+ public boolean removeAll(Collection<?> collection) {
+ Preconditions.checkNotNull(collection, "Invalid collection: null");
+ return collection.stream().filter(this::remove).count() != 0;
+ }
+
+ @Override
+ public void clear() {
+ set.clear();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ } else if (!(other instanceof Set)) {
+ return false;
+ }
+
+ Set<?> that = (Set<?>) other;
+
+ if (size() != that.size()) {
+ return false;
+ }
+
+ try {
+ return containsAll(that);
+ } catch (ClassCastException | NullPointerException unused) {
+ return false;
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return set.stream().mapToInt(Object::hashCode).sum();
+ }
+
+ @Override
+ public String toString() {
+ return Streams.stream(iterator())
+ .map(Object::toString)
+ .collect(Collectors.joining(", ", "[", "]"));
+ }
+}
diff --git a/core/src/test/java/org/apache/iceberg/util/TestDataFileSet.java b/core/src/test/java/org/apache/iceberg/util/TestDataFileSet.java
new file mode 100644
index 0000000000..0f298ad82e
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/util/TestDataFileSet.java
@@ -0,0 +1,303 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.util;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DataFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.TestHelpers;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Testing {@link DataFileSet} is easier in iceberg-core since the data file builders are located
+ * here
+ */
+public class TestDataFileSet {
+
+ private static final DataFile FILE_A =
+ DataFiles.builder(PartitionSpec.unpartitioned())
+ .withPath("/path/to/data-a.parquet")
+ .withFileSizeInBytes(1)
+ .withRecordCount(1)
+ .build();
+ private static final DataFile FILE_B =
+ DataFiles.builder(PartitionSpec.unpartitioned())
+ .withPath("/path/to/data-b.parquet")
+ .withFileSizeInBytes(2)
+ .withRecordCount(2)
+ .build();
+ private static final DataFile FILE_C =
+ DataFiles.builder(PartitionSpec.unpartitioned())
+ .withPath("/path/to/data-c.parquet")
+ .withFileSizeInBytes(3)
+ .withRecordCount(3)
+ .build();
+ private static final DataFile FILE_D =
+ DataFiles.builder(PartitionSpec.unpartitioned())
+ .withPath("/path/to/data-d.parquet")
+ .withFileSizeInBytes(4)
+ .withRecordCount(4)
+ .build();
+
+ @Test
+ public void emptySet() {
+ assertThat(DataFileSet.create()).isEmpty();
+ assertThat(DataFileSet.create()).doesNotContain(FILE_A, FILE_B, FILE_C);
+ }
+
+ @Test
+ public void insertionOrderIsMaintained() {
+ DataFileSet set = DataFileSet.create();
+ set.addAll(ImmutableList.of(FILE_D, FILE_A, FILE_C));
+ set.add(FILE_B);
+ set.add(FILE_D);
+
+ assertThat(set).hasSize(4).containsExactly(FILE_D, FILE_A, FILE_C, FILE_B);
+ }
+
+ @Test
+ public void clear() {
+ DataFileSet set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+ set.clear();
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void addAll() {
+ DataFileSet empty = DataFileSet.create();
+ assertThatThrownBy(() -> empty.add(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.addAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> empty.addAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.addAll(Arrays.asList(FILE_A, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ DataFileSet set = DataFileSet.create();
+ set.addAll(ImmutableList.of(FILE_B, FILE_A, FILE_C, FILE_A));
+ assertThat(set).hasSize(3).containsExactly(FILE_B, FILE_A, FILE_C);
+ }
+
+ @Test
+ public void contains() {
+ DataFileSet set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+ assertThatThrownBy(() -> set.contains(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThat(set)
+ .hasSize(2)
+ .containsExactly(FILE_A, FILE_B)
+ .doesNotContain(FILE_C)
+ .doesNotContain(FILE_D);
+
+ assertThatThrownBy(() -> DataFileSet.of(Arrays.asList(FILE_C, FILE_B, null, FILE_A)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+ }
+
+ @Test
+ public void containsAll() {
+ DataFileSet set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+ assertThatThrownBy(() -> set.containsAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> set.containsAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> set.containsAll(Arrays.asList(FILE_A, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThat(set.containsAll(ImmutableList.of(FILE_B, FILE_A))).isTrue();
+ assertThat(set.containsAll(ImmutableList.of(FILE_B, FILE_A, FILE_C))).isFalse();
+ assertThat(set.containsAll(ImmutableList.of(FILE_B))).isTrue();
+ }
+
+ @Test
+ public void toArray() {
+ DataFileSet set = DataFileSet.of(ImmutableList.of(FILE_B, FILE_A));
+ assertThat(set.toArray()).hasSize(2).containsExactly(FILE_B, FILE_A);
+
+ DataFile[] array = new DataFile[1];
+ assertThat(set.toArray(array)).hasSize(2).containsExactly(FILE_B, FILE_A);
+
+ array = new DataFile[0];
+ assertThat(set.toArray(array)).hasSize(2).containsExactly(FILE_B, FILE_A);
+
+ array = new DataFile[5];
+ assertThat(set.toArray(array)).hasSize(5).containsExactly(FILE_B, FILE_A, null, null, null);
+
+ array = new DataFile[2];
+ assertThat(set.toArray(array)).hasSize(2).containsExactly(FILE_B, FILE_A);
+ }
+
+ @Test
+ public void retainAll() {
+ DataFileSet empty = DataFileSet.create();
+ assertThatThrownBy(() -> empty.retainAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> empty.retainAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.retainAll(Arrays.asList(FILE_A, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ DataFileSet set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+ assertThat(set.retainAll(ImmutableList.of(FILE_C, FILE_D, FILE_A)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).hasSize(1).containsExactly(FILE_A);
+
+ set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+
+ assertThat(set.retainAll(ImmutableList.of(FILE_B, FILE_A)))
+ .as("Set should not have changed")
+ .isFalse();
+
+ assertThat(set.retainAll(ImmutableList.of(FILE_C, FILE_D)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void remove() {
+ DataFileSet set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+ assertThatThrownBy(() -> set.remove(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ set.remove(FILE_C);
+ assertThat(set).containsExactly(FILE_A, FILE_B);
+ set.remove(FILE_B);
+ assertThat(set).containsExactly(FILE_A);
+ set.remove(FILE_A);
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void removeAll() {
+ DataFileSet empty = DataFileSet.create();
+ assertThatThrownBy(() -> empty.removeAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> empty.removeAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.removeAll(Arrays.asList(FILE_A, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ DataFileSet set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+
+ assertThat(set.removeAll(ImmutableList.of(FILE_C, FILE_D, FILE_A)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).hasSize(1).containsExactly(FILE_B);
+
+ set = DataFileSet.of(ImmutableList.of(FILE_A, FILE_B));
+ assertThat(set.removeAll(ImmutableList.of(FILE_C, FILE_D)))
+ .as("Set should not have changed")
+ .isFalse();
+
+ assertThat(set.removeAll(ImmutableList.of(FILE_B, FILE_A)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void equalsAndHashCode() {
+ DataFileSet set1 = DataFileSet.create();
+ DataFileSet set2 = DataFileSet.create();
+
+ assertThat(set1).isEqualTo(set2);
+ assertThat(set1.hashCode()).isEqualTo(set2.hashCode());
+
+ set1.add(FILE_A);
+ set1.add(FILE_B);
+ set1.add(FILE_C);
+
+ // different DataFile instances but all use the same paths as set1
+ set2.add(
+ DataFiles.builder(PartitionSpec.unpartitioned())
+ .withPath(FILE_A.location())
+ .withFileSizeInBytes(10)
+ .withRecordCount(1)
+ .build());
+ set2.add(
+ DataFiles.builder(PartitionSpec.unpartitioned())
+ .withPath(FILE_B.location())
+ .withFileSizeInBytes(100)
+ .withRecordCount(10)
+ .build());
+ set2.add(
+ DataFiles.builder(PartitionSpec.unpartitioned())
+ .withPath(FILE_C.location())
+ .withFileSizeInBytes(1000)
+ .withRecordCount(100)
+ .build());
+
+ Set<DataFile> set3 = Collections.unmodifiableSet(set2);
+
+ assertThat(set1).isEqualTo(set2).isEqualTo(set3);
+ assertThat(set1.hashCode()).isEqualTo(set2.hashCode()).isEqualTo(set3.hashCode());
+ }
+
+ @Test
+ public void kryoSerialization() throws Exception {
+ DataFileSet dataFiles = DataFileSet.of(ImmutableList.of(FILE_C, FILE_B, FILE_A));
+ assertThat(TestHelpers.KryoHelpers.roundTripSerialize(dataFiles)).isEqualTo(dataFiles);
+ }
+
+ @Test
+ public void javaSerialization() throws Exception {
+ DataFileSet dataFiles = DataFileSet.of(ImmutableList.of(FILE_C, FILE_B, FILE_A));
+ DataFileSet deserialized = TestHelpers.deserialize(TestHelpers.serialize(dataFiles));
+ assertThat(deserialized).isEqualTo(dataFiles);
+ }
+}
diff --git a/core/src/test/java/org/apache/iceberg/util/TestDeleteFileSet.java b/core/src/test/java/org/apache/iceberg/util/TestDeleteFileSet.java
new file mode 100644
index 0000000000..5f4488a3a1
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/util/TestDeleteFileSet.java
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.util;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.iceberg.DeleteFile;
+import org.apache.iceberg.FileMetadata;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.TestHelpers;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Testing {@link DeleteFileSet} is easier in iceberg-core since the delete file builders are
+ * located here
+ */
+public class TestDeleteFileSet {
+
+ private static final DeleteFile FILE_A_DELETES =
+ FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
+ .ofPositionDeletes()
+ .withPath("/path/to/data-a-deletes.parquet")
+ .withFileSizeInBytes(1)
+ .withRecordCount(1)
+ .build();
+ private static final DeleteFile FILE_B_DELETES =
+ FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
+ .ofPositionDeletes()
+ .withPath("/path/to/data-b-deletes.parquet")
+ .withFileSizeInBytes(2)
+ .withRecordCount(2)
+ .build();
+ private static final DeleteFile FILE_C_DELETES =
+ FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
+ .ofPositionDeletes()
+ .withPath("/path/to/data-c-deletes.parquet")
+ .withFileSizeInBytes(3)
+ .withRecordCount(3)
+ .build();
+ private static final DeleteFile FILE_D_DELETES =
+ FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
+ .ofPositionDeletes()
+ .withPath("/path/to/data-d-deletes.parquet")
+ .withFileSizeInBytes(4)
+ .withRecordCount(4)
+ .build();
+
+ @Test
+ public void emptySet() {
+ assertThat(DeleteFileSet.create()).isEmpty();
+ assertThat(DeleteFileSet.create())
+ .doesNotContain(FILE_A_DELETES, FILE_B_DELETES, FILE_C_DELETES);
+ }
+
+ @Test
+ public void insertionOrderIsMaintained() {
+ DeleteFileSet set = DeleteFileSet.create();
+ set.addAll(ImmutableList.of(FILE_D_DELETES, FILE_A_DELETES, FILE_C_DELETES));
+ set.add(FILE_B_DELETES);
+ set.add(FILE_D_DELETES);
+
+ assertThat(set)
+ .hasSize(4)
+ .containsExactly(FILE_D_DELETES, FILE_A_DELETES, FILE_C_DELETES, FILE_B_DELETES);
+ }
+
+ @Test
+ public void clear() {
+ DeleteFileSet set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+ set.clear();
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void addAll() {
+ DeleteFileSet empty = DeleteFileSet.create();
+ assertThatThrownBy(() -> empty.add(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.addAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> empty.addAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.addAll(Arrays.asList(FILE_A_DELETES, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ DeleteFileSet set = DeleteFileSet.create();
+ set.addAll(ImmutableList.of(FILE_B_DELETES, FILE_A_DELETES, FILE_C_DELETES, FILE_A_DELETES));
+ assertThat(set).hasSize(3).containsExactly(FILE_B_DELETES, FILE_A_DELETES, FILE_C_DELETES);
+ }
+
+ @Test
+ public void contains() {
+ DeleteFileSet set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+ assertThatThrownBy(() -> set.contains(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThat(set)
+ .hasSize(2)
+ .containsExactly(FILE_A_DELETES, FILE_B_DELETES)
+ .doesNotContain(FILE_C_DELETES)
+ .doesNotContain(FILE_D_DELETES);
+
+ assertThatThrownBy(
+ () ->
+ DeleteFileSet.of(
+ Arrays.asList(FILE_C_DELETES, FILE_B_DELETES, null, FILE_A_DELETES)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+ }
+
+ @Test
+ public void containsAll() {
+ DeleteFileSet set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+ assertThatThrownBy(() -> set.containsAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> set.containsAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> set.containsAll(Arrays.asList(FILE_A_DELETES, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThat(set.containsAll(ImmutableList.of(FILE_B_DELETES, FILE_A_DELETES))).isTrue();
+ assertThat(set.containsAll(ImmutableList.of(FILE_B_DELETES, FILE_A_DELETES, FILE_C_DELETES)))
+ .isFalse();
+ assertThat(set.containsAll(ImmutableList.of(FILE_B_DELETES))).isTrue();
+ }
+
+ @Test
+ public void toArray() {
+ DeleteFileSet set = DeleteFileSet.of(ImmutableList.of(FILE_B_DELETES, FILE_A_DELETES));
+ assertThat(set.toArray()).hasSize(2).containsExactly(FILE_B_DELETES, FILE_A_DELETES);
+
+ DeleteFile[] array = new DeleteFile[1];
+ assertThat(set.toArray(array)).hasSize(2).containsExactly(FILE_B_DELETES, FILE_A_DELETES);
+
+ array = new DeleteFile[0];
+ assertThat(set.toArray(array)).hasSize(2).containsExactly(FILE_B_DELETES, FILE_A_DELETES);
+
+ array = new DeleteFile[5];
+ assertThat(set.toArray(array))
+ .hasSize(5)
+ .containsExactly(FILE_B_DELETES, FILE_A_DELETES, null, null, null);
+
+ array = new DeleteFile[2];
+ assertThat(set.toArray(array)).hasSize(2).containsExactly(FILE_B_DELETES, FILE_A_DELETES);
+ }
+
+ @Test
+ public void retainAll() {
+ DeleteFileSet empty = DeleteFileSet.create();
+ assertThatThrownBy(() -> empty.retainAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> empty.retainAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.retainAll(Arrays.asList(FILE_A_DELETES, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ DeleteFileSet set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+ assertThat(set.retainAll(ImmutableList.of(FILE_C_DELETES, FILE_D_DELETES, FILE_A_DELETES)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).hasSize(1).containsExactly(FILE_A_DELETES);
+
+ set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+
+ assertThat(set.retainAll(ImmutableList.of(FILE_B_DELETES, FILE_A_DELETES)))
+ .as("Set should not have changed")
+ .isFalse();
+
+ assertThat(set.retainAll(ImmutableList.of(FILE_C_DELETES, FILE_D_DELETES)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void remove() {
+ DeleteFileSet set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+ assertThatThrownBy(() -> set.remove(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ set.remove(FILE_C_DELETES);
+ assertThat(set).containsExactly(FILE_A_DELETES, FILE_B_DELETES);
+ assertThat(set).containsExactly(FILE_A_DELETES, FILE_B_DELETES);
+ set.remove(FILE_B_DELETES);
+ assertThat(set).containsExactly(FILE_A_DELETES);
+ set.remove(FILE_A_DELETES);
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void removeAll() {
+ DeleteFileSet empty = DeleteFileSet.create();
+ assertThatThrownBy(() -> empty.removeAll(null))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid collection: null");
+
+ assertThatThrownBy(() -> empty.removeAll(Collections.singletonList(null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ assertThatThrownBy(() -> empty.removeAll(Arrays.asList(FILE_A_DELETES, null)))
+ .isInstanceOf(NullPointerException.class)
+ .hasMessage("Invalid object: null");
+
+ DeleteFileSet set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+ assertThat(set.removeAll(ImmutableList.of(FILE_C_DELETES, FILE_D_DELETES, FILE_A_DELETES)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).hasSize(1).containsExactly(FILE_B_DELETES);
+
+ set = DeleteFileSet.of(ImmutableList.of(FILE_A_DELETES, FILE_B_DELETES));
+ assertThat(set.removeAll(ImmutableList.of(FILE_C_DELETES, FILE_D_DELETES)))
+ .as("Set should not have changed")
+ .isFalse();
+
+ assertThat(set.removeAll(ImmutableList.of(FILE_B_DELETES, FILE_A_DELETES)))
+ .as("Set should have changed")
+ .isTrue();
+
+ assertThat(set).isEmpty();
+ }
+
+ @Test
+ public void equalsAndHashCode() {
+ DeleteFileSet set1 = DeleteFileSet.create();
+ DeleteFileSet set2 = DeleteFileSet.create();
+
+ assertThat(set1).isEqualTo(set2);
+ assertThat(set1.hashCode()).isEqualTo(set2.hashCode());
+
+ set1.add(FILE_A_DELETES);
+ set1.add(FILE_B_DELETES);
+ set1.add(FILE_C_DELETES);
+
+ // different DeleteFile instances but all use the same paths as set1
+ set2.add(
+ FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
+ .ofPositionDeletes()
+ .withPath(FILE_A_DELETES.location())
+ .withFileSizeInBytes(10)
+ .withRecordCount(1)
+ .build());
+ set2.add(
+ FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
+ .ofPositionDeletes()
+ .withPath(FILE_B_DELETES.location())
+ .withFileSizeInBytes(100)
+ .withRecordCount(10)
+ .build());
+ set2.add(
+ FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned())
+ .ofPositionDeletes()
+ .withPath(FILE_C_DELETES.location())
+ .withFileSizeInBytes(1000)
+ .withRecordCount(100)
+ .build());
+
+ Set<DeleteFile> set3 = Collections.unmodifiableSet(set2);
+
+ assertThat(set1).isEqualTo(set2).isEqualTo(set3);
+ assertThat(set1.hashCode()).isEqualTo(set2.hashCode()).isEqualTo(set3.hashCode());
+ }
+
+ @Test
+ public void kryoSerialization() throws Exception {
+ DeleteFileSet deleteFiles =
+ DeleteFileSet.of(ImmutableList.of(FILE_C_DELETES, FILE_B_DELETES, FILE_A_DELETES));
+ assertThat(TestHelpers.KryoHelpers.roundTripSerialize(deleteFiles)).isEqualTo(deleteFiles);
+ }
+
+ @Test
+ public void javaSerialization() throws Exception {
+ DeleteFileSet deleteFiles =
+ DeleteFileSet.of(ImmutableList.of(FILE_C_DELETES, FILE_B_DELETES, FILE_A_DELETES));
+ DeleteFileSet deserialize = TestHelpers.deserialize(TestHelpers.serialize(deleteFiles));
+ assertThat(deserialize).isEqualTo(deleteFiles);
+ }
+}