Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760338624
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeSpec.java:
##
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.OptionalDouble;
+import java.util.function.Consumer;
+import org.apache.polaris.immutables.PolarisImmutable;
+import org.immutables.value.Value;
+
+@SuppressWarnings("unused")
+@PolarisImmutable
+public interface PurgeSpec {
+ PurgeSpec DEFAULT_INSTANCE = PurgeSpec.builder().build();
+
+ @Value.Default
+ default FileFilter fileFilter() {
+return FileFilter.alwaysTrue();
+ }
+
+ PurgeSpec withFileFilter(FileFilter fileFilter);
Review Comment:
Indeed, this method does not appear to have any immediate callers... @snazy
: is there a follow-up PR planned that will call it?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760270191
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList<Stream<FileSpec>>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null)
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760267510
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList<Stream<FileSpec>>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null)
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760249846
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface prefer to yield incomplete results and
continue, rather than throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter
filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
+ * deduplicate all identified files.
+ * @return a stream of {@link FileSpec file specs}. The {@link
FileSpec#createdAtMillis()}
+ * attribute is usually not populated, as it would have to be derived
from user-provided
+ * information in metadata or snapshot. The {@link FileSpec#fileType()}
attribute is populated
+ * based on where a file appears during identification.
+ */
+ Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate);
+
+ /**
+ * Identifies all files referenced by the given view-metadata.
+ *
+ * In case "container" files like the metadata are not readable, the
returned stream will just
+ * not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param viewMetadataLocation Iceberg view-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
+ * deduplicate all identified files.
+ * @return a stream of {@link FileSpec file specs}. The {@link
FileSpec#createdAtMillis()}
+ * attribute is usually not populated, as it would have been derived
from user-provided
+ * information in metadata or snapshot. The {@link FileSpec#fileType()}
attribute is populated
+ * based on where a file appears during identification.
+ */
+ Stream<FileSpec> identifyIcebergViewFiles(
+ @Nonnull String viewMetadataLocation, boolean deduplicate);
+
+ /**
+ * Purges all files that are referenced by the given table-metadata,
respecting the given filter.
+ *
+ * In case "container" files, like the metadata, manifes
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760227624
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileType.java:
##
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import org.apache.iceberg.ContentFile;
+
+public enum FileType {
+ UNKNOWN(false, false),
+ ICEBERG_METADATA(true, false),
+ ICEBERG_STATISTICS(false, false),
+ ICEBERG_MANIFEST_LIST(false, false),
+ ICEBERG_MANIFEST_FILE(false, false),
+ ICEBERG_DATA_FILE(false, true),
+ ICEBERG_DELETE_FILE(false, true),
+ ;
+
+ private final boolean metadata;
+ private final boolean dataOrDelete;
+
+ FileType(boolean metadata, boolean dataOrDelete) {
+this.metadata = metadata;
+this.dataOrDelete = dataOrDelete;
+ }
+
+ @SuppressWarnings("UnnecessaryDefault")
+ public static FileType fromContentFile(ContentFile<?> contentFile) {
+return switch (contentFile.content()) {
+ case DATA -> ICEBERG_DATA_FILE;
+ case EQUALITY_DELETES, POSITION_DELETES -> ICEBERG_DELETE_FILE;
+ default -> UNKNOWN;
+};
+ }
+
+ public boolean metadata() {
+return metadata;
+ }
+
+ public boolean dataOrDelete() {
Review Comment:
The latest code has `fromContentFile`. WDYT?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760222421
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeSpec.java:
##
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.OptionalDouble;
+import java.util.function.Consumer;
+import org.apache.polaris.immutables.PolarisImmutable;
+import org.immutables.value.Value;
+
+@SuppressWarnings("unused")
+@PolarisImmutable
+public interface PurgeSpec {
+ PurgeSpec DEFAULT_INSTANCE = PurgeSpec.builder().build();
+
+ @Value.Default
+ default FileFilter fileFilter() {
+return FileFilter.alwaysTrue();
+ }
+
+ PurgeSpec withFileFilter(FileFilter fileFilter);
+
+ /**
+ * Delete batch size for purge/batch-deletion operations. Implementations
may opt to ignore this
+ * parameter and enforce a reasonable or required different limit.
+ */
+ @Value.Default
+ default int deleteBatchSize() {
+return 250;
Review Comment:
The latest code defines `DEFAULT_BATCH_SIZE` for this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760210472
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/StorageUri.java:
##
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.polaris.storage.files.impl;
+
+import com.google.common.base.Preconditions;
+import jakarta.annotation.Nonnull;
+import jakarta.annotation.Nullable;
+import java.net.URI;
+import java.util.Comparator;
+import java.util.Locale;
+import java.util.Objects;
+
+/**
+ * A utility class for working with file storage locations to be used instead
of {@link URI} since
+ * many actual object store locations do not always match the URI syntax.
+ *
+ * This type is private to the {@code polaris-storage-files-impl}
module.
+ */
+final class StorageUri implements Comparable<StorageUri> {
+ public static final String SCHEME_FILE = "file";
Review Comment:
The latest code does not have this constant.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760203117
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileSpec.java:
##
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import com.google.errorprone.annotations.CanIgnoreReturnValue;
+import java.util.Optional;
+import java.util.OptionalLong;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+/**
+ * Describes a single file/object in an object storage.
+ *
+ * Not all attributes are populated by every {@link FileOperations}
function.
+ */
+@PolarisImmutable
+public interface FileSpec {
+
+ /** The full object storage URI. */
+ String location();
+
+ /**
+ * The type of the file, if known, as a convenience hint.
+ *
+ * The type might have been guessed from the file name, in which case the
returned value is not
+ * guaranteed to be accurate.
+ *
+ * @see #guessTypeFromName()
+ */
+ Optional<FileType> fileType();
+
+ /** The size of the file in bytes, if available. */
+ OptionalLong size();
+
+ /** The creation timestamp in milliseconds since the epoch, if available. */
+ OptionalLong createdAtMillis();
+
+ static Builder builder() {
+return ImmutableFileSpec.builder();
+ }
+
+ static Builder fromLocation(String location) {
+return builder().location(location);
+ }
+
+ static Builder fromLocationAndSize(String location, long size) {
+var b = fromLocation(location);
+if (size > 0L) {
+ b.size(size);
+}
+return b;
+ }
+
+ /** Guesses the given file's type from its name, not guaranteed to be
accurate. */
+ default FileType guessTypeFromName() {
Review Comment:
AFAIK this method has been moved to test code.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760196469
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
+
+ /**
+ * The number of purged files.
+ *
+ * The returned value may be wrong and include non-existing files.
+ */
+ long purgedFiles();
+
+ /**
+ * Number of files that were not purged.
+ *
+ * The returned value may be wrong and not include non-existing files.
Review Comment:
This javadoc has been updated. WDYT?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760194821
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
+
+ /**
+ * The number of purged files.
+ *
+ * The returned value may be wrong and include non-existing files.
Review Comment:
This javadoc has been updated. WDYT?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760188673
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
Review Comment:
The latest code has `optionalTableMetadata`. I hope that's clearer
:slightly_smiling_face:
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760175061
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList<Stream<FileSpec>>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null)
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2760130031
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList<Stream<FileSpec>>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null)
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2758458052
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
Review Comment:
I do not get the "doesn't seem to be used anywhere" argument. How should
_any newly_ added code be used anywhere? Not sure why that should be a problem
for this PR but not for other PRs?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2757300055
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
Review Comment:
Thanks for adding the doc. However, it doesn't seem to be used anywhere. Can
we remove it? We could add it back when it is required.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2757290120 ## storage/files/impl/src/main/resources/META-INF/beans.xml: ## @@ -0,0 +1,24 @@ + + +https://jakarta.ee/xml/ns/jakartaee"; + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"; + xsi:schemaLocation="https://jakarta.ee/xml/ns/jakartaee https://jakarta.ee/xml/ns/jakartaee/beans_4_0.xsd";> + Review Comment: We could always add it back once it is needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2741673229
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter
filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
Review Comment:
Deduplication is (sadly) needed.
When iterating over a snapshot's manifests, we do not have any information
whether a specific manifest file has been "seen" before, or whether it has been
"taken over" from another snapshot.
This means we would be processing the same manifest files multiple times.
The same is true for data/delete files references from the manifest files.
Note that newer snapshots reference previous manifest files, as returned by
`Snapshot.allManifests(FileIO)`. Since there is no better way than that
function, we have to either bite that bullet and re-read the same
manifest-files multiple times or have a somewhat working deduplication
mechanism, as implemented. The implemented deduplicator only considers the most
recent 10,000 manifest file paths now.
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileType.java:
##
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import org.apache.iceberg.Content
Re: [PR] Object storage operations [polaris]
flyrain commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2726371138
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/StorageUri.java:
##
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.polaris.storage.files.impl;
+
+import com.google.common.base.Preconditions;
+import jakarta.annotation.Nonnull;
+import jakarta.annotation.Nullable;
+import java.net.URI;
+import java.util.Comparator;
+import java.util.Locale;
+import java.util.Objects;
+
+/**
+ * A utility class for working with file storage locations to be used instead
of {@link URI} since
+ * many actual object store locations do not always match the URI syntax.
+ *
+ * This type is private to the {@code polaris-storage-files-impl}
module.
+ */
+final class StorageUri implements Comparable<StorageUri> {
+ public static final String SCHEME_FILE = "file";
Review Comment:
Can we remove as it is not used?
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileType.java:
##
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import org.apache.iceberg.ContentFile;
+
+public enum FileType {
+ UNKNOWN(false, false),
+ ICEBERG_METADATA(true, false),
+ ICEBERG_STATISTICS(false, false),
+ ICEBERG_MANIFEST_LIST(false, false),
+ ICEBERG_MANIFEST_FILE(false, false),
+ ICEBERG_DATA_FILE(false, true),
+ ICEBERG_DELETE_FILE(false, true),
+ ;
+
+ private final boolean metadata;
+ private final boolean dataOrDelete;
Review Comment:
Looks like these two variables are never used. Can we remove them?
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileSpec.java:
##
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import com.google.errorprone.annotations.CanIgnoreReturnValue;
+import java.util.Optional;
+import java.util.OptionalLong;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+/**
+ * Describes a single file/object in an object storage.
+ *
+ * Not all attributes are populated by every {@link FileOperations}
function.
+ */
+@PolarisImmutable
+public interface FileSpec {
+
+ /** The full object storage URI. */
+ String location();
+
+ /**
+ * The type of the file, if known, as a convenience hint.
+ *
+ * The type might have been guessed from the file name, in which case the
returned value is not
+ * guaranteed to be accurate.
+ *
+ * @see #guessTypeFromName()
+ */
+ Optional<FileType> fileType();
+
+ /** The size of th
Re: [PR] Object storage operations [polaris]
jbonofre commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3791276733 In regards of the last comments, and regarding #3256, I think we are pretty close to be able to move forward: 1. The Nessie dependency is not a blocker imho. Personally, I would prefer (as already said) to not have this dependency (and create required test classes), but that's my preference. I'm fine with Nessie test dep for now, and maybe an issue to track that it could be removed later. 2. There's a valid question about dedupe that should be clearly answer. 3. Also a question about file deletion rate limiter So, I think we should update #3256 first with: 1. Remove the Nessie dependency (e.g. #3513). 2. It would be great to provide feedback on the pending comments. I think we should be fine if we have these comments resolved. I think we can address that by having "specific topic" PRs. I like #3513 PR because it goes in the right direction to me. So, with #3513, we can update #3256 (assuming the pending comments will be resolved) and do a new reviews round. If I agree about questions regarding scope and dependencies, I don't think "scale" is a blocker now. We can start with something not optimal and work together (as a next step) to improve it. It's pretty presumptuous to think something is 200% fine at first shoot. Remember the JDBC support, it took some iterations to fix some performance issues. And that's the beauty of the community. Nothing can't be perfect at first run (I'm very humble in this regard). So, I would say that if #3256 works, we can address "scaling" questions in a follow up. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3788429475 @pingtimeout Thanks for laying out the proposal, I appreciate the effort to move things forward. I respectfully disagree with merging #3256 as is. While the contribution has merit, there are still unresolved concerns around scope, dependency choices, and validation at scale. These are not just process questions, they materially affect maintainability and whether this is something we would accept from any contributor. I agree that we should have a broader dev list discussion about review practices, but in the meantime I think the safer path is to split the work into smaller, clearly scoped PRs and address the remaining concerns directly before merging. That keeps the bar consistent and avoids relying on follow ups for core design issues. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3785932202 As I mentioned before, it's just my opinion on how communities should operate, if you are committed to removing the Nessie dependencies then you already have my +1 on this PR. I don't even have any positive feelings on keeping the current implementation, I would rather we just replace it entirely. For me it's just procedural issues in getting code into the project. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
pingtimeout commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3785516639 @RussellSpitzer We should certainly have a dev ML discussion about this. As you mentioned, there have been many PR in which we agreed to merge as-is and keep iterating/improving in follow-ups. So if we want to change that community practice, we should discuss it with the community. @flyrain - It sounds like you want to ensure we're making the right technical decision here, and I agree with you that it is critical we commit to the project's success. I think it's important we reflect on how our process aligns with Apache values, especially considering we are discussing graduation. The Apache Way emphasizes that contributions should be evaluated on their technical merit and demonstrated value, and that the decision making process should be open, transparent, and collaborative. I want to propose a path forward that honors both the Apache Way and respects everyone's contributions: 1. #3256 moves forward based on its demonstrated merit - it's tested, approved by diverse reviewers, and solves documented problems 2. #3415 can be evaluated independently on its own merits once it has comparable testing and validation. Both approaches can coexist. 3. We open another PR to add a feature flag that allows users to use the storage operations code for purge. The feature flag defaults to the current implementation (== no behaviour change in Polaris). This approach demonstrates the kind of open, collaborative, merit-based decision making that will serve us well as we work toward graduation. And it also enables the community to move forward. Am I missing anything that would make this path forward unreasonable? I would like to commit this by lazy consensus if no-one objects within the next two days. Let's use this PR reviews (Approval/Request for changes) to collect responses. cc @RusselSpitzer @dimas-b @adam-christian-software. 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3775726356 Echo this comment, https://github.com/apache/polaris/pull/3415#discussion_r2704149700, I think it's a good idea to test the code at scale, thanks @pingtimeout for bringing it up. Is there any benchmark for this PR? Can I reuse it or use it as a reference, so that we can compare apples to apples? If not, I'd suggest working together on a benchmark, so that it will benefit both PRs and any future improvement. WDYT? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3774110829 > @RussellSpitzer I understand where you are coming from. I just want to note that the size of this PR has already increased by 500 LoC since it was opened because the dependency to a Nessie library was considered as a blocker. And I think it would be unusual to first block a PR because it depends on an external library, then request the code to be pulled into Polaris, and then block it again because it became too big. > That's unfortunately how it goes sometimes with large changes to the code base or the addition of new dependencies. I probably should have specified that the Nessie code should come in a different PR (or PRs) as a prerequisite that this one is based on, so sorry if that wasn't clear. **(Also I'm not blocking, just trying to encourage what I think are healthier community behaviors)** > I think I have seen multiple PRs in the project where contributions, however imperfect, are welcome, with the request that things are quickly fixed in follow-up PRs. I am sure I can do some digging to find examples. Isn't that a normal thing in OSS projects? This is my personal opinion but I am strongly against "fix in a followup" and I agree it's happened many times in this project. I'm saying I don't like that. Follow ups should be - planned (this PR is an independent part of a larger whole), - issues discovered while reviewing that weren't previously known (another bug is discovered for example) - refactors (I'm fixing a bug and it would be great if the class structure was different but not necessary for fixing the bug). Ideally PRs are small, self contained, complete and have addressed all concerns. The biggest problem with "fix in a followup" is that it makes the community look like we don't have an equal playing field for all contributors. We should always be asking is this something we would accept from an unknown contributor? 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
pingtimeout commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3773910088 @RussellSpitzer I understand where you are coming from. I just want to note that the size of this PR has already increased by 500 LoC since it was opened because the dependency to a Nessie library was considered as a blocker. And I think it would be unusual to first block a PR because it depends on an external library, then request the code to be pulled into Polaris, and then block it again because it became too big. I think I have seen multiple PRs in the project where contributions, however imperfect, are welcome, with the request that things are quickly fixed in follow-up PRs. I am sure I can do some digging to find examples. Isn't that a normal thing in OSS projects? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3773649070 > @flyrain @RussellSpitzer I am working on the removal of the `object-storage-mock` and other test dependencies. I should be able to propose something today or tomorrow. > > Would it be possible to do that in a follow-up PR? Rationale: this one has already been increased by +1k LoC with the removal of Nessie code in the production scope. And I think the testing scope would be trivial to review in isolation. This is actually a good argument that the Nessie code should be in a separate PR which we would merge first :) > > In other words: do you consider this dependency to Nessie libs for testing purposes as a blocker? Honestly if we go by normal OSS project patterns, breaking this PR itself into multiple sub-PRs is probably more appropriate. I've seen in past PRs in the Polaris project we are tending to merge large blocks of code into the project then patch things up post facto. This isn't really a good pattern since it necessarily biases towards contributors who have enough insider connections to be "trusted" to follow up. I think it would be best in this case to get these testing deps in via separate PRs (including the Iceberg table construction code) prior to merging this code. I haven't been as active in the project lately due to new children though, so I don't want to disrupt what's going on but my gut would say this PR is trying to bite off too much in one go. A good question is: would we accept this PR from a new contributor to the project? Say a contributor to Gravitino wrote this same PR but had used Gravitino as a test dependency. So I'm a -0, but mostly because I want to convince everyone else to follow more normal review procedures if possible. I'm just one community member though. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
pingtimeout commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3773395789 @flyrain @RussellSpitzer I am working on the removal of the `object-storage-mock` and other test dependencies. I should be able to propose something today or tomorrow. Would it be possible to do that in a follow-up PR? Rationale: this one has already been increased by +1k LoC with the removal of Nessie code in the production scope. And I think the testing scope would be trivial to review in isolation. In other words: do you consider this dependency to Nessie libs for testing purposes as a blocker? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2699876924
##
storage/files/impl/build.gradle.kts:
##
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+plugins {
+ id("polaris-server")
+ id("org.kordamp.gradle.jandex")
+}
+
+dependencies {
+ implementation(project(":polaris-storage-files-api"))
+
+ implementation(platform(libs.iceberg.bom))
+ implementation("org.apache.iceberg:iceberg-api")
+ implementation("org.apache.iceberg:iceberg-core")
+
+ implementation(platform(libs.jackson.bom))
+ implementation("com.fasterxml.jackson.core:jackson-annotations")
+ implementation("com.fasterxml.jackson.core:jackson-core")
+ implementation("com.fasterxml.jackson.core:jackson-databind")
+ implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-smile")
+ runtimeOnly("com.fasterxml.jackson.datatype:jackson-datatype-guava")
+ runtimeOnly("com.fasterxml.jackson.datatype:jackson-datatype-jdk8")
+ runtimeOnly("com.fasterxml.jackson.datatype:jackson-datatype-jsr310")
+
+ implementation(libs.guava)
+ implementation(libs.slf4j.api)
+
+ implementation(libs.jakarta.inject.api)
+ implementation(libs.jakarta.validation.api)
+ implementation(libs.jakarta.enterprise.cdi.api)
+
+ runtimeOnly("org.apache.iceberg:iceberg-aws")
+ runtimeOnly(platform(libs.awssdk.bom))
+ runtimeOnly("software.amazon.awssdk:sts")
+ runtimeOnly("software.amazon.awssdk:iam-policy-builder")
+ runtimeOnly("software.amazon.awssdk:s3")
+ runtimeOnly("software.amazon.awssdk:kms")
+
+ runtimeOnly("org.apache.iceberg:iceberg-azure")
+ runtimeOnly(platform(libs.azuresdk.bom))
+ runtimeOnly("com.azure:azure-storage-blob")
+ runtimeOnly("com.azure:azure-storage-common")
+ runtimeOnly("com.azure:azure-identity")
+ runtimeOnly("com.azure:azure-storage-file-datalake")
+
+ runtimeOnly("org.apache.iceberg:iceberg-gcp")
+ runtimeOnly(platform(libs.google.cloud.storage.bom))
+ runtimeOnly("com.google.cloud:google-cloud-storage")
+
+ testFixturesApi(project(":polaris-storage-files-api"))
+
+ testFixturesApi(platform(libs.nessie.bom))
+ testImplementation("org.projectnessie.nessie:nessie-object-storage-mock")
+
+ testFixturesApi("com.fasterxml.jackson.core:jackson-core")
+ testFixturesApi("com.fasterxml.jackson.core:jackson-databind")
+ testFixturesApi(platform(libs.jackson.bom))
+ testRuntimeOnly("org.junit.platform:junit-platform-launcher")
+
+ testFixturesApi(platform(libs.iceberg.bom))
+ testFixturesApi("org.apache.iceberg:iceberg-api")
+ testFixturesApi("org.apache.iceberg:iceberg-core")
+ testFixturesApi("org.apache.iceberg:iceberg-aws")
+ testFixturesApi("org.apache.iceberg:iceberg-azure")
+ testFixturesApi("org.apache.iceberg:iceberg-gcp")
+
+ testFixturesRuntimeOnly("software.amazon.awssdk:url-connection-client")
+
+ compileOnly(libs.jakarta.annotation.api)
+}
+
+tasks.named("javadoc") { dependsOn("jandex") }
+
+tasks.withType {
+ isFailOnError = false
+ options.memberLevel = JavadocMemberLevel.PACKAGE
+}
+
+testing {
+ suites {
+val intTest by
+ registering(JvmTestSuite::class) {
+dependencies {
+ implementation(project(":polaris-storage-files-api"))
+
+
implementation("org.projectnessie.nessie:nessie-azurite-testcontainer")
Review Comment:
I really am not a fan of depending on test libraries from another project,
especially one that our production code isn't dependent on. Do we have other
options here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2699874549
##
gradle/libs.versions.toml:
##
@@ -83,6 +83,7 @@ microprofile-fault-tolerance-api = { module =
"org.eclipse.microprofile.fault-to
mockito-core = { module = "org.mockito:mockito-core", version = "5.21.0" }
mockito-junit-jupiter = { module = "org.mockito:mockito-junit-jupiter",
version = "5.21.0" }
mongodb-driver-sync = { module = "org.mongodb:mongodb-driver-sync", version =
"5.6.2" }
+nessie-bom = { module = "org.projectnessie.nessie:nessie-bom", version =
"0.106.1" }
Review Comment:
Ideally we don't have a dependency on Nessie
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2699871779 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: Looks like we still have some test dependencies on the Nessie Test jars? -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
pingtimeout commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3753593620 Thanks @flyrain for taking the time to offer an alternative. Unfortunately, #3415 does not solve the underlying issue. So considering that fact, the multiple approvals on this PR, and that the changes requested with the -1 have been implemented, I think the only thing to do is to move forward with this contribution. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3752448023 Thanks for the update. For those who have not been following the dev mailing list, I recommend reviewing the discussion there. A good entry point is https://lists.apache.org/thread/dzzdg5l3xoqgysdmp7mh3y8mp59w31tl, though you are welcome to read the thread from the beginning for full context. This PR was originally filed to address the issue described in https://github.com/apache/polaris/issues/2365. After further investigation, it appears that this was a false alarm. Pierre later pointed out that there may be other factors contributing to memory footprint pressure in certain use cases, as described here https://github.com/apache/polaris/issues/2365#issuecomment-3723184230. Those concerns can be addressed independently via a small change proposed in https://github.com/apache/polaris/pull/3415, please take a look to see if that solves the problem. It'd be really nice to run it with the same setup that was used to validate that the current PR fixed the issue. If #3415 solved the issue, can we use it until we settle on a permanent solution (delegation service)? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
dimas-b commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2691198442 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: @flyrain : WDYT? Issue resolved? 
## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import 
org.projectnessie.catalog.formats.iceberg.manifest.Ice
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2691090040 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: Great! This looks to be handled. @snazy - We can resolve this, right? -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2677225429 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: I want to be explicit that this is a blocker for me. Having Polaris depend on Nessie for Iceberg metadata helpers creates ongoing coupling risk. 
Every time Iceberg evolves, Polaris potentially has to wait for Nessie to catch up, even when the changes are additive rather than spec breaking. That is not a great position for a project that aims to stay closely aligned with upstream Iceberg behavior. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
pingtimeout commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3729278095 > There is always the opportunity to improve things later on 💯 We should absolutely keep improving the codebase if we find better alternatives, nothing is set in stone. > I do not think that there will be any new insights in the next 4/5 days [...] My main goal here is to try and find a reasonable timeframe to allow Yufei/Prashant to do some more investigations while also not blocking this contribution. And considering this PR has been open for almost a month now, waiting for just a couple more days could be acceptable. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3729157891 There is always the opportunity to improve things later on. I do not think that there will be any new insights in the next 4/5 days, simply because there were no other proposals [since August 15](https://lists.apache.org/thread/9pgvhr9btfgzofbm6qhyfyqnk62hzp4m) or strong technical concerns. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
pingtimeout commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3729044007 @flyrain I guess we could pause for a couple of days before merging this PR. Let's wait until Wednesday 14th and, unless the investigations find anything meaningful, we can merge this PR. Any strong objection? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on PR #3256: URL: https://github.com/apache/polaris/pull/3256#issuecomment-3725228243 Thank you very much for the work you have put into this PR, we really appreciate the effort. The three of us, Adam, Prashant, and I, had a chance to sync offline and discuss it briefly. Would it be possible to pause for a short while so we can do the following: 1. Double check whether the underlying issue we are trying to address is indeed valid. 2. Explore a few additional options, for example whether we could reuse existing Iceberg native libraries instead of reimplementing similar functionality. Thanks again for your patience and for all the work here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2673402956 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: I'd have to agree we probably shouldn't use Nessie as a dependency here, if we want to re-use the fixture we should probably just copy the code into this project. 
I'm also on board with @adam-christian-software 's implementation above. Long term, it probably makes sense for the Iceberg project to provide something along these lines that downstream projects can use. Another minor note is that this only uses V2 Iceberg tables; we probably should be double-checking against V3 as well. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673392665
##
storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java:
##
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.ByteArrayOutputStream;
+import java.util.Map;
+import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.IcebergBridge;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.SchemaParser;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.types.Types;
+import org.projectnessie.catalog.formats.iceberg.IcebergSpec;
+import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent;
+import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile;
+import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat;
+import
org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent;
+import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry;
+import
org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus;
+import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile;
+import
org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter;
+import
org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec;
+import
org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter;
+import
org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec;
+import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec;
+import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema;
+
+public class IcebergFixtures {
+ public final Schema schema;
+ public final IcebergSchema nessieIcebergSchema;
+ public final PartitionSpec spec;
+ public final TableMetadata tableMetadata;
+ public final String tableMetadataString;
+ public final byte[] tableMetadataBytes;
+
+ public final String prefix;
+ public final int numSnapshots;
+ public final int numManifestFiles;
+ public final int numDataFiles;
+
+ public IcebergFixtures(String prefix, int numSnapshots, int
numManifestFiles, int numDataFiles) {
+this.prefix = prefix;
+this.numSnapshots = numSnapshots;
+this.numManifestFiles = numManifestFiles;
+this.numDataFiles = numDataFiles;
+
+schema = new Schema(1, Types.NestedField.required(1, "foo",
Types.StringType.get()));
+try {
+ nessieIcebergSchema =
+ new ObjectMapper().readValue(SchemaParser.toJson(schema),
IcebergSchema.class);
+} catch (JsonProcessingException e) {
+ throw new RuntimeException(e);
+}
+spec = PartitionSpec.unpartitioned();
+
+var tableMetadataBuilder =
+TableMetadata.buildFrom(
+TableMetadata.newTableMetadata(schema, spec, prefix,
Map.of()).withUUID());
+for (var snapshotId = 1; snapshotId <= numSnapshots; snapshotId++) {
+ var manifestList = manifestListPath(snapshotId);
+ var snapshot =
+ IcebergBridge.mockSnapshot(
+ snapshotId + 1,
+ snapshotId + 1,
+ snapshotId > 0 ? (long) snapshotId : null,
+ System.currentTimeMillis(),
+ "APPEND",
+ Map.of(),
+ schema.schemaId(),
+ manifestList,
+ (long) numManifestFiles * numManifestFiles);
+ tableMetadataBuilder.addSnapshot(snapshot);
+}
+tableMetadata = tableMetadataBuilder.build();
+
+tableMetadataString = TableMetadataParser.toJson(tableMetadata);
+tableMetadataBytes = tableMetadataString.getBytes(UTF_8);
+ }
+
+ public String manifestListPath(int snapshotId) {
+return format("%s%05d/snap-%d.avro", prefix, snapshotId, snapshotId);
+ }
+
+ public by
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673361858
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,472 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = meta
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673357722
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,472 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = meta
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673348691
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,472 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = meta
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673285544
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,472 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream findFiles(@Nonnull String prefix, @Nonnull
FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var fileSources = new ArrayList>();
+
+fileSources.add(Stream.of(metadataFileSpec));
+
+var statisticsFiles = meta
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673230920
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
+
+ /**
+ * The number of purged files.
+ *
+ * The returned value may be wrong and include non-existing files.
Review Comment:
This is probably fine but the current implementations in Iceberg assume the
delete function should throw a runtime exception if the file does not exist.
That's at least the logic implemented in the Iceberg code which uses these
methods.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673207028
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeSpec.java:
##
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.OptionalDouble;
+import java.util.function.Consumer;
+import org.apache.polaris.immutables.PolarisImmutable;
+import org.immutables.value.Value;
+
+@SuppressWarnings("unused")
+@PolarisImmutable
+public interface PurgeSpec {
+ PurgeSpec DEFAULT_INSTANCE = PurgeSpec.builder().build();
+
+ @Value.Default
+ default FileFilter fileFilter() {
+return FileFilter.alwaysTrue();
+ }
+
+ PurgeSpec withFileFilter(FileFilter fileFilter);
+
+ /**
+ * Delete batch size for purge/batch-deletion operations. Implementations
may opt to ignore this
+ * parameter and enforce a reasonable or required different limit.
+ */
+ @Value.Default
+ default int deleteBatchSize() {
+return 250;
+ }
+
+ PurgeSpec withDeleteBatchSize(int deleteBatchSize);
+
+ /**
+ * Callback being invoked right before a file location is being submitted to
be purged.
+ *
+ * Due to API constraints of {@link
+ * org.apache.iceberg.io.SupportsBulkOperations#deleteFiles(Iterable)} it's
barely possible to
+ * identify files that failed a deletion.
+ */
+ @Value.Default
+ default Consumer purgeIssuedCallback() {
+return location -> {};
+ }
+
+ PurgeSpec withPurgeIssuedCallback(Consumer purgeIssuedCallback);
+
+ /**
+ * Optional rate-limit on the number of individual file-deletions per second.
Review Comment:
number of individual files deleted per second?
file-deletions just seems like the spec also is allowing for individual file
deletions outside of bulk deletions
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673162094
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileType.java:
##
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import org.apache.iceberg.ContentFile;
+
+public enum FileType {
+ UNKNOWN(false, false),
+ ICEBERG_METADATA(true, false),
+ ICEBERG_STATISTICS(false, false),
Review Comment:
May be better to just say Puffin? (I assume that's what this is covering)
because these could be Statistics or could be Delete Files and I guess
theoretically both? Or is it for the parquet statistics files for partitions?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673191205
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream findFiles(@Nonnull String prefix, @Nonnull FileFilter
filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
+ * deduplicate all identified files.
+ * @return a stream of {@link FileSpec file specs}. The {@link
FileSpec#createdAtMillis()}
+ * attribute is usually not populated, as it would have to be derived
from user-provided
+ * information in metadata or snapshot. The {@link FileSpec#fileType()}
attribute is populated
+ * based on where a file appears during identification.
+ */
+ Stream identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate);
+
+ /**
+ * Identifies all files referenced by the given view-metadata.
+ *
+ * In case "container" files like the metadata are not readable, the
returned stream will just
+ * not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param viewMetadataLocation Iceberg view-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
Review Comment:
Again i'm not sure why we need a deduplicate since I would think we can get
the unique listing without any additional processing, I may be missing
something here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673187084
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream findFiles(@Nonnull String prefix, @Nonnull FileFilter
filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
Review Comment:
I'm also a little confused about why dedupe is needed here, unless the table
is violating the spec (by having files added with non-unique names or adding
the same file multiple times to the table) we shouldn't be able to find any files to
dedupe. From the metadata we should be able to get a unique listing of the
files in the table by starting with the oldest snapshot, and then only adding
"added" files in all snapshots published after that point.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673187084
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream findFiles(@Nonnull String prefix, @Nonnull FileFilter
filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
Review Comment:
I'm also a little confused about why dedupe is needed here, unless the table
is violating the spec (by having files added with non-unique names or adding
the same file multiple time to the table) we shouldn't be able to find any
files to dedupe. From the metadata we should be able to get a unique listing of
the files in the table by starting with the oldest snapshot, and then only
adding "added" files in all snapshots published after that point.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
RussellSpitzer commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2673162094
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileType.java:
##
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import org.apache.iceberg.ContentFile;
+
+public enum FileType {
+ UNKNOWN(false, false),
+ ICEBERG_METADATA(true, false),
+ ICEBERG_STATISTICS(false, false),
Review Comment:
May be better to just say Puffin? (I assume that's what this is covering)
because these could be Statistics or could be Delete Files and I guess
theoretically both? Or is it for the parquet statistics files for partitions??
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileType.java:
##
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import org.apache.iceberg.ContentFile;
+
+public enum FileType {
+ UNKNOWN(false, false),
+ ICEBERG_METADATA(true, false),
+ ICEBERG_STATISTICS(false, false),
Review Comment:
May be better to just say Puffin? (I assume that's what this is covering)
because these could be Statistics or could be Delete Files and I guess
theoretically both? Or is it for the parquet statistics files for partitions??
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2669371313 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: @pingtimeout, @snazy, & @flyrain - I'm fine merging this in as-is with a quick follow up to remove the Nessie dependency. 
I played around with it and I have a version of IcebergFixtures that can rely directly on Iceberg without Project Nessie (see below). ``` /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.polaris.storage.files.impl; import static java.lang.String.format; import static java.nio.charset.StandardCharsets.UTF_8; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Map; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.FileFormat; import org.apache.iceberg.IcebergBridge; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ManifestWriter; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.avro.Avro; import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.io.PositionOutputStream; import org.apache.iceberg.types.Types;
Re: [PR] Object storage operations [polaris]
pingtimeout commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2668349706
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
Review Comment:
Ok feel free to ignore then, I was just considering whether the cast could
be the caller's responsibility instead of the FileOperationsImpl responsibility
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2667987486
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
Review Comment:
Issue here is that the input types are just `FileIO` as returned by the file
IO factories.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2667941592
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
+ * deduplicate all identified files.
+ * @return a stream of {@link FileSpec file specs}. The {@link
FileSpec#createdAtMillis()}
+ * attribute is usually not populated, as it would have to be derived
from user-provided
+ * information in metadata or snapshot. The {@link FileSpec#fileType()}
attribute is populated
+ * based on where a file appears during identification.
+ */
+ Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate);
+
+ /**
+ * Identifies all files referenced by the given view-metadata.
+ *
+ * In case "container" files like the metadata are not readable, the
returned stream will just
+ * not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param viewMetadataLocation Iceberg view-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
+ * deduplicate all identified files.
+ * @return a stream of {@link FileSpec file specs}. The {@link
FileSpec#createdAtMillis()}
+ * attribute is usually not populated, as it would have been derived
from user-provided
Review Comment:
Not "have to be", "would have been" is correct.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2667934003
##
storage/files/impl/build.gradle.kts:
##
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+plugins {
+ id("polaris-server")
+ id("org.kordamp.gradle.jandex")
+}
+
+dependencies {
+ implementation(project(":polaris-storage-files-api"))
+
+ implementation(platform(libs.iceberg.bom))
+ implementation("org.apache.iceberg:iceberg-api")
+ implementation("org.apache.iceberg:iceberg-core")
+
+ implementation(platform(libs.jackson.bom))
+ implementation("com.fasterxml.jackson.core:jackson-annotations")
+ implementation("com.fasterxml.jackson.core:jackson-core")
+ implementation("com.fasterxml.jackson.core:jackson-databind")
+ implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-smile")
+ runtimeOnly("com.fasterxml.jackson.datatype:jackson-datatype-guava")
+ runtimeOnly("com.fasterxml.jackson.datatype:jackson-datatype-jdk8")
+ runtimeOnly("com.fasterxml.jackson.datatype:jackson-datatype-jsr310")
+
+ implementation(libs.guava)
+ implementation(libs.slf4j.api)
+
+ implementation(libs.jakarta.inject.api)
+ implementation(libs.jakarta.validation.api)
+ implementation(libs.jakarta.inject.api)
Review Comment:
fixed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
pingtimeout commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2665464698 ## storage/files/impl/src/testFixtures/resources/logback-test.xml: ## @@ -0,0 +1,30 @@ + +
Re: [PR] Object storage operations [polaris]
flyrain commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2663050338 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: I do not fully agree with this. These classes are not just test helpers. They introduce a parallel Iceberg metadata and behavior model, even if used only in tests today. 
That means they still encode assumptions about Iceberg semantics, which increases long term maintenance risk. The idea that they would only break with a breaking Iceberg spec change is also optimistic. Iceberg often evolves through additive or clarifying changes that do not break the spec but can still invalidate assumptions in helper classes like these. Finally, this creates an indirect dependency where Polaris has to wait for Nessie to catch up whenever Iceberg changes. That coupling feels unnecessary and makes it harder for Polaris to stay closely aligned with upstream Iceberg behavior. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2662100131 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: > it must have changed since I last looked it over. Nope. Was in there since the very first commit. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2660842228 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: These classes are only needed for simpler tests, which are not possible or only possible with a lot of hacks with Iceberg Java code. 
These classes would only break, if the Iceberg _spec_ changes in a breaking way. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2653156867 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: I didn't see this before; it must have changed since I last looked it over. 
Personally, I'm not 100% sure what these classes do, however I agree that we should not be relying on Project Nessie for Iceberg-native classes. If these classes are needed, I would rather move them into Polaris rather than keep them in Project Nessie. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
flyrain commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2652095385 ## storage/files/impl/src/testFixtures/java/org/apache/polaris/storage/files/impl/IcebergFixtures.java: ## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.storage.files.impl; + +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.util.Map; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.IcebergBridge; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.apache.iceberg.types.Types; +import org.projectnessie.catalog.formats.iceberg.IcebergSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergDataFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergFileFormat; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestContent; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntry; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestEntryStatus; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFile; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestFileWriterSpec; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriter; +import org.projectnessie.catalog.formats.iceberg.manifest.IcebergManifestListWriterSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec; +import org.projectnessie.catalog.formats.iceberg.meta.IcebergSchema; Review Comment: I do not think Polaris should depend on the Nessie project for Iceberg metadata. What if Iceberg itself changed, while Nessie doesn't change them or haven't change yet? 
cc @dennishuo @singhpk234 @adam-christian-software -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2643386963
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
Review Comment:
OK. Thanks. Resolved.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2643307706
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+ public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var metadataFiles = Stream.of(metadataFileSpec);
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null) {
+ v
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2643305669
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
+
+ /**
+ * The number of purged files.
+ *
+ * The returned value may be wrong and include non-existing files.
Review Comment:
AH OK. Sounds good. I'm good to resolve.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2642404471
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
+
+ /**
+ * The number of purged files.
+ *
+ * The returned value may be wrong and include non-existing files.
Review Comment:
I'd prefer to leave it ;)
The reason is that this object-storage API doesn't _require_ us to use
Iceberg's `FileIO`. We could access the object-store SDKs directly, which would
be more efficient and yield correct numbers including deletion-error
information per file/object. But that would be some more work.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2642395923
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+  Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
Review Comment:
There is already a currently hard-coded limit of 100,000 objects "currently
considered for deduplication" - aka deduplication works on the last seen
100,000 objects.
We can certainly think of a better way to implement that in a follow-up.
Just not sure whether every caller should decide, but rather some
configuration.
That's why I phrased the docs for this parameter pretty vague - to give
implementations some freedom.
For the purge use cases, whether deduplication happens or not isn't that
important - at worst, it's some more work being done, but nothing would break.
Other use cases, for example to count the number of distinct data files, may
require deduplication across potentially even billions of data files, which
they could implement in a heap-friendly way by placing their own deduplication
"around the stream" - think: `Stream files =
myDeduplicator(fileOperations.identifyIcebergTableFiles(metadata, false))`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
snazy commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2642366635
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+  public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+  public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var metadataFiles = Stream.of(metadataFileSpec);
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null) {
+ var statisticsFileS
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2641191500
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeStats.java:
##
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import java.time.Duration;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+@PolarisImmutable
+public interface PurgeStats {
+ Duration duration();
+
+ /**
+ * The number of purged files.
+ *
+ * The returned value may be wrong and include non-existing files.
Review Comment:
So, if the number can be wrong, I'd just remove it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2641190657
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileSpec.java:
##
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import com.google.errorprone.annotations.CanIgnoreReturnValue;
+import java.util.Optional;
+import java.util.OptionalLong;
+import org.apache.polaris.immutables.PolarisImmutable;
+
+/**
+ * Describes a single file/object in an object storage.
+ *
+ * Not all attributes are populated by every {@link FileOperations}
function.
+ */
+@PolarisImmutable
+public interface FileSpec {
+
+ /** The full object storage URI. */
+ String location();
+
+ /**
+ * The type of the file, if known.
+ *
+ * @see #guessTypeFromName()
+ */
+  Optional<FileType> fileType();
+
+ /** The size of the file in bytes, if available. */
+ OptionalLong size();
+
+ /** The creation timestamp in milliseconds since the epoch, if available. */
+ OptionalLong createdAtMillis();
+
+ static Builder builder() {
+return ImmutableFileSpec.builder();
+ }
+
+ static Builder fromLocation(String location) {
+return builder().location(location);
+ }
+
+ static Builder fromLocationAndSize(String location, long size) {
+var b = fromLocation(location);
+if (size > 0L) {
+ b.size(size);
+}
+return b;
+ }
+
+ default FileType guessTypeFromName() {
+var location = location();
+var lastSlash = location.lastIndexOf('/');
+var fileName = lastSlash > 0 ? location.substring(lastSlash + 1) :
location;
+
+if (fileName.contains(".metadata.json")) {
+ return FileType.ICEBERG_METADATA;
+} else if (fileName.startsWith("snap-")) {
+ return FileType.ICEBERG_MANIFEST_LIST;
+} else if (fileName.contains("-m")) {
Review Comment:
Resolved.
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeSpec.java:
##
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.OptionalDouble;
+import java.util.function.Consumer;
+import org.apache.polaris.immutables.PolarisImmutable;
+import org.immutables.value.Value;
+
+@SuppressWarnings("unused")
+@PolarisImmutable
+public interface PurgeSpec {
+ PurgeSpec DEFAULT_INSTANCE = PurgeSpec.builder().build();
+
+ @Value.Default
+ default FileFilter fileFilter() {
+return FileFilter.alwaysTrue();
+ }
+
+ PurgeSpec withFileFilter(FileFilter fileFilter);
+
+ /**
+ * Delete batch size for purge/batch-deletion operations. Implementations
may opt to ignore this
+ * parameter and enforce a reasonable or required different limit.
+ */
+ @Value.Default
+ default int deleteBatchSize() {
+return 250;
Review Comment:
Resolved.
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/PurgeSpec.java:
##
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file e
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2641190123
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+  Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
Review Comment:
Resolved.
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * All functions of this interface rather yield incomplete results and
continue over throwing
+ * exceptions.
+ */
+public interface FileOperations {
Review Comment:
Resolved.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2641189646
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+  public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+  public Stream<FileSpec> identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var metadataFiles = Stream.of(metadataFileSpec);
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null) {
+ v
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2641188590
##
storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsImpl.java:
##
@@ -0,0 +1,471 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.impl;
+
+import static java.lang.String.format;
+
+import com.google.common.collect.Streams;
+import com.google.common.util.concurrent.RateLimiter;
+import jakarta.annotation.Nonnull;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.TableMetadata;
+import org.apache.iceberg.TableMetadataParser;
+import org.apache.iceberg.io.BulkDeletionFailureException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.io.SupportsPrefixOperations;
+import org.apache.iceberg.view.ViewMetadata;
+import org.apache.iceberg.view.ViewMetadataParser;
+import org.apache.polaris.storage.files.api.FileFilter;
+import org.apache.polaris.storage.files.api.FileOperations;
+import org.apache.polaris.storage.files.api.FileSpec;
+import org.apache.polaris.storage.files.api.FileType;
+import org.apache.polaris.storage.files.api.ImmutablePurgeStats;
+import org.apache.polaris.storage.files.api.PurgeSpec;
+import org.apache.polaris.storage.files.api.PurgeStats;
+import org.projectnessie.storage.uri.StorageUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @param fileIO the {@link FileIO} instance to use. The given instance must
implement both {@link
+ * org.apache.iceberg.io.SupportsBulkOperations} and {@link
+ * org.apache.iceberg.io.SupportsPrefixOperations}.
+ */
+record FileOperationsImpl(@Nonnull FileIO fileIO) implements FileOperations {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(FileOperationsImpl.class);
+
+ @Override
+  public Stream<FileSpec> findFiles(@Nonnull String prefix, @Nonnull FileFilter filter) {
+var prefixUri = StorageUri.of(prefix).resolve("/");
+if (fileIO instanceof SupportsPrefixOperations prefixOps) {
+ return Streams.stream(prefixOps.listPrefix(prefix).iterator())
+ .filter(Objects::nonNull)
+ .map(
+ fileInfo -> {
+var location = StorageUri.of(fileInfo.location());
+if (!location.isAbsolute()) {
+ // ADLSFileIO does _not_ include the prefix, but GCSFileIO
and S3FileIO do.
+ location = prefixUri.resolve(location);
+}
+return FileSpec.builder()
+.location(location.toString())
+.size(fileInfo.size())
+.createdAtMillis(fileInfo.createdAtMillis())
+.build();
+ })
+ .filter(filter);
+}
+
+throw new IllegalStateException(
+format(
+"An Iceberg FileIO supporting prefix operations is required, but
the given %s does not",
+fileIO.getClass().getName()));
+ }
+
+ @Override
+ public Stream identifyIcebergTableFiles(
+ @Nonnull String tableMetadataLocation, boolean deduplicate) {
+var metadataOpt = readTableMetadataFailsafe(tableMetadataLocation);
+if (metadataOpt.isEmpty()) {
+ return Stream.empty();
+}
+var metadata = metadataOpt.get();
+
+var metadataFileSpec =
+
FileSpec.fromLocation(tableMetadataLocation).fileType(FileType.ICEBERG_METADATA).build();
+
+var metadataFiles = Stream.of(metadataFileSpec);
+
+var statisticsFiles = metadata.statisticsFiles();
+if (statisticsFiles != null) {
+ v
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256: URL: https://github.com/apache/polaris/pull/3256#discussion_r2641188151 ## storage/files/README.md: ## @@ -0,0 +1,70 @@ + + +# Polaris object store operations + +API and implementations to perform long-running operations against object stores, mostly to purge files. Review Comment: Resolved. ## storage/files/README.md: ## @@ -0,0 +1,70 @@ + + +# Polaris object store operations + +API and implementations to perform long-running operations against object stores, mostly to purge files. + +Functionalities to scan an object store and to purge files are separated. Filter mechanisms are used to +select the files to be deleted (purged). + +There are implementations to identify the files referenced by a particular Iceberg table or view metadata, including +statistics files, manifest lists of all snapshots, the manifest files and the data/delete files. + +The file operations perform no effort to identify duplicates during the identification of files referenced by +a table or view metadata. +This means that, for example, a data file referenced in multiple manifest files will be returned twice. + +Purge operations are performed in one or multiple bulk delete operations. +The implementation takes care of not including the same file more than once within a single bulk delete operation. + +One alternative implementation purges all files within the base location of a table or view metadata. Review Comment: Resolved. ## storage/files/README.md: ## @@ -0,0 +1,70 @@ + + +# Polaris object store operations + +API and implementations to perform long-running operations against object stores, mostly to purge files. + +Functionalities to scan an object store and to purge files are separated. Filter mechanisms are used to +select the files to be deleted (purged). 
+ +There are implementations to identify the files referenced by a particular Iceberg table or view metadata, including +statistics files, manifest lists of all snapshots, the manifest files and the data/delete files. + +The file operations perform no effort to identify duplicates during the identification of files referenced by +a table or view metadata. +This means that, for example, a data file referenced in multiple manifest files will be returned twice. + +Purge operations are performed in one or multiple bulk delete operations. +The implementation takes care of not including the same file more than once within a single bulk delete operation. + +One alternative implementation purges all files within the base location of a table or view metadata. + +All implemented operations are designed to be resilient against failures as those are expected to be run as +maintenance operations or as part of such. +The operations are implemented to continue in case of errors and eventually succeed instead of failing eagerly. +Maintenance operations are usually not actively observed, and manually fixing consistency issues in object +stores is not a straightforward task for users. + +# Potential future enhancements + +The operations provided by `FileOperations` are meant for maintenance operations, which are not +time- or performance-critical. +It is more important that the operations are resilient against failures, do not add unnecessary CPU or heap pressure +and eventually succeed. +Further, maintenance operations should not eat up too much I/O bandwidth to not interfere with other user-facing +operations. + +Depending on the overall load of the system, it might be worth running some operations in parallel. + +# Code architecture + +The code is split in two modules. One for the (Polaris internal) API interfaces and one for the implementations. 
+ +Tests against various object store implementations are included as unit tests using an on-heap object-store-mock +and as integration tests against test containers for S3, GCS and ADLS. +The object-store-mock used in unit tests is also used to validate the low heap-pressure required by the +implementations. + +The actual object store interaction of the current implementation is delegated to Iceberg `FileIO` implementations. +Only `FileIO` implementations that support prefix-operations (`SupportsPrefixOperations` interface) and Review Comment: Resolved. ## storage/files/impl/src/main/java/org/apache/polaris/storage/files/impl/FileOperationsFactoryImpl.java: ## @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless requi
Re: [PR] Object storage operations [polaris]
adam-christian-software commented on code in PR #3256:
URL: https://github.com/apache/polaris/pull/3256#discussion_r2641140268
##
storage/files/api/src/main/java/org/apache/polaris/storage/files/api/FileOperations.java:
##
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.storage.files.api;
+
+import jakarta.annotation.Nonnull;
+import java.util.stream.Stream;
+
+/**
+ * Object storage file operations, used to find files below a given prefix, to
purge files, to
+ * identify referenced files, etc.
+ *
+ * <p>All functions of this interface prefer to yield incomplete results and
continue, rather than
+ * throw exceptions.
+ */
+public interface FileOperations {
+ /**
+ * Find files that match the given prefix and filter.
+ *
+ * Whether existing but inaccessible files are included in the result
depends on the object
+ * store.
+ *
+ * Call sites should consider rate-limiting the scan operations, for
example, by using Guava's
+ * {@code RateLimiter} via a {@code Stream.map(x -> { rateLimiter.acquire();
return x; }} step on
+ * the returned stream.
+ *
+ * @param prefix full object storage URI prefix, including scheme and bucket.
+ * @param filter file filter
+ * @return a stream of file specs with the {@link
FileSpec#createdAtMillis()} and {@link
+ * FileSpec#size()} attributes populated with the information provided
by the object store.
+ * The {@link FileSpec#fileType() file type} attribute is not populated,
it may be {@link
+ * FileSpec#guessTypeFromName() guessed}.
+ */
+ Stream findFiles(@Nonnull String prefix, @Nonnull FileFilter
filter);
+
+ /**
+ * Identifies all files referenced by the given table-metadata.
+ *
+ * In case "container" files, like the metadata, manifest-list or
manifest files, are not
+ * readable, the returned stream will just not include those.
+ *
+ * Rate-limiting the returned stream is recommended when identifying
multiple tables and/or
+ * views. Rate-limiting on a single invocation may not be effective as
expected.
+ *
+ * @param tableMetadataLocation Iceberg table-metadata location
+ * @param deduplicate if true, attempt to deduplicate files by their
location, adding additional
+ * heap pressure to the operation. Implementations may ignore this
parameter or may not
Review Comment:
I understand that, but should we have certain conditions under which the
parameter is ignored, or should the caller take the responsibility of
ensuring that we do not deduplicate if it would be too much work? For me, I
would have the caller take the responsibility. That would solidify the contract
in the interface, so that the caller does not have to always assume that the
deduplication was ignored.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
