This is an automated email from the ASF dual-hosted git repository.
amogh-jahagirdar pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new b84b37f430 Core: Replace string-based schema projection with selection on field-id (#16184)
b84b37f430 is described below
commit b84b37f430b6c6e6b80e0a11d16c4d842b9da2a0
Author: Hongyue/Steve Zhang <[email protected]>
AuthorDate: Wed May 6 09:09:39 2026 -0700
Core: Replace string-based schema projection with selection on field-id (#16184)
---
.../org/apache/iceberg/FileCleanupStrategy.java | 19 +--
.../java/org/apache/iceberg/ManifestReader.java | 10 +-
.../java/org/apache/iceberg/PartitionsTable.java | 127 +++++++++++++--------
3 files changed, 95 insertions(+), 61 deletions(-)
diff --git a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java
index dd92d33cda..573aef057f 100644
--- a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java
+++ b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java
@@ -26,7 +26,9 @@ import org.apache.iceberg.io.BulkDeletionFailureException;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.SupportsBulkOperations;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.util.Tasks;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -78,14 +80,15 @@ abstract class FileCleanupStrategy {
ExpireSnapshots.CleanupLevel cleanupLevel);
private static final Schema MANIFEST_PROJECTION =
- ManifestFile.schema()
- .select(
- "manifest_path",
- "manifest_length",
- "partition_spec_id",
- "added_snapshot_id",
- "added_files_count",
- "deleted_files_count");
+ TypeUtil.select(
+ ManifestFile.schema(),
+ ImmutableSet.of(
+ ManifestFile.PATH.fieldId(),
+ ManifestFile.LENGTH.fieldId(),
+ ManifestFile.SPEC_ID.fieldId(),
+ ManifestFile.SNAPSHOT_ID.fieldId(),
+ ManifestFile.ADDED_FILES_COUNT.fieldId(),
+ ManifestFile.DELETED_FILES_COUNT.fieldId()));
protected CloseableIterable<ManifestFile> readManifests(Snapshot snapshot) {
if (snapshot.manifestListLocation() != null) {
diff --git a/core/src/main/java/org/apache/iceberg/ManifestReader.java b/core/src/main/java/org/apache/iceberg/ManifestReader.java
index 668a3764de..09bbe8b0cc 100644
--- a/core/src/main/java/org/apache/iceberg/ManifestReader.java
+++ b/core/src/main/java/org/apache/iceberg/ManifestReader.java
@@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.PartitionSet;
import org.slf4j.Logger;
@@ -68,6 +69,11 @@ public class ManifestReader<F extends ContentFile<F>> extends CloseableGroup
"upper_bounds",
"record_count");
+ private static final Schema STATUS_ONLY_PROJECTION =
+ TypeUtil.select(
+ ManifestEntry.getSchema(Types.StructType.of()),
+ ImmutableSet.of(ManifestEntry.STATUS.fieldId()));
+
protected enum FileType {
DATA_FILES(GenericDataFile.class),
DELETE_FILES(GenericDeleteFile.class);
@@ -157,9 +163,7 @@ public class ManifestReader<F extends ContentFile<F>> extends CloseableGroup
Map<String, String> metadata;
try {
try (CloseableIterable<ManifestEntry<T>> headerReader =
- InternalData.read(FileFormat.AVRO, inputFile)
- .project(ManifestEntry.getSchema(Types.StructType.of()).select("status"))
- .build()) {
+ InternalData.read(FileFormat.AVRO, inputFile).project(STATUS_ONLY_PROJECTION).build()) {
if (headerReader instanceof AvroIterable) {
metadata = ((AvroIterable<ManifestEntry<T>>) headerReader).getMetadata();
diff --git a/core/src/main/java/org/apache/iceberg/PartitionsTable.java b/core/src/main/java/org/apache/iceberg/PartitionsTable.java
index 09c6e7893b..10366db5a5 100644
--- a/core/src/main/java/org/apache/iceberg/PartitionsTable.java
+++ b/core/src/main/java/org/apache/iceberg/PartitionsTable.java
@@ -27,7 +27,9 @@ import java.util.List;
import org.apache.iceberg.expressions.ManifestEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.types.Comparators;
+import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ParallelIterable;
import org.apache.iceberg.util.PartitionUtil;
@@ -37,6 +39,58 @@ import org.apache.iceberg.util.StructProjection;
/** A {@link Table} implementation that exposes a table's partitions as rows. */
public class PartitionsTable extends BaseMetadataTable {
+ private static final int PARTITION_FIELD_ID = 1;
+
+ private static final Types.NestedField SPEC_ID =
+ Types.NestedField.required(4, "spec_id", Types.IntegerType.get());
+ private static final Types.NestedField RECORD_COUNT =
+ Types.NestedField.required(
+ 2, "record_count", Types.LongType.get(), "Count of records in data files");
+ private static final Types.NestedField FILE_COUNT =
+ Types.NestedField.required(3, "file_count", Types.IntegerType.get(), "Count of data files");
+ private static final Types.NestedField TOTAL_DATA_FILE_SIZE_IN_BYTES =
+ Types.NestedField.required(
+ 11,
+ "total_data_file_size_in_bytes",
+ Types.LongType.get(),
+ "Total size in bytes of data files");
+ private static final Types.NestedField POSITION_DELETE_RECORD_COUNT =
+ Types.NestedField.required(
+ 5,
+ "position_delete_record_count",
+ Types.LongType.get(),
+ "Count of records in position delete files");
+ private static final Types.NestedField POSITION_DELETE_FILE_COUNT =
+ Types.NestedField.required(
+ 6,
+ "position_delete_file_count",
+ Types.IntegerType.get(),
+ "Count of position delete files");
+ private static final Types.NestedField EQUALITY_DELETE_RECORD_COUNT =
+ Types.NestedField.required(
+ 7,
+ "equality_delete_record_count",
+ Types.LongType.get(),
+ "Count of records in equality delete files");
+ private static final Types.NestedField EQUALITY_DELETE_FILE_COUNT =
+ Types.NestedField.required(
+ 8,
+ "equality_delete_file_count",
+ Types.IntegerType.get(),
+ "Count of equality delete files");
+ private static final Types.NestedField LAST_UPDATED_AT =
+ Types.NestedField.optional(
+ 9,
+ "last_updated_at",
+ Types.TimestampType.withZone(),
+ "Commit time of snapshot that last updated this partition");
+ private static final Types.NestedField LAST_UPDATED_SNAPSHOT_ID =
+ Types.NestedField.optional(
+ 10,
+ "last_updated_snapshot_id",
+ Types.LongType.get(),
+ "Id of snapshot that last updated this partition");
+
private final Schema schema;
private final boolean unpartitionedTable;
@@ -50,47 +104,18 @@ public class PartitionsTable extends BaseMetadataTable {
this.schema =
new Schema(
- Types.NestedField.required(1, "partition", Partitioning.partitionType(table)),
- Types.NestedField.required(4, "spec_id", Types.IntegerType.get()),
- Types.NestedField.required(
- 2, "record_count", Types.LongType.get(), "Count of records in data files"),
- Types.NestedField.required(
- 3, "file_count", Types.IntegerType.get(), "Count of data files"),
- Types.NestedField.required(
- 11,
- "total_data_file_size_in_bytes",
- Types.LongType.get(),
- "Total size in bytes of data files"),
Types.NestedField.required(
- 5,
- "position_delete_record_count",
- Types.LongType.get(),
- "Count of records in position delete files"),
- Types.NestedField.required(
- 6,
- "position_delete_file_count",
- Types.IntegerType.get(),
- "Count of position delete files"),
- Types.NestedField.required(
- 7,
- "equality_delete_record_count",
- Types.LongType.get(),
- "Count of records in equality delete files"),
- Types.NestedField.required(
- 8,
- "equality_delete_file_count",
- Types.IntegerType.get(),
- "Count of equality delete files"),
- Types.NestedField.optional(
- 9,
- "last_updated_at",
- Types.TimestampType.withZone(),
- "Commit time of snapshot that last updated this partition"),
- Types.NestedField.optional(
- 10,
- "last_updated_snapshot_id",
- Types.LongType.get(),
- "Id of snapshot that last updated this partition"));
+ PARTITION_FIELD_ID, "partition", Partitioning.partitionType(table)),
+ SPEC_ID,
+ RECORD_COUNT,
+ FILE_COUNT,
+ TOTAL_DATA_FILE_SIZE_IN_BYTES,
+ POSITION_DELETE_RECORD_COUNT,
+ POSITION_DELETE_FILE_COUNT,
+ EQUALITY_DELETE_RECORD_COUNT,
+ EQUALITY_DELETE_FILE_COUNT,
+ LAST_UPDATED_AT,
+ LAST_UPDATED_SNAPSHOT_ID);
this.unpartitionedTable = Partitioning.partitionType(table).fields().isEmpty();
}
@@ -102,16 +127,18 @@ public class PartitionsTable extends BaseMetadataTable {
@Override
public Schema schema() {
if (unpartitionedTable) {
- return schema.select(
- "record_count",
- "file_count",
- "total_data_file_size_in_bytes",
- "position_delete_record_count",
- "position_delete_file_count",
- "equality_delete_record_count",
- "equality_delete_file_count",
- "last_updated_at",
- "last_updated_snapshot_id");
+ return TypeUtil.select(
+ schema,
+ ImmutableSet.of(
+ RECORD_COUNT.fieldId(),
+ FILE_COUNT.fieldId(),
+ TOTAL_DATA_FILE_SIZE_IN_BYTES.fieldId(),
+ POSITION_DELETE_RECORD_COUNT.fieldId(),
+ POSITION_DELETE_FILE_COUNT.fieldId(),
+ EQUALITY_DELETE_RECORD_COUNT.fieldId(),
+ EQUALITY_DELETE_FILE_COUNT.fieldId(),
+ LAST_UPDATED_AT.fieldId(),
+ LAST_UPDATED_SNAPSHOT_ID.fieldId()));
}
return schema;
}