This is an automated email from the ASF dual-hosted git repository. xy720 pushed a commit to branch fix/iceberg-infer-format-light in repository https://gitbox.apache.org/repos/asf/doris.git
commit 7e53c657bb974ad9a750b4f6c9a3a5a6c42e64d7 Author: lambxu <[email protected]> AuthorDate: Tue Jun 23 23:24:03 2026 +0800 save --- .../doris/datasource/iceberg/IcebergUtils.java | 32 +++++++++++++++++----- .../doris/datasource/iceberg/IcebergUtilsTest.java | 30 ++++++++++++++++++++ 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java index ea36d755cfd..5376e4ecb4b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java @@ -79,9 +79,11 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.iceberg.BaseTable; import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.DataFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.MetadataTableType; import org.apache.iceberg.MetadataTableUtils; @@ -106,6 +108,7 @@ import org.apache.iceberg.expressions.Projections; import org.apache.iceberg.expressions.Unbound; import org.apache.iceberg.hive.HiveCatalog; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileIO; import org.apache.iceberg.types.Type.TypeID; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -133,6 +136,7 @@ import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoField; import java.time.temporal.TemporalAccessor; import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashSet; @@ -1184,16 +1188,30 @@ public class IcebergUtils { } private static String inferFileFormatFromDataFiles(Table icebergTable) { - if (icebergTable.currentSnapshot() == null) { + Snapshot snapshot = icebergTable.currentSnapshot(); + if (snapshot == null) { LOG.info("Iceberg table {} has no snapshot, defaulting to {}", icebergTable.name(), PARQUET_NAME); return PARQUET_NAME; } - try (CloseableIterable<FileScanTask> files = icebergTable.newScan().planFiles()) { - java.util.Iterator<FileScanTask> it = files.iterator(); - if (it.hasNext()) { - String format = it.next().file().format().name().toLowerCase(); - LOG.info("Iceberg table {} inferred file format {} from data files", icebergTable.name(), format); - return format; + FileIO io = icebergTable.io(); + try { + for (ManifestFile manifest : snapshot.dataManifests(io)) { + // Read the `file_format` field from a single data manifest entry. + // To avoid many synchronous IO hanging FE planning. + if (manifest.addedFilesCount() != null && manifest.existingFilesCount() != null + && manifest.addedFilesCount() == 0 && manifest.existingFilesCount() == 0) { + continue; + } + try (CloseableIterable<DataFile> entries = ManifestFiles.read(manifest, io) + .select(Collections.singletonList(DataFile.FILE_FORMAT.name()))) { + java.util.Iterator<DataFile> it = entries.iterator(); + if (it.hasNext()) { + String format = it.next().format().name().toLowerCase(); + LOG.info("Iceberg table {} inferred file format {} from manifest {}", + icebergTable.name(), format, manifest.path()); + return format; + } + } } } catch (Exception e) { LOG.warn("Failed to infer file format from data files for table {}, defaulting to {}", diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/iceberg/IcebergUtilsTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/iceberg/IcebergUtilsTest.java index 5b25da419cc..af0eb0ff498 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/datasource/iceberg/IcebergUtilsTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/iceberg/IcebergUtilsTest.java @@ -25,6 +25,7 @@ import org.apache.doris.common.UserException; import org.apache.doris.datasource.iceberg.source.IcebergTableQueryInfo; import com.google.common.collect.ImmutableMap; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.GenericPartitionFieldSummary; import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.ManifestContent; @@ -711,4 +712,33 @@ public class IcebergUtilsTest { Assert.assertEquals(expectSchemaId, queryInfo.getSchemaId()); Assert.assertEquals(expectRef, queryInfo.getRef()); } + + @Test + public void testGetFileFormatHonorsWriteFormatProperty() { + Table table = Mockito.mock(Table.class); + Mockito.when(table.properties()).thenReturn(ImmutableMap.of(IcebergUtils.WRITE_FORMAT, "ORC")); + Assert.assertEquals(FileFormat.ORC, IcebergUtils.getFileFormat(table)); + // No snapshot lookup should be needed when properties are sufficient. + Mockito.verify(table, Mockito.never()).currentSnapshot(); + } + + @Test + public void testGetFileFormatHonorsDefaultFormatProperty() { + Table table = Mockito.mock(Table.class); + Mockito.when(table.properties()).thenReturn( + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, "parquet")); + Assert.assertEquals(FileFormat.PARQUET, IcebergUtils.getFileFormat(table)); + Mockito.verify(table, Mockito.never()).currentSnapshot(); + } + + @Test + public void testGetFileFormatFallsBackToParquetWhenNoSnapshotAndNoProperties() { + // Migrated tables without write-format / write.format.default and no snapshot + // must not throw and must default to parquet. + Table table = Mockito.mock(Table.class); + Mockito.when(table.properties()).thenReturn(ImmutableMap.of()); + Mockito.when(table.currentSnapshot()).thenReturn(null); + Mockito.when(table.name()).thenReturn("test_no_snapshot_table"); + Assert.assertEquals(FileFormat.PARQUET, IcebergUtils.getFileFormat(table)); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
