This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new caeacdf33 IMPALA-14792: Try avoiding hadoop.fs.Path when loading 
Iceberg tables
caeacdf33 is described below

commit caeacdf331136b25669e08a7b1cc8ce9e4c1122d
Author: Csaba Ringhofer <[email protected]>
AuthorDate: Fri Feb 27 18:12:17 2026 +0100

    IMPALA-14792: Try avoiding hadoop.fs.Path when loading Iceberg tables
    
    Quick and dirty solution to speed up IcebergFileMetadataLoader.
    Its correctness is based on the assumption that Iceberg file
    locations must be normalized.
    
    Noticed in flamegraphs that org.apache.hadoop.fs.Path constructor
    is one of the main CPU consumers during Iceberg table loading,
    especially incremental reloads when most file descriptors are reused.
    hadoop.fs.Path was used to relativize locations compared to base
    table location and to get the "path" part of the URI. These can
    be done with simple String operations if we can assume that the
    URIs are normalized.
    
    Results on 1M file 25K partition Iceberg table:
    Full load:                  13s->10s
    Incremental load (0 files): 9s->3.5s
    
    hadoop.fs.Path constructor still uses significant CPU time after
    the change, but mainly in functions that run in parallel, so
    its effect is not longer that visible in total execution time.
    
    See Jira for before/after flamegraphs.
    
    Change-Id: Idce89117195e0fa64fdd6a6c576bce09ec2e75ea
    Reviewed-on: http://gerrit.cloudera.org:8080/24052
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Csaba Ringhofer <[email protected]>
---
 .../org/apache/impala/catalog/FileDescriptor.java  | 11 +++---
 .../impala/catalog/IcebergContentFileStore.java    | 36 ++++++++++++++++--
 .../impala/catalog/IcebergFileMetadataLoader.java  | 43 ++++++++++++++--------
 3 files changed, 65 insertions(+), 25 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/FileDescriptor.java 
b/fe/src/main/java/org/apache/impala/catalog/FileDescriptor.java
index 3bba32a82..46be0b216 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FileDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FileDescriptor.java
@@ -220,12 +220,13 @@ public class FileDescriptor implements 
Comparable<FileDescriptor> {
   }
 
   public String getAbsolutePath(String rootPath) {
-    if (StringUtils.isEmpty(fbFileDescriptor_.relativePath())
-        && StringUtils.isNotEmpty(fbFileDescriptor_.absolutePath())) {
-      return fbFileDescriptor_.absolutePath();
-    } else {
-      return rootPath + Path.SEPARATOR + fbFileDescriptor_.relativePath();
+    String relativePath = fbFileDescriptor_.relativePath();
+    if (StringUtils.isNotEmpty(relativePath)) {
+      return rootPath + Path.SEPARATOR + relativePath;
     }
+    String absolutePath = fbFileDescriptor_.absolutePath();
+    if(StringUtils.isEmpty(absolutePath)) return rootPath + Path.SEPARATOR;
+    return absolutePath;
   }
 
   public String getPath() {
diff --git 
a/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java 
b/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java
index 12fdc2a6b..737a91fab 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java
@@ -54,6 +54,9 @@ import org.apache.impala.util.Hash128;
 import org.apache.impala.util.IcebergUtil;
 import org.apache.impala.util.ListMap;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * Helper class for storing Iceberg file descriptors. It stores data and 
delete files
  * separately, while also storing file descriptors belonging to earlier 
snapshots.
@@ -61,6 +64,9 @@ import org.apache.impala.util.ListMap;
  */
 public class IcebergContentFileStore {
 
+  private final static Logger LOG = LoggerFactory.getLogger(
+      IcebergContentFileStore.class);
+
   private static class EncodedFileDescriptor {
     public final byte[] fileDesc_;
     public final byte[] fileMetadata_;
@@ -189,9 +195,11 @@ public class IcebergContentFileStore {
     partitions_ = partitions;
 
     Map<String, IcebergFileDescriptor> fileDescMap = new HashMap<>();
+    String apiTableLocation = iceApiTable.location();
     for (IcebergFileDescriptor fileDesc : fileDescriptors) {
-      Path path = new Path(fileDesc.getAbsolutePath(iceApiTable.location()));
-      fileDescMap.put(path.toUri().getPath(), fileDesc);
+      String absPathStr = fileDesc.getAbsolutePath(apiTableLocation);
+      String pathStr = quickGetPath(absPathStr);
+      fileDescMap.put(pathStr, fileDesc);
     }
 
     for (DataFile dataFile : icebergFiles.dataFilesWithoutDeletes) {
@@ -348,11 +356,31 @@ public class IcebergContentFileStore {
         getIcebergFd(fileDescMap, contentFile));
   }
 
+  private static String quickGetPath(String uri) {
+    int pos1 = uri.indexOf('/');
+    int pos2 = uri.indexOf('/', pos1 + 1);
+    if (pos2 != pos1 + 1) {
+      // Assume no scheme and authority, return whole path.
+      return uri;
+    }
+    int pos3 = uri.indexOf('/', pos2 + 1);
+
+    if (pos3 != -1) {
+        // Return everything starting from the 3rd slash.
+        return uri.substring(pos3);
+    }
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("quickGetPath couldn't parse uri, falling back to slow path: 
{}", uri);
+    }
+    // Something is off, use slow path.
+    return new Path(uri).toUri().getPath();
+  }
+
   private EncodedFileDescriptor getIcebergFd(
       Map<String, IcebergFileDescriptor> fileDescMap,
       ContentFile<?> contentFile) {
-    Path path = new Path(contentFile.path().toString());
-    IcebergFileDescriptor fileDesc = fileDescMap.get(path.toUri().getPath());
+    String pathStr = quickGetPath(contentFile.location());
+    IcebergFileDescriptor fileDesc = fileDescMap.get(pathStr);
 
     if (fileDesc == null) return null;
 
diff --git 
a/fe/src/main/java/org/apache/impala/catalog/IcebergFileMetadataLoader.java 
b/fe/src/main/java/org/apache/impala/catalog/IcebergFileMetadataLoader.java
index b0bf86363..144782df4 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergFileMetadataLoader.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergFileMetadataLoader.java
@@ -69,7 +69,10 @@ public class IcebergFileMetadataLoader extends 
FileMetadataLoader {
       IcebergFileMetadataLoader.class);
 
   private final org.apache.iceberg.Table iceTbl_;
+  // Store Path as both String and Path to be able to use the more efficient 
one in
+  // functions.
   private final Path tablePath_;
+  private final String tablePathStr_;
   private final GroupedContentFiles icebergFiles_;
   private final List<TIcebergPartition> oldIcebergPartitions_;
   private AtomicInteger nextPartitionId_ = new AtomicInteger(0);
@@ -84,7 +87,8 @@ public class IcebergFileMetadataLoader extends 
FileMetadataLoader {
     super(iceTbl.location(), true, oldFds, hostIndex, null, null,
         HdfsFileFormat.ICEBERG);
     iceTbl_ = iceTbl;
-    tablePath_ = FileSystemUtil.createFullyQualifiedPath(new 
Path(iceTbl.location()));
+    tablePathStr_ = iceTbl.location();
+    tablePath_ = FileSystemUtil.createFullyQualifiedPath(new 
Path(tablePathStr_));
     icebergFiles_ = icebergFiles;
     oldIcebergPartitions_ = partitions;
     requiresDataFilesInTableLocation_ = requiresDataFilesInTableLocation;
@@ -125,7 +129,7 @@ public class IcebergFileMetadataLoader extends 
FileMetadataLoader {
     fileMetadataStats_ = new FileMetadataStats();
 
     // Process the existing Fd ContentFile and return the newly added 
ContentFile
-    Iterable<ContentFile<?>> newContentFiles = 
loadContentFilesWithOldFds(tablePath_);
+    Iterable<ContentFile<?>> newContentFiles = loadContentFilesWithOldFds();
     // Iterate through all the newContentFiles, determine if StorageIds are 
supported,
     // and use different handling methods accordingly.
     // This considers that different ContentFiles are on different FileSystems
@@ -173,14 +177,14 @@ public class IcebergFileMetadataLoader extends 
FileMetadataLoader {
    *  Iceberg tables are a collection of immutable, uniquely identifiable data 
files,
    *  which means we can safely reuse the old FDs.
    */
-  private Iterable<ContentFile<?>> loadContentFilesWithOldFds(Path partPath)
+  private Iterable<ContentFile<?>> loadContentFilesWithOldFds()
       throws TableLoadingException {
     if (forceRefreshLocations || oldFdsByPath_.isEmpty()) {
       return icebergFiles_.getAllContentFiles();
     }
     List<ContentFile<?>> newContentFiles = Lists.newArrayList();
     for (ContentFile<?> contentFile : icebergFiles_.getAllContentFiles()) {
-      IcebergFileDescriptor fd = getOldFd(contentFile, partPath);
+      IcebergFileDescriptor fd = getOldFd(contentFile);
       if (fd == null) {
         newContentFiles.add(contentFile);
       } else {
@@ -348,19 +352,26 @@ public class IcebergFileMetadataLoader extends 
FileMetadataLoader {
     return ret;
   }
 
-  IcebergFileDescriptor getOldFd(ContentFile<?> contentFile, Path partPath)
-      throws TableLoadingException {
-    Path contentFilePath = FileSystemUtil.createFullyQualifiedPath(
-        new Path(contentFile.path().toString()));
-    String lookupPath = FileSystemUtil.relativizePathNoThrow(contentFilePath, 
partPath);
-    if (lookupPath == null) {
-      if (requiresDataFilesInTableLocation_) {
-        throw new TableLoadingException(String.format("Failed to load Iceberg 
datafile " +
-            "%s, because it's outside of the table location", 
contentFilePath));
-      } else {
-        lookupPath = contentFilePath.toString();
-      }
+  private String toLookupPath(String path) throws TableLoadingException {
+    if (path.startsWith(tablePathStr_)
+        && path.length() > tablePathStr_.length()
+        && path.charAt(tablePathStr_.length()) == Path.SEPARATOR_CHAR) {
+      // Quick path for relativize fully qualified paths starting with the 
table path.
+      return path.substring(tablePathStr_.length() + 1);
+    } else {
+      // Slow path to handle paths outside the table dir and non fully 
qualified paths.
+      Path fqpath = FileSystemUtil.createFullyQualifiedPath(new Path(path));
+      String result = FileSystemUtil.relativizePathNoThrow(fqpath, tablePath_);
+      if (result != null) return result;
+      if (!requiresDataFilesInTableLocation_) return fqpath.toString();
+      throw new TableLoadingException(String.format("Failed to load Iceberg 
datafile " +
+          "%s, because it's outside of the table location",  
fqpath.toString()));
     }
+  }
+
+  IcebergFileDescriptor getOldFd(ContentFile<?> contentFile)
+      throws TableLoadingException {
+    String lookupPath = toLookupPath(contentFile.location());
     FileDescriptor fd = oldFdsByPath_.get(lookupPath);
     if (fd == null) return null;
 

Reply via email to