This is an automated email from the ASF dual-hosted git repository.

sollhui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new bacfb0fe377 [perf](s3) push down expandable S3 glob prefixes (#64684)
bacfb0fe377 is described below

commit bacfb0fe377b6094dbd13098db51d5c32b7527a8
Author: hui lai <[email protected]>
AuthorDate: Wed Jun 24 14:34:22 2026 +0800

    [perf](s3) push down expandable S3 glob prefixes (#64684)
    
    ### What problem does this PR solve?
    
    Before this change, S3-compatible glob listing derived the object-store
    `ListObjects` prefix by stopping at the first glob metacharacter. For a
    path like:
    
    
    
`s3://bucket/asin_trend/sale/month/date=2025-{0[3-9],1[0-2]}-01/mp_id=8/0/0/436/*`
    
    the old behavior listed the broad prefix:
    
    `asin_trend/sale/month/date=2025-`
    
    and then filtered all returned object keys in FE. If many unrelated
    objects existed under `date=2025-*`, for example other dates, `mp_id`s,
    or deeper paths, S3 TVF planning could spend a long time listing and
    filtering files before query execution started.
    
    After this change, Doris expands safely enumerable glob fragments before
    issuing object-store list requests. The same path is now listed through
    narrower prefixes such as:
    
    `asin_trend/sale/month/date=2025-03-01/mp_id=8/0/0/436/`
    ...
    `asin_trend/sale/month/date=2025-12-01/mp_id=8/0/0/436/`
    
    Doris still applies the full glob regex after listing, so result
    correctness is unchanged. The optimization only reduces the remote
    listing scope. Expansion is limited to bounded brace alternations and
    positive character classes, with a hard cap to avoid generating too many
    prefixes. Existing pagination behavior through `startAfter` and
    `maxFile` is preserved.
---
 .../apache/doris/filesystem/s3/S3FileSystem.java   |  34 ++
 .../filesystem/s3/S3FileSystemProperties.java      |   5 +
 .../doris/filesystem/s3/S3FileSystemTest.java      | 216 ++++++++++++
 .../filesystem/spi/S3CompatibleFileSystem.java     | 362 ++++++++++++++++++---
 4 files changed, 579 insertions(+), 38 deletions(-)

diff --git 
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
 
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
index 6157d81f712..0f6eeb35608 100644
--- 
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
+++ 
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
@@ -19,6 +19,7 @@ package org.apache.doris.filesystem.s3;
 
 import org.apache.doris.filesystem.spi.S3CompatibleFileSystem;
 
+import java.util.List;
 import java.util.Optional;
 
 /**
@@ -46,6 +47,35 @@ public class S3FileSystem extends S3CompatibleFileSystem {
         return Optional.ofNullable(properties);
     }
 
+    @Override
+    protected String globListPrefix(String globPattern) {
+        if (isDirectoryBucketEndpoint()) {
+            return slashTerminatedNonGlobPrefix(globPattern);
+        }
+        return super.globListPrefix(globPattern);
+    }
+
+    @Override
+    protected List<String> globListPrefixes(String globPattern, String 
listPrefix) {
+        if (isDirectoryBucketEndpoint()) {
+            return List.of(listPrefix);
+        }
+        return super.globListPrefixes(globPattern, listPrefix);
+    }
+
+    private boolean isDirectoryBucketEndpoint() {
+        return properties != null && properties.isDirectoryBucketEndpoint();
+    }
+
+    private static String slashTerminatedNonGlobPrefix(String globPattern) {
+        String prefix = longestNonGlobPrefix(globPattern);
+        if (prefix.isEmpty() || prefix.endsWith("/")) {
+            return prefix;
+        }
+        int slash = prefix.lastIndexOf('/');
+        return slash < 0 ? "" : prefix.substring(0, slash + 1);
+    }
+
     protected static boolean isSingleLevelGlob(String pathStr) {
         return S3CompatibleFileSystem.isSingleLevelGlob(pathStr);
     }
@@ -54,6 +84,10 @@ public class S3FileSystem extends S3CompatibleFileSystem {
         return S3CompatibleFileSystem.longestNonGlobPrefix(globPattern);
     }
 
+    protected static List<String> expandedGlobListPrefixes(String globPattern) 
{
+        return S3CompatibleFileSystem.expandedGlobListPrefixes(globPattern);
+    }
+
     protected static String globToRegex(String glob) {
         return S3CompatibleFileSystem.globToRegex(glob);
     }
diff --git 
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
 
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
index ccbfdf66990..34e8fb54b93 100644
--- 
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
+++ 
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
@@ -327,6 +327,11 @@ public final class S3FileSystemProperties
         return StringUtils.isNotBlank(roleArn);
     }
 
+    public boolean isDirectoryBucketEndpoint() {
+        return StringUtils.containsIgnoreCase(endpoint, "s3express-control.")
+                || StringUtils.containsIgnoreCase(endpoint, "s3express-");
+    }
+
     private static void putIfNotBlank(Map<String, String> map, String key, 
String value) {
         if (StringUtils.isNotBlank(value)) {
             map.put(key, value);
diff --git 
a/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
 
b/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
index 5709bd82e0a..fe9b28dd8c9 100644
--- 
a/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
+++ 
b/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
@@ -18,6 +18,7 @@
 package org.apache.doris.filesystem.s3;
 
 import org.apache.doris.filesystem.DorisOutputFile;
+import org.apache.doris.filesystem.GlobListing;
 import org.apache.doris.filesystem.Location;
 import org.apache.doris.filesystem.spi.ObjectListOptions;
 import org.apache.doris.filesystem.spi.RemoteObject;
@@ -27,13 +28,16 @@ import org.apache.doris.filesystem.spi.RequestBody;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
+import org.mockito.ArgumentCaptor;
 import org.mockito.ArgumentMatchers;
 import org.mockito.Mockito;
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.FileAlreadyExistsException;
 import java.util.List;
+import java.util.Map;
 
 /**
  * Unit tests for {@link S3FileSystem} using a mock {@link S3ObjStorage}.
@@ -69,6 +73,34 @@ class S3FileSystemTest {
                 ArgumentMatchers.anyString(), 
ArgumentMatchers.<ObjectListOptions>any());
     }
 
+    private void mockSingleObjectListWithStartAfter(String listUri, String 
key, long size) throws IOException {
+        Mockito.doAnswer(invocation -> {
+            ObjectListOptions options = invocation.getArgument(1);
+            String startAfter = options.startAfter();
+            if (startAfter != null && !startAfter.isEmpty() && 
compareUtf8Binary(key, startAfter) <= 0) {
+                return new RemoteObjects(List.of(), false, null);
+            }
+            return new RemoteObjects(
+                    List.of(new RemoteObject(key, 
key.substring(key.lastIndexOf('/') + 1), null, size, 0L)),
+                    false, null);
+        }).when(mockStorage).listObjectsWithOptions(
+                ArgumentMatchers.eq(listUri), 
ArgumentMatchers.<ObjectListOptions>any());
+    }
+
+    private static int compareUtf8Binary(String left, String right) {
+        byte[] leftBytes = left.getBytes(StandardCharsets.UTF_8);
+        byte[] rightBytes = right.getBytes(StandardCharsets.UTF_8);
+        int commonLength = Math.min(leftBytes.length, rightBytes.length);
+        for (int i = 0; i < commonLength; i++) {
+            int result = Integer.compare(
+                    Byte.toUnsignedInt(leftBytes[i]), 
Byte.toUnsignedInt(rightBytes[i]));
+            if (result != 0) {
+                return result;
+            }
+        }
+        return Integer.compare(leftBytes.length, rightBytes.length);
+    }
+
     // ------------------------------------------------------------------
     // exists() (inherited from ObjFileSystem)
     // ------------------------------------------------------------------
@@ -555,6 +587,190 @@ class S3FileSystemTest {
         Assertions.assertEquals("", 
S3FileSystem.longestNonGlobPrefix("*.csv"));
     }
 
+    @Test
+    void expandedGlobListPrefixes_expandsJiraDatePattern() {
+        List<String> prefixes = S3FileSystem.expandedGlobListPrefixes(
+                
"asin_trend/sale/month/date=2025-{0[3-9],1[0-2]}-01/mp_id=8/0/0/436/*");
+
+        Assertions.assertEquals(10, prefixes.size());
+        Assertions.assertEquals(
+                "asin_trend/sale/month/date=2025-03-01/mp_id=8/0/0/436/",
+                prefixes.get(0));
+        Assertions.assertEquals(
+                "asin_trend/sale/month/date=2025-12-01/mp_id=8/0/0/436/",
+                prefixes.get(9));
+    }
+
+    @Test
+    void expandedGlobListPrefixes_fallsBackWhenBraceArmContainsWildcard() {
+        Assertions.assertEquals(List.of("data/"),
+                
S3FileSystem.expandedGlobListPrefixes("data/{foo*,bar*}/part.parquet"));
+    }
+
+    @Test
+    void 
expandedGlobListPrefixes_fallsBackWhenCharacterClassContainsSupplementaryCodePoint()
 {
+        String emoji = new String(Character.toChars(0x1F600));
+        String pattern = "data/[" + emoji + "]/file.csv";
+
+        
Assertions.assertTrue(java.util.regex.Pattern.compile(S3FileSystem.globToRegex(pattern))
+                .matcher("data/" + emoji + "/file.csv")
+                .matches());
+        Assertions.assertEquals(List.of("data/"), 
S3FileSystem.expandedGlobListPrefixes(pattern));
+    }
+
+    @Test
+    void expandedGlobListPrefixes_sortsPrefixesByUtf8BinaryOrder() {
+        String emoji = new String(Character.toChars(0x1F600));
+        String privateUse = Character.toString(0xE000);
+
+        Assertions.assertEquals(
+                List.of("data/" + privateUse + "/file.csv", "data/" + emoji + 
"/file.csv"),
+                S3FileSystem.expandedGlobListPrefixes("data/{" + emoji + "," + 
privateUse + "}/file.csv"));
+    }
+
+    @Test
+    void globListWithLimit_doesNotAppendSuffixToPartialBraceArmPrefix() throws 
IOException {
+        Mockito.when(mockStorage.listObjects(
+                        ArgumentMatchers.eq("s3://bucket/data/"), 
ArgumentMatchers.isNull()))
+                .thenReturn(new RemoteObjects(
+                        List.of(
+                                new RemoteObject("data/foobar/part.parquet",
+                                        "foobar/part.parquet", null, 10L, 0L),
+                                new RemoteObject("data/barbaz/part.parquet",
+                                        "barbaz/part.parquet", null, 20L, 0L)),
+                        false, null));
+
+        GlobListing listing = fs.globListWithLimit(
+                Location.of("s3://bucket/data/{foo*,bar*}/part.parquet"), 
null, 0L, 0L);
+
+        Assertions.assertEquals(2, listing.getFiles().size());
+        Mockito.verify(mockStorage).listObjects(
+                ArgumentMatchers.eq("s3://bucket/data/"), 
ArgumentMatchers.isNull());
+        Mockito.verify(mockStorage, Mockito.never()).listObjects(
+                ArgumentMatchers.eq("s3://bucket/data/foo/part.parquet"), 
ArgumentMatchers.any());
+        Mockito.verify(mockStorage, Mockito.never()).listObjects(
+                ArgumentMatchers.eq("s3://bucket/data/bar/part.parquet"), 
ArgumentMatchers.any());
+    }
+
+    @Test
+    void globListWithLimit_paginatesExpandedPrefixesInUtf8BinaryOrder() throws 
IOException {
+        String emoji = new String(Character.toChars(0x1F600));
+        String privateUse = Character.toString(0xE000);
+        String privateUseKey = "data/" + privateUse + "/file.csv";
+        String emojiKey = "data/" + emoji + "/file.csv";
+        String pattern = "s3://bucket/data/{" + emoji + "," + privateUse + 
"}/file.csv";
+
+        mockSingleObjectListWithStartAfter("s3://bucket/" + privateUseKey, 
privateUseKey, 10L);
+        mockSingleObjectListWithStartAfter("s3://bucket/" + emojiKey, 
emojiKey, 20L);
+
+        GlobListing firstPage = fs.globListWithLimit(Location.of(pattern), 
null, 0L, 1L);
+
+        Assertions.assertEquals(1, firstPage.getFiles().size());
+        Assertions.assertEquals("s3://bucket/" + privateUseKey, 
firstPage.getFiles().get(0).location().uri());
+        Assertions.assertEquals(emojiKey, firstPage.getMaxFile());
+
+        GlobListing secondPage = fs.globListWithLimit(Location.of(pattern), 
privateUseKey, 0L, 1L);
+
+        Assertions.assertEquals(1, secondPage.getFiles().size());
+        Assertions.assertEquals("s3://bucket/" + emojiKey, 
secondPage.getFiles().get(0).location().uri());
+    }
+
+    @Test
+    void 
globListWithLimit_directoryBucketFallsBackToSlashTerminatedStaticPrefix() 
throws IOException {
+        S3FileSystemProperties properties = S3FileSystemProperties.of(Map.of(
+                "s3.endpoint", 
"https://s3express-usw2-az1.us-west-2.amazonaws.com";,
+                "s3.region", "us-west-2"));
+        S3FileSystem directoryBucketFs = new S3FileSystem(properties, 
mockStorage);
+        Mockito.when(mockStorage.listObjects(
+                        ArgumentMatchers.eq("s3://bucket/data/"), 
ArgumentMatchers.isNull()))
+                .thenReturn(new RemoteObjects(
+                        List.of(
+                                new RemoteObject("data/a.csv", "a.csv", null, 
10L, 0L),
+                                new RemoteObject("data/b.csv", "b.csv", null, 
20L, 0L)),
+                        false, null));
+
+        GlobListing listing = directoryBucketFs.globListWithLimit(
+                Location.of("s3://bucket/data/[ab]*.csv"), null, 0L, 0L);
+
+        Assertions.assertEquals(2, listing.getFiles().size());
+        Assertions.assertEquals("data/", listing.getPrefix());
+        Mockito.verify(mockStorage).listObjects(
+                ArgumentMatchers.eq("s3://bucket/data/"), 
ArgumentMatchers.isNull());
+        Mockito.verify(mockStorage, Mockito.never()).listObjects(
+                ArgumentMatchers.eq("s3://bucket/data/a"), 
ArgumentMatchers.any());
+        Mockito.verify(mockStorage, Mockito.never()).listObjects(
+                ArgumentMatchers.eq("s3://bucket/data/b"), 
ArgumentMatchers.any());
+    }
+
+    @Test
+    void globListWithLimit_listsExpandedDatePrefixesInsteadOfBroadDatePrefix() 
throws IOException {
+        Mockito.when(mockStorage.listObjects(ArgumentMatchers.anyString(), 
ArgumentMatchers.isNull()))
+                .thenReturn(new RemoteObjects(List.of(), false, null));
+        Mockito.when(mockStorage.listObjects(
+                        
ArgumentMatchers.eq("s3://bucket/asin_trend/sale/month/"
+                                + "date=2025-03-01/mp_id=8/0/0/436/"),
+                        ArgumentMatchers.isNull()))
+                .thenReturn(new RemoteObjects(
+                        List.of(new RemoteObject(
+                                
"asin_trend/sale/month/date=2025-03-01/mp_id=8/0/0/436/a.parquet",
+                                "a.parquet", null, 11L, 0L)),
+                        false, null));
+        Mockito.when(mockStorage.listObjects(
+                        
ArgumentMatchers.eq("s3://bucket/asin_trend/sale/month/"
+                                + "date=2025-10-01/mp_id=8/0/0/436/"),
+                        ArgumentMatchers.isNull()))
+                .thenReturn(new RemoteObjects(
+                        List.of(new RemoteObject(
+                                
"asin_trend/sale/month/date=2025-10-01/mp_id=8/0/0/436/b.parquet",
+                                "b.parquet", null, 12L, 0L)),
+                        false, null));
+
+        GlobListing listing = 
fs.globListWithLimit(Location.of("s3://bucket/asin_trend/sale/month/"
+                + "date=2025-{0[3-9],1[0-2]}-01/mp_id=8/0/0/436/*"), null, 0L, 
0L);
+
+        Assertions.assertEquals(2, listing.getFiles().size());
+        Assertions.assertEquals("asin_trend/sale/month/date=2025-", 
listing.getPrefix());
+
+        ArgumentCaptor<String> pathCaptor = 
ArgumentCaptor.forClass(String.class);
+        Mockito.verify(mockStorage, Mockito.atLeastOnce()).listObjects(
+                pathCaptor.capture(), ArgumentMatchers.isNull());
+        java.util.Set<String> listedPaths = new 
java.util.HashSet<>(pathCaptor.getAllValues());
+        
Assertions.assertFalse(listedPaths.contains("s3://bucket/asin_trend/sale/month/date=2025-"));
+        
Assertions.assertTrue(listedPaths.contains("s3://bucket/asin_trend/sale/month/"
+                + "date=2025-03-01/mp_id=8/0/0/436/"));
+        
Assertions.assertTrue(listedPaths.contains("s3://bucket/asin_trend/sale/month/"
+                + "date=2025-12-01/mp_id=8/0/0/436/"));
+    }
+
+    @Test
+    void globListWithLimit_findsNextMatchAcrossExpandedPrefixesAfterLimit() 
throws IOException {
+        Mockito.when(mockStorage.listObjects(
+                        ArgumentMatchers.eq("s3://bucket/date=2025-01/file"),
+                        ArgumentMatchers.isNull()))
+                .thenReturn(new RemoteObjects(
+                        List.of(
+                                new RemoteObject("date=2025-01/file-a.csv",
+                                        "file-a.csv", null, 10L, 0L),
+                                new RemoteObject("date=2025-01/file-b.csv",
+                                        "file-b.csv", null, 20L, 0L)),
+                        false, null));
+        Mockito.when(mockStorage.listObjects(
+                        ArgumentMatchers.eq("s3://bucket/date=2025-02/file"),
+                        ArgumentMatchers.isNull()))
+                .thenReturn(new RemoteObjects(
+                        List.of(new RemoteObject("date=2025-02/file-c.csv",
+                                "file-c.csv", null, 30L, 0L)),
+                        false, null));
+
+        GlobListing listing = fs.globListWithLimit(
+                Location.of("s3://bucket/date=2025-0[12]/file*.csv"), null, 
0L, 2L);
+
+        Assertions.assertEquals(2, listing.getFiles().size());
+        Assertions.assertEquals("date=2025-02/file-c.csv", 
listing.getMaxFile());
+        Mockito.verify(mockStorage).listObjects(
+                ArgumentMatchers.eq("s3://bucket/date=2025-02/file"), 
ArgumentMatchers.isNull());
+    }
+
     // ------------------------------------------------------------------
     // newOutputFile().create() / createOrOverwrite()
     // ------------------------------------------------------------------
diff --git 
a/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
 
b/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
index 76486870b8e..5746fed5dd8 100644
--- 
a/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
+++ 
b/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
@@ -28,10 +28,12 @@ import org.apache.doris.filesystem.Location;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.FileAlreadyExistsException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -53,6 +55,8 @@ public abstract class S3CompatibleFileSystem extends 
ObjFileSystem {
 
     // Object stores do not have real directories; use a zero-byte marker with 
trailing slash.
     private static final String DIR_MARKER_SUFFIX = "/";
+    private static final int MAX_EXPANDED_GLOB_LIST_PREFIXES = 256;
+    private static final Comparator<String> UTF8_BINARY_ORDER = 
S3CompatibleFileSystem::compareUtf8Binary;
 
     private final boolean usePathStyle;
 
@@ -847,6 +851,282 @@ public abstract class S3CompatibleFileSystem extends 
ObjFileSystem {
         return globPattern.substring(0, earliest);
     }
 
+    /**
+     * Returns object-store list prefixes that are safe to push down for a 
glob pattern.
+     *
+     * <p>Unlike {@link #longestNonGlobPrefix(String)}, this expands bounded 
glob constructs
+     * ({@code {...}} alternation and positive {@code [...]} character 
classes) before the first
+     * unbounded wildcard. That lets patterns such as
+     * {@code date=2025-{0[3-9],1[0-2]}-01/mp_id=8/*} list the concrete 
date/mp prefixes instead
+     * of scanning everything under {@code date=2025-}. If expansion would be 
too large or a glob
+     * construct is not safely enumerable, it falls back to the conservative 
longest static prefix.
+     */
+    protected static List<String> expandedGlobListPrefixes(String globPattern) 
{
+        List<String> prefixes = expandGlobListPrefixes(globPattern, true);
+        return prefixes == null ? List.of(longestNonGlobPrefix(globPattern)) : 
prefixes;
+    }
+
+    protected String globListPrefix(String globPattern) {
+        return longestNonGlobPrefix(globPattern);
+    }
+
+    protected List<String> globListPrefixes(String globPattern, String 
listPrefix) {
+        return expandedGlobListPrefixes(globPattern);
+    }
+
+    private static List<String> expandGlobListPrefixes(String globPattern, 
boolean allowPartialPrefix) {
+        List<String> prefixes = new ArrayList<>();
+        prefixes.add("");
+        int i = 0;
+        while (i < globPattern.length()) {
+            char c = globPattern.charAt(i);
+            if (c == '*' || c == '?') {
+                return allowPartialPrefix ? compactPrefixes(prefixes) : null;
+            }
+            if (c == '\\') {
+                if (i + 1 < globPattern.length()) {
+                    appendLiteral(prefixes, globPattern.charAt(i + 1));
+                    i += 2;
+                } else {
+                    appendLiteral(prefixes, c);
+                    i++;
+                }
+                continue;
+            }
+            if (c == '[') {
+                PrefixExpansion charClass = expandCharacterClass(globPattern, 
i);
+                if (charClass == null) {
+                    return allowPartialPrefix ? compactPrefixes(prefixes) : 
null;
+                }
+                prefixes = appendAlternatives(prefixes, charClass.values);
+                if (prefixes == null) {
+                    return null;
+                }
+                i = charClass.nextIndex;
+                continue;
+            }
+            if (c == '{') {
+                PrefixExpansion brace = expandBraceGroup(globPattern, i);
+                if (brace == null) {
+                    return allowPartialPrefix ? compactPrefixes(prefixes) : 
null;
+                }
+                prefixes = appendAlternatives(prefixes, brace.values);
+                if (prefixes == null) {
+                    return null;
+                }
+                i = brace.nextIndex;
+                continue;
+            }
+            appendLiteral(prefixes, c);
+            i++;
+        }
+        return compactPrefixes(prefixes);
+    }
+
+    private static void appendLiteral(List<String> prefixes, char c) {
+        for (int i = 0; i < prefixes.size(); i++) {
+            prefixes.set(i, prefixes.get(i) + c);
+        }
+    }
+
+    private static List<String> appendAlternatives(List<String> prefixes, 
List<String> alternatives) {
+        long expandedSize = (long) prefixes.size() * alternatives.size();
+        if (expandedSize > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+            return null;
+        }
+        List<String> expanded = new ArrayList<>((int) expandedSize);
+        for (String prefix : prefixes) {
+            for (String alternative : alternatives) {
+                expanded.add(prefix + alternative);
+            }
+        }
+        return expanded;
+    }
+
+    private static PrefixExpansion expandCharacterClass(String globPattern, 
int openIndex) {
+        int closeIndex = findClosingBracket(globPattern, openIndex);
+        if (closeIndex < 0 || closeIndex == openIndex + 1) {
+            return null;
+        }
+        int i = openIndex + 1;
+        char first = globPattern.charAt(i);
+        if (first == '!' || first == '^') {
+            return null;
+        }
+        if (containsSurrogate(globPattern, i, closeIndex)) {
+            return null;
+        }
+        List<String> values = new ArrayList<>();
+        while (i < closeIndex) {
+            char current = globPattern.charAt(i);
+            if (current == '\\') {
+                if (i + 1 >= closeIndex) {
+                    return null;
+                }
+                values.add(String.valueOf(globPattern.charAt(i + 1)));
+                i += 2;
+                continue;
+            }
+            if (i + 2 < closeIndex && globPattern.charAt(i + 1) == '-') {
+                char rangeEnd = globPattern.charAt(i + 2);
+                int step = current <= rangeEnd ? 1 : -1;
+                for (char ch = current; step > 0 ? ch <= rangeEnd : ch >= 
rangeEnd; ch += step) {
+                    values.add(String.valueOf(ch));
+                    if (values.size() > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+                        return null;
+                    }
+                }
+                i += 3;
+                continue;
+            }
+            values.add(String.valueOf(current));
+            i++;
+        }
+        return new PrefixExpansion(values, closeIndex + 1);
+    }
+
+    private static int findClosingBracket(String globPattern, int openIndex) {
+        for (int i = openIndex + 1; i < globPattern.length(); i++) {
+            char c = globPattern.charAt(i);
+            if (c == '\\') {
+                i++;
+                continue;
+            }
+            if (c == ']') {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    private static boolean containsSurrogate(String text, int start, int end) {
+        for (int i = start; i < end; i++) {
+            if (Character.isSurrogate(text.charAt(i))) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private static PrefixExpansion expandBraceGroup(String globPattern, int 
openIndex) {
+        int closeIndex = findClosingBrace(globPattern, openIndex);
+        if (closeIndex < 0) {
+            return null;
+        }
+        List<String> alternatives = splitBraceAlternatives(
+                globPattern.substring(openIndex + 1, closeIndex));
+        if (alternatives.isEmpty()) {
+            return null;
+        }
+        List<String> values = new ArrayList<>();
+        for (String alternative : alternatives) {
+            List<String> expandedAlternative = 
expandGlobListPrefixes(alternative, false);
+            if (expandedAlternative == null) {
+                return null;
+            }
+            values.addAll(expandedAlternative);
+            if (values.size() > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+                return null;
+            }
+        }
+        return new PrefixExpansion(values, closeIndex + 1);
+    }
+
+    private static int findClosingBrace(String globPattern, int openIndex) {
+        int depth = 0;
+        for (int i = openIndex; i < globPattern.length(); i++) {
+            char c = globPattern.charAt(i);
+            if (c == '\\') {
+                i++;
+                continue;
+            }
+            if (c == '{') {
+                depth++;
+            } else if (c == '}') {
+                depth--;
+                if (depth == 0) {
+                    return i;
+                }
+            }
+        }
+        return -1;
+    }
+
+    private static List<String> splitBraceAlternatives(String content) {
+        List<String> alternatives = new ArrayList<>();
+        int depth = 0;
+        int start = 0;
+        for (int i = 0; i < content.length(); i++) {
+            char c = content.charAt(i);
+            if (c == '\\') {
+                i++;
+                continue;
+            }
+            if (c == '{') {
+                depth++;
+            } else if (c == '}') {
+                if (depth == 0) {
+                    return Collections.emptyList();
+                }
+                depth--;
+            } else if (c == ',' && depth == 0) {
+                alternatives.add(content.substring(start, i));
+                start = i + 1;
+            }
+        }
+        if (depth != 0) {
+            return Collections.emptyList();
+        }
+        alternatives.add(content.substring(start));
+        return alternatives;
+    }
+
+    private static List<String> compactPrefixes(List<String> prefixes) {
+        List<String> sorted = new ArrayList<>(prefixes);
+        sorted.sort(UTF8_BINARY_ORDER);
+        List<String> compact = new ArrayList<>();
+        for (String prefix : sorted) {
+            if (prefix.isEmpty()) {
+                return List.of("");
+            }
+            boolean covered = false;
+            for (String existing : compact) {
+                if (prefix.startsWith(existing)) {
+                    covered = true;
+                    break;
+                }
+            }
+            if (!covered) {
+                compact.add(prefix);
+            }
+        }
+        return compact;
+    }
+
+    private static int compareUtf8Binary(String left, String right) {
+        byte[] leftBytes = left.getBytes(StandardCharsets.UTF_8);
+        byte[] rightBytes = right.getBytes(StandardCharsets.UTF_8);
+        int commonLength = Math.min(leftBytes.length, rightBytes.length);
+        for (int i = 0; i < commonLength; i++) {
+            int result = Integer.compare(
+                    Byte.toUnsignedInt(leftBytes[i]), 
Byte.toUnsignedInt(rightBytes[i]));
+            if (result != 0) {
+                return result;
+            }
+        }
+        return Integer.compare(leftBytes.length, rightBytes.length);
+    }
+
+    private static class PrefixExpansion {
+        private final List<String> values;
+        private final int nextIndex;
+
+        private PrefixExpansion(List<String> values, int nextIndex) {
+            this.values = values;
+            this.nextIndex = nextIndex;
+        }
+    }
+
     /**
      * Expands {@code {N..M}} numeric range syntax in a glob pattern to the 
equivalent
      * comma-separated alternation {@code {N,N+1,...,M}} that Java's 
PathMatcher understands.
@@ -923,7 +1203,8 @@ public abstract class S3CompatibleFileSystem extends 
ObjFileSystem {
         // separator is '\' which would corrupt object storage keys, and (b) 
Paths.get rejects keys
         // containing characters illegal in the host OS path syntax (':', '\', 
etc.).
         Pattern matcher = Pattern.compile(globToRegex(expandedKeyPattern));
-        String listPrefix = longestNonGlobPrefix(expandedKeyPattern);
+        String listPrefix = globListPrefix(expandedKeyPattern);
+        List<String> listPrefixes = globListPrefixes(expandedKeyPattern, 
listPrefix);
 
         List<FileEntry> files = new ArrayList<>();
         long totalSize = 0L;
@@ -933,51 +1214,56 @@ public abstract class S3CompatibleFileSystem extends 
ObjFileSystem {
         String nextMatchAfterLimit = "";
         String lastMatchedKey = "";
         boolean isTruncated;
-        String continuationToken = null;
-        String listUri = base + listPrefix;
 
         try {
-            do {
-                RemoteObjects response = 
objStorage.listObjectsWithOptions(listUri, ObjectListOptions.builder()
-                        .continuationToken(continuationToken)
-                        .startAfter(continuationToken == null ? startAfter : 
null)
-                        .build());
-                for (RemoteObject obj : response.getObjectList()) {
-                    if (reachLimit) {
-                        // After hitting limit: find the first matching key so 
callers know more data exists.
-                        if (nextMatchAfterLimit.isEmpty()
-                                && matcher.matcher(obj.getKey()).matches()) {
-                            nextMatchAfterLimit = obj.getKey();
+            for (String currentListPrefix : listPrefixes) {
+                String continuationToken = null;
+                String listUri = base + currentListPrefix;
+                do {
+                    RemoteObjects response = 
objStorage.listObjectsWithOptions(listUri, ObjectListOptions.builder()
+                            .continuationToken(continuationToken)
+                            .startAfter(continuationToken == null ? startAfter 
: null)
+                            .build());
+                    for (RemoteObject obj : response.getObjectList()) {
+                        if (reachLimit) {
+                            // After hitting limit: find the first matching 
key so callers know more data exists.
+                            if (nextMatchAfterLimit.isEmpty()
+                                    && 
matcher.matcher(obj.getKey()).matches()) {
+                                nextMatchAfterLimit = obj.getKey();
+                            }
+                            continue;
                         }
-                        continue;
-                    }
 
-                    if (!matcher.matcher(obj.getKey()).matches()) {
-                        continue;
-                    }
+                        if (!matcher.matcher(obj.getKey()).matches()) {
+                            continue;
+                        }
 
-                    files.add(new FileEntry(
-                            Location.of(base + obj.getKey()),
-                            obj.getSize(),
-                            false,
-                            obj.getModificationTime(),
-                            null));
-                    totalSize += obj.getSize();
-                    lastMatchedKey = obj.getKey();
-
-                    if ((maxFiles > 0 && files.size() >= maxFiles)
-                            || (maxBytes > 0 && totalSize >= maxBytes)) {
-                        reachLimit = true;
+                        files.add(new FileEntry(
+                                Location.of(base + obj.getKey()),
+                                obj.getSize(),
+                                false,
+                                obj.getModificationTime(),
+                                null));
+                        totalSize += obj.getSize();
+                        lastMatchedKey = obj.getKey();
+
+                        if ((maxFiles > 0 && files.size() >= maxFiles)
+                                || (maxBytes > 0 && totalSize >= maxBytes)) {
+                            reachLimit = true;
+                        }
                     }
-                }
 
-                isTruncated = response.isTruncated();
-                if (isTruncated) {
-                    continuationToken = response.getContinuationToken();
+                    isTruncated = response.isTruncated();
+                    if (isTruncated) {
+                        continuationToken = response.getContinuationToken();
+                    }
+                    // Continue paginating after limit until we find the next 
matching key,
+                    // so callers can use it as a pagination cursor.
+                } while (isTruncated && (!reachLimit || 
nextMatchAfterLimit.isEmpty()));
+                if (reachLimit && !nextMatchAfterLimit.isEmpty()) {
+                    break;
                 }
-                // Continue paginating after limit until we find the next 
matching key,
-                // so callers can use it as a pagination cursor.
-            } while (isTruncated && (!reachLimit || 
nextMatchAfterLimit.isEmpty()));
+            }
         } catch (Exception e) {
             throw new IOException("Failed to list object storage objects at " 
+ uri + ": " + e.getMessage(), e);
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to