This is an automated email from the ASF dual-hosted git repository.
sollhui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new bacfb0fe377 [perf](s3) push down expandable S3 glob prefixes (#64684)
bacfb0fe377 is described below
commit bacfb0fe377b6094dbd13098db51d5c32b7527a8
Author: hui lai <[email protected]>
AuthorDate: Wed Jun 24 14:34:22 2026 +0800
[perf](s3) push down expandable S3 glob prefixes (#64684)
### What problem does this PR solve?
Before this change, S3-compatible glob listing derived the object-store
`ListObjects` prefix by stopping at the first glob metacharacter. For a
path like:
`s3://bucket/asin_trend/sale/month/date=2025-{0[3-9],1[0-2]}-01/mp_id=8/0/0/436/*`
the old behavior listed the broad prefix:
`asin_trend/sale/month/date=2025-`
and then filtered all returned object keys in FE. If many unrelated
objects existed under `date=2025-*`, for example other dates, `mp_id`s,
or deeper paths, S3 TVF planning could spend a long time listing and
filtering files before query execution started.
After this change, Doris expands safely enumerable glob fragments before
issuing object-store list requests. The same path is now listed through
narrower prefixes such as:
`asin_trend/sale/month/date=2025-03-01/mp_id=8/0/0/436/`
...
`asin_trend/sale/month/date=2025-12-01/mp_id=8/0/0/436/`
Doris still applies the full glob regex after listing, so result
correctness is unchanged. The optimization only reduces the remote
listing scope. Expansion is limited to bounded brace alternations and
positive character classes, with a hard cap to avoid generating too many
prefixes. Existing pagination behavior through `startAfter` and
`maxFile` is preserved.
---
.../apache/doris/filesystem/s3/S3FileSystem.java | 34 ++
.../filesystem/s3/S3FileSystemProperties.java | 5 +
.../doris/filesystem/s3/S3FileSystemTest.java | 216 ++++++++++++
.../filesystem/spi/S3CompatibleFileSystem.java | 362 ++++++++++++++++++---
4 files changed, 579 insertions(+), 38 deletions(-)
diff --git
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
index 6157d81f712..0f6eeb35608 100644
---
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
+++
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystem.java
@@ -19,6 +19,7 @@ package org.apache.doris.filesystem.s3;
import org.apache.doris.filesystem.spi.S3CompatibleFileSystem;
+import java.util.List;
import java.util.Optional;
/**
@@ -46,6 +47,35 @@ public class S3FileSystem extends S3CompatibleFileSystem {
return Optional.ofNullable(properties);
}
+ @Override
+ protected String globListPrefix(String globPattern) {
+ if (isDirectoryBucketEndpoint()) {
+ return slashTerminatedNonGlobPrefix(globPattern);
+ }
+ return super.globListPrefix(globPattern);
+ }
+
+ @Override
+ protected List<String> globListPrefixes(String globPattern, String
listPrefix) {
+ if (isDirectoryBucketEndpoint()) {
+ return List.of(listPrefix);
+ }
+ return super.globListPrefixes(globPattern, listPrefix);
+ }
+
+ private boolean isDirectoryBucketEndpoint() {
+ return properties != null && properties.isDirectoryBucketEndpoint();
+ }
+
+ private static String slashTerminatedNonGlobPrefix(String globPattern) {
+ String prefix = longestNonGlobPrefix(globPattern);
+ if (prefix.isEmpty() || prefix.endsWith("/")) {
+ return prefix;
+ }
+ int slash = prefix.lastIndexOf('/');
+ return slash < 0 ? "" : prefix.substring(0, slash + 1);
+ }
+
protected static boolean isSingleLevelGlob(String pathStr) {
return S3CompatibleFileSystem.isSingleLevelGlob(pathStr);
}
@@ -54,6 +84,10 @@ public class S3FileSystem extends S3CompatibleFileSystem {
return S3CompatibleFileSystem.longestNonGlobPrefix(globPattern);
}
+ protected static List<String> expandedGlobListPrefixes(String globPattern)
{
+ return S3CompatibleFileSystem.expandedGlobListPrefixes(globPattern);
+ }
+
protected static String globToRegex(String glob) {
return S3CompatibleFileSystem.globToRegex(glob);
}
diff --git
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
index ccbfdf66990..34e8fb54b93 100644
---
a/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
+++
b/fe/fe-filesystem/fe-filesystem-s3/src/main/java/org/apache/doris/filesystem/s3/S3FileSystemProperties.java
@@ -327,6 +327,11 @@ public final class S3FileSystemProperties
return StringUtils.isNotBlank(roleArn);
}
+ public boolean isDirectoryBucketEndpoint() {
+ return StringUtils.containsIgnoreCase(endpoint, "s3express-control.")
+ || StringUtils.containsIgnoreCase(endpoint, "s3express-");
+ }
+
private static void putIfNotBlank(Map<String, String> map, String key,
String value) {
if (StringUtils.isNotBlank(value)) {
map.put(key, value);
diff --git
a/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
b/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
index 5709bd82e0a..fe9b28dd8c9 100644
---
a/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
+++
b/fe/fe-filesystem/fe-filesystem-s3/src/test/java/org/apache/doris/filesystem/s3/S3FileSystemTest.java
@@ -18,6 +18,7 @@
package org.apache.doris.filesystem.s3;
import org.apache.doris.filesystem.DorisOutputFile;
+import org.apache.doris.filesystem.GlobListing;
import org.apache.doris.filesystem.Location;
import org.apache.doris.filesystem.spi.ObjectListOptions;
import org.apache.doris.filesystem.spi.RemoteObject;
@@ -27,13 +28,16 @@ import org.apache.doris.filesystem.spi.RequestBody;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import org.mockito.ArgumentCaptor;
import org.mockito.ArgumentMatchers;
import org.mockito.Mockito;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.nio.file.FileAlreadyExistsException;
import java.util.List;
+import java.util.Map;
/**
* Unit tests for {@link S3FileSystem} using a mock {@link S3ObjStorage}.
@@ -69,6 +73,34 @@ class S3FileSystemTest {
ArgumentMatchers.anyString(),
ArgumentMatchers.<ObjectListOptions>any());
}
+ private void mockSingleObjectListWithStartAfter(String listUri, String
key, long size) throws IOException {
+ Mockito.doAnswer(invocation -> {
+ ObjectListOptions options = invocation.getArgument(1);
+ String startAfter = options.startAfter();
+ if (startAfter != null && !startAfter.isEmpty() &&
compareUtf8Binary(key, startAfter) <= 0) {
+ return new RemoteObjects(List.of(), false, null);
+ }
+ return new RemoteObjects(
+ List.of(new RemoteObject(key,
key.substring(key.lastIndexOf('/') + 1), null, size, 0L)),
+ false, null);
+ }).when(mockStorage).listObjectsWithOptions(
+ ArgumentMatchers.eq(listUri),
ArgumentMatchers.<ObjectListOptions>any());
+ }
+
+ private static int compareUtf8Binary(String left, String right) {
+ byte[] leftBytes = left.getBytes(StandardCharsets.UTF_8);
+ byte[] rightBytes = right.getBytes(StandardCharsets.UTF_8);
+ int commonLength = Math.min(leftBytes.length, rightBytes.length);
+ for (int i = 0; i < commonLength; i++) {
+ int result = Integer.compare(
+ Byte.toUnsignedInt(leftBytes[i]),
Byte.toUnsignedInt(rightBytes[i]));
+ if (result != 0) {
+ return result;
+ }
+ }
+ return Integer.compare(leftBytes.length, rightBytes.length);
+ }
+
// ------------------------------------------------------------------
// exists() (inherited from ObjFileSystem)
// ------------------------------------------------------------------
@@ -555,6 +587,190 @@ class S3FileSystemTest {
Assertions.assertEquals("",
S3FileSystem.longestNonGlobPrefix("*.csv"));
}
+ @Test
+ void expandedGlobListPrefixes_expandsJiraDatePattern() {
+ List<String> prefixes = S3FileSystem.expandedGlobListPrefixes(
+
"asin_trend/sale/month/date=2025-{0[3-9],1[0-2]}-01/mp_id=8/0/0/436/*");
+
+ Assertions.assertEquals(10, prefixes.size());
+ Assertions.assertEquals(
+ "asin_trend/sale/month/date=2025-03-01/mp_id=8/0/0/436/",
+ prefixes.get(0));
+ Assertions.assertEquals(
+ "asin_trend/sale/month/date=2025-12-01/mp_id=8/0/0/436/",
+ prefixes.get(9));
+ }
+
+ @Test
+ void expandedGlobListPrefixes_fallsBackWhenBraceArmContainsWildcard() {
+ Assertions.assertEquals(List.of("data/"),
+
S3FileSystem.expandedGlobListPrefixes("data/{foo*,bar*}/part.parquet"));
+ }
+
+ @Test
+ void
expandedGlobListPrefixes_fallsBackWhenCharacterClassContainsSupplementaryCodePoint()
{
+ String emoji = new String(Character.toChars(0x1F600));
+ String pattern = "data/[" + emoji + "]/file.csv";
+
+
Assertions.assertTrue(java.util.regex.Pattern.compile(S3FileSystem.globToRegex(pattern))
+ .matcher("data/" + emoji + "/file.csv")
+ .matches());
+ Assertions.assertEquals(List.of("data/"),
S3FileSystem.expandedGlobListPrefixes(pattern));
+ }
+
+ @Test
+ void expandedGlobListPrefixes_sortsPrefixesByUtf8BinaryOrder() {
+ String emoji = new String(Character.toChars(0x1F600));
+ String privateUse = Character.toString(0xE000);
+
+ Assertions.assertEquals(
+ List.of("data/" + privateUse + "/file.csv", "data/" + emoji +
"/file.csv"),
+ S3FileSystem.expandedGlobListPrefixes("data/{" + emoji + "," +
privateUse + "}/file.csv"));
+ }
+
+ @Test
+ void globListWithLimit_doesNotAppendSuffixToPartialBraceArmPrefix() throws
IOException {
+ Mockito.when(mockStorage.listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/"),
ArgumentMatchers.isNull()))
+ .thenReturn(new RemoteObjects(
+ List.of(
+ new RemoteObject("data/foobar/part.parquet",
+ "foobar/part.parquet", null, 10L, 0L),
+ new RemoteObject("data/barbaz/part.parquet",
+ "barbaz/part.parquet", null, 20L, 0L)),
+ false, null));
+
+ GlobListing listing = fs.globListWithLimit(
+ Location.of("s3://bucket/data/{foo*,bar*}/part.parquet"),
null, 0L, 0L);
+
+ Assertions.assertEquals(2, listing.getFiles().size());
+ Mockito.verify(mockStorage).listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/"),
ArgumentMatchers.isNull());
+ Mockito.verify(mockStorage, Mockito.never()).listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/foo/part.parquet"),
ArgumentMatchers.any());
+ Mockito.verify(mockStorage, Mockito.never()).listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/bar/part.parquet"),
ArgumentMatchers.any());
+ }
+
+ @Test
+ void globListWithLimit_paginatesExpandedPrefixesInUtf8BinaryOrder() throws
IOException {
+ String emoji = new String(Character.toChars(0x1F600));
+ String privateUse = Character.toString(0xE000);
+ String privateUseKey = "data/" + privateUse + "/file.csv";
+ String emojiKey = "data/" + emoji + "/file.csv";
+ String pattern = "s3://bucket/data/{" + emoji + "," + privateUse +
"}/file.csv";
+
+ mockSingleObjectListWithStartAfter("s3://bucket/" + privateUseKey,
privateUseKey, 10L);
+ mockSingleObjectListWithStartAfter("s3://bucket/" + emojiKey,
emojiKey, 20L);
+
+ GlobListing firstPage = fs.globListWithLimit(Location.of(pattern),
null, 0L, 1L);
+
+ Assertions.assertEquals(1, firstPage.getFiles().size());
+ Assertions.assertEquals("s3://bucket/" + privateUseKey,
firstPage.getFiles().get(0).location().uri());
+ Assertions.assertEquals(emojiKey, firstPage.getMaxFile());
+
+ GlobListing secondPage = fs.globListWithLimit(Location.of(pattern),
privateUseKey, 0L, 1L);
+
+ Assertions.assertEquals(1, secondPage.getFiles().size());
+ Assertions.assertEquals("s3://bucket/" + emojiKey,
secondPage.getFiles().get(0).location().uri());
+ }
+
+ @Test
+ void
globListWithLimit_directoryBucketFallsBackToSlashTerminatedStaticPrefix()
throws IOException {
+ S3FileSystemProperties properties = S3FileSystemProperties.of(Map.of(
+ "s3.endpoint",
"https://s3express-usw2-az1.us-west-2.amazonaws.com",
+ "s3.region", "us-west-2"));
+ S3FileSystem directoryBucketFs = new S3FileSystem(properties,
mockStorage);
+ Mockito.when(mockStorage.listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/"),
ArgumentMatchers.isNull()))
+ .thenReturn(new RemoteObjects(
+ List.of(
+ new RemoteObject("data/a.csv", "a.csv", null,
10L, 0L),
+ new RemoteObject("data/b.csv", "b.csv", null,
20L, 0L)),
+ false, null));
+
+ GlobListing listing = directoryBucketFs.globListWithLimit(
+ Location.of("s3://bucket/data/[ab]*.csv"), null, 0L, 0L);
+
+ Assertions.assertEquals(2, listing.getFiles().size());
+ Assertions.assertEquals("data/", listing.getPrefix());
+ Mockito.verify(mockStorage).listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/"),
ArgumentMatchers.isNull());
+ Mockito.verify(mockStorage, Mockito.never()).listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/a"),
ArgumentMatchers.any());
+ Mockito.verify(mockStorage, Mockito.never()).listObjects(
+ ArgumentMatchers.eq("s3://bucket/data/b"),
ArgumentMatchers.any());
+ }
+
+ @Test
+ void globListWithLimit_listsExpandedDatePrefixesInsteadOfBroadDatePrefix()
throws IOException {
+ Mockito.when(mockStorage.listObjects(ArgumentMatchers.anyString(),
ArgumentMatchers.isNull()))
+ .thenReturn(new RemoteObjects(List.of(), false, null));
+ Mockito.when(mockStorage.listObjects(
+
ArgumentMatchers.eq("s3://bucket/asin_trend/sale/month/"
+ + "date=2025-03-01/mp_id=8/0/0/436/"),
+ ArgumentMatchers.isNull()))
+ .thenReturn(new RemoteObjects(
+ List.of(new RemoteObject(
+
"asin_trend/sale/month/date=2025-03-01/mp_id=8/0/0/436/a.parquet",
+ "a.parquet", null, 11L, 0L)),
+ false, null));
+ Mockito.when(mockStorage.listObjects(
+
ArgumentMatchers.eq("s3://bucket/asin_trend/sale/month/"
+ + "date=2025-10-01/mp_id=8/0/0/436/"),
+ ArgumentMatchers.isNull()))
+ .thenReturn(new RemoteObjects(
+ List.of(new RemoteObject(
+
"asin_trend/sale/month/date=2025-10-01/mp_id=8/0/0/436/b.parquet",
+ "b.parquet", null, 12L, 0L)),
+ false, null));
+
+ GlobListing listing =
fs.globListWithLimit(Location.of("s3://bucket/asin_trend/sale/month/"
+ + "date=2025-{0[3-9],1[0-2]}-01/mp_id=8/0/0/436/*"), null, 0L,
0L);
+
+ Assertions.assertEquals(2, listing.getFiles().size());
+ Assertions.assertEquals("asin_trend/sale/month/date=2025-",
listing.getPrefix());
+
+ ArgumentCaptor<String> pathCaptor =
ArgumentCaptor.forClass(String.class);
+ Mockito.verify(mockStorage, Mockito.atLeastOnce()).listObjects(
+ pathCaptor.capture(), ArgumentMatchers.isNull());
+ java.util.Set<String> listedPaths = new
java.util.HashSet<>(pathCaptor.getAllValues());
+
Assertions.assertFalse(listedPaths.contains("s3://bucket/asin_trend/sale/month/date=2025-"));
+
Assertions.assertTrue(listedPaths.contains("s3://bucket/asin_trend/sale/month/"
+ + "date=2025-03-01/mp_id=8/0/0/436/"));
+
Assertions.assertTrue(listedPaths.contains("s3://bucket/asin_trend/sale/month/"
+ + "date=2025-12-01/mp_id=8/0/0/436/"));
+ }
+
+ @Test
+ void globListWithLimit_findsNextMatchAcrossExpandedPrefixesAfterLimit()
throws IOException {
+ Mockito.when(mockStorage.listObjects(
+ ArgumentMatchers.eq("s3://bucket/date=2025-01/file"),
+ ArgumentMatchers.isNull()))
+ .thenReturn(new RemoteObjects(
+ List.of(
+ new RemoteObject("date=2025-01/file-a.csv",
+ "file-a.csv", null, 10L, 0L),
+ new RemoteObject("date=2025-01/file-b.csv",
+ "file-b.csv", null, 20L, 0L)),
+ false, null));
+ Mockito.when(mockStorage.listObjects(
+ ArgumentMatchers.eq("s3://bucket/date=2025-02/file"),
+ ArgumentMatchers.isNull()))
+ .thenReturn(new RemoteObjects(
+ List.of(new RemoteObject("date=2025-02/file-c.csv",
+ "file-c.csv", null, 30L, 0L)),
+ false, null));
+
+ GlobListing listing = fs.globListWithLimit(
+ Location.of("s3://bucket/date=2025-0[12]/file*.csv"), null,
0L, 2L);
+
+ Assertions.assertEquals(2, listing.getFiles().size());
+ Assertions.assertEquals("date=2025-02/file-c.csv",
listing.getMaxFile());
+ Mockito.verify(mockStorage).listObjects(
+ ArgumentMatchers.eq("s3://bucket/date=2025-02/file"),
ArgumentMatchers.isNull());
+ }
+
// ------------------------------------------------------------------
// newOutputFile().create() / createOrOverwrite()
// ------------------------------------------------------------------
diff --git
a/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
b/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
index 76486870b8e..5746fed5dd8 100644
---
a/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
+++
b/fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java
@@ -28,10 +28,12 @@ import org.apache.doris.filesystem.Location;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.nio.file.FileAlreadyExistsException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
+import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -53,6 +55,8 @@ public abstract class S3CompatibleFileSystem extends
ObjFileSystem {
// Object stores do not have real directories; use a zero-byte marker with
trailing slash.
private static final String DIR_MARKER_SUFFIX = "/";
+ private static final int MAX_EXPANDED_GLOB_LIST_PREFIXES = 256;
+ private static final Comparator<String> UTF8_BINARY_ORDER =
S3CompatibleFileSystem::compareUtf8Binary;
private final boolean usePathStyle;
@@ -847,6 +851,282 @@ public abstract class S3CompatibleFileSystem extends
ObjFileSystem {
return globPattern.substring(0, earliest);
}
+ /**
+ * Returns object-store list prefixes that are safe to push down for a
glob pattern.
+ *
+ * <p>Unlike {@link #longestNonGlobPrefix(String)}, this expands bounded
glob constructs
+ * ({@code {...}} alternation and positive {@code [...]} character
classes) before the first
+ * unbounded wildcard. That lets patterns such as
+ * {@code date=2025-{0[3-9],1[0-2]}-01/mp_id=8/*} list the concrete
date/mp prefixes instead
+ * of scanning everything under {@code date=2025-}. If expansion would be
too large or a glob
+ * construct is not safely enumerable, it falls back to the conservative
longest static prefix.
+ */
+ protected static List<String> expandedGlobListPrefixes(String globPattern)
{
+ List<String> prefixes = expandGlobListPrefixes(globPattern, true);
+ return prefixes == null ? List.of(longestNonGlobPrefix(globPattern)) :
prefixes;
+ }
+
+ protected String globListPrefix(String globPattern) {
+ return longestNonGlobPrefix(globPattern);
+ }
+
+ protected List<String> globListPrefixes(String globPattern, String
listPrefix) {
+ return expandedGlobListPrefixes(globPattern);
+ }
+
+ private static List<String> expandGlobListPrefixes(String globPattern,
boolean allowPartialPrefix) {
+ List<String> prefixes = new ArrayList<>();
+ prefixes.add("");
+ int i = 0;
+ while (i < globPattern.length()) {
+ char c = globPattern.charAt(i);
+ if (c == '*' || c == '?') {
+ return allowPartialPrefix ? compactPrefixes(prefixes) : null;
+ }
+ if (c == '\\') {
+ if (i + 1 < globPattern.length()) {
+ appendLiteral(prefixes, globPattern.charAt(i + 1));
+ i += 2;
+ } else {
+ appendLiteral(prefixes, c);
+ i++;
+ }
+ continue;
+ }
+ if (c == '[') {
+ PrefixExpansion charClass = expandCharacterClass(globPattern,
i);
+ if (charClass == null) {
+ return allowPartialPrefix ? compactPrefixes(prefixes) :
null;
+ }
+ prefixes = appendAlternatives(prefixes, charClass.values);
+ if (prefixes == null) {
+ return null;
+ }
+ i = charClass.nextIndex;
+ continue;
+ }
+ if (c == '{') {
+ PrefixExpansion brace = expandBraceGroup(globPattern, i);
+ if (brace == null) {
+ return allowPartialPrefix ? compactPrefixes(prefixes) :
null;
+ }
+ prefixes = appendAlternatives(prefixes, brace.values);
+ if (prefixes == null) {
+ return null;
+ }
+ i = brace.nextIndex;
+ continue;
+ }
+ appendLiteral(prefixes, c);
+ i++;
+ }
+ return compactPrefixes(prefixes);
+ }
+
+ private static void appendLiteral(List<String> prefixes, char c) {
+ for (int i = 0; i < prefixes.size(); i++) {
+ prefixes.set(i, prefixes.get(i) + c);
+ }
+ }
+
+ private static List<String> appendAlternatives(List<String> prefixes,
List<String> alternatives) {
+ long expandedSize = (long) prefixes.size() * alternatives.size();
+ if (expandedSize > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+ return null;
+ }
+ List<String> expanded = new ArrayList<>((int) expandedSize);
+ for (String prefix : prefixes) {
+ for (String alternative : alternatives) {
+ expanded.add(prefix + alternative);
+ }
+ }
+ return expanded;
+ }
+
+ private static PrefixExpansion expandCharacterClass(String globPattern,
int openIndex) {
+ int closeIndex = findClosingBracket(globPattern, openIndex);
+ if (closeIndex < 0 || closeIndex == openIndex + 1) {
+ return null;
+ }
+ int i = openIndex + 1;
+ char first = globPattern.charAt(i);
+ if (first == '!' || first == '^') {
+ return null;
+ }
+ if (containsSurrogate(globPattern, i, closeIndex)) {
+ return null;
+ }
+ List<String> values = new ArrayList<>();
+ while (i < closeIndex) {
+ char current = globPattern.charAt(i);
+ if (current == '\\') {
+ if (i + 1 >= closeIndex) {
+ return null;
+ }
+ values.add(String.valueOf(globPattern.charAt(i + 1)));
+ i += 2;
+ continue;
+ }
+ if (i + 2 < closeIndex && globPattern.charAt(i + 1) == '-') {
+ char rangeEnd = globPattern.charAt(i + 2);
+ int step = current <= rangeEnd ? 1 : -1;
+ for (char ch = current; step > 0 ? ch <= rangeEnd : ch >=
rangeEnd; ch += step) {
+ values.add(String.valueOf(ch));
+ if (values.size() > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+ return null;
+ }
+ }
+ i += 3;
+ continue;
+ }
+ values.add(String.valueOf(current));
+ i++;
+ }
+ return new PrefixExpansion(values, closeIndex + 1);
+ }
+
+ private static int findClosingBracket(String globPattern, int openIndex) {
+ for (int i = openIndex + 1; i < globPattern.length(); i++) {
+ char c = globPattern.charAt(i);
+ if (c == '\\') {
+ i++;
+ continue;
+ }
+ if (c == ']') {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ private static boolean containsSurrogate(String text, int start, int end) {
+ for (int i = start; i < end; i++) {
+ if (Character.isSurrogate(text.charAt(i))) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private static PrefixExpansion expandBraceGroup(String globPattern, int
openIndex) {
+ int closeIndex = findClosingBrace(globPattern, openIndex);
+ if (closeIndex < 0) {
+ return null;
+ }
+ List<String> alternatives = splitBraceAlternatives(
+ globPattern.substring(openIndex + 1, closeIndex));
+ if (alternatives.isEmpty()) {
+ return null;
+ }
+ List<String> values = new ArrayList<>();
+ for (String alternative : alternatives) {
+ List<String> expandedAlternative =
expandGlobListPrefixes(alternative, false);
+ if (expandedAlternative == null) {
+ return null;
+ }
+ values.addAll(expandedAlternative);
+ if (values.size() > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+ return null;
+ }
+ }
+ return new PrefixExpansion(values, closeIndex + 1);
+ }
+
+ private static int findClosingBrace(String globPattern, int openIndex) {
+ int depth = 0;
+ for (int i = openIndex; i < globPattern.length(); i++) {
+ char c = globPattern.charAt(i);
+ if (c == '\\') {
+ i++;
+ continue;
+ }
+ if (c == '{') {
+ depth++;
+ } else if (c == '}') {
+ depth--;
+ if (depth == 0) {
+ return i;
+ }
+ }
+ }
+ return -1;
+ }
+
+ private static List<String> splitBraceAlternatives(String content) {
+ List<String> alternatives = new ArrayList<>();
+ int depth = 0;
+ int start = 0;
+ for (int i = 0; i < content.length(); i++) {
+ char c = content.charAt(i);
+ if (c == '\\') {
+ i++;
+ continue;
+ }
+ if (c == '{') {
+ depth++;
+ } else if (c == '}') {
+ if (depth == 0) {
+ return Collections.emptyList();
+ }
+ depth--;
+ } else if (c == ',' && depth == 0) {
+ alternatives.add(content.substring(start, i));
+ start = i + 1;
+ }
+ }
+ if (depth != 0) {
+ return Collections.emptyList();
+ }
+ alternatives.add(content.substring(start));
+ return alternatives;
+ }
+
+ private static List<String> compactPrefixes(List<String> prefixes) {
+ List<String> sorted = new ArrayList<>(prefixes);
+ sorted.sort(UTF8_BINARY_ORDER);
+ List<String> compact = new ArrayList<>();
+ for (String prefix : sorted) {
+ if (prefix.isEmpty()) {
+ return List.of("");
+ }
+ boolean covered = false;
+ for (String existing : compact) {
+ if (prefix.startsWith(existing)) {
+ covered = true;
+ break;
+ }
+ }
+ if (!covered) {
+ compact.add(prefix);
+ }
+ }
+ return compact;
+ }
+
+ private static int compareUtf8Binary(String left, String right) {
+ byte[] leftBytes = left.getBytes(StandardCharsets.UTF_8);
+ byte[] rightBytes = right.getBytes(StandardCharsets.UTF_8);
+ int commonLength = Math.min(leftBytes.length, rightBytes.length);
+ for (int i = 0; i < commonLength; i++) {
+ int result = Integer.compare(
+ Byte.toUnsignedInt(leftBytes[i]),
Byte.toUnsignedInt(rightBytes[i]));
+ if (result != 0) {
+ return result;
+ }
+ }
+ return Integer.compare(leftBytes.length, rightBytes.length);
+ }
+
+ private static class PrefixExpansion {
+ private final List<String> values;
+ private final int nextIndex;
+
+ private PrefixExpansion(List<String> values, int nextIndex) {
+ this.values = values;
+ this.nextIndex = nextIndex;
+ }
+ }
+
/**
* Expands {@code {N..M}} numeric range syntax in a glob pattern to the
equivalent
* comma-separated alternation {@code {N,N+1,...,M}} that Java's
PathMatcher understands.
@@ -923,7 +1203,8 @@ public abstract class S3CompatibleFileSystem extends
ObjFileSystem {
// separator is '\' which would corrupt object storage keys, and (b)
Paths.get rejects keys
// containing characters illegal in the host OS path syntax (':', '\',
etc.).
Pattern matcher = Pattern.compile(globToRegex(expandedKeyPattern));
- String listPrefix = longestNonGlobPrefix(expandedKeyPattern);
+ String listPrefix = globListPrefix(expandedKeyPattern);
+ List<String> listPrefixes = globListPrefixes(expandedKeyPattern,
listPrefix);
List<FileEntry> files = new ArrayList<>();
long totalSize = 0L;
@@ -933,51 +1214,56 @@ public abstract class S3CompatibleFileSystem extends
ObjFileSystem {
String nextMatchAfterLimit = "";
String lastMatchedKey = "";
boolean isTruncated;
- String continuationToken = null;
- String listUri = base + listPrefix;
try {
- do {
- RemoteObjects response =
objStorage.listObjectsWithOptions(listUri, ObjectListOptions.builder()
- .continuationToken(continuationToken)
- .startAfter(continuationToken == null ? startAfter :
null)
- .build());
- for (RemoteObject obj : response.getObjectList()) {
- if (reachLimit) {
- // After hitting limit: find the first matching key so
callers know more data exists.
- if (nextMatchAfterLimit.isEmpty()
- && matcher.matcher(obj.getKey()).matches()) {
- nextMatchAfterLimit = obj.getKey();
+ for (String currentListPrefix : listPrefixes) {
+ String continuationToken = null;
+ String listUri = base + currentListPrefix;
+ do {
+ RemoteObjects response =
objStorage.listObjectsWithOptions(listUri, ObjectListOptions.builder()
+ .continuationToken(continuationToken)
+ .startAfter(continuationToken == null ? startAfter
: null)
+ .build());
+ for (RemoteObject obj : response.getObjectList()) {
+ if (reachLimit) {
+ // After hitting limit: find the first matching
key so callers know more data exists.
+ if (nextMatchAfterLimit.isEmpty()
+ &&
matcher.matcher(obj.getKey()).matches()) {
+ nextMatchAfterLimit = obj.getKey();
+ }
+ continue;
}
- continue;
- }
- if (!matcher.matcher(obj.getKey()).matches()) {
- continue;
- }
+ if (!matcher.matcher(obj.getKey()).matches()) {
+ continue;
+ }
- files.add(new FileEntry(
- Location.of(base + obj.getKey()),
- obj.getSize(),
- false,
- obj.getModificationTime(),
- null));
- totalSize += obj.getSize();
- lastMatchedKey = obj.getKey();
-
- if ((maxFiles > 0 && files.size() >= maxFiles)
- || (maxBytes > 0 && totalSize >= maxBytes)) {
- reachLimit = true;
+ files.add(new FileEntry(
+ Location.of(base + obj.getKey()),
+ obj.getSize(),
+ false,
+ obj.getModificationTime(),
+ null));
+ totalSize += obj.getSize();
+ lastMatchedKey = obj.getKey();
+
+ if ((maxFiles > 0 && files.size() >= maxFiles)
+ || (maxBytes > 0 && totalSize >= maxBytes)) {
+ reachLimit = true;
+ }
}
- }
- isTruncated = response.isTruncated();
- if (isTruncated) {
- continuationToken = response.getContinuationToken();
+ isTruncated = response.isTruncated();
+ if (isTruncated) {
+ continuationToken = response.getContinuationToken();
+ }
+ // Continue paginating after limit until we find the next
matching key,
+ // so callers can use it as a pagination cursor.
+ } while (isTruncated && (!reachLimit ||
nextMatchAfterLimit.isEmpty()));
+ if (reachLimit && !nextMatchAfterLimit.isEmpty()) {
+ break;
}
- // Continue paginating after limit until we find the next
matching key,
- // so callers can use it as a pagination cursor.
- } while (isTruncated && (!reachLimit ||
nextMatchAfterLimit.isEmpty()));
+ }
} catch (Exception e) {
throw new IOException("Failed to list object storage objects at "
+ uri + ": " + e.getMessage(), e);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]