liaoxin01 commented on code in PR #64684:
URL: https://github.com/apache/doris/pull/64684#discussion_r3453830404
##########
fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java:
##########
@@ -847,6 +849,247 @@ protected static String longestNonGlobPrefix(String
globPattern) {
return globPattern.substring(0, earliest);
}
+ /**
+ * Returns object-store list prefixes that are safe to push down for a
glob pattern.
+ *
+ * <p>Unlike {@link #longestNonGlobPrefix(String)}, this expands bounded
glob constructs
+ * ({@code {...}} alternation and positive {@code [...]} character
classes) before the first
+ * unbounded wildcard. That lets patterns such as
+ * {@code date=2025-{0[3-9],1[0-2]}-01/mp_id=8/*} list the concrete
date/mp prefixes instead
+ * of scanning everything under {@code date=2025-}. If expansion would be
too large or a glob
+ * construct is not safely enumerable, it falls back to the conservative
longest static prefix.
+ */
+ protected static List<String> expandedGlobListPrefixes(String globPattern)
{
+ List<String> prefixes = expandGlobListPrefixes(globPattern, true);
+ return prefixes == null ? List.of(longestNonGlobPrefix(globPattern)) :
prefixes;
+ }
+
+ private static List<String> expandGlobListPrefixes(String globPattern,
boolean allowPartialPrefix) {
+ List<String> prefixes = new ArrayList<>();
+ prefixes.add("");
+ int i = 0;
+ while (i < globPattern.length()) {
+ char c = globPattern.charAt(i);
+ if (c == '*' || c == '?') {
+ return allowPartialPrefix ? compactPrefixes(prefixes) : null;
+ }
+ if (c == '\\') {
+ if (i + 1 < globPattern.length()) {
+ appendLiteral(prefixes, globPattern.charAt(i + 1));
+ i += 2;
+ } else {
+ appendLiteral(prefixes, c);
+ i++;
+ }
+ continue;
+ }
+ if (c == '[') {
+ PrefixExpansion charClass = expandCharacterClass(globPattern,
i);
+ if (charClass == null) {
+ return allowPartialPrefix ? compactPrefixes(prefixes) :
null;
+ }
+ prefixes = appendAlternatives(prefixes, charClass.values);
+ if (prefixes == null) {
+ return null;
+ }
+ i = charClass.nextIndex;
+ continue;
+ }
+ if (c == '{') {
+ PrefixExpansion brace = expandBraceGroup(globPattern, i);
+ if (brace == null) {
+ return allowPartialPrefix ? compactPrefixes(prefixes) :
null;
+ }
+ prefixes = appendAlternatives(prefixes, brace.values);
+ if (prefixes == null) {
+ return null;
+ }
+ i = brace.nextIndex;
+ continue;
+ }
+ appendLiteral(prefixes, c);
+ i++;
+ }
+ return compactPrefixes(prefixes);
+ }
+
+ private static void appendLiteral(List<String> prefixes, char c) {
+ for (int i = 0; i < prefixes.size(); i++) {
+ prefixes.set(i, prefixes.get(i) + c);
+ }
+ }
+
+ private static List<String> appendAlternatives(List<String> prefixes,
List<String> alternatives) {
+ long expandedSize = (long) prefixes.size() * alternatives.size();
+ if (expandedSize > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+ return null;
+ }
+ List<String> expanded = new ArrayList<>((int) expandedSize);
+ for (String prefix : prefixes) {
+ for (String alternative : alternatives) {
+ expanded.add(prefix + alternative);
+ }
+ }
+ return expanded;
+ }
+
+ private static PrefixExpansion expandCharacterClass(String globPattern,
int openIndex) {
+ int closeIndex = findClosingBracket(globPattern, openIndex);
+ if (closeIndex < 0 || closeIndex == openIndex + 1) {
+ return null;
+ }
+ int i = openIndex + 1;
+ char first = globPattern.charAt(i);
+ if (first == '!' || first == '^') {
+ return null;
+ }
+ List<String> values = new ArrayList<>();
+ while (i < closeIndex) {
+ char current = globPattern.charAt(i);
Review Comment:
This character-class expansion operates on UTF-16 `char` values, while Java
regex character classes can match a supplementary Unicode code point as one
character. For example, `data/[😀]/file.csv` matches `data/😀/file.csv`, but this
loop expands the emoji into two unpaired-surrogate prefixes. Those prefixes
cannot list the real UTF-8 object key, so the matching file is omitted even
without pagination. Please iterate by Unicode code point (including ranges), or
conservatively fall back when a class contains supplementary characters, and
add a test for an emoji class.
##########
fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java:
##########
@@ -847,6 +849,247 @@ protected static String longestNonGlobPrefix(String
globPattern) {
return globPattern.substring(0, earliest);
}
+ /**
+ * Returns object-store list prefixes that are safe to push down for a
glob pattern.
+ *
+ * <p>Unlike {@link #longestNonGlobPrefix(String)}, this expands bounded
glob constructs
+ * ({@code {...}} alternation and positive {@code [...]} character
classes) before the first
+ * unbounded wildcard. That lets patterns such as
+ * {@code date=2025-{0[3-9],1[0-2]}-01/mp_id=8/*} list the concrete
date/mp prefixes instead
+ * of scanning everything under {@code date=2025-}. If expansion would be
too large or a glob
+ * construct is not safely enumerable, it falls back to the conservative
longest static prefix.
+ */
+ protected static List<String> expandedGlobListPrefixes(String globPattern)
{
+ List<String> prefixes = expandGlobListPrefixes(globPattern, true);
+ return prefixes == null ? List.of(longestNonGlobPrefix(globPattern)) :
prefixes;
+ }
+
+ private static List<String> expandGlobListPrefixes(String globPattern,
boolean allowPartialPrefix) {
+ List<String> prefixes = new ArrayList<>();
+ prefixes.add("");
+ int i = 0;
+ while (i < globPattern.length()) {
+ char c = globPattern.charAt(i);
+ if (c == '*' || c == '?') {
+ return allowPartialPrefix ? compactPrefixes(prefixes) : null;
+ }
+ if (c == '\\') {
+ if (i + 1 < globPattern.length()) {
+ appendLiteral(prefixes, globPattern.charAt(i + 1));
+ i += 2;
+ } else {
+ appendLiteral(prefixes, c);
+ i++;
+ }
+ continue;
+ }
+ if (c == '[') {
+ PrefixExpansion charClass = expandCharacterClass(globPattern,
i);
+ if (charClass == null) {
+ return allowPartialPrefix ? compactPrefixes(prefixes) :
null;
+ }
+ prefixes = appendAlternatives(prefixes, charClass.values);
+ if (prefixes == null) {
+ return null;
+ }
+ i = charClass.nextIndex;
+ continue;
+ }
+ if (c == '{') {
+ PrefixExpansion brace = expandBraceGroup(globPattern, i);
+ if (brace == null) {
+ return allowPartialPrefix ? compactPrefixes(prefixes) :
null;
+ }
+ prefixes = appendAlternatives(prefixes, brace.values);
+ if (prefixes == null) {
+ return null;
+ }
+ i = brace.nextIndex;
+ continue;
+ }
+ appendLiteral(prefixes, c);
+ i++;
+ }
+ return compactPrefixes(prefixes);
+ }
+
+ private static void appendLiteral(List<String> prefixes, char c) {
+ for (int i = 0; i < prefixes.size(); i++) {
+ prefixes.set(i, prefixes.get(i) + c);
+ }
+ }
+
+ private static List<String> appendAlternatives(List<String> prefixes,
List<String> alternatives) {
+ long expandedSize = (long) prefixes.size() * alternatives.size();
+ if (expandedSize > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+ return null;
+ }
+ List<String> expanded = new ArrayList<>((int) expandedSize);
+ for (String prefix : prefixes) {
+ for (String alternative : alternatives) {
+ expanded.add(prefix + alternative);
+ }
+ }
+ return expanded;
+ }
+
+ private static PrefixExpansion expandCharacterClass(String globPattern,
int openIndex) {
+ int closeIndex = findClosingBracket(globPattern, openIndex);
+ if (closeIndex < 0 || closeIndex == openIndex + 1) {
+ return null;
+ }
+ int i = openIndex + 1;
+ char first = globPattern.charAt(i);
+ if (first == '!' || first == '^') {
+ return null;
+ }
+ List<String> values = new ArrayList<>();
+ while (i < closeIndex) {
+ char current = globPattern.charAt(i);
+ if (current == '\\') {
+ if (i + 1 >= closeIndex) {
+ return null;
+ }
+ values.add(String.valueOf(globPattern.charAt(i + 1)));
+ i += 2;
+ continue;
+ }
+ if (i + 2 < closeIndex && globPattern.charAt(i + 1) == '-') {
+ char rangeEnd = globPattern.charAt(i + 2);
+ int step = current <= rangeEnd ? 1 : -1;
+ for (char ch = current; step > 0 ? ch <= rangeEnd : ch >=
rangeEnd; ch += step) {
+ values.add(String.valueOf(ch));
+ if (values.size() > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+ return null;
+ }
+ }
+ i += 3;
+ continue;
+ }
+ values.add(String.valueOf(current));
+ i++;
+ }
+ return new PrefixExpansion(values, closeIndex + 1);
+ }
+
+ private static int findClosingBracket(String globPattern, int openIndex) {
+ for (int i = openIndex + 1; i < globPattern.length(); i++) {
+ char c = globPattern.charAt(i);
+ if (c == '\\') {
+ i++;
+ continue;
+ }
+ if (c == ']') {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ private static PrefixExpansion expandBraceGroup(String globPattern, int
openIndex) {
+ int closeIndex = findClosingBrace(globPattern, openIndex);
+ if (closeIndex < 0) {
+ return null;
+ }
+ List<String> alternatives = splitBraceAlternatives(
+ globPattern.substring(openIndex + 1, closeIndex));
+ if (alternatives.isEmpty()) {
+ return null;
+ }
+ List<String> values = new ArrayList<>();
+ for (String alternative : alternatives) {
+ List<String> expandedAlternative =
expandGlobListPrefixes(alternative, false);
+ if (expandedAlternative == null) {
+ return null;
+ }
+ values.addAll(expandedAlternative);
+ if (values.size() > MAX_EXPANDED_GLOB_LIST_PREFIXES) {
+ return null;
+ }
+ }
+ return new PrefixExpansion(values, closeIndex + 1);
+ }
+
+ private static int findClosingBrace(String globPattern, int openIndex) {
+ int depth = 0;
+ for (int i = openIndex; i < globPattern.length(); i++) {
+ char c = globPattern.charAt(i);
+ if (c == '\\') {
+ i++;
+ continue;
+ }
+ if (c == '{') {
+ depth++;
+ } else if (c == '}') {
+ depth--;
+ if (depth == 0) {
+ return i;
+ }
+ }
+ }
+ return -1;
+ }
+
+ private static List<String> splitBraceAlternatives(String content) {
+ List<String> alternatives = new ArrayList<>();
+ int depth = 0;
+ int start = 0;
+ for (int i = 0; i < content.length(); i++) {
+ char c = content.charAt(i);
+ if (c == '\\') {
+ i++;
+ continue;
+ }
+ if (c == '{') {
+ depth++;
+ } else if (c == '}') {
+ if (depth == 0) {
+ return Collections.emptyList();
+ }
+ depth--;
+ } else if (c == ',' && depth == 0) {
+ alternatives.add(content.substring(start, i));
+ start = i + 1;
+ }
+ }
+ if (depth != 0) {
+ return Collections.emptyList();
+ }
+ alternatives.add(content.substring(start));
+ return alternatives;
+ }
+
+ private static List<String> compactPrefixes(List<String> prefixes) {
+ TreeSet<String> sorted = new TreeSet<>(prefixes);
Review Comment:
`TreeSet<String>` uses Java UTF-16 ordering, which can differ from the
object-store key ordering for supplementary Unicode characters. For example,
Java orders `😀` before `\uE000`, while their UTF-8 byte order is `\uE000`
before `😀`. With `maxFiles=1`, scanning these prefixes in the opposite order
can return the later key first, expose the earlier key as `maxFile`, and then
duplicate/skip entries when that cursor is passed back as `startAfter`. The
prefix order must use the same unsigned UTF-8 byte ordering as the object
store, or the pagination logic must not depend on cross-prefix ordering. Please
add a two-page test covering this ordering mismatch.
##########
fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java:
##########
@@ -933,51 +1177,56 @@ public GlobListing globListWithLimit(Location path,
String startAfter, long maxB
String nextMatchAfterLimit = "";
String lastMatchedKey = "";
boolean isTruncated;
- String continuationToken = null;
- String listUri = base + listPrefix;
try {
- do {
- RemoteObjects response =
objStorage.listObjectsWithOptions(listUri, ObjectListOptions.builder()
- .continuationToken(continuationToken)
- .startAfter(continuationToken == null ? startAfter :
null)
- .build());
- for (RemoteObject obj : response.getObjectList()) {
- if (reachLimit) {
- // After hitting limit: find the first matching key so
callers know more data exists.
- if (nextMatchAfterLimit.isEmpty()
- && matcher.matcher(obj.getKey()).matches()) {
- nextMatchAfterLimit = obj.getKey();
+ for (String currentListPrefix : listPrefixes) {
+ String continuationToken = null;
+ String listUri = base + currentListPrefix;
Review Comment:
This can regress S3 directory buckets (S3 Express). Directory buckets only
accept listing prefixes that end in `/`, but expansion can change a previously
valid static prefix such as `data/` into `data/a` and `data/b` for
`data/[ab]*.csv`, causing the ListObjectsV2 request to fail. Directory buckets
also do not support `StartAfter` and do not guarantee lexicographic listing
order, so this multi-prefix pagination model is not valid for them. Please
disable this optimization for directory buckets (falling back to the
slash-terminated static prefix) or add a directory-bucket-specific listing path.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]