This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch optimize_s3_skip_list_for_deterministic_paths in repository https://gitbox.apache.org/repos/asf/doris.git
commit 8fd43e3fc87de39a8d04bf3e28f875ce45e5e24b Author: Yongqiang YANG <[email protected]> AuthorDate: Sun Feb 1 12:44:21 2026 -0800 [opt](s3) Skip S3 listing for deterministic file paths using HEAD requests For S3 paths without wildcards (*, ?, [...]), use HEAD requests instead of ListObjectsV2 to avoid requiring s3:ListBucket permission. This is useful when only s3:GetObject permission is granted. Brace patterns like {1..10} are expanded to concrete file paths and verified individually with HEAD requests. --- .../java/org/apache/doris/common/util/S3Util.java | 111 +++++++++++++++++++++ .../java/org/apache/doris/fs/obj/S3ObjStorage.java | 64 ++++++++++++ .../org/apache/doris/common/util/S3UtilTest.java | 106 ++++++++++++++++++++ 3 files changed, 281 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java index e537d1f47b0..08e907e53de 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java @@ -54,6 +54,8 @@ import software.amazon.awssdk.services.sts.auth.StsAssumeRoleCredentialsProvider import java.net.HttpURLConnection; import java.net.URI; import java.net.URL; +import java.util.ArrayList; +import java.util.List; import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; @@ -433,4 +435,113 @@ public class S3Util { SecurityChecker.getInstance().stopSSRFChecking(); } } + + /** + * Check if a path pattern is deterministic, meaning all file paths can be determined + * without listing. A pattern is deterministic if it contains no wildcard characters + * (*, ?, [...]) but may contain brace patterns ({...}) which can be expanded. + * + * This allows skipping S3 ListBucket operations when only GetObject permission is available. + * + * @param pathPattern Path that may contain glob patterns + * @return true if the pattern is deterministic (no wildcards) + */ + public static boolean isDeterministicPattern(String pathPattern) { + // Check for wildcard characters that require listing + // Note: '{' is NOT a wildcard - it's a brace expansion pattern that can be deterministically expanded + char[] wildcardChars = {'*', '?', '['}; + for (char c : wildcardChars) { + if (pathPattern.indexOf(c) != -1) { + return false; + } + } + // Check for escaped characters which indicate complex patterns + if (pathPattern.indexOf('\\') != -1) { + return false; + } + return true; + } + + /** + * Expand brace patterns in a path to generate all concrete file paths. + * Handles nested and multiple brace patterns. + * + * Examples: + * - "file{1,2,3}.csv" => ["file1.csv", "file2.csv", "file3.csv"] + * - "data/part{1..3}/file.csv" => ["data/part1/file.csv", "data/part2/file.csv", "data/part3/file.csv"] + * - "file.csv" => ["file.csv"] (no braces) + * + * @param pathPattern Path with optional brace patterns (already processed by extendGlobs) + * @return List of expanded concrete paths + */ + public static List<String> expandBracePatterns(String pathPattern) { + List<String> result = new ArrayList<>(); + expandBracePatternsRecursive(pathPattern, result); + return result; + } + + private static void expandBracePatternsRecursive(String pattern, List<String> result) { + int braceStart = pattern.indexOf('{'); + if (braceStart == -1) { + // No more braces, add the pattern as-is + result.add(pattern); + return; + } + + // Find matching closing brace (handle nested braces) + int braceEnd = findMatchingBrace(pattern, braceStart); + if (braceEnd == -1) { + // Malformed pattern, treat as literal + result.add(pattern); + return; + } + + String prefix = pattern.substring(0, braceStart); + String braceContent = pattern.substring(braceStart + 1, braceEnd); + String suffix = pattern.substring(braceEnd + 1); + + // Split by comma, but respect nested braces + List<String> alternatives = splitBraceContent(braceContent); + + for (String alt : alternatives) { + // Recursively expand any remaining braces in the suffix + expandBracePatternsRecursive(prefix + alt + suffix, result); + } + } + + private static int findMatchingBrace(String pattern, int start) { + int depth = 0; + for (int i = start; i < pattern.length(); i++) { + char c = pattern.charAt(i); + if (c == '{') { + depth++; + } else if (c == '}') { + depth--; + if (depth == 0) { + return i; + } + } + } + return -1; + } + + private static List<String> splitBraceContent(String content) { + List<String> parts = new ArrayList<>(); + int depth = 0; + int start = 0; + + for (int i = 0; i < content.length(); i++) { + char c = content.charAt(i); + if (c == '{') { + depth++; + } else if (c == '}') { + depth--; + } else if (c == ',' && depth == 0) { + parts.add(content.substring(start, i)); + start = i + 1; + } + } + parts.add(content.substring(start)); + return parts; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java index 9cd0c6ea9e5..9c062f6d6e1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java +++ b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java @@ -578,6 +578,70 @@ public class S3ObjStorage implements ObjStorage<S3Client> { } bucket = uri.getBucket(); + + // Optimization: For deterministic paths (no wildcards like *, ?, [...]), + // use HEAD requests instead of listing to avoid requiring ListBucket permission. + // This is useful when only GetObject permission is granted. + String keyPattern = uri.getKey(); + if (S3Util.isDeterministicPattern(keyPattern) && !hasLimits && startFile == null) { + // First expand any {..} patterns, then use HEAD requests + String expandedPattern = S3Util.extendGlobs(keyPattern); + List<String> expandedPaths = S3Util.expandBracePatterns(expandedPattern); + + if (LOG.isDebugEnabled()) { + LOG.debug("Using HEAD requests for deterministic path pattern: {}, expanded to {} paths", + remotePath, expandedPaths.size()); + } + + for (String key : expandedPaths) { + String fullPath = "s3://" + bucket + "/" + key; + try { + HeadObjectResponse headResponse = getClient() + .headObject(HeadObjectRequest.builder() + .bucket(bucket) + .key(key) + .build()); + + matchCnt++; + matchFileSize += headResponse.contentLength(); + RemoteFile remoteFile = new RemoteFile( + fileNameOnly ? Paths.get(key).getFileName().toString() : fullPath, + true, // isFile + headResponse.contentLength(), + headResponse.contentLength(), + headResponse.lastModified() != null + ? headResponse.lastModified().toEpochMilli() : 0 + ); + result.add(remoteFile); + + if (LOG.isDebugEnabled()) { + LOG.debug("HEAD success for {}: size={}", fullPath, headResponse.contentLength()); + } + } catch (NoSuchKeyException e) { + // File does not exist, skip it (this is expected for some expanded patterns) + if (LOG.isDebugEnabled()) { + LOG.debug("File does not exist (skipped): {}", fullPath); + } + } catch (S3Exception e) { + if (e.statusCode() == HttpStatus.SC_NOT_FOUND) { + if (LOG.isDebugEnabled()) { + LOG.debug("File does not exist (skipped): {}", fullPath); + } + } else { + throw e; + } + } + } + + if (LOG.isDebugEnabled()) { + long duration = System.nanoTime() - startTime; + LOG.debug("Deterministic path HEAD requests: checked {} paths, found {} files, took {} ms", + expandedPaths.size(), matchCnt, duration / 1000 / 1000); + } + + return new GlobListResult(Status.OK, currentMaxFile, bucket, ""); + } + String globPath = S3Util.extendGlobs(uri.getKey()); if (LOG.isDebugEnabled()) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java index 23715440e8c..914434aea30 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java @@ -20,6 +20,9 @@ package org.apache.doris.common.util; import org.junit.Assert; import org.junit.Test; +import java.util.Arrays; +import java.util.List; + public class S3UtilTest { @Test @@ -248,5 +251,108 @@ public class S3UtilTest { String result = S3Util.extendGlobs(input); Assert.assertEquals(expected, result); } + + // Tests for isDeterministicPattern + + @Test + public void testIsDeterministicPattern_simpleFile() { + // Simple file path without any patterns + Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file.csv")); + } + + @Test + public void testIsDeterministicPattern_withBraces() { + // Path with brace pattern (deterministic - can be expanded) + Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file{1,2,3}.csv")); + Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file{1..3}.csv")); + } + + @Test + public void testIsDeterministicPattern_withAsterisk() { + // Path with asterisk wildcard (not deterministic) + Assert.assertFalse(S3Util.isDeterministicPattern("path/to/*.csv")); + Assert.assertFalse(S3Util.isDeterministicPattern("path/*/file.csv")); + } + + @Test + public void testIsDeterministicPattern_withQuestionMark() { + // Path with question mark wildcard (not deterministic) + Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file?.csv")); + } + + @Test + public void testIsDeterministicPattern_withBrackets() { + // Path with bracket pattern (not deterministic) + Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file[0-9].csv")); + Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file[abc].csv")); + } + + @Test + public void testIsDeterministicPattern_withEscape() { + // Path with escape character (not deterministic - complex pattern) + Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file\\*.csv")); + } + + @Test + public void testIsDeterministicPattern_mixed() { + // Path with both braces and wildcards + Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file{1,2}/*.csv")); + } + + // Tests for expandBracePatterns + + @Test + public void testExpandBracePatterns_noBraces() { + // No braces - returns single path + List<String> result = S3Util.expandBracePatterns("path/to/file.csv"); + Assert.assertEquals(Arrays.asList("path/to/file.csv"), result); + } + + @Test + public void testExpandBracePatterns_simpleBrace() { + // Simple brace expansion + List<String> result = S3Util.expandBracePatterns("file{1,2,3}.csv"); + Assert.assertEquals(Arrays.asList("file1.csv", "file2.csv", "file3.csv"), result); + } + + @Test + public void testExpandBracePatterns_multipleBraces() { + // Multiple brace expansions + List<String> result = S3Util.expandBracePatterns("dir{a,b}/file{1,2}.csv"); + Assert.assertEquals(Arrays.asList( + "dira/file1.csv", "dira/file2.csv", + "dirb/file1.csv", "dirb/file2.csv"), result); + } + + @Test + public void testExpandBracePatterns_emptyBrace() { + // Empty brace content + List<String> result = S3Util.expandBracePatterns("file{}.csv"); + Assert.assertEquals(Arrays.asList("file.csv"), result); + } + + @Test + public void testExpandBracePatterns_singleValue() { + // Single value in brace + List<String> result = S3Util.expandBracePatterns("file{1}.csv"); + Assert.assertEquals(Arrays.asList("file1.csv"), result); + } + + @Test + public void testExpandBracePatterns_withPath() { + // Full path with braces + List<String> result = S3Util.expandBracePatterns("data/year{2023,2024}/month{01,02}/file.csv"); + Assert.assertEquals(8, result.size()); + Assert.assertTrue(result.contains("data/year2023/month01/file.csv")); + Assert.assertTrue(result.contains("data/year2024/month02/file.csv")); + } + + @Test + public void testExpandBracePatterns_extendedRange() { + // Test with extended range (after extendGlobs processing) + String expanded = S3Util.extendGlobs("file{1..3}.csv"); + List<String> result = S3Util.expandBracePatterns(expanded); + Assert.assertEquals(Arrays.asList("file1.csv", "file2.csv", "file3.csv"), result); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
