This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 84fcb73ab5f [fix](Azure) Enhance the glob list's logic for azure file
system in FE (#37490)
84fcb73ab5f is described below
commit 84fcb73ab5feefe14feeea0440e7d2e14bf6ef61
Author: AlexYue <[email protected]>
AuthorDate: Tue Jul 9 09:46:24 2024 +0800
[fix](Azure) Enhance the glob list's logic for azure file system in FE
(#37490)
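Previously in FE, glob patterns such as
`s3://qa-build/regression/tpcds/sf100_split/catalog_sales.dat.*.gz` did
not work, because the whole glob pattern was used as the blob list prefix.
The listing now uses the longest wildcard-free prefix of the pattern.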
---
.../org/apache/doris/fs/obj/AzureObjStorage.java | 62 +++++++++++++++++-----
1 file changed, 48 insertions(+), 14 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
index 358b66b44b2..73d9444cc8a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
@@ -299,47 +299,81 @@ public class AzureObjStorage implements
ObjStorage<BlobServiceClient> {
return String.format("s3://%s/%s", bucket, fileName);
}
+ /**
+ * Returns the leading part of a glob pattern that contains no glob special
+ * character. The special characters are star, question mark, opening square
+ * bracket, opening curly brace and backslash.
+ *
+ * @param globPattern the glob pattern to inspect
+ * @return the text before the first special character, or the whole pattern
+ * when it contains none
+ */
+ public static String getLongestPrefix(String globPattern) {
+ final String globMetaChars = "*?[{\\";
+
+ // Single left-to-right scan: the first special character ends the prefix.
+ for (int pos = 0; pos < globPattern.length(); pos++) {
+ if (globMetaChars.indexOf(globPattern.charAt(pos)) >= 0) {
+ return globPattern.substring(0, pos);
+ }
+ }
+
+ return globPattern;
+ }
+
public Status globList(String remotePath, List<RemoteFile> result, boolean
fileNameOnly) {
+ long roundCnt = 0;
+ long elementCnt = 0;
+ long matchCnt = 0;
+ long startTime = System.nanoTime();
+ Status st = Status.OK;
try {
S3URI uri = S3URI.create(remotePath, isUsePathStyle,
forceParsingByStandardUri);
String globPath = uri.getKey();
+ String bucket = uri.getBucket();
LOG.info("try to glob list for azure, remote path {}, orig {}",
globPath, remotePath);
- BlobContainerClient client =
getClient().getBlobContainerClient(uri.getBucket());
+ BlobContainerClient client =
getClient().getBlobContainerClient(bucket);
java.nio.file.Path pathPattern = Paths.get(globPath);
LOG.info("path pattern {}", pathPattern.toString());
PathMatcher matcher =
FileSystems.getDefault().getPathMatcher("glob:" + pathPattern.toString());
- ListBlobsOptions options = new
ListBlobsOptions().setPrefix(globPath);
+ String listPrefix = getLongestPrefix(globPath);
+ LOG.info("azure glob list prefix is {}", listPrefix);
+ ListBlobsOptions options = new
ListBlobsOptions().setPrefix(listPrefix);
String newContinuationToken = null;
do {
+ roundCnt++;
PagedIterable<BlobItem> pagedBlobs = client.listBlobs(options,
newContinuationToken, null);
PagedResponse<BlobItem> pagedResponse =
pagedBlobs.iterableByPage().iterator().next();
for (BlobItem blobItem : pagedResponse.getElements()) {
+ elementCnt++;
java.nio.file.Path blobPath =
Paths.get(blobItem.getName());
- if (matcher.matches(blobPath)) {
- RemoteFile remoteFile = new RemoteFile(
- fileNameOnly ?
blobPath.getFileName().toString() : constructS3Path(blobPath.toString(),
- uri.getBucket()),
- !blobItem.isPrefix(),
- blobItem.isPrefix() ? -1 :
blobItem.getProperties().getContentLength(),
- blobItem.getProperties().getContentLength(),
-
blobItem.getProperties().getLastModified().getSecond());
- result.add(remoteFile);
+ if (!matcher.matches(blobPath)) {
+ continue;
}
+ matchCnt++;
+ RemoteFile remoteFile = new RemoteFile(
+ fileNameOnly ? blobPath.getFileName().toString() :
constructS3Path(blobPath.toString(),
+ uri.getBucket()),
+ !blobItem.isPrefix(),
+ blobItem.isPrefix() ? -1 :
blobItem.getProperties().getContentLength(),
+ blobItem.getProperties().getContentLength(),
+
blobItem.getProperties().getLastModified().getSecond());
+ result.add(remoteFile);
}
newContinuationToken = pagedResponse.getContinuationToken();
} while (newContinuationToken != null);
} catch (BlobStorageException e) {
LOG.warn("glob file " + remotePath + " failed because azure error:
" + e.getMessage());
- return new Status(Status.ErrCode.COMMON_ERROR, "glob file " +
remotePath
+ st = new Status(Status.ErrCode.COMMON_ERROR, "glob file " +
remotePath
+ " failed because azure error: " + e.getMessage());
} catch (Exception e) {
LOG.warn("errors while glob file " + remotePath, e);
- return new Status(Status.ErrCode.COMMON_ERROR, "errors while glob
file " + remotePath + e.getMessage());
+ st = new Status(Status.ErrCode.COMMON_ERROR, "errors while glob
file " + remotePath + e.getMessage());
+ } finally {
+ long endTime = System.nanoTime();
+ long duration = endTime - startTime;
+ LOG.info("process {} elements under prefix {} for {} round, match
{} elements, take {} micro second",
+ remotePath, elementCnt, matchCnt, roundCnt,
+ duration / 1000);
}
- return Status.OK;
+ return st;
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]