>From Ritik Raj <[email protected]>:
Ritik Raj has uploaded this change for review. (
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20503?usp=email )
Change subject: [NO ISSUE][CLOUD] Updating deleteObjects for AzureBlob
......................................................................
[NO ISSUE][CLOUD] Updating deleteObjects for AzureBlob
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
Azure's bulk delete fails if any of the object among
the candidates do not exists.
Hence, adding a thresold of 38 requests, if the paths
are less than that, then individual deleteIfExists,
otherwise call LIST over the common prefix and
delete the objects.
Ext-ref: MB-68999
Change-Id: Iaeb705b396aa72440759e4008c72f276c936c98e
---
M
asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
M
asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
M
hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
M
hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
4 files changed, 186 insertions(+), 35 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb
refs/changes/03/20503/1
diff --git
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
index 889a6a4..4630d1b 100644
---
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
+++
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
@@ -31,6 +31,26 @@
public class AzBlobStorageClientConfig {
// Ref:
https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch?tabs=microsoft-entra-id
static final int DELETE_BATCH_SIZE = 256;
+ /*
+ * Performance Analysis: Individual deleteIfExists() vs Batch Delete
+ *
+ * Individual approach: N × 8ms (one DELETE per blob)
+ * Batch approach with LIST verification: 300ms (LIST entire container) +
(N / 256) × 8ms (batch deletes)
+ * Batch approach with exists() checks: N × 8ms (HEAD requests) + (N /
256) × 8ms (batch deletes)
+ *
+ * Break-even calculation (Individual vs LIST + Batch):
+ * N × 8 = 300 + (N / 256) × 8
+ * N × 8 × (1 - 1/256) = 300
+ * N × 7.96875 = 300
+ * N ≈ 38 blobs
+ *
+ * Conclusion: Individual deleteIfExists() is faster for < 38 blobs, batch
is faster for > 38 blobs.
+ * However, the exists() check approach (N × 8ms + batch overhead) is
always slower than individual.
+ *
+ * For typical collection creation (deleting 1 mask file), individual
deleteIfExists() is optimal.
+ * Using deleteIfExists() avoids the expensive LIST operation (300ms →
8ms, ~37x speedup).
+ */
+ static final int DELETE_INDIVIDUAL_OR_BATCH_THRESHOLD = 38;
private final int writeBufferSize;
private final String region;
diff --git
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
index 4a61f1c..fc95354 100644
---
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
+++
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
@@ -20,6 +20,7 @@
package org.apache.asterix.cloud.clients.azure.blobstorage;
import static
org.apache.asterix.cloud.clients.azure.blobstorage.AzBlobStorageClientConfig.DELETE_BATCH_SIZE;
+import static
org.apache.asterix.cloud.clients.azure.blobstorage.AzBlobStorageClientConfig.DELETE_INDIVIDUAL_OR_BATCH_THRESHOLD;
import java.io.ByteArrayOutputStream;
import java.io.FilenameFilter;
@@ -55,6 +56,7 @@
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.api.io.FileReference;
import org.apache.hyracks.control.nc.io.IOManager;
+import org.apache.hyracks.util.StringUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -254,7 +256,8 @@
String srcBlobUrl = srcBlobClient.getBlobUrl();
profiler.objectCopy();
guardian.checkWriteAccess(bucket, destPath.getRelativePath());
- BlobClient destBlobClient =
blobContainerClient.getBlobClient(destPath.getFile().getPath());
+ BlobClient destBlobClient =
+ blobContainerClient.getBlobClient(config.getPrefix() +
destPath.getFile().getPath());
destBlobClient.beginCopy(srcBlobUrl, null);
}
@@ -277,37 +280,96 @@
public void deleteObjects(String bucket, Collection<String> paths) throws
HyracksDataException {
if (paths.isEmpty())
return;
- Set<BlobItem> blobsToDelete = getBlobsMatchingThesePaths(paths);
- List<String> blobURLs = getBlobURLs(blobsToDelete);
- if (blobURLs.isEmpty())
- return;
- Collection<List<String>> batchedBlobURLs =
getBatchedBlobURLs(blobURLs);
- for (List<String> batch : batchedBlobURLs) {
- PagedIterable<Response<Void>> responses =
blobBatchClient.deleteBlobs(batch, null);
- Iterator<String> deletePathIter = paths.iterator();
- String deletedPath;
- String failedDeletedPath = null;
- for (Response<Void> response : responses) {
- deletedPath = deletePathIter.next();
- // The response.getStatusCode() method returns:
- // - 202 (Accepted) if the delete operation is successful
- // - exception if the delete operation fails
- int statusCode = response.getStatusCode();
- if (statusCode != SUCCESS_RESPONSE_CODE) {
- LOGGER.warn("Failed to delete blob: {} with status code:
{} while deleting {}", deletedPath,
- statusCode, paths.toString());
- if (failedDeletedPath == null) {
- failedDeletedPath = deletedPath;
- }
- }
+
+ // For small deletes, use individual deleteIfExists (efficient and
handles non-existent blobs)
+ // For large deletes, batch operations might be worth it despite the
overhead
+ guardian.checkWriteAccess(bucket, null);
+ if (paths.size() < DELETE_INDIVIDUAL_OR_BATCH_THRESHOLD) {
+ deleteObjectsIndividually(bucket, paths);
+ } else {
+ deleteObjectsWithListAndBatch(bucket, paths);
+ }
+ }
+
+ private void deleteObjectsIndividually(String bucket, Collection<String>
paths) throws HyracksDataException {
+ for (String path : paths) {
+ if (path == null || path.isEmpty()) {
+ continue;
}
- if (failedDeletedPath != null) {
- throw new RuntimeDataException(ErrorCode.CLOUD_IO_FAILURE,
"DELETE", failedDeletedPath,
- paths.toString());
+ try {
+ profiler.objectDelete();
+ BlobClient blobClient =
blobContainerClient.getBlobClient(config.getPrefix() + path);
+ blobClient.deleteIfExists();
+ } catch (Exception ex) {
+ throw HyracksDataException.create(ex);
}
}
}
+ private void deleteObjectsWithListAndBatch(String bucket,
Collection<String> paths) throws HyracksDataException {
+ String commonPrefix = StringUtil.findLongestCommonDirPrefix(paths);
+
+ // Build a lookup of targeted blob names (with configured prefix)
+ Set<String> targetNames = new java.util.HashSet<>();
+ for (String path : paths) {
+ if (path != null && !path.isEmpty()) {
+ targetNames.add(config.getPrefix() + path);
+ }
+ }
+
+ // List only under the commonPrefix to avoid full-container LIST
+ String listPrefix = config.getPrefix() + commonPrefix;
+ ListBlobsOptions options = new ListBlobsOptions().setPrefix(listPrefix)
+ .setDetails(new BlobListDetails().setRetrieveMetadata(false));
+
+ PagedIterable<BlobItem> blobItems =
blobContainerClient.listBlobs(options, null);
+ List<String> namesToDelete = new ArrayList<>();
+ for (BlobItem bi : blobItems) {
+ String name = bi.getName();
+ if (targetNames.contains(name)) {
+ namesToDelete.add(name);
+ }
+ }
+
+ if (namesToDelete.isEmpty()) {
+ LOGGER.debug("No existing blobs to delete under prefix {}",
commonPrefix);
+ return;
+ }
+
+ // Convert names to full URLs and batch-delete
+ List<String> blobURLs = toBlobUrlsFromNames(namesToDelete);
+ Collection<List<String>> batchedBlobURLs =
getBatchedBlobURLs(blobURLs);
+ for (List<String> batchUrls : batchedBlobURLs) {
+ profiler.objectDelete();
+ PagedIterable<Response<Void>> responses =
blobBatchClient.deleteBlobs(batchUrls, null);
+ int i = 0;
+ String failed = null;
+ for (Response<Void> response : responses) {
+ int statusCode = response.getStatusCode();
+ if (statusCode != SUCCESS_RESPONSE_CODE) {
+ String failedUrl = batchUrls.get(i);
+ LOGGER.warn("Failed to delete blob URL {} with status code
{}", failedUrl, statusCode);
+ if (failed == null) {
+ failed = failedUrl;
+ }
+ }
+ i++;
+ }
+ if (failed != null) {
+ throw new RuntimeDataException(ErrorCode.CLOUD_IO_FAILURE,
"DELETE", failed, batchUrls.toString());
+ }
+ }
+ }
+
+ private List<String> toBlobUrlsFromNames(Collection<String> blobNames) {
+ final String blobURLPrefix = blobContainerClient.getBlobContainerUrl()
+ "/";
+ List<String> urls = new ArrayList<>(blobNames.size());
+ for (String name : blobNames) {
+ urls.add(blobURLPrefix + name);
+ }
+ return urls;
+ }
+
private Collection<List<String>> getBatchedBlobURLs(List<String> blobURLs)
{
int startIdx = 0;
Collection<List<String>> batchedBLOBURLs = new ArrayList<>();
@@ -324,14 +386,6 @@
return batchedBLOBURLs;
}
- private Set<BlobItem> getBlobsMatchingThesePaths(Collection<String> paths)
{
- List<String> pathWithPrefix =
- paths.stream().map(path -> config.getPrefix() +
path).collect(Collectors.toList());
- PagedIterable<BlobItem> blobItems = blobContainerClient.listBlobs();
- return blobItems.stream().filter(blobItem ->
pathWithPrefix.contains(blobItem.getName()))
- .collect(Collectors.toSet());
- }
-
@Override
public long getObjectSize(String bucket, String path) throws
HyracksDataException {
guardian.checkReadAccess(bucket, path);
diff --git
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
index df33f6b..56efbda 100644
---
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
+++
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
@@ -67,4 +67,40 @@
public static ICharAccessor<byte[]> getByteArrayAsCharAccessor() {
return (input, index) -> (char) input[index];
}
+
+ /**
+ * Finds the longest common directory prefix among the given POSIX-style
paths.
+ * The returned prefix always ends at a directory boundary (i.e., after
the last '/'),
+ * or empty string if no common directory exists.
+ */
+ public static String findLongestCommonDirPrefix(Iterable<String> paths) {
+ if (paths == null) {
+ return "";
+ }
+ java.util.Iterator<String> it = paths.iterator();
+ if (!it.hasNext()) {
+ return "";
+ }
+ String prefix = it.next();
+ while (it.hasNext() && prefix != null && !prefix.isEmpty()) {
+ prefix = commonPrefix(prefix, it.next());
+ }
+ if (prefix == null || prefix.isEmpty()) {
+ return "";
+ }
+ int lastSlash = prefix.lastIndexOf('/');
+ return lastSlash >= 0 ? prefix.substring(0, lastSlash + 1) : "";
+ }
+
+ private static String commonPrefix(String a, String b) {
+ if (a == null || b == null) {
+ return "";
+ }
+ int len = Math.min(a.length(), b.length());
+ int i = 0;
+ while (i < len && a.charAt(i) == b.charAt(i)) {
+ i++;
+ }
+ return a.substring(0, i);
+ }
}
diff --git
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
index e279968..631d2be 100644
---
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
+++
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
@@ -35,4 +35,45 @@
Assert.assertEquals(expected, StringUtil.toCamelCase(input));
Assert.assertEquals(expected, StringUtil.toCamelCase(input));
}
+
+ @Test
+ public void testFindLongestCommonDirPrefix_basic() {
+ java.util.List<String> paths =
java.util.Arrays.asList("storage/partition_0/Metadata/ns1/ds1/file1",
+ "storage/partition_0/Metadata/ns1/ds1/file2",
"storage/partition_0/Metadata/ns1/ds1/sub/file3");
+ String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+ Assert.assertEquals("storage/partition_0/Metadata/ns1/ds1/", prefix);
+ }
+
+ @Test
+ public void testFindLongestCommonDirPrefix_rootMismatch() {
+ java.util.List<String> paths = java.util.Arrays.asList("a/b/c/file1",
"x/y/z/file2");
+ String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+ Assert.assertEquals("", prefix);
+ }
+
+ @Test
+ public void testFindLongestCommonDirPrefix_partialDirBoundary() {
+ java.util.List<String> paths =
java.util.Arrays.asList("abc/defg/file1", "abc/defh/file2");
+ String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+ // common prefix is "abc/def" but directory boundary forces "abc/"
+ Assert.assertEquals("abc/", prefix);
+ }
+
+ @Test
+ public void testFindLongestCommonDirPrefix_single() {
+ java.util.List<String> paths =
java.util.Collections.singletonList("only/one/path/here");
+ String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+ Assert.assertEquals("only/one/path/", prefix);
+ }
+
+ @Test
+ public void testFindLongestCommonDirPrefix_emptyAndNull() {
+ String prefix1 =
StringUtil.findLongestCommonDirPrefix(java.util.Collections.emptyList());
+ Assert.assertEquals("", prefix1);
+ java.util.List<String> paths = new java.util.ArrayList<>();
+ paths.add(null);
+ paths.add("a/b");
+ String prefix2 = StringUtil.findLongestCommonDirPrefix(paths);
+ Assert.assertEquals("", prefix2);
+ }
}
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20503?usp=email
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings?usp=email
Gerrit-MessageType: newchange
Gerrit-Project: asterixdb
Gerrit-Branch: phoenix
Gerrit-Change-Id: Iaeb705b396aa72440759e4008c72f276c936c98e
Gerrit-Change-Number: 20503
Gerrit-PatchSet: 1
Gerrit-Owner: Ritik Raj <[email protected]>