>From Ritik Raj <[email protected]>:

Ritik Raj has uploaded this change for review. ( 
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20503?usp=email )


Change subject: [NO ISSUE][CLOUD] Updating deleteObjects for AzureBlob
......................................................................

[NO ISSUE][CLOUD] Updating deleteObjects for AzureBlob

- user model changes: no
- storage format changes: no
- interface changes: no

Details:
Azure's bulk delete fails if any of the object among
the candidates do not exists.
Hence, adding a thresold of 38 requests, if the paths
are less than that, then individual deleteIfExists,
otherwise call LIST over the common prefix and
delete the objects.

Ext-ref: MB-68999

Change-Id: Iaeb705b396aa72440759e4008c72f276c936c98e
---
M 
asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
M 
asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
M 
hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
M 
hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
4 files changed, 186 insertions(+), 35 deletions(-)



  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb 
refs/changes/03/20503/1

diff --git 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
index 889a6a4..4630d1b 100644
--- 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
+++ 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageClientConfig.java
@@ -31,6 +31,26 @@
 public class AzBlobStorageClientConfig {
     // Ref: 
https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch?tabs=microsoft-entra-id
     static final int DELETE_BATCH_SIZE = 256;
+    /*
+     * Performance Analysis: Individual deleteIfExists() vs Batch Delete
+     *
+     * Individual approach: N × 8ms (one DELETE per blob)
+     * Batch approach with LIST verification: 300ms (LIST entire container) + 
(N / 256) × 8ms (batch deletes)
+     * Batch approach with exists() checks: N × 8ms (HEAD requests) + (N / 
256) × 8ms (batch deletes)
+     *
+     * Break-even calculation (Individual vs LIST + Batch):
+     *   N × 8 = 300 + (N / 256) × 8
+     *   N × 8 × (1 - 1/256) = 300
+     *   N × 7.96875 = 300
+     *   N ≈ 38 blobs
+     *
+     * Conclusion: Individual deleteIfExists() is faster for < 38 blobs, batch 
is faster for > 38 blobs.
+     * However, the exists() check approach (N × 8ms + batch overhead) is 
always slower than individual.
+     *
+     * For typical collection creation (deleting 1 mask file), individual 
deleteIfExists() is optimal.
+     * Using deleteIfExists() avoids the expensive LIST operation (300ms → 
8ms, ~37x speedup).
+     */
+    static final int DELETE_INDIVIDUAL_OR_BATCH_THRESHOLD = 38;

     private final int writeBufferSize;
     private final String region;
diff --git 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
index 4a61f1c..fc95354 100644
--- 
a/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
+++ 
b/asterixdb/asterix-cloud/src/main/java/org/apache/asterix/cloud/clients/azure/blobstorage/AzBlobStorageCloudClient.java
@@ -20,6 +20,7 @@
 package org.apache.asterix.cloud.clients.azure.blobstorage;

 import static 
org.apache.asterix.cloud.clients.azure.blobstorage.AzBlobStorageClientConfig.DELETE_BATCH_SIZE;
+import static 
org.apache.asterix.cloud.clients.azure.blobstorage.AzBlobStorageClientConfig.DELETE_INDIVIDUAL_OR_BATCH_THRESHOLD;

 import java.io.ByteArrayOutputStream;
 import java.io.FilenameFilter;
@@ -55,6 +56,7 @@
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 import org.apache.hyracks.api.io.FileReference;
 import org.apache.hyracks.control.nc.io.IOManager;
+import org.apache.hyracks.util.StringUtil;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;

@@ -254,7 +256,8 @@
         String srcBlobUrl = srcBlobClient.getBlobUrl();
         profiler.objectCopy();
         guardian.checkWriteAccess(bucket, destPath.getRelativePath());
-        BlobClient destBlobClient = 
blobContainerClient.getBlobClient(destPath.getFile().getPath());
+        BlobClient destBlobClient =
+                blobContainerClient.getBlobClient(config.getPrefix() + 
destPath.getFile().getPath());
         destBlobClient.beginCopy(srcBlobUrl, null);
     }

@@ -277,37 +280,96 @@
     public void deleteObjects(String bucket, Collection<String> paths) throws 
HyracksDataException {
         if (paths.isEmpty())
             return;
-        Set<BlobItem> blobsToDelete = getBlobsMatchingThesePaths(paths);
-        List<String> blobURLs = getBlobURLs(blobsToDelete);
-        if (blobURLs.isEmpty())
-            return;
-        Collection<List<String>> batchedBlobURLs = 
getBatchedBlobURLs(blobURLs);
-        for (List<String> batch : batchedBlobURLs) {
-            PagedIterable<Response<Void>> responses = 
blobBatchClient.deleteBlobs(batch, null);
-            Iterator<String> deletePathIter = paths.iterator();
-            String deletedPath;
-            String failedDeletedPath = null;
-            for (Response<Void> response : responses) {
-                deletedPath = deletePathIter.next();
-                // The response.getStatusCode() method returns:
-                // - 202 (Accepted) if the delete operation is successful
-                // - exception if the delete operation fails
-                int statusCode = response.getStatusCode();
-                if (statusCode != SUCCESS_RESPONSE_CODE) {
-                    LOGGER.warn("Failed to delete blob: {} with status code: 
{} while deleting {}", deletedPath,
-                            statusCode, paths.toString());
-                    if (failedDeletedPath == null) {
-                        failedDeletedPath = deletedPath;
-                    }
-                }
+
+        // For small deletes, use individual deleteIfExists (efficient and 
handles non-existent blobs)
+        // For large deletes, batch operations might be worth it despite the 
overhead
+        guardian.checkWriteAccess(bucket, null);
+        if (paths.size() < DELETE_INDIVIDUAL_OR_BATCH_THRESHOLD) {
+            deleteObjectsIndividually(bucket, paths);
+        } else {
+            deleteObjectsWithListAndBatch(bucket, paths);
+        }
+    }
+
+    private void deleteObjectsIndividually(String bucket, Collection<String> 
paths) throws HyracksDataException {
+        for (String path : paths) {
+            if (path == null || path.isEmpty()) {
+                continue;
             }
-            if (failedDeletedPath != null) {
-                throw new RuntimeDataException(ErrorCode.CLOUD_IO_FAILURE, 
"DELETE", failedDeletedPath,
-                        paths.toString());
+            try {
+                profiler.objectDelete();
+                BlobClient blobClient = 
blobContainerClient.getBlobClient(config.getPrefix() + path);
+                blobClient.deleteIfExists();
+            } catch (Exception ex) {
+                throw HyracksDataException.create(ex);
             }
         }
     }

+    private void deleteObjectsWithListAndBatch(String bucket, 
Collection<String> paths) throws HyracksDataException {
+        String commonPrefix = StringUtil.findLongestCommonDirPrefix(paths);
+
+        // Build a lookup of targeted blob names (with configured prefix)
+        Set<String> targetNames = new java.util.HashSet<>();
+        for (String path : paths) {
+            if (path != null && !path.isEmpty()) {
+                targetNames.add(config.getPrefix() + path);
+            }
+        }
+
+        // List only under the commonPrefix to avoid full-container LIST
+        String listPrefix = config.getPrefix() + commonPrefix;
+        ListBlobsOptions options = new ListBlobsOptions().setPrefix(listPrefix)
+                .setDetails(new BlobListDetails().setRetrieveMetadata(false));
+
+        PagedIterable<BlobItem> blobItems = 
blobContainerClient.listBlobs(options, null);
+        List<String> namesToDelete = new ArrayList<>();
+        for (BlobItem bi : blobItems) {
+            String name = bi.getName();
+            if (targetNames.contains(name)) {
+                namesToDelete.add(name);
+            }
+        }
+
+        if (namesToDelete.isEmpty()) {
+            LOGGER.debug("No existing blobs to delete under prefix {}", 
commonPrefix);
+            return;
+        }
+
+        // Convert names to full URLs and batch-delete
+        List<String> blobURLs = toBlobUrlsFromNames(namesToDelete);
+        Collection<List<String>> batchedBlobURLs = 
getBatchedBlobURLs(blobURLs);
+        for (List<String> batchUrls : batchedBlobURLs) {
+            profiler.objectDelete();
+            PagedIterable<Response<Void>> responses = 
blobBatchClient.deleteBlobs(batchUrls, null);
+            int i = 0;
+            String failed = null;
+            for (Response<Void> response : responses) {
+                int statusCode = response.getStatusCode();
+                if (statusCode != SUCCESS_RESPONSE_CODE) {
+                    String failedUrl = batchUrls.get(i);
+                    LOGGER.warn("Failed to delete blob URL {} with status code 
{}", failedUrl, statusCode);
+                    if (failed == null) {
+                        failed = failedUrl;
+                    }
+                }
+                i++;
+            }
+            if (failed != null) {
+                throw new RuntimeDataException(ErrorCode.CLOUD_IO_FAILURE, 
"DELETE", failed, batchUrls.toString());
+            }
+        }
+    }
+
+    private List<String> toBlobUrlsFromNames(Collection<String> blobNames) {
+        final String blobURLPrefix = blobContainerClient.getBlobContainerUrl() 
+ "/";
+        List<String> urls = new ArrayList<>(blobNames.size());
+        for (String name : blobNames) {
+            urls.add(blobURLPrefix + name);
+        }
+        return urls;
+    }
+
     private Collection<List<String>> getBatchedBlobURLs(List<String> blobURLs) 
{
         int startIdx = 0;
         Collection<List<String>> batchedBLOBURLs = new ArrayList<>();
@@ -324,14 +386,6 @@
         return batchedBLOBURLs;
     }

-    private Set<BlobItem> getBlobsMatchingThesePaths(Collection<String> paths) 
{
-        List<String> pathWithPrefix =
-                paths.stream().map(path -> config.getPrefix() + 
path).collect(Collectors.toList());
-        PagedIterable<BlobItem> blobItems = blobContainerClient.listBlobs();
-        return blobItems.stream().filter(blobItem -> 
pathWithPrefix.contains(blobItem.getName()))
-                .collect(Collectors.toSet());
-    }
-
     @Override
     public long getObjectSize(String bucket, String path) throws 
HyracksDataException {
         guardian.checkReadAccess(bucket, path);
diff --git 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
 
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
index df33f6b..56efbda 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/StringUtil.java
@@ -67,4 +67,40 @@
     public static ICharAccessor<byte[]> getByteArrayAsCharAccessor() {
         return (input, index) -> (char) input[index];
     }
+
+    /**
+     * Finds the longest common directory prefix among the given POSIX-style 
paths.
+     * The returned prefix always ends at a directory boundary (i.e., after 
the last '/'),
+     * or empty string if no common directory exists.
+     */
+    public static String findLongestCommonDirPrefix(Iterable<String> paths) {
+        if (paths == null) {
+            return "";
+        }
+        java.util.Iterator<String> it = paths.iterator();
+        if (!it.hasNext()) {
+            return "";
+        }
+        String prefix = it.next();
+        while (it.hasNext() && prefix != null && !prefix.isEmpty()) {
+            prefix = commonPrefix(prefix, it.next());
+        }
+        if (prefix == null || prefix.isEmpty()) {
+            return "";
+        }
+        int lastSlash = prefix.lastIndexOf('/');
+        return lastSlash >= 0 ? prefix.substring(0, lastSlash + 1) : "";
+    }
+
+    private static String commonPrefix(String a, String b) {
+        if (a == null || b == null) {
+            return "";
+        }
+        int len = Math.min(a.length(), b.length());
+        int i = 0;
+        while (i < len && a.charAt(i) == b.charAt(i)) {
+            i++;
+        }
+        return a.substring(0, i);
+    }
 }
diff --git 
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
 
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
index e279968..631d2be 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/StringUtilTest.java
@@ -35,4 +35,45 @@
         Assert.assertEquals(expected, StringUtil.toCamelCase(input));
         Assert.assertEquals(expected, StringUtil.toCamelCase(input));
     }
+
+    @Test
+    public void testFindLongestCommonDirPrefix_basic() {
+        java.util.List<String> paths = 
java.util.Arrays.asList("storage/partition_0/Metadata/ns1/ds1/file1",
+                "storage/partition_0/Metadata/ns1/ds1/file2", 
"storage/partition_0/Metadata/ns1/ds1/sub/file3");
+        String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+        Assert.assertEquals("storage/partition_0/Metadata/ns1/ds1/", prefix);
+    }
+
+    @Test
+    public void testFindLongestCommonDirPrefix_rootMismatch() {
+        java.util.List<String> paths = java.util.Arrays.asList("a/b/c/file1", 
"x/y/z/file2");
+        String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+        Assert.assertEquals("", prefix);
+    }
+
+    @Test
+    public void testFindLongestCommonDirPrefix_partialDirBoundary() {
+        java.util.List<String> paths = 
java.util.Arrays.asList("abc/defg/file1", "abc/defh/file2");
+        String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+        // common prefix is "abc/def" but directory boundary forces "abc/"
+        Assert.assertEquals("abc/", prefix);
+    }
+
+    @Test
+    public void testFindLongestCommonDirPrefix_single() {
+        java.util.List<String> paths = 
java.util.Collections.singletonList("only/one/path/here");
+        String prefix = StringUtil.findLongestCommonDirPrefix(paths);
+        Assert.assertEquals("only/one/path/", prefix);
+    }
+
+    @Test
+    public void testFindLongestCommonDirPrefix_emptyAndNull() {
+        String prefix1 = 
StringUtil.findLongestCommonDirPrefix(java.util.Collections.emptyList());
+        Assert.assertEquals("", prefix1);
+        java.util.List<String> paths = new java.util.ArrayList<>();
+        paths.add(null);
+        paths.add("a/b");
+        String prefix2 = StringUtil.findLongestCommonDirPrefix(paths);
+        Assert.assertEquals("", prefix2);
+    }
 }

--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20503?usp=email
To unsubscribe, or for help writing mail filters, visit 
https://asterix-gerrit.ics.uci.edu/settings?usp=email

Gerrit-MessageType: newchange
Gerrit-Project: asterixdb
Gerrit-Branch: phoenix
Gerrit-Change-Id: Iaeb705b396aa72440759e4008c72f276c936c98e
Gerrit-Change-Number: 20503
Gerrit-PatchSet: 1
Gerrit-Owner: Ritik Raj <[email protected]>

Reply via email to