This is an automated email from the ASF dual-hosted git repository. mhubail pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
The following commit(s) were added to refs/heads/master by this push: new 381c707930 [NO ISSUE]: Support Reading Single Depth Files/Folder from S3 Container 381c707930 is described below commit 381c7079309572c35eee46142c5fcbcec1151de0 Author: utsavCbase <utsav.si...@couchbase.com> AuthorDate: Thu Mar 21 13:38:27 2024 +0530 [NO ISSUE]: Support Reading Single Depth Files/Folder from S3 Container Change-Id: Iae9d85eb9899419e63c86322cd8da2273adf89c6 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18206 Reviewed-by: Utsav Singh <utsav.si...@couchbase.com> Reviewed-by: Hussain Towaileb <hussai...@gmail.com> Integration-Tests: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Tested-by: Jenkins <jenk...@fulliautomatix.ics.uci.edu> --- .../external_dataset/ExternalDatasetTestUtils.java | 38 +++++++++++++++ .../aws/AwsS3ExternalDatasetTest.java | 1 + .../asterix/external/util/aws/s3/S3Utils.java | 57 ++++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java index 55a515c879..34d209eb7a 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java @@ -20,6 +20,7 @@ package org.apache.asterix.test.external_dataset; import static org.apache.asterix.test.external_dataset.avro.AvroFileConverterUtil.AVRO_GEN_BASEDIR; import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BOM_FILE_CONTAINER; +import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BROWSE_CONTAINER; import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.DYNAMIC_PREFIX_AT_START_CONTAINER; import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.FIXED_DATA_CONTAINER; import static org.apache.asterix.test.external_dataset.parquet.BinaryFileConverterUtil.BINARY_GEN_BASEDIR; @@ -78,6 +79,7 @@ public class ExternalDatasetTestUtils { private static Uploader fixedDataLoader; private static Uploader mixedDataLoader; private static Uploader bomFileLoader; + private static Uploader browseDataLoader; protected TestCaseContext tcCtx; @@ -148,6 +150,16 @@ public class ExternalDatasetTestUtils { ExternalDatasetTestUtils.bomFileLoader = bomFileLoader; } + public static void setUploaders(Uploader playgroundDataLoader, Uploader dynamicPrefixAtStartDataLoader, + Uploader fixedDataLoader, Uploader mixedDataLoader, Uploader bomFileLoader, Uploader browseDataLoader) { + ExternalDatasetTestUtils.playgroundDataLoader = playgroundDataLoader; + ExternalDatasetTestUtils.dynamicPrefixAtStartDataLoader = dynamicPrefixAtStartDataLoader; + ExternalDatasetTestUtils.fixedDataLoader = fixedDataLoader; + ExternalDatasetTestUtils.mixedDataLoader = mixedDataLoader; + ExternalDatasetTestUtils.bomFileLoader = bomFileLoader; + ExternalDatasetTestUtils.browseDataLoader = browseDataLoader; + } + /** * Creates a bucket and fills it with some files for testing purpose. */ @@ -183,6 +195,32 @@ public class ExternalDatasetTestUtils { LOGGER.info("Files added successfully"); } + public static void prepareBrowseContainer() { + /* + file hierarchy inside browse container + browse/1.json + browse/2.json + browse/level1/3.json + browse/level1/4.json + browse/level1/level2/5.json + browse/level2/level3/6.json + */ + // -- todo:Utsav add a test for Browse S3 path which returns multiple folders, skipped for now as S3 mock server does not support this. + LOGGER.info("Adding JSON files to " + BROWSE_CONTAINER); + browseDataLoader.upload("1.json", "{\"id\":" + 1 + "}"); + browseDataLoader.upload("2.json", "{\"id\":" + 2 + "}"); + browseDataLoader.upload("level1/3.json", "{\"id\":" + 3 + "}"); + browseDataLoader.upload("level1/4.json", "{\"id\":" + 4 + "}"); + browseDataLoader.upload("level1/level2/5.json", "{\"id\":" + 5 + "}"); + browseDataLoader.upload("level2/level3/6.json", "{\"id\":" + 6 + "}"); + + //Adding 1000+ files + for (int i = 1; i <= 1500; i++) { + browseDataLoader.upload("level3/" + i + ".json", "{\"id\":" + i + "}"); + } + LOGGER.info("JSON Files added successfully"); + } + /** * Special container where dynamic prefix is the first segment */ diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java index 7912d57960..9a2bfbcc7c 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java @@ -147,6 +147,7 @@ public class AwsS3ExternalDatasetTest { protected TestCaseContext tcCtx; public static final String PLAYGROUND_CONTAINER = "playground"; + public static final String BROWSE_CONTAINER = "browse"; public static final String DYNAMIC_PREFIX_AT_START_CONTAINER = "dynamic-prefix-at-start-container"; public static final String FIXED_DATA_CONTAINER = "fixed-data"; // Do not use, has fixed data public static final String INCLUDE_EXCLUDE_CONTAINER = "include-exclude"; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java index 45e83b4216..6a16913258 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java @@ -48,6 +48,7 @@ import static org.apache.hyracks.api.util.ExceptionUtils.getMessageOrToString; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -81,6 +82,7 @@ import software.amazon.awssdk.core.exception.SdkException; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3ClientBuilder; +import software.amazon.awssdk.services.s3.model.CommonPrefix; import software.amazon.awssdk.services.s3.model.ListObjectsRequest; import software.amazon.awssdk.services.s3.model.ListObjectsResponse; import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; @@ -88,6 +90,7 @@ import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; import software.amazon.awssdk.services.s3.model.S3Exception; import software.amazon.awssdk.services.s3.model.S3Object; import software.amazon.awssdk.services.s3.model.S3Response; +import software.amazon.awssdk.services.s3.paginators.ListObjectsV2Iterable; public class S3Utils { private S3Utils() { @@ -519,4 +522,58 @@ public class S3Utils { } } } + + public static Map<String, List<String>> S3ObjectsOfSingleDepth(Map<String, String> configuration, String container, + String prefix) throws CompilationException, HyracksDataException { + // create s3 client + S3Client s3Client = buildAwsS3Client(configuration); + // fetch all the s3 objects + return listS3ObjectsOfSingleDepth(s3Client, container, prefix); + } + + /** + * Uses the latest API to retrieve the objects from the storage of a single level. + * + * @param s3Client S3 client + * @param container container name + * @param prefix definition prefix + */ + private static Map<String, List<String>> listS3ObjectsOfSingleDepth(S3Client s3Client, String container, + String prefix) throws HyracksDataException { + Map<String, List<String>> allObjects = new HashMap<>(); + ListObjectsV2Iterable listObjectsInterable; + ListObjectsV2Request.Builder listObjectsBuilder = + ListObjectsV2Request.builder().bucket(container).prefix(prefix).delimiter("/"); + listObjectsBuilder.prefix(prefix); + List<String> files = new ArrayList<>(); + List<String> folders = new ArrayList<>(); + // to skip the prefix as a file from the response + boolean checkPrefixInFile = true; + listObjectsInterable = s3Client.listObjectsV2Paginator(listObjectsBuilder.build()); + for (ListObjectsV2Response response : listObjectsInterable) { + // put all the files + for (S3Object object : response.contents()) { + String fileName = object.key(); + fileName = fileName.substring(prefix.length(), fileName.length()); + if (checkPrefixInFile) { + if (prefix.equals(object.key())) + checkPrefixInFile = false; + else { + files.add(fileName); + } + } else { + files.add(fileName); + } + } + // put all the folders + for (CommonPrefix object : response.commonPrefixes()) { + String folderName = object.prefix(); + folderName = folderName.substring(prefix.length(), folderName.length()); + folders.add(folderName.endsWith("/") ? folderName.substring(0, folderName.length() - 1) : folderName); + } + } + allObjects.put("files", files); + allObjects.put("folders", folders); + return allObjects; + } }