From Michael Blow <[email protected]>: Michael Blow has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18206 )
Change subject: [NO ISSUE]: Support Reading Single Depth Files/Folder from S3 Container ...................................................................... [NO ISSUE]: Support Reading Single Depth Files/Folder from S3 Container Change-Id: Iae9d85eb9899419e63c86322cd8da2273adf89c6 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18206 Reviewed-by: Utsav Singh <[email protected]> Reviewed-by: Hussain Towaileb <[email protected]> Integration-Tests: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> --- M asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java M asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java 3 files changed, 110 insertions(+), 0 deletions(-) Approvals: Hussain Towaileb: Looks good to me, approved Utsav Singh: Looks good to me, but someone else must approve Jenkins: Verified; Verified diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java index 55a515c..34d209e 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java @@ -20,6 +20,7 @@ import static org.apache.asterix.test.external_dataset.avro.AvroFileConverterUtil.AVRO_GEN_BASEDIR; import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BOM_FILE_CONTAINER; +import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BROWSE_CONTAINER; import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.DYNAMIC_PREFIX_AT_START_CONTAINER; import static 
org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.FIXED_DATA_CONTAINER; import static org.apache.asterix.test.external_dataset.parquet.BinaryFileConverterUtil.BINARY_GEN_BASEDIR; @@ -78,6 +79,7 @@ private static Uploader fixedDataLoader; private static Uploader mixedDataLoader; private static Uploader bomFileLoader; + private static Uploader browseDataLoader; protected TestCaseContext tcCtx; @@ -148,6 +150,16 @@ ExternalDatasetTestUtils.bomFileLoader = bomFileLoader; } + public static void setUploaders(Uploader playgroundDataLoader, Uploader dynamicPrefixAtStartDataLoader, + Uploader fixedDataLoader, Uploader mixedDataLoader, Uploader bomFileLoader, Uploader browseDataLoader) { + ExternalDatasetTestUtils.playgroundDataLoader = playgroundDataLoader; + ExternalDatasetTestUtils.dynamicPrefixAtStartDataLoader = dynamicPrefixAtStartDataLoader; + ExternalDatasetTestUtils.fixedDataLoader = fixedDataLoader; + ExternalDatasetTestUtils.mixedDataLoader = mixedDataLoader; + ExternalDatasetTestUtils.bomFileLoader = bomFileLoader; + ExternalDatasetTestUtils.browseDataLoader = browseDataLoader; + } + /** * Creates a bucket and fills it with some files for testing purpose. */ @@ -183,6 +195,32 @@ LOGGER.info("Files added successfully"); } + public static void prepareBrowseContainer() { + /* + file hierarchy inside browse container + browse/1.json + browse/2.json + browse/level1/3.json + browse/level1/4.json + browse/level1/level2/5.json + browse/level2/level3/6.json + */ + // -- todo:Utsav add a test for Browse S3 path which returns multiple folders, skipped for now as S3 mock server does not support this. 
+ LOGGER.info("Adding JSON files to " + BROWSE_CONTAINER); + browseDataLoader.upload("1.json", "{\"id\":" + 1 + "}"); + browseDataLoader.upload("2.json", "{\"id\":" + 2 + "}"); + browseDataLoader.upload("level1/3.json", "{\"id\":" + 3 + "}"); + browseDataLoader.upload("level1/4.json", "{\"id\":" + 4 + "}"); + browseDataLoader.upload("level1/level2/5.json", "{\"id\":" + 5 + "}"); + browseDataLoader.upload("level2/level3/6.json", "{\"id\":" + 6 + "}"); + + //Adding 1000+ files + for (int i = 1; i <= 1500; i++) { + browseDataLoader.upload("level3/" + i + ".json", "{\"id\":" + i + "}"); + } + LOGGER.info("JSON Files added successfully"); + } + /** * Special container where dynamic prefix is the first segment */ diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java index 7912d57..9a2bfbc 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java @@ -147,6 +147,7 @@ protected TestCaseContext tcCtx; public static final String PLAYGROUND_CONTAINER = "playground"; + public static final String BROWSE_CONTAINER = "browse"; public static final String DYNAMIC_PREFIX_AT_START_CONTAINER = "dynamic-prefix-at-start-container"; public static final String FIXED_DATA_CONTAINER = "fixed-data"; // Do not use, has fixed data public static final String INCLUDE_EXCLUDE_CONTAINER = "include-exclude"; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java index 45e83b4..6a16913 100644 --- 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java @@ -48,6 +48,7 @@ import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -81,6 +82,7 @@ import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3ClientBuilder; +import software.amazon.awssdk.services.s3.model.CommonPrefix; import software.amazon.awssdk.services.s3.model.ListObjectsRequest; import software.amazon.awssdk.services.s3.model.ListObjectsResponse; import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; @@ -88,6 +90,7 @@ import software.amazon.awssdk.services.s3.model.S3Exception; import software.amazon.awssdk.services.s3.model.S3Object; import software.amazon.awssdk.services.s3.model.S3Response; +import software.amazon.awssdk.services.s3.paginators.ListObjectsV2Iterable; public class S3Utils { private S3Utils() { @@ -519,4 +522,58 @@ } } } + + public static Map<String, List<String>> S3ObjectsOfSingleDepth(Map<String, String> configuration, String container, + String prefix) throws CompilationException, HyracksDataException { + // create s3 client + S3Client s3Client = buildAwsS3Client(configuration); + // fetch all the s3 objects + return listS3ObjectsOfSingleDepth(s3Client, container, prefix); + } + + /** + * Uses the latest API to retrieve the objects from the storage of a single level. 
+ * + * @param s3Client S3 client + * @param container container name + * @param prefix definition prefix + */ + private static Map<String, List<String>> listS3ObjectsOfSingleDepth(S3Client s3Client, String container, + String prefix) throws HyracksDataException { + Map<String, List<String>> allObjects = new HashMap<>(); + ListObjectsV2Iterable listObjectsInterable; + ListObjectsV2Request.Builder listObjectsBuilder = + ListObjectsV2Request.builder().bucket(container).prefix(prefix).delimiter("/"); + listObjectsBuilder.prefix(prefix); + List<String> files = new ArrayList<>(); + List<String> folders = new ArrayList<>(); + // to skip the prefix as a file from the response + boolean checkPrefixInFile = true; + listObjectsInterable = s3Client.listObjectsV2Paginator(listObjectsBuilder.build()); + for (ListObjectsV2Response response : listObjectsInterable) { + // put all the files + for (S3Object object : response.contents()) { + String fileName = object.key(); + fileName = fileName.substring(prefix.length(), fileName.length()); + if (checkPrefixInFile) { + if (prefix.equals(object.key())) + checkPrefixInFile = false; + else { + files.add(fileName); + } + } else { + files.add(fileName); + } + } + // put all the folders + for (CommonPrefix object : response.commonPrefixes()) { + String folderName = object.prefix(); + folderName = folderName.substring(prefix.length(), folderName.length()); + folders.add(folderName.endsWith("/") ? 
folderName.substring(0, folderName.length() - 1) : folderName); + } + } + allObjects.put("files", files); + allObjects.put("folders", folders); + return allObjects; + } } -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18206 To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Change-Id: Iae9d85eb9899419e63c86322cd8da2273adf89c6 Gerrit-Change-Number: 18206 Gerrit-PatchSet: 5 Gerrit-Owner: Utsav Singh <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Hussain Towaileb <[email protected]> Gerrit-Reviewer: Hussain Towaileb <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Michael Blow <[email protected]> Gerrit-Reviewer: Utsav Singh <[email protected]> Gerrit-MessageType: merged
