From Peeyush Gupta <[email protected]>: Peeyush Gupta has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19807 )
Change subject: [ASTERIXDB-3612][EXT] Reading gzip file with multiple gzip streams ...................................................................... [ASTERIXDB-3612][EXT] Reading gzip file with multiple gzip streams - user model changes: no - storage format changes: no - interface changes: no Ext-ref: MB-66818 Change-Id: I47c2eb600dddba1198a92f5ff7dfc7f2da652c3e Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19807 Reviewed-by: Hussain Towaileb <[email protected]> Tested-by: Jenkins <[email protected]> --- M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/gcs/GCSInputStream.java A asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/AvailableInputStream.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/datalake/AzureDataLakeInputStream.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStream.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/blob/AzureBlobInputStream.java 5 files changed, 83 insertions(+), 4 deletions(-) Approvals: Hussain Towaileb: Looks good to me, approved Jenkins: Verified Objections: Anon. E. 
Moose #1000171: Violations found diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStream.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStream.java index e7b7b29..45a40b0 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStream.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStream.java @@ -31,6 +31,7 @@ import org.apache.asterix.common.exceptions.ErrorCode; import org.apache.asterix.common.exceptions.RuntimeDataException; import org.apache.asterix.external.input.record.reader.abstracts.AbstractExternalInputStream; +import org.apache.asterix.external.input.record.reader.stream.AvailableInputStream; import org.apache.asterix.external.util.ExternalDataConstants; import org.apache.asterix.external.util.ExternalDataUtils; import org.apache.commons.lang3.StringUtils; @@ -71,7 +72,7 @@ } // Use gzip stream if needed if (StringUtils.endsWithIgnoreCase(fileName, ".gz") || StringUtils.endsWithIgnoreCase(fileName, ".gzip")) { - in = new GZIPInputStream(in, ExternalDataConstants.DEFAULT_BUFFER_SIZE); + in = new GZIPInputStream(new AvailableInputStream(in), ExternalDataConstants.DEFAULT_BUFFER_SIZE); } return true; } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/blob/AzureBlobInputStream.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/blob/AzureBlobInputStream.java index cdb3834..567ceb7 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/blob/AzureBlobInputStream.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/blob/AzureBlobInputStream.java @@ -30,6 
+30,7 @@ import org.apache.asterix.common.exceptions.ErrorCode; import org.apache.asterix.common.exceptions.RuntimeDataException; import org.apache.asterix.external.input.record.reader.abstracts.AbstractExternalInputStream; +import org.apache.asterix.external.input.record.reader.stream.AvailableInputStream; import org.apache.asterix.external.util.ExternalDataConstants; import org.apache.asterix.external.util.ExternalDataUtils; import org.apache.hyracks.api.exceptions.HyracksDataException; @@ -66,7 +67,7 @@ // Use gzip stream if needed String lowerCaseFileName = fileName.toLowerCase(); if (lowerCaseFileName.endsWith(".gz") || lowerCaseFileName.endsWith(".gzip")) { - in = new GZIPInputStream(in, ExternalDataConstants.DEFAULT_BUFFER_SIZE); + in = new GZIPInputStream(new AvailableInputStream(in), ExternalDataConstants.DEFAULT_BUFFER_SIZE); } } catch (BlobStorageException ex) { if (ex.getErrorCode().equals(BlobErrorCode.BLOB_NOT_FOUND)) { diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/datalake/AzureDataLakeInputStream.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/datalake/AzureDataLakeInputStream.java index e34d188..5fa8fd8 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/datalake/AzureDataLakeInputStream.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/azure/datalake/AzureDataLakeInputStream.java @@ -30,6 +30,7 @@ import org.apache.asterix.common.exceptions.ErrorCode; import org.apache.asterix.common.exceptions.RuntimeDataException; import org.apache.asterix.external.input.record.reader.abstracts.AbstractExternalInputStream; +import org.apache.asterix.external.input.record.reader.stream.AvailableInputStream; import org.apache.asterix.external.util.ExternalDataConstants; import 
org.apache.asterix.external.util.ExternalDataUtils; import org.apache.hyracks.api.exceptions.HyracksDataException; @@ -66,7 +67,7 @@ // Use gzip stream if needed String lowerCaseFileName = fileName.toLowerCase(); if (lowerCaseFileName.endsWith(".gz") || lowerCaseFileName.endsWith(".gzip")) { - in = new GZIPInputStream(in, ExternalDataConstants.DEFAULT_BUFFER_SIZE); + in = new GZIPInputStream(new AvailableInputStream(in), ExternalDataConstants.DEFAULT_BUFFER_SIZE); } } catch (BlobStorageException ex) { if (ex.getErrorCode().equals(BlobErrorCode.BLOB_NOT_FOUND)) { diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/gcs/GCSInputStream.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/gcs/GCSInputStream.java index 5da4583..f154ee8 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/gcs/GCSInputStream.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/gcs/GCSInputStream.java @@ -31,6 +31,7 @@ import org.apache.asterix.common.exceptions.ErrorCode; import org.apache.asterix.common.exceptions.RuntimeDataException; import org.apache.asterix.external.input.record.reader.abstracts.AbstractExternalInputStream; +import org.apache.asterix.external.input.record.reader.stream.AvailableInputStream; import org.apache.asterix.external.util.ExternalDataConstants; import org.apache.asterix.external.util.ExternalDataUtils; import org.apache.commons.lang3.StringUtils; @@ -67,7 +68,7 @@ // Use gzip stream if needed if (StringUtils.endsWithIgnoreCase(fileName, ".gz") || StringUtils.endsWithIgnoreCase(fileName, ".gzip")) { - in = new GZIPInputStream(in, ExternalDataConstants.DEFAULT_BUFFER_SIZE); + in = new GZIPInputStream(new AvailableInputStream(in), ExternalDataConstants.DEFAULT_BUFFER_SIZE); } return true; } diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/AvailableInputStream.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/AvailableInputStream.java new file mode 100644 index 0000000..22f8df4 --- /dev/null +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/AvailableInputStream.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.external.input.record.reader.stream; + +import java.io.IOException; +import java.io.InputStream; + +public class AvailableInputStream extends InputStream { + private final InputStream is; + + public AvailableInputStream(InputStream inputstream) { + is = inputstream; + } + + public int read() throws IOException { + return (is.read()); + } + + public int read(byte[] b) throws IOException { + return (is.read(b)); + } + + public int read(byte[] b, int off, int len) throws IOException { + return (is.read(b, off, len)); + } + + public void close() throws IOException { + is.close(); + } + + public int available() throws IOException { + // Always say that we have 1 more byte in the + // buffer, even when we don't + int a = is.available(); + if (a == 0) { + return (1); + } else { + return (a); + } + } +} -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19807 To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-Project: asterixdb Gerrit-Branch: neo Gerrit-Change-Id: I47c2eb600dddba1198a92f5ff7dfc7f2da652c3e Gerrit-Change-Number: 19807 Gerrit-PatchSet: 5 Gerrit-Owner: Peeyush Gupta <[email protected]> Gerrit-Reviewer: Ali Alsuliman <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Hussain Towaileb <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Murtadha Hubail <[email protected]> Gerrit-Reviewer: Peeyush Gupta <[email protected]> Gerrit-MessageType: merged
