This is an automated email from the ASF dual-hosted git repository. bodewig pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-compress.git
commit 03b7d1ecec4811fe7e6ae63eacb1e63291eaaf6e Merge: f7f7c53 50569e5 Author: Stefan Bodewig <[email protected]> AuthorDate: Wed Jan 1 17:26:07 2020 +0100 Merge branch 'COMPRESS-124' of https://github.com/PeterAlfreadLee/commons-compress into PeterAlfreadLee-COMPRESS-124 .../compress/archivers/tar/TarArchiveEntry.java | 53 ++- .../archivers/tar/TarArchiveInputStream.java | 417 +++++++++++++++++++-- .../archivers/tar/TarArchiveSparseEntry.java | 19 + .../archivers/tar/TarArchiveStructSparse.java | 81 ++++ .../compress/archivers/tar/TarConstants.java | 24 ++ .../commons/compress/archivers/tar/TarUtils.java | 15 +- .../commons/compress/utils/BoundedInputStream.java | 9 + .../compress/archivers/tar/SparseFilesTest.java | 205 +++++++++- .../archivers/tar/TarArchiveInputStreamTest.java | 10 +- .../compress/archivers/tar/TarUtilsTest.java | 12 + src/test/resources/oldgnu_extended_sparse.tar | Bin 0 -> 10240 bytes 11 files changed, 806 insertions(+), 39 deletions(-) diff --cc src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java index 65b7e32,72b6653..0f6b70f --- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java +++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java @@@ -66,8 -66,16 +66,16 @@@ public class TarArchiveInputStream exte private long entryOffset; /** An input stream to read from */ - private final InputStream is; + private final InputStream inputStream; + /** Input streams for reading sparse entries **/ + private List<InputStream> sparseInputStreams; + + /** the index of current input stream being read when reading sparse entries */ + private int currentSparseInputStreamIndex; + + private InputStream sparseInputStream; + /** The meta-data about the current entry */ private TarArchiveEntry currEntry; @@@ -185,7 -196,14 +196,14 @@@ */ @Override public void close() throws IOException { + // Close all the input streams in sparseInputStreams + if(sparseInputStreams != null) { + for (InputStream inputStream : sparseInputStreams) { + inputStream.close(); + } + } + - is.close(); + inputStream.close(); } /** @@@ -243,14 -262,48 +262,48 @@@ return 0; } - final long available = entrySize - entryOffset; - final long skipped = IOUtils.skip(inputStream, Math.min(n, available)); - long available = currEntry.getRealSize() - entryOffset; - long skipped; ++ final long available = currEntry.getRealSize() - entryOffset; ++ final long skipped; + if(!currEntry.isSparse()) { - skipped = IOUtils.skip(is, Math.min(n, available)); ++ skipped = IOUtils.skip(inputStream, Math.min(n, available)); + } else { + skipped = skipSparse(n); + } count(skipped); entryOffset += skipped; return skipped; } /** + * Skip n bytes from current input stream, if the current input stream doesn't have enough data to skip, + * jump to the next input stream and skip the rest bytes, keep doing this until total n bytes are skipped + * or the input streams are all skipped + * + * @param n bytes of data to skip + * @return actual bytes of data skipped + * @throws IOException + */ + private long skipSparse(final long n) throws IOException { + if (sparseInputStreams.size() == 0) { - return is.skip(n); ++ return inputStream.skip(n); + } + + long bytesSkipped = 0; + InputStream currentInputStream; + + while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) { + currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex); + bytesSkipped += currentInputStream.skip(n - bytesSkipped); + + if (bytesSkipped < n) { + currentSparseInputStreamIndex++; + } + } + + return bytesSkipped; + } + + /** * Since we do not support marking just yet, we return false. * * @return False. @@@ -470,20 -523,164 +523,164 @@@ getNextEntry(); // Get the actual file entry } + /** + * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes) + * may appear multi times, and they look like: + * + * GNU.sparse.size=size + * GNU.sparse.numblocks=numblocks + * repeat numblocks times + * GNU.sparse.offset=offset + * GNU.sparse.numbytes=numbytes + * end repeat + * + * + * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map + * + * GNU.sparse.map + * Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]" + * + * + * For PAX Format 1.X: + * The sparse map itself is stored in the file data block, preceding the actual file data. + * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary. + * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers + * giving the offset and size of the data block it describes. + * @throws IOException + */ private void paxHeaders() throws IOException{ - final Map<String, String> headers = parsePaxHeaders(this); + List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>(); + final Map<String, String> headers = parsePaxHeaders(this, sparseHeaders); + + // for 0.1 PAX Headers + if (headers.containsKey("GNU.sparse.map")) { + sparseHeaders = parsePAX01SparseHeaders(headers.get("GNU.sparse.map")); + } getNextEntry(); // Get the actual file entry - applyPaxHeadersToCurrentEntry(headers); + applyPaxHeadersToCurrentEntry(headers, sparseHeaders); + + // for 1.0 PAX Format, the sparse map is stored in the file data block + if(currEntry.isPaxGNU1XSparse()) { + sparseHeaders = parsePAX1XSparseHeaders(); + currEntry.setSparseHeaders(sparseHeaders); + } + + // sparse headers are all done reading, we need to build + // sparse input streams using these sparse headers + buildSparseInputStreams(); } - // NOTE, using a Map here makes it impossible to ever support GNU - // sparse files using the PAX Format 0.0, see - // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188 - Map<String, String> parsePaxHeaders(final InputStream inputStream) + /** + * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map + * GNU.sparse.map + * Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]" + * + * @param sparseMap the sparse map string consisting of comma-separated values "offset,size[,offset-1,size-1...]" + * @return sparse headers parsed from sparse map + * @throws IOException + */ + private List<TarArchiveStructSparse> parsePAX01SparseHeaders(String sparseMap) throws IOException { + List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>(); + String[] sparseHeaderStrings = sparseMap.split(","); + + for (int i = 0; i < sparseHeaderStrings.length;i += 2) { + long sparseOffset = Long.parseLong(sparseHeaderStrings[i]); + long sparseNumbytes = Long.parseLong(sparseHeaderStrings[i + 1]); + sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes)); + } + + return sparseHeaders; + } + + /** + * For PAX Format 1.X: + * The sparse map itself is stored in the file data block, preceding the actual file data. + * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary. + * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers + * giving the offset and size of the data block it describes. + * @return sparse headers + * @throws IOException + */ + private List<TarArchiveStructSparse> parsePAX1XSparseHeaders() throws IOException { + // for 1.X PAX Headers + List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>(); + long bytesRead = 0; + long[] readResult; + long sparseHeadersCount; + - readResult = readLineOfNumberForPax1X(is); ++ readResult = readLineOfNumberForPax1X(inputStream); + sparseHeadersCount = readResult[0]; + bytesRead += readResult[1]; + while (sparseHeadersCount-- > 0) { - readResult = readLineOfNumberForPax1X(is); ++ readResult = readLineOfNumberForPax1X(inputStream); + long sparseOffset = readResult[0]; + bytesRead += readResult[1]; + - readResult = readLineOfNumberForPax1X(is); ++ readResult = readLineOfNumberForPax1X(inputStream); + long sparseNumbytes = readResult[0]; + bytesRead += readResult[1]; + sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes)); + } + + // skip the rest of this record data + long bytesToSkip = recordSize - bytesRead % recordSize; - IOUtils.skip(is, bytesToSkip); ++ IOUtils.skip(inputStream, bytesToSkip); + return sparseHeaders; + } + + /** + * For 1.X PAX Format, the sparse headers are stored in the file data block, preceding the actual file data. + * It consists of a series of decimal numbers delimited by newlines. + * + * @param inputStream the input stream of the tar file + * @return the decimal number delimited by '\n', and the bytes read from input stream + * @throws IOException + */ + private long[] readLineOfNumberForPax1X(InputStream inputStream) throws IOException { + int number; + long result = 0; + long bytesRead = 0; + + while((number = inputStream.read()) != '\n') { + bytesRead += 1; + if(number == -1) { + throw new IOException("Unexpected EOF when reading parse information of 1.X PAX format"); + } + result = result * 10 + (number - '0'); + } + bytesRead += 1; + + return new long[] {result, bytesRead}; + } + + /** + * For PAX Format 0.0, the sparse headers(GNU.sparse.offset and GNU.sparse.numbytes) + * may appear multi times, and they look like: + * + * GNU.sparse.size=size + * GNU.sparse.numblocks=numblocks + * repeat numblocks times + * GNU.sparse.offset=offset + * GNU.sparse.numbytes=numbytes + * end repeat + * + * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map + * + * GNU.sparse.map + * Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]" + * + * @param i inputstream to read keys and values + * @param sparseHeaders used in PAX Format 0.0 & 0.1, as it may appear multi times, + * the sparse headers need to be stored in an array, not a map + * @return + * @throws IOException + */ - Map<String, String> parsePaxHeaders(final InputStream i, List<TarArchiveStructSparse> sparseHeaders) ++ Map<String, String> parsePaxHeaders(final InputStream inputStream, List<TarArchiveStructSparse> sparseHeaders) throws IOException { final Map<String, String> headers = new HashMap<>(globalPaxHeaders); + TarArchiveStructSparse sparseHeader = null; // Format is "length keyword=value\n"; -- while(true){ // get length ++ while(true) { // get length int ch; int len = 0; int read = 0; @@@ -637,9 -846,25 +846,25 @@@ throw new IllegalStateException("No current tar entry"); } + if (!currEntry.isSparse()) { + if (entryOffset >= entrySize) { + return -1; + } + } else { + // for sparse entries, there are actually currEntry.getRealSize() bytes to read + if (entryOffset >= currEntry.getRealSize()) { + return -1; + } + } + numToRead = Math.min(numToRead, available()); - totalRead = inputStream.read(buf, offset, numToRead); + if (currEntry.isSparse()) { + // for sparse entries, we need to read them in another way + totalRead = readSparse(buf, offset, numToRead); + } else { - totalRead = is.read(buf, offset, numToRead); ++ totalRead = inputStream.read(buf, offset, numToRead); + } if (totalRead == -1) { if (numToRead > 0) { @@@ -655,6 -880,61 +880,61 @@@ } /** + * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only the non-zero data is + * stored in tar files, and they are stored separately. The structure of non-zero data is introduced by the + * sparse headers using the offset, where a block of non-zero data starts, and numbytes, the length of the + * non-zero data block. + * When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together + * according to the sparse headers. + * + * @param buf The buffer into which to place bytes read. + * @param offset The offset at which to place bytes read. + * @param numToRead The number of bytes to read. + * @return The number of bytes read, or -1 at EOF. + * @throws IOException on error + */ + private int readSparse(final byte[] buf, final int offset, int numToRead) throws IOException { + // if there are no actual input streams, just read from the original input stream + if (sparseInputStreams.size() == 0) { - return is.read(buf, offset, numToRead); ++ return inputStream.read(buf, offset, numToRead); + } + + if(currentSparseInputStreamIndex >= sparseInputStreams.size()) { + return -1; + } + + InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex); + int readLen = currentInputStream.read(buf, offset, numToRead); + + // if the current input stream is the last input stream, + // just return the number of bytes read from current input stream + if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) { + return readLen; + } + + // if EOF of current input stream is meet, open a new input stream and recursively call read + if (readLen == -1) { + currentSparseInputStreamIndex++; + return readSparse(buf, offset, numToRead); + } + + // if the rest data of current input stream is not long enough, open a new input stream + // and recursively call read + if (readLen < numToRead) { + currentSparseInputStreamIndex++; + int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen); + if (readLenOfNext == -1) { + return readLen; + } + + return readLen + readLenOfNext; + } + + // if the rest data of current input stream is enough(which means readLen == len), just return readLen + return readLen; + } + + /** * Whether this class is able to read the given entry. * * <p>May return false if the current entry is a sparse file.</p> @@@ -745,4 -1025,85 +1025,85 @@@ signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN); } + /** + * Build the input streams consisting of all-zero input streams and non-zero input streams. + * When reading from the non-zero input streams, the data is actually read from the original input stream. + * The size of each input stream is introduced by the sparse headers. + * + * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the + * 0 size input streams because they are meaningless. + */ + private void buildSparseInputStreams() throws IOException { + currentSparseInputStreamIndex = -1; + sparseInputStreams = new ArrayList<>(); + InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); + + long offset = 0; + List<TarArchiveStructSparse> sparseHeaders = currEntry.getSparseHeaders(); + // sort the sparse headers in case they are written in wrong order + if (sparseHeaders != null && sparseHeaders.size() > 1) { + final Comparator<TarArchiveStructSparse> sparseHeaderComparator = new Comparator<TarArchiveStructSparse>() { + @Override + public int compare(final TarArchiveStructSparse p, final TarArchiveStructSparse q) { + Long pOffset = p.getOffset(); + Long qOffset = q.getOffset(); + return pOffset.compareTo(qOffset); + } + }; + Collections.sort(sparseHeaders, sparseHeaderComparator); + } + + for (TarArchiveStructSparse sparseHeader : sparseHeaders) { + if (sparseHeader.getOffset() == 0 && sparseHeader.getNumbytes() == 0) { + break; + } + + if ((sparseHeader.getOffset() - offset) < 0) { + throw new IOException("Corrupted struct sparse detected"); + } + + // only store the input streams with non-zero size + if ((sparseHeader.getOffset() - offset) > 0) { + sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset)); + } + + // only store the input streams with non-zero size + if (sparseHeader.getNumbytes() > 0) { - sparseInputStreams.add(new BoundedInputStream(is, sparseHeader.getNumbytes())); ++ sparseInputStreams.add(new BoundedInputStream(inputStream, sparseHeader.getNumbytes())); + } + + offset = sparseHeader.getOffset() + sparseHeader.getNumbytes(); + } + + if (sparseInputStreams.size() > 0) { + currentSparseInputStreamIndex = 0; + } + } + + /** + * This is an inputstream that always return 0, + * this is used when reading the "holes" of a sparse file + */ + public class TarArchiveSparseZeroInputStream extends InputStream { + /** + * Just return 0 + * @return + * @throws IOException + */ + @Override + public int read() throws IOException { + return 0; + } + + /** + * these's nothing need to do when skipping + * + * @param n bytes to skip + * @return bytes actually skipped + */ + @Override + public long skip(final long n) { + return n; + } + } }
