Author: bodewig Date: Wed Feb 18 14:51:10 2009 New Revision: 745528 URL: http://svn.apache.org/viewvc?rev=745528&view=rev Log: Add support for extra fields introduced by InfoZIP in order to store UTF-8 filenames in ZIPs. Submitted by Wolfgang Glas. SANDBOX-176
Added: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/AbstractUnicodeExtraField.java (with props) commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodeCommentExtraField.java (with props) commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodePathExtraField.java (with props) commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java (with props) Modified: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ExtraFieldUtils.java Added: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/AbstractUnicodeExtraField.java URL: http://svn.apache.org/viewvc/commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/AbstractUnicodeExtraField.java?rev=745528&view=auto ============================================================================== --- commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/AbstractUnicodeExtraField.java (added) +++ commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/AbstractUnicodeExtraField.java Wed Feb 18 14:51:10 2009 @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.commons.compress.archivers.zip; + +import java.io.UnsupportedEncodingException; +import java.util.zip.CRC32; +import java.util.zip.ZipException; + +/** + * A common base class for Unicode extra information extra fields. + */ +public abstract class AbstractUnicodeExtraField implements ZipExtraField { + private long nameCRC32; + private byte[] unicodeName; + private byte[] data; + + protected AbstractUnicodeExtraField() { + } + + /** + * Assemble as unicode path extension form the name and encoding + * of the orginal zip entry. + * + * @param name The file name or comment. + * @param zipEncoding The encoding of the filenames in the zip + * file, usually <code>"CP437"</code>. + */ + protected AbstractUnicodeExtraField(String name, String zipEncoding) { + + byte[] filename = ZipEncodingHelper.encodeName(name, zipEncoding); + + CRC32 crc32 = new CRC32(); + crc32.update(filename); + nameCRC32 = crc32.getValue(); + + try { + unicodeName = name.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("FATAL: UTF-8 encoding not supported.", + e); + } + } + + private void assembleData() { + if (unicodeName == null) { + return; + } + + data = new byte[5 + unicodeName.length]; + // version 1 + data[0] = 0x01; + System.arraycopy(ZipLong.getBytes(nameCRC32), 0, data, 1, 4); + System.arraycopy(unicodeName, 0, data, 5, unicodeName.length); + } + + /** + * @return The CRC32 checksum of the filename or comment as + * encoded in the central directory of the zip file. + */ + public long getNameCRC32() { + return nameCRC32; + } + + /** + * @param nameCRC32 The CRC32 checksum of the filename as encoded + * in the central directory of the zip file to set. + */ + public void setNameCRC32(long nameCRC32) { + nameCRC32 = nameCRC32; + data = null; + } + + /** + * @return The utf-8 encoded name. + */ + public byte[] getUnicodeName() { + return unicodeName; + } + + /** + * @param unicodeName The utf-8 encoded name to set. + */ + public void setUnicodeName(byte[] unicodeName) { + unicodeName = unicodeName; + data = null; + } + + public byte[] getCentralDirectoryData() { + if (data == null) { + this.assembleData(); + } + return data; + } + + public ZipShort getCentralDirectoryLength() { + if (data == null) { + assembleData(); + } + return new ZipShort(data.length); + } + + public byte[] getLocalFileDataData() { + return getCentralDirectoryData(); + } + + public ZipShort getLocalFileDataLength() { + return getCentralDirectoryLength(); + } + + public void parseFromLocalFileData(byte[] buffer, int offset, int length) + throws ZipException { + + if (length < 5) { + throw new ZipException("UniCode path extra data must have at least" + + " 5 bytes."); + } + + int version = buffer[offset]; + + if (version != 0x01) { + throw new ZipException("Unsupported version [" + version + + "] for UniCode path extra data."); + } + + nameCRC32 = ZipLong.getValue(buffer, offset + 1); + unicodeName = new byte[length - 5]; + System.arraycopy(buffer, offset + 5, unicodeName, 0, length - 5); + data = null; + } + +} Propchange: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/AbstractUnicodeExtraField.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ExtraFieldUtils.java URL: http://svn.apache.org/viewvc/commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ExtraFieldUtils.java?rev=745528&r1=745527&r2=745528&view=diff ============================================================================== --- commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ExtraFieldUtils.java (original) +++ commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ExtraFieldUtils.java Wed Feb 18 14:51:10 2009 @@ -43,6 +43,8 @@ implementations = new HashMap(); register(AsiExtraField.class); register(JarMarker.class); + register(UnicodePathExtraField.class); + register(UnicodeCommentExtraField.class); } /** Added: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodeCommentExtraField.java URL: http://svn.apache.org/viewvc/commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodeCommentExtraField.java?rev=745528&view=auto ============================================================================== --- commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodeCommentExtraField.java (added) +++ commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodeCommentExtraField.java Wed Feb 18 14:51:10 2009 @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.commons.compress.archivers.zip; + +/** + * Info-ZIP Unicode Comment Extra Field (0x6375): + * + * <p>Stores the UTF-8 version of the file comment as stored in the + * central directory header.</p> + * + * <pre> + * Value Size Description + * ----- ---- ----------- + * (UCom) 0x6375 Short tag for this extra block type ("uc") + * TSize Short total data size for this block + * Version 1 byte version of this extra field, currently 1 + * ComCRC32 4 bytes Comment Field CRC32 Checksum + * UnicodeCom Variable UTF-8 version of the entry comment + * </pre> + */ +public class UnicodeCommentExtraField extends AbstractUnicodeExtraField { + + public static final ZipShort UCOM_ID = new ZipShort(0x6375); + + public UnicodeCommentExtraField () { + } + + /** + * Assemble as unicode comment extension form the comment and + * encoding of the orginal zip entry. + * + * @param name The file name + * @param zipEncoding The encoding of the comment in the zip file, + * usually <code>"CP437"</code>. + */ + public UnicodeCommentExtraField(String name, String zipEncoding) { + super(name, zipEncoding); + } + + public ZipShort getHeaderId() { + return UCOM_ID; + } + +} Propchange: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodeCommentExtraField.java ------------------------------------------------------------------------------ svn:eol-style = native Added: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodePathExtraField.java URL: http://svn.apache.org/viewvc/commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodePathExtraField.java?rev=745528&view=auto ============================================================================== --- commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodePathExtraField.java (added) +++ commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodePathExtraField.java Wed Feb 18 14:51:10 2009 @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.commons.compress.archivers.zip; + +/** + * Info-ZIP Unicode Path Extra Field (0x7075): + * + * <p>Stores the UTF-8 version of the file name field as stored in the + * local header and central directory header.</p> + * + * <pre> + * Value Size Description + * ----- ---- ----------- + * (UPath) 0x7075 Short tag for this extra block type ("up") + * TSize Short total data size for this block + * Version 1 byte version of this extra field, currently 1 + * NameCRC32 4 bytes File Name Field CRC32 Checksum + * UnicodeName Variable UTF-8 version of the entry File Name + * </pre> + */ +public class UnicodePathExtraField extends AbstractUnicodeExtraField { + + public static final ZipShort UPATH_ID = new ZipShort(0x7075); + + public UnicodePathExtraField () { + } + + /** + * Assemble as unicode path extension form the name and encoding + * of the orginal zip entry. + * + * @param name The file name + * @param zipEncoding The encoding of the filename in the zip + * file, usually <code>"CP437"</code>. + */ + public UnicodePathExtraField(String name, String zipEncoding) { + super(name, zipEncoding); + } + + public ZipShort getHeaderId() { + return UPATH_ID; + } +} Propchange: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnicodePathExtraField.java ------------------------------------------------------------------------------ svn:eol-style = native Added: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java URL: http://svn.apache.org/viewvc/commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java?rev=745528&view=auto ============================================================================== --- commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java (added) +++ commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java Wed Feb 18 14:51:10 2009 @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.commons.compress.archivers.zip; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; + +/** + * Static helper functions for robustly encoding filenames in zip files. + */ +abstract class ZipEncodingHelper { + + /** + * Grow a byte buffer, so it has a minimal capacity or at least + * the double capacity of the original buffer + * + * @param b The original buffer. + * @param newCapacity The minimal requested new capacity. + * @return A byte buffer <code>r</code> with + * <code>r.capacity() = max(b.capacity()*2,newCapacity)</code> and + * all the data contained in <code>b</code> copied to the beginning + * of <code>r</code>. + * + */ + static ByteBuffer growBuffer(ByteBuffer b, int newCapacity) { + b.limit(b.position()); + b.rewind(); + + int c2 = b.capacity() * 2; + ByteBuffer on = ByteBuffer.allocate(c2 < newCapacity ? newCapacity : c2); + + on.put(b); + return on; + } + + + /** + * The hexadecimal digits <code>0,...,9,A,...,F</code> encoded as + * ASCII bytes. + */ + private static final byte[] HEX_DIGITS = + new byte [] { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x41, + 0x42, 0x43, 0x44, 0x45, 0x46 + }; + + /** + * Encode a filename or a comment to a byte array suitable for + * storing it to a serialized zip entry. + * + * Examples (in pseudo-notation, right hand side is C-style notation): + * <pre> + * encodeName("\u20AC_for_Dollar.txt","CP437") = "%U20AC_for_Dollar.txt" + * encodeName("\u00D6lf\u00E4sser.txt","CP437") = "\231lf\204sser.txt" + * </pre> + * + * @param name The filename or comment with possible non-ASCII + * unicode characters. + * @param encoding A valid encoding name. The standard zip + * encoding is <code>"CP437"</code>, + * <code>"UTF-8"</code> is supported in ZIP file + * version <code>6.3</code> or later. + * @return A byte array containing the mapped file + * name. Unmappable characters or malformed character + * sequences are mapped to a sequence of utf-16 words + * encoded in the format <code>%Uxxxx</code>. + */ + static final byte[] encodeName(String name, String encoding) { + Charset cs = Charset.forName(encoding); + CharsetEncoder enc = cs.newEncoder(); + + enc.onMalformedInput(CodingErrorAction.REPORT); + enc.onUnmappableCharacter(CodingErrorAction.REPORT); + + CharBuffer cb = CharBuffer.wrap(name); + ByteBuffer out = ByteBuffer.allocate(name.length() + + (name.length() + 1) / 2); + + while (cb.remaining() > 0) { + CoderResult res = enc.encode(cb, out,true); + + if (res.isUnmappable() || res.isMalformed()) { + + // write the unmappable characters in utf-16 + // pseudo-URL encoding style to ByteBuffer. + if (res.length() * 6 > out.remaining()) { + out = growBuffer(out,out.position() + res.length() * 6); + } + + for (int i=0; i<res.length(); ++i) { + out.put((byte) '%'); + out.put((byte) 'U'); + + char c = cb.get(); + + out.put(HEX_DIGITS[(c >> 12)&0x0f]); + out.put(HEX_DIGITS[(c >> 8)&0x0f]); + out.put(HEX_DIGITS[(c >> 4)&0x0f]); + out.put(HEX_DIGITS[c & 0x0f]); + } + + } else if (res.isOverflow()) { + + out = growBuffer(out, 0); + + } else if (res.isUnderflow()) { + + enc.flush(out); + break; + + } + } + + byte [] ret = new byte[out.position()]; + out.rewind(); + out.get(ret); + + return ret; + } + + /** + * Return, whether a filename or a comment may be encoded to a + * byte array suitable for storing it to a serialized zip entry + * without any losses. + * + * Examples (in pseudo-notation, right hand side is C-style notation): + * <pre> + * canEncodeName("\u20AC_for_Dollar.txt","CP437") = false + * canEncodeName("\u20AC_for_Dollar.txt","UTF-8") = true + * canEncodeName("\u00D6lf\u00E4sser.txt","CP437") = true + * </pre> + * + * @param name The filename or comment with possible non-ASCII + * unicode characters. + * @param encoding A valid encoding name. The standard zip + * encoding is <code>"CP437"</code>, + * <code>"UTF-8"</code> is supported in ZIP file + * version <code>6.3</code> or later. + * @return Whether the given encoding may encode the given name. + */ + static final boolean canEncodeName(String name, String encoding) { + + Charset cs = Charset.forName(encoding); + + CharsetEncoder enc = cs.newEncoder(); + enc.onMalformedInput(CodingErrorAction.REPORT); + enc.onUnmappableCharacter(CodingErrorAction.REPORT); + + return enc.canEncode(name); + } +} Propchange: commons/sandbox/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java ------------------------------------------------------------------------------ svn:eol-style = native