On Thu, Apr 16, 2020 at 12:59 PM Claes Redestad <claes.redes...@oracle.com> wrote:
> > I think this is an interesting idea and a good optimization. Looks like we get most of the performance win of Bloom filters while not introducing regressions for hits, footprint or complexity. We'll > deliberately cause a few more hash collisions, Will ZIP files commonly include entries for both "META-INF" and "META-INF/"? Some issues: > > - Need to be made to work on empty strings (hashN([], 0, [].length) -> > a[off + len - 1] -> a[-1] -> AIOOBE) > Oops. Should be safe to check for len > 0, right? if(len > 0 && a[off + len -1] == '/') len--; > - If name actually ends with a / we shouldn't look for name + /, > e.g.: > > (addSlash && name.length == nlen - 1 && name.length > 0 && > name[name.length - 1] != '/' && cen[pos + CENHDR + nlen - 1] == '/') > Fixed and added some comments for readability. Here's an updated patch: diff --git a/src/java.base/share/classes/java/util/zip/ZipFile.java b/src/java.base/share/classes/java/util/zip/ZipFile.java index 274e8788d1..ba5101e0db 100644 --- a/src/java.base/share/classes/java/util/zip/ZipFile.java +++ b/src/java.base/share/classes/java/util/zip/ZipFile.java @@ -1266,6 +1266,11 @@ public class ZipFile implements ZipConstants, Closeable { } private static final int hashN(byte[] a, int off, int len) { + // Performance optimization: getEntryPos assumes that the hash + // of a name remains unchanged when appending a trailing '/'. + if(len > 0 && a[off + len -1] == '/') + len--; + int h = 1; while (len-- > 0) { h = 31 * h + a[off++]; @@ -1273,10 +1278,6 @@ public class ZipFile implements ZipConstants, Closeable { return h; } - private static final int hash_append(int hash, byte b) { - return hash * 31 + b; - } - private static class End { int centot; // 4 bytes long cenlen; // 4 bytes @@ -1493,48 +1494,36 @@ public class ZipFile implements ZipConstants, Closeable { int hsh = hashN(name, 0, name.length); int idx = table[(hsh & 0x7fffffff) % tablelen]; /* - * This while loop is an optimization where a double lookup - * for name and name+/ is being performed. The name char - * array has enough room at the end to try again with a - * slash appended if the first table lookup does not succeed. + * Search down the target hash chain for a entry whose + * 32 bit hash matches the hashed name. */ - while (true) { - /* - * Search down the target hash chain for a entry whose - * 32 bit hash matches the hashed name. - */ - while (idx != ZIP_ENDCHAIN) { - if (getEntryHash(idx) == hsh) { - // The CEN name must match the specfied one - int pos = getEntryPos(idx); - if (name.length == CENNAM(cen, pos)) { - boolean matched = true; - int nameoff = pos + CENHDR; - for (int i = 0; i < name.length; i++) { - if (name[i] != cen[nameoff++]) { - matched = false; - break; - } - } - if (matched) { - return pos; + while (idx != ZIP_ENDCHAIN) { + if (getEntryHash(idx) == hsh) { + // The CEN name must match the specfied one + int pos = getEntryPos(idx); + final int nlen = CENNAM(cen, pos); + final int nstart = pos + CENHDR; + final boolean allowSlash = addSlash // we should match names with a trailing '/' + && nlen == name.length + 1 // nlen has one extra byte + && cen[nstart + nlen - 1] == '/' // that byte is a '/' + && name[name.length - 1] != '/'; // the name does not already end with '/' + if (name.length == nlen || allowSlash) { + boolean matched = true; + int nameoff = pos + CENHDR; + for (int i = 0; i < name.length; i++) { + if (name[i] != cen[nameoff++]) { + matched = false; + break; } - } + } + if (matched) { + return pos; + } } - idx = getEntryNext(idx); - } - /* If not addSlash, or slash is already there, we are done */ - if (!addSlash || name.length == 0 || name[name.length - 1] == '/') { - return -1; } - /* Add slash and try once more */ - name = Arrays.copyOf(name, name.length + 1); - name[name.length - 1] = '/'; - hsh = hash_append(hsh, (byte)'/'); - //idx = table[hsh % tablelen]; - idx = table[(hsh & 0x7fffffff) % tablelen]; - addSlash = false; + idx = getEntryNext(idx); } + return -1; } /**