Hi, I did some more testing with zlib-like and multiplicative hashing. The differences for the files I was testing were quite small. However, the proposed hash function, as well as a slightly modified version of it, did come out slightly ahead. So it might as well be used.
I also did a few quick tests with greedy parsing. In general, it seemed to improve performance by 10-20% and increase the size of the compressed files by 1-2%. If the performance improvement is considered more desirable, then I can change the patch to use greedy parsing. For now it's using lazy parsing, like it was before but more optimized. Here's the updated patch. ---- diff --git a/libntfs-3g/compress.c b/libntfs-3g/compress.c index 73ad283..1fefc3e 100644 --- a/libntfs-3g/compress.c +++ b/libntfs-3g/compress.c @@ -6,6 +6,7 @@ * Copyright (c) 2004-2006 Szabolcs Szakacsits * Copyright (c) 2005 Yura Pakhuchiy * Copyright (c) 2009-2014 Jean-Pierre Andre + * Copyright (c) 2014 Eric Biggers * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -21,17 +22,6 @@ * along with this program (in the main directory of the NTFS-3G * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * A part of the compression algorithm is based on lzhuf.c whose header - * describes the roles of the original authors (with no apparent copyright - * notice, and according to http://home.earthlink.net/~neilbawd/pall.html - * this was put into public domain in 1988 by Haruhiko OKUMURA). - * - * LZHUF.C English version 1.0 - * Based on Japanese version 29-NOV-1988 - * LZSS coded by Haruhiko OKUMURA - * Adaptive Huffman Coding coded by Haruyasu YOSHIZAKI - * Edited and translated to English by Kenji RIKITAKE */ #ifdef HAVE_CONFIG_H @@ -81,96 +71,256 @@ typedef enum { NTFS_SB_IS_COMPRESSED = 0x8000, } ntfs_compression_constants; +/* Match length at or above which ntfs_best_match() will stop searching for + * longer matches. */ +#define NICE_MATCH_LEN 18 + +/* Maximum number of potential matches that ntfs_best_match() will consider at + * each position. */ +#define MAX_SEARCH_DEPTH 24 + +/* Number of entries in the hash table for match-finding. + * + * This can be changed, but ntfs_hash() would need to be updated to use an + * appropriate shift. Also, if this is made more than 1 << 16 (not recommended + * for the 4096-byte buffers used in NTFS compression!), then 'crc_table' would + * need to be updated to use 32-bit entries. */ +#define HASH_LEN (1 << 14) + struct COMPRESS_CONTEXT { const unsigned char *inbuf; int bufsize; int size; int rel; int mxsz; - s16 head[256]; - s16 lson[NTFS_SB_SIZE]; - s16 rson[NTFS_SB_SIZE]; + s16 head[HASH_LEN]; + s16 prev[NTFS_SB_SIZE]; } ; /* - * Search for the longest sequence matching current position + * CRC table for hashing bytes for Lempel-Ziv match-finding. * - * A binary tree is maintained to locate all previously met sequences, - * and this function has to be called for all of them. + * We use a CRC32 for this purpose. But since log2(HASH_LEN) <= 16, we only + * need 16 bit entries, each of which contains the low 16 bits of the entry in a + * real CRC32 table. (CRC16 would also work, but it caused more collisions when + * I tried it.) * - * This function is heavily used, it has to be optimized carefully + * Hard-coding the table avoids dealing with thread-safe initialization. + */ +static const u16 crc_table[256] = { + 0x0000, 0x3096, 0x612C, 0x51BA, 0xC419, 0xF48F, 0xA535, 0x95A3, + 0x8832, 0xB8A4, 0xE91E, 0xD988, 0x4C2B, 0x7CBD, 0x2D07, 0x1D91, + 0x1064, 0x20F2, 0x7148, 0x41DE, 0xD47D, 0xE4EB, 0xB551, 0x85C7, + 0x9856, 0xA8C0, 0xF97A, 0xC9EC, 0x5C4F, 0x6CD9, 0x3D63, 0x0DF5, + 0x20C8, 0x105E, 0x41E4, 0x7172, 0xE4D1, 0xD447, 0x85FD, 0xB56B, + 0xA8FA, 0x986C, 0xC9D6, 0xF940, 0x6CE3, 0x5C75, 0x0DCF, 0x3D59, + 0x30AC, 0x003A, 0x5180, 0x6116, 0xF4B5, 0xC423, 0x9599, 0xA50F, + 0xB89E, 0x8808, 0xD9B2, 0xE924, 0x7C87, 0x4C11, 0x1DAB, 0x2D3D, + 0x4190, 0x7106, 0x20BC, 0x102A, 0x8589, 0xB51F, 0xE4A5, 0xD433, + 0xC9A2, 0xF934, 0xA88E, 0x9818, 0x0DBB, 0x3D2D, 0x6C97, 0x5C01, + 0x51F4, 0x6162, 0x30D8, 0x004E, 0x95ED, 0xA57B, 0xF4C1, 0xC457, + 0xD9C6, 0xE950, 0xB8EA, 0x887C, 0x1DDF, 0x2D49, 0x7CF3, 0x4C65, + 0x6158, 0x51CE, 0x0074, 0x30E2, 0xA541, 0x95D7, 0xC46D, 0xF4FB, + 0xE96A, 0xD9FC, 0x8846, 0xB8D0, 0x2D73, 0x1DE5, 0x4C5F, 0x7CC9, + 0x713C, 0x41AA, 0x1010, 0x2086, 0xB525, 0x85B3, 0xD409, 0xE49F, + 0xF90E, 0xC998, 0x9822, 0xA8B4, 0x3D17, 0x0D81, 0x5C3B, 0x6CAD, + 0x8320, 0xB3B6, 0xE20C, 0xD29A, 0x4739, 0x77AF, 0x2615, 0x1683, + 0x0B12, 0x3B84, 0x6A3E, 0x5AA8, 0xCF0B, 0xFF9D, 0xAE27, 0x9EB1, + 0x9344, 0xA3D2, 0xF268, 0xC2FE, 0x575D, 0x67CB, 0x3671, 0x06E7, + 0x1B76, 0x2BE0, 0x7A5A, 0x4ACC, 0xDF6F, 0xEFF9, 0xBE43, 0x8ED5, + 0xA3E8, 0x937E, 0xC2C4, 0xF252, 0x67F1, 0x5767, 0x06DD, 0x364B, + 0x2BDA, 0x1B4C, 0x4AF6, 0x7A60, 0xEFC3, 0xDF55, 0x8EEF, 0xBE79, + 0xB38C, 0x831A, 0xD2A0, 0xE236, 0x7795, 0x4703, 0x16B9, 0x262F, + 0x3BBE, 0x0B28, 0x5A92, 0x6A04, 0xFFA7, 0xCF31, 0x9E8B, 0xAE1D, + 0xC2B0, 0xF226, 0xA39C, 0x930A, 0x06A9, 0x363F, 0x6785, 0x5713, + 0x4A82, 0x7A14, 0x2BAE, 0x1B38, 0x8E9B, 0xBE0D, 0xEFB7, 0xDF21, + 0xD2D4, 0xE242, 0xB3F8, 0x836E, 0x16CD, 0x265B, 0x77E1, 0x4777, + 0x5AE6, 0x6A70, 0x3BCA, 0x0B5C, 0x9EFF, 0xAE69, 0xFFD3, 0xCF45, + 0xE278, 0xD2EE, 0x8354, 0xB3C2, 0x2661, 0x16F7, 0x474D, 0x77DB, + 0x6A4A, 0x5ADC, 0x0B66, 0x3BF0, 0xAE53, 0x9EC5, 0xCF7F, 0xFFE9, + 0xF21C, 0xC28A, 0x9330, 0xA3A6, 0x3605, 0x0693, 0x5729, 0x67BF, + 0x7A2E, 0x4AB8, 0x1B02, 0x2B94, 0xBE37, 0x8EA1, 0xDF1B, 0xEF8D +}; + +#if 0 + +#define CRC32_POLYNOMIAL 0xEDB88320 + +static void crc_table_init(void) +{ + int i, j; + u32 r; + + for (i = 0; i < 256; i++) { + r = i; + for (j = 0; j < 8; j++) + r = (r >> 1) ^ (CRC32_POLYNOMIAL & ~((r & 1) - 1)); + crc_table[i] = r; + } +} +#endif + +/* + * Hash the next 3-byte sequence in the input buffer * - * Returns the size of the longest match, - * zero if no match is found. + * Currently, we use a hash function similar to that used by 7-Zip's + * DEFLATE encoder. It takes slightly longer to compute than zlib's hash + * function, but it provides better collision resistance. */ +static inline unsigned int ntfs_hash(const u8 *p) +{ + unsigned int hash = 0; + + hash ^= (unsigned int)p[0] << 6; + hash ^= crc_table[p[1]]; + hash ^= p[2]; -static int ntfs_best_match(struct COMPRESS_CONTEXT *pctx, int i) + return hash % HASH_LEN; +} + +/* + * Search for the longest sequence matching current position + * + * A hash table, each entry of which points to a chain of sequence + * positions sharing the corresponding hash code, is maintained to speed up + * searching for matches. To maintain the hash table, either + * ntfs_best_match() or ntfs_skip_position() has to be called for each + * consecutive position. + * + * This function is heavily used; it has to be optimized carefully. + * + * This function sets pctx->size and pctx->rel to the length and offset, + * respectively, of the longest match found. + * + * The minimum match length is assumed to be 3, and the maximum match + * length is assumed to be pctx->mxsz. If this function produces + * pctx->size < 3, then no match was found. + * + * Note: for the following reasons, this function is not guaranteed to find + * *the* longest match up to pctx->mxsz: + * + * (1) If this function finds a match of NICE_MATCH_LEN bytes or greater, + * it ends early because a match this long is good enough and it's not + * worth spending more time searching. + * + * (2) If this function considers MAX_SEARCH_DEPTH matches with a single + * position, it ends early and returns the longest match found so far. + * This saves a lot of time on degenerate inputs. + */ +static void ntfs_best_match(struct COMPRESS_CONTEXT *pctx, const int i, int best_len) { - s16 *prev; - int node; - register long j; - long maxpos; - long startj; - long bestj; - int bufsize; - int bestnode; - register const unsigned char *p1,*p2; - - p1 = pctx->inbuf; - node = pctx->head[p1[i] & 255]; - if (node >= 0) { - /* search the best match at current position */ - bestnode = node; - bufsize = pctx->bufsize; - /* restrict matches to the longest allowed sequence */ - maxpos = bufsize; - if ((i + pctx->mxsz) < maxpos) - maxpos = i + pctx->mxsz; - startj = i + 1 - maxpos; - bestj = startj; - /* make indexes relative to end of allowed position */ - p1 = &p1[maxpos]; - if (startj < 0) { - do { - /* indexes are negative */ - p2 = &p1[node - i]; - /* no need to compare the first byte */ - j = startj; - /* the second byte cannot lead to useful compression */ - if (p1[j] == p2[j]) { - j++; - if (j < 0) { - do { - } while ((p1[j] == p2[j]) - && (++j < 0)); - } - /* remember the match, if better */ - if (j > bestj) { - bestj = j; - bestnode = node; - } - } - /* walk in the tree in the right direction */ - if ((j < 0) && (p1[j] < p2[j])) - prev = &pctx->lson[node]; - else - prev = &pctx->rson[node]; - node = *prev; - /* stop if reaching a leaf or maximum length */ - } while ((node >= 0) && (j < 0)); - /* put the node into the tree if we reached a leaf */ - if (node < 0) - *prev = i; + const u8 * const inbuf = pctx->inbuf; + const u8 * const strptr = &inbuf[i]; /* String we're matching against */ + const s16 * const prev = pctx->prev; + const int max_len = min(pctx->bufsize - i, pctx->mxsz); + const int nice_len = min(NICE_MATCH_LEN, max_len); + int depth_remaining = MAX_SEARCH_DEPTH; + const u8 *best_matchptr = strptr; + unsigned int hash; + s16 cur_match; + const u8 *matchptr; + int len; + + if (best_len >= max_len) { + /* It's not possible to find a match of the length desired. + * This can happen if there are fewer than 3 bytes remaining in + * the input buffer. This can also happen if a lazy match is + * attempted, but there aren't enough length bits remaining to + * code a longer match. In the latter case, the hash chain + * still must be updated. */ + if (max_len >= 3) { + hash = ntfs_hash(strptr); + cur_match = pctx->head[hash]; + pctx->prev[i] = cur_match; + pctx->head[hash] = i; } - /* done, return the best match */ - pctx->size = bestj + maxpos - i; - pctx->rel = bestnode - i; - } else { - pctx->head[p1[i] & 255] = i; - pctx->size = 0; - pctx->rel = 0; + goto out; + } + + /* Insert the current sequence into the appropriate hash chain. */ + + hash = ntfs_hash(strptr); + cur_match = pctx->head[hash]; + pctx->prev[i] = cur_match; + pctx->head[hash] = i; + + /* Search the appropriate hash chain for matches. */ + + for (; cur_match >= 0 && depth_remaining--; cur_match = prev[cur_match]) { + + matchptr = &inbuf[cur_match]; + + /* Considering the potential match at 'matchptr': is it longer + * than 'best_len'? + * + * The bytes at index 'best_len' are the most likely to differ, + * so check them first. + * + * The bytes at indices 'best_len - 1' and '0' are less + * important to check separately. But doing so still gives a + * slight performance improvement, at least on x86_64, probably + * because they create separate branches for the CPU to predict + * independently of the branches in the main comparison loops. + */ + if (matchptr[best_len] != strptr[best_len] || + matchptr[best_len - 1] != strptr[best_len - 1] || + matchptr[0] != strptr[0]) + goto next_match; + + for (len = 1; len < best_len - 1; len++) + if (matchptr[len] != strptr[len]) + goto next_match; + + /* The match is the longest found so far --- + * at least 'best_len' + 1 bytes. Continue extending it. */ + + best_matchptr = matchptr; + + do { + if (++best_len >= nice_len) { + /* 'nice_len' reached; don't waste time + * searching for longer matches. Extend the + * match as far as possible and terminate the + * search. */ + while (best_len < max_len && + best_matchptr[best_len] == strptr[best_len]) + { + best_len++; + } + goto out; + } + } while (best_matchptr[best_len] == strptr[best_len]); + + /* Found a longer match, but 'nice_len' not yet reached. */ + + next_match: + /* Continue to next match in the chain. */ + ; } - return (pctx->size); + + /* Reached end of chain, or ended early due to reaching the maximum + * search depth. */ + +out: + /* Return the longest match we were able to find. */ + pctx->size = best_len; + pctx->rel = best_matchptr - strptr; /* given as a negative number! */ +} + +/* + * Advance the match-finder, but don't search for matches. + */ +static void ntfs_skip_position(struct COMPRESS_CONTEXT *pctx, const int i) +{ + unsigned int hash; + + if (pctx->bufsize - i < 3) + return; + + /* Insert the current sequence into the appropriate hash chain. */ + hash = ntfs_hash(pctx->inbuf + i); + pctx->prev[i] = pctx->head[hash]; + pctx->head[hash] = i; } /* @@ -188,7 +338,7 @@ static int ntfs_best_match(struct COMPRESS_CONTEXT *pctx, int i) * 0 if an error has been met. */ -static unsigned int ntfs_compress_block(const char *inbuf, int bufsize, +static unsigned int ntfs_compress_block(const char *inbuf, const int bufsize, char *outbuf) { struct COMPRESS_CONTEXT *pctx; @@ -196,109 +346,167 @@ static unsigned int ntfs_compress_block(const char *inbuf, int bufsize, int j; /* end of best match from current position */ int k; /* end of best match from next position */ int offs; /* offset to best match */ - int n; int bp; /* bits to store offset */ + int bp_cur; /* saved bits to store offset at current position */ int mxoff; /* max match offset : 1 << bp */ - int mxsz2; unsigned int xout; unsigned int q; /* aggregated offset and size */ - int done; + int have_match; /* do we have a match at the current position? */ char *ptag; /* location reserved for a tag */ int tag; /* current value of tag */ int ntag; /* count of bits still undefined in tag */ pctx = (struct COMPRESS_CONTEXT*)ntfs_malloc(sizeof(struct COMPRESS_CONTEXT)); - if (pctx) { - for (n=0; n<NTFS_SB_SIZE; n++) - pctx->lson[n] = pctx->rson[n] = -1; - for (n=0; n<256; n++) - pctx->head[n] = -1; - pctx->inbuf = (const unsigned char*)inbuf; - pctx->bufsize = bufsize; - xout = 2; - n = 0; - i = 0; - bp = 4; - mxoff = 1 << bp; - pctx->mxsz = (1 << (16 - bp)) + 2; - tag = 0; - done = -1; - ntag = 8; - ptag = &outbuf[xout++]; - while ((i < bufsize) && (xout < (NTFS_SB_SIZE + 2))) { - /* adjust the longest match we can output */ + if (!pctx) { + errno = ENOMEM; + return 0; + } + + /* All hash chains start as empty. The special value '-1' indicates the + * end of each hash chain. */ + memset(pctx->head, 0xFF, sizeof(pctx->head)); + + pctx->inbuf = (const unsigned char*)inbuf; + pctx->bufsize = bufsize; + xout = 2; + i = 0; + bp = 4; + mxoff = 1 << bp; + pctx->mxsz = (1 << (16 - bp)) + 2; + have_match = 0; + tag = 0; + ntag = 8; + ptag = &outbuf[xout++]; + + while ((i < bufsize) && (xout < (NTFS_SB_SIZE + 2))) { + + /* This implementation uses "lazy" parsing: it always chooses + * the longest match, unless the match at the next position is + * longer. This is the same strategy used by the high + * compression modes of zlib. */ + + if (!have_match) { + /* Find the longest match at the current position. But + * first adjust the maximum match length if needed. + * (This loop might need to run more than one time in + * the case that we just output a long match.) */ while (mxoff < i) { bp++; mxoff <<= 1; pctx->mxsz = (pctx->mxsz + 2) >> 1; } - /* search the best match at current position */ - if (done < i) - do { - ntfs_best_match(pctx,++done); - } while (done < i); + ntfs_best_match(pctx, i, 2); + } + + if (pctx->size >= 3) { + + /* Found a match at the current position. */ + j = i + pctx->size; - if ((j - i) > pctx->mxsz) - j = i + pctx->mxsz; - - if ((j - i) > 2) { - offs = pctx->rel; - /* check whether there is a better run at i+1 */ - ntfs_best_match(pctx,i+1); - done = i+1; + bp_cur = bp; + offs = pctx->rel; + + if (pctx->size >= NICE_MATCH_LEN) { + + /* Choose long matches immediately. */ + + q = (~offs << (16 - bp_cur)) + (j - i - 3); + outbuf[xout++] = q & 255; + outbuf[xout++] = (q >> 8) & 255; + tag |= (1 << (8 - ntag)); + + if (j == bufsize) { + /* Shortcut if the match extends to the + * end of the buffer. */ + i = j; + --ntag; + break; + } + i += 1; + do { + ntfs_skip_position(pctx, i); + } while (++i != j); + have_match = 0; + } else { + /* Check for a longer match at the next + * position. */ + + /* Doesn't need to be while() since we just + * adjusted the maximum match length at the + * previous position. */ + if (mxoff < i + 1) { + bp++; + mxoff <<= 1; + pctx->mxsz = (pctx->mxsz + 2) >> 1; + } + ntfs_best_match(pctx, i + 1, pctx->size); k = i + 1 + pctx->size; - mxsz2 = pctx->mxsz; - if (mxoff <= i) - mxsz2 = (pctx->mxsz + 2) >> 1; - if ((k - i) > mxsz2) - k = i + mxsz2; + if (k > (j + 1)) { - /* issue a single byte */ - outbuf[xout++] = inbuf[i]; - i++; + /* Next match is longer. + * Output a literal. */ + outbuf[xout++] = inbuf[i++]; + have_match = 1; } else { - q = (~offs << (16 - bp)) - + (j - i - 3); + /* Next match isn't longer. + * Output the current match. */ + q = (~offs << (16 - bp_cur)) + (j - i - 3); outbuf[xout++] = q & 255; outbuf[xout++] = (q >> 8) & 255; tag |= (1 << (8 - ntag)); - i = j; + + /* The minimum match length is 3, and + * we've run two bytes through the + * matchfinder already. So the minimum + * number of positions we need to skip + * is 1. */ + i += 2; + do { + ntfs_skip_position(pctx, i); + } while (++i != j); + have_match = 0; } - } else { - outbuf[xout++] = inbuf[i]; - i++; - } - /* store the tag if fully used */ - if (!--ntag) { - *ptag = tag; - ntag = 8; - ptag = &outbuf[xout++]; - tag = 0; } + } else { + /* No match at current position. Output a literal. */ + outbuf[xout++] = inbuf[i++]; + have_match = 0; } - /* store the last tag, if partially used */ - if (ntag == 8) - xout--; - else + + /* Store the tag if fully used. */ + if (!--ntag) { *ptag = tag; - /* uncompressed must be full size, accept if better */ - if ((i >= bufsize) && (xout < (NTFS_SB_SIZE + 2))) { - outbuf[0] = (xout - 3) & 255; - outbuf[1] = 0xb0 + (((xout - 3) >> 8) & 15); - } else { - memcpy(&outbuf[2],inbuf,bufsize); - if (bufsize < NTFS_SB_SIZE) - memset(&outbuf[bufsize+2], 0, - NTFS_SB_SIZE - bufsize); - outbuf[0] = 0xff; - outbuf[1] = 0x3f; - xout = NTFS_SB_SIZE + 2; + ntag = 8; + ptag = &outbuf[xout++]; + tag = 0; } - free(pctx); + } + + /* Store the last tag if partially used. */ + if (ntag == 8) + xout--; + else + *ptag = tag; + + /* Determine whether to store the data compressed or uncompressed. */ + + if ((i >= bufsize) && (xout < (NTFS_SB_SIZE + 2))) { + /* Compressed. */ + outbuf[0] = (xout - 3) & 255; + outbuf[1] = 0xb0 + (((xout - 3) >> 8) & 15); } else { - xout = 0; - errno = ENOMEM; + /* Uncompressed. */ + memcpy(&outbuf[2], inbuf, bufsize); + if (bufsize < NTFS_SB_SIZE) + memset(&outbuf[bufsize + 2], 0, NTFS_SB_SIZE - bufsize); + outbuf[0] = 0xff; + outbuf[1] = 0x3f; + xout = NTFS_SB_SIZE + 2; } + + /* Free the compression context and return the total number of bytes + * written to 'outbuf'. */ + free(pctx); return (xout); } ------------------------------------------------------------------------------ _______________________________________________ ntfs-3g-devel mailing list ntfs-3g-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/ntfs-3g-devel