Hi, Very interesting.
So far I have just checked my tests and they run fine. You have defined the hash table on static data, and I do not want to enter into the meanings of static data in shared objects in various operating systems (allowed or not, shared by threads or not...). I prefer to have it dynamically allocated (hence never shared by mounts), and pointed to in the "volume" structure. Unfortunately this means adding an extra argument to ntfs_compress_block() and freeing the table when unmounting. (I will later post the code for that). Also a minor issue : please use http://sourceforge.net/p/ntfs-3g/ntfs-3g/ci/edge/tree/libntfs-3g/compress.c as a reference for your patch for easier integration. Jean-Pierre Eric Biggers wrote: > The current compression algorithm does "lazy" parsing of matches, backed > by a binary tree match-finder with one byte hashing. Performance-wise, > this approach is not very good for several reasons: > > (1) One byte hashing results in a lot of hash collisions, which slows > down searches. > > (2) With "lazy" parsing, many positions never actually need to be > matched. But when using binary trees, all the work needs to be done > anyway, because the sequence at each position needs to be inserted > into the appropriate binary tree. This makes binary trees better > suited for "optimal" parsing --- but that isn't being done and is > probably too slow to be practical for NTFS. > > (3) There was also no hard cut-off on the amount of work done per > position. This did not matter too much because the buffer size is > never greater than 4096 bytes, but in degenerate cases the binary > trees could degenerate into linked lists and there could be hundreds of > matches considered at each position. > > This patch changes the algorithm to use hash chains instead of binary > trees, with much stronger hashing. 
It also introduces useful (for > performance) parameters, such as the "nice match length" and "maximum > search depth", that are similar to those used in other commonly used > compression algorithms such as zlib's DEFLATE implementation. > > The performance improvement is very significant, but data-dependent. > Compressing text files is faster by about 3x; x86 executables files by > about 3x; random data by about 1.7x; all zeroes by about 1.2x; some > degenerate cases by over 10x. (I did my tests on an x86_64 CPU.) > > The compression ratio is the same or slightly worse. It is less than 1% > worse on all files I tested except an ASCII representation of a genome. > > No changes were made to the decompressor. > --- > libntfs-3g/compress.c | 484 > +++++++++++++++++++++++++++++++++----------------- > 1 file changed, 324 insertions(+), 160 deletions(-) > > diff --git a/libntfs-3g/compress.c b/libntfs-3g/compress.c > index 69b39ed..e62c7dd 100644 > --- a/libntfs-3g/compress.c > +++ b/libntfs-3g/compress.c > @@ -6,6 +6,7 @@ > * Copyright (c) 2004-2006 Szabolcs Szakacsits > * Copyright (c) 2005 Yura Pakhuchiy > * Copyright (c) 2009-2013 Jean-Pierre Andre > + * Copyright (c) 2014 Eric Biggers > * > * This program/include file is free software; you can redistribute it > and/or > * modify it under the terms of the GNU General Public License as published > @@ -21,17 +22,6 @@ > * along with this program (in the main directory of the NTFS-3G > * distribution in the file COPYING); if not, write to the Free Software > * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > - * > - * A part of the compression algorithm is based on lzhuf.c whose header > - * describes the roles of the original authors (with no apparent copyright > - * notice, and according to http://home.earthlink.net/~neilbawd/pall.html > - * this was put into public domain in 1988 by Haruhiko OKUMURA). 
> - * > - * LZHUF.C English version 1.0 > - * Based on Japanese version 29-NOV-1988 > - * LZSS coded by Haruhiko OKUMURA > - * Adaptive Huffman Coding coded by Haruyasu YOSHIZAKI > - * Edited and translated to English by Kenji RIKITAKE > */ > > #ifdef HAVE_CONFIG_H > @@ -81,96 +71,210 @@ typedef enum { > NTFS_SB_IS_COMPRESSED = 0x8000, > } ntfs_compression_constants; > > +/* Match length at or above which ntfs_best_match() will stop searching for > + * longer matches. */ > +#define NICE_MATCH_LEN 16 > + > +/* Maximum length at which a lazy match will be attempted. */ > +#define MAX_LAZY_MATCH_LEN 20 > + > +/* Maximum number of potential matches that ntfs_best_match() will consider > at > + * each position. */ > +#define MAX_SEARCH_DEPTH 24 > + > +/* Number of entries in the hash table for match-finding. This can be > changed, > + * but it should be a power of 2 so that computing the hash bucket is fast. > */ > +#define HASH_LEN (1 << 14) > + > struct COMPRESS_CONTEXT { > const unsigned char *inbuf; > int bufsize; > int size; > int rel; > int mxsz; > - s16 head[256]; > - s16 lson[NTFS_SB_SIZE]; > - s16 rson[NTFS_SB_SIZE]; > + s16 head[HASH_LEN]; > + s16 prev[NTFS_SB_SIZE]; > } ; > > +#define CRC32_POLYNOMIAL 0xEDB88320 > + > +static u32 crc32_table[256]; > + > +static void do_crc32_init(void) > +{ > + int i, j; > + u32 r; > + > + for (i = 0; i < 256; i++) { > + r = i; > + for (j = 0; j < 8; j++) > + r = (r >> 1) ^ (CRC32_POLYNOMIAL & ~((r & 1) - 1)); > + crc32_table[i] = r; > + } > +} > + > +/* > + * Initialize the CRC32 table for ntfs_hash() if not done already > + */ > +static void crc32_init(void) > +{ > + static int done = 0; > + > + if (!done) { > + do_crc32_init(); > + done = 1; > + } > +} > + > +/* > + * Hash the next 3-byte sequence in the input buffer > + * > + * Currently, we use a hash function similar to that used in LZMA. It > + * takes slightly longer to compute than zlib's hash function, but it > + * provides better collision resistance. 
> + */ > +static inline unsigned int ntfs_hash(const u8 *p) > +{ > + unsigned int hash = 0; > + > + hash ^= crc32_table[p[0]]; > + hash ^= p[1]; > + hash ^= (unsigned int)p[2] << 6; > + > + return hash % HASH_LEN; > +} > + > /* > * Search for the longest sequence matching current position > * > - * A binary tree is maintained to locate all previously met sequences, > - * and this function has to be called for all of them. > + * A hash table, each entry of which points to a chain of sequence > + * positions sharing the corresponding hash code, is maintained to speed up > + * searching for matches. To maintain the hash table, either > + * ntfs_best_match() or ntfs_skip_position() has to be called for each > + * consecutive position. > + * > + * This function is heavily used; it has to be optimized carefully. > + * > + * This function sets pctx->size and pctx->rel to the length and offset, > + * respectively, of the longest match found. > * > - * This function is heavily used, it has to be optimized carefully > + * The minimum match length is assumed to be 3, and the maximum match > + * length is assumed to be pctx->mxsz. If this function produces > + * pctx->size < 3, then no match was found. > * > - * Returns the size of the longest match, > - * zero if no match is found. > + * Note: for the following reasons, this function is not guaranteed to find > + * *the* longest match up to pctx->mxsz: > + * > + * (1) If this function finds a match of NICE_MATCH_LEN bytes or greater, > + * it ends early because a match this long is good enough and it's not > + * worth spending more time searching. > + * > + * (2) If this function considers MAX_SEARCH_DEPTH matches with a single > + * position, it ends early and returns the longest match found so far. > + * This saves a lot of time on degenerate inputs. 
> */ > - > -static int ntfs_best_match(struct COMPRESS_CONTEXT *pctx, int i) > +static void ntfs_best_match(struct COMPRESS_CONTEXT *pctx, const int i) > { > - s16 *prev; > - int node; > - register long j; > - long maxpos; > - long startj; > - long bestj; > - int bufsize; > - int bestnode; > - register const unsigned char *p1,*p2; > - > - p1 = pctx->inbuf; > - node = pctx->head[p1[i] & 255]; > - if (node >= 0) { > - /* search the best match at current position */ > - bestnode = node; > - bufsize = pctx->bufsize; > - /* restrict matches to the longest allowed sequence */ > - maxpos = bufsize; > - if ((i + pctx->mxsz) < maxpos) > - maxpos = i + pctx->mxsz; > - startj = i + 1 - maxpos; > - bestj = startj; > - /* make indexes relative to end of allowed position */ > - p1 = &p1[maxpos]; > - if (startj < 0) { > - do { > - /* indexes are negative */ > - p2 = &p1[node - i]; > - /* no need to compare the first byte */ > - j = startj; > - /* the second byte cannot lead to useful compression */ > - if (p1[j] == p2[j]) { > - j++; > - if (j < 0) { > - do { > - } while ((p1[j] == p2[j]) > - && (++j < 0)); > - } > - /* remember the match, if better */ > - if (j > bestj) { > - bestj = j; > - bestnode = node; > - } > + const u8 * const inbuf = pctx->inbuf; > + const u8 * const strptr = &inbuf[i]; /* String we're matching against */ > + const s16 * const prev = pctx->prev; > + const int max_len = min(pctx->bufsize - i, pctx->mxsz); > + const int nice_len = min(NICE_MATCH_LEN, max_len); > + int depth_remaining = MAX_SEARCH_DEPTH; > + int best_len = 2; > + const u8 *best_matchptr = strptr; > + unsigned int hash; > + s16 cur_match; > + const u8 *matchptr; > + int len; > + > + if (max_len < 3) > + goto out; > + > + /* Insert the current sequence into the appropriate hash chain. */ > + > + hash = ntfs_hash(strptr); > + cur_match = pctx->head[hash]; > + pctx->prev[i] = cur_match; > + pctx->head[hash] = i; > + > + /* Search the appropriate hash chain for matches. 
*/ > + > + for (; cur_match >= 0 && depth_remaining--; cur_match = > prev[cur_match]) { > + > + matchptr = &inbuf[cur_match]; > + > + /* Considering the potential match at 'matchptr': is it longer > + * than 'best_len'? > + * > + * The bytes at index 'best_len' are the most likely to differ, > + * so check them first. > + * > + * The bytes at indices 'best_len - 1' and '0' are less > + * important to check separately. But doing so still gives a > + * slight performance improvement, at least on x86_64, probably > + * because they create separate branches for the CPU to predict > + * independently of the branches in the main comparison loops. > + */ > + if (matchptr[best_len] != strptr[best_len] || > + matchptr[best_len - 1] != strptr[best_len - 1] || > + matchptr[0] != strptr[0]) > + goto next_match; > + > + for (len = 1; len < best_len - 1; len++) > + if (matchptr[len] != strptr[len]) > + goto next_match; > + > + /* The match is the longest found so far --- > + * at least 'best_len' + 1 bytes. Continue extending it. */ > + > + best_matchptr = matchptr; > + > + do { > + if (++best_len == nice_len) { > + /* 'nice_len' reached; don't waste time > + * searching for longer matches. Extend the > + * match as far as possible and terminate the > + * search. 
*/ > + while (best_len < max_len && > + best_matchptr[best_len] == > strptr[best_len]) > + { > + best_len++; > } > - /* walk in the tree in the right direction */ > - if ((j < 0) && (p1[j] < p2[j])) > - prev = &pctx->lson[node]; > - else > - prev = &pctx->rson[node]; > - node = *prev; > - /* stop if reaching a leaf or maximum length */ > - } while ((node >= 0) && (j < 0)); > - /* put the node into the tree if we reached a leaf */ > - if (node < 0) > - *prev = i; > - } > - /* done, return the best match */ > - pctx->size = bestj + maxpos - i; > - pctx->rel = bestnode - i; > - } else { > - pctx->head[p1[i] & 255] = i; > - pctx->size = 0; > - pctx->rel = 0; > + goto out; > + } > + } while (best_matchptr[best_len] == strptr[best_len]); > + > + /* Found a longer match, but 'nice_len' not yet reached. */ > + > + next_match: > + /* Continue to next match in the chain. */ > + ; > } > - return (pctx->size); > + > + /* Reached end of chain, or ended early due to reaching the maximum > + * search depth. */ > + > +out: > + /* Return the longest match we were able to find. */ > + pctx->size = best_len; > + pctx->rel = best_matchptr - strptr; /* given as a negative number! */ > +} > + > +/* > + * Advance the match-finder, but don't search for matches. > + */ > +static void ntfs_skip_position(struct COMPRESS_CONTEXT *pctx, const int i) > +{ > + unsigned int hash; > + > + if (pctx->bufsize - i < 3) > + return; > + > + /* Insert the current sequence into the appropriate hash chain. */ > + hash = ntfs_hash(pctx->inbuf + i); > + pctx->prev[i] = pctx->head[hash]; > + pctx->head[hash] = i; > } > > /* > @@ -188,7 +292,7 @@ static int ntfs_best_match(struct COMPRESS_CONTEXT *pctx, > int i) > * 0 if an error has been met. 
> */ > > -static unsigned int ntfs_compress_block(const char *inbuf, int bufsize, > +static unsigned int ntfs_compress_block(const char *inbuf, const int bufsize, > char *outbuf) > { > struct COMPRESS_CONTEXT *pctx; > @@ -196,109 +300,169 @@ static unsigned int ntfs_compress_block(const char > *inbuf, int bufsize, > int j; /* end of best match from current position */ > int k; /* end of best match from next position */ > int offs; /* offset to best match */ > - int n; > int bp; /* bits to store offset */ > + int bp_cur; /* saved bits to store offset at current position */ > int mxoff; /* max match offset : 1 << bp */ > - int mxsz2; > unsigned int xout; > unsigned int q; /* aggregated offset and size */ > - int done; > + int have_match; /* do we have a match at the current position? */ > char *ptag; /* location reserved for a tag */ > int tag; /* current value of tag */ > int ntag; /* count of bits still undefined in tag */ > > pctx = (struct COMPRESS_CONTEXT*)ntfs_malloc(sizeof(struct > COMPRESS_CONTEXT)); > - if (pctx) { > - for (n=0; n<NTFS_SB_SIZE; n++) > - pctx->lson[n] = pctx->rson[n] = -1; > - for (n=0; n<256; n++) > - pctx->head[n] = -1; > - pctx->inbuf = (const unsigned char*)inbuf; > - pctx->bufsize = bufsize; > - xout = 2; > - n = 0; > - i = 0; > - bp = 4; > - mxoff = 1 << bp; > - pctx->mxsz = (1 << (16 - bp)) + 2; > - tag = 0; > - done = -1; > - ntag = 8; > - ptag = &outbuf[xout++]; > - while ((i < bufsize) && (xout < (NTFS_SB_SIZE + 2))) { > - /* adjust the longest match we can output */ > + if (!pctx) { > + errno = ENOMEM; > + return 0; > + } > + > + /* All hash chains start as empty. The special value '-1' indicates the > + * end of each hash chain. 
*/ > + memset(pctx->head, 0xFF, sizeof(pctx->head)); > + > + crc32_init(); > + > + pctx->inbuf = (const unsigned char*)inbuf; > + pctx->bufsize = bufsize; > + xout = 2; > + i = 0; > + bp = 4; > + mxoff = 1 << bp; > + pctx->mxsz = (1 << (16 - bp)) + 2; > + have_match = 0; > + tag = 0; > + ntag = 8; > + ptag = &outbuf[xout++]; > + > + while ((i < bufsize) && (xout < (NTFS_SB_SIZE + 2))) { > + > + /* This implementation uses "lazy" parsing: it always chooses > + * the longest match, unless the match at the next position is > + * longer. This is the same strategy used by the high > + * compression modes of zlib. */ > + > + if (!have_match) { > + /* Find the longest match at the current position. But > + * first adjust the maximum match length if needed. > + * (This loop might need to run more than one time in > + * the case that we just output a long match.) */ > while (mxoff < i) { > bp++; > mxoff <<= 1; > pctx->mxsz = (pctx->mxsz + 2) >> 1; > } > - /* search the best match at current position */ > - if (done < i) > - do { > - ntfs_best_match(pctx,++done); > - } while (done < i); > + ntfs_best_match(pctx, i); > + } > + > + if (pctx->size >= 3) { > + > + /* Found a match at the current position. */ > + > j = i + pctx->size; > - if ((j - i) > pctx->mxsz) > - j = i + pctx->mxsz; > - > - if ((j - i) > 2) { > - offs = pctx->rel; > - /* check whether there is a better run at i+1 */ > - ntfs_best_match(pctx,i+1); > - done = i+1; > + bp_cur = bp; > + offs = pctx->rel; > + > + if (pctx->size > MAX_LAZY_MATCH_LEN) { > + > + /* Choose long matches immediately. */ > + > + q = (~offs << (16 - bp_cur)) + (j - i - 3); > + outbuf[xout++] = q & 255; > + outbuf[xout++] = (q >> 8) & 255; > + tag |= (1 << (8 - ntag)); > + > + if (j == bufsize) { > + /* Shortcut if the match extends to the > + * end of the buffer. 
*/ > + i = j; > + --ntag; > + break; > + } > + i += 1; > + do { > + ntfs_skip_position(pctx, i); > + } while (++i != j); > + have_match = 0; > + } else { > + /* Check for a longer match at the next > + * position. */ > + > + /* Doesn't need to be while() since we just > + * adjusted the maximum match length at the > + * previous position. */ > + if (mxoff < i + 1) { > + bp++; > + mxoff <<= 1; > + pctx->mxsz = (pctx->mxsz + 2) >> 1; > + } > + ntfs_best_match(pctx, i + 1); > k = i + 1 + pctx->size; > - mxsz2 = pctx->mxsz; > - if (mxoff <= i) > - mxsz2 = (pctx->mxsz + 2) >> 1; > - if ((k - i) > mxsz2) > - k = i + mxsz2; > + > if (k > (j + 1)) { > - /* issue a single byte */ > - outbuf[xout++] = inbuf[i]; > - i++; > + /* Next match is longer. > + * Output a literal. */ > + outbuf[xout++] = inbuf[i++]; > + have_match = 1; > } else { > - q = (~offs << (16 - bp)) > - + (j - i - 3); > + /* Next match isn't longer. > + * Output the current match. */ > + q = (~offs << (16 - bp_cur)) + (j - i - > 3); > outbuf[xout++] = q & 255; > outbuf[xout++] = (q >> 8) & 255; > tag |= (1 << (8 - ntag)); > - i = j; > + > + /* The minimum match length is 3, and > + * we've run two bytes through the > + * matchfinder already. So the minimum > + * number of positions we need to skip > + * is 1. */ > + i += 2; > + do { > + ntfs_skip_position(pctx, i); > + } while (++i != j); > + have_match = 0; > } > - } else { > - outbuf[xout++] = inbuf[i]; > - i++; > - } > - /* store the tag if fully used */ > - if (!--ntag) { > - *ptag = tag; > - ntag = 8; > - ptag = &outbuf[xout++]; > - tag = 0; > } > + } else { > + /* No match at current position. Output a literal. */ > + outbuf[xout++] = inbuf[i++]; > + have_match = 0; > } > - /* store the last tag, if partially used */ > - if (ntag == 8) > - xout--; > - else > + > + /* Store the tag if fully used. 
*/ > + if (!--ntag) { > *ptag = tag; > - /* uncompressed must be full size, accept if better */ > - if ((i >= bufsize) && (xout < (NTFS_SB_SIZE + 2))) { > - outbuf[0] = (xout - 3) & 255; > - outbuf[1] = 0xb0 + (((xout - 3) >> 8) & 15); > - } else { > - memcpy(&outbuf[2],inbuf,bufsize); > - if (bufsize < NTFS_SB_SIZE) > - memset(&outbuf[bufsize+2], 0, > - NTFS_SB_SIZE - bufsize); > - outbuf[0] = 0xff; > - outbuf[1] = 0x3f; > - xout = NTFS_SB_SIZE + 2; > + ntag = 8; > + ptag = &outbuf[xout++]; > + tag = 0; > } > - free(pctx); > + } > + > + /* Store the last tag if partially used. */ > + if (ntag == 8) > + xout--; > + else > + *ptag = tag; > + > + /* Determine whether to store the data compressed or uncompressed. */ > + > + if ((i >= bufsize) && (xout < (NTFS_SB_SIZE + 2))) { > + /* Compressed. */ > + outbuf[0] = (xout - 3) & 255; > + outbuf[1] = 0xb0 + (((xout - 3) >> 8) & 15); > } else { > - xout = 0; > - errno = ENOMEM; > + /* Uncompressed. */ > + memcpy(&outbuf[2], inbuf, bufsize); > + if (bufsize < NTFS_SB_SIZE) > + memset(&outbuf[bufsize + 2], 0, NTFS_SB_SIZE - bufsize); > + outbuf[0] = 0xff; > + outbuf[1] = 0x3f; > + xout = NTFS_SB_SIZE + 2; > } > + > + /* Free the compression context and return the total number of bytes > + * written to 'outbuf'. */ > + free(pctx); > return (xout); > } > ------------------------------------------------------------------------------ _______________________________________________ ntfs-3g-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/ntfs-3g-devel
