groupcompress_rabin

John Arbash Meinel Mon, 02 Mar 2009 10:53:08 -0800

At http://bzr.arbash-meinel.com/plugins/groupcompress_rabin


------------------------------------------------------------
revno: 72
revision-id: [email protected]
parent: [email protected]
committer: John Arbash Meinel <[email protected]>
branch nick: groupcompress_rabin
timestamp: Mon 2009-03-02 12:52:36 -0600
message:
  Change the code so that we can pass in multiple sources to match against.
  At the moment, we only use a single source, but that will soon change.

=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx    2009-03-02 18:04:20 +0000
+++ b/_groupcompress_pyx.pyx    2009-03-02 18:52:36 +0000
@@ -24,18 +24,17 @@
     void memcpy(void *, void *, size_t)
 
 cdef extern from "delta.h":
-    void * diff_delta(void *src_buf, unsigned long src_bufsize,
-           void *trg_buf, unsigned long trg_bufsize,
-           unsigned long *delta_size, unsigned long max_delta_size)
     struct delta_index:
         unsigned long memsize
         void *src_buf
         unsigned long src_size
         unsigned int hash_mask
         # struct index_entry *hash[]
-    delta_index * create_delta_index(void *buf, unsigned long bufsize)
+    delta_index * create_delta_index(void *buf, unsigned long bufsize, unsigned
+                                     long agg_src_offset)
     void free_delta_index(delta_index *index)
-    void * create_delta(delta_index *index,
+    void *create_delta(delta_index **indexes,
+             unsigned int num_indexes,
              void *buf, unsigned long bufsize,
              unsigned long *delta_size, unsigned long max_delta_size)
     unsigned long get_delta_hdr_size(unsigned char **datap,
@@ -112,7 +111,7 @@
         #       fit just fine into the structure. But for now, we just wrap
         #       create_delta_index (For example, we could always reserve enough
         #       space to hash a 4MB string, etc.)
-        self._index = create_delta_index(c_source, c_source_size)
+        self._index = create_delta_index(c_source, c_source_size, 0)
         # TODO: Handle if _index == NULL
 
     cdef _ensure_no_index(self):
@@ -142,7 +141,7 @@
         # TODO: inline some of create_delta so we at least don't have to double
         #       malloc, and can instead use PyString_FromStringAndSize, to
         #       allocate the bytes into the final string
-        delta = create_delta(self._index, target, target_size,
+        delta = create_delta(&self._index, 1, target, target_size,
                              &delta_size, max_delta_size)
         result = None
         if delta:
@@ -175,9 +174,9 @@
     target_size = PyString_GET_SIZE(target_bytes)
 
     result = None
-    index = create_delta_index(source, source_size)
+    index = create_delta_index(source, source_size, 0)
     if index != NULL:
-        delta = create_delta(index, target, target_size,
+        delta = create_delta(&index, 1, target, target_size,
                              &delta_size, max_delta_size)
         free_delta_index(index);
         if delta:

=== modified file 'delta.h'
--- a/delta.h   2009-02-27 17:32:04 +0000
+++ b/delta.h   2009-03-02 18:52:36 +0000
@@ -16,7 +16,8 @@
  * using free_delta_index().
  */
 extern struct delta_index *
-create_delta_index(const void *buf, unsigned long bufsize);
+create_delta_index(const void *buf, unsigned long bufsize,
+                   unsigned long agg_src_offset);
 
 /*
  * free_delta_index: free the index created by create_delta_index()
@@ -43,7 +44,8 @@
  * must be freed by the caller.
  */
 extern void *
-create_delta(const struct delta_index *index,
+create_delta(struct delta_index **indexes,
+         unsigned int num_indexes,
             const void *buf, unsigned long bufsize,
             unsigned long *delta_size, unsigned long max_delta_size);
 
@@ -60,9 +62,9 @@
           const void *trg_buf, unsigned long trg_bufsize,
           unsigned long *delta_size, unsigned long max_delta_size)
 {
-       struct delta_index *index = create_delta_index(src_buf, src_bufsize);
+       struct delta_index *index = create_delta_index(src_buf, src_bufsize, 0);
        if (index) {
-               void *delta = create_delta(index, trg_buf, trg_bufsize,
+               void *delta = create_delta(&index, 1, trg_buf, trg_bufsize,
                                           delta_size, max_delta_size);
                free_delta_index(index);
                return delta;

=== modified file 'diff-delta.c'
--- a/diff-delta.c      2009-02-27 17:32:04 +0000
+++ b/diff-delta.c      2009-03-02 18:52:36 +0000
@@ -123,14 +123,18 @@
 };
 
 struct delta_index {
-       unsigned long memsize;
-       const void *src_buf;
-       unsigned long src_size;
-       unsigned int hash_mask;
+       unsigned long memsize; /* Total bytes pointed to by this index */
+       const void *src_buf; /* Pointer to the beginning of source data */
+       unsigned long src_size; /* Total length of source data */
+       unsigned long agg_src_offset; /* Start of source data as part of the
+                                                                        
aggregate source */
+       unsigned int hash_mask; /* val & hash_mask gives the hash index for a 
given
+                                                          entry */
        struct index_entry *hash[];
 };
 
-struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
+struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
+                                                                               
unsigned long agg_src_offset)
 {
        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
        const unsigned char *data, *buffer = buf;
@@ -174,8 +178,8 @@
        /* then populate the index */
        prev_val = ~0;
        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
-            data >= buffer;
-            data -= RABIN_WINDOW) {
+                data >= buffer;
+                data -= RABIN_WINDOW) {
                unsigned int val = 0;
                for (i = 1; i <= RABIN_WINDOW; i++)
                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
@@ -196,7 +200,7 @@
 
        /*
         * Determine a limit on the number of entries in the same hash
-        * bucket.  This guards us against pathological data sets causing
+        * bucket.      This guards us against pathological data sets causing
         * really bad hash distribution with most entries in the same hash
         * bucket that would bring us to O(m*n) computing costs (m and n
         * corresponding to reference and target buffer sizes).
@@ -221,7 +225,7 @@
                /*
                 * Assume that this loop is gone through exactly
                 * HASH_LIMIT times and is entered and left with
-                * acc==0.  So the first statement in the loop
+                * acc==0.      So the first statement in the loop
                 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
                 * to the accumulator, and the inner loop consequently
                 * is run (hash_count[i]-HASH_LIMIT) times, removing
@@ -262,6 +266,7 @@
        index->memsize = memsize;
        index->src_buf = buf;
        index->src_size = bufsize;
+       index->agg_src_offset = agg_src_offset;
        index->hash_mask = hmask;
 
        mem = index->hash;
@@ -308,11 +313,13 @@
 #define MAX_OP_SIZE    (5 + 5 + 1 + RABIN_WINDOW + 7)
 
 void *
-create_delta(const struct delta_index *index,
-            const void *trg_buf, unsigned long trg_size,
-            unsigned long *delta_size, unsigned long max_size)
+create_delta(struct delta_index **indexes,
+                        unsigned int num_indexes,
+                        const void *trg_buf, unsigned long trg_size,
+                        unsigned long *delta_size, unsigned long max_size)
 {
-       unsigned int i, outpos, outsize, moff, msize, val;
+       unsigned int i, j, outpos, outsize, moff, msize, val;
+       const struct delta_index *index, *mindex;
        int inscnt;
        const unsigned char *ref_data, *ref_top, *data, *top;
        unsigned char *out;
@@ -329,7 +336,11 @@
                return NULL;
 
        /* store reference buffer size */
-       i = index->src_size;
+       i = 0;
+       for (j = 0; j < num_indexes; ++j) {
+               index = indexes[j];
+               i += index->src_size;
+       }
        while (i >= 0x80) {
                out[outpos++] = i | 0x80;
                i >>= 7;
@@ -344,55 +355,85 @@
        }
        out[outpos++] = i;
 
-       ref_data = index->src_buf;
-       ref_top = ref_data + index->src_size;
        data = trg_buf;
        top = (const unsigned char *) trg_buf + trg_size;
 
-       outpos++;
+       /* Start the matching by filling out with a simple 'insert' 
instruction, of
+        * the first RABIN_WINDOW bytes of the input.
+        */
+       outpos++; /* leave a byte for the insert command */
        val = 0;
        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
                out[outpos++] = *data;
                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
        }
+       /* we are now setup with an insert of 'i' bytes and val contains the 
RABIN
+        * hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
+        * input.
+        */
        inscnt = i;
 
        moff = 0;
        msize = 0;
+       mindex = NULL;
        while (data < top) {
                if (msize < 4096) {
+                       /* we don't have a 'worthy enough' match yet, so let's 
look for
+                        * one.
+                        */
                        struct index_entry *entry;
+                       /* Shift the window by one byte. */
                        val ^= U[data[-RABIN_WINDOW]];
                        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
-                       i = val & index->hash_mask;
-                       for (entry = index->hash[i]; entry < index->hash[i+1]; 
entry++) {
-                               const unsigned char *ref = entry->ptr;
-                               const unsigned char *src = data;
-                               unsigned int ref_size = ref_top - ref;
-                               if (entry->val != val)
-                                       continue;
-                               if (ref_size > top - src)
-                                       ref_size = top - src;
-                               if (ref_size <= msize)
-                                       break;
-                               while (ref_size-- && *src++ == *ref)
-                                       ref++;
-                               if (msize < ref - entry->ptr) {
-                                       /* this is our best match so far */
-                                       msize = ref - entry->ptr;
-                                       moff = entry->ptr - ref_data;
-                                       if (msize >= 4096) /* good enough */
+                       for (j = 0; j < num_indexes; ++j) {
+                               index = indexes[j];
+                               i = val & index->hash_mask;
+                               ref_top = index->src_buf + index->src_size;
+                               ref_data = index->src_buf;
+                               for (entry = index->hash[i]; entry < 
index->hash[i+1];
+                                        entry++) {
+                                       const unsigned char *ref = entry->ptr;
+                                       const unsigned char *src = data;
+                                       unsigned int ref_size = ref_top - ref;
+                                       if (entry->val != val)
+                                               continue;
+                                       /* ref_size is the longest possible 
match that we could make
+                                        * here. If ref_size <= msize, then we 
know that we cannot
+                                        * match more bytes with this location 
that we have already
+                                        * matched.
+                                        */
+                                       if (ref_size > top - src)
+                                               ref_size = top - src;
+                                       if (ref_size <= msize)
                                                break;
+                                       /* See how many bytes actually match at 
this location. */
+                                       while (ref_size-- && *src++ == *ref)
+                                               ref++;
+                                       if (msize < ref - entry->ptr) {
+                                               /* this is our best match so 
far */
+                                               msize = ref - entry->ptr;
+                                               moff = entry->ptr - ref_data;
+                                               mindex = index;
+                                               if (msize >= 4096) /* good 
enough */
+                                                       break;
+                                       }
                                }
                        }
                }
 
                if (msize < 4) {
+                       /* The best match right now is less than 4 bytes long. 
So just add
+                        * the current byte to the insert instruction. 
Increment the insert
+                        * counter, and copy the byte of data into the output 
buffer.
+                        */
                        if (!inscnt)
                                outpos++;
                        out[outpos++] = *data++;
                        inscnt++;
                        if (inscnt == 0x7f) {
+                               /* We have a max length insert instruction, 
finalize it in the
+                                * output.
+                                */
                                out[outpos - inscnt - 1] = inscnt;
                                inscnt = 0;
                        }
@@ -402,6 +443,7 @@
                        unsigned char *op;
 
                        if (inscnt) {
+                               ref_data = mindex->src_buf;
                                while (moff && ref_data[moff-1] == data[-1]) {
                                        /* we can match one byte back */
                                        msize++;
@@ -425,6 +467,10 @@
                        op = out + outpos++;
                        i = 0x80;
 
+                       /* so far, moff has been the offset in a single source, 
however,
+                        * now we encode it as the offset in the aggregate 
source
+                        */
+                       moff = moff + mindex->agg_src_offset;
                        if (moff & 0x000000ff)
                                out[outpos++] = moff >> 0,  i |= 0x01;
                        if (moff & 0x0000ff00)
@@ -450,7 +496,7 @@
                                val = 0;
                                for (j = -RABIN_WINDOW; j < 0; j++)
                                        val = ((val << 8) | data[j])
-                                             ^ T[val >> RABIN_SHIFT];
+                                                 ^ T[val >> RABIN_SHIFT];
                        }
                }
 
@@ -480,3 +526,5 @@
        *delta_size = outpos;
        return out;
 }
+
+/* vim: noet ts=4 sw=4 sts=4 */

-- 
bazaar-commits mailing list
[email protected]
https://lists.ubuntu.com/mailman/listinfo/bazaar-commits

Rev 72: Change the code so that we can pass in multiple sources to match against. in http://bzr.arbash-meinel.com/plugins/groupcompress_rabin

Reply via email to