At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin

------------------------------------------------------------
revno: 80
revision-id: [email protected]
parent: [email protected]
committer: John Arbash Meinel <[email protected]>
branch nick: rabin
timestamp: Mon 2009-03-02 15:02:23 -0600
message:
  Refactor the code a bit, so that I can re-use bits for a
  create_delta_index_from_delta.
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx    2009-03-02 19:36:29 +0000
+++ b/_groupcompress_pyx.pyx    2009-03-02 21:02:23 +0000
@@ -85,7 +85,7 @@
     cdef delta_index **_indexes
     cdef readonly unsigned int _num_indexes
     cdef readonly unsigned int _max_num_indexes
-    cdef readonly unsigned long _source_offset
+    cdef public unsigned long _source_offset
 
     def __repr__(self):
         return '%s(%d, %d, %d)' % (self.__class__.__name__,

=== modified file 'diff-delta.c'
--- a/diff-delta.c      2009-03-02 20:27:18 +0000
+++ b/diff-delta.c      2009-03-02 21:02:23 +0000
@@ -133,71 +133,13 @@
        struct index_entry *hash[];
 };
 
-struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
-                                        unsigned long agg_src_offset)
+static unsigned int
+limit_hash_buckets(struct unpacked_index_entry **hash,
+                                  unsigned int *hash_count, unsigned int hsize,
+                                  unsigned int entries)
 {
-       unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
-       const unsigned char *data, *buffer = buf;
-       struct delta_index *index;
-       struct unpacked_index_entry *entry, **hash;
-       struct index_entry *packed_entry, **packed_hash;
-       void *mem;
-       unsigned long memsize;
-
-       if (!buf || !bufsize)
-               return NULL;
-
-       /* Determine index hash size.  Note that indexing skips the
-          first byte to allow for optimizing the Rabin's polynomial
-          initialization in create_delta(). */
-       entries = (bufsize - 1)  / RABIN_WINDOW;
-       hsize = entries / 4;
-       for (i = 4; (1u << i) < hsize && i < 31; i++);
-       hsize = 1 << i;
-       hmask = hsize - 1;
-
-       /* allocate lookup index */
-       memsize = sizeof(*hash) * hsize +
-                 sizeof(*entry) * entries;
-       mem = malloc(memsize);
-       if (!mem)
-               return NULL;
-       hash = mem;
-       mem = hash + hsize;
-       entry = mem;
-
-       memset(hash, 0, hsize * sizeof(*hash));
-
-       /* allocate an array to count hash entries */
-       hash_count = calloc(hsize, sizeof(*hash_count));
-       if (!hash_count) {
-               free(hash);
-               return NULL;
-       }
-
-       /* then populate the index */
-       prev_val = ~0;
-       for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
-                data >= buffer;
-                data -= RABIN_WINDOW) {
-               unsigned int val = 0;
-               for (i = 1; i <= RABIN_WINDOW; i++)
-                       val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
-               if (val == prev_val) {
-                       /* keep the lowest of consecutive identical blocks */
-                       entry[-1].entry.ptr = data + RABIN_WINDOW;
-                       --entries;
-               } else {
-                       prev_val = val;
-                       i = val & hmask;
-                       entry->entry.ptr = data + RABIN_WINDOW;
-                       entry->entry.val = val;
-                       entry->next = hash[i];
-                       hash[i] = entry++;
-                       hash_count[i]++;
-               }
-       }
-
+       struct unpacked_index_entry *entry;
+       unsigned int i;
        /*
         * Determine a limit on the number of entries in the same hash
         * bucket.      This guards us against pathological data sets causing
@@ -247,8 +189,18 @@
                        entry = entry->next;
                } while (entry);
        }
-       free(hash_count);
+       return entries;
+}
 
+static struct delta_index *
+pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
+                                unsigned int entries)
+{
+       unsigned int i, memsize;
+       struct unpacked_index_entry *entry;
+       struct delta_index *index;
+       struct index_entry *packed_entry, **packed_hash;
+       void *mem;
        /*
         * Now create the packed index in array form
         * rather than linked lists.
@@ -258,16 +210,12 @@
                + sizeof(*packed_entry) * entries;
        mem = malloc(memsize);
        if (!mem) {
-               free(hash);
                return NULL;
        }
 
        index = mem;
        index->memsize = memsize;
-       index->src_buf = buf;
-       index->src_size = bufsize;
-       index->agg_src_offset = agg_src_offset;
-       index->hash_mask = hmask;
+       index->hash_mask = hsize - 1;
 
        mem = index->hash;
        packed_hash = mem;
@@ -288,8 +236,84 @@
        packed_hash[hsize] = packed_entry;
 
        assert(packed_entry - (struct index_entry *)mem == entries);
-       free(hash);
-
+       return index;
+}
+
+
+struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
+                                        unsigned long agg_src_offset)
+{
+       unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
+       const unsigned char *data, *buffer = buf;
+       struct delta_index *index;
+       struct unpacked_index_entry *entry, **hash;
+       void *mem;
+       unsigned long memsize;
+
+       if (!buf || !bufsize)
+               return NULL;
+
+       /* Determine index hash size.  Note that indexing skips the
+          first byte to allow for optimizing the Rabin's polynomial
+          initialization in create_delta(). */
+       entries = (bufsize - 1)  / RABIN_WINDOW;
+       hsize = entries / 4;
+       for (i = 4; (1u << i) < hsize && i < 31; i++);
+       hsize = 1 << i;
+       hmask = hsize - 1;
+
+       /* allocate lookup index */
+       memsize = sizeof(*hash) * hsize +
+                 sizeof(*entry) * entries;
+       mem = malloc(memsize);
+       if (!mem)
+               return NULL;
+       hash = mem;
+       mem = hash + hsize;
+       entry = mem;
+
+       memset(hash, 0, hsize * sizeof(*hash));
+
+       /* allocate an array to count hash entries */
+       hash_count = calloc(hsize, sizeof(*hash_count));
+       if (!hash_count) {
+               free(hash);
+               return NULL;
+       }
+
+       /* then populate the index */
+       prev_val = ~0;
+       for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+                data >= buffer;
+                data -= RABIN_WINDOW) {
+               unsigned int val = 0;
+               for (i = 1; i <= RABIN_WINDOW; i++)
+                       val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
+               if (val == prev_val) {
+                       /* keep the lowest of consecutive identical blocks */
+                       entry[-1].entry.ptr = data + RABIN_WINDOW;
+                       --entries;
+               } else {
+                       prev_val = val;
+                       i = val & hmask;
+                       entry->entry.ptr = data + RABIN_WINDOW;
+                       entry->entry.val = val;
+                       entry->next = hash[i];
+                       hash[i] = entry++;
+                       hash_count[i]++;
+               }
+       }
+
+       entries = limit_hash_buckets(hash, hash_count, hsize, entries);
+       free(hash_count);
+       index = pack_delta_index(hash, hsize, entries);
+       if (!index) {
+               free(hash);
+               return NULL;
+       }
+       index->src_buf = buf;
+       index->src_size = bufsize;
+       index->agg_src_offset = agg_src_offset;
        return index;
 }
 
@@ -326,6 +350,8 @@
 
        if (!trg_buf || !trg_size)
                return NULL;
+       if (num_indexes == 0)
+               return NULL;
 
        outpos = 0;
        outsize = 8192;
@@ -337,6 +363,7 @@
 
        /* store reference buffer size */
        i = 0;
+       index = indexes[0];
        for (j = 0; j < num_indexes; ++j) {
                index = indexes[j];
                i += index->src_size;

=== modified file 'groupcompress.py'
--- a/groupcompress.py  2009-03-02 20:16:09 +0000
+++ b/groupcompress.py  2009-03-02 21:02:23 +0000
@@ -178,6 +178,10 @@
                 new_chunks.insert(0, 'fulltext\n')
                 new_chunks.append('len:%s\n' % (input_len,))
             unadded_bytes = sum(map(len, new_chunks))
+            deltas_unadded = (self.endpoint - self._delta_index._source_offset)
+            if deltas_unadded != 0:
+                import pdb; pdb.set_trace()
+            unadded_bytes += deltas_unadded
             self._delta_index.add_source(target_text, unadded_bytes)
             new_chunks.append(target_text)
         else:
@@ -186,9 +190,10 @@
             else:
                 new_chunks.insert(0, 'delta\n')
                 new_chunks.append('len:%s\n' % (len(delta),))
-            unadded_bytes = sum(map(len, new_chunks))
-            self._delta_index.add_source(delta, unadded_bytes)
+            # self._delta_index.add_source(delta, unadded_bytes)
             new_chunks.append(delta)
+            unadded_bytes = sum(map(len, new_chunks))
+            self._delta_index._source_offset += unadded_bytes
         delta_start = (self.endpoint, len(self.lines))
         self.output_chunks(new_chunks)
         self.input_bytes += input_len

-- 
bazaar-commits mailing list
[email protected]
https://lists.ubuntu.com/mailman/listinfo/bazaar-commits

Reply via email to