The following patch seems to fix the zlib problems. At least, I can
log in and cat reasonably large files. do_zlib lost some data, due to
a bad stop condition. Thanks to Markus for spotting this. I also fixed
a bug in the packet length estimator.

/Niels

diff -u -a -r1.22 zlib.c
--- zlib.c      2000/01/09 18:15:56     1.22
+++ zlib.c      2000/01/18 21:38:55
@@ -91,13 +91,15 @@
 }
 
 /* Estimates of the resulting packet sizes. We use fixnum arithmetic,
- * with one represented as 1<<10=1024. Only rates between 1/8 and 8
- * are used. */
+ * with one represented as 1<<10=1024. Only rates between 1/16 and 16
+ * are used. This may be a little too conservative; I have observed
+ * compression ratios of about 50. */
  
 #define RATE_UNIT 1024
-#define RATE_MAX (RATE_UNIT * 8)
-#define RATE_MIN (RATE_UNIT / 8)
+#define RATE_MAX (RATE_UNIT * 16)
+#define RATE_MIN (RATE_UNIT / 16)
 #define MARGIN 200
+#define INSIGNIFICANT 100
 
 static UINT32 estimate_size(UINT32 rate, UINT32 input, UINT32 max)
 {
@@ -108,17 +110,28 @@
 /* Assumes that input is nonzero */
 static UINT32 estimate_update(UINT32 rate, UINT32 input, UINT32 output)
 {
-  UINT32 estimate = output * rate / input;
-
-  if (estimate > RATE_MAX)
-    return RATE_MAX;
-  
   /* Decay old estimate */
   rate = rate * 15 / 16;
 
-  /* Follow the "envelope" */
-  rate = MAX(estimate, rate);
+  /* FIXME: Following the envelope is suboptimal for small inputs. We
+   * do it only for input packets of reasonable size. This method
+   * could be improved.
+   *
+   * Perhaps a linear combination k * rate + (1-k) estimate, where k
+   * depends on the size of the sample (i.e. input) would make sense?
+   * Or use different rate estimates for different lengths? */
+  
+  if (input > INSIGNIFICANT)
+    {
+      UINT32 estimate = output * RATE_UNIT / input;
+
+      if (estimate > RATE_MAX)
+       return RATE_MAX;
 
+      /* Follow the "envelope" */
+      rate = MAX(estimate, rate);
+    }
+  
   return MAX(rate, RATE_MIN);
 }
 
@@ -129,19 +142,30 @@
 {
   CAST(zlib_instance, self, c);
   struct string_buffer buffer;
-  UINT32 limit = self->max;
+  
+  /* LIMIT keeps track of the amount of storage we may still need to
+   * allocate. To detect that a packet grows unexpectedly large, we
+   * need a little extra buffer space beyond the maximum size. */
+  UINT32 limit = self->max + 1;
+
+  UINT32 estimate;
+  
+  debug("do_zlib: length in: %i\n", packet->length);
   
   if (!packet->length)
     {
       werror("do_zlib_deflate: Compressing empty packet.\n");
       return free ? packet : lsh_string_dup(packet);
     }
-  
+
+  estimate = estimate_size(self->rate, packet->length, self->max);
+  debug("do_zlib: estimate:  %i\n", estimate);
+
   string_buffer_init(&buffer, 
                     estimate_size(self->rate, packet->length, self->max));
 
   limit -= buffer.partial->length;
-  
+
   self->z.next_in = packet->data;
   self->z.avail_in = packet->length;
 
@@ -149,8 +173,6 @@
     {
       int rc;
       
-      assert(self->z.avail_in);
-      
       self->z.next_out = buffer.current;
       self->z.avail_out = buffer.left;
 
@@ -166,7 +188,32 @@
          return NULL;
        }
 
-      if (!self->z.avail_in)
+      /* NOTE: It's not enough to check that avail_in is zero to
+       * determine that all data have been flushed. avail_in == 0 and
+       * avail_out > 0 implies that all data has been flushed, but if
+       * avail_in == avail_out == 0, we have to allocate more output
+       * space. */
+        
+      if (!self->z.avail_in && !self->z.avail_out)
+       verbose("do_zlib: Both avail_in and avail_out are zero.\n");
+      
+      if (!self->z.avail_out)
+       { /* All output space consumed */  
+         if (!limit)
+           {
+             werror("do_zlib_deflate: Packet grew too large!\n");
+             if (free)
+               lsh_string_free(packet);
+
+             string_buffer_clear(&buffer);
+             return NULL;
+           }
+
+         /* Grow to about double size. */
+         string_buffer_grow(&buffer, MIN(limit, buffer.partial->length + buffer.total 
++ 100));
+         limit -= buffer.partial->length;
+       }
+      else if (!self->z.avail_in)
        { /* Compressed entire packet */
          UINT32 input = packet->length;
 
@@ -176,26 +223,17 @@
          packet =
            string_buffer_final(&buffer, self->z.avail_out);
 
-         self->rate = estimate_update(self->rate, input, packet->length);
+         assert(packet->length <= self->max);
 
-         return packet;
-       }
-      else
-       { /* All output space consumed */
-         assert(!self->z.avail_out);
-         
-         if (!limit)
-           {
-             werror("do_zlib_deflate: Packet grew too large!\n");
-             if (free)
-               lsh_string_free(packet);
+         debug("do_zlib: length out: %i\n", packet->length);
 
-             string_buffer_clear(&buffer);
-             return NULL;
-           }
+         if (packet->length > estimate)
+           verbose("do_zlib: Estimated size exceeded: input = %i, estimate = %i, 
+output = %i\n",
+                   input, estimate, packet->length);
+         
+         self->rate = estimate_update(self->rate, input, packet->length);
 
-         string_buffer_grow(&buffer, MIN(limit, packet->length + 100));
-         limit -= buffer.partial->length;
+         return packet;
        }
     }
 }

Reply via email to