-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi,

I ended up with this encoder structure:

/**************** code fragment ***************************/
while (!pdf_buffer_eob_p (in) && !pdf_buffer_full_p (out))
  {
    /* accumulate input data */
    st->string.suffix = in->data [in->rp++];
    if (!lzw_dict_find (&st->dict, &st->string))
      {
        /* emit output code of longest input sequence in dict */
        lzw_buffer_put_code (&st->buffer, st->string.prefix);

        /* is current bitsize enough? */
        if (st->dict.size - 1 == st->buffer.maxval - st->early_change)
          {
            if (!lzw_buffer_inc_bitsize (&st->buffer))
              {
                lzw_buffer_put_code (&st->buffer, LZW_RESET_CODE);
                lzw_buffer_set_bitsize (&st->buffer, LZW_MIN_BITSIZE);
                lzw_dict_reset (&st->dict);
                dict_reset = PDF_TRUE;
              }
          }
        /* create new dict entry */
        if(!dict_reset)
          lzw_dict_create_entry (&st->dict, &st->string);
        else
          dict_reset = PDF_FALSE;

        st->string.prefix = st->string.suffix
      }
  }
/**************** code fragment ***************************/

Bitsize increase should happen as late as possible (as mentioned in
en.wikipedia.org). This is why it is tested right before creating a
new table entry.

Unit test results are positive for this code (see attached patch).

If you agree with this encoder version I will refactor the decoder in
the same way.

Regards,
Georg

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.17 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iEYEARECAAYFAk7XhEUACgkQ5sLITM1qIaINcACePiqwSZ1iodSIYmuCJBtLDaAK
bsMAoIhZHj6I+IItrmCugFfj/ksBWUcy
=HQFB
-----END PGP SIGNATURE-----
# Bazaar merge directive format 2 (Bazaar 0.90)
# revision_id: georg.gottleu...@uni-ulm.de-20111201133158-\
#   a122qhnp4qq5qhpj
# target_branch: bzr://bzr.savannah.gnu.org/pdf/libgnupdf/trunk/
# testament_sha1: 8476063b76ac6a4883b90928439b5a594f277b63
# timestamp: 2011-12-01 14:32:28 +0100
# base_revision_id: jema...@gnu.org-20111011211752-3goterimb0gpbqq7
# 
# Begin patch
=== modified file 'AUTHORS'
--- AUTHORS	2011-08-04 13:49:13 +0000
+++ AUTHORS	2011-12-01 13:07:45 +0000
@@ -33,7 +33,8 @@
 
 Franck Lesage: changed pdf-hash-helper.c pdf-hash-helper.h gnupdf.texi
 
-Georg Gottleuber: changed pdf-filter.c pdf-stm-f-pred.c pdf-stm-f-pred.h
+Georg Gottleuber: changed pdf-filter.c pdf-stm-f-pred.c pdf-stm-f-pred.h 
+  src/base/pdf-stm-f-lzw.c torture/unit/base/stm/pdf-stm-rw-filter-lzw.c 
 
 Gerardo E. Gidoni: changed gnupdf.texi pdf-stm-read.c gnupdf-tsd.texi
   pdf-stm-write.c configure.ac pdf-filter.c pdf-stm-f-flate.h

=== modified file 'ChangeLog'
--- ChangeLog	2011-10-11 21:17:52 +0000
+++ ChangeLog	2011-12-01 13:31:58 +0000
@@ -1,3 +1,14 @@
+2011-12-01  Georg Gottleuber <georg.gottleu...@uni-ulm.de>
+
+	base,stm,torture: refactored encoder of lzw filter
+	added testfiles and unit test for lzw filter
+	* src/base/pdf-stm-f-lzw.c: some bug fixes	
+	* torture/testdata/TD00006: new file
+	* torture/testdata/TD00007: new file
+	* torture/testdata/TD00008: new file
+	* torture/testdata/TD00009: new file
+	* torture/unit/base/stm/pdf-stm-rw-filter-lzw.c: new file
+
 2011-10-11  Jose E. Marchesi  <jema...@gnu.org>
 
 	prmgt: converters to html reverted to a previous revision.

=== modified file 'src/base/pdf-stm-f-lzw.c'
--- src/base/pdf-stm-f-lzw.c	2011-10-04 16:22:05 +0000
+++ src/base/pdf-stm-f-lzw.c	2011-12-01 13:31:58 +0000
@@ -236,6 +236,102 @@
 }
 
 static pdf_bool_t
+lzw_dict_find (struct lzw_dict_s   *d,
+              struct lzw_string_s *s)
+{
+  lzw_code_t index;
+
+  if (s->prefix == LZW_NULL_INDEX)
+    {
+      s->prefix = s->suffix;
+      return PDF_TRUE; /* The string is a basic character, found! */
+    }
+
+  index = d->table[s->prefix].first;
+  if (index == LZW_NULL_INDEX)
+    {
+      return PDF_FALSE; /* first string with this prefix */
+    }
+
+  /* search the binary ordered tree of suffixes */
+  while (PDF_TRUE)
+    {
+      if (s->suffix == d->table[index].suffix)
+        {
+          s->prefix = index;
+          return PDF_TRUE; /* The string is in the table, found! */
+        }
+      else
+        {
+          if (s->suffix < d->table[index].suffix)
+            {
+              if (d->table[index].left == LZW_NULL_INDEX)
+                return PDF_FALSE;
+              else
+                index = d->table[index].left;
+            }
+          else
+            {
+              if (d->table[index].right == LZW_NULL_INDEX)
+                return PDF_FALSE;
+              else
+                index = d->table[index].right;
+            }
+        }
+    }
+  /* never reached */
+  return PDF_FALSE;
+}
+
+static pdf_bool_t
+lzw_dict_create_entry (struct lzw_dict_s *d, struct lzw_string_s *s)
+{
+  lzw_code_t index;
+  pdf_bool_t must_add;
+
+  index = d->table[s->prefix].first;
+
+  if (index == LZW_NULL_INDEX)
+    {
+      d->table[s->prefix].first = d->size;
+    }
+  else
+    {
+      must_add = PDF_FALSE;
+      while (!must_add)
+        {
+          if (s->suffix < d->table[index].suffix)
+            {
+              if (d->table[index].left == LZW_NULL_INDEX)
+                {
+                  d->table[index].left = d->size;
+                  must_add = PDF_TRUE;
+                }
+              else
+                {
+                  index = d->table[index].left;
+                }
+            }
+          else
+            {
+              if (d->table[index].right == LZW_NULL_INDEX)
+                {
+                  d->table[index].right = d->size;
+                  must_add = PDF_TRUE;
+                }
+              else
+                {
+                  index = d->table[index].right;
+                }
+            }
+        }
+    }
+
+  d->table[d->size++] = *s;
+  return PDF_TRUE;
+}
+
+static pdf_bool_t
 lzw_dict_add (struct lzw_dict_s   *d,
               struct lzw_string_s *s)
 {
@@ -394,6 +490,7 @@
                     pdf_error_t  **error)
 {
   struct lzwenc_state_s *st;
+  pdf_bool_t dict_reset = PDF_FALSE;
 
   st  = state;
   st->buffer.buf = out;
@@ -410,16 +507,17 @@
       st->must_reset = PDF_FALSE;
     }
 
-  while (!pdf_buffer_eob_p (in) &&
-         !pdf_buffer_full_p (out))
+  while (!pdf_buffer_eob_p (in) && !pdf_buffer_full_p (out))
     {
+      /* accumulate input data */
       st->string.suffix = in->data [in->rp++];
-      if (lzw_dict_add (&st->dict, &st->string))
+      if (!lzw_dict_find (&st->dict, &st->string))
         {
+          /* emit output code of longest input sequence in dict */
           lzw_buffer_put_code (&st->buffer, st->string.prefix);
-          st->string.prefix = st->string.suffix;
 
-          if (st->dict.size == st->buffer.maxval - st->early_change)
+          /* is current bitsize enough? */
+          if (st->dict.size - 1 == st->buffer.maxval - st->early_change)
             {
               PDF_DEBUG_BASE ("[LZW encoder] Increasing bitsize... "
                               "(dictsize[%d] == maxval[%d] - earlychange[%d])",
@@ -431,15 +529,23 @@
                   lzw_buffer_put_code (&st->buffer, LZW_RESET_CODE);
                   lzw_buffer_set_bitsize (&st->buffer, LZW_MIN_BITSIZE);
                   lzw_dict_reset (&st->dict);
+                  dict_reset = PDF_TRUE;
                 }
             }
+          /* create new dict entry */
+          if(!dict_reset)
+            lzw_dict_create_entry (&st->dict, &st->string);
+          else
+            dict_reset = PDF_FALSE;
+
+          st->string.prefix = st->string.suffix;
         }
     }
 
   if (finish)
     {
       lzw_buffer_put_code (&st->buffer, st->string.prefix);
-      if (st->dict.size == st->buffer.maxval - st->early_change)
+      if (st->dict.size - 1 == st->buffer.maxval - st->early_change)
         {
           PDF_DEBUG_BASE ("[LZW encoder] Increasing bitsize (FINISHING)... "
                           "(dictsize[%d] == maxval[%d] - earlychange[%d])",
@@ -540,7 +646,7 @@
   lzw_buffer_init (&filter_state->buffer, LZW_MIN_BITSIZE);
   lzw_dict_init (&filter_state->dict);
   filter_state->old_code = LZW_NULL_INDEX;
-  filter_state->decoded = filter_state->dec_buf + (LZW_MAX_DICTSIZE-2);
+  filter_state->decoded = filter_state->dec_buf + (LZW_MAX_DICTSIZE - 2);
   filter_state->dec_size = 0;
   filter_state->state_pos = LZWDEC_STATE_START;
   filter_state->tmp_error = NULL;
@@ -697,7 +803,7 @@
           if (!lzwdec_put_decoded (st, out))
             return PDF_STM_FILTER_APPLY_STATUS_NO_OUTPUT; /* No more output */
 
-          if (st->dict.size == st->buffer.maxval - 1 - st->early_change)
+          if (st->dict.size == st->buffer.maxval + 1 - st->early_change)
             {
               lzw_buffer_inc_bitsize (&st->buffer);
               /* break; We must wait for the reset code, don't reset yet. */

=== modified file 'torture/unit/Makefile.am'
--- torture/unit/Makefile.am	2011-08-29 15:17:29 +0000
+++ torture/unit/Makefile.am	2011-12-01 13:07:45 +0000
@@ -76,7 +76,8 @@
                  base/stm/pdf-stm-rw-filter-a85.c \
                  base/stm/pdf-stm-rw-filter-flate.c \
                  base/stm/pdf-stm-rw-filter-v2.c \
-                 base/stm/pdf-stm-rw-filter-aesv2.c
+                 base/stm/pdf-stm-rw-filter-aesv2.c \
+                 base/stm/pdf-stm-rw-filter-lzw.c 
 
 TEST_SUITE_HASH = base/hash/pdf-hash-new.c \
                   base/hash/pdf-hash-add.c \

=== modified file 'torture/unit/base/stm/tsuite-stm.c'
--- torture/unit/base/stm/tsuite-stm.c	2011-05-17 22:17:31 +0000
+++ torture/unit/base/stm/tsuite-stm.c	2011-12-01 13:07:45 +0000
@@ -45,6 +45,7 @@
 extern TCase *test_pdf_stm_rw_filter_flate (void);
 extern TCase *test_pdf_stm_rw_filter_v2 (void);
 extern TCase *test_pdf_stm_rw_filter_aesv2 (void);
+extern TCase *test_pdf_stm_rw_filter_lzw (void);
 
 Suite *
 tsuite_stm ()
@@ -70,6 +71,7 @@
   suite_add_tcase (s, test_pdf_stm_rw_filter_v2 ());
   suite_add_tcase (s, test_pdf_stm_rw_filter_aesv2 ());
   suite_add_tcase (s, test_pdf_stm_flush ());
+  suite_add_tcase (s, test_pdf_stm_rw_filter_lzw ());
 
   return s;
 }

# Begin bundle
IyBCYXphYXIgcmV2aXNpb24gYnVuZGxlIHY0CiMKQlpoOTFBWSZTWak+reoADLzfgFV3c3/////n
3t6////6YBIuAfRca+uNxnNmCgBo0aA9zl4m9e49nXG3u172A0FClAGpGlYSkTU008p5Q1T9Hqp7
1J5E2lPQagAyNGgABoyaaZAEogACBNCaShsU/VPUDRgQ0AAAAAA5piMjJpk0AyGjIZMgAABkaZGg
YQyBIhE0gjU9TTEymAamTNQYnoj1AADQMmjIxBFJFNqnlPYmiNI9T0amB6oekxNA0GhoAADQyZAq
SQTRoCGhNMU9MJkmJT9NNTRkI0AGQGJ6TQiDgETnvIHO34AubNus1/4UitLtRbr1Nnc4Oh60wvOr
223A2qTIcREU5H6NHc7mxk+hufO5jzy9pW9Xfc9HfNw4GbZb239swobJbvcsXxlu5WMF7J1O/Wea
ZlcbGXDvUWBlgz/O5gZRjMdES1RJOh78ktJbNyN3Z05jikdtk7usQsmLOoCWD37S2EvDWjowkGsL
abBbU1i2bY5jRV50kvE4xZReXbQGbqqvGgMRksg4qxmqMtZukCSD9zzwhIFZJCOrEGGCO9wZ8J5p
PN7J9fgtIepmmJkDLLKv63vZ8Rz5Y27WufRq9RC+yNjOFYOYR00HE2AwZjS0VUSlVSqgqvd+UDtr
Dw7+/v6evVmZSw2udX0V27quStEVpcZKOTLNyPPWPOp0KZtMGaZGIu7tFMGy9Qm77kMhRLyk02vb
LBgkmIu7Ve9GGZuqjF4M7S5w1Zmll8oDEYq8RGiSMux9kfw4HXSlS9LUlivVkrx6PyTsJ0lFH6jt
89i49lLq4/BF0XTSqofhFH19GD461c9BggoRh7o5uS+D0CBGipVmEVWkaq5aVX02yxZEEPYIRllB
ZLiDYCF5mwUTEKNIiJi1H0g56HrD2MwS10k51PPSCOXY9QbHaWYzNsSfSiupydjr9ma7XbfXDLkC
/84TVD/Uf53X3CqLUK/J6+rGsVOM6NZ42tm8zzASTK2chlgCIUce/HUIKSFSUBIhtNHJqw8m7Bj7
Wgo0OvobSh54v669Mw+JA+4JEQKfRlUp7nuw0anvJIGzxT+W2VU4c+AsuTU7RqNucuYRL0JqTI00
d2q/yRZ3xM4yVJVRrTeWLFoOgcO+Q3o952ZmkxXRVTzNG8ubyaT3liYWreVHFVbew4qGwprwzJXA
EEBCo7zjVFUIsj1oVSjeQ1HKg4z6PqIbBeENfUMyOPlfHtREQREREFM6ARXLewFM4A4eDrOepPFJ
sEKLwhLDSuymofvtcbMVl7gVmGckB5QhCtQ5aQ5rtR9t4GJSEzLaGbQk2UmKXRFE2di+5s7tN1b0
vJThtsjdfYUMlZxAkJTDhApQUeihSbR1kRLgKkAFW9C8D4b7ogOGYJmKFPaMGvwMBoQvkwSB4bCa
hva5Cs6EEMS1XiU0bXWqFTCrwI0zGqxBnFW1PI/zXQcuFh94344cHkeBu0mym/PcSi0kk1WQliqS
Jp44RrNhYy35JExAvBwT2K16YzdkTrKcjs4SHW4gh0BQSvJVUP07vcXIbCApFB6nzB0C3eD84ogi
8C3rd4FHE7m5A167S6gstGg1BKoh15aVwGzEbKa9J6J6dhkygGwh2qcnCnE1cPGkTkeeUiDzrHxz
6jKPgwZqKxgW6DOlnOnNByxncDIKEQ/7BCm9BCfqqSSaUJpIykzcKXTYT14mDCXMa3M2hwdnCQUH
CKG5MDWojMTEiI8mV8Xsd1NQqVnYdwYIGlC5o05WdG6hFxsJRkaNJGuHHgxc7msneCdlPFEQKDyh
AuML8mo0xuUIlzIq8Ns6iIFALiPnByy0o4ZNwrUjZzXvZKNASRBcUTB9NECKIDIzrLXrbUmgmQLs
EHYRoq8lRIBANbEixElAEfOXJI1GuEjsWEn0rYaqxxEqaaVSCx8gj58EJ7MtNkDc1yKWB+rKMzY3
HlSMBsiTDBUejiZaJEvjGt5nwid0KoJLposLaWYiqi3jkhQKM4gIGeTScOZPyoqqPVNckG9LKuHR
uQ0ubkHX73c4ezaiiIfWoMecRIzCyNFFQbNZ1mGNNDZw8OW5rFa8QDmrGuAK3xKSCjh0JbFh/Sx9
ADtmYkbGWsbJmUaby0wiBHtFZ42RKaOvGjBmGMQSis5K6wCAIPBICxmMuoOqMQ0cww2gitSAu4rt
61Ia9WAUK65EoXNkGKhVFNbhetvLOGlhMLt2vcvaWKP/R1yHCRx3J6sZHbIqj43ENGuD52jUoFJm
SRKpqjjkZJ8ig4cYW7rWPaIeVj1glOHLIuKYZgPV3z37U4Py3DbyDNlES7k42HE4Hga4cOizbugn
YEz11kEbbeG/QQ2Xy0zOJIRbECTrdAkQwHqCbWYOkMaUvIsOGxeRGYJWBsibeVJVa5EBial5A4R7
MSJjtRwKUIXgVYUHsF3ohuGRAiVtRolRdTwliIxtGmmkCwJ7dxCI6KHAxhYW3EG2e9V0ZcBed6Uy
0VBD1BCZO7wi5Nybmz7XZmqnEBrn6e/5T66BLkJHSxkqbHYYaaDVrDkocNJQGCkSVz3kEm2RXdaR
fBpR7RHCUFrwhYa7qzdo6zl4fJrzmcSrUeWHeQ+VS5HG5qT1IRVbm8XwsvD4GMjEHiBaCwnJjEPv
FDiYgzEMsbl09pIePjrbubl18cK3c9byomRQqug0FlxkXIWwNMXF1ikmBZCovxLWSaJDRoKDm7+z
e8VEwQNvcmZixIxlETtpHGd0Q0MH5EmANoaHcN3DYNgxtDRoBd1DY6qlLFRVMIlu8+iPcRcVIPb0
1UlEKj9ELn5uNREAQEMKRDFf48vBQyD9L+0Dn/3N6SIiAhiCL/MHCtIUISQNLgDBIaZ9Bbt5eRDj
oXlWlosxLJaw5eYOkhdhaguBPIGOqFrjD1mzMyH8sGkc8m7iOMlt+0cIMs+z+7xD/DshlJ/May4e
MZ/SPlTuT9jBSYHagkhhLBr3QaIbaQSfNzULv937STxjBEUsVJBQXpDqofF8mPgPt/YA0ED74ogp
82SOJJQvBbw6dwvzGg8nkKOKxmTRkSwgdj28RJu8+8IN0MdPpyv7InxuZB2UyqpCwGogMRA/HFoP
chxAVetIyOc1HFrHdBwrk1xzg2XSatfV8+KTKOqO/LfKXlm36fZfH9vB+xcZM9izglWeU625c9r2
M1nX4e3HD2dtchyvyqq7Nmlo3tynj4KP74knm4IvFihgUNtMYXwlyJBcfYO2caKxsQQQORQhuEL4
zPkLCvEY8Yyw2YZ5DrhQZ4sHDDMPhgGi6TM6gtgNFJS91JrqIVwMRknBSEZCSsTQi3lwIsdLHKHW
C1i5rBoDAw4LZ5j6aaUBU28p7cg/fqFi1i0PqoaaSxIlu+ENq1V1JVjc5omKi5MT6UxnE+K50STU
38HBC5eYezdqbcxP52veDSyW3OPiXOXQ5espU9PJCcqos9Fbaw5R7dawnnJOGDNbffDZxWHBsk8C
3/ns/jeuvvvxSdJvht+FHYQc0KIfMEeLwTYRHJHCcW/kr4wweWaWQgJQEyCTCSE2PHViu5X5tvMY
K9yoXqcXyfQba/tCElzxp77YRdDWf1DG0D2mTJ3Hm55iHHJz2TwgnmITRg9VUqh1lS3JPVBfKRaQ
wYZZ9fy8kvnI3y+kRpgau7hfsYxLtJUbxKU8SRW+nrr60xfDasooYGRW5sadeModUaihttf/r0F+
iu244Tb6i4aY29v7O6F9J6hMBszElcoO4TTCEugemqOUOPs8alD62iBdpE9m9BegGkhWJRPbtKJO
8PY7aehvGRo20sUPPUMJMoknNZIpMxMROfI1Bx+udtzFo/FFJNnlbHne+zu4qfVi8LLsF72uaJgw
mph45+mWex52aPTZLQ8125tafE2tODKcS5qUk21JsbfqZ2c6ToDxj+CN89aLkdVHVHktKfK1Mdqi
kqXWp2DtsnaU8vSzrTX0jVGvI9iHDOeqU2JZLyC59UhUWo6SpUhUn3vilRpSlpSyUsqSQSYJY0nv
aaZUdQt7CPQjpBkVHWC6VqXeHSo+PusnuJP02SpQw/ar2o43+38Jh1nJ8xcobtrbHz9BHVJE0R8S
eB3T48cF2J5SRu+s9olHw0HubD3T7V1KqaUmcfCOEoJYyGiT8/Xg4yH/331WLFU/CJt1QuVL/edx
gdUToSNZlN3ySPl6o9qxT2OW4yI0kfdnXYUNbMDoT2JUG4bcN9QkFA553BG0xRomww8hhFBsSN5Y
4komN2ccfsQ3KifS0FkfCMxbviSZxLvWZEk1p66hoSaztk3sDzk9Yos/Rq/RVx1GzdKUhnJEpPn9
26NsOSN5xaZGqLhpGKaO4aBaQoCgXu6Oe0w2ZCqkCBkc5EvQNQTQEbIkhfk36UaLbyrLNq0hYUJl
GgrrRokvRhBhjJE6ik5Uae4X6F0kqUSqiNUMkHLRPribKYaW6HrdQe+FiNo2JtF4CsV6GXlGyOH1
SG6uCd0SpIl8OLbhEcTd+KWkicdD4Z9PmowGsv8CkUh0JpNN/l5FWKKjvZri8zSbTn5BzJy9nY1/
pdFG6chzdFOuJ3E2dH4D3i5+I7DNMclQ5q8KcyhItdIYmC5LuWHJ7Jdo4uGFhyM9IMHUqlfa0e6q
+mR8C9J1E2Ihs5ddtfVFjZDdMJNEOlOSTHHLQdKXsNDyLiyNEazJaXGqhhfVTG0r2Omz9bx6HHnu
+6VOPJ1Zb7H0HDcekLSTJ+f14DjYVVVFcyMeRI9RrrvMEXB42m0Np6TcQsyDk1B8iFwQOt4Z6SCG
xQdNr3XGKbJ3LGSn5T3bJJc+s0O0dEhrnMXmg01S6QOOk2ZaMHJwyLf1dxtv3uiYMXgcBHFVQYJs
TSEXCUSTlhuMwaLBcRBAvGCKjZtgwhrpL41YlWXWOwyqNxUeJo1O7iWZKIxNjVV6+b2s1U9RyjZE
k0pE25J6pmTVdeYQvoutMs/O5Dq3VebYkmMMa3EcwYpkVlOHXt7IFSEXYJBr5EzKjhuP1yROVw5G
zpz6A0648pnirHljPev5ic3bSR4k1RPuR4cSbvi0JRyFSHxkPfIWFyhz4A1GgYYITVHhRrnnIIYR
3wkE8yDm6Y7dYeMnKm9HcmOojqrpifPOQ1lapIelHtXOHiD4CkylQUFH/i7kinChIVJ9W9Q=

Reply via email to