Ignore the pako.js example output... It was just outputting the binary
representation of A-Z, not the huffman code

This is what I meant to send

For ABCD:

[65,'11'],
[66,'0'],
[67,'10'],
[68,'1'],
[256,'111']

It still doesn't seem to be sorting correctly lexographically, but I'm
not really in my comfort zone of understanding:

The RFC has this instead:

Symbol  Code
            ------  ----
            A       10
            B       0
            C       110
            D       111

I don't really know if it has to match the RFC or if each
implementation is able to do its own thing as long since it includes
the distance/reverse lookup table (whatever it's called).

FYI


This is where I inserted my code:

/* ===========================================================================
 * Generate the codes for a given tree and bit counts (which need not be
 * optimal).
 * IN assertion: the array bl_count contains the bit length statistics for
 * the given tree and the field len is set for all tree elements.
 * OUT assertion: the field code is set for all tree elements of non
 *     zero code length.
 */
function gen_codes(tree, max_code, bl_count)
//    ct_data *tree;             /* the tree to decorate */
//    int max_code;              /* largest code with non zero frequency */
//    ushf *bl_count;            /* number of codes at each bit length */
{
  var next_code = new Array(MAX_BITS+1); /* next code value for each
bit length */
  var code = 0;              /* running code value */
  var bits;                  /* bit index */
  var n;                     /* code index */

  /* The distribution counts are first used to generate the code values
   * without bit reversal.
   */
  for (bits = 1; bits <= MAX_BITS; bits++) {
    next_code[bits] = code = (code + bl_count[bits-1]) << 1;
  }
  /* Check that the bit counts in bl_count are consistent. The last code
   * must be all ones.
   */
  //Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
  //        "inconsistent bit counts");
  //Tracev((stderr,"\ngen_codes: max_code %d ", max_code));

  for (n = 0;  n <= max_code; n++) {
    var len = tree[n*2 + 1]/*.Len*/;
    if (len === 0) { continue; }
    /* Now reverse the bits */
    tree[n*2]/*.Code*/ = bi_reverse(next_code[len]++, len);

      if (tree!=static_ltree) {
          var v = tree[n*2];
          console.log('[' + n + ",'" + v.toString(2) + "'],");
      }
    //Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
    //     n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
  }

}

On Thu, Sep 11, 2014 at 11:55 AM, Joe Bogner <[email protected]> wrote:
> I think the prefix coding looks OK, but the 2 rules does not:
>
> I modified the code[1] to allow passing in a string and outputting the codes
>
> C:\temp>deflate ABCDEFGHIJKLMONPQRSTUVWXYZ
> code 65 : 0 00000000000000000000000000000000
> code 66 : 6 00000000000000000000000000000110
> code 67 : 8 00000000000000000000000000001000
> code 68 : 4 00000000000000000000000000000100
> code 69 : 22 00000000000000000000000000010110
> code 70 : 14 00000000000000000000000000001110
> code 71 : 30 00000000000000000000000000011110
> code 72 : 1 00000000000000000000000000000001
> code 73 : 17 00000000000000000000000000010001
> code 74 : 12 00000000000000000000000000001100
> code 75 : 9 00000000000000000000000000001001
> code 76 : 25 00000000000000000000000000011001
> code 77 : 5 00000000000000000000000000000101
> code 78 : 21 00000000000000000000000000010101
> code 79 : 13 00000000000000000000000000001101
> code 80 : 29 00000000000000000000000000011101
> code 81 : 3 00000000000000000000000000000011
> code 82 : 19 00000000000000000000000000010011
> code 83 : 11 00000000000000000000000000001011
> code 84 : 27 00000000000000000000000000011011
> code 85 : 7 00000000000000000000000000000111
> code 86 : 23 00000000000000000000000000010111
> code 87 : 15 00000000000000000000000000001111
> code 88 : 31 00000000000000000000000000011111
> code 89 : 2 00000000000000000000000000000010
> code 90 : 10 00000000000000000000000000001010
>
>
> I think it violates the consecutive rule... Each letter has the same
> frequency. ABCD have the same bit length. The order is off:
>
> If I sort it lexographically using javascript:
>
> JSON.stringify([['a','00000000000000000000000000000000'],
> ['b','00000000000000000000000000000110'],
> ['c','00000000000000000000000000001000'],
> ['d','00000000000000000000000000000100']].sort(function(x,y) { return
> x[1] - y[1] }))
>
> "[["a","00000000000000000000000000000000"],["d","00000000000000000000000000000100"],["b","00000000000000000000000000000110"],["c","00000000000000000000000000001000"]]"
>
> As you can see, the order comes out a,d,b,c
>
> I played around with a javascript implementation, pako[2]. It seems to
> work correctly:
>
> As you can see, it sorts lexographically
>
> JSON.stringify([[65,'1000001'],
> [66,'1000010'],
> [67,'1000011'],
> [68,'1000100'],
> [69,'1000101'],
> [70,'1000110'],
> [71,'1000111'],
> [72,'1001000'],
> [73,'1001001'],
> [74,'1001010'],
> [75,'1001011'],
> [76,'1001100'],
> [77,'1001101'],
> [78,'1001110'],
> [79,'1001111'],
> [80,'1010000'],
> [81,'1010001'],
> [82,'1010010'],
> [83,'1010011'],
> [84,'1010100'],
> [85,'1010101'],
> [86,'1010110'],
> [87,'1010111'],
> [88,'1011000'],
> [89,'1011001'],
> [90,'1011010']].sort(function(x,y) { return x[1] - y[1] }))
>
> "[[65,"1000001"],[66,"1000010"],[67,"1000011"],[68,"1000100"],[69,"1000101"],[70,"1000110"],[71,"1000111"],[72,"1001000"],[73,"1001001"],[74,"1001010"],[75,"1001011"],[76,"1001100"],[77,"1001101"],[78,"1001110"],[79,"1001111"],[80,"1010000"],[81,"1010001"],[82,"1010010"],[83,"1010011"],[84,"1010100"],[85,"1010101"],[86,"1010110"],[87,"1010111"],[88,"1011000"],[89,"1011001"],[90,"1011010"]]"
>
> All the values are sorted correctly.
>
> Here it is with the same ABCD example:
>
> var pako = require('pako');
> var binaryString = pako.deflate('ABCD', { to: 'string' });
> console.log(binaryString);
> var restored = pako.inflate(binaryString, { to: 'string' });
> console.log(restored);
>
> It successfully deflates and inflates itself
>
> x?♣A☺☺   ? mcÿ7♣A♫☻?☺♂
> ABCD
>
>
> Hope this helps...
>
> [1] - https://gist.github.com/joebo/a3c2932f0e5a7a0c3f07#file-deflate-c-L2613
> [2] - https://rawgit.com/nodeca/pako/master/dist/pako.js
>
> On Thu, Sep 11, 2014 at 11:33 AM, bill lam <[email protected]> wrote:
>> This is strange since every author must had decode its own encoded
>> data as a smoke test.
>>
>> Did you test if huffman code or bit lengths it produced was
>> correct or not, ie it is a prefix coding and it satisfy the 2
>> rules in rfc.
>>
>> Чт, 11 сен 2014, Joe Bogner написал(а):
>>> unfortunately the dynamic coding in the putty fork doesn't seem to work:
>>>
>>> deflate -c deflate.c > out
>>> deflate -d out
>>>
>>> decoding error: incorrect data checksum
>>>
>>>
>>> it works fine with static tables
>>>
>>> C:\temp>echo ABCD > ABCD
>>>
>>> C:\temp>deflate -c ABCD > out
>>>
>>> C:\temp>deflate -d out
>>> ABCD
>>>
>>> I added some debugging code to determine that deflating deflate.c
>>> would be a dynamic table...  Assuming it's broke, I probably wouldn't
>>> use it as a reference implementation after all
>>>
>>> On Thu, Sep 11, 2014 at 3:45 AM, bill lam <[email protected]> wrote:
>>> > the frequencies (guessing from bit lengths) should be something like 2 3 
>>> > 1 1
>>> >   (2 3 1 1) hcodes 'ABCD'
>>> >
>>> > the hard part is the inverse problem: how to get the huffman code with
>>> > prior knowing the bits for each symbol.  Your pointer to the putty
>>> > fork looks like helpful.  The comment is in lines 861 to 914, the code
>>> > itself in line 915 to 964. Do you know how to express it in J?
>>> > Thanks.
>>> >
>>> > On Thu, Sep 11, 2014 at 2:57 PM, Joe Bogner <[email protected]> wrote:
>>> >> Here a few other links ... after reading through the RFC. Not sure if
>>> >> they help, but just sharing from my own research into assisting on
>>> >> this topic
>>> >>
>>> >> https://github.com/evegard/pngview/blob/master/huffman.c#L54
>>> >>
>>> >> And a fork of the putty version with dynamic huffman coding:
>>> >> http://rc.quest.com/viewvc/putty/trunk/halibut/deflate.c?diff_format=s&revision=2&view=markup
>>> >>
>>> >> Or just generally googling some of the code from the RFC:
>>> >> https://www.google.com/search?q=next_code%5Blen%5D%2B%2B%3B&oq=next_code%5Blen%5D%2B%2B%3B&aqs=chrome..69i57.387j0j7&sourceid=chrome&es_sm=93&ie=UTF-8#q=next_code%5Blen%5D%2B%2B%3B&start=20
>>> >>
>>> >>
>>> >> Using the code from
>>> >> http://www.jsoftware.com/jwiki/Essays/Huffman%20Coding, I got stuck
>>> >> trying to match a simple example to the binary tree in the RFC:
>>> >>
>>> >> From the RFC:
>>> >>
>>> >>                   /\              Symbol    Code
>>> >>                          0  1             ------    ----
>>> >>                         /    \                A      00
>>> >>                        /\     B               B       1
>>> >>                       0  1                    C     011
>>> >>                      /    \                   D     010
>>> >>                     A     /\
>>> >>                          0  1
>>> >>                         /    \
>>> >>                        D      C
>>> >>
>>> >>
>>> >>
>>> >>    (4#1) hcodes 'ABCD'
>>> >> ┌───┬───┬───┬───┐
>>> >> │0 0│0 1│1 0│1 1│
>>> >> └───┴───┴───┴───┘
>>> >>
>>> >> Per the RFC, ideally that should match this? '00';'1';'011';'010'
>>> >>
>>> >>
>>> >> From there, it seems like a pretty straightforward exercise to
>>> >> transliterate the C code from the RFC into J code to recode the
>>> >> example to:
>>> >>
>>> >>
>>> >>             Symbol  Code
>>> >>             ------  ----
>>> >>             A       10
>>> >>             B       0
>>> >>             C       110
>>> >>             D       111
>>> >>
>>> >>
>>> >> I would probably start with a looping construct like what's in the RFC
>>> >> and then figure out a more J way to do it, but first I would need to
>>> >> figure out how to create the binary tree in that initial format.
>>> >>
>>> >> On Wed, Sep 10, 2014 at 7:41 PM, bill lam <[email protected]> wrote:
>>> >>> Thanks Joe,
>>> >>> putty only use zlib static huffman for encoding so that it does not 
>>> >>> build
>>> >>> any huffman dictionary table.
>>> >>>
>>> >>> The zlib static huffman code does not care about individual symbol's
>>> >>> frequency, it just encode 0 to 286 into bits, see section 3.2.6.
>>> >>>  On Sep 11, 2014 1:26 AM, "Joe Bogner" <[email protected]> wrote:
>>> >>>
>>> >>>> You've already likely considered this, but if it were me I would 
>>> >>>> compare
>>> >>>> results to a working implementation. The one from putty seems pretty 
>>> >>>> clean
>>> >>>> and standalone:
>>> >>>> https://raw.githubusercontent.com/grumpydev/PortablePuTTY/master/SSHZLIB.C
>>> >>>> . I was able to compile it on windows no problem and I assume it'd be 
>>> >>>> fine
>>> >>>> on linux as well.
>>> >>>>
>>> >>>> On Wed, Sep 10, 2014 at 1:00 PM, Raul Miller <[email protected]>
>>> >>>> wrote:
>>> >>>>
>>> >>>> > I think the use of the term "consecutive" rather than "sequential" is
>>> >>>> > telling.
>>> >>>> >
>>> >>>> > The described algorithm is: compute the huffman code lengths:
>>> >>>> >    #@>F1 hcodes A1
>>> >>>> > 1 3 7 7 6 6 6 6 6 6 6 2
>>> >>>> >
>>> >>>> > Then assign ascending huffman codes first in length order and then
>>> >>>> > within codes of the same length.
>>> >>>> >
>>> >>>> > Taken literally, that might be something like this:
>>> >>>> >
>>> >>>> > H=: 4 :0
>>> >>>> >   L=.#@> x hcodes y
>>> >>>> >   U=.~.L
>>> >>>> >   ;<@(({.{.U e.~i.&.<:@{.)<@:+"1-@{.{."1 #:@i.@#)/.~L
>>> >>>> > )
>>> >>>> >
>>> >>>> >    ":@>F1 H A1
>>> >>>> > 0
>>> >>>> > 1 1 0
>>> >>>> > 1 1 1 0 0 1 0
>>> >>>> > 1 1 1 0 0 1 1
>>> >>>> > 1 1 1 0 0 0
>>> >>>> > 1 1 1 0 0 1
>>> >>>> > 1 1 1 0 1 0
>>> >>>> > 1 1 1 0 1 1
>>> >>>> > 1 1 1 1 0 0
>>> >>>> > 1 1 1 1 0 1
>>> >>>> > 1 1 1 1 1 0
>>> >>>> > 1 0
>>> >>>> >
>>> >>>> > But is this correct? Is it actually safe to leave the results like
>>> >>>> > this - with all codes of the same length being consecutive to each
>>> >>>> > other?
>>> >>>> >
>>> >>>> >    F (hcodes -:&:(#@>) H) A
>>> >>>> > 0
>>> >>>> >
>>> >>>> > No.
>>> >>>> >
>>> >>>> > So... "consecutive" must refer only to the values used and not their
>>> >>>> > order within the result.
>>> >>>> >
>>> >>>> > Perhaps something like this:
>>> >>>> >
>>> >>>> > deflatecodes=:4 :0
>>> >>>> >   L=.#@> x hcodes y
>>> >>>> >   U=.~.L
>>> >>>> >   R=. ;<@(({.{.U e.~i.&.<:@{.)<@:+"1-@{.{."1 #:@i.@#)/.~L
>>> >>>> >   R/:;(</. i.@#)L
>>> >>>> > )
>>> >>>> >
>>> >>>> >    F (hcodes -:&:(#@>) deflatecodes)  A
>>> >>>> > 1
>>> >>>> >
>>> >>>> > There should be a better way of doing this, but this should at least
>>> >>>> > get you started.
>>> >>>> >
>>> >>>> > Thanks,
>>> >>>> >
>>> >>>> > --
>>> >>>> > Raul
>>> >>>> >
>>> >>>> >
>>> >>>> > On Wed, Sep 10, 2014 at 10:45 AM, bill lam <[email protected]> 
>>> >>>> > wrote:
>>> >>>> > > For huffman coding used in zlib:
>>> >>>> > > https://www.ietf.org/rfc/rfc1951.txt section 3.2.2.
>>> >>>> > >
>>> >>>> > >  The Huffman codes used for each alphabet in the "deflate"
>>> >>>> > >  format have two additional rules:
>>> >>>> > >
>>> >>>> > >   * All codes of a given bit length have lexicographically
>>> >>>> > >   consecutive values, in the same order as the symbols
>>> >>>> > >   they represent;
>>> >>>> > >
>>> >>>> > >   * Shorter codes lexicographically precede longer codes.
>>> >>>> > > I tried jwiki hcodes in
>>> >>>> > > I try Roger's essay
>>> >>>> > > http://www.jsoftware.com/jwiki/Essays/Huffman%20Coding
>>> >>>> > >
>>> >>>> > > hc=: 4 : 0
>>> >>>> > > if. 1=#x do. y
>>> >>>> > > else. ((i{x),+/j{x) hc (i{y),<j{y [ i=. (i.#x) -. j=. 2{./:x end.
>>> >>>> > > )
>>> >>>> > >
>>> >>>> > > hcodes=: 4 : 0
>>> >>>> > > assert. x -:&$ y           NB. weights and words have same shape
>>> >>>> > > assert. (0<:x) *. 1=#$x    NB. weights are non-negative
>>> >>>> > > assert. 1 >: L.y           NB. words are boxed not more than once
>>> >>>> > > w=. ,&.> y                 NB. standardized words
>>> >>>> > > assert. w -: ~.w           NB. words are unique
>>> >>>> > > t=. 0 {:: x hc w           NB. minimal weight binary tree
>>> >>>> > > ((< S: 0 t) i. w) { <@(1&=)@; S: 1 {:: t
>>> >>>> > > )
>>> >>>> > >
>>> >>>> > > but the coding produced is malformed for zlib. eg,
>>> >>>> > > this is what I ran into trouble
>>> >>>> > >
>>> >>>> > > f1=: 1 256 17 1 1 9 1
>>> >>>> > > f2=: 2 1 0 1 255 0 1536
>>> >>>> > > F=: ,/(f1#f2)
>>> >>>> > > A=: i.286
>>> >>>> > >
>>> >>>> > > F hcodes A
>>> >>>> > >
>>> >>>> > > Or a shorter example
>>> >>>> > >
>>> >>>> > > A1=: i.12
>>> >>>> > > F1=: 2 1 0 0 0 0 0 0 0 0 0 1
>>> >>>> > >
>>> >>>> > > F1 hcodes A1
>>> >>>> > >
>>> >>>> > > Any idea?
>>> >>>> > >
>>> >>>> > > --
>>> >>>> > > regards,
>>> >>>> > > ====================================================
>>> >>>> > > GPG key 1024D/4434BAB3 2008-08-24
>>> >>>> > > gpg --keyserver subkeys.pgp.net --recv-keys 4434BAB3
>>> >>>> > > gpg --keyserver subkeys.pgp.net --armor --export 4434BAB3
>>> >>>> > > ----------------------------------------------------------------------
>>> >>>> > > For information about J forums see 
>>> >>>> > > http://www.jsoftware.com/forums.htm
>>> >>>> > ----------------------------------------------------------------------
>>> >>>> > For information about J forums see 
>>> >>>> > http://www.jsoftware.com/forums.htm
>>> >>>> >
>>> >>>> ----------------------------------------------------------------------
>>> >>>> For information about J forums see http://www.jsoftware.com/forums.htm
>>> >>>>
>>> >>> ----------------------------------------------------------------------
>>> >>> For information about J forums see http://www.jsoftware.com/forums.htm
>>> >> ----------------------------------------------------------------------
>>> >> For information about J forums see http://www.jsoftware.com/forums.htm
>>> > ----------------------------------------------------------------------
>>> > For information about J forums see http://www.jsoftware.com/forums.htm
>>> ----------------------------------------------------------------------
>>> For information about J forums see http://www.jsoftware.com/forums.htm
>>
>> --
>> regards,
>> ====================================================
>> GPG key 1024D/4434BAB3 2008-08-24
>> gpg --keyserver subkeys.pgp.net --recv-keys 4434BAB3
>> gpg --keyserver subkeys.pgp.net --armor --export 4434BAB3
>> ----------------------------------------------------------------------
>> For information about J forums see http://www.jsoftware.com/forums.htm
----------------------------------------------------------------------
For information about J forums see http://www.jsoftware.com/forums.htm

Reply via email to