Re: [Flac-dev] libFLAC bitbuffer optimizations

Eric Wong Sat, 01 Jan 2005 17:36:21 -0800

Josh Coalson <[EMAIL PROTECTED]> wrote:
> thanks for the patch.

No prob :)


> also, if you have miroslav's patch against a more updated version
> of bitbuffer.c that would be great.  I have been meaning to get
> around to applying it for a long time.

This is Miroslav's patch, from the mailing list post I dug up in the archives:

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -62,6 +62,24 @@
  * keeping in mind the limit from the first paragraph.
  */
 static const unsigned FLAC__BITBUFFER_DEFAULT_CAPACITY = ((65536 - 64) * 8) / 
FLAC__BITS_PER_BLURB; /* blurbs */
+static const unsigned char byte_to_unary_table[] = {
+      8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
 
 #if FLAC__BITS_PER_BLURB == 8
 #define FLAC__BITS_PER_BLURB_LOG2 3
@@ -70,6 +88,7 @@
 #define FLAC__BLURB_TOP_BIT_ONE ((FLAC__byte)0x80)
 #define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)'\x80') >> (b))
 #define CRC16_UPDATE_BLURB(bb, blurb, crc) FLAC__CRC16_UPDATE((blurb), (crc));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) (byte_to_unary_table[blurb])
 #elif FLAC__BITS_PER_BLURB == 32
 #define FLAC__BITS_PER_BLURB_LOG2 5
 #define FLAC__BYTES_PER_BLURB 4
@@ -77,6 +96,13 @@
 #define FLAC__BLURB_TOP_BIT_ONE ((FLAC__uint32)0x80000000)
 #define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)0x80000000) >> (b))
 #define CRC16_UPDATE_BLURB(bb, blurb, crc) crc16_update_blurb((bb), (blurb));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) ((blurb) <= 0xff \
+               ? byte_to_unary_table[blurb] + 24 \
+               : ((blurb) <= 0xffff \
+                       ? byte_to_unary_table[(blurb) >> 8] + 16 \
+                       : ((blurb) <=0xffffff \
+                               ? byte_to_unary_table[(blurb) >> 16] + 8 \
+                               : byte_to_unary_table[(blurb) >> 24])))
 #else
 /* ERROR, only sizes of 8 and 32 are supported */
 #endif
@@ -2109,114 +2135,18 @@
        if(nvals == 0)
                return true;
 
+       cbits = bb->consumed_bits;
        i = bb->consumed_blurbs;
-       /*
-        * We unroll the main loop to take care of partially consumed blurbs 
here.
-        */
-       if(bb->consumed_bits > 0) {
-               save_blurb = blurb = buffer[i];
-               cbits = bb->consumed_bits;
-               blurb <<= cbits;
-
-               while(1) {
-                       if(state == 0) {
-                               if(blurb) {
-                                       for(j = 0; !(blurb & 
FLAC__BLURB_TOP_BIT_ONE); j++)
-                                               blurb <<= 1;
-                                       msbs += j;
-
-                                       /* dispose of the unary end bit */
-                                       blurb <<= 1;
-                                       j++;
-                                       cbits += j;
-
-                                       uval = 0;
-                                       lsbs_left = parameter;
-                                       state++;
-                                       if(cbits == FLAC__BITS_PER_BLURB) {
-                                               cbits = 0;
-                                               CRC16_UPDATE_BLURB(bb, 
save_blurb, bb->read_crc16);
-                                               break;
-                                       }
-                               }
-                               else {
-                                       msbs += FLAC__BITS_PER_BLURB - cbits;
-                                       cbits = 0;
-                                       CRC16_UPDATE_BLURB(bb, save_blurb, 
bb->read_crc16);
-                                       break;
-                               }
-                       }
-                       else {
-                               const unsigned available_bits = 
FLAC__BITS_PER_BLURB - cbits;
-                               if(lsbs_left >= available_bits) {
-                                       uval <<= available_bits;
-                                       uval |= (blurb >> cbits);
-                                       cbits = 0;
-                                       CRC16_UPDATE_BLURB(bb, save_blurb, 
bb->read_crc16);
-
-                                       if(lsbs_left == available_bits) {
-                                               /* compose the value */
-                                               uval |= (msbs << parameter);
-                                               if(uval & 1)
-                                                       vals[val_i++] = 
-((int)(uval >> 1)) - 1;
-                                               else
-                                                       vals[val_i++] = 
(int)(uval >> 1);
-                                               if(val_i == nvals)
-                                                       break;
-
-                                               msbs = 0;
-                                               state = 0;
-                                       }
-
-                                       lsbs_left -= available_bits;
-                                       break;
-                               }
-                               else {
-                                       uval <<= lsbs_left;
-                                       uval |= (blurb >> (FLAC__BITS_PER_BLURB 
- lsbs_left));
-                                       blurb <<= lsbs_left;
-                                       cbits += lsbs_left;
-
-                                       /* compose the value */
-                                       uval |= (msbs << parameter);
-                                       if(uval & 1)
-                                               vals[val_i++] = -((int)(uval >> 
1)) - 1;
-                                       else
-                                               vals[val_i++] = (int)(uval >> 
1);
-                                       if(val_i == nvals) {
-                                               /* back up one if we exited the 
for loop because we read all nvals but the end came in the middle of a blurb */
-                                               i--;
-                                               break;
-                                       }
-
-                                       msbs = 0;
-                                       state = 0;
-                               }
-                       }
-               }
-               i++;
-
-               bb->consumed_blurbs = i;
-               bb->consumed_bits = cbits;
-               bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | 
cbits;
-       }
-
-       /*
-        * Now that we are blurb-aligned the logic is slightly simpler
-        */
+       
        while(val_i < nvals) {
-               for( ; i < bb->blurbs && val_i < nvals; i++) {
-                       save_blurb = blurb = buffer[i];
-                       cbits = 0;
+               for( ; i < bb->blurbs; i++) {
+                       blurb = (save_blurb = buffer[i]) << cbits;
                        while(1) {
                                if(state == 0) {
                                        if(blurb) {
-                                               for(j = 0; !(blurb & 
FLAC__BLURB_TOP_BIT_ONE); j++)
-                                                       blurb <<= 1;
+                                               j = 
FLAC__ALIGNED_BLURB_UNARY(blurb);
                                                msbs += j;
 
-                                               /* dispose of the unary end bit 
*/
-                                               blurb <<= 1;
                                                j++;
                                                cbits += j;
 
@@ -2228,6 +2158,7 @@
                                                        CRC16_UPDATE_BLURB(bb, 
save_blurb, bb->read_crc16);
                                                        break;
                                                }
+                                               blurb <<= j;
                                        }
                                        else {
                                                msbs += FLAC__BITS_PER_BLURB - 
cbits;
@@ -2247,12 +2178,11 @@
                                                if(lsbs_left == available_bits) 
{
                                                        /* compose the value */
                                                        uval |= (msbs << 
parameter);
-                                                       if(uval & 1)
-                                                               vals[val_i++] = 
-((int)(uval >> 1)) - 1;
-                                                       else
-                                                               vals[val_i++] = 
(int)(uval >> 1);
-                                                       if(val_i == nvals)
-                                                               break;
+                                                       vals[val_i++] = 
(int)(uval >> 1 ^ -(int)(uval & 1));
+                                                       if(val_i == nvals) {
+                                                               i++;
+                                                               goto break2;
+                                                       }
 
                                                        msbs = 0;
                                                        state = 0;
@@ -2262,29 +2192,23 @@
                                                break;
                                        }
                                        else {
+                                               cbits += lsbs_left;
                                                uval <<= lsbs_left;
                                                uval |= (blurb >> 
(FLAC__BITS_PER_BLURB - lsbs_left));
                                                blurb <<= lsbs_left;
-                                               cbits += lsbs_left;
 
                                                /* compose the value */
                                                uval |= (msbs << parameter);
-                                               if(uval & 1)
-                                                       vals[val_i++] = 
-((int)(uval >> 1)) - 1;
-                                               else
-                                                       vals[val_i++] = 
(int)(uval >> 1);
-                                               if(val_i == nvals) {
-                                                       /* back up one if we 
exited the for loop because we read all nvals but the end came in the middle of 
a blurb */
-                                                       i--;
-                                                       break;
-                                               }
-
+                                               vals[val_i++] = (int)(uval >> 1 
^ -(int)(uval & 1));
+                                               if(val_i == nvals)
+                                                       goto break2;
                                                msbs = 0;
                                                state = 0;
                                        }
                                }
                        }
                }
+break2:
                bb->consumed_blurbs = i;
                bb->consumed_bits = cbits;
                bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | 
cbits;




This is my patch on top of Miroslav's, which micro-optimizes away some indexing
and comparison overhead.

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -2123,7 +2123,7 @@
 {
        const FLAC__blurb *buffer = bb->buffer;
 
-       unsigned i, j, val_i = 0;
+       unsigned i, j, val_i = nvals;
        unsigned cbits = 0, uval = 0, msbs = 0, lsbs_left = 0;
        FLAC__blurb blurb, save_blurb;
        unsigned state = 0; /* 0 = getting unary MSBs, 1 = getting binary LSBs 
*/
@@ -2138,7 +2138,7 @@
        cbits = bb->consumed_bits;
        i = bb->consumed_blurbs;
        
-       while(val_i < nvals) {
+       while(val_i != 0) {
                for( ; i < bb->blurbs; i++) {
                        blurb = (save_blurb = buffer[i]) << cbits;
                        while(1) {
@@ -2178,11 +2178,13 @@
                                                if(lsbs_left == available_bits) 
{
                                                        /* compose the value */
                                                        uval |= (msbs << 
parameter);
-                                                       vals[val_i++] = 
(int)(uval >> 1 ^ -(int)(uval & 1));
-                                                       if(val_i == nvals) {
+                                                       *vals = (int)(uval >> 1 
^ -(int)(uval & 1));
+                                                       --val_i;
+                                                       if(val_i == 0) {
                                                                i++;
                                                                goto break2;
                                                        }
+                                                       *(++vals);
 
                                                        msbs = 0;
                                                        state = 0;
@@ -2199,9 +2201,12 @@
 
                                                /* compose the value */
                                                uval |= (msbs << parameter);
-                                               vals[val_i++] = (int)(uval >> 1 
^ -(int)(uval & 1));
-                                               if(val_i == nvals)
+                                               *vals = (int)(uval >> 1 ^ 
-(int)(uval & 1));
+                                               --val_i;
+                                               if(val_i == 0)
                                                        goto break2;
+                                               *(++vals);
+
                                                msbs = 0;
                                                state = 0;
                                        }
@@ -2212,7 +2217,7 @@
                bb->consumed_blurbs = i;
                bb->consumed_bits = cbits;
                bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | 
cbits;
-               if(val_i < nvals) {
+               if(val_i > 0) {
                        if(!bitbuffer_read_from_client_(bb, read_callback, 
client_data))
                                return false;
                        /* these must be zero because we can only get here if 
we got to the end of the buffer */



> 
> btw how are you playing it on the ipod?

With a modified version of Music Player Daemon (MPD, www.musicpd.org).
http://www.ipodlinux.org/MPD has more info about it.

> not sure how to help out with lpc_restore_signal().  there are
> x86 and PPC versions of the routines in CVS that might be good
> references.  it is basically a multiply-accumulate loop but you
> have to be careful about overflow.  an ARM7 version of this
> function would help the speed on several other FLAC-supported
> devices.

This is my current C version of lpc_restore_signal.  It only works when
(order<=8) (not a problem for me right now), but the inner loop for larger
orders could be done using Duff's device and still be fast without getting much
bigger.  I'm reading up on ARM assembly right now.

/*
 * Rebuild a signal from its LPC residual.
 *
 * For each of the data_len output samples this computes
 *
 *     data[i] = residual[i] + ((sum over j in [0, order) of
 *                               qlp_coeff[j] * data[i-j-1]) >> lp_quantization)
 *
 * residual        data_len residual values (read only).
 * data_len        number of samples to reconstruct.
 * qlp_coeff       `order` quantized predictor coefficients (read only).
 * order           number of predictor taps; any value is accepted.
 * lp_quantization right shift applied to the prediction sum.
 * data            output; the `order` samples immediately before data[0]
 *                 are read as warm-up history, so the caller must make
 *                 data[-order] .. data[-1] valid.
 *
 * The prediction is accumulated in a FLAC__int32, oldest history sample
 * first.  NOTE(review): nothing here guards the accumulator against
 * overflow; presumably the caller guarantees the coefficients and samples
 * are small enough -- confirm against the encoder's limits.
 */
void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
        unsigned i;

        for(i = data_len; i != 0; --i) {
                FLAC__int32 sum = 0;
                /* one past the last coefficient; pre-decremented before each read */
                const FLAC__int32 *qlp = qlp_coeff + order;
                /*
                 * Oldest history sample.  It is read with post-increment so we
                 * never form a pointer in front of data[-order].
                 */
                const FLAC__int32 *history = data - order;
                unsigned taps = order;

                /* taps beyond the unrolled part (order > 8) */
                while(taps > 8) {
                        sum += (*(--qlp)) * (*(history++));
                        --taps;
                }

                /* unrolled tail: enter at `taps` and run down to the newest sample */
                switch(taps) {
                        case 8: sum += (*(--qlp)) * (*(history++)); /* fall through */
                        case 7: sum += (*(--qlp)) * (*(history++)); /* fall through */
                        case 6: sum += (*(--qlp)) * (*(history++)); /* fall through */
                        case 5: sum += (*(--qlp)) * (*(history++)); /* fall through */
                        case 4: sum += (*(--qlp)) * (*(history++)); /* fall through */
                        case 3: sum += (*(--qlp)) * (*(history++)); /* fall through */
                        case 2: sum += (*(--qlp)) * (*(history++)); /* fall through */
                        case 1: sum += (*(--qlp)) * (*(history++));
                                break;
                        default: /* order == 0: no prediction, sum stays 0 */
                                break;
                }

                *(data++) = *(residual++) + (sum >> lp_quantization);
        }
}

-- 
Eric Wong

signature.asc
Description: Digital signature

_______________________________________________
Flac-dev mailing list
Flac-dev@xiph.org
http://lists.xiph.org/mailman/listinfo/flac-dev

Re: [Flac-dev] libFLAC bitbuffer optimizations

Reply via email to