Looking at GCC's assembly output, I was disgusted by what it was doing (note that I'm still using avr-gcc 4.9.1) and wrote an assembly version.
That let me do some useful space-saving tricks like allocating a different zero register so that r1 is free for the multiplier. Again, feel free to kibitz the code; this is literally my first AVR code ever.

I'm not quite clear whether I need to support the avr1 architecture. I'm assuming not, and make significant use of adiw/sbiw and Z+/-Z. Is that right?

It's now the same length (0x8c bytes) as the decimal-only part of __ultoa_invert, and 3/4 the time of my previous C version. Compared to __ultoa_invert, it's 2/3 of the time for large outputs, and less than 1/2 the time for small.

Updating the previous table (__ultoa_invert numbers reduced to account for deleting the tests for bases 8 or 16):

Time in clock cycles
Input genprint genprint asm asm, !MUL __ultoa_invert !MUL
0 104 114 156 165
0xff 316 193 227 476 479
0xffff 584 393 479 788 793
0xffffff 1005 705 873 1256 1264
0xffffffff 1434 1045 1310 1568 1578
0xffffffffff 2024 1497 1889
48 ones 2626 1977 2511
56 ones 3286 2513 3207
64 ones 4103 3161 4045

For the !__AVR_HAVE_MUL__ case, the size is 0xa2 (my code) vs. 0x90 bytes. While the time saving is nice, I realize that space is very critical in !__AVR_HAVE_MUL__ code, so it might not be worth it. Unless you want 64-bit printing support, in which case this can provide it for very little additional space.

Note that changing vfprintf() to keep track of a 16-bit pointer rather than a 32-bit value will reduce register pressure in it, saving a little bit there, which will partially compensate for the extra %llu code.
/*
 * genprint -- convert a little-endian binary integer to decimal ASCII.
 *
 * Syntax: GNU as for AVR, run through the C preprocessor.
 *
 * In:    r25:r24  out  -- buffer that receives the ASCII digits
 *        r23:r22  bin  -- little-endian binary number, len bytes long
 *        r20      len  -- number of bytes at bin
 * Out:   r25:r24       -- out advanced past the last digit stored
 *                         (equal to the incoming out when len == 0)
 * Clobb: r0, r20-r27, r30, r31, flags; with __AVR_HAVE_MUL__ also
 *        r18, r19 and r1 (r1 is cleared again before returning).
 *
 * Digits are stored least-significant first; nothing terminates them.
 * The buffer at bin is overwritten in place: each pass of the main loop
 * replaces it with the quotient of the division by 10.
 *
 * NOTE(review): the C-level prototype is not shown here; the register
 * assignment above is what the code reads -- confirm against the caller.
 */

#ifndef __tmp_reg__
# define __tmp_reg__ r0
#endif
#ifndef __zero_reg__
# define __zero_reg__ r1
#endif

/* Arguments */
#define out     X       /* Arrives in r24:r25, but we move it immediately */
#define out_lo  r26
#define out_hi  r27
#define bin     Z       /* Arrives in r22:r23, but we move it immediately */
#define bin_lo  r30
#define bin_hi  r31
#define len     r20

/* Local variables */
#define acc_hi  r25
#define acc_lo  r24
#define digit   r23
#define lsbit   r22
#define tlen    r21     /* Copy of len used for loop counter */
#if __AVR_HAVE_MUL__
#define zero    r19     /* Used instead of r1 to free up multiplier */
#define k       r18     /* Multiplier 0x33 */
#else
#define zero    __zero_reg__
#endif

        .text
        .global genprint
        .type   genprint, @function

1:      ret                             /* Shared early exit for len == 0 */

genprint:
        tst     len                     /* Handle zero-length gracefully */
        breq    1b
#if __AVR_HAVE_MUL__
        clr     zero
        ldi     k,0x33
#endif
        movw    out_lo,r24

        /* bin += len, so the pre-decrement below reads the msbyte first */
        movw    bin_lo,r22
        add     bin_lo,len
        adc     bin_hi,zero

        /* Strip trailing (most-significant) zeros from bin */
2:      ld      __tmp_reg__,-bin
        cpse    __tmp_reg__,zero
        rjmp    3f                      /* Found a non-zero byte, stop */
        dec     len
        brne    2b
        inc     len                     /* But stop at 1 byte, so we print "0" */
3:      adiw    bin_lo,1                /* bin = one past the msbyte kept */

        /* The main loop, repeated while len > 0 */
4:      clr     lsbit
        ser     digit                   /* Sum of all bytes, mod 255 */
        mov     tlen,len

        /*
         * Pass 1, msb-to-lsb: Finding the input mod 10.
         *
         * We do two things here: divide by 2 (saving the lsbit), and sum
         * the result mod 255.  This is then used to compute the result
         * mod 5, which combined with the lsbit gives the decimal digit
         * we want.
         */
5:      ld      __tmp_reg__,-bin
        lsr     lsbit                   /* lsbit to carry bit */
        ror     __tmp_reg__
        st      bin,__tmp_reg__
        rol     lsbit                   /* carry bit to lsbit */
        add     digit,__tmp_reg__
        adc     digit,zero              /* End-around carry */
        dec     tlen
        brne    5b

        /* Reduce digit mod 15 (from 1 <= digit <= 255 to 1 <= digit <= 15) */
        mov     __tmp_reg__,digit
        swap    __tmp_reg__
        cbr     digit,15
        add     digit,__tmp_reg__       /* Add high halves to get carry bit */
        cbr     digit,15
        swap    digit
        adc     digit,zero              /* End-around carry */

        /* Reduce digit mod 5 */
        cpi     digit,10
        brlo    6f
        subi    digit,10
6:      cpi     digit,5
        brlo    7f
        subi    digit,5
7:
        /* Form and store ASCII digit (2*digit + lsbit) */
        add     lsbit,digit
        add     lsbit,digit
        ori     lsbit,'0'
        st      out+,lsbit

        /*
         * Pass 2, lsb-to-msb: dividing by 5
         *
         * Rather than do a general divide by 5, we can subtract the digit
         * to produce a multiple of 5, and then do an exact division by
         * multiplying by the 2-adic inverse of 5, 0xCCC...CCD.
         *
         * To get this into an even simpler form, we multiply by
         * 0x333...333 and negate.  Each byte is multiplied by 0x33 and
         * added to an accumulator to be used for each higher byte.
         *
         * The accumulator has to be 16 bits wide, but after storing
         * each output byte, we can fold the msbyte into the lsbyte.
         *
         * Negating the output can be "complement and add one", but
         * we do it as "subtract one and complement", initializing the
         * accumulator to 0xff, then complementing before storing.
         *
         * To subtract the digit without an additional carry propagation
         * pass, subtract 0x33 times the digit from the accumulator
         * to start.  (Since 0 <= digit <= 4, this is very easy.)
         */

        /* acc = 255 - (digit * 0x33) */
#if __AVR_HAVE_MUL__
        mul     digit,k
        mov     acc_lo,r0
#else
        mov     acc_lo,digit
        swap    acc_lo                  /* Digit < 16, so this is accum <<= 4 */
        add     acc_lo,digit            /* Multiply by 0x11 */
        mov     r0,acc_lo
        add     acc_lo,r0
        add     acc_lo,r0               /* Multiply by 3 */
#endif
        com     acc_lo
        clr     acc_hi

        /* Here's the actual loop */
        mov     tlen,len
8:      ld      r0,bin
        /* acc += 0x33 * r0 */
#if __AVR_HAVE_MUL__
        mul     r0,k                    /* Product lands in r1:r0 */
        add     acc_lo,r0
        adc     acc_hi,r1
#else
        /* Compute 0x11*r0 into digit:lsbit */
        mov     lsbit,r0
        swap    lsbit
        mov     digit,lsbit
        andi    digit,15                /* Mask off high 4 bits */
        eor     lsbit,digit             /* Mask off low 4 bits */
        add     lsbit,r0
        adc     digit,zero
        /* Now add it to the accumulator 3 times (there's no faster way) */
        add     acc_lo,lsbit
        adc     acc_hi,digit
        add     acc_lo,lsbit
        adc     acc_hi,digit
        add     acc_lo,lsbit
        adc     acc_hi,digit
#endif
        /* Store the complemented accumulator (*bin++ = ~accum) */
        mov     r0,acc_lo
        com     r0
        st      bin+,r0
        /* Fold the accumulator: acc = acc_hi + acc_lo */
        add     acc_lo,acc_hi
        clr     acc_hi
        adc     acc_hi,zero             /* Picks up the carry of the add above */
        dec     tlen
        brne    8b

        /*
         * End of main loop: check if the new msbyte was zero.  If so,
         * drop it (reduce len by 1), and test for termination.
         */
        cpse    r0,zero
        rjmp    4b
        sbiw    bin_lo,1
        dec     len
#if __AVR_HAVE_MUL__
        /*
         * This test must use the same macro spelling as every other
         * conditional in the file, otherwise the MUL build silently
         * takes the #else path and the clr below is never assembled.
         */
        brne    4b
        /* mul wrote r1 inside the loop: hand the zero register back */
        clr     __zero_reg__
#else
        /* Presumably 4: is beyond brne range in the longer non-MUL body */
        breq    9f
        rjmp    4b
9:
#endif
        /* Finally, put the return value in the expected place */
        movw    r24,out_lo
        ret
        .size   genprint, .-genprint

_______________________________________________
AVR-libc-dev mailing list
AVR-libc-dev@nongnu.org
https://lists.nongnu.org/mailman/listinfo/avr-libc-dev