Hi! I've found 32bit integer division in gcc too slow, and I managed to implement division in asm that's faster than what gcc 4.8 produces at -O4, about 2/3 the time (but I'm not sure if any of this is due to inlining). The algorithm is restoring division but unrolled and heavily optimized. Could this get in gcc or avr-libc, wherever the right place is?
The code is below, but can also be found on ideone: http://ideone.com/6i2yd3 #include <stdint.h> #define DIVIDE_ITER_0_6(i) \ " lsl %D[n]\n" \ " rol %A[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " or %D[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ " lsr __tmp_reg__\n" #define DIVIDE_ITER_7_7(i) \ " lsl %D[n]\n" \ " rol %A[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " or %D[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ " mov __tmp_reg__,%A[q]\n" #define DIVIDE_ITER_8_14(i) \ " lsl %C[n]\n" \ " rol %A[p]\n" \ " rol %B[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " sbc %B[p],%B[d]\n" \ " or %C[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ " lsr __tmp_reg__\n" #define DIVIDE_ITER_15_15(i) \ " lsl %C[n]\n" \ " rol %A[p]\n" \ " rol %B[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " sbc %B[p],%B[d]\n" \ " or %C[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ " mov __tmp_reg__,%A[q]\n" #define DIVIDE_ITER_16_22(i) \ " lsl %B[n]\n" \ " rol %A[p]\n" \ " rol %B[p]\n" \ " rol %C[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " sbc %B[p],%B[d]\n" \ " sbc %C[p],%C[d]\n" \ " or %B[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ " lsr __tmp_reg__\n" #define DIVIDE_ITER_23_23(i) \ " lsl %B[n]\n" \ " rol %A[p]\n" \ " rol %B[p]\n" \ " rol %C[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " sbc %B[p],%B[d]\n" \ " sbc %C[p],%C[d]\n" \ " or %B[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ " mov __tmp_reg__,%A[q]\n" \ " clr %A[q]\n" #define DIVIDE_ITER_24_29(i) \ " lsl %A[n]\n" \ " rol %A[p]\n" \ " rol %B[p]\n" \ " rol %C[p]\n" \ " rol %D[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " sbc %B[p],%B[d]\n" \ " sbc %C[p],%C[d]\n" \ " sbc %D[p],%D[d]\n" \ " or %A[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ " lsr __tmp_reg__\n" #define DIVIDE_ITER_30_30(i) \ " lsl %A[n]\n" \ " rol %A[p]\n" \ " rol %B[p]\n" \ " rol %C[p]\n" \ " rol %D[p]\n" \ " cp %A[p],%A[d]\n" \ " cpc %B[p],%B[d]\n" \ " cpc %C[p],%C[d]\n" \ " cpc %D[p],%D[d]\n" \ " brcs zero_bit_" #i "__%=\n" \ " sub %A[p],%A[d]\n" \ " sbc %B[p],%B[d]\n" \ " sbc %C[p],%C[d]\n" \ " sbc %D[p],%D[d]\n" \ " or %A[q],__tmp_reg__\n" \ "zero_bit_" #i "__%=:\n" \ uint32_t div_mine (uint32_t n, uint32_t d) { uint32_t p = 0; uint32_t q = 0x80; __asm__( " mov __tmp_reg__,%A[q]\n" DIVIDE_ITER_0_6(0) DIVIDE_ITER_0_6(1) DIVIDE_ITER_0_6(2) DIVIDE_ITER_0_6(3) DIVIDE_ITER_0_6(4) DIVIDE_ITER_0_6(5) DIVIDE_ITER_0_6(6) DIVIDE_ITER_7_7(7) DIVIDE_ITER_8_14(8) DIVIDE_ITER_8_14(9) DIVIDE_ITER_8_14(10) DIVIDE_ITER_8_14(11) DIVIDE_ITER_8_14(12) DIVIDE_ITER_8_14(13) DIVIDE_ITER_8_14(14) DIVIDE_ITER_15_15(15) DIVIDE_ITER_16_22(16) DIVIDE_ITER_16_22(17) DIVIDE_ITER_16_22(18) DIVIDE_ITER_16_22(19) DIVIDE_ITER_16_22(20) DIVIDE_ITER_16_22(21) DIVIDE_ITER_16_22(22) DIVIDE_ITER_23_23(23) DIVIDE_ITER_24_29(24) DIVIDE_ITER_24_29(25) DIVIDE_ITER_24_29(26) DIVIDE_ITER_24_29(27) DIVIDE_ITER_24_29(28) DIVIDE_ITER_24_29(29) DIVIDE_ITER_30_30(30) " lsl %A[n]\n" " rol %A[p]\n" " rol %B[p]\n" " rol %C[p]\n" " rol %D[p]\n" " cp %A[p],%A[d]\n" " cpc %B[p],%B[d]\n" " cpc %C[p],%C[d]\n" " cpc %D[p],%D[d]\n" " brcs end_zero_bit_%=\n" " inc %A[q]\n" "end_zero_bit_%=:\n" : [p] "=&r" (p), [q] "=&r" (q), [n] "=&r" (n) : "[p]" (p), "[q]" (q), "[n]" (n), [d] "r" (d) ); return q; } _______________________________________________ AVR-libc-dev mailing list AVR-libc-dev@nongnu.org https://lists.nongnu.org/mailman/listinfo/avr-libc-dev