The major problem here: this DOES NOT compile without optimization.
Would it be acceptable to #define it away in that case? The lesser bug is
that _delay_us_2() only works with a compile-time constant argument; but who
wants floating-point arithmetic at run time, anyway?

This delay.h adds _delay_us_2() which is more precise and has a larger max 
than _delay_us().  There is also a  _delay_loop_2b() added.

The _delay_us_2() uses the 4-clock, 16-bit sbiw loop. If a delay below the
loop's minimum is requested, it resolves into the right number of NOPs instead.
Also, 1, 2, or 3 NOPs are appended to pad out whatever remainder the 4-clock
loop cannot cover exactly.

The other function, _delay_loop_2b(), is a variant of _delay_loop_2() that
insists on a constant argument. The change is that it uses a clobber list to
claim R24+R25 for itself and loads them directly with LDI from the constant
argument. The way _delay_loop_2() works, its timing is less predictable
because the compiler may insert a MOV shuffle.

The else {/*NOTREACHED*/} could be filled in with a still larger delay routine 
for almost unlimited us count.

The included delaytest.c is the only testing I did on this delay.h; the
included listing was built with plain -O. In the listing, the C source lines
appear a few lines after the assembler that was generated for them.
--- /usr/avr/include/util/delay.h	2006-05-01 06:04:50.000000000 -0400
+++ delay.h	2006-08-20 16:31:03.000000000 -0400
@@ -83,6 +83,7 @@
 #if !defined(__DOXYGEN__)
 static inline void _delay_loop_1(uint8_t __count) __attribute__((always_inline));
 static inline void _delay_loop_2(uint16_t __count) __attribute__((always_inline));
+static inline void _delay_loop_2b(uint16_t __count) __attribute__((always_inline));
 static inline void _delay_us(double __us) __attribute__((always_inline));
 static inline void _delay_ms(double __ms) __attribute__((always_inline));
 #endif
@@ -131,6 +132,33 @@
 	);
 }
 
+/** \ingroup util_delay
+
+    Just like __delay_loop_2 but "plan b" explicitly uses r24-5 and clobber
+    directive to (hopefully) repel any mov shenanigans that may eat clocks.
+    Delay loop using a 16-bit counter \c __count, so up to 65536
+    iterations are possible.  (The value 65536 would have to be
+    passed as 0.)  The loop executes four CPU cycles per iteration.
+    not including the overhead the compiler requires to setup the
+    counter register pair. 
+
+    Thus, at a CPU speed of 1 MHz, delays of up to about 262.1
+    milliseconds can be achieved.
+ */
+#define _delay_loop_2b( __count)\
+{\
+	__asm__ volatile (\
+		"ldi r24,lo8(%0) \n\t"\
+		"ldi r25,hi8(%0) \n\t"\
+		"1: sbiw r24,1 \n\t"\
+		"brne 1b \n\t"\
+                :\
+		: "n" (__count)\
+                : "r24", "r25"\
+	);\
+}
+
+
 #ifndef F_CPU
 /* prevent compiler error by supplying a default */
 # warning "F_CPU not defined for <util/delay.h>"
@@ -140,25 +168,46 @@
 /**
    \ingroup util_delay
 
-   Perform a delay of \c __us microseconds, using _delay_loop_1().
-
+   Perform a delay of \c __us microseconds, using _delay_loop_2b().
+   This routine will use nop shims to be as precise as possible, down to just 1 nop.
+   Or, in the case of 0.9 clocks-worth of delay, this will do NOTHING (IOW, it rounds down).
+   It needs gcc optimization (-O) in order to be inlined properly.
+   Optimization will clash with using -g to hand-count a listing, sorry.
    The macro F_CPU is supposed to be defined to a
    constant defining the CPU clock frequency (in Hertz).
 
-   The maximal possible delay is 768 us / F_CPU in MHz.
+   The maximal possible delay is ~262.14 ms (not us!) / F_CPU in MHz.
  */
 void
-_delay_us(double __us)
+_delay_us(const double __us)
 {
-	uint8_t __ticks;
-	double __tmp = ((F_CPU) / 3e6) * __us;
-	if (__tmp < 1.0)
-		__ticks = 1;
-	else if (__tmp > 255)
-		__ticks = 0;	/* i.e. 256 */
-	else
-		__ticks = (uint8_t)__tmp;
-	_delay_loop_1(__ticks);
+	const double __clocks_per_us=((F_CPU)/1e6);
+	const double __clocks_delay = __clocks_per_us * __us;
+	const char __loop_2b_fixed=7; // the 4 ldi clocks + the 3clk last loop
+        const uint32_t __loop_2b_runs = ((__clocks_delay - __loop_2b_fixed) / 4)+1; // +1, don't forget the 3clk loop!
+        const double __remainder_clocks= __clocks_delay - ((__loop_2b_runs-1) * 4 + __loop_2b_fixed); //the minus 1 un-considers the 3clk last loop, which is counted in __loop_3_fixed
+
+	if(__loop_2b_runs < 65536) {
+		if (__clocks_delay < 1) {/*DONOTHING*/}
+		else if (__clocks_delay < 2) {asm volatile ("nop");}
+		else if (__clocks_delay < 3) {asm volatile ("rjmp +0");} /*rjmp 0(implicit PC+1) = 2 nops*/
+		else if (__clocks_delay < 4) {asm volatile ("rjmp +0\n\t nop");}
+		else if (__clocks_delay < 5) {asm volatile ("rjmp +0\n\t rjmp +0");}
+                else if (__clocks_delay < 6) {asm volatile ("rjmp +0\n\t rjmp +0 \n\t nop");} 
+                else if (__clocks_delay < 7) {asm volatile ("rjmp +0\n\t rjmp +0 \n\t rjmp +0");}
+		else   if (__remainder_clocks < 1) {
+			_delay_loop_2b((uint16_t)__loop_2b_runs);
+		} else if (__remainder_clocks < 2) {
+			_delay_loop_2b((uint16_t)__loop_2b_runs);
+			asm volatile ("nop");
+		} else if (__remainder_clocks < 3) {
+			_delay_loop_2b((uint16_t)__loop_2b_runs);
+			asm volatile ("rjmp +0");
+		} else if (__remainder_clocks < 4) {
+			_delay_loop_2b((uint16_t)__loop_2b_runs);
+			asm volatile ("rjmp +0\n\t nop");
+		}
+	} else {/*NOTREACHED*/}
 }
 
 
/* Copyright (c) 2002, Marek Michalkiewicz
   Copyright (c) 2004,2005 Joerg Wunsch
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in
     the documentation and/or other materials provided with the
     distribution.

   * Neither the name of the copyright holders nor the names of
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE. */

/* $Id: delay.h,v 1.1.2.1 2005/12/12 23:19:49 joerg_wunsch Exp $ */

#ifndef _UTIL_DELAY_H_
#define _UTIL_DELAY_H_ 1

#include <inttypes.h>

/** \defgroup util_delay <util/delay.h>: Busy-wait delay loops
    \code
    #define F_CPU 1000000UL  // 1 MHz
    //#define F_CPU 14.7456E6
    #include <util/delay.h>
    \endcode

    \note As an alternative method, it is possible to pass the
    F_CPU macro down to the compiler from the Makefile.
    Obviously, in that case, no \c \#define statement should be
    used.

    The functions in this header file implement simple delay loops
    that perform a busy-waiting.  They are typically used to
    facilitate short delays in the program execution.  They are
    implemented as count-down loops with a well-known CPU cycle
    count per loop iteration.  As such, no other processing can
    occur simultaneously.  It should be kept in mind that the
    functions described here do not disable interrupts.

    In general, for long delays, the use of hardware timers is
    much preferable, as they free the CPU, and allow for
    concurrent processing of other events while the timer is
    running.  However, in particular for very short delays, the
    overhead of setting up a hardware timer is too much compared
    to the overall delay time.

    Three inline functions are provided for the actual delay algorithms:
    _delay_loop_1(), _delay_loop_2() and _delay_loop_2b().

    Three wrapper functions, _delay_us(), _delay_us_2() and _delay_ms(),
    allow the specification of microsecond and millisecond delays
    directly, using the application-supplied macro F_CPU as the CPU
    clock frequency (in Hertz).  These functions operate on double typed
    arguments, however when optimization is turned on, the entire
    floating-point calculation will be done at compile-time.

    \note When using _delay_us() and _delay_ms(), the expressions
    passed as arguments to these functions shall be compile-time
    constants, otherwise the floating-point calculations to setup the
    loops will be done at run-time, thereby drastically increasing
    both the resulting code size, as well as the time required to
    setup the loops.
*/

#if !defined(__DOXYGEN__)
/* Forward declarations, hidden from Doxygen.  Every delay routine is
   declared static inline with the always_inline attribute; the
   definitions follow further down in this header. */
static inline void _delay_loop_1(uint8_t __count) __attribute__((always_inline));
static inline void _delay_loop_2(uint16_t __count) __attribute__((always_inline));
static inline void _delay_loop_2b(uint16_t __count) __attribute__((always_inline));
static inline void _delay_us(double __us) __attribute__((always_inline));
static inline void _delay_us_2(double __us) __attribute__((always_inline));
static inline void _delay_ms(double __ms) __attribute__((always_inline));
#endif

/** \ingroup util_delay

    Delay loop using an 8-bit counter \c __count, so up to 256
    iterations are possible.  (The value 256 would have to be passed
    as 0.)  The loop executes three CPU cycles per iteration, not
    including the overhead the compiler needs to setup the counter
    register.

    Thus, at a CPU speed of 1 MHz, delays of up to 768 microseconds
    can be achieved.

    \param __count  number of loop iterations; 0 requests 256 iterations.
*/
void
_delay_loop_1(uint8_t __count)
{
	/* volatile: the loop produces nothing the program uses, so the
	   asm must be marked to keep the optimizer from discarding it. */
	__asm__ volatile (
		"1: dec %0" "\n\t"	/* count down by one */
		"brne 1b"		/* repeat until the counter reaches zero */
		: "=r" (__count)	/* output: the counter register is modified */
		: "0" (__count)		/* input: initial count, in the same register */
	);
}

/** \ingroup util_delay

    Delay loop using a 16-bit counter \c __count, so up to 65536
    iterations are possible.  (The value 65536 would have to be
    passed as 0.)  The loop executes four CPU cycles per iteration,
    not including the overhead the compiler requires to setup the
    counter register pair.

    Thus, at a CPU speed of 1 MHz, delays of up to about 262.1
    milliseconds can be achieved.

    \param __count  number of loop iterations; 0 requests 65536 iterations.
 */
void
_delay_loop_2(uint16_t __count)
{
	/* volatile: the loop produces nothing the program uses, so the
	   asm must be marked to keep the optimizer from discarding it. */
	__asm__ volatile (
		"1: sbiw %0,1" "\n\t"	/* 16-bit count down by one */
		"brne 1b"		/* repeat until the counter reaches zero */
		: "=w" (__count)	/* output: "w" = a register pair usable by sbiw */
		: "0" (__count)		/* input: initial count, in the same register pair */
	);
}

/** \ingroup util_delay

    Just like _delay_loop_2(), but this "plan b" names r24/r25 explicitly
    and lists them as clobbered, to (hopefully) keep the compiler from
    adding register moves that would eat extra clocks.

    Delay loop using a 16-bit counter \c __count, so up to 65536
    iterations are possible.  (The value 65536 would have to be
    passed as 0.)  The loop executes four CPU cycles per iteration,
    not including the two ldi instructions that load the counter
    register pair.

    Thus, at a CPU speed of 1 MHz, delays of up to about 262.1
    milliseconds can be achieved.

    \param __count  number of loop iterations; 0 requests 65536 iterations.

    \note \c __count is passed with the "n" (immediate integer) constraint,
    so it has to be known at compile time once the call has been inlined;
    this is why the routine does not compile without optimization.
 */
void
_delay_loop_2b(uint16_t __count)
{
	__asm__ volatile (
		"ldi r24,lo8(%0) \n\t"	/* low byte of the constant count */
		"ldi r25,hi8(%0) \n\t"	/* high byte of the constant count */
		"1: sbiw r24,1 \n\t"	/* 16-bit count down on r25:r24 */
		"brne 1b \n\t"		/* repeat until the counter reaches zero */
                :			/* no outputs */
		: "n" (__count)		/* input: compile-time constant only */
                : "r24", "r25"		/* registers overwritten by the loop */
	);
}


#ifndef F_CPU
/* Prevent a compiler error by supplying a default of 1 MHz.  The warning
   points out that F_CPU was not defined before this header was included,
   so the delays are only correct if the CPU really runs at 1 MHz. */
# warning "F_CPU not defined for <util/delay.h>"
# define F_CPU 1000000UL
#endif

/**
   \ingroup util_delay

   Busy-wait for \c __us microseconds by running _delay_loop_1().

   F_CPU must be defined as a constant giving the CPU clock
   frequency in Hertz.

   The requested time is converted to a whole number of 3-clock loop
   iterations (rounded down).  Anything shorter than one iteration
   runs one iteration; anything above 255 iterations runs the maximum
   of 256, so the maximal possible delay is 768 us / F_CPU in MHz.
 */
void
_delay_us(double __us)
{
	/* One loop iteration takes 3 clocks, hence F_CPU / 3e6 iterations per us. */
	const double __loops = ((F_CPU) / 3e6) * __us;
	const uint8_t __count = (__loops < 1.0) ? 1
		: (__loops > 255) ? 0	/* i.e. 256 */
		: (uint8_t)__loops;

	_delay_loop_1(__count);
}

/**
   \ingroup util_delay

   Perform a delay of \c __us microseconds with a resolution of one CPU
   clock, using _delay_loop_2b() plus nop / rjmp padding.

   The requested time is converted to CPU clocks and rounded down, so a
   request worth 0.9 clocks emits NOTHING.  Delays of fewer than 7 clocks
   are built only from "nop" (1 clock) and "rjmp .+0" (2 clocks); longer
   delays run the 4-clock loop of _delay_loop_2b() and append 0 to 3
   clocks of padding.

   It must be called with a value known at compile-time, and it needs gcc
   optimization (-O) in order to be inlined properly, because
   _delay_loop_2b() only accepts a constant operand.
   Optimization will clash with using -g to hand-count a listing, sorry.

   The macro F_CPU is supposed to be defined to a
   constant defining the CPU clock frequency (in Hertz).

   The maximal possible delay is ~262.14 ms (not us!) / F_CPU in MHz.
   A longer request is not clamped: no delay code is emitted for it.
 */
void
_delay_us_2(const double __us)
{
	const double __clocks_per_us = ((F_CPU) / 1e6);
	const double __clocks_delay = __clocks_per_us * __us;
	/* Clocks _delay_loop_2b() spends outside its 4-clock iterations:
	   two 1-clock ldi plus the 3-clock last pass (sbiw 2 + brne not
	   taken 1). */
	const char __loop_2b_fixed = 5;

	/* The 2-clock filler must be written "rjmp .+0": only the leading
	   dot makes the target PC-relative.  A bare "rjmp +0" is assembled
	   as a jump to address 0. */
	if (__clocks_delay < 1) {/*DONOTHING*/}
	else if (__clocks_delay < 2) {__asm__ volatile ("nop");}
	else if (__clocks_delay < 3) {__asm__ volatile ("rjmp .+0");}
	else if (__clocks_delay < 4) {__asm__ volatile ("rjmp .+0\n\t nop");}
	else if (__clocks_delay < 5) {__asm__ volatile ("rjmp .+0\n\t rjmp .+0");}
	else if (__clocks_delay < 6) {__asm__ volatile ("rjmp .+0\n\t rjmp .+0 \n\t nop");}
	else if (__clocks_delay < 7) {__asm__ volatile ("rjmp .+0\n\t rjmp .+0 \n\t rjmp .+0");}
	else {
		/* +1: the 3-clock last pass is part of __loop_2b_fixed but is
		   still one run of the loop.  Range-check in floating point
		   first so the integer conversion below is always defined. */
		const double __loop_2b_runs_f = ((__clocks_delay - __loop_2b_fixed) / 4) + 1;

		if (__loop_2b_runs_f < 65536) {
			const uint32_t __loop_2b_runs = (uint32_t)__loop_2b_runs_f;
			/* Clocks still owed after the loop; the -1 takes out
			   the last pass already counted in __loop_2b_fixed. */
			const double __remainder_clocks = __clocks_delay
				- ((__loop_2b_runs - 1) * 4 + __loop_2b_fixed);

			_delay_loop_2b((uint16_t)__loop_2b_runs);
			if (__remainder_clocks < 1) {/*DONOTHING*/}
			else if (__remainder_clocks < 2) {__asm__ volatile ("nop");}
			else if (__remainder_clocks < 3) {__asm__ volatile ("rjmp .+0");}
			else if (__remainder_clocks < 4) {__asm__ volatile ("rjmp .+0\n\t nop");}
		} else {/* too long for the 16-bit counter: nothing is emitted */}
	}
}


/**
   \ingroup util_delay

   Busy-wait for \c __ms milliseconds by running _delay_loop_2().

   F_CPU must be defined as a constant giving the CPU clock
   frequency in Hertz.

   The requested time is converted to a whole number of 4-clock loop
   iterations (rounded down).  Anything shorter than one iteration
   runs one iteration; anything above 65535 iterations runs the maximum
   of 65536, so the maximal possible delay is 262.14 ms / F_CPU in MHz.
 */
void
_delay_ms(double __ms)
{
	/* One loop iteration takes 4 clocks, hence F_CPU / 4e3 iterations per ms. */
	const double __loops = ((F_CPU) / 4e3) * __ms;
	const uint16_t __count = (__loops < 1.0) ? 1
		: (__loops > 65535) ? 0	/* i.e. 65536 */
		: (uint16_t)__loops;

	_delay_loop_2(__count);
}

#endif /* _UTIL_DELAY_H_ */
   1               		.file	"delaytest.c"
   2               		.arch atmega32
   3               	__SREG__ = 0x3f
   4               	__SP_H__ = 0x3e
   5               	__SP_L__ = 0x3d
   6               	__tmp_reg__ = 0
   7               	__zero_reg__ = 1
   8               		.global __do_copy_data
   9               		.global __do_clear_bss
  12               		.text
  13               	.Ltext0:
  69               	.global	main
  71               	main:
  72               		.stabd	46,0,0
   1:delaytest.c   **** #define F_CPU 10e6
   2:delaytest.c   **** #define MCU atmega32
   3:delaytest.c   **** #include <avr/io.h>
   4:delaytest.c   **** //#include <util/delay.h>
   5:delaytest.c   **** #include "delay.h"
   6:delaytest.c   **** int main() {
  74               	.LM0:
  75               	/* prologue: frame size=0 */
  76 0000 C0E0      		ldi r28,lo8(__stack - 0)
  77 0002 D0E0      		ldi r29,hi8(__stack - 0)
  78 0004 DEBF      		out __SP_H__,r29
  79 0006 CDBF      		out __SP_L__,r28
  80               	/* prologue end (size=4) */
   7:delaytest.c   **** 	char a=PINB;
  82               	.LM1:
  83 0008 86B3      		in r24,54-0x20
   8:delaytest.c   **** 	DDRC=12;
  85               	.LM2:
  86 000a 8CE0      		ldi r24,lo8(12)
  87 000c 84BB      		out 52-0x20,r24
  88               	.LBB90:
  89               	.LBB91:
  91               	.Ltext1:
   1:delay.h       **** /* Copyright (c) 2002, Marek Michalkiewicz
   2:delay.h       ****    Copyright (c) 2004,2005 Joerg Wunsch
   3:delay.h       ****    All rights reserved.
   4:delay.h       **** 
   5:delay.h       ****    Redistribution and use in source and binary forms, with or without
   6:delay.h       ****    modification, are permitted provided that the following conditions are met:
   7:delay.h       **** 
   8:delay.h       ****    * Redistributions of source code must retain the above copyright
   9:delay.h       ****      notice, this list of conditions and the following disclaimer.
  10:delay.h       **** 
  11:delay.h       ****    * Redistributions in binary form must reproduce the above copyright
  12:delay.h       ****      notice, this list of conditions and the following disclaimer in
  13:delay.h       ****      the documentation and/or other materials provided with the
  14:delay.h       ****      distribution.
  15:delay.h       **** 
  16:delay.h       ****    * Neither the name of the copyright holders nor the names of
  17:delay.h       ****      contributors may be used to endorse or promote products derived
  18:delay.h       ****      from this software without specific prior written permission.
  19:delay.h       **** 
  20:delay.h       ****   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21:delay.h       ****   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22:delay.h       ****   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23:delay.h       ****   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  24:delay.h       ****   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25:delay.h       ****   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26:delay.h       ****   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27:delay.h       ****   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28:delay.h       ****   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29:delay.h       ****   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30:delay.h       ****   POSSIBILITY OF SUCH DAMAGE. */
  31:delay.h       **** 
  32:delay.h       **** /* $Id: delay.h,v 1.1.2.1 2005/12/12 23:19:49 joerg_wunsch Exp $ */
  33:delay.h       **** 
  34:delay.h       **** #ifndef _UTIL_DELAY_H_
  35:delay.h       **** #define _UTIL_DELAY_H_ 1
  36:delay.h       **** 
  37:delay.h       **** #include <inttypes.h>
  38:delay.h       **** 
  39:delay.h       **** /** \defgroup util_delay <util/delay.h>: Busy-wait delay loops
  40:delay.h       ****     \code
  41:delay.h       ****     #define F_CPU 1000000UL  // 1 MHz
  42:delay.h       ****     //#define F_CPU 14.7456E6
  43:delay.h       ****     #include <util/delay.h>
  44:delay.h       ****     \endcode
  45:delay.h       **** 
  46:delay.h       ****     \note As an alternative method, it is possible to pass the
  47:delay.h       ****     F_CPU macro down to the compiler from the Makefile.
  48:delay.h       ****     Obviously, in that case, no \c \#define statement should be
  49:delay.h       ****     used.
  50:delay.h       **** 
  51:delay.h       ****     The functions in this header file implement simple delay loops
  52:delay.h       ****     that perform a busy-waiting.  They are typically used to
  53:delay.h       ****     facilitate short delays in the program execution.  They are
  54:delay.h       ****     implemented as count-down loops with a well-known CPU cycle
  55:delay.h       ****     count per loop iteration.  As such, no other processing can
  56:delay.h       ****     occur simultaneously.  It should be kept in mind that the
  57:delay.h       ****     functions described here do not disable interrupts.
  58:delay.h       **** 
  59:delay.h       ****     In general, for long delays, the use of hardware timers is
  60:delay.h       ****     much preferrable, as they free the CPU, and allow for
  61:delay.h       ****     concurrent processing of other events while the timer is
  62:delay.h       ****     running.  However, in particular for very short delays, the
  63:delay.h       ****     overhead of setting up a hardware timer is too much compared
  64:delay.h       ****     to the overall delay time.
  65:delay.h       **** 
  66:delay.h       ****     Two inline functions are provided for the actual delay algorithms.
  67:delay.h       **** 
  68:delay.h       ****     Two wrapper functions allow the specification of microsecond, and
  69:delay.h       ****     millisecond delays directly, using the application-supplied macro
  70:delay.h       ****     F_CPU as the CPU clock frequency (in Hertz).  These functions
  71:delay.h       ****     operate on double typed arguments, however when optimization is
  72:delay.h       ****     turned on, the entire floating-point calculation will be done at
  73:delay.h       ****     compile-time.
  74:delay.h       **** 
  75:delay.h       ****     \note When using _delay_us() and _delay_ms(), the expressions
  76:delay.h       ****     passed as arguments to these functions shall be compile-time
  77:delay.h       ****     constants, otherwise the floating-point calculations to setup the
  78:delay.h       ****     loops will be done at run-time, thereby drastically increasing
  79:delay.h       ****     both the resulting code size, as well as the time required to
  80:delay.h       ****     setup the loops.
  81:delay.h       **** */
  82:delay.h       **** 
  83:delay.h       **** #if !defined(__DOXYGEN__)
  84:delay.h       **** static inline void _delay_loop_1(uint8_t __count) __attribute__((always_inline));
  85:delay.h       **** static inline void _delay_loop_2(uint16_t __count) __attribute__((always_inline));
  86:delay.h       **** static inline void _delay_loop_2b(uint16_t __count) __attribute__((always_inline));
  87:delay.h       **** static inline void _delay_us(double __us) __attribute__((always_inline));
  88:delay.h       **** static inline void _delay_us_2(double __us) __attribute__((always_inline));
  89:delay.h       **** static inline void _delay_ms(double __ms) __attribute__((always_inline));
  90:delay.h       **** #endif
  91:delay.h       **** 
  92:delay.h       **** /** \ingroup util_delay
  93:delay.h       **** 
  94:delay.h       ****     Delay loop using an 8-bit counter \c __count, so up to 256
  95:delay.h       ****     iterations are possible.  (The value 256 would have to be passed
  96:delay.h       ****     as 0.)  The loop executes three CPU cycles per iteration, not
  97:delay.h       ****     including the overhead the compiler needs to setup the counter
  98:delay.h       ****     register.
  99:delay.h       **** 
 100:delay.h       ****     Thus, at a CPU speed of 1 MHz, delays of up to 768 microseconds
 101:delay.h       ****     can be achieved.
 102:delay.h       **** */
 103:delay.h       **** void
 104:delay.h       **** _delay_loop_1(uint8_t __count)
 105:delay.h       **** {
 106:delay.h       **** 	__asm__ volatile (
 107:delay.h       **** 		"1: dec %0" "\n\t"
 108:delay.h       **** 		"brne 1b"
 109:delay.h       **** 		: "=r" (__count)
 110:delay.h       **** 		: "0" (__count)
 111:delay.h       **** 	);
 112:delay.h       **** }
 113:delay.h       **** 
 114:delay.h       **** /** \ingroup util_delay
 115:delay.h       **** 
 116:delay.h       ****     Delay loop using a 16-bit counter \c __count, so up to 65536
 117:delay.h       ****     iterations are possible.  (The value 65536 would have to be
 118:delay.h       ****     passed as 0.)  The loop executes four CPU cycles per iteration,
 119:delay.h       ****     not including the overhead the compiler requires to setup the
 120:delay.h       ****     counter register pair.
 121:delay.h       **** 
 122:delay.h       ****     Thus, at a CPU speed of 1 MHz, delays of up to about 262.1
 123:delay.h       ****     milliseconds can be achieved.
 124:delay.h       ****  */
 125:delay.h       **** void
 126:delay.h       **** _delay_loop_2(uint16_t __count)
 127:delay.h       **** {
 128:delay.h       **** 	__asm__ volatile (
 129:delay.h       **** 		"1: sbiw %0,1" "\n\t"
 130:delay.h       **** 		"brne 1b"
 131:delay.h       **** 		: "=w" (__count)
 132:delay.h       **** 		: "0" (__count)
 133:delay.h       **** 	);
 134:delay.h       **** }
 135:delay.h       **** 
 136:delay.h       **** /** \ingroup util_delay
 137:delay.h       **** 
 138:delay.h       ****     Just like __delay_loop_2 but "plan b" explicitly uses r24-5 and clobber
 139:delay.h       ****     directive to (hopefully) repel any mov shenanigans that may eat clocks.
 140:delay.h       ****     Delay loop using a 16-bit counter \c __count, so up to 65536
 141:delay.h       ****     iterations are possible.  (The value 65536 would have to be
 142:delay.h       ****     passed as 0.)  The loop executes four CPU cycles per iteration.
 143:delay.h       ****     not including the overhead the compiler requires to setup the
 144:delay.h       ****     counter register pair. 
 145:delay.h       **** 
 146:delay.h       ****     Thus, at a CPU speed of 1 MHz, delays of up to about 262.1
 147:delay.h       ****     milliseconds can be achieved.
 148:delay.h       ****  */
 149:delay.h       **** void
 150:delay.h       **** _delay_loop_2b(uint16_t __count)
 151:delay.h       **** {
 152:delay.h       **** 	__asm__ volatile (
 153:delay.h       **** 		"ldi r24,lo8(%0) \n\t"
 154:delay.h       **** 		"ldi r25,hi8(%0) \n\t"
 155:delay.h       **** 		"1: sbiw r24,1 \n\t"
 156:delay.h       **** 		"brne 1b \n\t"
 157:delay.h       ****                 :
 158:delay.h       **** 		: "n" (__count)
 159:delay.h       ****                 : "r24", "r25"
 160:delay.h       **** 	);
 161:delay.h       **** }
 162:delay.h       **** 
 163:delay.h       **** 
 164:delay.h       **** #ifndef F_CPU
 165:delay.h       **** /* prevent compiler error by supplying a default */
 166:delay.h       **** # warning "F_CPU not defined for <util/delay.h>"
 167:delay.h       **** # define F_CPU 1000000UL
 168:delay.h       **** #endif
 169:delay.h       **** 
 170:delay.h       **** /**
 171:delay.h       ****    \ingroup util_delay
 172:delay.h       **** 
 173:delay.h       ****    Perform a delay of \c __us microseconds, using _delay_loop_1().
 174:delay.h       **** 
 175:delay.h       ****    The macro F_CPU is supposed to be defined to a
 176:delay.h       ****    constant defining the CPU clock frequency (in Hertz).
 177:delay.h       **** 
 178:delay.h       ****    The maximal possible delay is 768 us / F_CPU in MHz.
 179:delay.h       ****  */
 180:delay.h       **** void
 181:delay.h       **** _delay_us(double __us)
 182:delay.h       **** {
 183:delay.h       ****         uint8_t __ticks;
 184:delay.h       ****         double __tmp = ((F_CPU) / 3e6) * __us;
 185:delay.h       ****         if (__tmp < 1.0)
 186:delay.h       ****                 __ticks = 1;
 187:delay.h       ****         else if (__tmp > 255)
 188:delay.h       ****                 __ticks = 0;    /* i.e. 256 */
 189:delay.h       ****         else
 190:delay.h       ****                 __ticks = (uint8_t)__tmp;
 191:delay.h       ****         _delay_loop_1(__ticks);
 192:delay.h       **** }
 193:delay.h       **** 
 194:delay.h       **** /**
 195:delay.h       ****    \ingroup util_delay
 196:delay.h       **** 
 197:delay.h       ****    Perform a delay of \c __us microseconds, using _delay_loop_2b().
 198:delay.h       ****    This routine will use nop shims to be as precise as possible, down to just 1 nop.
 199:delay.h       ****    Or, in the case of 0.9 clocks-worth of delay, this will do NOTHING (IOW, it rounds down).
 200:delay.h       ****    It must be called with a value known at compile-time.
 201:delay.h       ****    It needs gcc optimization (-O) in order to be inlined properly.
 202:delay.h       ****    Optimization will clash with using -g to hand-count a listing, sorry.
 203:delay.h       ****    The macro F_CPU is supposed to be defined to a
 204:delay.h       ****    constant defining the CPU clock frequency (in Hertz).
 205:delay.h       **** 
 206:delay.h       ****    The maximal possible delay is ~262.14 ms (not us!) / F_CPU in MHz.
 207:delay.h       ****  */
 208:delay.h       **** void
 209:delay.h       **** _delay_us_2(const double __us)
 210:delay.h       **** {
 211:delay.h       **** 	const double __clocks_per_us=((F_CPU)/1e6);
 212:delay.h       **** 	const double __clocks_delay = __clocks_per_us * __us;
 213:delay.h       **** 	const char __loop_2b_fixed=7; // the 4 ldi clocks + the 3clk last loop
 214:delay.h       ****         const uint32_t __loop_2b_runs = ((__clocks_delay - __loop_2b_fixed) / 4)+1; // +1, don't fo
 215:delay.h       ****         const double __remainder_clocks= __clocks_delay - ((__loop_2b_runs-1) * 4 + __loop_2b_fixed
 216:delay.h       **** 
 217:delay.h       **** 	if(__loop_2b_runs < 65536) {
 218:delay.h       **** 		if (__clocks_delay < 1) {/*DONOTHING*/}
 219:delay.h       **** 		else if (__clocks_delay < 2) {asm volatile ("nop");}
 220:delay.h       **** 		else if (__clocks_delay < 3) {asm volatile ("rjmp +0");} /*rjmp 0(implicit PC+1) = 2 nops*/
 221:delay.h       **** 		else if (__clocks_delay < 4) {asm volatile ("rjmp +0\n\t nop");}
 222:delay.h       **** 		else if (__clocks_delay < 5) {asm volatile ("rjmp +0\n\t rjmp +0");}
  93               	.LM3:
  94               	/* #APP */
  95 000e F8CF      		rjmp +0
  96 0010 F7CF      		 rjmp +0
  97               	/* #NOAPP */
  98               	.LBE91:
  99               	.LBE90:
 101               	.Ltext2:
   9:delaytest.c   **** 	_delay_us_2(0.4);
  10:delaytest.c   **** 	DDRC=13;
 103               	.LM4:
 104 0012 8DE0      		ldi r24,lo8(13)
 105 0014 84BB      		out 52-0x20,r24
 106               	.LBB92:
 107               	.LBB93:
 108               	.LBB94:
 109               	.LBB95:
 111               	.Ltext3:
 113               	.LM5:
 114               	/* #APP */
 115 0016 82E0      		ldi r24,lo8(2) 
 116 0018 90E0      		ldi r25,hi8(2) 
 117 001a 0197      		1: sbiw r24,1 
 118 001c F1F7      		brne 1b 
 119               		
 120               	/* #NOAPP */
 121               	.LBE95:
 122               	.LBE94:
 223:delay.h       ****                 else if (__clocks_delay < 6) {asm volatile ("rjmp +0\n\t rjmp +0 \n\t nop");} 
 224:delay.h       ****                 else if (__clocks_delay < 7) {asm volatile ("rjmp +0\n\t rjmp +0 \n\t rjmp +0");}
 225:delay.h       **** 		else   if (__remainder_clocks < 1) {
 226:delay.h       **** 			_delay_loop_2b((uint16_t)__loop_2b_runs);
 227:delay.h       **** 		} else if (__remainder_clocks < 2) {
 228:delay.h       **** 			_delay_loop_2b((uint16_t)__loop_2b_runs);
 229:delay.h       **** 			asm volatile ("nop");
 230:delay.h       **** 		} else if (__remainder_clocks < 3) {
 231:delay.h       **** 			_delay_loop_2b((uint16_t)__loop_2b_runs);
 232:delay.h       **** 			asm volatile ("rjmp +0");
 124               	.LM6:
 125               	/* #APP */
 126 001e F0CF      		rjmp +0
 127               	/* #NOAPP */
 128               	.LBE93:
 129               	.LBE92:
 131               	.Ltext4:
  11:delaytest.c   **** 	_delay_us_2(1.3);
  12:delaytest.c   **** 	DDRC=14;
 133               	.LM7:
 134 0020 8EE0      		ldi r24,lo8(14)
 135 0022 84BB      		out 52-0x20,r24
 136               	.LBB96:
 137               	.LBB97:
 138               	.LBB98:
 139               	.LBB99:
 141               	.Ltext5:
 143               	.LM8:
 144               	/* #APP */
 145 0024 84E0      		ldi r24,lo8(4) 
 146 0026 90E0      		ldi r25,hi8(4) 
 147 0028 0197      		1: sbiw r24,1 
 148 002a F1F7      		brne 1b 
 149               		
 150               	/* #NOAPP */
 151               	.LBE99:
 152               	.LBE98:
 233:delay.h       **** 		} else if (__remainder_clocks < 4) {
 234:delay.h       **** 			_delay_loop_2b((uint16_t)__loop_2b_runs);
 235:delay.h       **** 			asm volatile ("rjmp +0\n\t nop");
 154               	.LM9:
 155               	/* #APP */
 156 002c E9CF      		rjmp +0
 157 002e 0000      		 nop
 158               	/* #NOAPP */
 159               	.LBE97:
 160               	.LBE96:
 162               	.Ltext6:
  13:delaytest.c   **** 	_delay_us_2(2.2);
  14:delaytest.c   **** 	DDRC=15;
 164               	.LM10:
 165 0030 8FE0      		ldi r24,lo8(15)
 166 0032 84BB      		out 52-0x20,r24
 167               	.LBB100:
 168               	.LBB101:
 169               	.LBB102:
 170               	.LBB103:
 172               	.Ltext7:
 174               	.LM11:
 175               	/* #APP */
 176 0034 87E0      		ldi r24,lo8(7) 
 177 0036 90E0      		ldi r25,hi8(7) 
 178 0038 0197      		1: sbiw r24,1 
 179 003a F1F7      		brne 1b 
 180               		
 181               	/* #NOAPP */
 182               	.LBE103:
 183               	.LBE102:
 184               	.LBE101:
 185               	.LBE100:
 187               	.Ltext8:
  15:delaytest.c   **** 	_delay_us_2(3.1);
  16:delaytest.c   **** 	DDRC=16;
 189               	.LM12:
 190 003c 80E1      		ldi r24,lo8(16)
 191 003e 84BB      		out 52-0x20,r24
  17:delaytest.c   **** 	//_delay_us_2(a);
  18:delaytest.c   **** 	DDRC=4;
 193               	.LM13:
 194 0040 84E0      		ldi r24,lo8(4)
 195 0042 84BB      		out 52-0x20,r24
 196               	.LBB104:
 197               	.LBB105:
 198               	.LBB106:
 199               	.LBB107:
 201               	.Ltext9:
 203               	.LM14:
 204               	/* #APP */
 205 0044 86E3      		ldi r24,lo8(54) 
 206 0046 90E0      		ldi r25,hi8(54) 
 207 0048 0197      		1: sbiw r24,1 
 208 004a F1F7      		brne 1b 
 209               		
 210               	/* #NOAPP */
 211               	.LBE107:
 212               	.LBE106:
 214               	.LM15:
 215               	/* #APP */
 216 004c 0000      		nop
 217               	/* #NOAPP */
 218               	.LBE105:
 219               	.LBE104:
 221               	.Ltext10:
  19:delaytest.c   **** 	_delay_us_2(22);
  20:delaytest.c   **** 	DDRC=17;
 223               	.LM16:
 224 004e 81E1      		ldi r24,lo8(17)
 225 0050 84BB      		out 52-0x20,r24
 226               	.LBB108:
 227               	.LBB109:
 228               	.LBB110:
 229               	.LBB111:
 231               	.Ltext11:
 233               	.LM17:
 234               	/* #APP */
 235 0052 88E3      		ldi r24,lo8(56) 
 236 0054 90E0      		ldi r25,hi8(56) 
 237 0056 0197      		1: sbiw r24,1 
 238 0058 F1F7      		brne 1b 
 239               		
 240               	/* #NOAPP */
 241               	.LBE111:
 242               	.LBE110:
 244               	.LM18:
 245               	/* #APP */
 246 005a D2CF      		rjmp +0
 247 005c 0000      		 nop
 248               	/* #NOAPP */
 249               	.LBE109:
 250               	.LBE108:
 252               	.Ltext12:
  21:delaytest.c   **** 	_delay_us_2(23);
  22:delaytest.c   **** 	DDRC=18;
 254               	.LM19:
 255 005e 82E1      		ldi r24,lo8(18)
 256 0060 84BB      		out 52-0x20,r24
 257               	.LBB112:
 258               	.LBB113:
 259               	.LBB114:
 260               	.LBB115:
 262               	.Ltext13:
 264               	.LM20:
 265               	/* #APP */
 266 0062 8EE6      		ldi r24,lo8(30574) 
 267 0064 97E7      		ldi r25,hi8(30574) 
 268 0066 0197      		1: sbiw r24,1 
 269 0068 F1F7      		brne 1b 
 270               		
 271               	/* #NOAPP */
 272               	.LBE115:
 273               	.LBE114:
 275               	.LM21:
 276               	/* #APP */
 277 006a 0000      		nop
 278               	/* #NOAPP */
 279               	.LBE113:
 280               	.LBE112:
 282               	.Ltext14:
  23:delaytest.c   **** 	_delay_us_2(12230);
  24:delaytest.c   **** 	DDRC=19;
 284               	.LM22:
 285 006c 83E1      		ldi r24,lo8(19)
 286 006e 84BB      		out 52-0x20,r24
  25:delaytest.c   **** 	_delay_us_2(62230);
  26:delaytest.c   **** 	DDRC=20;
 288               	.LM23:
 289 0070 84E1      		ldi r24,lo8(20)
 290 0072 84BB      		out 52-0x20,r24
 291               	/* epilogue: frame size=0 */
 292 0074 0C94 0000 		jmp exit
 293               	/* epilogue end (size=2) */
 294               	/* function main size 105 (99) */
 296               	.Lscope0:
 298               		.stabd	78,0,0
 300               	.Letext0:
 301               	/* File "delaytest.c": code  105 = 0x0069 (  99), prologues   4, epilogues   2 */
DEFINED SYMBOLS
                            *ABS*:00000000 delaytest.c
     /tmp/ccysCnuE.s:3      *ABS*:0000003f __SREG__
     /tmp/ccysCnuE.s:4      *ABS*:0000003e __SP_H__
     /tmp/ccysCnuE.s:5      *ABS*:0000003d __SP_L__
     /tmp/ccysCnuE.s:6      *ABS*:00000000 __tmp_reg__
     /tmp/ccysCnuE.s:7      *ABS*:00000001 __zero_reg__
     /tmp/ccysCnuE.s:71     .text:00000000 main

UNDEFINED SYMBOLS
__do_copy_data
__do_clear_bss
__stack
exit
#define F_CPU 10e6
#define MCU atmega32
#include <avr/io.h>
//#include <util/delay.h>
#include "delay.h"
/*
 * Test driver for _delay_us_2(): issues a series of constant delays, each
 * followed by a DDRC write with a distinct value, so that the code emitted
 * for every delay can be located in the assembler listing between two
 * "out" instructions.
 *
 * NOTE(review): in the accompanying listing every "rjmp +0" shim is encoded
 * with a different, negative displacement (F0CF, E9CF, D2CF) instead of the
 * zero displacement a fall-through jump would have -- confirm whether
 * "rjmp .+0" was intended in delay.h.
 */
int main() {
	char a=PINB;	/* run-time value; only referenced by the disabled call below */
	DDRC=12;
	_delay_us_2(0.4);	/* sub-microsecond request */
	DDRC=13;
	_delay_us_2(1.3);	/* listing: loop count 2, then "rjmp +0" */
	DDRC=14;
	_delay_us_2(2.2);	/* listing: loop count 4, then "rjmp +0" and "nop" */
	DDRC=15;
	_delay_us_2(3.1);	/* listing: loop count 7, no trailing shim */
	DDRC=16;
	/* Disabled: per the patch description _delay_us_2() only works with a
	   constant argument. */
	//_delay_us_2(a);
	DDRC=4;
	_delay_us_2(22);	/* listing: loop count 54, then "nop" */
	DDRC=17;
	_delay_us_2(23);	/* listing: loop count 56, then "rjmp +0" and "nop" */
	DDRC=18;
	_delay_us_2(12230);	/* listing: loop count 30574, then "nop" */
	DDRC=19;
	/* NOTE(review): the listing shows no instructions between the DDRC=19
	   and DDRC=20 writes, i.e. this request produces no delay at all --
	   presumably it falls into the empty NOTREACHED branch; confirm. */
	_delay_us_2(62230);
	DDRC=20;
	
	
}
_______________________________________________
AVR-libc-dev mailing list
[email protected]
http://lists.nongnu.org/mailman/listinfo/avr-libc-dev

Reply via email to