Index: delay.h.in
===================================================================
--- delay.h.in	(revision 2188)
+++ delay.h.in	(working copy)
@@ -109,16 +109,50 @@
    mode _delay_ms() will work with a resolution of 1/10 ms, providing
    delays up to 6.5535 seconds (independent from CPU frequency).  The
    user will not be informed about decreased resolution.
+
+   If the avr-gcc toolchain has __builtin_avr_delay_cycles(unsigned long)
+   support, the maximal possible delay is 4294967.295 ms / F_CPU in MHz. For
+   values greater than the maximal possible delay, the overflow results in
+   no delay, i.e., 0 ms.
+
+   Conversion of __ms into clock cycles may not always result in an integer.
+   By default, the clock cycles are rounded up to the next integer. This ensures
+   that the user gets at least __ms milliseconds of delay.
+
+   Alternatively, the user can define __DELAY_ROUND_DOWN__ or
+   __DELAY_ROUND_CLOSEST__ to round down or to the closest integer, respectively.
+
+   Note: The new implementation of _delay_ms(double __ms) with
+   __builtin_avr_delay_cycles(unsigned long) support is not backward compatible.
+   The user can define __DELAY_BACKWARD_COMPATIBLE__ to get a backward-compatible
+   delay, although this will be deprecated in the future.
+
  */
 void
 _delay_ms(double __ms)
 {
 	uint16_t __ticks;
-	double __tmp = ((F_CPU) / 4e3) * __ms;
+	double __tmp ; 
 #if __HAS_DELAY_CYCLES && defined(__OPTIMIZE__)
+	uint32_t __ticks_dc;
 	extern void __builtin_avr_delay_cycles(unsigned long);
-	__builtin_avr_delay_cycles(__tmp);
-#else
+	__tmp = ((F_CPU) / 1e3) * __ms;
+
+	#if defined(__DELAY_ROUND_DOWN__)
+		__ticks_dc = (uint32_t)fabs(__tmp);
+
+	#elif defined(__DELAY_ROUND_CLOSEST__)
+		__ticks_dc = (uint32_t)(fabs(__tmp)+0.5);
+
+	#else
+		//round up by default
+		__ticks_dc = (uint32_t)(ceil(fabs(__tmp)));
+	#endif
+
+	__builtin_avr_delay_cycles(__ticks_dc);
+
+#elif !__HAS_DELAY_CYCLES || defined (__DELAY_BACKWARD_COMPATIBLE__)
+	__tmp = ((F_CPU) / 4e3) * __ms;
 	if (__tmp < 1.0)
 		__ticks = 1;
 	else if (__tmp > 65535)
@@ -152,16 +186,50 @@
    If the user requests a delay greater than the maximal possible one,
    _delay_us() will automatically call _delay_ms() instead.  The user
    will not be informed about this case.
+
+   If the avr-gcc toolchain has __builtin_avr_delay_cycles(unsigned long)
+   support, the maximal possible delay is 4294967295 us / F_CPU in MHz. For
+   values greater than the maximal possible delay, the overflow results in
+   no delay, i.e., 0 us.
+  
+   Conversion of __us into clock cycles may not always result in an integer.
+   By default, the clock cycles are rounded up to the next integer. This ensures
+   that the user gets at least __us microseconds of delay.
+
+   Alternatively, the user can define __DELAY_ROUND_DOWN__ or
+   __DELAY_ROUND_CLOSEST__ to round down or to the closest integer, respectively.
+ 
+   Note: The new implementation of _delay_us(double __us) with
+   __builtin_avr_delay_cycles(unsigned long) support is not backward compatible.
+   The user can define __DELAY_BACKWARD_COMPATIBLE__ to get a backward-compatible
+   delay, although this will be deprecated in the future.
+
  */
 void
 _delay_us(double __us)
 {
 	uint8_t __ticks;
-	double __tmp = ((F_CPU) / 3e6) * __us;
+	double __tmp ; 
 #if __HAS_DELAY_CYCLES && defined(__OPTIMIZE__)
+	uint32_t __ticks_dc;
 	extern void __builtin_avr_delay_cycles(unsigned long);
-	__builtin_avr_delay_cycles(__tmp);
-#else
+	__tmp = ((F_CPU) / 1e6) * __us;
+
+	#if defined(__DELAY_ROUND_DOWN__)
+		__ticks_dc = (uint32_t)fabs(__tmp);
+
+	#elif defined(__DELAY_ROUND_CLOSEST__)
+		__ticks_dc = (uint32_t)(fabs(__tmp)+0.5);
+
+	#else
+		//round up by default
+		__ticks_dc = (uint32_t)(ceil(fabs(__tmp)));
+	#endif
+
+	__builtin_avr_delay_cycles(__ticks_dc);
+
+#elif !__HAS_DELAY_CYCLES || defined (__DELAY_BACKWARD_COMPATIBLE__)
+	__tmp = ((F_CPU) / 3e6) * __us;
 	if (__tmp < 1.0)
 		__ticks = 1;
 	else if (__tmp > 255)
