Index: libavcodec/x86/lpc_mmx.c
===================================================================
--- libavcodec/x86/lpc_mmx.c	(revision 23359)
+++ libavcodec/x86/lpc_mmx.c	(working copy)
@@ -37,6 +37,7 @@
         "addsd   %%xmm6, %%xmm7                \n\t"
         ::"m"(c)
     );
+    /* FIXME: xmm6/xmm7 computed above must survive the WELCH blocks below, which a clobber list cannot guarantee */
 #define WELCH(MOVPD, offset)\
     __asm__ volatile(\
         "1:                                    \n\t"\
@@ -57,6 +58,7 @@
         "jl 1b                                 \n\t"\
         :"+&r"(i), "+&r"(j)\
         :"r"(w_data+n2), "r"(data+n2)\
+        :CLOBBER_8_XMM\
     );
     if(len&1)
         WELCH("movupd", -1)
@@ -111,7 +113,7 @@
                 "movsd     %%xmm2, 16(%1)           \n\t"
                 :"+&r"(i)
                 :"r"(autoc+j), "r"(data1+len), "r"(data1+len-j)
-                :"memory"
+                :"memory", CLOBBER_6_XMM
             );
         } else {
             __asm__ volatile(
@@ -134,6 +136,7 @@
                 "movsd     %%xmm1, %2               \n\t"
                 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                 :"r"(data1+len), "r"(data1+len-j)
+                :CLOBBER_5_XMM
             );
         }
     }
Index: libavcodec/x86/fdct_mmx.c
===================================================================
--- libavcodec/x86/fdct_mmx.c	(revision 23359)
+++ libavcodec/x86/fdct_mmx.c	(working copy)
@@ -31,6 +31,7 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/x86_cpu.h"
 #include "libavcodec/dsputil.h"
 
 //////////////////////////////////////////////////////////////////////
@@ -285,7 +286,7 @@
 
 #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
 
-#define FDCT_COL(cpu, mm, mov)\
+#define FDCT_COL(cpu, mm, mov, clobber)\
 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
 {\
     __asm__ volatile (\
@@ -366,11 +367,11 @@
         #mov"    %%"#mm"3,   112(%3) \n\t" \
         : \
         : "r" (in  + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
-          "r" (out + offset), "r" (ocos_4_16)); \
+          "r" (out + offset), "r" (ocos_4_16) : clobber); \
 }
 
-FDCT_COL(mmx, mm, movq)
-FDCT_COL(sse2, xmm, movdqa)
+FDCT_COL(mmx, mm, movq, "%0") // the mmx version touches no xmm registers
+FDCT_COL(sse2, xmm, movdqa, CLOBBER_8_XMM)
 
 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
 {
@@ -431,6 +432,7 @@
         FDCT_ROW_SSE2(80)
         :
         : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+        : CLOBBER_8_XMM
     );
 }
 
Index: libavcodec/x86/motion_est_mmx.c
===================================================================
--- libavcodec/x86/motion_est_mmx.c	(revision 23359)
+++ libavcodec/x86/motion_est_mmx.c	(working copy)
@@ -93,27 +93,30 @@
 static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
 {
     int ret;
+    // FIXME: still unsafe: the accumulator xmm3 must survive into the next asm block, which a clobber list cannot guarantee
     __asm__ volatile(
-        "pxor %%xmm6, %%xmm6            \n\t"
+        "pxor %%xmm3, %%xmm3            \n\t"
         ASMALIGN(4)
         "1:                             \n\t"
         "movdqu (%1), %%xmm0            \n\t"
         "movdqu (%1, %3), %%xmm1        \n\t"
         "psadbw (%2), %%xmm0            \n\t"
         "psadbw (%2, %3), %%xmm1        \n\t"
-        "paddw %%xmm0, %%xmm6           \n\t"
-        "paddw %%xmm1, %%xmm6           \n\t"
+        "paddw %%xmm0, %%xmm3           \n\t"
+        "paddw %%xmm1, %%xmm3           \n\t"
         "lea (%1,%3,2), %1              \n\t"
         "lea (%2,%3,2), %2              \n\t"
         "sub $2, %0                     \n\t"
         " jg 1b                         \n\t"
         : "+r" (h), "+r" (blk1), "+r" (blk2)
         : "r" ((x86_reg)stride)
+        : CLOBBER_4_XMM
     );
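+    // the next asm block reads back the SAD total left in xmm3 by the loop above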
     __asm__ volatile(
-        "movhlps %%xmm6, %%xmm0         \n\t"
-        "paddw   %%xmm0, %%xmm6         \n\t"
-        "movd    %%xmm6, %0             \n\t"
+        "movhlps %%xmm3, %%xmm0         \n\t"
+        "paddw   %%xmm0, %%xmm3         \n\t"
+        "movd    %%xmm3, %0             \n\t"
         : "=r"(ret)
     );
     return ret;
Index: libavcodec/x86/dsputil_h264_template_ssse3.c
===================================================================
--- libavcodec/x86/dsputil_h264_template_ssse3.c	(revision 23359)
+++ libavcodec/x86/dsputil_h264_template_ssse3.c	(working copy)
@@ -36,6 +36,7 @@
 
     if(y==0 || x==0)
     {
+        // FIXME: no clobber list added: register contents are reused between asm blocks
         /* 1 dimensional filter only */
         __asm__ volatile(
             "movd %0, %%xmm7 \n\t"
@@ -160,6 +161,7 @@
 
 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
+    // FIXME: no clobber list added: register contents are reused between asm blocks
     __asm__ volatile(
         "movd %0, %%mm7 \n\t"
         "movd %1, %%mm6 \n\t"
Index: libavcodec/x86/idct_sse2_xvid.c
===================================================================
--- libavcodec/x86/idct_sse2_xvid.c	(revision 23359)
+++ libavcodec/x86/idct_sse2_xvid.c	(working copy)
@@ -39,6 +39,7 @@
  */
 
 #include "libavcodec/dsputil.h"
+#include "libavutil/x86_cpu.h"
 #include "idct_xvid.h"
 #include "dsputil_mmx.h"
 
@@ -379,7 +380,7 @@
     "6:                                                          \n\t"
     : "+r"(block)
     :
-    : "%eax", "%ecx", "%edx", "%esi", "memory");
+    : "%eax", "%ecx", "%edx", "%esi", "memory", CLOBBER_16_XMM);
 }
 
 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
Index: libavcodec/x86/vp3dsp_sse2.c
===================================================================
--- libavcodec/x86/vp3dsp_sse2.c	(revision 23359)
+++ libavcodec/x86/vp3dsp_sse2.c	(working copy)
@@ -24,6 +24,7 @@
  */
 
 #include "libavcodec/dsputil.h"
+#include "libavutil/x86_cpu.h"
 #include "dsputil_mmx.h"
 #include "vp3dsp_sse2.h"
 
@@ -171,6 +172,7 @@
         VP3_1D_IDCT_SSE2(ADD8, SHIFT4)
         PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
         :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
+        : CLOBBER_9_XMM
     );
 }
 
Index: libavcodec/x86/snowdsp_mmx.c
===================================================================
--- libavcodec/x86/snowdsp_mmx.c	(revision 23359)
+++ libavcodec/x86/snowdsp_mmx.c	(working copy)
@@ -40,6 +40,7 @@
         // calculate b[0] correctly afterwards.
 
         i = 0;
+        // FIXME: constants set up in xmm registers here are consumed by the asm blocks in the loop below; no safe clobber list is possible
         __asm__ volatile(
             "pcmpeqd   %%xmm7, %%xmm7         \n\t"
             "pcmpeqd   %%xmm3, %%xmm3         \n\t"
@@ -64,7 +65,7 @@
                 "movdqa %%xmm2, (%0)          \n\t"
                 "movdqa %%xmm6, 16(%0)        \n\t"
                 :: "r"(&b[i]), "r"(&ref[i])
-                : "memory"
+                : "memory", CLOBBER_8_XMM
             );
         }
         snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
@@ -93,7 +94,7 @@
                 "movdqa %%xmm0, (%0)          \n\t"
                 "movdqa %%xmm4, 16(%0)        \n\t"
                 :: "r"(&dst[i]), "r"(&b[i])
-                : "memory"
+                : "memory", CLOBBER_7_XMM
             );
         }
         snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
@@ -104,6 +105,7 @@
         IDWTELEM b_0 = b[0];
 
         i = 0;
+        // FIXME: relies on xmm7 surviving from the previous asm block; no safe clobber list is possible
         __asm__ volatile(
             "psllw         $15, %%xmm7        \n\t"
             "pcmpeqw    %%xmm6, %%xmm6        \n\t"
@@ -137,7 +139,7 @@
                 "movdqa %%xmm0, (%0)          \n\t"
                 "movdqa %%xmm4, 16(%0)        \n\t"
                 :: "r"(&b[i]), "r"(&ref[i])
-                : "memory"
+                : "memory", CLOBBER_8_XMM
             );
         }
         snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
@@ -168,7 +170,7 @@
                 "movdqa %%xmm2, (%2)          \n\t"
                 "movdqa %%xmm6, 16(%2)        \n\t"
                 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
-                 : "memory"
+                 : "memory", CLOBBER_7_XMM
                );
         }
         snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
@@ -208,7 +210,7 @@
                 "movdqa    %%xmm5, 80(%0)       \n\t"
                 "movdqa    %%xmm7, 112(%0)      \n\t"
                 :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
-                 : "memory"
+                 : "memory", CLOBBER_8_XMM
                );
         }
     }
@@ -507,7 +509,8 @@
         "sub $64, %%"REG_d"                          \n\t"
         "jge 1b                                      \n\t"
         :"+d"(i)
-        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)
+        :CLOBBER_8_XMM);
 }
 
 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
@@ -670,7 +673,7 @@
              :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
              :\
              "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
-             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"", CLOBBER_8_XMM);
 
 #define snow_inner_add_yblock_sse2_end_8\
              "sal $1, %%"REG_c"              \n\t"\
Index: libavcodec/x86/vp6dsp_sse2.c
===================================================================
--- libavcodec/x86/vp6dsp_sse2.c	(revision 23359)
+++ libavcodec/x86/vp6dsp_sse2.c	(working copy)
@@ -72,7 +72,7 @@
     "jnz 1b                              \n\t"
     : "+r"(src), "+r"(t)
     : "g"((x86_reg)stride), "r"(11), "m"(*(const int64_t*)h_weights)
-    : "memory");
+    : "memory", CLOBBER_8_XMM);
 
     t = tmp + 8;
 
@@ -94,5 +94,5 @@
     "jnz 1b                              \n\t"
     : "+r"(t), "+r"(dst)
     : "g"((x86_reg)stride), "r"(8), "m"(*(const int64_t*)v_weights)
-    : "memory");
+    : "memory", CLOBBER_8_XMM);
 }
Index: libavcodec/x86/dnxhd_mmx.c
===================================================================
--- libavcodec/x86/dnxhd_mmx.c	(revision 23359)
+++ libavcodec/x86/dnxhd_mmx.c	(working copy)
@@ -27,16 +27,17 @@
 static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
 {
     __asm__ volatile(
-        "pxor %%xmm7,      %%xmm7       \n\t"
+        "pxor %%xmm4,      %%xmm4       \n\t"
         "movq (%0),        %%xmm0       \n\t"
         "add  %2,          %0           \n\t"
         "movq (%0),        %%xmm1       \n\t"
         "movq (%0, %2),    %%xmm2       \n\t"
         "movq (%0, %2,2),  %%xmm3       \n\t"
-        "punpcklbw %%xmm7, %%xmm0       \n\t"
-        "punpcklbw %%xmm7, %%xmm1       \n\t"
-        "punpcklbw %%xmm7, %%xmm2       \n\t"
-        "punpcklbw %%xmm7, %%xmm3       \n\t"
+        "punpcklbw %%xmm4, %%xmm0       \n\t"
+        "punpcklbw %%xmm4, %%xmm1       \n\t"
+        "punpcklbw %%xmm4, %%xmm2       \n\t"
+        "punpcklbw %%xmm4, %%xmm3       \n\t"
         "movdqa %%xmm0,      (%1)       \n\t"
         "movdqa %%xmm1,    16(%1)       \n\t"
         "movdqa %%xmm2,    32(%1)       \n\t"
@@ -47,6 +48,7 @@
         "movdqa %%xmm0,   112(%1)       \n\t"
         : "+r" (pixels)
         : "r" (block), "r" ((x86_reg)line_size)
+        : CLOBBER_5_XMM
     );
 }
 
Index: libavcodec/x86/mpegvideo_mmx_template.c
===================================================================
--- libavcodec/x86/mpegvideo_mmx_template.c	(revision 23359)
+++ libavcodec/x86/mpegvideo_mmx_template.c	(working copy)
@@ -180,6 +180,9 @@
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+#if HAVE_SSE2
+            : CLOBBER_8_XMM
+#endif
         );
     }else{ // FMT_H263
         __asm__ volatile(
@@ -212,8 +215,12 @@
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+#if HAVE_SSE2
+            : CLOBBER_8_XMM
+#endif
         );
     }
+    // FIXME: this block also needs a clobber list, but register contents may be shared with the blocks above
     __asm__ volatile(
         "movd %1, "MM"1                     \n\t" // max_qcoeff
         SPREADW(MM"1")
Index: libavcodec/x86/mpegvideo_mmx.c
===================================================================
--- libavcodec/x86/mpegvideo_mmx.c	(revision 23359)
+++ libavcodec/x86/mpegvideo_mmx.c	(working copy)
@@ -580,6 +580,7 @@
             " jb 1b                             \n\t"
         : "+r" (block), "+r" (sum), "+r" (offset)
         : "r"(block+64)
+        : CLOBBER_8_XMM
     );
 }
 
Index: libavcodec/x86/dsputil_mmx.c
===================================================================
--- libavcodec/x86/dsputil_mmx.c	(revision 23359)
+++ libavcodec/x86/dsputil_mmx.c	(working copy)
@@ -460,7 +460,7 @@
          "jnz 1b                        \n\t"
          : "+g"(h), "+r" (pixels),  "+r" (block)
          : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
-         : "memory"
+         : "memory", CLOBBER_4_XMM
         );
 }
 
@@ -486,7 +486,7 @@
          "jnz 1b                        \n\t"
          : "+g"(h), "+r" (pixels),  "+r" (block)
          : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
-         : "memory"
+         : "memory", CLOBBER_4_XMM
         );
 }
 
@@ -524,7 +524,7 @@
         "movaps %%xmm0,  96(%0) \n"
         "movaps %%xmm0, 112(%0) \n"
         :: "r"(block)
-        : "memory"
+        : "memory", CLOBBER_1_XMM
     );
 }
 
@@ -546,7 +546,7 @@
         " js 1b                 \n"
         : : "r" (((uint8_t *)blocks)+128*6),
             "i" (-128*6)
-        : "%"REG_a
+        : "%"REG_a, CLOBBER_1_XMM
     );
 }
 
@@ -723,32 +723,28 @@
     }
 }
 
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
     __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %4, %%mm0                \n\t"
-        "movd  %5, %%mm1                \n\t"
-        "movd  %6, %%mm2                \n\t"
-        "movd  %7, %%mm3                \n\t"
+        "movd  (%0), %%mm0              \n\t"
+        "movd  (%2), %%mm1              \n\t"
+        "movd  (%0, %1, 2), %%mm2       \n\t"
+        "movd  (%2, %1, 2), %%mm3       \n\t"
         "punpcklbw %%mm1, %%mm0         \n\t"
         "punpcklbw %%mm3, %%mm2         \n\t"
         "movq %%mm0, %%mm1              \n\t"
         "punpcklwd %%mm2, %%mm0         \n\t"
         "punpckhwd %%mm2, %%mm1         \n\t"
-        "movd  %%mm0, %0                \n\t"
+        "movd  %%mm0, (%3)              \n\t"
         "punpckhdq %%mm0, %%mm0         \n\t"
-        "movd  %%mm0, %1                \n\t"
-        "movd  %%mm1, %2                \n\t"
+        "movd  %%mm0, (%5)              \n\t"
+        "movd  %%mm1, (%3, %4, 2)       \n\t"
         "punpckhdq %%mm1, %%mm1         \n\t"
-        "movd  %%mm1, %3                \n\t"
+        "movd  %%mm1, (%5, %4, 2)       \n\t"
 
-        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 3*dst_stride))
-        :  "m" (*(uint32_t*)(src + 0*src_stride)),
-           "m" (*(uint32_t*)(src + 1*src_stride)),
-           "m" (*(uint32_t*)(src + 2*src_stride)),
-           "m" (*(uint32_t*)(src + 3*src_stride))
+        :
+        : "r"(src), "r"(src_stride), "r"(src + src_stride),
+          "r"(dst), "r"(dst_stride), "r"(dst + dst_stride)
+        : "memory"
     );
 }
 
@@ -1916,6 +1912,7 @@
 {
     int i;
 
+    // FIXME: unsafe: xmm5 loaded below must survive the asm blocks in the loop that follows
     __asm__ volatile(
             "movaps  %0,     %%xmm5 \n\t"
         ::"m"(ff_pdw_80000000[0])
@@ -1938,7 +1935,7 @@
             "movaps  %%xmm3, %1     \n\t"
             "movaps  %%xmm0, %0     \n\t"
             :"+m"(mag[i]), "+m"(ang[i])
-            ::"memory"
+            ::"memory", CLOBBER_6_XMM
         );
     }
 }
@@ -2004,7 +2001,7 @@
         "jl 1b \n"\
         :"+&r"(i), "=&r"(j), "=&r"(k)\
         :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
-        :"memory"\
+        :"memory", CLOBBER_8_XMM\
     );
 
 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
@@ -2012,6 +2009,7 @@
     int (*matrix_cmp)[2] = (int(*)[2])matrix;
     intptr_t i,j,k;
 
+    // FIXME: the MIX* asm blocks below still lack complete clobber lists
     i = -len*sizeof(float);
     if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
         MIX5(IF0,IF1);
@@ -2074,7 +2072,7 @@
         "jge 1b \n\t"
         :"+r"(i)
         :"r"(dst), "r"(src)
-        :"memory"
+        :"memory", CLOBBER_2_XMM
     );
 }
 
@@ -2113,6 +2111,7 @@
         "jge    1b \n\t"
         :"+r"(i), "+r"(src1)
         :"r"(dst), "r"(src0)
+        :CLOBBER_2_XMM
     );
 }
 
@@ -2154,7 +2153,7 @@
         "jge  1b \n\t"
         :"+r"(i)
         :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
-        :"memory"
+        :"memory", CLOBBER_2_XMM
     );
 }
 
@@ -2223,6 +2222,7 @@
             "jl 1b \n"
             :"+r"(i), "+r"(j)
             :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+            :CLOBBER_6_XMM
         );
     }else
 #endif
@@ -2250,6 +2250,7 @@
         "jl 1b \n"
         :"+r"(i)
         :"r"(dst+len), "r"(src+len), "m"(mul)
+        :CLOBBER_5_XMM
     );
 }
 
@@ -2270,6 +2271,7 @@
         "jl 1b \n"
         :"+r"(i)
         :"r"(dst+len), "r"(src+len), "m"(mul)
+        :CLOBBER_5_XMM
     );
 }
 
@@ -2303,7 +2305,7 @@
         "jge 1b \n\t"
         :"+&r"(i)
         :"r"(dst), "r"(src), "m"(min), "m"(max)
-        :"memory"
+        :"memory", CLOBBER_6_XMM
     );
 }
 
@@ -2368,6 +2370,7 @@
         "add        $16         , %0        \n\t"
         " js 1b                             \n\t"
         :"+r"(reglen), "+r"(dst), "+r"(src)
+        ::CLOBBER_2_XMM
     );
 }
 
@@ -2402,7 +2405,7 @@
 #endif
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body, clobber) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
     DECLARE_ALIGNED(16, int16_t, tmp)[len];\
@@ -2429,6 +2432,7 @@
             "neg %0 \n"\
             body\
             :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
+            ::clobber\
         );\
     }else if(channels==6){\
         ff_float_to_int16_interleave6_##cpu(dst, src, len);\
@@ -2452,6 +2456,7 @@
     "add $16, %0                \n"
     "js 1b                      \n"
     "femms                      \n"
+    , "%0"
 )
 
 FLOAT_TO_INT16_INTERLEAVE(sse,
@@ -2470,6 +2475,7 @@
     "add $16, %0                \n"
     "js 1b                      \n"
     "emms                       \n"
+    , "%0"
 )
 
 FLOAT_TO_INT16_INTERLEAVE(sse2,
@@ -2482,6 +2488,7 @@
     "movdqa     %%xmm0, (%1,%0) \n"
     "add $16, %0                \n"
     "js 1b                      \n"
+    , CLOBBER_2_XMM
 )
 
 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
Index: libavcodec/x86/h264dsp_mmx.c
===================================================================
--- libavcodec/x86/h264dsp_mmx.c	(revision 23359)
+++ libavcodec/x86/h264dsp_mmx.c	(working copy)
@@ -298,6 +298,7 @@
         STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
         :"+r"(dst)
         :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
+        :CLOBBER_9_XMM
     );
 }
 
@@ -1572,7 +1573,7 @@
         : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
           "m"(ff_pw_5), "m"(ff_pw_16)\
-        : "memory"\
+        : "memory", CLOBBER_16_XMM\
     );\
 }
 #else // ARCH_X86_64
@@ -1590,6 +1591,7 @@
 
 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    /* FIXME: xmm7 set by the first asm block must survive the asm block inside the loop below */\
     int h=8;\
     __asm__ volatile(\
         "pxor %%xmm7, %%xmm7        \n\t"\
@@ -1630,7 +1632,7 @@
         : "+a"(src), "+c"(dst), "+d"(src2)\
         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
           "m"(ff_pw_16)\
-        : "memory"\
+        : "memory", CLOBBER_8_XMM\
     );\
     }while(--h);\
 }\
@@ -1672,7 +1674,7 @@
         " jnz 1b                    \n\t"\
         : "+a"(src), "+c"(dst), "+g"(h)\
         : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
-        : "memory"\
+        : "memory", CLOBBER_8_XMM\
     );\
 }\
 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -1687,6 +1689,7 @@
 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
     src -= 2*srcStride;\
+    /* FIXME: register contents must survive from the first asm block into the h==16 block below */\
     \
     __asm__ volatile(\
         "pxor %%xmm7, %%xmm7        \n\t"\
@@ -1716,7 +1719,7 @@
          \
         : "+a"(src), "+c"(dst)\
         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
-        : "memory"\
+        : "memory", CLOBBER_8_XMM\
     );\
     if(h==16){\
         __asm__ volatile(\
@@ -1746,6 +1749,7 @@
 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
     int w = (size+8)>>3;
     src -= 2*srcStride+2;
+    /* FIXME: register contents must survive from the first asm block into the size==16 block below */
     while(w--){
         __asm__ volatile(
             "pxor %%xmm7, %%xmm7        \n\t"
@@ -1774,7 +1778,7 @@
             QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
             : "+a"(src)
             : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
-            : "memory"
+            : "memory", CLOBBER_8_XMM
         );
         if(size==16){
             __asm__ volatile(
@@ -1852,7 +1856,7 @@
             " jnz 1b                    \n\t"\
             : "+a"(tmp), "+c"(dst), "+g"(h)\
             : "S"((x86_reg)dstStride)\
-            : "memory"\
+            : "memory", CLOBBER_8_XMM\
         );\
     }else{\
         __asm__ volatile(\
@@ -1886,7 +1890,7 @@
             " jnz 1b                    \n\t"\
             : "+a"(tmp), "+c"(dst), "+g"(h)\
             : "S"((x86_reg)dstStride)\
-            : "memory"\
+            : "memory", CLOBBER_6_XMM\
         );\
     }\
 }
Index: libavcodec/x86/fft_sse.c
===================================================================
--- libavcodec/x86/fft_sse.c	(revision 23359)
+++ libavcodec/x86/fft_sse.c	(working copy)
@@ -49,7 +49,7 @@
             "jl 1b \n"
             :"+r"(i)
             :"r"(z+n)
-            :"memory"
+            :"memory", CLOBBER_2_XMM
         );
     }
 }
@@ -66,6 +66,7 @@
             :"=m"(s->tmp_buf[s->revtab[i]]),
              "=m"(s->tmp_buf[s->revtab[i+1]])
             :"m"(z[i])
+            : CLOBBER_1_XMM
         );
     }
     memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
@@ -85,6 +86,7 @@
 
     /* pre rotation */
     for(k=n8-2; k>=0; k-=2) {
+        // FIXME: the rotated values are passed to later asm blocks in xmm registers, so no safe clobber list is possible
         __asm__ volatile(
             "movaps     (%2,%1,2), %%xmm0 \n" // { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  }
             "movaps  -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im }
@@ -108,6 +110,7 @@
             "unpckhps      %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] }
             ::"r"(-4*k), "r"(4*k),
               "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
+            : CLOBBER_6_XMM
         );
 #if ARCH_X86_64
         // if we have enough regs, don't let gcc make the luts latency-bound
@@ -168,7 +171,7 @@
         "jl 1b \n"
         :"+&r"(j), "+&r"(k)
         :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
-        :"memory"
+        :"memory", CLOBBER_8_XMM
     );
 }
 
@@ -183,13 +186,13 @@
     j = -n;
     k = n-16;
     __asm__ volatile(
-        "movaps %4, %%xmm7 \n"
+        "movaps %4, %%xmm2 \n"
         "1: \n"
         "movaps       (%2,%1), %%xmm0 \n"
         "movaps       (%3,%0), %%xmm1 \n"
         "shufps $0x1b, %%xmm0, %%xmm0 \n"
         "shufps $0x1b, %%xmm1, %%xmm1 \n"
-        "xorps         %%xmm7, %%xmm0 \n"
+        "xorps         %%xmm2, %%xmm0 \n"
         "movaps        %%xmm1, (%3,%1) \n"
         "movaps        %%xmm0, (%2,%0) \n"
         "sub $16, %1 \n"
@@ -198,6 +201,7 @@
         :"+r"(j), "+r"(k)
         :"r"(output+n4), "r"(output+n4*3),
          "m"(*m1m1m1m1)
+        :CLOBBER_3_XMM
     );
 }
 
Index: libavcodec/x86/dsputilenc_mmx.c
===================================================================
--- libavcodec/x86/dsputilenc_mmx.c	(revision 23359)
+++ libavcodec/x86/dsputilenc_mmx.c	(working copy)
@@ -60,16 +60,16 @@
 static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
 {
     __asm__ volatile(
-        "pxor %%xmm7,      %%xmm7         \n\t"
+        "pxor %%xmm4,      %%xmm4         \n\t"
         "movq (%0),        %%xmm0         \n\t"
         "movq (%0, %2),    %%xmm1         \n\t"
         "movq (%0, %2,2),  %%xmm2         \n\t"
         "movq (%0, %3),    %%xmm3         \n\t"
         "lea (%0,%2,4), %0                \n\t"
-        "punpcklbw %%xmm7, %%xmm0         \n\t"
-        "punpcklbw %%xmm7, %%xmm1         \n\t"
-        "punpcklbw %%xmm7, %%xmm2         \n\t"
-        "punpcklbw %%xmm7, %%xmm3         \n\t"
+        "punpcklbw %%xmm4, %%xmm0         \n\t"
+        "punpcklbw %%xmm4, %%xmm1         \n\t"
+        "punpcklbw %%xmm4, %%xmm2         \n\t"
+        "punpcklbw %%xmm4, %%xmm3         \n\t"
         "movdqa %%xmm0,      (%1)         \n\t"
         "movdqa %%xmm1,    16(%1)         \n\t"
         "movdqa %%xmm2,    32(%1)         \n\t"
@@ -78,16 +78,17 @@
         "movq (%0, %2),    %%xmm1         \n\t"
         "movq (%0, %2,2),  %%xmm2         \n\t"
         "movq (%0, %3),    %%xmm3         \n\t"
-        "punpcklbw %%xmm7, %%xmm0         \n\t"
-        "punpcklbw %%xmm7, %%xmm1         \n\t"
-        "punpcklbw %%xmm7, %%xmm2         \n\t"
-        "punpcklbw %%xmm7, %%xmm3         \n\t"
+        "punpcklbw %%xmm4, %%xmm0         \n\t"
+        "punpcklbw %%xmm4, %%xmm1         \n\t"
+        "punpcklbw %%xmm4, %%xmm2         \n\t"
+        "punpcklbw %%xmm4, %%xmm3         \n\t"
         "movdqa %%xmm0,    64(%1)         \n\t"
         "movdqa %%xmm1,    80(%1)         \n\t"
         "movdqa %%xmm2,    96(%1)         \n\t"
         "movdqa %%xmm3,   112(%1)         \n\t"
         : "+r" (pixels)
         : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
+        : CLOBBER_5_XMM
     );
 }
 
@@ -380,7 +381,8 @@
       "paddd %%xmm1,%%xmm7\n"
       "movd %%xmm7,%3\n"
       : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
-      : "r" ((x86_reg)line_size));
+      : "r" ((x86_reg)line_size)
+      : CLOBBER_8_XMM);
     return tmp;
 }
 
@@ -945,7 +947,7 @@
     "punpcklbw "#a", "#a"             \n\t"\
     "psubw     "#t", "#a"             \n\t"\
 
-#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
+#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp,clobber) {\
     uint8_t *p1b=p1, *p2b=p2;\
     __asm__ volatile(\
         DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
@@ -962,12 +964,13 @@
         "mov"#m1" %0, "#mm"0          \n\t"\
         : "+m"(temp), "+r"(p1b), "+r"(p2b)\
         : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
+        : clobber\
     );\
 }
     //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
 
-#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
-#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
+#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp, "%0")
+#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp, CLOBBER_8_XMM)
 
 #define LBUTTERFLY2(a1,b1,a2,b2)\
     "paddw " #b1 ", " #a1 "           \n\t"\
@@ -1161,6 +1164,7 @@
         HSUM_SSE2(%%xmm0, %%xmm1, %0)\
         : "=r" (sum)\
         : "r"(temp)\
+        : CLOBBER_8_XMM\
     );\
     return sum&0xFFFF;\
 }\
@@ -1224,11 +1228,13 @@
         DCT_SAD\
         :"=r"(sum)\
         :"r"(block)\
+        : DCT_SAD_CLOBBER\
     );\
     return sum&0xFFFF;\
 }
 
 #define DCT_SAD       DCT_SAD_MMX
+#define DCT_SAD_CLOBBER "%0" /* the mmx variants touch no xmm registers */
 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
 #define MMABS(a,z)    MMABS_MMX(a,z)
 DCT_SAD_FUNC(mmx)
@@ -1239,9 +1245,11 @@
 #define MMABS(a,z)    MMABS_MMX2(a,z)
 DCT_SAD_FUNC(mmx2)
 #undef HSUM
+#undef DCT_SAD_CLOBBER
 #undef DCT_SAD
 
 #define DCT_SAD       DCT_SAD_SSE2
+#define DCT_SAD_CLOBBER CLOBBER_8_XMM
 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
 DCT_SAD_FUNC(sse2)
 #undef MMABS
@@ -1252,6 +1260,7 @@
 #undef MMABS
 #endif
 #undef HSUM
+#undef DCT_SAD_CLOBBER
 #undef DCT_SAD
 
 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
Index: libavutil/x86_cpu.h
===================================================================
--- libavutil/x86_cpu.h	(revision 23359)
+++ libavutil/x86_cpu.h	(working copy)
@@ -73,4 +73,44 @@
 #    define BROKEN_RELOCATIONS 1
 #endif
 
+#define CLOBBER_1_XMM "%xmm0"
+#define CLOBBER_2_XMM CLOBBER_1_XMM, "%xmm1"
+#define CLOBBER_3_XMM CLOBBER_2_XMM, "%xmm2"
+#define CLOBBER_4_XMM CLOBBER_3_XMM, "%xmm3"
+#define CLOBBER_5_XMM CLOBBER_4_XMM, "%xmm4"
+#define CLOBBER_6_XMM CLOBBER_5_XMM, "%xmm5"
+#define CLOBBER_7_XMM CLOBBER_6_XMM, "%xmm6"
+#define CLOBBER_8_XMM CLOBBER_7_XMM, "%xmm7"
+#if ARCH_X86_64
+#define CLOBBER_9_XMM  CLOBBER_8_XMM, "%xmm8"
+#define CLOBBER_10_XMM CLOBBER_9_XMM, "%xmm9"
+#define CLOBBER_11_XMM CLOBBER_10_XMM, "%xmm10"
+#define CLOBBER_12_XMM CLOBBER_11_XMM, "%xmm11"
+#define CLOBBER_13_XMM CLOBBER_12_XMM, "%xmm12"
+#define CLOBBER_14_XMM CLOBBER_13_XMM, "%xmm13"
+#define CLOBBER_15_XMM CLOBBER_14_XMM, "%xmm14"
+#define CLOBBER_16_XMM CLOBBER_15_XMM, "%xmm15"
+#else
+#define CLOBBER_9_XMM  CLOBBER_8_XMM
+#define CLOBBER_10_XMM CLOBBER_8_XMM
+#define CLOBBER_11_XMM CLOBBER_8_XMM
+#define CLOBBER_12_XMM CLOBBER_8_XMM
+#define CLOBBER_13_XMM CLOBBER_8_XMM
+#define CLOBBER_14_XMM CLOBBER_8_XMM
+#define CLOBBER_15_XMM CLOBBER_8_XMM
+#define CLOBBER_16_XMM CLOBBER_8_XMM
+#endif
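+
+/* Usage sketch (illustrative only): an asm block that overwrites the
+ * first two xmm registers would declare its clobbers as
+ *
+ *     __asm__ volatile(
+ *         "pxor   %%xmm0, %%xmm0 \n\t"
+ *         "movdqa %%xmm0, %%xmm1 \n\t"
+ *         ::: CLOBBER_2_XMM      // expands to "%xmm0", "%xmm1"
+ *     );
+ *
+ * On x86_32, CLOBBER_9_XMM through CLOBBER_16_XMM fall back to
+ * CLOBBER_8_XMM, since xmm8-xmm15 only exist in 64-bit mode. */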
+
 #endif /* AVUTIL_X86_CPU_H */
