Am Samstag 09 September 2006 12:01 schrieb Matthias Schwarzott:
> I did it another way: bisecting like Prakash Punnoor suggested.
> Now I found out that cSoftOsd::ScaleDownHoriz_MMX from SoftOsd.c causes the
> problem.
> If compiling that part with -O2 it does not work and with -O0 it works.

Nice, attached is a hackish patch which converts the inline asm to 
intrinsics. Could you test it? It works for me when I play 720p files via 
softplay. Beware that it will only compile using gcc >4. If the devs are OK 
with converting inline asm to intrinsics, I could clean it up a bit and add 
support for gcc <4 as well.

Cheers,
-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V
Index: SoftOsd.c
===================================================================
RCS file: /cvsroot/softdevice/softdevice/SoftOsd.c,v
retrieving revision 1.16
diff -u -r1.16 SoftOsd.c
--- SoftOsd.c	10 Jul 2006 18:23:28 -0000	1.16
+++ SoftOsd.c	9 Sep 2006 11:55:07 -0000
@@ -1372,6 +1372,14 @@
         unsigned int g_sum=0;
         unsigned int r_sum=0;
         unsigned int c;
+#else
+#define USE_INTRINSICS
+		typedef int   v2si __attribute__ ((vector_size (8)));
+		v2si v6;
+		v2si v1;
+		v2si v2;
+		v2si v0;
+		v2si v7;
 #endif
         uint32_t new_pixel_width=(OSD_WIDTH*ScaleFactor)/dest_Width;
         uint32_t new_pixel_width_rec=(dest_Width*ScaleFactor)/OSD_WIDTH;
@@ -1379,12 +1387,19 @@
         int32_t pos=new_pixel_width;
         
 #ifdef USE_MMX2
+#ifndef USE_INTRINSICS
         __asm__ __volatile__ (
                  " pxor %%mm0,%%mm0 \n" //mm0: dest pixel
                  " movd (%0),%%mm6  \n"
                  " pshufw $0,%%mm6,%%mm6 \n"// mm6: new_pixel_width_rec
                  " pxor %%mm7,%%mm7 \n" //mm7: 00 00 00 ...
                  : : "r" (&new_pixel_width_rec)  );
+#else
+		v0 = __builtin_ia32_pxor(v0, v0);
+		v6 = __builtin_ia32_vec_init_v2si(new_pixel_width_rec, 0);
+		v6 = __builtin_ia32_pshufw(v6, 0);
+		v7 = __builtin_ia32_pxor(v7, v7);
+#endif
 #endif
         SCALEDEBH("OSD_WIDTH: %d dest_width: %d new_pixel_width: %d\n",
                         OSD_WIDTH,dest_Width,new_pixel_width);
@@ -1402,12 +1417,19 @@
                         g_sum+=GET_G(c)*ScaleFactor;
                         r_sum+=GET_R(c)*ScaleFactor;
 #else
+#ifndef USE_INTRINSICS
                         __asm__ __volatile__(
                                " movd (%0),%%mm1 \n"
                                " punpcklbw %%mm7, %%mm1 \n"
                                " psllw $"SHIFT_BITS",%%mm1 \n"
                                " paddw %%mm1,%%mm0 \n"
                                : : "r" (pixmap)  );
+#else
+				v1 = __builtin_ia32_vec_init_v2si(*pixmap, 0);
+				v1 = __builtin_ia32_punpcklbw(v1, v7);
+				v1 = __builtin_ia32_psllw(v1, SHIFT_BITS_NUM);
+				v0 = __builtin_ia32_paddw(v0, v1);
+#endif
 #endif
                        
                         pos -=ScaleFactor;
@@ -1423,6 +1445,7 @@
                 g_sum+=GET_G(c)*pos;
                 r_sum+=GET_R(c)*pos;
 #else
+#ifndef USE_INTRINSICS
                 __asm__ __volatile__(
                       " movd (%0),%%mm1 \n"
                       " movd %1,%%mm2 \n"
@@ -1431,6 +1454,14 @@
                       " pmullw %%mm2,%%mm1 \n"
                       " paddw %%mm1,%%mm0 \n"
                       : : "r" (pixmap),"r" (pos)  );
+#else
+				v1 = __builtin_ia32_vec_init_v2si(*pixmap, 0);
+				v2 = __builtin_ia32_vec_init_v2si(pos, 0);
+				v1 = __builtin_ia32_punpcklbw(v1, v7);
+				v2 = __builtin_ia32_pshufw(v2, 0);
+				v1 = __builtin_ia32_pmullw(v1, v2);
+				v0 = __builtin_ia32_paddw(v0, v1);
+#endif
 #endif
                 
                 SCALEDEBH("a_sum: %d new_pixel_width_rec: %d a pixel: %d",
@@ -1452,6 +1483,7 @@
                 dest[3]=a_sum;*/
                 a_sum=b_sum=g_sum=r_sum=0;
 #else
+#ifndef USE_INTRINSICS
                 __asm__ __volatile__ (
                       " psrlw $"SHIFT_BITS",%%mm0 \n"
                       " pmullw %%mm6,%%mm0 \n"
@@ -1460,6 +1492,14 @@
                       " movd %%mm0,(%0) \n"
                       " pxor %%mm0,%%mm0 \n"
                       : : "r"(dest) );
+#else
+                v0 = __builtin_ia32_psrlw(v0, SHIFT_BITS_NUM);
+				v0 = __builtin_ia32_pmullw(v0, v6);
+				v0 = __builtin_ia32_psrlw(v0, SHIFT_BITS_NUM);
+				v0 = __builtin_ia32_packuswb(v0, v0);
+				*dest = __builtin_ia32_vec_ext_v2si(v0, 0);
+				v0 = __builtin_ia32_pxor(v0, v0);
+#endif
 #endif
                 SCALEDEBH(", %d, %d, %d\n",r_sum,g_sum,b_sum);
                 dest++;
@@ -1478,6 +1518,7 @@
                 g_sum=GET_G(c)*apos;
                 r_sum=GET_R(c)*apos;
 #else
+#ifndef USE_INTRINSICS
                 __asm__ __volatile__ (
                       " movd (%0),%%mm1 \n"
                       " movd %1,%%mm2 \n"
@@ -1486,6 +1527,14 @@
                       " pmullw %%mm2,%%mm1 \n"
                       " paddw %%mm1,%%mm0 \n"
                       : : "r" (pixmap),"r" (apos)  );
+#else
+				v1 = __builtin_ia32_vec_init_v2si(*pixmap, 0);
+				v2 = __builtin_ia32_vec_init_v2si(apos, 0);
+				v1 = __builtin_ia32_punpcklbw(v1, v7);
+				v2 = __builtin_ia32_pshufw(v2, 0x0);
+				v1 = __builtin_ia32_pmullw(v1, v2);
+				v0 = __builtin_ia32_paddw(v0, v1);
+#endif
 #endif                
                 pixmap++;
                 pos += new_pixel_width;

Attachment: pgph8FOdn4UYj.pgp
Description: PGP signature

_______________________________________________
Softdevice-devel mailing list
Softdevice-devel@lists.berlios.de
https://lists.berlios.de/mailman/listinfo/softdevice-devel

Reply via email to