I've attached a sample file to this email. The class defined in the cpp
file is a cut-down and modified version of the class used in Dirac.

I compiled it using the following options:
g++ -mmmx -g -O3  test_mmx_diff4.cpp


The run time comparison is attached to this email as well.

Hope this helps.

Regards
A. Suraparaju

On Tue, 2005-05-03 at 16:29 -0700, James E Wilson wrote:
> Anuradha Suraparaju wrote:
> > My question is how do I report this as a bug? What information do I
> > need to provide in the bug report? Did anybody else face similar
> > problems with GCC-4.0.0 and MMX-enabled programs.
> 
> See
>      http://gcc.gnu.org/bugs.html
> for info on reporting bugs.
> 
> If you can narrow this down to a small testcase, then you are more 
> likely to get a solution from us.  If you want us to compile the entire 
> Dirac project and take a look, we probably won't bother.
> 
> There have been changes to the MMX support in gcc, but without specific 
> details about your testcase, it is hard to say anything definite.  For 
> instance, we don't know what the Dirac --enable-mmx option does.  Which 
> specific gcc options does it enable?
> 
> What about SSE?  The SSE support is generally preferred over the older 
> MMX support.  Does Dirac make any use of this?  If not, perhaps it should.
-- 
#include <cstdlib>
#include <iostream>
#ifdef __MMX__
#include <mmintrin.h>
#endif

// Row-pointer view of a 2-D picture: data[row][col] yields one 16-bit sample.
typedef short **PicArray;

// Picture under test and the reference picture it is compared against.
// Both start out null; setup_data() allocates them and cleanup() frees them.
PicArray picdata = 0;
PicArray refdata = 0;


//! Computes the difference between a picture block and a reference block.
/*!
    The object stores the two row-pointer arrays and the block dimensions it
    is given; it does not copy or own the sample data, so the arrays passed
    to the constructor must stay valid for as long as Diff() is called.
 */
class SimpleBlockDiff
{
    public:
    //! Constructor
    /*!
        \param pic_data  row pointers of the picture block
        \param ref_data  row pointers of the reference block
        \param rows      number of rows to compare (stored as yl)
        \param cols      number of samples per row to compare (stored as xl)

        Inside the initialiser list the parameter names hide the members of
        the same name, so each member is initialised from its own argument.
     */
    SimpleBlockDiff( const PicArray &pic_data, const PicArray &ref_data, int rows, int cols)
        : pic_data(pic_data), ref_data(ref_data), xl(cols), yl(rows)
    {}

    //! Do the actual difference without bounds checking
    int Diff();

    private:
        //! Private, bodyless copy-constructor: class should not be copied
        SimpleBlockDiff(const SimpleBlockDiff& cpy);

        //! Private, bodyless assignment=: class should not be assigned
        SimpleBlockDiff& operator=(const SimpleBlockDiff& rhs);

        //! Row pointers of the picture and reference blocks (not owned)
        PicArray pic_data, ref_data;
        //! Block width (samples per row) and height (rows)
        int xl, yl;

};

//! Sum of absolute differences between the picture and reference blocks.
/*!
    Visits yl rows of xl samples each and returns the accumulated
    |pic_data[j][i] - ref_data[j][i]|. No bounds checking is done: both
    arrays must provide at least yl rows of xl samples.

    When compiled with MMX support, four samples are processed per iteration
    and any remaining columns (xl not a multiple of 4) are handled by scalar
    code, so no sample beyond column xl-1 is ever read.

    \return the sum of absolute differences over the whole block
 */
int SimpleBlockDiff::Diff ()
{
#ifdef __MMX__
    // Number of leading columns that fit whole groups of four 16-bit samples
    // (one 64-bit MMX register). Guarded so a non-positive xl yields 0.
    const int vec_cols = (xl > 0) ? xl - (xl % 4) : 0;

    __m64 sum = _mm_set_pi32(0, 0);
    // Contribution of the 0-3 trailing columns per row that do not fill a
    // complete MMX register.
    int tail_sum = 0;

    for (int j=0 ; j < yl ; j++)
    {
        const short *p = &pic_data[j][0];
        const short *r = &ref_data[j][0];

        for (int i=0 ; i < vec_cols ; i+=4, p+=4, r+=4 )
        {
            __m64 pic = *reinterpret_cast<const __m64 *>(p);
            __m64 ref = *reinterpret_cast<const __m64 *>(r);
            // pic - ref (four 16-bit lanes)
            pic = _mm_sub_pi16 (pic, ref);
            // abs (pic - ref): with s = x >> 15 (all ones when x is
            // negative, zero otherwise), abs(x) == (x ^ s) - s
            ref = _mm_srai_pi16(pic, 15);
            pic = _mm_xor_si64(pic, ref);
            pic = _mm_sub_pi16 (pic, ref);
            // sum += abs(pic - ref): widen the four 16-bit lanes to two
            // pairs of 32-bit lanes and add the pairs into the accumulator.
            ref = _mm_xor_si64(ref, ref);           // ref = 0
            ref = _mm_unpackhi_pi16(pic, ref);      // high two lanes, zero-extended
            pic = _mm_unpacklo_pi16(pic, pic);      // low two lanes, duplicated...
            pic = _mm_srai_pi32 (pic, 16);          // ...then shifted down to 32 bits
            pic = _mm_add_pi32 (pic, ref);
            sum = _mm_add_pi32 (sum, pic);
        }

        // Scalar tail: p and r now point at column vec_cols of this row.
        for (int i=vec_cols ; i < xl ; i++, p++, r++ )
        {
            tail_sum += std::abs(*p - *r);
        }
    }

    // Extract both 32-bit lanes with intrinsics while still in MMX state,
    // then clear the MMX state before returning to ordinary code.
    const int lo = _mm_cvtsi64_si32(sum);
    const int hi = _mm_cvtsi64_si32(_mm_srli_si64(sum, 32));
    _mm_empty();

    return lo + hi + tail_sum;
#else
    int sum = 0;

    for (int j=0; j < yl; j++)
    {
        for (int i=0; i < xl; i++)
        {
            sum += std::abs(pic_data[j][i] - ref_data[j][i]);
        }
    }
    return sum;
#endif
}

void setup_data()
{
	short *pic_data = new short [12*12];
	short *ref_data = new short [12*12];

	picdata = new short *[12];
	refdata = new short *[12];
	for (int j = 0; j<12; j++)
	{
		picdata[j] = pic_data + j*12;
		for (int i = 0; i < 12; i++)
			picdata[j][i] = 2;
	}

	for (int j = 0; j<12; j++)
	{
		refdata[j] = ref_data + j*12;
		for (int i = 0; i < 12; i++)
			refdata[j][i] = 1;
	}
}

//! Releases the buffers allocated by setup_data().
/*!
    Safe to call when nothing has been allocated and safe to call more than
    once: each global is checked for null before use and reset to null after
    its storage has been freed.
 */
void cleanup()
{
	if (refdata != 0)
	{
		// Row 0 points at the start of the contiguous sample buffer.
		delete [] refdata[0];
		delete [] refdata;
		refdata = 0;
	}

	if (picdata != 0)
	{
		delete [] picdata[0];
		delete [] picdata;
		picdata = 0;
	}
}

extern int main (int argc, char **argv)
{

	setup_data();

	SimpleBlockDiff diff (picdata, refdata, 12, 12);

	std::cout << diff.Diff () << std::endl;

 	for (int i = 0; i < 4000000 ; i++)
 	{
		diff.Diff ();
 	}

}
Compile line
g++ -mmmx -g -O3  test_mmx_diff4.cpp

Tests conducted using gcc3.4.3 and gcc 4.0.1 20050503 (prerelease)


1. AMD Dual Opteron Processor, Suse 9.2 (32 bit)

Results:

    gcc-3.4.3          gcc-4.0.1 20050503 (prerelease)

    real 1.25          real 2.87
    user 1.24          user 2.87
    sys 0.00           sys 0.00


2. Intel Dual Xeon 3.0 GHz, Suse 9.2 64 bit

Results:

    gcc-3.4.3          gcc-4.0.0

    real 1.09          real 1.58
    user 1.09          user 1.54
    sys 0.00           sys 0.00

3. Pentium 4 2.66GHz, Suse 9.2

Results:

    gcc3.3 20030226    gcc-4.0.0

    real 1.35          real 4.98
    user 1.32          user 4.96
    sys 0.00           sys 0.00


gcc-4.0.0 performed worse than gcc-3.3.3 or gcc3.4.3 even for this simple 
program. The test results using Dirac were similar to this.

Reply via email to