I've attached a sample file to this email. The class defined in the cpp file is a cut-down and modified version of the class used in Dirac.
I compiled it using the following options: g++ -mmmx -g -O3 test_mmx_diff4.cpp The run time comparison is attached to this email as well. Hope this helps. Regards A. Suraparaju On Tue, 2005-05-03 at 16:29 -0700, James E Wilson wrote: > Anuradha Suraparaju wrote: > > My question is how do I report this as a bug? What information do I > > need to provide in the bug report? Did anybody else face similar > > problems with GCC-4.0.0 and MMX-enabled programs. > > See > http://gcc.gnu.org/bugs.html > for info on reporting bugs. > > If you can narrow this down to a small testcase, then you are more > likely to get a solution from us. If you want us compile the entire > Dirac project and take a look, we probably won't bother. > > There have been changes to the MMX support in gcc, but without specific > details about your testcase, it is hard to say anything definite. For > instance, we don't know what the Dirac --enable-mmx option does. Which > specific gcc options does it enable? > > What about SSE? The SSE support is generally preferred over the older > MMX support. Does Dirac make any use of this? If not, perhaps it should. --
#include <cstdlib>
#include <iostream>
#ifdef __MMX__
#include <mmintrin.h>
#endif

//! A picture is addressed through an array of row pointers to 16-bit samples.
typedef short **PicArray;

//! Width and height (in samples) of the square test block.
static const int BLOCK_SIZE = 12;

//! Global test pictures; allocated by setup_data() and released by cleanup().
PicArray refdata;
PicArray picdata;

//! Computes the sum of absolute differences between two blocks of samples.
class SimpleBlockDiff
{
public:
    //! Constructor
    /*!
        Stores the row-pointer arrays and the block dimensions; no sample
        data is copied.
        \param pic_data  row pointers of the current picture block
        \param ref_data  row pointers of the reference picture block
        \param rows      number of rows to compare (yl)
        \param cols      number of samples per row to compare (xl)
    */
    SimpleBlockDiff(const PicArray &pic_data, const PicArray &ref_data,
                    int rows, int cols)
        // Inside the initialiser parentheses the names resolve to the
        // constructor parameters, not to the globals or the members.
        : pic_data(pic_data), ref_data(ref_data), xl(cols), yl(rows)
    {
    }

    //! Do the actual difference without bounds checking
    /*!
        \return the sum over the block of |pic_data[j][i] - ref_data[j][i]|.
        When built with MMX the rows are read four samples (8 bytes) at a
        time, so the column count is effectively rounded up to a multiple
        of four.
    */
    int Diff();

private:
    //! Private, bodyless copy-constructor: class should not be copied
    SimpleBlockDiff(const SimpleBlockDiff &cpy);
    //! Private, bodyless assignment=: class should not be assigned
    SimpleBlockDiff &operator=(const SimpleBlockDiff &rhs);

    PicArray pic_data, ref_data;
    int xl, yl;
};

int SimpleBlockDiff::Diff()
{
#ifdef __MMX__
    // Two 32-bit partial sums packed in one 64-bit MMX value.
    __m64 sum = _mm_set_pi32(0, 0);
    for (int j = 0; j < yl; j++)
    {
        short *p = &pic_data[j][0];
        short *r = &ref_data[j][0];
        for (int i = 0; i < xl; i += 4, p += 4, r += 4)
        {
            __m64 pic = *(__m64 *)p;
            __m64 ref = *(__m64 *)r;
            // pic - ref
            pic = _mm_sub_pi16(pic, ref);
            // abs (pic - ref): ref becomes a per-lane sign mask (0 or -1),
            // and (x ^ mask) - mask negates exactly the negative lanes.
            ref = _mm_srai_pi16(pic, 15);
            pic = _mm_xor_si64(pic, ref);
            pic = _mm_sub_pi16(pic, ref);
            // sum += abs(pic - ref): widen the four 16-bit lanes to 32 bits.
            // The high pair is interleaved with zero; the low pair is
            // duplicated into both halves and shifted down arithmetically.
            ref = _mm_xor_si64(ref, ref);
            ref = _mm_unpackhi_pi16(pic, ref);
            pic = _mm_unpacklo_pi16(pic, pic);
            pic = _mm_srai_pi32(pic, 16);
            pic = _mm_add_pi32(pic, ref);
            sum = _mm_add_pi32(sum, pic);
        }
    }
    // View the accumulator as its two 32-bit halves; they are read only
    // after _mm_empty() has cleared the MMX state.
    int *result = (int *)&sum;
    _mm_empty();
    return result[0] + result[1];
#else
    int sum = 0;
    for (int j = 0; j < yl; j++)
    {
        for (int i = 0; i < xl; i++)
        {
            sum += std::abs(pic_data[j][i] - ref_data[j][i]);
        }
    }
    return sum;
#endif
}

//! Allocate both global pictures and fill them with constant test values.
/*!
    Each picture is one contiguous BLOCK_SIZE x BLOCK_SIZE buffer plus an
    array of row pointers into it. picdata is filled with 2 and refdata
    with 1, so every sample differs by exactly 1.
*/
void setup_data()
{
    short *pic_data = new short[BLOCK_SIZE * BLOCK_SIZE];
    short *ref_data = new short[BLOCK_SIZE * BLOCK_SIZE];
    picdata = new short *[BLOCK_SIZE];
    refdata = new short *[BLOCK_SIZE];

    for (int j = 0; j < BLOCK_SIZE; j++)
    {
        picdata[j] = pic_data + j * BLOCK_SIZE;
        for (int i = 0; i < BLOCK_SIZE; i++)
            picdata[j][i] = 2;
    }

    for (int j = 0; j < BLOCK_SIZE; j++)
    {
        refdata[j] = ref_data + j * BLOCK_SIZE;
        for (int i = 0; i < BLOCK_SIZE; i++)
            refdata[j][i] = 1;
    }
}

//! Release the buffers allocated by setup_data().
void cleanup()
{
    // Row 0 points at the start of each contiguous sample buffer.
    delete[] refdata[0];
    delete[] picdata[0];
    delete[] picdata;
    delete[] refdata;
}

//! Print one block difference, then repeat the computation as a timing loop.
int main()
{
    setup_data();

    SimpleBlockDiff diff(picdata, refdata, BLOCK_SIZE, BLOCK_SIZE);
    std::cout << diff.Diff() << std::endl;

    // Benchmark loop: the result is intentionally discarded.
    for (int i = 0; i < 4000000; i++)
    {
        diff.Diff();
    }

    cleanup();
    return 0;
}
Compile line g++ -mmmx -g -O3 test_mmx_diff4.cpp Tests conducted using gcc3.4.3 and gcc 4.0.1 20050503 (prerelease) 1. AMD Dual Opteron Processor, Suse 9.2 (32 bit) Results: gcc-3.4.3 gcc-4.0.1 20050503 (prerelease) real 1.25 real 2.87 user 1.24 user 2.87 sys 0.00 sys 0.00 2. Intel Dual Xeon 3.0 GHz, Suse 9.2 64 bit Results: gcc-3.4.3 gcc-4.0.0 real 1.09 real 1.58 user 1.09 user 1.54 sys 0.00 sys 0.00 3. Pentium 4 2.66GHz, Suse 9.2 Results: gcc3.3 20030226 gcc-4.0.0 real 1.35 real 4.98 user 1.32 user 4.96 sys 0.00 sys 0.00 gcc-4.0.0 performed worse than gcc-3.3.3 or gcc3.4.3 even for this simple program. The test results using Dirac were similar to this.