https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122280
Benjamin Schulz <schulz.benjamin at googlemail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
Attachment #62555|0 |1
is obsolete| |
--- Comment #5 from Benjamin Schulz <schulz.benjamin at googlemail dot com> ---
Created attachment 62678
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=62678&action=edit
Archiv.tar.gz
I made some updates to add more tests. I added the entire source to
Archiv.tar.gz.
First, there is the header in_kernel_mathfunctions.h. It contains blas
functions that execute single-threaded (abbreviation _s), just with openmp
simd (abbreviation _v), and with openmp parallel for simd (abbreviation _w).
These functions are meant to be executed on the host or within a target
region, i.e. if you have hundreds of matrix multiplications in parallel...
For example, one has a single-threaded function matrix_multiply_dot_s:
#pragma omp begin declare target
template <typename T>
void In_Kernel_Mathfunctions<T>::matrix_multiply_dot_s(const DataBlock<T>& A,
                                                       const DataBlock<T>& B,
                                                       DataBlock<T>& C)
{
    const size_t rows = A.dpextents[0];
    const size_t cols = B.dpextents[1];
    const size_t inner_dim = A.dpextents[1];
    const size_t Astr0 = A.dpstrides[0];
    const size_t Astr1 = A.dpstrides[1];
    const size_t Bstr0 = B.dpstrides[0];
    const size_t Bstr1 = B.dpstrides[1];
    const size_t Cstr0 = C.dpstrides[0];
    const size_t Cstr1 = C.dpstrides[1];
    for (size_t i = 0; i < rows; ++i)
    {
        for (size_t j = 0; j < cols; ++j)
        {
            T sum = 0;
            for (size_t k = 0; k < inner_dim; ++k)
            {
                sum += A.dpdata[i*Astr0 + k*Astr1] * B.dpdata[k*Bstr0 + j*Bstr1];
            }
            C.dpdata[i*Cstr0 + j*Cstr1] = sum;
        }
    }
}
#pragma omp end declare target
and a parallel matrix multiplication matrix_multiply_dot_w:
#pragma omp begin declare target
template <typename T>
void In_Kernel_Mathfunctions<T>::matrix_multiply_dot_w(const DataBlock<T>& A,
                                                       const DataBlock<T>& B,
                                                       DataBlock<T>& C)
{
    const size_t rows = A.dpextents[0];
    const size_t cols = B.dpextents[1];
    const size_t inner_dim = A.dpextents[1];
    const size_t Astr0 = A.dpstrides[0];
    const size_t Astr1 = A.dpstrides[1];
    const size_t Bstr0 = B.dpstrides[0];
    const size_t Bstr1 = B.dpstrides[1];
    const size_t Cstr0 = C.dpstrides[0];
    const size_t Cstr1 = C.dpstrides[1];
    #pragma omp parallel for collapse(2) shared(A,B,C)
    for (size_t i = 0; i < rows; ++i)
    {
        for (size_t j = 0; j < cols; ++j)
        {
            T sum = 0;
            #pragma omp simd reduction(+:sum)
            for (size_t k = 0; k < inner_dim; ++k)
            {
                sum += A.dpdata[i*Astr0 + k*Astr1] * B.dpdata[k*Bstr0 + j*Bstr1];
            }
            C.dpdata[i*Cstr0 + j*Cstr1] = sum;
        }
    }
}
#pragma omp end declare target
On my nvptx system, the host version works with the collapse(2) clause
without problems.
Now there is the function matrix_multiply_dot_g in the header
gpu_mathfunctions.h. Its code is not surprising:
template <typename T>
void GPU_Math_Functions<T>::matrix_multiply_dot_g(const DataBlock<T>& A,
                                                  const DataBlock<T>& B,
                                                  DataBlock<T>& C,
                                                  int dev, bool update_host)
{
    const size_t rows = A.dpextents[0];
    const size_t cols = B.dpextents[1];
    const size_t inner_dim = A.dpextents[1];
    // These helpers check isdevptr to see whether the data was allocated with
    // malloc; they only offload if that is not the case.
    typename DataBlock_GPU_Memory_Functions<T>::OffloadHelperConst offloadA(A, dev, false);
    typename DataBlock_GPU_Memory_Functions<T>::OffloadHelperConst offloadB(B, dev, false);
    typename DataBlock_GPU_Memory_Functions<T>::OffloadHelper offloadC(C, dev, true, update_host);
    const size_t Astr0 = A.dpstrides[0];
    const size_t Astr1 = A.dpstrides[1];
    const size_t Bstr0 = B.dpstrides[0];
    const size_t Bstr1 = B.dpstrides[1];
    const size_t Cstr0 = C.dpstrides[0];
    const size_t Cstr1 = C.dpstrides[1];
    #pragma omp target teams distribute parallel for collapse(2) shared(A,B,C) device(dev)
    for (size_t i = 0; i < rows; ++i)
        for (size_t j = 0; j < cols; ++j)
        {
            T sum = T(0);
            #pragma omp simd reduction(+:sum)
            for (size_t k = 0; k < inner_dim; ++k)
            {
                sum += A.dpdata[i*Astr0 + k*Astr1] * B.dpdata[k*Bstr0 + j*Bstr1];
            }
            C.dpdata[i*Cstr0 + j*Cstr1] = sum;
        }
}
Then, in mathdemonstrations.cpp, two DataBlocks are filled with strides,
extents, and data. The matrix multiplication is then tested single-threaded
on the host, multithreaded on the host, multithreaded on the gpu, and
finally with a Strassen algorithm on the gpu:
cout<<"We define two matrices"<<endl;
vector<double>A_data(12*12,0);
vector<double>B_data(12*12,0);
size_t rowsA = 12, colsA = 12;
A_data=
{
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11,
11, 9, 7, 5, 3, 1, 12, 10, 8, 6, 4, 2,
3, 6, 9, 12, 2, 5, 8, 11, 1, 4, 7, 10,
10, 7, 4, 1, 11, 8, 5, 2, 12, 9, 6, 3,
4, 8, 12, 3, 7, 11, 2, 6, 10, 1, 5, 9,
9, 5, 1, 7, 3, 11, 8, 4, 12, 6, 2, 10,
5, 10, 3, 8, 1, 6, 11, 4, 9, 2, 7, 12,
12, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5,
6, 1, 8, 3, 10, 5, 12, 7, 2, 9, 4, 11,
11, 2, 9, 4, 12, 7, 3, 10, 5, 1, 8, 6
};
B_data= {12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
3, 6, 9, 12, 2, 5, 8, 11, 1, 4, 7, 10,
10, 7, 4, 1, 11, 8, 5, 2, 12, 9, 6, 3,
5, 10, 3, 8, 1, 6, 11, 4, 9, 2, 7, 12,
12, 9, 6, 3, 10, 7, 4, 1, 8, 5, 2, 11,
2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11,
11, 8, 5, 2, 9, 6, 3, 12, 7, 4, 1, 10,
3, 6, 9, 12, 2, 5, 8, 11, 1, 4, 7, 10,
10, 7, 4, 1, 11, 8, 5, 2, 12, 9, 6, 3,
4, 8, 12, 3, 7, 11, 2, 6, 10, 1, 5, 9,
9, 5, 1, 7, 3, 11, 8, 4, 12, 6, 2, 10
};
cout<<"the same code base can have the strides and extents on heap(vector) or on the stack(array). "<<endl;
cout<<"The library works as well with col major data but in this example, we define row-major data"<<endl;
mdspan<double, std::vector<size_t>> A(A_data.data(), {rowsA, colsA});
mdspan<double, std::array<size_t,2>> B(B_data.data(), {rowsA, colsA});
cout<<"Ordinary matrix multiplication, forced on gpu with a policy object"<<std::endl;
A.printtensor();
B.printtensor();
cout<<"the header In_Kernel_mathfunctions executes math functions either on the host or can run them in parallel. Abbreviations: v just with simd, s without parallel loops"<<endl;
mdspan_data<double, std::vector<size_t>> C0({rowsA, colsA});
In_Kernel_Mathfunctions<double>::matrix_multiply_dot_s(A, B, C0);
cout<<"per default update_host is set to true. If one has several calculations on gpu, this may not be desired and can be switched to false"<<endl;
C0.printtensor();
cout<<"the header In_Kernel_mathfunctions executes math functions either on the host or can run them in parallel. Abbreviation w means with parallel for"<<endl;
mdspan_data<double, std::vector<size_t>> C1({rowsA, colsA});
In_Kernel_Mathfunctions<double>::matrix_multiply_dot_w(A, B, C1);
cout<<"per default update_host is set to true. If one has several calculations on gpu, this may not be desired and can be switched to false"<<endl;
C1.printtensor();
mdspan_data<double, std::vector<size_t>> C2({rowsA, colsA});
cout<<"CPU_ONLY lets it multiply on CPU. GPU_ONLY executes on gpu. AUTO lets the library decide based on whether the data is already on gpu, the algorithm, and the data size."<<endl;
Math_Functions_Policy p1(Math_Functions_Policy::GPU_ONLY);
cout<<"supplying nullptr instead of a pointer to Math_Functions_Policy lets the library use a global default that can be configured."<<endl;
Math_Functions<double>::matrix_multiply_dot(A, B, C2,&p1);
cout<<"per default update_host is set to true. If one has several calculations on gpu, this may not be desired and can be switched to false"<<endl;
C2.printtensor();
cout<<"We can also use the Strassen algorithm or its Winograd variant for the multiplication."<<std::endl;
cout<<"It may offload on gpu. With the Message Passing Interface enabled, it can do so in parallel. "<<std::endl;
cout<<"otherwise it offloads sequentially. The algorithm can also work entirely on device with devicepointers to the data"<<std::endl;
cout<<"in auto mode, the following default thresholds are set in mathfunctions.h and can be changed for convenience"<<std::endl;
cout<<"max_problem_size_for_gpu;"<<"This is the size of the gpu memory, data larger than this is not offloaded"<<std::endl;
cout<<"default_cubic_treshold = 256;"<<"The default number of elements at which matrices are auto offloaded in multiplication"<<std::endl;
cout<<"default_square_treshold = 1000;"<<"The default number of elements at which matrices are auto offloaded for addition"<<std::endl;
cout<<"default_linear_treshold = 1000000;"<<"The default number of elements at which vectors are auto offloaded for addition"<<std::endl;
cout <<std::endl;
mdspan_data<double, std::vector<size_t>> C3({rowsA, colsA});
cout<<"we now set it on gpu and set the size when to stop recursion to 2, per default, this is at 64"<<endl;
Math_MPI_RecursiveMultiplication_Policy p(Math_Functions_Policy::GPU_ONLY,false,false);
p.size_to_stop_recursion=2;
Math_Functions_MPI<double>::strassen_multiply(A, B, C3,&p);
C3.printtensor();
Afterwards, mathdemonstrations.cpp tests Cholesky, LU, and QR
decompositions, with simple and advanced algorithms, on gpu and cpu.
I will come to the output in the next post.