This patch looks good to me (LGTM).
> -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Yang Rong > Sent: Wednesday, October 08, 2014 3:56 PM > To: [email protected] > Cc: Yang, Rong R > Subject: [Beignet] [Patch V2] Fix memcpy and memset bug. > > In ocl_memcpy.ll and ocl_memset.ll, index+4 must be less than or equal to > size when int (4-byte) accesses are used in memcpy and memset, and alignment must also be considered. > > V2: Add the alignment argument, fix the condition. > Signed-off-by: Yang Rong <[email protected]> > --- > backend/src/libocl/src/ocl_memcpy.ll | 704 > ++++++++++++++++----------- > backend/src/libocl/src/ocl_memset.ll | 225 +++++---- > backend/src/llvm/llvm_intrinsic_lowering.cpp | 14 +- > 3 files changed, 546 insertions(+), 397 deletions(-) > > diff --git a/backend/src/libocl/src/ocl_memcpy.ll > b/backend/src/libocl/src/ocl_memcpy.ll > index 476033e..64c68bb 100644 > --- a/backend/src/libocl/src/ocl_memcpy.ll > +++ b/backend/src/libocl/src/ocl_memcpy.ll > @@ -1,336 +1,446 @@ > ;The memcpy's source code. 
> -; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, > size_t size) { > +; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, > size_t > +size, uint alignment) { > ; size_t index = 0; > -; while((index + 4) >= size) { > -; *((uint *)(dst + index)) = *((uint *)(src + index)); > -; index += 4; > -; } > -; while(index < size) { > -; dst[index] = src[index]; > -; index++; > -; } > +; if(alignment % 4 == 0) { > +; while((index + 4) <= size) { > +; *((__global uint *)(dst + index)) = *((__global uint *)(src + > index)); > +; index += 4; > +; } > +; } > +; while(index < size) { > +; dst[index] = src[index]; > +; index++; > +; } > ; } > > -define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 > addrspace(1)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* > - %1 = load i32 addrspace(1)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0 > - %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* > - store i32 %1, i32 addrspace(1)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1 > - %3 = load i8 addrspace(1)* 
%arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(1)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, %while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(1)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(1)* > + %3 = bitcast i8 addrspace(1)* %2 to i32 addrspace(1)* > + %4 = load i32 addrspace(1)* %3, align 4 > + %5 = ptrtoint i8 addrspace(1)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(1)* > + %8 = bitcast i8 addrspace(1)* %7 to i32 addrspace(1)* > + store i32 %4, i32 addrspace(1)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, %while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(1)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(1)* > + %12 = load i8 addrspace(1)* %11, align 1 > + %13 = ptrtoint i8 addrspace(1)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(1)* > + store i8 %12, i8 addrspace(1)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = 
icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 > addrspace(0)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)* > - %1 = load i32 addrspace(0)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0 > - %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* > - store i32 %1, i32 addrspace(1)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1 > - %3 = load i8 addrspace(0)* %arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(1)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, 
%while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(0)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(0)* > + %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)* > + %4 = load i32 addrspace(0)* %3, align 4 > + %5 = ptrtoint i8 addrspace(1)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(1)* > + %8 = bitcast i8 addrspace(1)* %7 to i32 addrspace(1)* > + store i32 %4, i32 addrspace(1)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, %while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(0)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(0)* > + %12 = load i8 addrspace(0)* %11, align 1 > + %13 = ptrtoint i8 addrspace(1)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(1)* > + store i8 %12, i8 addrspace(1)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 > addrspace(3)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - 
%index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > - %1 = load i32 addrspace(3)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0 > - %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* > - store i32 %1, i32 addrspace(1)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1 > - %3 = load i8 addrspace(3)* %arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(1)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, %while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(3)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(3)* > + %3 = bitcast i8 
addrspace(3)* %2 to i32 addrspace(3)* > + %4 = load i32 addrspace(3)* %3, align 4 > + %5 = ptrtoint i8 addrspace(1)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(1)* > + %8 = bitcast i8 addrspace(1)* %7 to i32 addrspace(1)* > + store i32 %4, i32 addrspace(1)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, %while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(3)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(3)* > + %12 = load i8 addrspace(3)* %11, align 1 > + %13 = ptrtoint i8 addrspace(1)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(1)* > + store i8 %12, i8 addrspace(1)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 > addrspace(1)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* > - %1 = load i32 addrspace(1)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0 > - %2 = bitcast i8 
addrspace(0)* %add.ptr1 to i32 addrspace(0)* > - store i32 %1, i32 addrspace(0)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1 > - %3 = load i8 addrspace(1)* %arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(0)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, %while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(1)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(1)* > + %3 = bitcast i8 addrspace(1)* %2 to i32 addrspace(1)* > + %4 = load i32 addrspace(1)* %3, align 4 > + %5 = ptrtoint i8 addrspace(0)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(0)* > + %8 = bitcast i8 addrspace(0)* %7 to i32 addrspace(0)* > + store i32 %4, i32 addrspace(0)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, 
%while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(1)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(1)* > + %12 = load i8 addrspace(1)* %11, align 1 > + %13 = ptrtoint i8 addrspace(0)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(0)* > + store i8 %12, i8 addrspace(0)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 > addrspace(0)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)* > - %1 = load i32 addrspace(0)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0 > - %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)* > - store i32 %1, i32 addrspace(0)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1 > - %3 = load i8 addrspace(0)* 
%arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(0)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, %while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(0)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(0)* > + %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)* > + %4 = load i32 addrspace(0)* %3, align 4 > + %5 = ptrtoint i8 addrspace(0)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(0)* > + %8 = bitcast i8 addrspace(0)* %7 to i32 addrspace(0)* > + store i32 %4, i32 addrspace(0)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, %while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(0)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(0)* > + %12 = load i8 addrspace(0)* %11, align 1 > + %13 = ptrtoint i8 addrspace(0)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(0)* > + store i8 %12, i8 addrspace(0)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = 
icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 > addrspace(3)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > - %1 = load i32 addrspace(3)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0 > - %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)* > - store i32 %1, i32 addrspace(0)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1 > - %3 = load i8 addrspace(3)* %arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(0)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, 
%while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(3)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(3)* > + %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)* > + %4 = load i32 addrspace(3)* %3, align 4 > + %5 = ptrtoint i8 addrspace(0)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(0)* > + %8 = bitcast i8 addrspace(0)* %7 to i32 addrspace(0)* > + store i32 %4, i32 addrspace(0)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, %while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(3)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(3)* > + %12 = load i8 addrspace(3)* %11, align 1 > + %13 = ptrtoint i8 addrspace(0)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(0)* > + store i8 %12, i8 addrspace(0)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 > addrspace(1)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - 
%index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* > - %1 = load i32 addrspace(1)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > - %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)* > - store i32 %1, i32 addrspace(3)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1 > - %3 = load i8 addrspace(1)* %arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(3)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, %while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(1)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(1)* > + %3 = bitcast i8 
addrspace(1)* %2 to i32 addrspace(1)* > + %4 = load i32 addrspace(1)* %3, align 4 > + %5 = ptrtoint i8 addrspace(3)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(3)* > + %8 = bitcast i8 addrspace(3)* %7 to i32 addrspace(3)* > + store i32 %4, i32 addrspace(3)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, %while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(1)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(1)* > + %12 = load i8 addrspace(1)* %11, align 1 > + %13 = ptrtoint i8 addrspace(3)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(3)* > + store i8 %12, i8 addrspace(3)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 > addrspace(0)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)* > - %1 = load i32 addrspace(0)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > - %2 = bitcast i8 
addrspace(3)* %add.ptr1 to i32 addrspace(3)* > - store i32 %1, i32 addrspace(3)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1 > - %3 = load i8 addrspace(0)* %arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(3)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, %while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(0)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(0)* > + %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)* > + %4 = load i32 addrspace(0)* %3, align 4 > + %5 = ptrtoint i8 addrspace(3)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(3)* > + %8 = bitcast i8 addrspace(3)* %7 to i32 addrspace(3)* > + store i32 %4, i32 addrspace(3)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, 
%while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(0)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(0)* > + %12 = load i8 addrspace(0)* %11, align 1 > + %13 = ptrtoint i8 addrspace(3)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(3)* > + store i8 %12, i8 addrspace(3)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > > -define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 > addrspace(3)* %src, i32 %size) nounwind alwaysinline { > +define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* > +%src, i32 %size, i32 %alignment) nounwind alwaysinline { > entry: > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond3, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0 > - %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > - %1 = load i32 addrspace(3)* %0, align 4 > - %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > - %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)* > - store i32 %1, i32 addrspace(3)* %2, align 4 > - br label %while.cond > - > -while.cond3: ; preds > = %while.cond, %while.body5 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > - %cmp4 = icmp ult i32 %index.1, %size > - br i1 %cmp4, label %while.body5, label %while.end7 > - > -while.body5: ; preds > = %while.cond3 > - %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1 > - %3 = load i8 addrspace(3)* 
%arrayidx, align 1 > - %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > - store i8 %3, i8 addrspace(3)* %arrayidx6, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond3 > - > -while.end7: ; preds > = %while.cond3 > + %rem = and i32 %alignment, 3 > + %cmp = icmp ne i32 %rem, 0 > + %cmp113 = icmp ult i32 %size, 4 > + %or.cond = or i1 %cmp, %cmp113 > + br i1 %or.cond, label %while.cond4.preheader, label %while.body > + > +while.cond4.preheader: ; preds > = %entry, %while.body > + %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ] > + %cmp511 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp511, label %while.body6, label %while.end8 > + > +while.body: ; preds > = %entry, %while.body > + %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ] > + %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ] > + %0 = ptrtoint i8 addrspace(3)* %src to i32 > + %1 = add i32 %0, %index.014 > + %2 = inttoptr i32 %1 to i8 addrspace(3)* > + %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)* > + %4 = load i32 addrspace(3)* %3, align 4 > + %5 = ptrtoint i8 addrspace(3)* %dst to i32 > + %6 = add i32 %5, %index.014 > + %7 = inttoptr i32 %6 to i8 addrspace(3)* > + %8 = bitcast i8 addrspace(3)* %7 to i32 addrspace(3)* > + store i32 %4, i32 addrspace(3)* %8, align 4 > + %add = add i32 %add15, 4 > + %cmp1 = icmp ugt i32 %add, %size > + br i1 %cmp1, label %while.cond4.preheader, label %while.body > + > +while.body6: ; preds > = %while.cond4.preheader, %while.body6 > + %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph, > +%while.cond4.preheader ] > + %9 = ptrtoint i8 addrspace(3)* %src to i32 > + %10 = add i32 %9, %index.112 > + %11 = inttoptr i32 %10 to i8 addrspace(3)* > + %12 = load i8 addrspace(3)* %11, align 1 > + %13 = ptrtoint i8 addrspace(3)* %dst to i32 > + %14 = add i32 %13, %index.112 > + %15 = inttoptr i32 %14 to i8 addrspace(3)* > + store i8 %12, i8 addrspace(3)* %15, align 1 > + %inc = add i32 %index.112, 1 > + %cmp5 = 
icmp ult i32 %inc, %size > + br i1 %cmp5, label %while.body6, label %while.end8 > + > +while.end8: ; preds > = %while.body6, %while.cond4.preheader > ret void > } > diff --git a/backend/src/libocl/src/ocl_memset.ll > b/backend/src/libocl/src/ocl_memset.ll > index addf9f5..661520d 100644 > --- a/backend/src/libocl/src/ocl_memset.ll > +++ b/backend/src/libocl/src/ocl_memset.ll > @@ -1,127 +1,160 @@ > ;The memset's source code. > -; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t > size) { > +; INLINE_OVERLOADABLE void __gen_memset(__global uchar* dst, uchar > val, > +size_t size, uint alignment) { > ; size_t index = 0; > -; uint v = (val << 24) | (val << 16) | (val << 8) | val; > -; while((index + 4) >= size) { > -; *((uint *)(dst + index)) = v; > -; index += 4; > -; } > +; uint v; > +; if(alignment % 4 == 0) { > +; v = (val << 24) | (val << 16) | (val << 8) | val; > +; while((index + 4) <= size) { > +; *((__global uint *)(dst + index)) = v; > +; index += 4; > +; } > +; } > ; while(index < size) { > ; dst[index] = val; > ; index++; > ; } > ; } > > -define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) > nounwind alwaysinline { > +define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 > +%size, i32 %alignment) nounwind alwaysinline { > entry: > + %rem = and i32 %alignment, 3 > + %cmp = icmp eq i32 %rem, 0 > + br i1 %cmp, label %if.then, label %while.cond11.preheader > + > +if.then: ; preds = %entry > %conv = zext i8 %val to i32 > %shl = shl nuw i32 %conv, 24 > %shl2 = shl nuw nsw i32 %conv, 16 > - %or = or i32 %shl, %shl2 > %shl4 = shl nuw nsw i32 %conv, 8 > - %or5 = or i32 %or, %shl4 > - %or7 = or i32 %or5, %conv > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond10, label %while.body > - > -while.body: ; preds > = %while.cond > - 
%add.ptr = getelementptr inbounds i8* %dst, i32 %index.0 > - %0 = bitcast i8* %add.ptr to i32* > - store i32 %or7, i32* %0, align 4 > - br label %while.cond > - > -while.cond10: ; preds > = %while.cond, %while.body13 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ] > - %cmp11 = icmp ult i32 %index.1, %size > - br i1 %cmp11, label %while.body13, label %while.end14 > - > -while.body13: ; preds > = %while.cond10 > - %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1 > - store i8 %val, i8* %arrayidx, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond10 > - > -while.end14: ; preds > = %while.cond10 > + %or = or i32 %shl2, %conv > + %or5 = or i32 %or, %shl > + %or7 = or i32 %or5, %shl4 > + %cmp814 = icmp ult i32 %size, 4 > + br i1 %cmp814, label %while.cond11.preheader, label %while.body > + > +while.cond11.preheader: ; preds > = %if.then, %while.body, %entry > + %index.1.ph = phi i32 [ 0, %entry ], [ 0, %if.then ], [ %add16, > +%while.body ] > + %cmp1212 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp1212, label %while.body14, label %while.end15 > + > +while.body: ; preds > = %if.then, %while.body > + %add16 = phi i32 [ %add, %while.body ], [ 4, %if.then ] > + %index.015 = phi i32 [ %add16, %while.body ], [ 0, %if.then ] > + %0 = ptrtoint i8 addrspace(0)* %dst to i32 > + %1 = add i32 %0, %index.015 > + %2 = inttoptr i32 %1 to i8 addrspace(0)* > + %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)* > + store i32 %or7, i32 addrspace(0)* %3, align 4 > + %add = add i32 %add16, 4 > + %cmp8 = icmp ugt i32 %add, %size > + br i1 %cmp8, label %while.cond11.preheader, label %while.body > + > +while.body14: ; preds > = %while.cond11.preheader, %while.body14 > + %index.113 = phi i32 [ %inc, %while.body14 ], [ %index.1.ph, > +%while.cond11.preheader ] > + %4 = ptrtoint i8 addrspace(0)* %dst to i32 > + %5 = add i32 %4, %index.113 > + %6 = inttoptr i32 %5 to i8 addrspace(0)* > + store i8 %val, i8 addrspace(0)* %6, align 1 > + %inc = add 
i32 %index.113, 1 > + %cmp12 = icmp ult i32 %inc, %size > + br i1 %cmp12, label %while.body14, label %while.end15 > + > +while.end15: ; preds > = %while.body14, %while.cond11.preheader > ret void > } > > -define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, > i32 %size) nounwind alwaysinline { > +define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 > +%size, i32 %alignment) nounwind alwaysinline { > entry: > + %rem = and i32 %alignment, 3 > + %cmp = icmp eq i32 %rem, 0 > + br i1 %cmp, label %if.then, label %while.cond11.preheader > + > +if.then: ; preds = %entry > %conv = zext i8 %val to i32 > %shl = shl nuw i32 %conv, 24 > %shl2 = shl nuw nsw i32 %conv, 16 > - %or = or i32 %shl, %shl2 > %shl4 = shl nuw nsw i32 %conv, 8 > - %or5 = or i32 %or, %shl4 > - %or7 = or i32 %or5, %conv > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond10, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0 > - %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* > - store i32 %or7, i32 addrspace(1)* %0, align 4 > - br label %while.cond > - > -while.cond10: ; preds > = %while.cond, %while.body13 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ] > - %cmp11 = icmp ult i32 %index.1, %size > - br i1 %cmp11, label %while.body13, label %while.end14 > - > -while.body13: ; preds > = %while.cond10 > - %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1 > - store i8 %val, i8 addrspace(1)* %arrayidx, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond10 > - > -while.end14: ; preds > = %while.cond10 > + %or = or i32 %shl2, %conv > + %or5 = or i32 %or, %shl > + %or7 = or i32 %or5, %shl4 > + %cmp814 = icmp ult i32 %size, 4 > + br i1 %cmp814, 
label %while.cond11.preheader, label %while.body > + > +while.cond11.preheader: ; preds > = %if.then, %while.body, %entry > + %index.1.ph = phi i32 [ 0, %entry ], [ 0, %if.then ], [ %add16, > +%while.body ] > + %cmp1212 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp1212, label %while.body14, label %while.end15 > + > +while.body: ; preds > = %if.then, %while.body > + %add16 = phi i32 [ %add, %while.body ], [ 4, %if.then ] > + %index.015 = phi i32 [ %add16, %while.body ], [ 0, %if.then ] > + %0 = ptrtoint i8 addrspace(1)* %dst to i32 > + %1 = add i32 %0, %index.015 > + %2 = inttoptr i32 %1 to i8 addrspace(1)* > + %3 = bitcast i8 addrspace(1)* %2 to i32 addrspace(1)* > + store i32 %or7, i32 addrspace(1)* %3, align 4 > + %add = add i32 %add16, 4 > + %cmp8 = icmp ugt i32 %add, %size > + br i1 %cmp8, label %while.cond11.preheader, label %while.body > + > +while.body14: ; preds > = %while.cond11.preheader, %while.body14 > + %index.113 = phi i32 [ %inc, %while.body14 ], [ %index.1.ph, > +%while.cond11.preheader ] > + %4 = ptrtoint i8 addrspace(1)* %dst to i32 > + %5 = add i32 %4, %index.113 > + %6 = inttoptr i32 %5 to i8 addrspace(1)* > + store i8 %val, i8 addrspace(1)* %6, align 1 > + %inc = add i32 %index.113, 1 > + %cmp12 = icmp ult i32 %inc, %size > + br i1 %cmp12, label %while.body14, label %while.end15 > + > +while.end15: ; preds > = %while.body14, %while.cond11.preheader > ret void > } > > -define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, > i32 %size) nounwind alwaysinline { > +define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 > +%size, i32 %alignment) nounwind alwaysinline { > entry: > + %rem = and i32 %alignment, 3 > + %cmp = icmp eq i32 %rem, 0 > + br i1 %cmp, label %if.then, label %while.cond11.preheader > + > +if.then: ; preds = %entry > %conv = zext i8 %val to i32 > %shl = shl nuw i32 %conv, 24 > %shl2 = shl nuw nsw i32 %conv, 16 > - %or = or i32 %shl, %shl2 > %shl4 = shl nuw nsw i32 %conv, 8 > - %or5 = or i32 %or, 
%shl4 > - %or7 = or i32 %or5, %conv > - br label %while.cond > - > -while.cond: ; preds > = %while.body, %entry > - %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > - %add = add i32 %index.0, 4 > - %cmp = icmp ult i32 %add, %size > - br i1 %cmp, label %while.cond10, label %while.body > - > -while.body: ; preds > = %while.cond > - %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > - %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > - store i32 %or7, i32 addrspace(3)* %0, align 4 > - br label %while.cond > - > -while.cond10: ; preds > = %while.cond, %while.body13 > - %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ] > - %cmp11 = icmp ult i32 %index.1, %size > - br i1 %cmp11, label %while.body13, label %while.end14 > - > -while.body13: ; preds > = %while.cond10 > - %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > - store i8 %val, i8 addrspace(3)* %arrayidx, align 1 > - %inc = add i32 %index.1, 1 > - br label %while.cond10 > - > -while.end14: ; preds > = %while.cond10 > + %or = or i32 %shl2, %conv > + %or5 = or i32 %or, %shl > + %or7 = or i32 %or5, %shl4 > + %cmp814 = icmp ult i32 %size, 4 > + br i1 %cmp814, label %while.cond11.preheader, label %while.body > + > +while.cond11.preheader: ; preds > = %if.then, %while.body, %entry > + %index.1.ph = phi i32 [ 0, %entry ], [ 0, %if.then ], [ %add16, > +%while.body ] > + %cmp1212 = icmp ult i32 %index.1.ph, %size > + br i1 %cmp1212, label %while.body14, label %while.end15 > + > +while.body: ; preds > = %if.then, %while.body > + %add16 = phi i32 [ %add, %while.body ], [ 4, %if.then ] > + %index.015 = phi i32 [ %add16, %while.body ], [ 0, %if.then ] > + %0 = ptrtoint i8 addrspace(3)* %dst to i32 > + %1 = add i32 %0, %index.015 > + %2 = inttoptr i32 %1 to i8 addrspace(3)* > + %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)* > + store i32 %or7, i32 addrspace(3)* %3, align 4 > + %add = add i32 %add16, 4 > + %cmp8 = icmp ugt i32 %add, %size 
> + br i1 %cmp8, label %while.cond11.preheader, label %while.body > + > +while.body14: ; preds > = %while.cond11.preheader, %while.body14 > + %index.113 = phi i32 [ %inc, %while.body14 ], [ %index.1.ph, > +%while.cond11.preheader ] > + %4 = ptrtoint i8 addrspace(3)* %dst to i32 > + %5 = add i32 %4, %index.113 > + %6 = inttoptr i32 %5 to i8 addrspace(3)* > + store i8 %val, i8 addrspace(3)* %6, align 1 > + %inc = add i32 %index.113, 1 > + %cmp12 = icmp ult i32 %inc, %size > + br i1 %cmp12, label %while.body14, label %while.end15 > + > +while.end15: ; preds > = %while.body14, %while.cond11.preheader > ret void > } > diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp > b/backend/src/llvm/llvm_intrinsic_lowering.cpp > index 7d04318..1466de0 100644 > --- a/backend/src/llvm/llvm_intrinsic_lowering.cpp > +++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp > @@ -126,14 +126,17 @@ namespace gbe { > Type *IntPtr = TD.getIntPtrType(Context); > Value *Size = > Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, > /* isSigned > */ false); > - Value *Ops[3]; > + Value *alignment = > Builder.CreateIntCast(CI->getArgOperand(3), IntPtr, > + /* > isSigned */ false); > + Value *Ops[4]; > Ops[0] = CI->getArgOperand(0); > Ops[1] = CI->getArgOperand(1); > Ops[2] = Size; > + Ops[3] = alignment; > char name[16] = "__gen_memcpy_xx"; > name[13] = convertSpaceToName(Ops[0]); > name[14] = convertSpaceToName(Ops[1]); > - replaceCallWith(name, CI, Ops, Ops+3, > Type::getVoidTy(Context)); > + replaceCallWith(name, CI, Ops, Ops+4, > + Type::getVoidTy(Context)); > break; > } > case Intrinsic::memset: { @@ -143,14 +146,17 @@ > namespace gbe { > Type *IntPtr = TD.getIntPtrType(Op0->getType()); > Value *Size = > Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, > /* isSigned > */ false); > - Value *Ops[3]; > + Value *alignment = > Builder.CreateIntCast(CI->getArgOperand(3), IntPtr, > + /* > isSigned */ false); > + Value *Ops[4]; > Ops[0] = Op0; > // Extend the amount to i32. 
> Ops[1] = val; > Ops[2] = Size; > + Ops[3] = alignment; > char name[16] = "__gen_memset_x"; > name[13] = convertSpaceToName(Ops[0]); > - replaceCallWith(name, CI, Ops, Ops+3, > Type::getVoidTy(Context)); > + replaceCallWith(name, CI, Ops, Ops+4, > + Type::getVoidTy(Context)); > break; > } > default: > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
