Tested on my HSW platform, no obvious regression found.
On Thu, 2014-06-12 at 19:42 +0800, Yang Rong wrote: > GT3 has 4 half slices, so the shift left should be 2 bits, and the stack buffer size > should also be enlarged; > otherwise, if thread generation is unbalanced, accesses may go out of bounds. > Per the Bspec, the scratch size needs to be set to 2X the desired size. > > Signed-off-by: Yang Rong <rong.r.y...@intel.com> > --- > backend/src/backend/gen75_context.cpp | 4 ++-- > src/cl_command_queue_gen7.c | 6 ++++++ > src/intel/intel_gpgpu.c | 3 +++ > 3 files changed, 11 insertions(+), 2 deletions(-) > > diff --git a/backend/src/backend/gen75_context.cpp > b/backend/src/backend/gen75_context.cpp > index aedd4d3..da0db85 100644 > --- a/backend/src/backend/gen75_context.cpp > +++ b/backend/src/backend/gen75_context.cpp > @@ -92,12 +92,12 @@ namespace gbe > p->curr.predicate = GEN_PREDICATE_NONE; > //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), > GenRegister::immud(0x1ff)); > p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), > GenRegister::immud(0x7f)); > - p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), > GenRegister::immud(0x80)); > + p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), > GenRegister::immud(0x180)); > p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), > GenRegister::immud(7)); > p->curr.execWidth = this->simdWidth; > p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); > p->curr.execWidth = 1; > - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immud(1)); > + p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immud(2)); > p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::ud1grf(126, 4)); > p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immud(perThreadShift)); > p->curr.execWidth = this->simdWidth; > diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c > index 9680535..af3030c 100644 > --- a/src/cl_command_queue_gen7.c > +++ 
b/src/cl_command_queue_gen7.c > @@ -244,6 +244,12 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) > assert(offset >= 0); > stack_sz *= gbe_kernel_get_simd_width(ker->opaque); > stack_sz *= device->max_compute_unit; > + /* Because HSW calc stack offset per thread is relative with half slice, > when > + thread schedule in half slice is not balance, would out of bound. > Because > + the max half slice is 4 in GT4, multiply stack size with 4 for safe. > + */ > + if(cl_driver_get_ver(ctx->drv) == 75) > + stack_sz *= 4; > cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl()); > } > > diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c > index 5093583..cae843b 100644 > --- a/src/intel/intel_gpgpu.c > +++ b/src/intel/intel_gpgpu.c > @@ -833,6 +833,9 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t > per_thread_size) > drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; > drm_intel_bo* old = gpgpu->scratch_b.bo; > uint32_t total = per_thread_size * gpgpu->max_threads; > + /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may > hang */ > + if (IS_HASWELL(gpgpu->drv->device_id)) > + total *= 2; > > gpgpu->per_thread_scratch = per_thread_size; > _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet