good, thanks for the review comments. I will push my patch soon.
On Mon, Aug 05, 2013 at 08:19:30AM +0000, Xing, Homer wrote: > Yes. I will do the rebase work. > > -----Original Message----- > From: Zhigang Gong [mailto:zhigang.g...@gmail.com] > Sent: Monday, August 5, 2013 4:09 PM > To: Xing, Homer > Cc: beignet@lists.freedesktop.org > Subject: Re: [Beignet] [PATCH] GBE: refactor double support. > > Homer, > > Did you aware of that this patch conflicts your 64 bit vector allocation > patch and all your 64 bit int patchset? > > I will ignore your 64bit vector allocation patch, as it was already handled > in this patch. And all your other patchset need to be rebased to this patch? > > Are you ok with that? > > On Mon, Aug 05, 2013 at 07:30:52AM +0000, Xing, Homer wrote: > > You turned off predicate and mask in 64-bit reading/writing. And use SIMD16 > > data port reading for SIMD8 read. Good idea. > > > > This patch looks good to me. > > > > Homer > > > > -----Original Message----- > > From: beignet-bounces+homer.xing=intel....@lists.freedesktop.org > > [mailto:beignet-bounces+homer.xing=intel....@lists.freedesktop.org] On > > Behalf Of Zhigang Gong > > Sent: Monday, August 5, 2013 11:08 AM > > To: beignet@lists.freedesktop.org > > Cc: Zhigang Gong > > Subject: [Beignet] [PATCH] GBE: refactor double support. > > > > There are two major issues in double support: > > 1. Doesn't work at SIMD16 mode. > > 2. The incorrect usage of vectors. We only need to allocate those temporary > > register to contiguous registers. > > > > If you look at the previous implementation of > > READ_FLOAT64/WRITE_FLOAT64 in gen_encoder.cpp. You can easily find it > > contains many duplicate code and considering the SIMD16 code path never > > work correctly, it's so difficult to work based on that code. So I choose > > to refactor those two major functions. > > And refine other parts in the instruction selection stage to fix the above > > two major problem with a cleaner code. > > > > Now, it works well on both SIMD16/SIMD8 mode. > > Another minor improvement is for the READ_FLOAT64 on SIMD8 mode, this > > patch saves one time of send instruction to read all the > > 8 double data into registers. > > > > Signed-off-by: Zhigang Gong <zhigang.g...@linux.intel.com> > > --- > > backend/src/backend/gen_context.cpp | 21 ++- > > backend/src/backend/gen_encoder.cpp | 231 > > +++++++++++++--------------- > > backend/src/backend/gen_encoder.hpp | 7 +- > > backend/src/backend/gen_insn_selection.cpp | 64 +++++--- > > backend/src/backend/gen_reg_allocation.cpp | 10 +- > > backend/src/backend/gen_register.hpp | 25 +++ > > backend/src/llvm/llvm_gen_backend.cpp | 4 +- > > 7 files changed, 200 insertions(+), 162 deletions(-) > > > > diff --git a/backend/src/backend/gen_context.cpp > > b/backend/src/backend/gen_context.cpp > > index e33d8da..655b1d7 100644 > > --- a/backend/src/backend/gen_context.cpp > > +++ b/backend/src/backend/gen_context.cpp > > @@ -354,12 +354,18 @@ namespace gbe > > p->pop(); > > } > > > > + // For SIMD8, we allocate 2*elemNum temporary registers from > > + dst(0), and // then follow the real destination registers. > > + // For SIMD16, we allocate elemNum temporary registers from dst(0). > > void GenContext::emitReadFloat64Instruction(const SelectionInstruction > > &insn) { > > - const GenRegister dst = ra->genReg(insn.dst(0)); > > + const uint32_t elemNum = insn.extra.elem; > > + const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : > > elemNum; > > + const GenRegister dst = ra->genReg(insn.dst(tmpRegSize)); > > + const GenRegister tmp = ra->genReg(insn.dst(0)); > > const GenRegister src = ra->genReg(insn.src(0)); > > + const GenRegister tempAddr = ra->genReg(insn.src(1)); > > const uint32_t bti = insn.extra.function; > > - const uint32_t elemNum = insn.extra.elem; > > - p->READ_FLOAT64(dst, src, bti, elemNum); > > + p->READ_FLOAT64(dst, tmp, tempAddr, src, bti, elemNum); > > } > > > > void GenContext::emitUntypedReadInstruction(const SelectionInstruction > > &insn) { @@ -370,11 +376,16 @@ namespace gbe > > p->UNTYPED_READ(dst, src, bti, elemNum); > > } > > > > + // For SIMD8, we allocate 2*elemNum temporary registers from > > + dst(0), and // then follow the real destination registers. > > + // For SIMD16, we allocate elemNum temporary registers from dst(0). > > void GenContext::emitWriteFloat64Instruction(const SelectionInstruction > > &insn) { > > const GenRegister src = ra->genReg(insn.src(0)); > > - const uint32_t bti = insn.extra.function; > > const uint32_t elemNum = insn.extra.elem; > > - p->WRITE_FLOAT64(src, bti, elemNum); > > + const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : > > elemNum; > > + const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1)); > > + const uint32_t bti = insn.extra.function; > > + p->WRITE_FLOAT64(src, data, bti, elemNum); > > } > > > > void GenContext::emitUntypedWriteInstruction(const > > SelectionInstruction &insn) { diff --git > > a/backend/src/backend/gen_encoder.cpp > > b/backend/src/backend/gen_encoder.cpp > > index f84c6dd..5930926 100644 > > --- a/backend/src/backend/gen_encoder.cpp > > +++ b/backend/src/backend/gen_encoder.cpp > > @@ -356,103 +356,69 @@ namespace gbe > > 0 > > }; > > > > - static int dst_type(int exec_width) { > > - if (exec_width == 8) > > - return GEN_TYPE_UD; > > - if (exec_width == 16) > > - return GEN_TYPE_UW; > > - NOT_IMPLEMENTED; > > - return 0; > > - } > > - > > - void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t > > bti, uint32_t elemNum) { > > - int w = curr.execWidth; > > - dst = GenRegister::h2(dst); > > - dst.type = GEN_TYPE_UD; > > - src.type = GEN_TYPE_UD; > > - GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), > > GEN_TYPE_UD); > > - GenRegister imm4 = GenRegister::immud(4); > > - GenInstruction *insn; > > - insn = next(GEN_OPCODE_SEND); > > - setHeader(insn); > > - setDst(insn, GenRegister::uw16grf(r.nr, 0)); > > - setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); > > - setSrc1(insn, GenRegister::immud(0)); > > - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, > > curr.execWidth / 8, curr.execWidth / 8); > > - push(); > > - curr.quarterControl = 0; > > - curr.nibControl = 0; > > - MOV(dst, r); > > - if (w == 8) > > - curr.nibControl = 1; > > - else > > - curr.quarterControl = 1; > > - MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2)); > > - pop(); > > - ADD(src, src, imm4); > > - insn = next(GEN_OPCODE_SEND); > > - setHeader(insn); > > - setDst(insn, GenRegister::uw16grf(r.nr, 0)); > > - setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); > > - setSrc1(insn, GenRegister::immud(0)); > > - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, > > curr.execWidth / 8, curr.execWidth / 8); > > + void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister tmp, > > GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) { > > + GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD); > > + src = GenRegister::retype(src, GEN_TYPE_UD); > > + addr = GenRegister::retype(addr, GEN_TYPE_UD); > > + tmp = GenRegister::retype(tmp, GEN_TYPE_UD); > > + uint32_t originSimdWidth = curr.execWidth; > > + uint32_t originPredicate = curr.predicate; > > + uint32_t originMask = curr.noMask; > > push(); > > - curr.quarterControl = 0; > > - curr.nibControl = 0; > > - MOV(GenRegister::suboffset(dst, 1), r); > > - if (w == 8) > > - curr.nibControl = 1; > > - else > > - curr.quarterControl = 1; > > - MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / > > 2)); > > + for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1; > > + channels < originSimdWidth; channels += 8, currQuarter++) { > > + curr.predicate = GEN_PREDICATE_NONE; > > + curr.noMask = GEN_MASK_DISABLE; > > + curr.execWidth = 8; > > + /* XXX The following instruction is illegal, but it works as SIMD > > 1*4 mode > > + which is what we want here. */ > > + MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels)); > > + ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), > > GenRegister::suboffset(src, channels), GenRegister::immd(4)); > > + MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), > > GenRegister::suboffset(src, channels + 4)); > > + ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), > > GenRegister::suboffset(src, channels + 4), GenRegister::immd(4)); > > + // get the first 8 doubles. > > + curr.execWidth = 16; > > + this->UNTYPED_READ(tmp, addr, bti, elemNum); > > + if (originSimdWidth == 16) > > + curr.quarterControl = currQuarter; > > + curr.predicate = originPredicate; > > + curr.noMask = originMask; > > + // Back to simd8 for correct predication flag. > > + curr.execWidth = 8; > > + MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), > > GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF)); > > + } > > pop(); > > } > > > > - void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t > > elemNum) { > > - int w = curr.execWidth; > > - GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), > > GEN_TYPE_UD); > > - r.type = GEN_TYPE_UD; > > - GenRegister hdr = GenRegister::h2(r); > > - GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0); > > - src.hstride = GEN_HORIZONTAL_STRIDE_2; > > - GenRegister data = GenRegister::offset(r, w / 8); > > - GenRegister imm4 = GenRegister::immud(4); > > - MOV(r, GenRegister::ud8grf(msg.nr, 0)); > > + void GenEncoder::WRITE_FLOAT64(GenRegister msg, GenRegister data, > > uint32_t bti, uint32_t elemNum) { > > + GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD); > > + msg = GenRegister::retype(msg, GEN_TYPE_UD); > > + int originSimdWidth = curr.execWidth; > > + int originPredicate = curr.predicate; > > + int originMask = curr.noMask; > > push(); > > - curr.quarterControl = 0; > > - curr.nibControl = 0; > > - MOV(data, src); > > - if (w == 8) > > - curr.nibControl = 1; > > - else > > - curr.quarterControl = 1; > > - MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, > > w)); > > - pop(); > > - GenInstruction *insn; > > - insn = next(GEN_OPCODE_SEND); > > - setHeader(insn); > > - setDst(insn, GenRegister::retype(GenRegister::null(), > > dst_type(curr.execWidth))); > > - setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0)); > > - setSrc1(insn, GenRegister::immud(0)); > > - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, > > curr.execWidth / 4, 0); > > - > > - ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4); > > - push(); > > - curr.quarterControl = 0; > > - curr.nibControl = 0; > > - MOV(data, GenRegister::suboffset(src, 1)); > > - if (w == 8) > > - curr.nibControl = 1; > > - else > > - curr.quarterControl = 1; > > - MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w > > + 1)); > > + for (uint32_t half = 0; half < 2; half++) { > > + curr.predicate = GEN_PREDICATE_NONE; > > + curr.noMask = GEN_MASK_DISABLE; > > + curr.execWidth = 8; > > + MOV(GenRegister::suboffset(msg, originSimdWidth), > > GenRegister::unpacked_ud(data32.nr, data32.subnr + half)); > > + if (originSimdWidth == 16) { > > + MOV(GenRegister::suboffset(msg, originSimdWidth + 8), > > GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half)); > > + curr.execWidth = 16; > > + } > > + if (half == 1) > > + ADD(GenRegister::retype(msg, GEN_TYPE_UD), > > GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4)); > > + curr.predicate = originPredicate; > > + curr.noMask = originMask; > > + this->UNTYPED_WRITE(msg, bti, elemNum); > > + } > > + /* Let's restore the original message(addr) register. */ > > + /* XXX could be optimized if we don't allocate the address to the > > header > > + position of the message. */ > > + curr.predicate = GEN_PREDICATE_NONE; > > + curr.noMask = GEN_MASK_DISABLE; > > + ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), > > + GenRegister::immd(-4)); > > pop(); > > - insn = next(GEN_OPCODE_SEND); > > - setHeader(insn); > > - setDst(insn, GenRegister::retype(GenRegister::null(), > > dst_type(curr.execWidth))); > > - setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0)); > > - setSrc1(insn, GenRegister::immud(0)); > > - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, > > curr.execWidth / 4, 0); > > } > > > > void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t > > bti, uint32_t elemNum) { @@ -470,7 +436,7 @@ namespace gbe > > NOT_IMPLEMENTED; > > > > this->setHeader(insn); > > - this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); > > + this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); > > this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); > > this->setSrc1(insn, GenRegister::immud(0)); > > setDPUntypedRW(this, > > @@ -601,25 +567,53 @@ namespace gbe > > return &this->store.back(); > > } > > > > - INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, > > GenRegister src) { > > - if (dst.isdf() && src.isdf()) { > > + INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister > > dst, > > + GenRegister src0, GenRegister src1 = > > + GenRegister::null()) { > > int w = p->curr.execWidth; > > p->push(); > > - p->curr.quarterControl = 0; > > p->curr.nibControl = 0; > > GenInstruction *insn = p->next(opcode); > > p->setHeader(insn); > > p->setDst(insn, dst); > > - p->setSrc0(insn, src); > > + p->setSrc0(insn, src0); > > + if (!GenRegister::isNull(src1)) > > + p->setSrc1(insn, src1); > > if (w == 8) > > p->curr.nibControl = 1; // second 1/8 mask > > - else // w == 16 > > - p->curr.quarterControl = 1; // second 1/4 mask > > insn = p->next(opcode); > > p->setHeader(insn); > > p->setDst(insn, GenRegister::suboffset(dst, w / 2)); > > - p->setSrc0(insn, GenRegister::suboffset(src, w / 2)); > > + p->setSrc0(insn, GenRegister::suboffset(src0, w / 2)); > > + if (!GenRegister::isNull(src1)) > > + p->setSrc1(insn, GenRegister::suboffset(src1, w / 2)); > > p->pop(); > > + } > > + > > + // Double register accessing is a little special, // Per Gen spec, > > + then only supported mode is SIMD8 and, it only // handles four > > + doubles each time. > > + // We need to lower down SIMD16 to two SIMD8 and lower down SIMD8 > > + // to two SIMD1x4. > > + INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, > > + GenRegister src0, GenRegister src1 = > > GenRegister::null()) { > > + if (p->curr.execWidth == 8) > > + _handleDouble(p, opcode, dst, src0, src1); > > + else if (p->curr.execWidth == 16) { > > + p->push(); > > + p->curr.execWidth = 8; > > + p->curr.quarterControl = GEN_COMPRESSION_Q1; > > + _handleDouble(p, opcode, dst, src0, src1); > > + p->curr.quarterControl = GEN_COMPRESSION_Q2; > > + if (!GenRegister::isNull(src1)) > > + src1 = GenRegister::offset(src1, 2); > > + _handleDouble(p, opcode, GenRegister::offset(dst, 2), > > GenRegister::offset(src0, 2), src1); > > + p->pop(); > > + } > > + } > > + > > + INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, > > GenRegister src) { > > + if (dst.isdf() && src.isdf()) { > > + handleDouble(p, opcode, dst, src); > > } else if (needToSplitAlu1(p, dst, src) == false) { > > GenInstruction *insn = p->next(opcode); > > p->setHeader(insn); > > @@ -653,25 +647,7 @@ namespace gbe > > GenRegister src1) > > { > > if (dst.isdf() && src0.isdf() && src1.isdf()) { > > - int w = p->curr.execWidth; > > - p->push(); > > - p->curr.quarterControl = 0; > > - p->curr.nibControl = 0; > > - GenInstruction *insn = p->next(opcode); > > - p->setHeader(insn); > > - p->setDst(insn, dst); > > - p->setSrc0(insn, src0); > > - p->setSrc1(insn, src1); > > - if (w == 8) > > - p->curr.nibControl = 1; // second 1/8 mask > > - else // w == 16 > > - p->curr.quarterControl = 1; // second 1/4 mask > > - insn = p->next(opcode); > > - p->setHeader(insn); > > - p->setDst(insn, GenRegister::suboffset(dst, w / 2)); > > - p->setSrc0(insn, GenRegister::suboffset(src0, w / 2)); > > - p->setSrc1(insn, GenRegister::suboffset(src1, w / 2)); > > - p->pop(); > > + handleDouble(p, opcode, dst, src0, src1); > > } else if (needToSplitAlu2(p, dst, src0, src1) == false) { > > GenInstruction *insn = p->next(opcode); > > p->setHeader(insn); > > @@ -808,7 +784,16 @@ namespace gbe > > r.width = GEN_WIDTH_1; > > r.hstride = GEN_HORIZONTAL_STRIDE_0; > > push(); > > + uint32_t width = curr.execWidth; > > + curr.execWidth = 8; > > + curr.predicate = GEN_PREDICATE_NONE; > > + curr.noMask = 1; > > + curr.quarterControl = GEN_COMPRESSION_Q1; > > MOV(dest, r); > > + if (width == 16) { > > + curr.quarterControl = GEN_COMPRESSION_Q2; > > + MOV(GenRegister::offset(dest, 2), r); > > + } > > pop(); > > } > > > > @@ -839,14 +824,8 @@ namespace gbe > > void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister > > r) { > > int w = curr.execWidth; > > if (src0.isdf()) { > > - push(); > > - curr.execWidth = 16; > > - MOV(dest, src0); > > - if (w == 16) { > > - curr.quarterControl = 1; > > - MOV(GenRegister::QnPhysical(dest, w / 4), > > GenRegister::QnPhysical(src0, w / 4)); > > - } > > - pop(); > > + GBE_ASSERT(0); // MOV DF is called from convert instruction, > > + // We should never convert a df to a df. > > } else { > > GenRegister r0 = GenRegister::h2(r); > > push(); > > diff --git a/backend/src/backend/gen_encoder.hpp > > b/backend/src/backend/gen_encoder.hpp > > index d3a7165..86e1a71 100644 > > --- a/backend/src/backend/gen_encoder.hpp > > +++ b/backend/src/backend/gen_encoder.hpp > > @@ -118,10 +118,11 @@ namespace gbe > > ALU2(LINE) > > ALU2(PLN) > > ALU3(MAD) > > - ALU2(MOV_DF); > > + //ALU2(MOV_DF); > > #undef ALU1 > > #undef ALU2 > > #undef ALU3 > > + void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = > > + GenRegister::null()); > > void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value); > > /*! Barrier message (to synchronize threads of a workgroup) */ > > void BARRIER(GenRegister src); > > @@ -142,9 +143,9 @@ namespace gbe > > /*! Atomic instructions */ > > void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, > > uint32_t bti, uint32_t srcNum); > > /*! Read 64-bits float arrays */ > > - void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, > > uint32_t elemNum); > > + void READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister > > + addr, GenRegister src, uint32_t bti, uint32_t elemNum); > > /*! Write 64-bits float arrays */ > > - void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum); > > + void WRITE_FLOAT64(GenRegister src, GenRegister data, uint32_t > > + bti, uint32_t elemNum); > > /*! Untyped read (upto 4 channels) */ > > void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, > > uint32_t elemNum); > > /*! Untyped write (upto 4 channels) */ diff --git > > a/backend/src/backend/gen_insn_selection.cpp > > b/backend/src/backend/gen_insn_selection.cpp > > index d4be8bf..727d07d 100644 > > --- a/backend/src/backend/gen_insn_selection.cpp > > +++ b/backend/src/backend/gen_insn_selection.cpp > > @@ -466,9 +466,9 @@ namespace gbe > > /*! Atomic instruction */ > > void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg > > src1, Reg src2, uint32_t bti); > > /*! Read 64 bits float array */ > > - void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, > > uint32_t bti); > > + void READ_FLOAT64(Reg addr, Reg tempAddr, const GenRegister *dst, > > + uint32_t elemNum, uint32_t valueNum, uint32_t bti); > > /*! Write 64 bits float array */ > > - void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, > > uint32_t bti); > > + void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t > > + elemNum, uint32_t valueNum, uint32_t bti); > > /*! Untyped read (up to 4 elements) */ > > void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, > > uint32_t bti); > > /*! Untyped write (up to 4 elements) */ @@ -760,12 +760,16 @@ > > namespace gbe > > void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); } > > void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, > > 0, 0); } > > > > + /* elemNum contains all the temporary register and the > > + real destination registers.*/ > > void Selection::Opaque::READ_FLOAT64(Reg addr, > > + Reg tempAddr, > > const GenRegister *dst, > > uint32_t elemNum, > > + uint32_t valueNum, > > uint32_t bti) > > { > > - SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, > > elemNum, 1); > > + SelectionInstruction *insn = > > + this->appendInsn(SEL_OP_READ_FLOAT64, > > + elemNum, 2); > > SelectionVector *srcVector = this->appendVector(); > > SelectionVector *dstVector = this->appendVector(); > > > > @@ -773,11 +777,12 @@ namespace gbe > > for (uint32_t elemID = 0; elemID < elemNum; ++elemID) > > insn->dst(elemID) = dst[elemID]; > > insn->src(0) = addr; > > + insn->src(1) = tempAddr; > > insn->extra.function = bti; > > - insn->extra.elem = elemNum; > > + insn->extra.elem = valueNum; > > > > - // Sends require contiguous allocation > > - dstVector->regNum = elemNum; > > + // Only the temporary registers need contiguous allocation > > + dstVector->regNum = elemNum - valueNum; > > dstVector->isSrc = 0; > > dstVector->reg = &insn->dst(0); > > > > @@ -814,9 +819,12 @@ namespace gbe > > srcVector->reg = &insn->src(0); > > } > > > > + /* elemNum contains all the temporary register and the > > + real data registers.*/ > > void Selection::Opaque::WRITE_FLOAT64(Reg addr, > > const GenRegister *src, > > uint32_t elemNum, > > + uint32_t valueNum, > > uint32_t bti) > > { > > SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, > > elemNum+1); @@ -827,10 +835,10 @@ namespace gbe > > for (uint32_t elemID = 0; elemID < elemNum; ++elemID) > > insn->src(elemID+1) = src[elemID]; > > insn->extra.function = bti; > > - insn->extra.elem = elemNum; > > + insn->extra.elem = valueNum; > > > > - // Sends require contiguous allocation for the sources > > - vector->regNum = elemNum+1; > > + // Only the addr + temporary registers need to be contiguous. > > + vector->regNum = (elemNum - valueNum) + 1; > > vector->reg = &insn->src(0); > > vector->isSrc = 1; > > } > > @@ -1871,13 +1879,17 @@ namespace gbe > > { > > using namespace ir; > > const uint32_t valueNum = insn.getValueNum(); > > - vector<GenRegister> dst(valueNum); > > - for (uint32_t dstID = 0; dstID < valueNum; ++dstID) > > - dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), > > GEN_TYPE_F); > > - dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); > > - if (sel.ctx.getSimdWidth() == 16) > > - dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); > > - sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti); > > + GenRegister dst[valueNum * 3]; > > + uint32_t dstID; > > + /* XXX support scalar only right now. */ > > + GBE_ASSERT(valueNum == 1); > > + uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : > > valueNum; > > + // The first 16 DWORD register space is for temporary usage at > > encode stage. > > + for (dstID = 0; dstID < tmpRegNum ; ++dstID) > > + dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD)); > > + for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID) > > + dst[dstID] = sel.selReg(insn.getValue(valueID)); > > + sel.READ_FLOAT64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, > > + valueNum + tmpRegNum, valueNum, bti); > > } > > > > void emitByteGather(Selection::Opaque &sel, @@ -1971,15 +1983,19 @@ > > namespace gbe > > const uint32_t valueNum = insn.getValueNum(); > > const uint32_t addrID = ir::StoreInstruction::addressIndex; > > GenRegister addr; > > - vector<GenRegister> value(valueNum); > > - > > + GenRegister src[valueNum * 2]; > > + uint32_t srcID; > > + /* XXX support scalar only right now. */ > > + GBE_ASSERT(valueNum == 1); > > addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), > > GEN_TYPE_F); > > - for (uint32_t valueID = 0; valueID < valueNum; ++valueID) > > - value[valueID] = > > GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F); > > - value.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); > > - if (sel.ctx.getSimdWidth() == 16) > > - value.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); > > - sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti); > > + uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : > > valueNum; > > + // The first 16 DWORD register space is for temporary usage at > > encode stage. > > + for (srcID = 0; srcID < tmpRegNum; ++srcID) > > + src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD)); > > + > > + for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID) > > + src[srcID] = sel.selReg(insn.getValue(valueID)); > > + sel.WRITE_FLOAT64(addr, src, valueNum + tmpRegNum, valueNum, > > + bti); > > } > > > > void emitByteScatter(Selection::Opaque &sel, diff --git > > a/backend/src/backend/gen_reg_allocation.cpp > > b/backend/src/backend/gen_reg_allocation.cpp > > index e7c96ac..4ba03ea 100644 > > --- a/backend/src/backend/gen_reg_allocation.cpp > > +++ b/backend/src/backend/gen_reg_allocation.cpp > > @@ -474,7 +474,12 @@ namespace gbe > > if (it != vectorMap.end()) { > > const SelectionVector *vector = it->second.first; > > const uint32_t simdWidth = ctx.getSimdWidth(); > > - const uint32_t alignment = simdWidth * sizeof(uint32_t); > > + > > + const ir::RegisterData regData = ctx.sel->getRegisterData(reg); > > + const ir::RegisterFamily family = regData.family; > > + const uint32_t typeSize = familyVectorSize[family]; > > + const uint32_t alignment = simdWidth*typeSize; > > + > > const uint32_t size = vector->regNum * alignment; > > uint32_t grfOffset; > > while ((grfOffset = ctx.allocate(size, alignment)) == 0) { @@ > > -483,7 +488,8 @@ namespace gbe > > } > > for (uint32_t regID = 0; regID < vector->regNum; ++regID, > > grfOffset += alignment) { > > const ir::Register reg = vector->reg[regID].reg(); > > - GBE_ASSERT(RA.contains(reg) == false); > > + GBE_ASSERT(RA.contains(reg) == false > > + && ctx.sel->getRegisterData(reg).family == > > + family); > > RA.insert(std::make_pair(reg, grfOffset)); > > } > > } > > diff --git a/backend/src/backend/gen_register.hpp > > b/backend/src/backend/gen_register.hpp > > index fedb743..7e48837 100644 > > --- a/backend/src/backend/gen_register.hpp > > +++ b/backend/src/backend/gen_register.hpp > > @@ -553,6 +553,11 @@ namespace gbe > > GEN_HORIZONTAL_STRIDE_1); > > } > > > > + static INLINE bool isNull(GenRegister reg) { > > + return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE > > + && reg.nr == GEN_ARF_NULL); > > + } > > + > > static INLINE GenRegister acc(void) { > > return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, > > GEN_ARF_ACCUMULATOR, @@ -832,6 +837,26 @@ > > namespace gbe > > GEN_HORIZONTAL_STRIDE_2); > > } > > > > + static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) { > > + return GenRegister(GEN_GENERAL_REGISTER_FILE, > > + nr, > > + subnr, > > + GEN_TYPE_UD, > > + GEN_VERTICAL_STRIDE_8, > > + GEN_WIDTH_4, > > + GEN_HORIZONTAL_STRIDE_1); > > + } > > + > > + static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) { > > + return GenRegister(GEN_GENERAL_REGISTER_FILE, > > + nr, > > + subnr, > > + GEN_TYPE_UD, > > + GEN_VERTICAL_STRIDE_8, > > + GEN_WIDTH_4, > > + GEN_HORIZONTAL_STRIDE_2); > > + } > > + > > static INLINE GenRegister mask(uint32_t subnr) { > > return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr); > > } > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > > b/backend/src/llvm/llvm_gen_backend.cpp > > index c8c5484..b5963ad 100644 > > --- a/backend/src/llvm/llvm_gen_backend.cpp > > +++ b/backend/src/llvm/llvm_gen_backend.cpp > > @@ -2371,8 +2371,8 @@ namespace gbe > > // Scalar is easy. We neednot build register tuples > > if (isScalarType(llvmType) == true) { > > const ir::Type type = getType(ctx, llvmType); > > - if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support > > SIMD16 > > - OCL_SIMD_WIDTH = 8; > > + //if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't > > support SIMD16 > > + // OCL_SIMD_WIDTH = 8; > > const ir::Register values = this->getRegister(llvmValues); > > if (isLoad) > > ctx.LOAD(type, ptr, addrSpace, dwAligned, values); > > -- > > 1.7.9.5 > > > > _______________________________________________ > > Beignet mailing list > > Beignet@lists.freedesktop.org > > http://lists.freedesktop.org/mailman/listinfo/beignet > > _______________________________________________ > > Beignet mailing list > > Beignet@lists.freedesktop.org > > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet