[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Create CU's ports in the standard way
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/32836 ) Change subject: gpu-compute: Create CU's ports in the standard way .. gpu-compute: Create CU's ports in the standard way The CU would initialize its ports in getMasterPort(), which is not desirable as getMasterPort() may be called several times for the same port. This can lead to a fatal if the CU expects to only create a single port of a given type, and may lead to other issues where stat names are duplicated. This change instantiates and initializes the CU's ports in the CU constructor using the CU params. The index field is also removed from the CU's ports because the base class already has an ID field, which will be set to the default value in the base class's constructor for scalar ports. It doesn't make sense for scalar ports to take an index because they are scalar, so we let the base class initialize the ID to the invalid port ID. Change-Id: Id18386f5f53800a6447d968380676d8fd9bac9df Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32836 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/fetch_unit.cc M src/gpu-compute/shader.cc 4 files changed, 99 insertions(+), 126 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 9a41233..2d64fa3 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -96,6 +96,11 @@ resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(this, "ComputeUnit")), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), +ldsPort(csprintf("%s-port", name()), this), +scalarDataPort(csprintf("%s-port", name()), this), +scalarDTLBPort(csprintf("%s-port", name()), 
this), +sqcPort(csprintf("%s-port", name()), this), +sqcTLBPort(csprintf("%s-port", name()), this), _cacheLineSize(p->system->cacheLineSize()), _numBarrierSlots(p->num_barrier_slots), globalSeqNum(0), wavefrontSize(p->wf_size), @@ -169,16 +174,18 @@ fatal("Invalid WF execution policy (CU)\n"); } -memPort.resize(wfSize()); +for (int i = 0; i < p->port_memory_port_connection_count; ++i) { +memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i); +} + +for (int i = 0; i < p->port_translation_port_connection_count; ++i) { +tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i); +} // Setup tokens for slave ports. The number of tokens in memSlaveTokens // is the total token count for the entire vector port (i.e., this CU). memPortTokens = new TokenManager(p->max_cu_tokens); -// resize the tlbPort vectorArray -int tlbPort_width = perLaneTLB ? wfSize() : 1; -tlbPort.resize(tlbPort_width); - registerExitCallback([this]() { exitCallback(); }); lastExecCycle.resize(numVectorALUs, 0); @@ -214,7 +221,6 @@ lastVaddrSimd[j].clear(); } lastVaddrCU.clear(); -delete ldsPort; } int @@ -781,7 +787,7 @@ // appropriate cycle to process the timing memory response // This delay represents the pipeline delay SenderState *sender_state = safe_cast(pkt->senderState); -int index = sender_state->port_index; +PortID index = sender_state->port_index; GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; GPUDispatcher &dispatcher = computeUnit->shader->dispatcher(); @@ -886,7 +892,7 @@ } EventFunctionWrapper *mem_resp_event = -computeUnit->memPort[index]->createMemRespEvent(pkt); +computeUnit->memPort[index].createMemRespEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n", @@ -1007,7 +1013,7 @@ } void -ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) { // There must be a way around this check to do the globalMemStart... 
Addr tmp_vaddr = pkt->req->getVaddr(); @@ -1039,7 +1045,7 @@ tlbCycles -= curTick(); ++tlbRequests; -int tlbPort_index = perLaneTLB ? index : 0; +PortID tlbPort_index = perLaneTLB ? index : 0; if (shader->timingSim) { if (debugSegFault) { @@ -1074,7 +1080,7 @@ pkt->senderState = translation_state; if (functionalTLB) { -tlbPort[tlbPort_index]->sendFunctional(pkt); +tlbPort[tlbPort_index].sendFunctional(pkt); // update the hitLevel distribution int hit_level = translation_state->hitLevel; @@ -1117,33 +1123,33 @@ // translation
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Create CU's ports in the standard way
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/32836 to review the following change. Change subject: gpu-compute: Create CU's ports in the standard way .. gpu-compute: Create CU's ports in the standard way The CU would initialize its ports in getMasterPort(), which is not desirable as getMasterPort() may be called several times for the same port. This can lead to a fatal if the CU expects to only create a single port of a given type, and may lead to other issues where stat names are duplicated. This change instantiates and initializes the CU's ports in the CU constructor using the CU params. The index field is also removed from the CU's ports because the base class already has an ID field, which will be set to the default value in the base class's constructor for scalar ports. It doesn't make sense for scalar ports to take an index because they are scalar, so we let the base class initialize the ID to the invalid port ID. 
Change-Id: Id18386f5f53800a6447d968380676d8fd9bac9df --- M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/fetch_unit.cc 3 files changed, 86 insertions(+), 103 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 7e0947f..b9f7dec 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -96,6 +96,11 @@ resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(this, "ComputeUnit")), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), +ldsPort(csprintf("%s-port", name()), this), +scalarDataPort(csprintf("%s-port", name()), this), +scalarDTLBPort(csprintf("%s-port", name()), this), +sqcPort(csprintf("%s-port", name()), this), +sqcTLBPort(csprintf("%s-port", name()), this), _cacheLineSize(p->system->cacheLineSize()), _numBarrierSlots(p->num_barrier_slots), globalSeqNum(0), wavefrontSize(p->wf_size), @@ -169,16 +174,20 @@ fatal("Invalid WF execution policy (CU)\n"); } -memPort.resize(wfSize()); +for (int i = 0; i < p->port_memory_port_connection_count; ++i) { +memPort.push_back( +new DataPort(csprintf("%s-port%d", name(), i), this, i)); +} + +for (int i = 0; i < p->port_translation_port_connection_count; ++i) { +tlbPort.push_back( +new DTLBPort(csprintf("%s-port%d", name(), i), this, i)); +} // Setup tokens for slave ports. The number of tokens in memSlaveTokens // is the total token count for the entire vector port (i.e., this CU). memPortTokens = new TokenManager(p->max_cu_tokens); -// resize the tlbPort vectorArray -int tlbPort_width = perLaneTLB ? 
wfSize() : 1; -tlbPort.resize(tlbPort_width); - registerExitCallback([this]() { exitCallback(); }); lastExecCycle.resize(numVectorALUs, 0); @@ -214,7 +223,14 @@ lastVaddrSimd[j].clear(); } lastVaddrCU.clear(); -delete ldsPort; + +for (auto mem_port : memPort) { +delete mem_port; +} + +for (auto tlb_port : tlbPort) { +delete tlb_port; +} } int @@ -781,7 +797,7 @@ // appropriate cycle to process the timing memory response // This delay represents the pipeline delay SenderState *sender_state = safe_cast(pkt->senderState); -int index = sender_state->port_index; +PortID index = sender_state->port_index; GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; GPUDispatcher &dispatcher = computeUnit->shader->dispatcher(); @@ -1007,7 +1023,7 @@ } void -ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) { // There must be a way around this check to do the globalMemStart... Addr tmp_vaddr = pkt->req->getVaddr(); @@ -1039,7 +1055,7 @@ tlbCycles -= curTick(); ++tlbRequests; -int tlbPort_index = perLaneTLB ? index : 0; +PortID tlbPort_index = perLaneTLB ? index : 0; if (shader->timingSim) { if (debugSegFault) { @@ -1205,12 +1221,12 @@ new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false, pkt->senderState); -if (scalarDTLBPort->isStalled()) { -assert(scalarDTLBPort->retries.size()); -scalarDTLBPort->retries.push_back(pkt); -} else if (!scalarDTLBPort->sendTimingReq(pkt)) { -scalarDTLBPort->stallPort(); -scalarDTLBPort->retries.push_back(pkt); +if (scalarDTLBPort.isStalled()) { +assert(scalarDTLBPort.retries.size()); +scalarDTLBPort.retries.push_back(pkt); +} else if (!scalarDTLBPort.sendTimingReq(pkt)) { +scalarDTLBPort.stallPort(); +scalarDTLBPort.retries.push_back(pkt)
[gem5-dev] Change in gem5/gem5[develop]: configs: Replace DirMem w/RubyDirectoryMemory, set addr_ranges
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/32674 ) Change subject: configs: Replace DirMem w/RubyDirectoryMemory, set addr_ranges .. configs: Replace DirMem w/RubyDirectoryMemory, set addr_ranges This was originally from the GCN staging branch, which only had GPU_VIPER.py, but the other GPU_VIPER configs had DirMem as well, so I applied this change to all of them. The patch replaces the Directory in DirCntrl from DirMem to RubyDirectoryMemory. This fixes errors that DirMem caused relating to setting class variables. It also generates and sets addr_ranges in DirCntrl as RubyDirectoryMemory uses the parent object's addr_ranges in its code The style checker complained about a line length in GPU_VIPER_Region, so the patch also fixes that Change-Id: Icec96777a51d8a826b576fc752fae0f7f15427bc Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32674 Reviewed-by: Matt Sinclair Reviewed-by: Bradford Beckmann Maintainer: Bradford Beckmann Tested-by: kokoro --- M configs/ruby/GPU_VIPER.py M configs/ruby/GPU_VIPER_Baseline.py M configs/ruby/GPU_VIPER_Region.py 3 files changed, 69 insertions(+), 47 deletions(-) Approvals: Bradford Beckmann: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve kokoro: Regressions pass diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py index 92dcf5e..967b4d3 100644 --- a/configs/ruby/GPU_VIPER.py +++ b/configs/ruby/GPU_VIPER.py @@ -322,24 +322,14 @@ self.probeToL3 = probe_to_l3 self.respToL3 = resp_to_l3 -class DirMem(RubyDirectoryMemory, CntrlBase): -def create(self, options, ruby_system, system): -self.version = self.versionCount() - -phys_mem_size = AddrRange(options.mem_size).size() -mem_module_size = phys_mem_size / options.num_dirs -dir_size = MemorySize('0B') -dir_size.value = mem_module_size -self.size = dir_size - class DirCntrl(Directory_Controller, CntrlBase): -def create(self, options, 
ruby_system, system): +def create(self, options, dir_ranges, ruby_system, system): self.version = self.versionCount() self.response_latency = 30 -self.directory = DirMem() -self.directory.create(options, ruby_system, system) +self.addr_ranges = dir_ranges +self.directory = RubyDirectoryMemory() self.L3CacheMemory = L3Cache() self.L3CacheMemory.create(options, ruby_system, system) @@ -441,6 +431,17 @@ # Clusters crossbar_bw = None mainCluster = None + +if options.numa_high_bit: +numa_bit = options.numa_high_bit +else: +# if the numa_bit is not specified, set the directory bits as the +# lowest bits above the block offset bits, and the numa_bit as the +# highest of those directory bits +dir_bits = int(math.log(options.num_dirs, 2)) +block_size_bits = int(math.log(options.cacheline_size, 2)) +numa_bit = block_size_bits + dir_bits - 1 + if hasattr(options, 'bw_scalor') and options.bw_scalor > 0: #Assuming a 2GHz clock crossbar_bw = 16 * options.num_compute_units * options.bw_scalor @@ -448,9 +449,16 @@ else: mainCluster = Cluster(intBW=8) # 16 GB/s for i in range(options.num_dirs): +dir_ranges = [] +for r in system.mem_ranges: +addr_range = m5.objects.AddrRange(r.start, size = r.size(), + intlvHighBit = numa_bit, + intlvBits = dir_bits, + intlvMatch = i) +dir_ranges.append(addr_range) dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits) -dir_cntrl.create(options, ruby_system, system) +dir_cntrl.create(options, dir_ranges, ruby_system, system) dir_cntrl.number_of_TBEs = options.num_tbes dir_cntrl.useL3OnWT = options.use_L3_on_WT # the number_of_TBEs is inclusive of TBEs below diff --git a/configs/ruby/GPU_VIPER_Baseline.py b/configs/ruby/GPU_VIPER_Baseline.py index 5388a4e..5a3 100644 --- a/configs/ruby/GPU_VIPER_Baseline.py +++ b/configs/ruby/GPU_VIPER_Baseline.py @@ -301,22 +301,12 @@ self.probeToL3 = probe_to_l3 self.respToL3 = resp_to_l3 -class DirMem(RubyDirectoryMemory, CntrlBase): -def create(self, options, ruby_system, system): -self.version = 
self.versionCount() - -phys_mem_size = AddrRange(options.mem_size).size() -mem_module_size = phys_mem_size / options.num_dirs -dir_size = MemorySize('0B') -dir_size.value = mem_module_size -self.size = dir_size - class DirCntrl(Directory_Controller, CntrlBase): -def create(self, options, ruby_system, system): +def c
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: Change how waitcnts are implemented
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29973 ) Change subject: gpu-compute, arch-gcn3: Change how waitcnts are implemented .. gpu-compute, arch-gcn3: Change how waitcnts are implemented Use single counters per memory operation type and increment them upon issue, not execute. Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29973 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc M src/gpu-compute/global_memory_pipeline.cc M src/gpu-compute/gpu_dyn_inst.cc M src/gpu-compute/local_memory_pipeline.cc M src/gpu-compute/scalar_memory_pipeline.cc M src/gpu-compute/schedule_stage.cc M src/gpu-compute/wavefront.cc M src/gpu-compute/wavefront.hh 8 files changed, 106 insertions(+), 18 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 9987fad..7c2cf0e 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32565,6 +32565,7 @@ vdst.write(); +wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); } // execute @@ -32635,6 +32636,7 @@ vdst.write(); +wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); } // execute @@ -39400,6 +39402,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39496,6 +39500,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39592,6 +39598,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); 
+wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39660,6 +39668,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39728,6 +39738,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39805,6 +39817,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; } @@ -39884,6 +39898,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -39952,6 +39968,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40021,6 +40039,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40090,6 +40110,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40159,6 +40181,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40237,6 +40261,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--;
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add case to op selector when operand is vcc_hi
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29971 ) Change subject: arch-gcn3: Add case to op selector when operand is vcc_hi .. arch-gcn3: Add case to op selector when operand is vcc_hi Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29971 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/registers.cc 1 file changed, 2 insertions(+), 0 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/registers.cc b/src/arch/gcn3/registers.cc index 016160f..d5c4903 100644 --- a/src/arch/gcn3/registers.cc +++ b/src/arch/gcn3/registers.cc @@ -141,6 +141,8 @@ * */ regIdx = numScalarRegs - 2; +} else if (idx == REG_VCC_HI) { +regIdx = numScalarRegs - 1; } else if (idx == REG_FLAT_SCRATCH_LO) { /** * the FLAT_SCRATCH register occupies the two SRF entries -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29971 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36 Gerrit-Change-Number: 29971 Gerrit-PatchSet: 8 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Use refs to CU in pipe stages/mem pipes
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29969 ) Change subject: gpu-compute: Use refs to CU in pipe stages/mem pipes .. gpu-compute: Use refs to CU in pipe stages/mem pipes The pipe stages and memory pipes are changed to store a reference to their parent CU as opposed to a pointer. These objects will never change which CU they belong to, and they are constructed by their parent CU. Change-Id: Ie5476e1e2e124a024c2efebceb28cb3a9baa78c1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29969 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/gpu-compute/compute_unit.cc M src/gpu-compute/exec_stage.cc M src/gpu-compute/exec_stage.hh M src/gpu-compute/fetch_stage.cc M src/gpu-compute/fetch_stage.hh M src/gpu-compute/fetch_unit.cc M src/gpu-compute/fetch_unit.hh M src/gpu-compute/global_memory_pipeline.cc M src/gpu-compute/global_memory_pipeline.hh M src/gpu-compute/local_memory_pipeline.cc M src/gpu-compute/local_memory_pipeline.hh M src/gpu-compute/scalar_memory_pipeline.cc M src/gpu-compute/scalar_memory_pipeline.hh M src/gpu-compute/schedule_stage.cc M src/gpu-compute/schedule_stage.hh M src/gpu-compute/scoreboard_check_stage.cc M src/gpu-compute/scoreboard_check_stage.hh 17 files changed, 193 insertions(+), 193 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 653c074..a59a7fd 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -67,13 +67,13 @@ vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), registerManager(p->register_manager), -fetchStage(p, this), -scoreboardCheckStage(p, this), -scheduleStage(p, this), -execStage(p, this), -globalMemoryPipe(p, this), -localMemoryPipe(p, this), -scalarMemoryPipe(p, 
this), +fetchStage(p, *this), +scoreboardCheckStage(p, *this), +scheduleStage(p, *this), +execStage(p, *this), +globalMemoryPipe(p, *this), +localMemoryPipe(p, *this), +scalarMemoryPipe(p, *this), tickEvent([this]{ exec(); }, "Compute unit tick event", false, Event::CPU_Tick_Pri), cu_id(p->cu_id), diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index e420579..2b0a797 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -41,10 +41,10 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu) +ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu) : computeUnit(cu), lastTimeInstExecuted(false), thisTimeInstExecuted(false), instrExecuted (false), - executionResourcesUsed(0), _name(cu->name() + ".ExecStage") + executionResourcesUsed(0), _name(cu.name() + ".ExecStage") { numTransActiveIdle = 0; @@ -54,7 +54,7 @@ void ExecStage::init() { -dispatchList = &computeUnit->dispatchList; +dispatchList = &computeUnit.dispatchList; idle_dur = 0; } @@ -127,7 +127,7 @@ { std::stringstream ss; bool empty = true; -for (int i = 0; i < computeUnit->numExeUnits(); i++) { +for (int i = 0; i < computeUnit.numExeUnits(); i++) { DISPATCH_STATUS s = dispatchList->at(i).second; ss << i << ": " << dispStatusToStr(s); if (s != EMPTY) { @@ -151,7 +151,7 @@ if (Debug::GPUSched) { dumpDispList(); } -for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { +for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { DISPATCH_STATUS s = dispatchList->at(unitId).second; switch (s) { case EMPTY: @@ -168,7 +168,7 @@ (w->instructionBuffer.front())->disassemble()); DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId); dispatchList->at(unitId).first->exec(); -(computeUnit->scheduleStage).deleteFromSch(w); +(computeUnit.scheduleStage).deleteFromSch(w); dispatchList->at(unitId).second = EMPTY; 
dispatchList->at(unitId).first->freeResources(); dispatchList->at(unitId).first = nullptr; @@ -208,7 +208,7 @@ ; spc -.init(0, computeUnit->numExeUnits(), 1) +.init(0, computeUnit.numExeUnits(), 1) .name(name() + ".spc") .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") ; @@ -220,26 +220,26 @@ ; numCyclesWithInstrTypeIssued -.init(computeUnit->numExeUnits()) +.init(computeUnit.numExeUnits()) .name(name() + ".num_cycles_issue_exec_rsrc") .desc("Number of cycle
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: No RF scheduling in case of SKIP or EMPTY
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29970 ) Change subject: gpu-compute: No RF scheduling in case of SKIP or EMPTY .. gpu-compute: No RF scheduling in case of SKIP or EMPTY In case of flat memory instructions the status for the LM pipe execution unit is set to SKIP or EMPTY, as the bus between the VRF and the GM and LM pipe is shared. The destination operands should not be scheduled for the LM pipe, even if the wave is in the dispatch list. This can lead to deadlock in the destination cache as DCEs are reused and the slotsAvailableForBank count gets artificially incremented. Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29970 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/gpu-compute/schedule_stage.cc 1 file changed, 5 insertions(+), 1 deletion(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 0785aa0..e0600a6 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -236,9 +236,13 @@ ScheduleStage::scheduleRfDestOperands() { for (int j = 0; j < computeUnit.numExeUnits(); ++j) { -if (!dispatchList->at(j).first) { +if (dispatchList->at(j).second == EMPTY || +dispatchList->at(j).second == SKIP) { continue; } + +assert(dispatchList->at(j).first); + // get the wave on dispatch list and attempt to allocate write // resources in the RFs Wavefront *w = dispatchList->at(j).first; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29970 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd Gerrit-Change-Number: 29970 
Gerrit-PatchSet: 9 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Alexandru Duțu Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Replace some instances of std::isnormal with std::fpclassify
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29967 ) Change subject: arch-gcn3: Replace some instances of std::isnormal with std::fpclassify .. arch-gcn3: Replace some instances of std::isnormal with std::fpclassify Affected instructions: V_DIV_SCALE_F64, V_CMP_CLASS_F64, V_CMPX_CLASS_F64 and their VOPC, VOP3, F32 variants. These instances of std::isnormal were being used to check for subnormal (denorms) values. std::isnormal is not specific enough. It returns true for normal values but false for NaN, Inf, 0.0, and subnormals. std::fpclassify returns macros for each category of floating point numbers. Now we only catch subnormals. Change-Id: I8d8f4452ff58de71e7c8e0b2b5e73467b532e196 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29967 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 23 insertions(+), 21 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 302dad4..9987fad 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -9439,7 +9439,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9463,7 +9463,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9551,7 +9551,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9575,7 +9575,7 @@ } if (bits(src1[lane], 7)) { // is 
+denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9664,7 +9664,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9688,7 +9688,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9777,7 +9777,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9801,7 +9801,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -15550,7 +15550,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { sdst.setBit(lane, 1); continue; @@ -15574,7 +15574,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix stride bug in buffer OOB detection logic
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29968 ) Change subject: arch-gcn3: Fix stride bug in buffer OOB detection logic .. arch-gcn3: Fix stride bug in buffer OOB detection logic The out-of-range logic for buffer accesses is missing the top 4 bits of const_stride when dealing with scratch buffers. This can cause perfectly valid scratch accesses to be suppressed when const_stride is large. Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29968 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/op_encodings.hh 1 file changed, 3 insertions(+), 3 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 202dd1d..b35fb3d 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -651,7 +651,7 @@ * non-formatted accesses, this is done on a per-lane * basis. 
*/ -if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) { +if (stride == 0 || !rsrc_desc.swizzleEn) { if (buf_off + stride * buf_idx >= rsrc_desc.numRecords - s_offset.rawData()) { DPRINTF(GCN3, "mubuf out-of-bounds condition 1: " @@ -659,13 +659,13 @@ "const_stride = %llx, " "const_num_records = %llx\n", lane, buf_off + stride * buf_idx, -rsrc_desc.stride, rsrc_desc.numRecords); +stride, rsrc_desc.numRecords); oobMask.set(lane); continue; } } -if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) { +if (stride != 0 && rsrc_desc.swizzleEn) { if (buf_idx >= rsrc_desc.numRecords || buf_off >= stride) { DPRINTF(GCN3, "mubuf out-of-bounds condition 2: " -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29968 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e Gerrit-Change-Number: 29968 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix VOP3 V_LDEXP_F64
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29966 ) Change subject: arch-gcn3: Fix VOP3 V_LDEXP_F64 .. arch-gcn3: Fix VOP3 V_LDEXP_F64 Replaced !std::isnormal with std::fpclassify because std::isnormal is not specific enough. !std::isnormal was incorrectly catching NaN, Inf, 0.0, and subnormals (aka denormals), whereas it was only supposed to catch subnormals. The return value and error handling spec of std::ldexp listed on cppreference.com appears to match up in nearly all cases after making these changes. If std::ldexp handled subnormals as described in the GCN3 2016 guide, we could have used vdst[lane] = std::ldexp and not needed to check for any corner cases. Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29966 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 4 insertions(+), 3 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 2b992b1..302dad4 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -30282,10 +30282,11 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -if (std::isnan(src1[lane]) || std::isinf(src1[lane])) { +if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { vdst[lane] = src0[lane]; -} else if (!std::isnormal(src1[lane])) { -if (std::signbit(src1[lane])) { +} else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + || std::fpclassify(src0[lane]) == FP_ZERO) { +if (std::signbit(src0[lane])) { vdst[lane] = -0.0; } else { vdst[lane] = +0.0; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29966 To unsubscribe, or for help writing mail filters, visit 
https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441 Gerrit-Change-Number: 29966 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29964 ) Change subject: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32 .. arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32 roundNearestEven is an inst_util function that RNDNE_F64 and F32 call, including both VOP1 and VOP3 formats. IEEE 754 spec says this function should round inputs to the nearest integer but round ties to the nearest even integer. Prior to this patch it was rounding all inputs to nearest even, not just the ties. It was probably implemented this way originally because the language in the ISA manual is ambiguous although it provided the correct logic. Fixed roundNearestEven to use the semantics originally described in the GCN3 ISA manual. Change-Id: I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29964 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/inst_util.hh 1 file changed, 7 insertions(+), 1 deletion(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh index b40e890..15ffe9a 100644 --- a/src/arch/gcn3/insts/inst_util.hh +++ b/src/arch/gcn3/insts/inst_util.hh @@ -258,7 +258,13 @@ template inline T roundNearestEven(T val) { -T nearest_round = std::round(val * 0.5) * 2.0; +T int_part = 0; +T nearest_round = std::floor(val + 0.5); +if ((int)std::floor(val) % 2 == 0 +&& std::modf(std::abs(val), &int_part) == 0.5) { + nearest_round = nearest_round - 1; +} + return nearest_round; } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29964 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: 
I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7 Gerrit-Change-Number: 29964 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fix Y-dimension ABI decode
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29965 ) Change subject: gpu-compute: Fix Y-dimension ABI decode .. gpu-compute: Fix Y-dimension ABI decode We currently have a bug in decoding workitem ID from the kernel descriptor with multiple dimensions. The enable_vgpr_workitem_id bits are currently separated into y and z components, when they should be treated as a single 2-bit value, where y is enabled when it is > 0, and z is enabled when it is > 1. The current setup allows a kernel launch with vgprs reserved for the z dimension and not the y dimension, which is incorrect. Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29965 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/gpu-compute/hsa_queue_entry.hh M src/gpu-compute/kernel_code.hh 2 files changed, 3 insertions(+), 4 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index 5fc5e56..ea79869 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -417,8 +417,8 @@ * workitem Id in the X dimension is always initialized. 
*/ initialVgprState.set(WorkitemIdX, true); -initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y); -initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z); +initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0); +initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1); } // name of the kernel associated with the AQL entry diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh index b3560c7..680dd72 100644 --- a/src/gpu-compute/kernel_code.hh +++ b/src/gpu-compute/kernel_code.hh @@ -130,8 +130,7 @@ uint32_t enable_sgpr_workgroup_id_y : 1; uint32_t enable_sgpr_workgroup_id_z : 1; uint32_t enable_sgpr_workgroup_info : 1; -uint32_t enable_vgpr_workitem_id_y : 1; -uint32_t enable_vgpr_workitem_id_z : 1; +uint32_t enable_vgpr_workitem_id : 2; uint32_t enable_exception_address_watch : 1; uint32_t enable_exception_memory_violation : 1; uint32_t granulated_lds_size : 9; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29965 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323 Gerrit-Change-Number: 29965 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Don't track vector store insts in CU's headTailMap
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29963 ) Change subject: gpu-compute: Don't track vector store insts in CU's headTailMap .. gpu-compute: Don't track vector store insts in CU's headTailMap This change fixes a memory leak due to live GPUDynInstPtr references to vector store insts being stored in the CU's headTailMap and never released. This happened because store insts are not supposed to have their head-tail latencies tracked by the headTailMap; instead they use timing information from the GPUCoalescer. When updating the headTailLatency stat via the headTailMap, only loads were considered and removed from the headTailMap, however when inserting into the headTailMap loads and stores were considered, thus leading to the memory leak. This change fixes the issue by only adding loads to the headTailMap. Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29963 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/gpu-compute/compute_unit.cc 1 file changed, 5 insertions(+), 3 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index f3387a7..653c074 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1389,9 +1389,11 @@ gpuDynInst->wfSlotId); } } else { -if (!compute_unit->headTailMap.count(gpuDynInst)) { -compute_unit->headTailMap.insert( -std::make_pair(gpuDynInst, curTick())); +if (pkt->isRead()) { +if (!compute_unit->headTailMap.count(gpuDynInst)) { +compute_unit->headTailMap +.insert(std::make_pair(gpuDynInst, curTick())); +} } } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29963 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings 
Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1 Gerrit-Change-Number: 29963 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add all s_buffer_load_dword instructions
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29962 ) Change subject: arch-gcn3: add all s_buffer_load_dword instructions .. arch-gcn3: add all s_buffer_load_dword instructions Adds the other s_buffer_load_dword* instruction implementations to f134a84. Change-Id: I8d97527278900dc68c32463ea1824409ccd04e1d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29962 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 125 insertions(+), 8 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 002c4d5..2b992b1 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -4737,17 +4737,46 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + +if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDynInst, rsrcDesc, offset); + +gpuDynInst->computeUnit()->scalarMemoryPipe +.getGMReqFIFO().push(gpuDynInst); + +wf->scalarRdGmReqsInPipe--; +wf->scalarOutstandingReqsRdGm++; +gpuDynInst->wavefront()->outstandingReqs++; +gpuDynInst->wavefront()->validateRequestCounters(); +} // execute void Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<1>(gpuDynInst); } // initiateAcc void 
Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) { +// 1 request, size 32 +ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); +sdst.write(); } // completeAcc Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( @@ -4767,17 +4796,46 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + +if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDynInst, rsrcDesc, offset); + +gpuDynInst->computeUnit()->scalarMemoryPipe +.getGMReqFIFO().push(gpuDynInst); + +wf->scalarRdGmReqsInPipe--; +wf->scalarOutstandingReqsRdGm++; +gpuDynInst->wavefront()->outstandingReqs++; +gpuDynInst->wavefront()->validateRequestCounters(); +} // execute void Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<2>(gpuDynInst); } // initiateAcc void Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) { +// use U64 because 2 requests, each size 32 +ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); +sdst.write(); } // completeAcc Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( @@ -4797,17 +4855,46 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + 
+if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDyn
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add memcpy condition when writing EXEC_LO
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29961 ) Change subject: arch-gcn3: Add memcpy condition when writing EXEC_LO .. arch-gcn3: Add memcpy condition when writing EXEC_LO Some compilers emit an error on the operand template class when writing exec mask. Add a condition to explicitly set memcpy size argument to 32b or 64b based on the number of dwords. Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29961 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/operand.hh 1 file changed, 9 insertions(+), 2 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve kokoro: Regressions pass diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index 9d28deb..97c6310 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -437,8 +437,15 @@ if (_opIdx == REG_EXEC_LO) { ScalarRegU64 new_exec_mask_val = wf->execMask().to_ullong(); -std::memcpy((void*)&new_exec_mask_val, -(void*)srfData.data(), sizeof(srfData)); +if (NumDwords == 1) { +std::memcpy((void*)&new_exec_mask_val, +(void*)srfData.data(), sizeof(VecElemU32)); +} else if (NumDwords == 2) { +std::memcpy((void*)&new_exec_mask_val, +(void*)srfData.data(), sizeof(VecElemU64)); +} else { +panic("Trying to write more than 2 DWORDS to EXEC\n"); +} VectorMask new_exec_mask(new_exec_mask_val); wf->execMask() = new_exec_mask; DPRINTF(GPUSRF, "Write EXEC\n"); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29961 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624 Gerrit-Change-Number: 29961 Gerrit-PatchSet: 8 
Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Matthew Poremba Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Remove invalid assert when reading EXEC_LO
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29960 ) Change subject: arch-gcn3: Remove invalid assert when reading EXEC_LO .. arch-gcn3: Remove invalid assert when reading EXEC_LO This assert assumed all reads to EXEC_LO would be 64b, that is, we would always read the entire EXEC mask. This is invalid as some kernels read only the low 32b of EXEC. The write to EXEC_LO is also updated to handle 32b writes. Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29960 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/operand.hh 1 file changed, 3 insertions(+), 3 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index 960d05e..9d28deb 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -435,9 +435,10 @@ if (!isScalarReg(_opIdx)) { if (_opIdx == REG_EXEC_LO) { -ScalarRegU64 new_exec_mask_val(0); +ScalarRegU64 new_exec_mask_val += wf->execMask().to_ullong(); std::memcpy((void*)&new_exec_mask_val, -(void*)srfData.data(), sizeof(new_exec_mask_val)); +(void*)srfData.data(), sizeof(srfData)); VectorMask new_exec_mask(new_exec_mask_val); wf->execMask() = new_exec_mask; DPRINTF(GPUSRF, "Write EXEC\n"); @@ -513,7 +514,6 @@ switch(_opIdx) { case REG_EXEC_LO: { -assert(NumDwords == 2); ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> execMask().to_ullong(); std::memcpy((void*)srfData.data(), (void*)&exec_mask, -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29960 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358 Gerrit-Change-Number: 29960 Gerrit-PatchSet: 8 
Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu_compute: Support loading BLIT kernels
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29959 ) Change subject: gpu_compute: Support loading BLIT kernels .. gpu_compute: Support loading BLIT kernels The BLIT kernels used to implement DMA through the shaders don't fill out all of the standard fields in an amd_kernel_code_t object. This patch modifies the code object parsing logic to support these new kernels. BLIT kernels are used in APUs when using ROCm memcopies for certain size buffers, and are used for dGPUs when the SDMA engines are disabled. Change-Id: Id4e667474d05e311097dbec443def07dfad14a79 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29959 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/gpu-compute/gpu_command_processor.cc M src/gpu-compute/hsa_queue_entry.hh 2 files changed, 31 insertions(+), 4 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index b6205ac..fccc035 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -100,11 +100,25 @@ machine_code_addr); Addr kern_name_addr(0); -virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10, -(uint8_t*)&kern_name_addr, 0x8); - std::string kernel_name; -virt_proxy.readString(kernel_name, kern_name_addr); + +/** + * BLIT kernels don't have symbol names. BLIT kernels are built-in compute + * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA + * hardware engines are unavailable or explicitly disabled. They can also + * be used to do copies that ROCm things would be better performed + * by the shader than the SDMA engines. They are also sometimes used on + * APUs to implement asynchronous memcopy operations from 2 pointers in + * host memory. I have no idea what BLIT stands for. 
+ * */ +if (akc.runtime_loader_kernel_symbol) { +virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10, +(uint8_t*)&kern_name_addr, 0x8); + +virt_proxy.readString(kernel_name, kern_name_addr); +} else { +kernel_name = "Blit kernel"; +} DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str()); diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index a6917db..5fc5e56 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -88,6 +88,19 @@ _globalWgId(0), dispatchComplete(false) { +// Precompiled BLIT kernels actually violate the spec a bit +// and don't set many of the required akc fields. For these kernels, +// we need to rip register usage from the resource registers. +// +// We can't get an exact number of registers from the resource +// registers because they round, but we can get an upper bound on it +if (!numVgprs) +numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; + +// TODO: Granularity changes for GFX9! +if (!numSgprs) +numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8; + initialVgprState.reset(); initialSgprState.reset(); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29959 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Id4e667474d05e311097dbec443def07dfad14a79 Gerrit-Change-Number: 29959 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29958 ) Change subject: arch-gcn3: Implement ds_swizzle .. arch-gcn3: Implement ds_swizzle Change-Id: I7d188388afa16932217ae207368666a724207c52 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29958 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 102 insertions(+), 2 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 71efd8f..002c4d5 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32266,6 +32266,7 @@ Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_swizzle_b32") { + setFlag(Load); } // Inst_DS__DS_SWIZZLE_B32 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() @@ -32277,8 +32278,107 @@ void Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +wf->rdLmReqsInPipe--; +wf->validateRequestCounters(); + +if (gpuDynInst->exec_mask.none()) { +return; +} + +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); + +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.VDST); +/** + * The "DS pattern" is comprised of both offset fields. That is, the + * swizzle pattern between lanes. Bit 15 of the DS pattern dictates + * which swizzle mode to use. There are two different swizzle + * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use + * QDMode else use Bit-masks mode. The remaining bits dictate how to + * swizzle the lanes. 
+ * + * QDMode: Chunks the lanes into 4s and swizzles among them. + * Bits 7:6 dictate where lane 3 (of the current chunk) + * gets its date, 5:4 lane 2, etc. + * + * Bit-mask:This mode breaks bits 14:0 into 3 equal-sized chunks. + * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 + * is the and_mask. Each lane is swizzled by performing + * the appropriate operation using these masks. + */ +VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); + +data.read(); + +if (bits(ds_pattern, 15)) { +// QDMode +for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { +/** + * This operation allows data sharing between groups + * of four consecutive threads. Note the increment by + * 4 in the for loop. + */ +if (gpuDynInst->exec_mask[lane]) { +int index0 = lane + bits(ds_pattern, 1, 0); +panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index0); +vdst[lane] += gpuDynInst->exec_mask[index0] ? data[index0]: 0; +} +if (gpuDynInst->exec_mask[lane + 1]) { +int index1 = lane + bits(ds_pattern, 3, 2); +panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index1); +vdst[lane + 1] += gpuDynInst->exec_mask[index1] ? data[index1]: 0; +} +if (gpuDynInst->exec_mask[lane + 2]) { +int index2 = lane + bits(ds_pattern, 5, 4); +panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index2); +vdst[lane + 2] += gpuDynInst->exec_mask[index2] ? data[index2]: 0; +} +if (gpuDynInst->exec_mask[lane + 3]) { +int index3 = lane + bits(ds_pattern, 7, 6); +panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index3); +
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement s_buffer_load_dwordx16
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29957 ) Change subject: arch-gcn3: Implement s_buffer_load_dwordx16 .. arch-gcn3: Implement s_buffer_load_dwordx16 Change-Id: I25382dcae9bb55eaf035385fa925157f25d39c20 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29957 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc M src/arch/gcn3/insts/op_encodings.hh 2 files changed, 90 insertions(+), 31 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 567cc10..71efd8f 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -4857,17 +4857,45 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + +if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDynInst, rsrcDesc, offset); + +gpuDynInst->computeUnit()->scalarMemoryPipe +.getGMReqFIFO().push(gpuDynInst); + +wf->scalarRdGmReqsInPipe--; +wf->scalarOutstandingReqsRdGm++; +gpuDynInst->wavefront()->outstandingReqs++; +gpuDynInst->wavefront()->validateRequestCounters(); +} // execute void Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<16>(gpuDynInst); } // initiateAcc void Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) { 
+ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); +sdst.write(); } // completeAcc Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 4056f0a..202dd1d 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -46,6 +46,29 @@ namespace Gcn3ISA { +struct BufferRsrcDescriptor +{ +uint64_t baseAddr : 48; +uint32_t stride : 14; +uint32_t cacheSwizzle : 1; +uint32_t swizzleEn : 1; +uint32_t numRecords : 32; +uint32_t dstSelX : 3; +uint32_t dstSelY : 3; +uint32_t dstSelZ : 3; +uint32_t dstSelW : 3; +uint32_t numFmt : 3; +uint32_t dataFmt : 4; +uint32_t elemSize : 2; +uint32_t idxStride : 2; +uint32_t addTidEn : 1; +uint32_t atc : 1; +uint32_t hashEn : 1; +uint32_t heap : 1; +uint32_t mType : 3; +uint32_t type : 2; +}; + // --- purely virtual instruction classes --- class Inst_SOP2 : public GCN3GPUStaticInst @@ -197,14 +220,45 @@ MemCmd::WriteReq); } +/** + * For normal s_load_dword/s_store_dword instruction addresses. + */ void -calcAddr(GPUDynInstPtr gpuDynInst, ConstScalarOperandU64 &addr, -ScalarRegU32 offset) +calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr, + ScalarRegU32 offset) { -Addr vaddr = addr.rawData(); -vaddr += offset; -vaddr &= ~0x3; -gpuDynInst->scalarAddr = vaddr; +Addr vaddr = ((addr.rawData() + offset) & ~0x3); +gpu_dyn_inst->scalarAddr = vaddr; +} + +/** + * For s_buffer_load_dword/s_buffer_store_dword instruction addresses. + * The s_buffer instructions use the same buffer resource descriptor + * as the MUBUF instructions. 
+ */ +void +calcAddr(GPUDynInstPtr gpu_dyn_inst, + ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset) +{ +BufferRsrcDescriptor rsrc_desc; +ScalarRegU32 clamped_offset(offset); +std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(), +sizeof(BufferRsrcDescriptor)); + +/** + * The address is clamped if: + * Stride is zero: clamp if offset >= num_records + * Stride is non-zer
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fixup DIV instructions
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29956 ) Change subject: arch-gcn3: Fixup DIV instructions .. arch-gcn3: Fixup DIV instructions Adds support to handle the special cases for GCN3 DIV instructions. Change-Id: I18f91870e802407c93831f313ce76be053bc4230 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29956 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 44 insertions(+), 42 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index a25ec17..567cc10 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -28952,34 +28952,35 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -int signOut = std::signbit(src1[lane]) ^ - std::signbit(src2[lane]); -int exp1, exp2; -std::frexp(src1[lane],&exp1); -std::frexp(src2[lane],&exp2); -if (std::isnan(src2[lane])) { -vdst[lane] = src2[lane]; -} else if (std::isnan(src1[lane])) { -vdst[lane] = src1[lane]; -} else if (src1[lane] == 0.0 && src2[lane] == 0.0) { -vdst[lane] = -NAN; +int sign_out = std::signbit(src1[lane]) + ^ std::signbit(src2[lane]); +int exp1(0); +int exp2(0); +std::frexp(src1[lane], &exp1); +std::frexp(src2[lane], &exp2); + +if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { +vdst[lane] = std::numeric_limits::quiet_NaN(); +} else if (std::fpclassify(src1[lane]) == FP_ZERO + && std::fpclassify(src2[lane]) == FP_ZERO) { +vdst[lane] += std::numeric_limits::signaling_NaN(); } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { -vdst[lane] = -NAN; -} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) { -vdst[lane] = signOut ? 
-INFINITY : +INFINITY; -} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) { -vdst[lane] = signOut ? -0.0 : +0.0; +vdst[lane] += std::numeric_limits::signaling_NaN(); +} else if (std::fpclassify(src1[lane]) == FP_ZERO + || std::isinf(src2[lane])) { +vdst[lane] = sign_out ? -INFINITY : +INFINITY; +} else if (std::isinf(src1[lane]) + || std::fpclassify(src2[lane]) == FP_ZERO) { +vdst[lane] = sign_out ? -0.0 : +0.0; } else if (exp2 - exp1 < -1075) { -warn_once("fixup_f64 unimplemented case:" - "exp2 - ex1 < -1075"); vdst[lane] = src0[lane]; } else if (exp1 == 2047) { -warn_once("fixup_f64 unimplemented case:" - "exp1 == 2047"); vdst[lane] = src0[lane]; } else { -vdst[lane] = ((uint64_t)signOut<<63) | -((uint64_t)src0[lane] & 0x7fffULL); +vdst[lane] = sign_out ? -std::fabs(src0[lane]) +: std::fabs(src0[lane]); } } } @@ -29089,36 +29090,37 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -int exp1, exp2; -std::frexp(src1[lane],&exp1); -std::frexp(src2[lane],&exp2); +int exp1(0); +int exp2(0); +std::frexp(src1[lane], &exp1); +std::frexp(src2[lane], &exp2); vcc.setBit(lane, 0); -if (src2[lane] == 0 || src1[lane] == 0) { + +if (std::fpclassify(src1[lane]) == FP_ZERO +|| std::fpclassify(src2[lane]) == FP_ZERO) { vdst[lane] = NAN; } else if (exp2 - exp1 >= 768) { vcc.setBit(lane, 1); if (src0[lane] == src1[lane]) { -vdst[lane] = std::ldexp(src0[lane],128); +vdst[lane] = std::ldexp(src0[lane], 128); } -} else if (exp1 == 0) { -
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add handling for Inf/overflow in CVT insts
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29953 ) Change subject: arch-gcn3: Add handling for Inf/overflow in CVT insts .. arch-gcn3: Add handling for Inf/overflow in CVT insts Change-Id: I0fddffdeaebd9f45fe89f44d536f80a43de63ff5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29953 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 77 insertions(+), 1 deletion(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index e93278a..a7b8923 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -7260,8 +7260,16 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane]) || exp > 30) { +if (std::signbit(src[lane])) { +vdst[lane] = INT_MIN; +} else { +vdst[lane] = INT_MAX; +} } else { vdst[lane] = (VecElemI32)src[lane]; } @@ -7386,8 +7394,18 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane])) { +if (std::signbit(src[lane])) { +vdst[lane] = 0; +} else { +vdst[lane] = UINT_MAX; +} +} else if (exp > 31) { +vdst[lane] = UINT_MAX; } else { vdst[lane] = (VecElemU32)src[lane]; } @@ -7422,8 +7440,16 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane]) || exp > 30) { +if (std::signbit(src[lane])) { +vdst[lane] = INT_MIN; +} else { +vdst[lane] = INT_MAX; +} } else { vdst[lane] = 
(VecElemI32)src[lane]; } @@ -7772,8 +7798,18 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane])) { +if (std::signbit(src[lane])) { +vdst[lane] = 0; +} else { +vdst[lane] = UINT_MAX; +} +} else if (exp > 31) { +vdst[lane] = UINT_MAX; } else { vdst[lane] = (VecElemU32)src[lane]; } @@ -25075,8 +25111,16 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane]) || exp > 30) { +if (std::signbit(src[lane])) { +vdst[lane] = INT_MIN; +} else { +vdst[lane] = INT_MAX; +} } else { vdst[lane] = (VecElemI32)src[lane]; } @@ -25235,8 +25279,18 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane])) { +if (std::signbit(src[lane])) { +vdst[lane] = 0; +} else { +vdst[lane] = UINT_MAX; +} +} else if (exp > 31) { +vdst[lane] = UINT_MAX; } else { vdst[lane] = (V
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix s_getpc operand information
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29954 ) Change subject: arch-gcn3: Fix s_getpc operand information .. arch-gcn3: Fix s_getpc operand information s_getpc was previously reporting only a single operand, and was only considering the SSRC operand. However, this instruction's source is implicitly the PC. Because its destination register was never tracked for dependence checking purposes, dependence violations are possible. Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29954 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.hh M src/arch/gcn3/insts/op_encodings.cc 2 files changed, 15 insertions(+), 10 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.hh b/src/arch/gcn3/insts/instructions.hh index b0cc37e..f561043 100644 --- a/src/arch/gcn3/insts/instructions.hh +++ b/src/arch/gcn3/insts/instructions.hh @@ -5846,9 +5846,7 @@ getOperandSize(int opIdx) override { switch (opIdx) { - case 0: //ssrc -return 8; - case 1: //sdst + case 0: //sdst return 8; default: fatal("op idx %i out of bounds\n", opIdx); @@ -5860,9 +5858,7 @@ isSrcOperand(int opIdx) override { switch (opIdx) { - case 0: //ssrc -return true; - case 1: //sdst + case 0: //sdst return false; default: fatal("op idx %i out of bounds\n", opIdx); @@ -5874,9 +5870,7 @@ isDstOperand(int opIdx) override { switch (opIdx) { - case 0: //ssrc -return false; - case 1: //sdst + case 0: //sdst return true; default: fatal("op idx %i out of bounds\n", opIdx); diff --git a/src/arch/gcn3/insts/op_encodings.cc b/src/arch/gcn3/insts/op_encodings.cc index 22d0f48..997b22f 100644 --- a/src/arch/gcn3/insts/op_encodings.cc +++ b/src/arch/gcn3/insts/op_encodings.cc @@ -326,7 +326,12 @@ switch (opIdx) { case 
0: - return isScalarReg(instData.SSRC0); +if (instData.OP == 0x1C) { +// Special case for s_getpc, which has no source reg. +// Instead, it implicitly reads the PC. +return isScalarReg(instData.SDST); +} +return isScalarReg(instData.SSRC0); case 1: return isScalarReg(instData.SDST); default: @@ -353,6 +358,12 @@ switch (opIdx) { case 0: +if (instData.OP == 0x1C) { +// Special case for s_getpc, which has no source reg. +// Instead, it implicitly reads the PC. +return opSelectorToRegIdx(instData.SDST, +gpuDynInst->wavefront()->reservedScalarRegs); +} return opSelectorToRegIdx(instData.SSRC0, gpuDynInst->wavefront()->reservedScalarRegs); case 1: -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29954 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6 Gerrit-Change-Number: 29954 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fixed scale,fixup,fmas f64 ops
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29955 ) Change subject: arch-gcn3: fixed scale,fixup,fmas f64 ops .. arch-gcn3: fixed scale,fixup,fmas f64 ops Change-Id: Ie13794554db8a958fda1f7103ec18058fda2e66d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29955 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 65 insertions(+), 17 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index a7b8923..a25ec17 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -28952,22 +28952,34 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -if (std::fpclassify(src1[lane]) == FP_ZERO) { -if (std::signbit(src1[lane])) { -vdst[lane] = -INFINITY; -} else { -vdst[lane] = +INFINITY; -} -} else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) { -vdst[lane] = NAN; -} else if (std::isinf(src1[lane])) { -if (std::signbit(src1[lane])) { -vdst[lane] = -INFINITY; -} else { -vdst[lane] = +INFINITY; -} +int signOut = std::signbit(src1[lane]) ^ + std::signbit(src2[lane]); +int exp1, exp2; +std::frexp(src1[lane],&exp1); +std::frexp(src2[lane],&exp2); +if (std::isnan(src2[lane])) { +vdst[lane] = src2[lane]; +} else if (std::isnan(src1[lane])) { +vdst[lane] = src1[lane]; +} else if (src1[lane] == 0.0 && src2[lane] == 0.0) { +vdst[lane] = -NAN; +} else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { +vdst[lane] = -NAN; +} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) { +vdst[lane] = signOut ? -INFINITY : +INFINITY; +} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) { +vdst[lane] = signOut ? 
-0.0 : +0.0; +} else if (exp2 - exp1 < -1075) { +warn_once("fixup_f64 unimplemented case:" + "exp2 - ex1 < -1075"); +vdst[lane] = src0[lane]; +} else if (exp1 == 2047) { +warn_once("fixup_f64 unimplemented case:" + "exp1 == 2047"); +vdst[lane] = src0[lane]; } else { -vdst[lane] = src2[lane] / src1[lane]; +vdst[lane] = ((uint64_t)signOut<<63) | +((uint64_t)src0[lane] & 0x7fffULL); } } } @@ -29077,8 +29089,37 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -vdst[lane] = src0[lane]; +int exp1, exp2; +std::frexp(src1[lane],&exp1); +std::frexp(src2[lane],&exp2); vcc.setBit(lane, 0); +if (src2[lane] == 0 || src1[lane] == 0) { +vdst[lane] = NAN; +} else if (exp2 - exp1 >= 768) { +vcc.setBit(lane, 1); +if (src0[lane] == src1[lane]) { +vdst[lane] = std::ldexp(src0[lane],128); +} +} else if (exp1 == 0) { +vdst[lane] = std::ldexp(src0[lane],128); +} else if (exp1 >= 0x7fd && exp2 - exp1 <= -768) { +vcc.setBit(lane, 1); +if (src0[lane] == src1[lane]) { +vdst[lane] = std::ldexp(src0[lane],-128); +} +} else if (exp1 >= 0x7fd) { +vdst[lane] = std::ldexp(src0[lane],-128); +} else if (exp2 - exp1 <= -768) { +vcc.setBit(lane, 1); +if (src0[lane] != src2[lane]) { +vdst[lane] = std::ldexp(src0[lane],128); +} +} else if (exp2 <= 53) { +vdst[lane] = std::ldexp(src0[lane],128); +} +else { +vdst[lane] = src0[lane]; +} } } @@ -29171,10 +29212,12 @@ ConstVecOperandF64 src1(gpuDynInst, ext
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: ds_read_u8 and ds_read_u16 fix
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29951 ) Change subject: arch-gcn3: ds_read_u8 and ds_read_u16 fix .. arch-gcn3: ds_read_u8 and ds_read_u16 fix This changeset zero extends the destination register for ds_read_u8 and ds_read_u16 instructions. Change-Id: I193adadd68adf2572b59743b1504f18ad225f506 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29951 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 4 insertions(+), 4 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 8b72e0d..6e5ff42 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32016,11 +32016,11 @@ void Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) { -VecOperandU8 vdst(gpuDynInst, extData.VDST); +VecOperandU32 vdst(gpuDynInst, extData.VDST); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { -vdst[lane] = (reinterpret_cast( +vdst[lane] = (VecElemU32)(reinterpret_cast( gpuDynInst->d_data))[lane]; } } @@ -32096,11 +32096,11 @@ void Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) { -VecOperandU16 vdst(gpuDynInst, extData.VDST); +VecOperandU32 vdst(gpuDynInst, extData.VDST); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { -vdst[lane] = (reinterpret_cast( +vdst[lane] = (VecElemU32)(reinterpret_cast( gpuDynInst->d_data))[lane]; } } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29951 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I193adadd68adf2572b59743b1504f18ad225f506 
Gerrit-Change-Number: 29951 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Alexandru Duțu Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add ds_bpermute and ds_permute insts
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29952 ) Change subject: arch-gcn3: Add ds_bpermute and ds_permute insts .. arch-gcn3: Add ds_bpermute and ds_permute insts The implementation of these insts provided by this change is based on the description provided here: https://gpuopen.com/amd-gcn-assembly-cross-lane-operations/ Change-Id: Id63b6c34c9fdc6e0dbd445d859e7b209023f2874 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29952 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 113 insertions(+), 4 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 6e5ff42..e93278a 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32129,6 +32129,13 @@ Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_permute_b32") { +setFlag(MemoryRef); +/** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. 
+ */ + setFlag(Load); } // Inst_DS__DS_PERMUTE_B32 Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() @@ -32139,12 +32146,66 @@ void Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); +ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.VDST); + +addr.read(); +data.read(); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +/** + * One of the offset fields can be used for the index. + * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ +assert(!instData.OFFSET1); +/** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. + */ +int index = bits(addr[lane] + instData.OFFSET0, 7, 2); +panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); +/** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. 
+ */ +if (wf->execMask(index)) { +vdst[index] = data[lane]; +} else { +vdst[index] = 0; +} +} +} + +vdst.write(); + +wf->rdLmReqsInPipe--; +wf->validateRequestCounters(); +} // execute +// --- Inst_DS__DS_BPERMUTE_B32 class methods --- Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_bpermute_b32") { +setFlag(MemoryRef); +/** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ +setFlag(Load); } // Inst_DS__DS_BPERMUTE_B32 Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() @@ -32155,8 +32216,56 @@ void Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); +ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.V
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: convert vALU instruction counters from 32 to 64-bit
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29950 ) Change subject: arch-gcn3: convert vALU instruction counters from 32 to 64-bit .. arch-gcn3: convert vALU instruction counters from 32 to 64-bit The vALU instruction counters were previously 32 bits, but for some workloads this value wraps around and triggers an assert failure because the max vALU operations are reached. To resolve this, this commit increases the counter size to 64 bits. Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29950 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/gpu-compute/shader.hh 1 file changed, 2 insertions(+), 2 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 238f6e0..3e2e569 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -258,8 +258,8 @@ Stats::Vector vectorInstDstOperand; void regStats(); -int max_valu_insts; -int total_valu_insts; +int64_t max_valu_insts; +int64_t total_valu_insts; Shader(const Params *p); ~Shader(); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29950 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb Gerrit-Change-Number: 29950 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29948 ) Change subject: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo .. arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29948 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 60 insertions(+), 4 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 79e7dda..6ffd049 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -30309,8 +30309,36 @@ void Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); +ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); +VecOperandU32 vdst(gpuDynInst, instData.VDST); +uint64_t threadMask = 0; + +src0.readSrc(); +src1.readSrc(); + +/** + * input modifiers are supported by FP operations only + */ +assert(!(instData.ABS & 0x1)); +assert(!(instData.ABS & 0x2)); +assert(!(instData.ABS & 0x4)); +assert(!(extData.NEG & 0x1)); +assert(!(extData.NEG & 0x2)); +assert(!(extData.NEG & 0x4)); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +threadMask = ((1LL << lane) - 1LL); +vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + + src1[lane]; +} +} + +vdst.write(); +} // execute +// --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( InFmt_VOP3 *iFmt) @@ -30330,8 +30358,36 @@ void Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = 
gpuDynInst->wavefront(); +ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); +ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); +VecOperandU32 vdst(gpuDynInst, instData.VDST); +uint64_t threadMask = 0; + +src0.readSrc(); +src1.readSrc(); + +/** + * input modifiers are supported by FP operations only + */ +assert(!(instData.ABS & 0x1)); +assert(!(instData.ABS & 0x2)); +assert(!(instData.ABS & 0x4)); +assert(!(extData.NEG & 0x1)); +assert(!(extData.NEG & 0x2)); +assert(!(extData.NEG & 0x4)); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +threadMask = ((1LL << lane) - 1LL); +vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + + src1[lane]; +} +} + +vdst.write(); +} // execute +// --- Inst_VOP3__V_LSHLREV_B64 class methods --- Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3 *iFmt) : Inst_VOP3(iFmt, "v_lshlrev_b64", false) -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29948 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5 Gerrit-Change-Number: 29948 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: refactor barriers
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29943 ) Change subject: gpu-compute, arch-gcn3: refactor barriers .. gpu-compute, arch-gcn3: refactor barriers Barriers were not modeled properly. Firstly, barriers were allocated to each WG that was launched, which is not correct, and the CU would provide an infinite number of barrier slots. There are a limited number of barrier slots per CU in reality. In addition, the CU will not allocate barrier slots to WGs with a single WF (nothing to sync if only one WF). Beyond modeling problems, there is also the issue of deadlock. The barrier could deadlock because not all WFs are freed from the barrier once it has been satisfied. Instead, we relied on the scoreboard stage to release them lazily, one-by-one. Under this implementation the scoreboard may not fully release all WFs participating in a barrier; this happens because the first WF to be freed from the barrier could reach an s_barrier instruction again, forever causing the barrier counts across WFs to be out-of-sync. This change refactors the barrier logic to: 1) Create a proper barrier slot implementation 2) Enforce (via a parameter) the number of barrier slots on the CU. 3) Simplify the logic and clean up the code (i.e., we no longer iterate through the entire WF list each time we check if a barrier is satisfied). 4) Fix deadlock issues. 
Change-Id: If53955b54931886baaae322640a7b9da7a1595e0 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29943 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc M src/gpu-compute/GPU.py M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/scoreboard_check_stage.cc M src/gpu-compute/shader.cc M src/gpu-compute/wavefront.cc M src/gpu-compute/wavefront.hh 8 files changed, 386 insertions(+), 101 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 607e3c6..817b339 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -39,6 +39,7 @@ #include "arch/gcn3/insts/inst_util.hh" #include "debug/GCN3.hh" +#include "debug/GPUSync.hh" #include "gpu-compute/shader.hh" namespace Gcn3ISA @@ -3709,6 +3710,7 @@ Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst) { Wavefront *wf = gpuDynInst->wavefront(); +ComputeUnit *cu = gpuDynInst->computeUnit(); // delete extra instructions fetched for completed work-items wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1, @@ -3725,6 +3727,25 @@ int refCount = wf->computeUnit->getLds() .decreaseRefCounter(wf->dispatchId, wf->wgId); +/** + * The parent WF of this instruction is exiting, therefore + * it should not participate in this barrier any longer. This + * prevents possible deadlock issues if WFs exit early. + */ +int bar_id = WFBarrier::InvalidID; +if (wf->hasBarrier()) { +assert(wf->getStatus() != Wavefront::S_BARRIER); +bar_id = wf->barrierId(); +assert(bar_id != WFBarrier::InvalidID); +wf->releaseBarrier(); +cu->decMaxBarrierCnt(bar_id); +DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the " +"program and decrementing max barrier count for " +"barrier Id%d. 
New max count: %d.\n", cu->cu_id, +wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, +cu->maxBarrierCnt(bar_id)); +} + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", wf->computeUnit->cu_id, wf->wgId, refCount); @@ -3748,6 +3769,20 @@ wf->lastInstExec = 0; if (!refCount) { +/** + * If all WFs have finished, and hence the WG has finished, + * then we can free up the barrier belonging to the parent + * WG, but only if we actually used a barrier (i.e., more + * than one WF in the WG). + */ +if (bar_id != WFBarrier::InvalidID) { +DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " +"now complete. Releasing barrier Id%d.\n", cu->cu_id, +wf->simdId, wf->wfSlotId, wf->wfDynId, +wf->barrierId()); +cu->releaseBarrier(bar_id); +} + /** * Last wavefront of the workgroup has executed return. If the * workgroup is not the final one in the kernel, then simply @@ -4027,12 +4062,21 @@ Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuD
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support of 64-bit SOPK instruction
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29942 ) Change subject: arch-gcn3: add support of 64-bit SOPK instruction .. arch-gcn3: add support of 64-bit SOPK instruction s_setreg_imm32_b32 is a 64-bit instruction, using a 32-bit literal constant. Related functions are added to support decoding the second dword. Change-Id: I290f8578f726885c137dbfac3773035f814e0a3a Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29942 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Xianwei Zhang --- M src/arch/gcn3/insts/op_encodings.cc M src/arch/gcn3/insts/op_encodings.hh 2 files changed, 43 insertions(+), 4 deletions(-) Approvals: Xianwei Zhang: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/op_encodings.cc b/src/arch/gcn3/insts/op_encodings.cc index fe501f2..22d0f48 100644 --- a/src/arch/gcn3/insts/op_encodings.cc +++ b/src/arch/gcn3/insts/op_encodings.cc @@ -160,6 +160,14 @@ // copy first instruction DWORD instData = iFmt[0]; +if (hasSecondDword(iFmt)) { +// copy second instruction DWORD into union +extData = ((MachInst)iFmt)[1]; +_srcLiteral = *reinterpret_cast(&iFmt[1]); +varSize = 4 + 4; +} else { +varSize = 4; +} // if } // Inst_SOPK Inst_SOPK::~Inst_SOPK() @@ -169,18 +177,43 @@ int Inst_SOPK::instSize() const { -return 4; +return varSize; } // instSize +bool +Inst_SOPK::hasSecondDword(InFmt_SOPK *iFmt) +{ +/* + SOPK can be a 64-bit instruction, i.e., have a second dword: + S_SETREG_IMM32_B32 writes some or all of the LSBs of a 32-bit + literal constant into a hardware register; + the way to detect such special case is to explicitly check the + opcode (20/0x14) +*/ +if (iFmt->OP == 0x14) +return true; + +return false; +} + + void Inst_SOPK::generateDisassembly() { std::stringstream dis_stream; dis_stream << _opcode << " "; -dis_stream << opSelectorToRegSym(instData.SDST) << ", "; -dis_stream << "0x" << 
std::hex << std::setfill('0') << std::setw(4) - << instData.SIMM16; +// S_SETREG_IMM32_B32 is a 64-bit instruction, using a +// 32-bit literal constant +if (instData.OP == 0x14) { +dis_stream << "0x" << std::hex << std::setfill('0') +<< std::setw(8) << extData.imm_u32 << ", "; +} else { +dis_stream << opSelectorToRegSym(instData.SDST) << ", "; +} + +dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(4) + << instData.SIMM16; disassembly = dis_stream.str(); } diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 22c146a..4f151b9 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -87,6 +87,12 @@ protected: // first instruction DWORD InFmt_SOPK instData; +// possible second DWORD +InstFormat extData; +uint32_t varSize; + + private: +bool hasSecondDword(InFmt_SOPK *); }; // Inst_SOPK class Inst_SOP1 : public GCN3GPUStaticInst -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29942 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I290f8578f726885c137dbfac3773035f814e0a3a Gerrit-Change-Number: 29942 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement multi-dword buffer loads and stores
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29946 ) Change subject: arch-gcn3: implement multi-dword buffer loads and stores .. arch-gcn3: implement multi-dword buffer loads and stores Add support for all multi-dword buffer loads and stores: buffer_load_dword x2, x3, and x4 and buffer_store_dword x2, x3, and x4 Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29946 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc M src/arch/gcn3/insts/op_encodings.hh 2 files changed, 504 insertions(+), 18 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 817b339..b852281 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -34777,7 +34777,11 @@ { setFlag(MemoryRef); setFlag(Load); -setFlag(GlobalSegment); +if (instData.LDS) { +setFlag(GroupSegment); +} else { +setFlag(GlobalSegment); +} } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() @@ -34788,17 +34792,88 @@ void Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->exec_mask = wf->execMask(); +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + +ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); +ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); +ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + +rsrcDesc.read(); +offset.read(); + +int inst_offset = instData.OFFSET; + +if 
(!instData.IDXEN && !instData.OFFEN) { +calcAddr(gpuDynInst, +addr0, addr1, rsrcDesc, offset, inst_offset); +} else if (!instData.IDXEN && instData.OFFEN) { +addr0.read(); +calcAddr(gpuDynInst, +addr0, addr1, rsrcDesc, offset, inst_offset); +} else if (instData.IDXEN && !instData.OFFEN) { +addr0.read(); +calcAddr(gpuDynInst, +addr1, addr0, rsrcDesc, offset, inst_offset); +} else { +addr0.read(); +addr1.read(); +calcAddr(gpuDynInst, +addr1, addr0, rsrcDesc, offset, inst_offset); +} + +if (isLocalMem()) { +gpuDynInst->computeUnit()->localMemoryPipe +.issueRequest(gpuDynInst); +wf->rdLmReqsInPipe--; +wf->outstandingReqsRdLm++; +} else { +gpuDynInst->computeUnit()->globalMemoryPipe +.issueRequest(gpuDynInst); +wf->rdGmReqsInPipe--; +wf->outstandingReqsRdGm++; +} + +wf->outstandingReqs++; +wf->validateRequestCounters(); +} // execute void Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<2>(gpuDynInst); } // initiateAcc void Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) { +VecOperandU32 vdst0(gpuDynInst, extData.VDATA); +VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +if (!oobMask[lane]) { +vdst0[lane] = (reinterpret_cast( +gpuDynInst->d_data))[lane * 2]; +vdst1[lane] = (reinterpret_cast( +gpuDynInst->d_data))[lane * 2 + 1]; +} else { +vdst0[lane] = 0; +vdst1[lane] = 0; +} +} +} + +vdst0.write(); +vdst1.write(); } // completeAcc Inst_MUBUF__BUFFER_LOAD_DWORDX3 @@ -34807,7 +34882,11 @@ { setFlag(MemoryRef); setFlag(Load); -setFlag(GlobalSegment); +if (instData.LDS) { +setFlag(GroupSegment); +} else { +setFlag(GlobalSegment); +} } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() @@ -34818,17 +34897,93 @@ void Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) { -panic
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add support for rd/wr EXEC_HI to operand class
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29944 ) Change subject: arch-gcn3: Add support for rd/wr EXEC_HI to operand class .. arch-gcn3: Add support for rd/wr EXEC_HI to operand class Change-Id: Ib22dd604f88ea56801964235082835002deffca1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29944 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/operand.hh 1 file changed, 35 insertions(+), 1 deletion(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index 7f70fab..960d05e 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -435,13 +435,30 @@ if (!isScalarReg(_opIdx)) { if (_opIdx == REG_EXEC_LO) { -uint64_t new_exec_mask_val(0); +ScalarRegU64 new_exec_mask_val(0); std::memcpy((void*)&new_exec_mask_val, (void*)srfData.data(), sizeof(new_exec_mask_val)); VectorMask new_exec_mask(new_exec_mask_val); wf->execMask() = new_exec_mask; DPRINTF(GPUSRF, "Write EXEC\n"); DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val); +} else if (_opIdx == REG_EXEC_HI) { +/** + * If we're writing only the upper half of the EXEC mask + * this ought to be a single dword operand. 
+ */ +assert(NumDwords == 1); +ScalarRegU32 new_exec_mask_hi_val(0); +ScalarRegU64 new_exec_mask_val += wf->execMask().to_ullong(); +std::memcpy((void*)&new_exec_mask_hi_val, +(void*)srfData.data(), sizeof(new_exec_mask_hi_val)); +replaceBits(new_exec_mask_val, 63, 32, +new_exec_mask_hi_val); +VectorMask new_exec_mask(new_exec_mask_val); +wf->execMask() = new_exec_mask; +DPRINTF(GPUSRF, "Write EXEC\n"); +DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val); } else { _gpuDynInst->writeMiscReg(_opIdx, srfData[0]); } @@ -505,6 +522,23 @@ DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); } break; + case REG_EXEC_HI: +{ +/** + * If we're reading only the upper half of the EXEC mask + * this ought to be a single dword operand. + */ +assert(NumDwords == 1); +ScalarRegU64 exec_mask = _gpuDynInst->wavefront() +->execMask().to_ullong(); + +ScalarRegU32 exec_mask_hi = bits(exec_mask, 63, 32); +std::memcpy((void*)srfData.data(), (void*)&exec_mask_hi, +sizeof(srfData)); +DPRINTF(GPUSRF, "Read EXEC_HI\n"); +DPRINTF(GPUSRF, "EXEC_HI = %#x\n", exec_mask_hi); +} +break; case REG_SRC_SWDA: case REG_SRC_DPP: case REG_SRC_LITERAL: -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29944 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ib22dd604f88ea56801964235082835002deffca1 Gerrit-Change-Number: 29944 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fix LDS out-of-bounds behavior
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29940 ) Change subject: gpu-compute: Fix LDS out-of-bounds behavior .. gpu-compute: Fix LDS out-of-bounds behavior The LDS is capable of handling out-of-bounds accesses, that is, accesses that are outside the bounds of the chunk allocated to a WG. Currently, the simulator asserts on these accesses. This patch changes the behavior of the LDS to return 0 for reads and to drop writes that are out-of-bounds. Change-Id: I5f467d0f52113e8565e1a3029e82fb89cc6f07ea Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29940 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/gpu-compute/lds_state.hh 1 file changed, 16 insertions(+), 6 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index 58171e3..d793f0f 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -69,9 +69,14 @@ T read(const uint32_t index) { -fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0"); -fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS " -"chunk"); +/** + * For reads that are outside the bounds of the LDS + * chunk allocated to this WG we return 0. + */ +if (index >= chunk.size()) { +return (T)0; +} + T *p0 = (T *) (&(chunk.at(index))); return *p0; } @@ -83,9 +88,14 @@ void write(const uint32_t index, const T value) { -fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0"); -fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS " -"chunk"); +/** + * Writes that are outside the bounds of the LDS + * chunk allocated to this WG are dropped.
+ */ +if (index >= chunk.size()) { +return; +} + T *p0 = (T *) (&(chunk.at(index))); *p0 = value; } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29940 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I5f467d0f52113e8565e1a3029e82fb89cc6f07ea Gerrit-Change-Number: 29940 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: ensure that atomics follow HSA conventions
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29941 ) Change subject: arch-gcn3: ensure that atomics follow HSA conventions .. arch-gcn3: ensure that atomics follow HSA conventions Add asserts to make sure atomics are following the HSA conventions that atomics should be word aligned (i.e., can't be byte aligned) and should not be misaligned such that a given lane's access spans multiple cache lines. Change-Id: Ia48758b9ed96764864234dc607f337e30e287d1c Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29941 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/gpu_mem_helpers.hh 1 file changed, 6 insertions(+), 0 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/gpu_mem_helpers.hh b/src/arch/gcn3/gpu_mem_helpers.hh index 40ca565..562158d 100644 --- a/src/arch/gcn3/gpu_mem_helpers.hh +++ b/src/arch/gcn3/gpu_mem_helpers.hh @@ -80,6 +80,12 @@ misaligned_acc = split_addr > vaddr; if (is_atomic) { +// make sure request is word aligned +assert((vaddr & 0x3) == 0); + +// a given lane's atomic can't cross cache lines +assert(!misaligned_acc); + req = std::make_shared(vaddr, sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId, -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29941 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ia48758b9ed96764864234dc607f337e30e287d1c Gerrit-Change-Number: 29941 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an 
email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement instruction s_setreg_b32
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29949 ) Change subject: arch-gcn3: implement instruction s_setreg_b32 .. arch-gcn3: implement instruction s_setreg_b32 Instruction s_setreg_b32 was unimplemented, but is used by hipified rodinia 'srad'. The instruction sets values of hardware internal registers. If the instruction is writing into MODE to control single-precision FP round and denorm modes, a simple warn will be printed; for all other cases (non-MODE hw register or other precisions), panic will happen. Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29949 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Xianwei Zhang --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 27 insertions(+), 0 deletions(-) Approvals: Xianwei Zhang: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 6ffd049..8b72e0d 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -1800,6 +1800,7 @@ Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) : Inst_SOPK(iFmt, "s_setreg_b32") { +setFlag(ALU); } // Inst_SOPK__S_SETREG_B32 Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() @@ -1813,6 +1814,32 @@ void Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) { +ScalarRegI16 simm16 = instData.SIMM16; +ScalarRegU32 hwregId = simm16 & 0x3f; +ScalarRegU32 offset = (simm16 >> 6) & 31; +ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + +ScalarOperandU32 hwreg(gpuDynInst, hwregId); +ScalarOperandU32 sdst(gpuDynInst, instData.SDST); +hwreg.read(); +sdst.read(); + +// Store value from SDST to part of the hardware register. 
+ScalarRegU32 mask = (((1U << size) - 1U) << offset); +hwreg = ((hwreg.rawData() & ~mask) +| ((sdst.rawData() << offset) & mask)); +hwreg.write(); + +// set MODE register to control the behavior of single precision +// floating-point numbers: denormal mode or round mode +if (hwregId==1 && size==2 +&& (offset==4 || offset==0)) { +warn_once("Be cautious that s_setreg_b32 has no real effect " +"on FP modes: %s\n", gpuDynInst->disassemble()); +return; +} + +// panic if not changing MODE of floating-point numbers panicUnimplemented(); } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29949 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce Gerrit-Change-Number: 29949 Gerrit-PatchSet: 7 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Init CU object for pipe stages in their ctors
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29945 ) Change subject: gpu-compute: Init CU object for pipe stages in their ctors .. gpu-compute: Init CU object for pipe stages in their ctors This change updates the constructors of the CU's pipe stages/memory pipelines to accept a pointer to their parent CU. Because the CU creates these objects, and can pass a pointer to itself to these objects via their constructors, this is the safer way to initialize these classes. Change-Id: I0b3732ce7c03781ee15332dac7a21c097ad387a4 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29945 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/gpu-compute/compute_unit.cc M src/gpu-compute/exec_stage.cc M src/gpu-compute/exec_stage.hh M src/gpu-compute/fetch_stage.cc M src/gpu-compute/fetch_stage.hh M src/gpu-compute/fetch_unit.cc M src/gpu-compute/fetch_unit.hh M src/gpu-compute/global_memory_pipeline.cc M src/gpu-compute/global_memory_pipeline.hh M src/gpu-compute/local_memory_pipeline.cc M src/gpu-compute/local_memory_pipeline.hh M src/gpu-compute/scalar_memory_pipeline.cc M src/gpu-compute/scalar_memory_pipeline.hh M src/gpu-compute/schedule_stage.cc M src/gpu-compute/schedule_stage.hh M src/gpu-compute/scoreboard_check_stage.cc M src/gpu-compute/scoreboard_check_stage.hh 17 files changed, 65 insertions(+), 80 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 0fcbb1a..f3387a7 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -66,9 +66,14 @@ numScalarALUs(p->num_scalar_cores), vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), -registerManager(p->register_manager), fetchStage(p), -scoreboardCheckStage(p), scheduleStage(p, this), 
execStage(p), -globalMemoryPipe(p), localMemoryPipe(p), scalarMemoryPipe(p), +registerManager(p->register_manager), +fetchStage(p, this), +scoreboardCheckStage(p, this), +scheduleStage(p, this), +execStage(p, this), +globalMemoryPipe(p, this), +localMemoryPipe(p, this), +scalarMemoryPipe(p, this), tickEvent([this]{ exec(); }, "Compute unit tick event", false, Event::CPU_Tick_Pri), cu_id(p->cu_id), @@ -788,13 +793,11 @@ dispatchList.push_back(std::make_pair(nullptr, EMPTY)); } -fetchStage.init(this); -scoreboardCheckStage.init(this); -scheduleStage.init(this); -execStage.init(this); -globalMemoryPipe.init(this); -localMemoryPipe.init(this); -scalarMemoryPipe.init(this); +fetchStage.init(); +scoreboardCheckStage.init(); +scheduleStage.init(); +execStage.init(); +globalMemoryPipe.init(); gmTokenPort.setTokenManager(memPortTokens); } diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index 2dece18..e420579 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -41,19 +41,19 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ExecStage::ExecStage(const ComputeUnitParams *p) : lastTimeInstExecuted(false), -thisTimeInstExecuted(false), instrExecuted (false), -executionResourcesUsed(0) +ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu) +: computeUnit(cu), lastTimeInstExecuted(false), + thisTimeInstExecuted(false), instrExecuted (false), + executionResourcesUsed(0), _name(cu->name() + ".ExecStage") + { numTransActiveIdle = 0; idle_dur = 0; } void -ExecStage::init(ComputeUnit *cu) +ExecStage::init() { -computeUnit = cu; -_name = computeUnit->name() + ".ExecStage"; dispatchList = &computeUnit->dispatchList; idle_dur = 0; } diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh index 670252c..f984d72 100644 --- a/src/gpu-compute/exec_stage.hh +++ b/src/gpu-compute/exec_stage.hh @@ -69,9 +69,9 @@ class ExecStage { public: -ExecStage(const ComputeUnitParams* 
params); +ExecStage(const ComputeUnitParams* p, ComputeUnit *cu); ~ExecStage() { } -void init(ComputeUnit *cu); +void init(); void exec(); std::string dispStatusToStr(int j); diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc index cf0b39e..b9df6ce 100644 --- a/src/gpu-compute/fetch_stage.cc +++ b/src/gpu-compute/fetch_stage.cc @@ -36,11 +36,12 @@ #include "gpu-compute/compute_unit.hh" #include "gpu-compute/wavefront.hh" -FetchStage::FetchStage(const ComputeUnitParams* p) : -numVectorALUs(p->num_SIMDs), computeUnit(nullptr) +FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit *cu) +: numVectorALUs(p->num_SIMD
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fix bug with DPP support
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29947 ) Change subject: arch-gcn3: fix bug with DPP support .. arch-gcn3: fix bug with DPP support Instructions that use the DPP field need to use the extra SRC0 register associated with the DPP instruction instead of the "default" SRC0 register, since the default SRC0 register contains the DPP information when DPP is being used. This commit fixes 2735c3bb88 to take this into account. Additionally, this commit removes write of the src register from the DPP helper functions, to avoid overwriting any changes made to the destination register. Finally, this change modifies the instructions that use DPP to simplify the flow through the execute() functions. Change-Id: I80fd0af1f131f287f18ff73b3c1c9122d8c60823 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29947 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/inst_util.hh M src/arch/gcn3/insts/instructions.cc 2 files changed, 41 insertions(+), 20 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh index 433ccbe..b40e890 100644 --- a/src/arch/gcn3/insts/inst_util.hh +++ b/src/arch/gcn3/insts/inst_util.hh @@ -505,7 +505,6 @@ src0[lane] = 0; } -src0.write(); // reset for next iteration laneDisabled = false; } diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index b852281..79e7dda 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -5296,8 +5296,12 @@ VecOperandF32 src1(gpuDynInst, instData.VSRC1); VecOperandF32 vdst(gpuDynInst, instData.VDST); +src0.readSrc(); +src1.read(); + if (isDPPInst()) { VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); +src0_dpp.read(); DPRINTF(GCN3, "Handling V_ADD_F32 SRC 
DPP. SRC0: register v[%d], " "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " @@ -5313,14 +5317,17 @@ extData.iFmt_VOP_DPP.ROW_MASK); processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); -} -src0.readSrc(); -src1.read(); - -for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { -if (wf->execMask(lane)) { -vdst[lane] = src0[lane] + src1[lane]; +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +vdst[lane] = src0_dpp[lane] + src1[lane]; +} +} +} else { +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +vdst[lane] = src0[lane] + src1[lane]; +} } } @@ -6164,6 +6171,7 @@ if (isDPPInst()) { VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); +src0_dpp.read(); DPRINTF(GCN3, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], " "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " @@ -6179,11 +6187,18 @@ extData.iFmt_VOP_DPP.ROW_MASK); processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); -} -for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { -if (wf->execMask(lane)) { -vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +vdst[lane] = std::fma(src0_dpp[lane], src1[lane], + vdst[lane]); +} +} +} else { +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); +} } } @@ -7117,8 +7132,11 @@ ConstVecOperandU32 src(gpuDynInst, instData.SRC0); VecOperandU32 vdst(gpuDynInst, instData.VDST); +src.readSrc(); + if (isDPPInst()) { -VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); +VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); +src_dpp.read(); DPRINTF(GCN3, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], " "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " @@ -7137,14 +7155,18 @@ // to negate it or take the absolute value of it assert(!e
[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: Add support for MemSync reqs in VIPER
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29939 ) Change subject: mem-ruby: Add support for MemSync reqs in VIPER .. mem-ruby: Add support for MemSync reqs in VIPER Change-Id: Ib129e82be5348c641a8ae18093324bcedfb38abe Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29939 Reviewed-by: Jason Lowe-Power Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/mem/ruby/system/GPUCoalescer.cc M src/mem/ruby/system/GPUCoalescer.hh M src/mem/ruby/system/RubyPort.cc 3 files changed, 22 insertions(+), 21 deletions(-) Approvals: Jason Lowe-Power: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index d9793fa..80bc19a 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -553,24 +553,25 @@ assert(pkt->req->hasInstSeqNum()); if (pkt->cmd == MemCmd::MemSyncReq) { -// let the child coalescer handle MemSyncReq because this is -// cache coherence protocol specific -return RequestStatus_Issued; -} -// otherwise, this must be either read or write command -assert(pkt->isRead() || pkt->isWrite()); +// issue mem_sync requests immediately to the cache system without +// going through uncoalescedTable like normal LD/ST/Atomic requests +issueMemSyncRequest(pkt); +} else { +// otherwise, this must be either read or write command +assert(pkt->isRead() || pkt->isWrite()); -// the pkt is temporarily stored in the uncoalesced table until -// it's picked for coalescing process later in this cycle or in a -// future cycle -uncoalescedTable.insertPacket(pkt); -DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", -pkt->getAddr()); +// the pkt is temporarily stored in the uncoalesced table until +// it's picked for coalescing process later in this cycle or in a +// future cycle +uncoalescedTable.insertPacket(pkt); 
+DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", +pkt->getAddr()); -// we schedule an issue event here to process the uncoalesced table -// and try to issue Ruby request to cache system -if (!issueEvent.scheduled()) { -schedule(issueEvent, curTick()); +// we schedule an issue event here to process the uncoalesced table +// and try to issue Ruby request to cache system +if (!issueEvent.scheduled()) { +schedule(issueEvent, curTick()); +} } // we always return RequestStatus_Issued in this coalescer diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 74236cb..401f70b 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -367,7 +367,7 @@ // since the two following issue functions are protocol-specific, // they must be implemented in a derived coalescer virtual void issueRequest(CoalescedRequest* crequest) = 0; -//virtual void issueMemSyncRequest(PacketPtr pkt) = 0; +virtual void issueMemSyncRequest(PacketPtr pkt) {} void kernelCallback(int wavefront_id); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 0526e65..4510e3a 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -251,7 +251,7 @@ } // Check for pio requests and directly send them to the dedicated // pio port. -if (pkt->cmd != MemCmd::MemFenceReq) { +if (pkt->cmd != MemCmd::MemSyncReq) { if (!isPhysMemAddress(pkt)) { assert(ruby_port->memMasterPort.isConnected()); DPRINTF(RubyPort, "Request address %#x assumed to be a " @@ -312,7 +312,7 @@ // Check for pio requests and directly send them to the dedicated // pio port. 
-if (pkt->cmd != MemCmd::MemFenceReq) { +if (pkt->cmd != MemCmd::MemSyncReq) { if (!isPhysMemAddress(pkt)) { assert(ruby_port->memMasterPort.isConnected()); DPRINTF(RubyPort, "Request address %#x assumed to be a " @@ -539,7 +539,7 @@ } // Flush, acquire, release requests don't access physical memory -if (pkt->isFlush() || pkt->cmd == MemCmd::MemFenceReq) { +if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) { accessPhysMem = false; } @@ -649,4 +649,4 @@ } } return num_written; -} \ No newline at end of file +} -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29939 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ib129e82be5348c641a8ae18093324b
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: enable kernel-end WB functionality
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29938 ) Change subject: gpu-compute: enable kernel-end WB functionality .. gpu-compute: enable kernel-end WB functionality Change-Id: Ib17e1d700586d1aa04d408e7b924270f0de82efe Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29938 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Xianwei Zhang --- M src/gpu-compute/compute_unit.cc M src/gpu-compute/shader.cc M src/mem/request.hh 3 files changed, 27 insertions(+), 18 deletions(-) Approvals: Xianwei Zhang: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index b0616d6..178fd6e 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1218,23 +1218,25 @@ schedule(mem_req_event, curTick() + req_tick_latency); } else { -assert(gpuDynInst->isEndOfKernel()); + // kernel end release must be enabled + assert(shader->impl_kern_end_rel); + assert(gpuDynInst->isEndOfKernel()); -req->setCacheCoherenceFlags(Request::RELEASE); -req->setReqInstSeqNum(gpuDynInst->seqNum()); -req->setFlags(Request::KERNEL); -pkt = new Packet(req, MemCmd::MemSyncReq); -pkt->pushSenderState( - new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); + req->setCacheCoherenceFlags(Request::WB_L2); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); -EventFunctionWrapper *mem_req_event = - memPort[0]->createMemReqEvent(pkt); + EventFunctionWrapper *mem_req_event = +memPort[0]->createMemReqEvent(pkt); -DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " -"a release\n", cu_id, gpuDynInst->simdId, -gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, 
addr %#x scheduling " + "a release\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); -schedule(mem_req_event, curTick() + req_tick_latency); + schedule(mem_req_event, curTick() + req_tick_latency); } } else { gpuDynInst->setRequestFlags(req); diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index f5e9444..59ce239 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -223,11 +223,11 @@ // flush has never been started, performed only once at kernel end assert(_dispatcher.getOutstandingWbs(kernId) == 0); -// iterate all cus, managed by the shader, to perform flush. -for (int i_cu = 0; i_cu < n_cu; ++i_cu) { -_dispatcher.updateWbCounter(kernId, +1); -cuList[i_cu]->doFlush(gpuDynInst); -} +// the first cu, managed by the shader, performs flush operation, +// assuming that L2 cache is shared by all cus in the shader +int i_cu = 0; +_dispatcher.updateWbCounter(kernId, +1); +cuList[i_cu]->doFlush(gpuDynInst); } bool diff --git a/src/mem/request.hh b/src/mem/request.hh index 4e0ba97..718d5fa 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -225,6 +225,9 @@ * See the AMD GCN3 ISA Architecture Manual for more * details. * + * INV_L1: L1 cache invalidation + * WB_L2: L2 cache writeback + * * SLC: System Level Coherent. Accesses are forced to miss in * the L2 cache and are coherent with system memory. * @@ -237,6 +240,10 @@ * between atomic return/no-return operations. 
*/ enum : CacheCoherenceFlagsType { +/** mem_sync_op flags */ +INV_L1 = 0x0001, +WB_L2 = 0x0020, +/** user-policy flags */ /** user-policy flags */ SLC_BIT = 0x0080, GLC_BIT = 0x0100, -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29938 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ib17e1d700586d1aa04d408e7b924270f0de82efe Gerrit-Change-Number: 29938 Gerrit-PatchSet: 9 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implementation of flat atomic swap instruction
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29937 ) Change subject: arch-gcn3: Implementation of flat atomic swap instruction .. arch-gcn3: Implementation of flat atomic swap instruction Change-Id: I9b9042899e65e8c9848b31c509eb2e3b13293e52 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29937 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc M src/arch/gcn3/insts/instructions.hh 2 files changed, 78 insertions(+), 4 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 2e39bf5..607e3c6 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -39231,8 +39231,80 @@ void Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); + +if (wf->execMask().none()) { +wf->wrGmReqsInPipe--; +wf->rdGmReqsInPipe--; +return; +} + +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->exec_mask = wf->execMask(); +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + +ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); + +addr.read(); + +calcAddr(gpuDynInst, addr); + +if (gpuDynInst->executedAs() == Enums::SC_GLOBAL || +gpuDynInst->executedAs() == Enums::SC_PRIVATE) { +// TODO: additional address computation required for scratch +panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE, + "Flats to private aperture not tested yet\n"); +gpuDynInst->computeUnit()->globalMemoryPipe. 
+issueRequest(gpuDynInst); +wf->wrGmReqsInPipe--; +wf->outstandingReqsWrGm++; +wf->rdGmReqsInPipe--; +wf->outstandingReqsRdGm++; +} else { +fatal("Non global flat instructions not implemented yet.\n"); +} + +gpuDynInst->wavefront()->outstandingReqs++; +gpuDynInst->wavefront()->validateRequestCounters(); + +ConstVecOperandU32 data(gpuDynInst, extData.DATA); + +data.read(); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +(reinterpret_cast(gpuDynInst->a_data))[lane] += data[lane]; +} +} + +} // execute + +void +Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) +{ +initAtomicAccess(gpuDynInst); +} // initiateAcc + +void +Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) +{ +if (isAtomicRet()) { +VecOperandU32 vdst(gpuDynInst, extData.VDST); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +vdst[lane] = (reinterpret_cast( +gpuDynInst->d_data))[lane]; +} +} + +vdst.write(); +} +} // completeAcc + +// --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) diff --git a/src/arch/gcn3/insts/instructions.hh b/src/arch/gcn3/insts/instructions.hh index ff0cfea..b0cc37e 100644 --- a/src/arch/gcn3/insts/instructions.hh +++ b/src/arch/gcn3/insts/instructions.hh @@ -79949,9 +79949,9 @@ case 0: //vgpr_addr return 8; case 1: //vgpr_src -return 32; +return 4; case 2: //vgpr_dst -return 32; +return 4; default: fatal("op idx %i out of bounds\n", opIdx); return -1; @@ -79991,6 +79991,8 @@ } // isDstOperand void execute(GPUDynInstPtr) override; +void initiateAcc(GPUDynInstPtr) override; +void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_SWAP class Inst_FLAT__FLAT_ATOMIC_CMPSWAP : public Inst_FLAT -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29937 To unsubscribe, or for help writing mail filters, visit 
https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I9b9042899e65e8c9848b31c509eb2e3b13293e52 Gerrit-Change-Number: 29937 Gerrit-PatchSet: 9 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Alexandru Duțu Gerrit-Reviewer: Anthony Gutierrez Gerrit-Rev
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix VOP2 disassembly prints
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29936 ) Change subject: arch-gcn3: Fix VOP2 disassembly prints .. arch-gcn3: Fix VOP2 disassembly prints VOP2 prints the VSRC1 register index as hex instead of decimal if the instruction contains a literal operand. This patch resets the format specifiers in the stream to print the register correctly. Change-Id: Icc7e6588b3c5af545be6590ce412460e72df253f Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29936 Tested-by: kokoro Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez --- M src/arch/gcn3/insts/op_encodings.cc 1 file changed, 2 insertions(+), 1 deletion(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/op_encodings.cc b/src/arch/gcn3/insts/op_encodings.cc index 2eb4042..fe501f2 100644 --- a/src/arch/gcn3/insts/op_encodings.cc +++ b/src/arch/gcn3/insts/op_encodings.cc @@ -763,7 +763,8 @@ << extData.imm_u32 << ", "; } -dis_stream << "v" << instData.VSRC1; +dis_stream << std::resetiosflags(std::ios_base::basefield) << "v" +<< instData.VSRC1; if (readsVCC()) dis_stream << ", vcc"; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29936 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Icc7e6588b3c5af545be6590ce412460e72df253f Gerrit-Change-Number: 29936 Gerrit-PatchSet: 9 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3, gpu-compute: Implement out-of-range accesses
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29935 ) Change subject: arch-gcn3, gpu-compute: Implement out-of-range accesses .. arch-gcn3, gpu-compute: Implement out-of-range accesses Certain buffer out-of-range memory accesses should be special cased and not generate memory accesses. This patch implements those special cases and supresses lanes from accessing memory when the calculated address falls in an ISA-specified out-of-range condition. Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29935 Tested-by: kokoro Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez --- M src/arch/gcn3/insts/instructions.cc M src/arch/gcn3/insts/op_encodings.hh M src/gpu-compute/global_memory_pipeline.cc 3 files changed, 96 insertions(+), 6 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index b923eae..2e39bf5 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -34453,8 +34453,12 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { -vdst[lane] = (VecElemU32)((reinterpret_cast( -gpuDynInst->d_data))[lane]); +if (!oobMask[lane]) { +vdst[lane] = (VecElemU32)((reinterpret_cast( +gpuDynInst->d_data))[lane]); +} else { +vdst[lane] = 0; +} } } @@ -34580,8 +34584,12 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { -vdst[lane] = (VecElemU32)((reinterpret_cast( -gpuDynInst->d_data))[lane]); +if (!oobMask[lane]) { +vdst[lane] = (VecElemU32)((reinterpret_cast( +gpuDynInst->d_data))[lane]); +} else { +vdst[lane] = 0; +} } } @@ -34707,8 +34715,12 @@ for (int lane = 0; lane < 
NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { -vdst[lane] = (reinterpret_cast( -gpuDynInst->d_data))[lane]; +if (!oobMask[lane]) { +vdst[lane] = (reinterpret_cast( +gpuDynInst->d_data))[lane]; +} else { +vdst[lane] = 0; +} } } diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 308560a..22c146a 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -40,6 +40,7 @@ #include "arch/gcn3/gpu_mem_helpers.hh" #include "arch/gcn3/insts/gpu_static_inst.hh" #include "arch/gcn3/operand.hh" +#include "debug/GCN3.hh" #include "debug/GPUExec.hh" #include "mem/ruby/system/RubySystem.hh" @@ -489,14 +490,26 @@ void initMemRead(GPUDynInstPtr gpuDynInst) { +// temporarily modify exec_mask to supress memory accesses to oob +// regions. Only issue memory requests for lanes that have their +// exec_mask set and are not out of bounds. +VectorMask old_exec_mask = gpuDynInst->exec_mask; +gpuDynInst->exec_mask &= ~oobMask; initMemReqHelper(gpuDynInst, MemCmd::ReadReq); +gpuDynInst->exec_mask = old_exec_mask; } template void initMemWrite(GPUDynInstPtr gpuDynInst) { +// temporarily modify exec_mask to supress memory accesses to oob +// regions. Only issue memory requests for lanes that have their +// exec_mask set and are not out of bounds. +VectorMask old_exec_mask = gpuDynInst->exec_mask; +gpuDynInst->exec_mask &= ~oobMask; initMemReqHelper(gpuDynInst, MemCmd::WriteReq); +gpuDynInst->exec_mask = old_exec_mask; } void @@ -566,6 +579,42 @@ buf_off = v_off[lane] + inst_offset; + +/** + * Range check behavior causes out of range accesses to + * to be treated differently. Out of range accesses return + * 0 for loads and are ignored for stores. For + * non-formatted accesses, this is done on a per-lane + * basis. + */ +if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleE
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix writelane src0,src1 usage
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29933 ) Change subject: arch-gcn3: Fix writelane src0,src1 usage .. arch-gcn3: Fix writelane src0,src1 usage Src1 should only be used for lane select. The data should come from src0. Change-Id: Ibe960df2e56d351a3819b40194104d2972a5cd4c Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29933 Tested-by: kokoro Maintainer: Anthony Gutierrez Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 0256d46..b923eae 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -30181,7 +30181,7 @@ assert(!(extData.NEG & 0x2)); assert(!(extData.NEG & 0x4)); -vdst[src1.rawData() & 0x3f] = src1.rawData(); +vdst[src1.rawData() & 0x3f] = src0.rawData(); vdst.write(); } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29933 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ibe960df2e56d351a3819b40194104d2972a5cd4c Gerrit-Change-Number: 29933 Gerrit-PatchSet: 9 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Dropping fetches when no entry is reserved in the buffer
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29932 ) Change subject: gpu-compute: Dropping fetches when no entry is reserved in the buffer .. gpu-compute: Dropping fetches when no entry is reserved in the buffer This changeset drops fetches if there is no entry reserved in the fetch buffer for that instruction. This can happen when a fetch is issued in the same cycle in which a branch instruction flushes the fetch buffer while an ITLB or I-cache request is still pending. Change-Id: I3b80dbd71af27ccf790b543bd5c034bb9b02624a Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29932 Tested-by: kokoro Reviewed-by: Anthony Gutierrez Reviewed-by: Onur Kayıran Maintainer: Anthony Gutierrez --- M src/gpu-compute/fetch_unit.cc M src/gpu-compute/fetch_unit.hh 2 files changed, 22 insertions(+), 0 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Onur Kayıran: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index fb04cd2..447ff12 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -235,6 +235,16 @@ delete oldPkt; /** + * if we have not reserved an entry in the fetch buffer, + * stop fetching. this can happen due to a branch instruction + * flushing the fetch buffer while an ITLB or I-cache request is still + * pending, in the same cycle another instruction is trying to fetch. + */ +if (!fetchBuf.at(wavefront->wfSlotId).isReserved(pkt->req->getVaddr())) { +return; +} + +/** * we should have reserved an entry in the fetch buffer * for this cache line. here we get the pointer to the * entry used to buffer this request's line data. 
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh index 2cfe3f0..798c264 100644 --- a/src/gpu-compute/fetch_unit.hh +++ b/src/gpu-compute/fetch_unit.hh @@ -120,6 +120,18 @@ return reserved_pc->second; } +/** + * returns true if there is an entry reserved for this address, + * and false otherwise + */ +bool +isReserved(Addr vaddr) const +{ +auto reserved_pc = reservedPCs.find(vaddr); +bool is_reserved = (reserved_pc != reservedPCs.end()); +return is_reserved; +} + void fetchDone(Addr vaddr); /** -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29932 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I3b80dbd71af27ccf790b543bd5c034bb9b02624a Gerrit-Change-Number: 29932 Gerrit-PatchSet: 9 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Onur Kayıran Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fix bits that SDWA selects
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29931 ) Change subject: arch-gcn3: fix bits that SDWA selects .. arch-gcn3: fix bits that SDWA selects This commit fixes a bug in 200f2408 where the SDWA support was selecting bits backwards. As part of this commit, to help resolve this problem in the future, I have added asserts in the helper functions in bitfield.hh to ensure that the number of bits aren't negative. Change-Id: I4b0ecb0e7c110600c0b5063101b75f9adcc512ac Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29931 Tested-by: kokoro Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez --- M src/arch/gcn3/insts/inst_util.hh M src/base/bitfield.hh 2 files changed, 37 insertions(+), 30 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh index 292e3ba..433ccbe 100644 --- a/src/arch/gcn3/insts/inst_util.hh +++ b/src/arch/gcn3/insts/inst_util.hh @@ -551,7 +551,7 @@ const SDWASelVals sel, const bool signExt) { // local variables -int first_bit = 0, last_bit = 0; +int low_bit = 0, high_bit = 0; bool signExt_local = signExt; T retVal = 0; @@ -566,17 +566,19 @@ of byte 0, or makes the bits of the selected byte be byte 0 (and next either sign extends or zero's out upper bits). 
*/ -first_bit = (sel * Gcn3ISA::BITS_PER_BYTE); -last_bit = first_bit + Gcn3ISA::MSB_PER_BYTE; -retVal = bits(currOperVal, first_bit, last_bit); +low_bit = (sel * Gcn3ISA::BITS_PER_BYTE); +high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE; +retVal = bits(currOperVal, high_bit, low_bit); // make sure update propagated, since used next -assert(bits(retVal, Gcn3ISA::MSB_PER_BYTE) == - bits(origOperVal, (sel * Gcn3ISA::BITS_PER_BYTE) + -Gcn3ISA::MSB_PER_BYTE)); +fatal_if(bits(retVal, Gcn3ISA::MSB_PER_BYTE) != + bits(origOperVal, high_bit), + "ERROR: SDWA byte update not propagated: retVal: %d, " + "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE), + bits(origOperVal, high_bit)); // sign extended value depends on upper-most bit of the new byte 0 signExt_local = (signExt && - (bits(retVal, 0, Gcn3ISA::MSB_PER_BYTE) & 0x80)); + (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80)); // process all other bytes -- if sign extending, make them 1, else // all 0's so leave as is @@ -589,17 +591,20 @@ of word 0, or makes the bits of the selected word be word 0 (and next either sign extends or zero's out upper bits). 
*/ -first_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD; -last_bit = first_bit + Gcn3ISA::MSB_PER_WORD; -retVal = bits(currOperVal, first_bit, last_bit); +low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD; +high_bit = low_bit + Gcn3ISA::MSB_PER_WORD; +retVal = bits(currOperVal, high_bit, low_bit); // make sure update propagated, since used next -assert(bits(retVal, Gcn3ISA::MSB_PER_WORD) == - bits(origOperVal, ((sel & 1) * Gcn3ISA::BITS_PER_WORD) + -Gcn3ISA::MSB_PER_WORD)); +fatal_if(bits(retVal, Gcn3ISA::MSB_PER_WORD) != + bits(origOperVal, high_bit), + "ERROR: SDWA word update not propagated: retVal: %d, " + "orig: %d\n", + bits(retVal, Gcn3ISA::MSB_PER_WORD), + bits(origOperVal, high_bit)); // sign extended value depends on upper-most bit of the new word 0 signExt_local = (signExt && - (bits(retVal, 0, Gcn3ISA::MSB_PER_WORD) & + (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) & 0x8000)); // process other word -- if sign extending, make them 1, else all @@ -659,7 +664,7 @@ const SDWADstVals unusedBits_format) { // local variables -int first_bit = 0, last_bit = 0; +int low_bit = 0, high_bit = 0; bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT); //bool pad = (unusedBits_format == SDWA_UNUSED_PAD); bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE); @@ -679,11 +684,11 @@
[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Initialized some variables
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/31094 ) Change subject: arch-arm: Initialized some variables .. arch-arm: Initialized some variables Some of the variables in pauth_helpers.cc are uninitialized in certain control paths which causes a compiler warning. We initialize these to false since they should be updated to the correct value in all valid code paths. Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/31094 Reviewed-by: Giacomo Travaglini Maintainer: Giacomo Travaglini Tested-by: kokoro --- M src/arch/arm/pauth_helpers.cc 1 file changed, 28 insertions(+), 28 deletions(-) Approvals: Giacomo Travaglini: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/arm/pauth_helpers.cc b/src/arch/arm/pauth_helpers.cc index c88795f..7424eb3 100644 --- a/src/arch/arm/pauth_helpers.cc +++ b/src/arch/arm/pauth_helpers.cc @@ -286,9 +286,9 @@ using the same algorithm and key as AddPACDA(). */ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1); @@ -354,9 +354,9 @@ using the same algorithm and key as AddPACDA(). */ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1); @@ -424,9 +424,9 @@ using the same algorithm and key as AddPACDA(). */ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1); @@ -498,9 +498,9 @@ using the same algorithm and key as AddPACDA(). 
*/ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1); @@ -566,9 +566,9 @@ Fault ArmISA::addPACDA(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out) { -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1); @@ -630,9 +630,9 @@ Fault ArmISA::addPACDB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out) { -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1); @@ -691,8 +691,8 @@ Fault ArmISA::addPACGA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t* out) { -bool trapEL2; -bool trapEL3; +bool trapEL2 = false; +bool trapEL3 = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APGAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APGAKeyLo_EL1); @@ -738,9 +738,9 @@ Fault ArmISA::addPACIA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t* out){ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1); @@ -797,9 +797,9 @@ Fault ArmISA::addPACIB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out){ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2 = false; +bool trapEL3 = false; +bool enable = false; uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1); @@ -859,8 +859,8 @@ Fault ArmISA::stripPAC(ThreadContext* tc, uint64_t A, bool data, uint64_t* out){ -bool trapEL2; -bool trapEL3; +bool trapEL2 = false; +bool 
trapEL3 = false; uint64_t ptr; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/31094 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9 Gerrit-Change-Number: 31094 Gerrit-PatchSet: 3 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Ciro Santilli Gerrit-Reviewer: Giacomo Travaglini Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Tony Gutierrez Ger
[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Initialized some variables
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/31094 to review the following change. Change subject: arch-arm: Initialized some variables .. arch-arm: Initialized some variables Some of the variables in pauth_helpers.cc are uninitialized in certain control paths which causes a compiler warning. We initialize these to false since they should be updated to the correct value in all valid code paths. Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9 --- M src/arch/arm/pauth_helpers.cc 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/arch/arm/pauth_helpers.cc b/src/arch/arm/pauth_helpers.cc index c88795f..e996fd5 100644 --- a/src/arch/arm/pauth_helpers.cc +++ b/src/arch/arm/pauth_helpers.cc @@ -286,9 +286,9 @@ using the same algorithm and key as AddPACDA(). */ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1); @@ -354,9 +354,9 @@ using the same algorithm and key as AddPACDA(). */ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1); @@ -424,9 +424,9 @@ using the same algorithm and key as AddPACDA(). */ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1); @@ -498,9 +498,9 @@ using the same algorithm and key as AddPACDA(). 
*/ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1); @@ -566,9 +566,9 @@ Fault ArmISA::addPACDA(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out) { -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1); @@ -630,9 +630,9 @@ Fault ArmISA::addPACDB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out) { -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1); @@ -691,8 +691,8 @@ Fault ArmISA::addPACGA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t* out) { -bool trapEL2; -bool trapEL3; +bool trapEL2(false); +bool trapEL3(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APGAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APGAKeyLo_EL1); @@ -738,9 +738,9 @@ Fault ArmISA::addPACIA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t* out){ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1); @@ -797,9 +797,9 @@ Fault ArmISA::addPACIB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out){ -bool trapEL2; -bool trapEL3; -bool enable; +bool trapEL2(false); +bool trapEL3(false); +bool enable(false); uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1); uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1); @@ -859,8 +859,8 @@ Fault ArmISA::stripPAC(ThreadContext* tc, uint64_t A, bool data, uint64_t* out){ -bool trapEL2; -bool trapEL3; +bool trapEL2(false); +bool trapEL3(false); 
uint64_t ptr; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/31094 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9 Gerrit-Change-Number: 31094 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: sim: Add M5_VAR_USED to var used in dprint
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/30896 ) Change subject: sim: Add M5_VAR_USED to var used in dprint .. sim: Add M5_VAR_USED to var used in dprint Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30896 Reviewed-by: Jason Lowe-Power Reviewed-by: Andreas Sandberg Maintainer: Gabe Black Maintainer: Andreas Sandberg Tested-by: kokoro --- M src/sim/system.cc 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: Jason Lowe-Power: Looks good to me, approved Andreas Sandberg: Looks good to me, approved; Looks good to me, approved Gabe Black: Looks good to me, approved kokoro: Regressions pass diff --git a/src/sim/system.cc b/src/sim/system.cc index 7057a97..7841ec0 100644 --- a/src/sim/system.cc +++ b/src/sim/system.cc @@ -179,7 +179,7 @@ { auto &t = thread(id); # if THE_ISA != NULL_ISA -BaseCPU *cpu = t.context->getCpuPtr(); +BaseCPU M5_VAR_USED *cpu = t.context->getCpuPtr(); DPRINTFS(Quiesce, cpu, "quiesce()\n"); # endif t.quiesce(); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30896 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1 Gerrit-Change-Number: 30896 Gerrit-PatchSet: 2 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Andreas Sandberg Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Ciro Santilli Gerrit-Reviewer: Gabe Black Gerrit-Reviewer: Giacomo Gabrielli Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Remove some unused vars from self_debug.hh
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/30895 ) Change subject: arch-arm: Remove some unused vars from self_debug.hh .. arch-arm: Remove some unused vars from self_debug.hh Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30895 Reviewed-by: Jason Lowe-Power Reviewed-by: Andreas Sandberg Reviewed-by: Jordi Vaquero Maintainer: Andreas Sandberg Tested-by: kokoro --- M src/arch/arm/self_debug.hh 1 file changed, 1 insertion(+), 6 deletions(-) Approvals: Jason Lowe-Power: Looks good to me, approved Andreas Sandberg: Looks good to me, but someone else must approve; Looks good to me, approved Jordi Vaquero: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/arm/self_debug.hh b/src/arch/arm/self_debug.hh index 9739c77..67654d2 100644 --- a/src/arch/arm/self_debug.hh +++ b/src/arch/arm/self_debug.hh @@ -58,7 +58,6 @@ private: MiscRegIndex ctrlRegIndex; MiscRegIndex valRegIndex; -MiscRegIndex xRegIndex; SelfDebug * conf; bool isCntxtAware; bool VMID16enabled; @@ -72,7 +71,7 @@ MiscRegIndex _xIndex, SelfDebug* _conf, bool _ctxAw, bool lva, bool vmid16, bool aarch32): ctrlRegIndex(_ctrlIndex), valRegIndex(_valIndex), -xRegIndex(_xIndex), conf(_conf), isCntxtAware(_ctxAw), +conf(_conf), isCntxtAware(_ctxAw), VMID16enabled(vmid16), active_pc(0x0), enable(false) { maxAddrSize = lva ? 
52: 48 ; @@ -215,10 +214,6 @@ bool prevSteppedLdx; bool cpsrD; -bool ctrStepped; -bool ctrActivate; - - public: SoftwareStep(SelfDebug* s): bSS(false), stateSS(INACTIVE_STATE), conf(s), steppedLdx(false) { } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30895 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94 Gerrit-Change-Number: 30895 Gerrit-PatchSet: 2 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Andreas Sandberg Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Ciro Santilli Gerrit-Reviewer: Giacomo Gabrielli Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Jordi Vaquero Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Add missing override to ARM faults
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/30894 ) Change subject: arch-arm: Add missing override to ARM faults .. arch-arm: Add missing override to ARM faults Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30894 Reviewed-by: Jason Lowe-Power Reviewed-by: Andreas Sandberg Maintainer: Andreas Sandberg Tested-by: kokoro --- M src/arch/arm/faults.hh 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: Jason Lowe-Power: Looks good to me, approved Andreas Sandberg: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh index a552757..7a2f69e 100644 --- a/src/arch/arm/faults.hh +++ b/src/arch/arm/faults.hh @@ -647,7 +647,7 @@ bool routeToHyp(ThreadContext *tc) const override; uint32_t iss() const override; ExceptionClass ec(ThreadContext *tc) const override; -void annotate(AnnotationIDs id, uint64_t val); +void annotate(AnnotationIDs id, uint64_t val) override; }; class SoftwareStepFault : public ArmFaultVals -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30894 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef Gerrit-Change-Number: 30894 Gerrit-PatchSet: 2 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Andreas Sandberg Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Ciro Santilli Gerrit-Reviewer: Giacomo Gabrielli Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Add missing override to ARM faults
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/30894 to review the following change. Change subject: arch-arm: Add missing override to ARM faults .. arch-arm: Add missing override to ARM faults Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef --- M src/arch/arm/faults.hh 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh index 8a127ff..09513de 100644 --- a/src/arch/arm/faults.hh +++ b/src/arch/arm/faults.hh @@ -635,7 +635,7 @@ bool routeToHyp(ThreadContext *tc) const override; uint32_t iss() const override; ExceptionClass ec(ThreadContext *tc) const override; -void annotate(AnnotationIDs id, uint64_t val); +void annotate(AnnotationIDs id, uint64_t val) override; }; class SoftwareStepFault : public ArmFaultVals -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30894 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef Gerrit-Change-Number: 30894 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Remove some unused vars from self_debug.hh
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/30895 to review the following change. Change subject: arch-arm: Remove some unused vars from self_debug.hh .. arch-arm: Remove some unused vars from self_debug.hh Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94 --- M src/arch/arm/self_debug.hh 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/arch/arm/self_debug.hh b/src/arch/arm/self_debug.hh index 48d784c..ce35a8b 100644 --- a/src/arch/arm/self_debug.hh +++ b/src/arch/arm/self_debug.hh @@ -58,7 +58,6 @@ private: MiscRegIndex ctrlRegIndex; MiscRegIndex valRegIndex; -MiscRegIndex xRegIndex; SelfDebug * conf; bool isCntxtAware; bool VMID16enabled; @@ -72,7 +71,7 @@ MiscRegIndex _xIndex, SelfDebug* _conf, bool _ctxAw, bool lva, bool vmid16, bool aarch32): ctrlRegIndex(_ctrlIndex), valRegIndex(_valIndex), -xRegIndex(_xIndex), conf(_conf), isCntxtAware(_ctxAw), +conf(_conf), isCntxtAware(_ctxAw), VMID16enabled(vmid16), active_pc(0x0), enable(false) { maxAddrSize = lva ? 52: 48 ; @@ -215,10 +214,6 @@ bool prevSteppedLdx; bool cpsrD; -bool ctrStepped; -bool ctrActivate; - - public: SoftwareStep(SelfDebug* s): bSS(false), stateSS(INACTIVE_STATE), conf(s), steppedLdx(false) { } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30895 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94 Gerrit-Change-Number: 30895 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: sim: Add M5_VAR_USED to var used in dprint
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/30896 to review the following change. Change subject: sim: Add M5_VAR_USED to var used in dprint .. sim: Add M5_VAR_USED to var used in dprint Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1 --- M src/sim/system.cc 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sim/system.cc b/src/sim/system.cc index 7057a97..7841ec0 100644 --- a/src/sim/system.cc +++ b/src/sim/system.cc @@ -179,7 +179,7 @@ { auto &t = thread(id); # if THE_ISA != NULL_ISA -BaseCPU *cpu = t.context->getCpuPtr(); +BaseCPU M5_VAR_USED *cpu = t.context->getCpuPtr(); DPRINTFS(Quiesce, cpu, "quiesce()\n"); # endif t.quiesce(); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30896 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1 Gerrit-Change-Number: 30896 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3, gpu-compute: Fix issue when reading const operands
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29927 ) Change subject: arch-gcn3, gpu-compute: Fix issue when reading const operands .. arch-gcn3, gpu-compute: Fix issue when reading const operands Currently, when an instruction has an operand that reads a const value, it goes thru the same readMiscReg() api call as other misc registers (real HW registers, not constant values). There is an issue, however, when casting from the const values (which are 32b) to higher precision values, like 64b. This change creates a separate, templated function call to the GPU's ISA state that will return the correct type. Change-Id: I41965ebeeed20bb70e919fce5ad94d957b3af802 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29927 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/gpu_isa.hh M src/arch/gcn3/isa.cc M src/arch/gcn3/operand.hh M src/arch/gcn3/registers.cc M src/arch/gcn3/registers.hh M src/gpu-compute/gpu_exec_context.hh 6 files changed, 66 insertions(+), 17 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/gpu_isa.hh b/src/arch/gcn3/gpu_isa.hh index 26b79c7..228c3fe 100644 --- a/src/arch/gcn3/gpu_isa.hh +++ b/src/arch/gcn3/gpu_isa.hh @@ -37,6 +37,7 @@ #define __ARCH_GCN3_GPU_ISA_HH__ #include <array> +#include <type_traits> #include "arch/gcn3/registers.hh" #include "gpu-compute/dispatcher.hh" @@ -52,6 +53,24 @@ public: GPUISA(Wavefront &wf); +template<typename T> T +readConstVal(int opIdx) const +{ +panic_if(!std::is_integral<T>::value, "Constant values must " + "be an integer.\n"); +T val(0); + +if (isPosConstVal(opIdx)) { +val = (T)readPosConstReg(opIdx); +} + +if (isNegConstVal(opIdx)) { +val = (T)readNegConstReg(opIdx); +} + +return val; +} + ScalarRegU32 readMiscReg(int opIdx) const; void writeMiscReg(int opIdx, ScalarRegU32 operandVal); bool hasScalarUnit() const { return true; } @@ 
-63,10 +82,9 @@ return posConstRegs[opIdx - REG_INT_CONST_POS_MIN]; } -ScalarRegU32 readNegConstReg(int opIdx) const +ScalarRegI32 readNegConstReg(int opIdx) const { -return *((ScalarRegU32*) -&negConstRegs[opIdx - REG_INT_CONST_NEG_MIN]); +return negConstRegs[opIdx - REG_INT_CONST_NEG_MIN]; } static const std::array diff --git a/src/arch/gcn3/isa.cc b/src/arch/gcn3/isa.cc index 036c771..3bd122d 100644 --- a/src/arch/gcn3/isa.cc +++ b/src/arch/gcn3/isa.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Advanced Micro Devices, Inc. + * Copyright (c) 2016-2018 Advanced Micro Devices, Inc. * All rights reserved. * * For use for simulation and test purposes only @@ -49,14 +49,6 @@ ScalarRegU32 GPUISA::readMiscReg(int opIdx) const { -if (opIdx >= REG_INT_CONST_POS_MIN && opIdx <= REG_INT_CONST_POS_MAX) { -return readPosConstReg(opIdx); -} - -if (opIdx >= REG_INT_CONST_NEG_MIN && opIdx <= REG_INT_CONST_NEG_MAX) { -return readNegConstReg(opIdx); -} - switch (opIdx) { case REG_M0: return m0; diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index 218faf8..7f70fab 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Advanced Micro Devices, Inc. + * Copyright (c) 2017-2018 Advanced Micro Devices, Inc. * All rights reserved. 
* * For use for simulation and test purposes only @@ -583,10 +583,15 @@ default: { assert(sizeof(DataType) <= sizeof(srfData)); -DataType misc_val -= (DataType)_gpuDynInst->readMiscReg(_opIdx); +DataType misc_val(0); +if (isConstVal(_opIdx)) { +misc_val = (DataType)_gpuDynInst +->readConstVal<DataType>(_opIdx); +} else { +misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx); +} std::memcpy((void*)srfData.data(), (void*)&misc_val, -sizeof(DataType)); +sizeof(DataType)); } } } diff --git a/src/arch/gcn3/registers.cc b/src/arch/gcn3/registers.cc index 0872ff9..016160f 100644 --- a/src/arch/gcn3/registers.cc +++ b/src/arch/gcn3/registers.cc @@ -163,6 +163,31 @@ } bool +isPosConstVal(int opIdx) +{ +bool is_pos_const_val = (opIdx >= REG_INT_CONST_POS_MIN +
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix V_MAD_I32_I24 sign extension
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29928 ) Change subject: arch-gcn3: Fix V_MAD_I32_I24 sign extension .. arch-gcn3: Fix V_MAD_I32_I24 sign extension We are not properly sign extending the bits we hack off for V_MAD_I32_I24. This fixes rnn_fwdBwd 64 1 1 lstm pte assertion failure. Change-Id: I2516e5715227cbd822e6a62630674f64f7a109e0 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29928 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 2 insertions(+), 2 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 32719ad..0256d46 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -27446,8 +27446,8 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0) -+ src2[lane]; +vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) +* sext<24>(bits(src1[lane], 23, 0)) + src2[lane]; } } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29928 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I2516e5715227cbd822e6a62630674f64f7a109e0 Gerrit-Change-Number: 29928 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org 
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Make headTailMap a std::unordered_map
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29930 ) Change subject: gpu-compute: Make headTailMap a std::unordered_map .. gpu-compute: Make headTailMap a std::unordered_map There is no reason that the headTailMap needs to be sorted, so let's use a std::unordered_map. Change-Id: I18641b893352c18ec86e3775c8947a05a6c6547d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29930 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/gpu-compute/compute_unit.hh 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 187cbc9..110097e 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -981,7 +981,7 @@ // hold the time of the arrival of the first cache block related to // a particular GPUDynInst. This is used to calculate the difference // between the first and last chace block arrival times. -std::map<GPUDynInstPtr, Tick> headTailMap; +std::unordered_map<GPUDynInstPtr, Tick> headTailMap; }; #endif // __COMPUTE_UNIT_HH__ -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29930 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I18641b893352c18ec86e3775c8947a05a6c6547d Gerrit-Change-Number: 29930 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: dev: add support for HSA's barrier bit kernel synchronization
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29925 ) Change subject: dev: add support for HSA's barrier bit kernel synchronization .. dev: add support for HSA's barrier bit kernel synchronization This commit adds support for the HSA's barrier bit version of synchronization. This method of synchronization is used for all HIP benchmarks, and thus is necessary to ensure that multiple kernels from the same queue are synchronizing properly. Change-Id: I64f2d311a3970b71194e0555e2b932800df65e98 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29925 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/dev/hsa/hsa_packet_processor.cc M src/dev/hsa/hsa_packet_processor.hh 2 files changed, 39 insertions(+), 3 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved kokoro: Regressions pass diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index f9880e4..4143019 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -60,6 +60,11 @@ #define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \ HSA_PACKET_HEADER_TYPE) & (HSA_PACKET_HEADER_WIDTH_TYPE - 1))) +// checks if the barrier bit is set in the header -- shift the barrier bit +// to LSB, then bitwise "and" to mask off all other bits +#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \ +HSA_PACKET_HEADER_BARRIER) & HSA_PACKET_HEADER_WIDTH_BARRIER)) + HSAPP_EVENT_DESCRIPTION_GENERATOR(UpdateReadDispIdDmaEvent) HSAPP_EVENT_DESCRIPTION_GENERATOR(CmdQueueCmdDmaEvent) HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent) @@ -280,7 +285,7 @@ HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx) { RQLEntry *queue = regdQList[rl_idx]; -if (!queue->aqlProcessEvent.scheduled()) { +if (!queue->aqlProcessEvent.scheduled() && 
!queue->getBarrierBit()) { Tick processingTick = curTick() + pktProcessDelay; schedule(queue->aqlProcessEvent, processingTick); DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n", @@ -316,6 +321,16 @@ // Submit packet to HSA device (dispatcher) hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr); is_submitted = true; +/* + If this packet is using the "barrier bit" to enforce ordering with + subsequent kernels, set the bit for this queue now, after + dispatching. +*/ +if (IS_BARRIER(disp_pkt)) { +DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \ +" list ID = %d\n", __FUNCTION__, rl_idx); +regdQList[rl_idx]->setBarrierBit(true); +} } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) { DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \ " active list ID = %d\n", __FUNCTION__, rl_idx); @@ -631,6 +646,23 @@ HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx) { HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc; + +// if barrier bit was set, unset it here -- we assume that finishPkt is +// only called after the completion of a kernel +if (regdQList[rl_idx]->getBarrierBit()) { +DPRINTF(HSAPacketProcessor, +"Unset barrier bit for active list ID %d\n", rl_idx); +regdQList[rl_idx]->setBarrierBit(false); +// if pending kernels in the queue after this kernel, reschedule +if (regdQList[rl_idx]->dispPending()) { +DPRINTF(HSAPacketProcessor, +"Rescheduling active list ID %d after unsetting barrier " +"bit\n", rl_idx); +schedAQLProcessing(rl_idx); +} +} + +// If set, then blocked schedule, so need to reschedule if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt)) updateReadIndex(0, rl_idx); DPRINTF(HSAPacketProcessor, diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh index 206d9ab..3ff7ad2 100644 --- a/src/dev/hsa/hsa_packet_processor.hh +++ b/src/dev/hsa/hsa_packet_processor.hh @@ -168,11 +168,13 @@ typedef struct QueueContext { HSAQueueDescriptor* qDesc; 
AQLRingBuffer* aqlBuf; +// used for HSA packets that enforce synchronization with barrier bit +bool barrierBit; QueueContext(HSAQueueDescriptor* q_desc, AQLRingBuffer* aql_buf) - : qDesc(q_desc), aqlBuf(aql_buf) + : qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false) {} -QueueContext() : qDesc(NULL), aqlBuf(NULL) {} +QueueContext() : qDesc(NULL), aqlBuf(NULL), barrierBit(false) {} } QCntxt; class HSAPacketPr
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Remove unused function hostWakeUp from shader
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29929 ) Change subject: gpu-compute: Remove unused function hostWakeUp from shader .. gpu-compute: Remove unused function hostWakeUp from shader Change-Id: Ib4415a7c5918da03bbd16fe9adb4dd593dcaa95c Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29929 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/gpu-compute/shader.cc M src/gpu-compute/shader.hh 2 files changed, 0 insertions(+), 14 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index aa7a6dd..f5e9444 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -153,19 +153,6 @@ assert(gpuTc); } -void -Shader::hostWakeUp(BaseCPU *cpu) { -if (cpuPointer == cpu) { -if (gpuTc->status() == ThreadContext::Suspended) -cpu->activateContext(gpuTc->threadId()); -} else { -//Make sure both dispatcher and shader are trying to -//wakeup same host. 
Hack here to enable kernel launch -//from multiple CPUs -panic("Dispatcher wants to wakeup a different host"); -} -} - Shader* ShaderParams::create() { diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index eeaf343..238f6e0 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -301,7 +301,6 @@ Addr mmap(int length); void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); void updateContext(int cid); -void hostWakeUp(BaseCPU *cpu); void notifyCuSleep(); }; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29929 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ib4415a7c5918da03bbd16fe9adb4dd593dcaa95c Gerrit-Change-Number: 29929 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Updating implementation of atomics
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29926 ) Change subject: arch-gcn3: Updating implementation of atomics .. arch-gcn3: Updating implementation of atomics This changeset is moving the access of the data operand from initiateAcc to the execute method of atomic instructions. Change-Id: I1debae302f0b13f79ed2b7a9ed2f6b07fcec5128 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29926 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 45 insertions(+), 52 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 26af241..32719ad 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -39261,11 +39261,24 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); +ConstVecOperandU32 data(gpuDynInst, extData.DATA); +ConstVecOperandU32 cmp(gpuDynInst, extData.DATA + 1); addr.read(); +data.read(); +cmp.read(); calcAddr(gpuDynInst, addr); +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +(reinterpret_cast(gpuDynInst->x_data))[lane] += data[lane]; +(reinterpret_cast(gpuDynInst->a_data))[lane] += cmp[lane]; +} +} + if (gpuDynInst->executedAs() == Enums::SC_GLOBAL || gpuDynInst->executedAs() == Enums::SC_PRIVATE) { /** @@ -39293,21 +39306,6 @@ void Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstVecOperandU32 data(gpuDynInst, extData.DATA); -ConstVecOperandU32 cmp(gpuDynInst, extData.DATA + 1); - -data.read(); -cmp.read(); - -for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { -if (gpuDynInst->exec_mask[lane]) { -(reinterpret_cast(gpuDynInst->x_data))[lane] -= data[lane]; 
-(reinterpret_cast(gpuDynInst->a_data))[lane] -= cmp[lane]; -} -} - initAtomicAccess(gpuDynInst); } // initiateAcc @@ -39364,11 +39362,20 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); +ConstVecOperandU32 data(gpuDynInst, extData.DATA); addr.read(); +data.read(); calcAddr(gpuDynInst, addr); +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +(reinterpret_cast(gpuDynInst->a_data))[lane] += data[lane]; +} +} + if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); @@ -39387,17 +39394,6 @@ void Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstVecOperandU32 data(gpuDynInst, extData.DATA); - -data.read(); - -for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { -if (gpuDynInst->exec_mask[lane]) { -(reinterpret_cast(gpuDynInst->a_data))[lane] -= data[lane]; -} -} - initAtomicAccess(gpuDynInst); } // initiateAcc @@ -39733,11 +39729,24 @@ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); +ConstVecOperandU64 data(gpuDynInst, extData.DATA); +ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2); addr.read(); +data.read(); +cmp.read(); calcAddr(gpuDynInst, addr); +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +(reinterpret_cast(gpuDynInst->x_data))[lane] += data[lane]; +(reinterpret_cast(gpuDynInst->a_data))[lane] += cmp[lane]; +} +} + if (gpuDynInst->executedAs() == Enums::SC_GLOBAL || gpuDynInst->executedAs() == Enums::SC_PRIVATE) { /** @@ -39765,21 +39774,6 @@ void Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) { -ConstVecOperandU64 data(gpuDynInst, extData.DATA); -ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2); - -data.read(); -cmp.read(); - -for (int lane = 0; lane < NumVecElemPer
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement instruction v_div_fixup_f32
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29924 ) Change subject: arch-gcn3: Implement instruction v_div_fixup_f32 .. arch-gcn3: Implement instruction v_div_fixup_f32 Instruction v_div_fixup_f32 was unimplemented. The implementation was added by mimicking v_div_fixup_f64. Change-Id: I9306b198f327e9fde3414aa1bb2bec20503b1efd Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29924 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Reviewed-by: Xianwei Zhang Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 59 insertions(+), 3 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve Xianwei Zhang: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 308fd5d..26af241 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -28671,9 +28671,65 @@ void Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst) { -// Could not parse sq_uc.arch desc field -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); +ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); +ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); +VecOperandF32 vdst(gpuDynInst, instData.VDST); + +src0.readSrc(); +src1.readSrc(); +src2.readSrc(); + +if (instData.ABS & 0x1) { +src0.absModifier(); +} + +if (instData.ABS & 0x2) { +src1.absModifier(); +} + +if (instData.ABS & 0x4) { +src2.absModifier(); +} + +if (extData.NEG & 0x1) { +src0.negModifier(); +} + +if (extData.NEG & 0x2) { +src1.negModifier(); +} + +if (extData.NEG & 0x4) { +src2.negModifier(); +} + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +if (std::fpclassify(src1[lane]) == 
FP_ZERO) { +if (std::signbit(src1[lane])) { +vdst[lane] = -INFINITY; +} else { +vdst[lane] = +INFINITY; +} +} else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) { +vdst[lane] = NAN; +} else if (std::isinf(src1[lane])) { +if (std::signbit(src1[lane])) { +vdst[lane] = -INFINITY; +} else { +vdst[lane] = +INFINITY; +} +} else { +vdst[lane] = src2[lane] / src1[lane]; +} +} +} + +vdst.write(); +} // execute +// --- Inst_VOP3__V_DIV_FIXUP_F64 class methods --- Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3 *iFmt) : Inst_VOP3(iFmt, "v_div_fixup_f64", false) -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29924 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I9306b198f327e9fde3414aa1bb2bec20503b1efd Gerrit-Change-Number: 29924 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement instruction v_div_fmas_f32
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29923 ) Change subject: arch-gcn3: Implement instruction v_div_fmas_f32 .. arch-gcn3: Implement instruction v_div_fmas_f32 Instruction v_div_fmas_f32 was unimplemented. The implementation was added by mimicking v_div_fmas_f64. Change-Id: I262820a7a66877d140eb99b538715c3cae4d1860 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29923 Reviewed-by: Anthony Gutierrez Reviewed-by: Xianwei Zhang Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 43 insertions(+), 2 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Xianwei Zhang: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 2789f3e..308fd5d 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -28879,8 +28879,49 @@ void Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); +ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); +ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); +VecOperandF64 vdst(gpuDynInst, instData.VDST); + +src0.readSrc(); +src1.readSrc(); +src2.readSrc(); + +if (instData.ABS & 0x1) { +src0.absModifier(); +} + +if (instData.ABS & 0x2) { +src1.absModifier(); +} + +if (instData.ABS & 0x4) { +src2.absModifier(); +} + +if (extData.NEG & 0x1) { +src0.negModifier(); +} + +if (extData.NEG & 0x2) { +src1.negModifier(); +} + +if (extData.NEG & 0x4) { +src2.negModifier(); +} + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); +} +} + +//vdst.write(); +} // execute +// --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- 
Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3 *iFmt) : Inst_VOP3(iFmt, "v_div_fmas_f64", false) -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29923 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I262820a7a66877d140eb99b538715c3cae4d1860 Gerrit-Change-Number: 29923 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fix bug with SDWA support
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29922 ) Change subject: arch-gcn3: fix bug with SDWA support .. arch-gcn3: fix bug with SDWA support Instructions that use the SDWA field need to use the extra SRC0 register associated with the SDWA instruction instead of the "default" SRC0 register, since the default SRC0 register contains the SDWA information when SDWA is being used. This commit fixes 15de044c to take this into account. Additionally, this commit removes reads of the registers from the SDWA helper functions, since they overwrite any changes made to the destination register. Finally, this change modifies the instructions that use SDWA to simplify the flow through the execute() functions. Change-Id: I3bad83133808dfffc6a4c40bbd49c3d76599e669 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29922 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/inst_util.hh M src/arch/gcn3/insts/instructions.cc 2 files changed, 133 insertions(+), 110 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh index a3b2f4a..292e3ba 100644 --- a/src/arch/gcn3/insts/inst_util.hh +++ b/src/arch/gcn3/insts/inst_util.hh @@ -547,8 +547,8 @@ * operations are done on it. */ template -T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel, - bool signExt) +T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, + const SDWASelVals sel, const bool signExt) { // local variables int first_bit = 0, last_bit = 0; @@ -635,16 +635,14 @@ * 2. 
if sign extend is set, then sign extend the value */ template -void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel, - bool signExt) +void sdwaInstSrcImpl(T & currOper, T & origCurrOper, + const SDWASelVals sel, const bool signExt) { // iterate over all lanes, setting appropriate, selected value -currOper.read(); -origCurrOper.read(); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane], - origCurrOper[lane], sel, - signExt); +origCurrOper[lane], sel, +signExt); } } @@ -656,8 +654,9 @@ * operations are done on it. */ template -T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp, - SDWASelVals sel, SDWADstVals unusedBits_format) +T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, + const bool clamp, const SDWASelVals sel, + const SDWADstVals unusedBits_format) { // local variables int first_bit = 0, last_bit = 0; @@ -756,12 +755,11 @@ * 2 (SDWA_UNUSED_PRESERVE): select data[31:0] */ template -void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp, - SDWASelVals sel, SDWADstVals unusedBits_format) +void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp, + const SDWASelVals sel, + const SDWADstVals unusedBits_format) { // iterate over all lanes, setting appropriate, selected value -dstOper.read(); -origDstOper.read(); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane], origDstOper[lane], clamp, @@ -779,8 +777,9 @@ */ template void processSDWA_src_helper(T & currSrc, T & origCurrSrc, -SDWASelVals src_sel, bool src_signExt, -bool src_abs, bool src_neg) +const SDWASelVals src_sel, +const bool src_signExt, const bool src_abs, +const bool src_neg) { /** * STEP 1: check if the absolute value (ABS) or negation (NEG) tags @@ -812,14 +811,13 @@ * processSDWA_src is called before the math. 
*/ template -void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, - T & src0, T & origSrc0) +void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & or
[gem5-dev] Change in gem5/gem5[develop]: tests: remove deprecated hsail gpu_hello
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29921 ) Change subject: tests: remove deprecated hsail gpu_hello .. tests: remove deprecated hsail gpu_hello Change-Id: I7e15075e7805af732e89c3269fdff9d65a144219 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29921 Reviewed-by: Anthony Gutierrez Reviewed-by: Jason Lowe-Power Reviewed-by: Matt Sinclair Reviewed-by: Xianwei Zhang Maintainer: Anthony Gutierrez Maintainer: Jason Lowe-Power Tested-by: kokoro --- D tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello D tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm D tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl D tests/test-progs/gpu-hello/src/gpu-hello.cpp 4 files changed, 0 insertions(+), 420 deletions(-) Approvals: Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, but someone else must approve Xianwei Zhang: Looks good to me, approved kokoro: Regressions pass diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello deleted file mode 100755 index de248ee..000 --- a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello +++ /dev/null Binary files differ diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm deleted file mode 100644 index a4ad144..000 --- a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +++ /dev/null Binary files differ diff --git a/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl b/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl deleted file mode 100755 index 496f9b5..000 --- a/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. - * All rights reserved. 
- * - * For use for simulation and test purposes only - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * Author: Marc Orr - */ - - -__kernel void read_kernel(size_t code_size, - __global char *code_in, - __global int *key_arr, - __global char *msg_out, - __global int *chars_decoded) -{ -size_t gid = get_global_id(0); -size_t my_idx = gid % code_size; -bool decode = 0; -__local atomic_int lcount; - -if (get_local_id(0) == 0) { -lcount=0; -} -barrier(CLK_LOCAL_MEM_FENCE); - -// read code -char mycode = code_in[my_idx]; - -// decode -int my_key = key_arr[my_idx]; -if (my_key) { -decode = 1; -for (int n = 0; n < my_key; n++) { -mycode++; -} -} - -// write out msg -msg_out[gid] = mycode; - -if (decode) { -atomic_fetch_add((atomic_int *)(&lcount), 1); -} -barrier(CLK_LOCAL_MEM_FENCE); - - -if (get_local_id(0) == 0) { -int _lcount = atomic_load(&lcount); -atomic_fetch_add((atomic_int *)chars_decoded, _lcount); -} -} diff --git a/tests/test-progs/gpu-hello/src/gpu-hello.cpp b/tests/test-progs/gpu-hello/src/gpu-hello.cpp deleted file mode 100755 index bdff074..000 --- a/tests/test-progs/gpu-hello/src/gpu-hello.cpp +++ /dev/null @@ -1,342 +0,0 @@ -/*
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support for unaligned accesses
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29920 ) Change subject: arch-gcn3: add support for unaligned accesses .. arch-gcn3: add support for unaligned accesses Previously, with HSAIL, we were guaranteed by the HSA specification that the GPU will never issue unaligned accesses. However, now that we are directly running GCN this is no longer true. Accordingly, this commit adds support for unaligned accesses. Moreover, to reduce the replication of nearly identical code for the different request types, I also added new helper functions that are called by all the different memory request producing instruction types in op_encodings.hh. Adding support for unaligned instructions requires changing the statusBitVector used to track the status of the memory requests for each lane from a bit per lane to an int per lane. This is necessary because an unaligned access may span multiple cache lines. In the worst case, each lane may span multiple cache lines. There are corresponding changes in the files that use the statusBitVector. 
Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29920 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez Tested-by: kokoro --- A src/arch/gcn3/gpu_mem_helpers.hh M src/arch/gcn3/insts/op_encodings.hh M src/gpu-compute/compute_unit.cc M src/gpu-compute/gpu_dyn_inst.cc M src/gpu-compute/gpu_dyn_inst.hh M src/mem/ruby/common/DataBlock.cc M src/mem/ruby/system/RubyPort.cc 7 files changed, 298 insertions(+), 242 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/gpu_mem_helpers.hh b/src/arch/gcn3/gpu_mem_helpers.hh new file mode 100644 index 000..40ca565 --- /dev/null +++ b/src/arch/gcn3/gpu_mem_helpers.hh @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Matt Sinclair + */ + +#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__ +#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__ + +#include "arch/gcn3/insts/gpu_static_inst.hh" +#include "arch/gcn3/insts/op_encodings.hh" +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_dyn_inst.hh" + +/** + * Helper function for instructions declared in op_encodings. This function + * takes in all of the arguments for a given memory request we are trying to + * initialize, then submits the request or requests depending on if the + * original request is aligned or unaligned. + */ +template +inline void +initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type, + bool is_atomic=false) +{ +// local variables +int req_size = N * sizeof(T); +int block_size = gpuDynInst->computeUnit()->cacheLineSize(); +Addr vaddr = 0, split_addr = 0; +bool misaligned_acc = false; +RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr; +PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr; + +gpuDynInst->resetEntireStatusVector(); +for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +vaddr = gpuDynInst->addr[lane]; + +
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement instruction v_div_scale_f32
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29919 ) Change subject: arch-gcn3: Implement instruction v_div_scale_f32 .. arch-gcn3: Implement instruction v_div_scale_f32 Instruction v_div_scale_f32 was unimplemented, the implementation was added by mimicking v_div_scale_f64. Change-Id: I89cdfd02ab01b5936de0e9f6c41e7f3fc4f10ae1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29919 Reviewed-by: Anthony Gutierrez Reviewed-by: Xianwei Zhang Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 34 insertions(+), 2 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Xianwei Zhang: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 8d63296..bd6e4f4 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -28746,8 +28746,40 @@ void Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); +ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); +ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); +ScalarOperandU64 vcc(gpuDynInst, instData.SDST); +VecOperandF32 vdst(gpuDynInst, instData.VDST); + +src0.readSrc(); +src1.readSrc(); +src2.readSrc(); + +if (extData.NEG & 0x1) { +src0.negModifier(); +} + +if (extData.NEG & 0x2) { +src1.negModifier(); +} + +if (extData.NEG & 0x4) { +src2.negModifier(); +} + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +vdst[lane] = src0[lane]; +vcc.setBit(lane, 0); +} +} + +vcc.write(); +vdst.write(); +} // execute +// --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( InFmt_VOP3_SDST_ENC *iFmt) -- To view, visit 
https://gem5-review.googlesource.com/c/public/gem5/+/29919 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I89cdfd02ab01b5936de0e9f6c41e7f3fc4f10ae1 Gerrit-Change-Number: 29919 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: config: fix settings of kernel boundary sync flags
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29918 ) Change subject: config: fix settings of kernel boundary sync flags .. config: fix settings of kernel boundary sync flags Change-Id: I58a8edc5d324bdcaa84e3d715e2712a43e8ede0d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29918 Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Reviewed-by: Xianwei Zhang Maintainer: Anthony Gutierrez Tested-by: kokoro --- M configs/example/apu_se.py 1 file changed, 15 insertions(+), 5 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved Xianwei Zhang: Looks good to me, approved kokoro: Regressions pass diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index fee85f0..0ff80d8 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -209,13 +209,23 @@ # So, all GPU protocols other than GPU_RfO should make their writes # visible to the global memory and should read from global memory # during kernal boundary. The pipeline initiates(or do not initiate) -# the acquire/release operation depending on this impl_kern_boundary_sync -# flag. This flag=true means pipeline initiates a acquire/release operation -# at kernel boundary. +# the acquire/release operation depending on these impl_kern_launch_rel +# and impl_kern_end_rel flags. The flag=true means pipeline initiates +# a acquire/release operation at kernel launch/end. +# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline) +# are write-through based, and thus only imple_kern_launch_acq needs to +# set. 
if buildEnv['PROTOCOL'] == 'GPU_RfO': -shader.impl_kern_boundary_sync = False +shader.impl_kern_launch_acq = False +shader.impl_kern_end_rel = False +elif (buildEnv['PROTOCOL'] != 'GPU_VIPER' or +buildEnv['PROTOCOL'] != 'GPU_VIPER_Region' or +buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline'): +shader.impl_kern_launch_acq = True +shader.impl_kern_end_rel = False else: -shader.impl_kern_boundary_sync = True +shader.impl_kern_launch_acq = True +shader.impl_kern_end_rel = True # Switching off per-lane TLB by default per_lane = False -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29918 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I58a8edc5d324bdcaa84e3d715e2712a43e8ede0d Gerrit-Change-Number: 29918 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: enable flexible control of kernel boundary syncs
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29917 ) Change subject: gpu-compute: enable flexible control of kernel boundary syncs .. gpu-compute: enable flexible control of kernel boundary syncs Kernel end release was turned on for VIPER protocol, which is in fact write-through based and thus no need to have release operation. This changeset splits the option 'impl_kern_boundary_sync' into 'impl_kern_launch_acq' and 'impl_kern_end_rel', and turns off release on VIPER. Change-Id: I5490019b6765a25bd801cc78fb7445b90eb02a3d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29917 Reviewed-by: Anthony Gutierrez Reviewed-by: Xianwei Zhang Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/arch/gcn3/insts/instructions.cc M src/gpu-compute/GPU.py M src/gpu-compute/dispatcher.cc M src/gpu-compute/shader.cc M src/gpu-compute/shader.hh 5 files changed, 20 insertions(+), 11 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Xianwei Zhang: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 7578694..8d63296 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -3759,9 +3759,13 @@ // the last workgroup in the kernel). 
bool kernelEnd = wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf); +// further check whether 'release @ kernel end' is needed +bool relNeeded = +wf->computeUnit->shader->impl_kern_end_rel; -// if it is not a kernel end, then retire the workgroup directly -if (!kernelEnd) { +// if not a kernel end or no release needed, retire the workgroup +// directly +if (!kernelEnd || !relNeeded) { wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); wf->setStatus(Wavefront::S_STOPPED); wf->computeUnit->completedWGs++; @@ -3770,8 +3774,8 @@ } /** - * If it is a kernel end, inject a memory sync and retire the - * workgroup after receving response. + * If a kernel end and release needed, inject a memory sync and + * retire the workgroup after receving all acks. */ setFlag(MemSync); setFlag(GlobalSegment); diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 6b033f4..8a2ad81 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -213,8 +213,10 @@ gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU') dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher') n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') -impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into - ruby at kernel boundaries""") +impl_kern_launch_acq = Param.Bool(True, """Insert acq packet into + ruby at kernel launch""") +impl_kern_end_rel = Param.Bool(False, """Insert rel packet into + ruby at kernel end""") globalmem = Param.MemorySize('64kB', 'Memory size') timing = Param.Bool(False, 'timing memory accesses') diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index 51f5e97..6a8242f 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -166,12 +166,12 @@ auto task = hsaQueueEntries[exec_id]; bool launched(false); -// invalidate is needed before starting dispatch -if (shader->impl_kern_boundary_sync) { +// acq is needed before starting dispatch +if 
(shader->impl_kern_launch_acq) { // try to invalidate cache shader->prepareInvalidate(task); } else { -// kern boundary sync is not set, skip invalidate +// kern launch acquire is not set, skip invalidate task->markInvDone(); } diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 4be2fbf..aa7a6dd 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -56,7 +56,8 @@ tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event", false, Event::CPU_Tick_Pri), timingSim(p->timing), hsail_mode(SIMT), -impl_kern_boundary_sync(p->impl_kern_boundary_sync), +impl_kern_launch_acq(p->impl_kern_launch_acq), +impl_kern_end_rel(p->impl_kern_end_rel), coissue_return(1), trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), globalMemSize(p->globalmem), diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 72063a4..eeaf343 10
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: remove recvToken from GM pipe exec
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29916 ) Change subject: gpu-compute: remove recvToken from GM pipe exec .. gpu-compute: remove recvToken from GM pipe exec Tokens were previously acquired in GM pipe exec but has been moved to acqCoalescerToken. This removes the extraneous code which was acquiring tokens twice, causing them to be depleted and triggering an assertion. Change-Id: Ic92de8f06cc85828b29c69790bdadde057ef1777 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29916 Reviewed-by: Anthony Gutierrez Reviewed-by: Matthew Poremba Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/gpu-compute/global_memory_pipeline.cc 1 file changed, 0 insertions(+), 6 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matthew Poremba: Looks good to me, approved kokoro: Regressions pass diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 0bbacc4..c73184a 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -190,12 +190,6 @@ DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n", mp->disassemble(), mp->seqNum()); -// Memfences will not return tokens and must be issued so we should -// not request one as this will deplete the token count until deadlock -if (!mp->isMemSync()) { -assert(mp->computeUnit()->getTokenManager()->haveTokens(1)); -mp->computeUnit()->getTokenManager()->acquireTokens(1); -} mp->initiateAcc(mp); if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) { -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29916 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ic92de8f06cc85828b29c69790bdadde057ef1777 Gerrit-Change-Number: 29916 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony
Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Matthew Poremba Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: Add DMA support to MOESI_AMD_Base-dir.sm
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29914 ) Change subject: mem-ruby: Add DMA support to MOESI_AMD_Base-dir.sm .. mem-ruby: Add DMA support to MOESI_AMD_Base-dir.sm This change adds DMA support to the MOESI_AMD_Base-dir.sm, which is needed to support ROCm apps/GCN3 ISA in the VIPER ptl. The DMA controller is copied from the MOESI_hammer-dma.sm with few modifications. Change-Id: I56141436eee1c8f62c2a0915fa3b63b83bbcbc9a Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29914 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/mem/ruby/protocol/GPU_VIPER.slicc M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm A src/mem/ruby/protocol/MOESI_AMD_Base-dma.sm M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm 4 files changed, 499 insertions(+), 6 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/mem/ruby/protocol/GPU_VIPER.slicc b/src/mem/ruby/protocol/GPU_VIPER.slicc index 55ed671..196058b 100644 --- a/src/mem/ruby/protocol/GPU_VIPER.slicc +++ b/src/mem/ruby/protocol/GPU_VIPER.slicc @@ -2,6 +2,7 @@ include "RubySlicc_interfaces.slicc"; include "MOESI_AMD_Base-msg.sm"; include "MOESI_AMD_Base-dir.sm"; +include "MOESI_AMD_Base-dma.sm"; include "MOESI_AMD_Base-CorePair.sm"; include "GPU_VIPER-msg.sm"; include "GPU_VIPER-TCP.sm"; diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index efbffbd..c8dafd5 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -42,6 +42,10 @@ bool useL3OnWT := "False"; Cycles to_memory_controller_latency := 1; + // DMA + MessageBuffer * requestFromDMA, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToDMA, network="To", virtual_network="3", vnet_type="request"; + // From the Cores MessageBuffer * 
requestFromCores, network="From", virtual_network="0", vnet_type="request"; MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response"; @@ -63,13 +67,17 @@ // BL is Busy because it's possible for the data only to be in the network // in the WB, L3 has sent it and gone on with its business in possibly I // state. +BDR_M, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for memory"; BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; +BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes and memory"; BS_PM, AccessPermission:Backing_Store,desc="blocked waiting for probes and Memory"; BM_PM, AccessPermission:Backing_Store,desc="blocked waiting for probes and Memory"; B_PM, AccessPermission:Backing_Store,desc="blocked waiting for probes and Memory"; +BDW_P, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes, no need for memory"; +BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes, already got memory"; BS_Pm, AccessPermission:Backing_Store,desc="blocked waiting for probes, already got memory"; BM_Pm, AccessPermission:Backing_Store,desc="blocked waiting for probes, already got memory"; B_Pm, AccessPermission:Backing_Store,desc="blocked waiting for probes, already got memory"; @@ -107,6 +115,10 @@ UnblockWriteThrough,desc="Unblock because of writethrough request finishing"; StaleVicDirty,desc="Core invalidated before VicDirty processed"; + +// DMA +DmaRead,desc="DMA read"; +DmaWrite, desc="DMA write"; } enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { @@ -148,6 +160,7 @@ bool L3Hit, default="false", desc="Was this an L3 hit?"; uint64_t 
probe_id,desc="probe id for lifetime profiling"; WriteMask writeMask,desc="outstanding write through mask"; +int Len,desc="Length of memory request for DMA"; } structure(TBETable, external="yes") { @@ -266,6 +279,8 @@ } // ** OUT_PORTS ** + out_port(dmaResponseQueue_out, DMAResponseMsg, responseToDMA); + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); out_port(responseNetwork_out, ResponseMsg, responseToCore); @@ -276,6 +291,23 @@ // ** IN_PORTS ** + // DMA Ports +
[gem5-dev] Change in gem5/gem5[develop]: arch, gpu-compute: Remove HSAIL related files
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/28410 ) Change subject: arch, gpu-compute: Remove HSAIL related files .. arch, gpu-compute: Remove HSAIL related files Change-Id: Iefba0a38d62da7598bbfe3fe6ff46454d35144b1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28410 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M MAINTAINERS M SConstruct D build_opts/HSAIL_X86 M src/arch/gcn3/SConscript D src/arch/hsail/Brig.h D src/arch/hsail/Brig_new.hpp D src/arch/hsail/SConscript D src/arch/hsail/SConsopts D src/arch/hsail/gen.py D src/arch/hsail/gpu_decoder.hh D src/arch/hsail/gpu_isa.hh D src/arch/hsail/gpu_types.hh D src/arch/hsail/insts/branch.cc D src/arch/hsail/insts/branch.hh D src/arch/hsail/insts/decl.hh D src/arch/hsail/insts/gpu_static_inst.cc D src/arch/hsail/insts/gpu_static_inst.hh D src/arch/hsail/insts/main.cc D src/arch/hsail/insts/mem.cc D src/arch/hsail/insts/mem.hh D src/arch/hsail/insts/mem_impl.hh D src/arch/hsail/insts/pseudo_inst.cc D src/arch/hsail/operand.cc D src/arch/hsail/operand.hh D src/gpu-compute/brig_object.cc D src/gpu-compute/brig_object.hh D src/gpu-compute/cl_driver.cc D src/gpu-compute/cl_driver.hh D src/gpu-compute/cl_event.hh D src/gpu-compute/condition_register_state.cc D src/gpu-compute/condition_register_state.hh D src/gpu-compute/hsa_code.hh D src/gpu-compute/hsa_kernel_info.hh D src/gpu-compute/hsa_object.cc D src/gpu-compute/hsa_object.hh D src/gpu-compute/hsail_code.cc D src/gpu-compute/hsail_code.hh D src/gpu-compute/kernel_cfg.cc D src/gpu-compute/kernel_cfg.hh D src/gpu-compute/ndrange.hh D src/gpu-compute/qstruct.hh D src/gpu-compute/vector_register_state.cc D src/gpu-compute/vector_register_state.hh M util/git-commit-msg.py M util/regress 45 files changed, 6 insertions(+), 12,854 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass -- To view, 
visit https://gem5-review.googlesource.com/c/public/gem5/+/28410 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Iefba0a38d62da7598bbfe3fe6ff46454d35144b1 Gerrit-Change-Number: 28410 Gerrit-PatchSet: 10 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Alexandru Duțu Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Bradford Beckmann Gerrit-Reviewer: Gabe Black Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Matthew Poremba Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29912 ) Change subject: gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model .. gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912 Reviewed-by: Anthony Gutierrez Reviewed-by: Jason Lowe-Power Maintainer: Anthony Gutierrez Tested-by: kokoro --- A build_opts/GCN3_X86 M configs/common/GPUTLBConfig.py M src/arch/gcn3/insts/instructions.cc M src/arch/gcn3/insts/op_encodings.hh M src/arch/gcn3/operand.hh M src/dev/hsa/hsa_device.cc M src/dev/hsa/hsa_driver.cc M src/dev/hsa/hsa_driver.hh M src/dev/hsa/hsa_packet_processor.cc M src/dev/hsa/hw_scheduler.cc M src/gpu-compute/GPU.py M src/gpu-compute/GPUStaticInstFlags.py M src/gpu-compute/SConscript M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/dispatcher.cc M src/gpu-compute/dispatcher.hh M src/gpu-compute/exec_stage.cc M src/gpu-compute/exec_stage.hh M src/gpu-compute/fetch_stage.cc M src/gpu-compute/fetch_stage.hh M src/gpu-compute/fetch_unit.cc M src/gpu-compute/fetch_unit.hh M src/gpu-compute/global_memory_pipeline.cc M src/gpu-compute/global_memory_pipeline.hh A src/gpu-compute/gpu_command_processor.cc A src/gpu-compute/gpu_command_processor.hh A src/gpu-compute/gpu_compute_driver.cc A src/gpu-compute/gpu_compute_driver.hh M src/gpu-compute/gpu_dyn_inst.cc M src/gpu-compute/gpu_dyn_inst.hh M src/gpu-compute/gpu_exec_context.cc M src/gpu-compute/gpu_static_inst.cc M src/gpu-compute/gpu_static_inst.hh M src/gpu-compute/gpu_tlb.cc M src/gpu-compute/gpu_tlb.hh A src/gpu-compute/hsa_queue_entry.hh A src/gpu-compute/kernel_code.hh M src/gpu-compute/lds_state.cc M src/gpu-compute/lds_state.hh M src/gpu-compute/local_memory_pipeline.cc M src/gpu-compute/local_memory_pipeline.hh M src/gpu-compute/misc.hh M src/gpu-compute/pool_manager.cc M 
src/gpu-compute/pool_manager.hh A src/gpu-compute/register_file.cc A src/gpu-compute/register_file.hh A src/gpu-compute/register_manager.cc A src/gpu-compute/register_manager.hh A src/gpu-compute/register_manager_policy.hh M src/gpu-compute/rr_scheduling_policy.hh A src/gpu-compute/scalar_memory_pipeline.cc A src/gpu-compute/scalar_memory_pipeline.hh A src/gpu-compute/scalar_register_file.cc A src/gpu-compute/scalar_register_file.hh M src/gpu-compute/schedule_stage.cc M src/gpu-compute/schedule_stage.hh M src/gpu-compute/scoreboard_check_stage.cc M src/gpu-compute/scoreboard_check_stage.hh M src/gpu-compute/shader.cc M src/gpu-compute/shader.hh M src/gpu-compute/simple_pool_manager.cc M src/gpu-compute/simple_pool_manager.hh A src/gpu-compute/static_register_manager_policy.cc A src/gpu-compute/static_register_manager_policy.hh M src/gpu-compute/tlb_coalescer.cc M src/gpu-compute/tlb_coalescer.hh M src/gpu-compute/vector_register_file.cc M src/gpu-compute/vector_register_file.hh M src/gpu-compute/wavefront.cc M src/gpu-compute/wavefront.hh M src/mem/packet.cc M src/mem/packet.hh M src/mem/ruby/protocol/GPU_VIPER-TCP.sm A src/mem/ruby/protocol/GPU_VIPER-msg.sm M src/mem/ruby/protocol/GPU_VIPER.slicc M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm M src/mem/ruby/protocol/RubySlicc_Exports.sm M src/mem/ruby/protocol/RubySlicc_Types.sm M src/mem/ruby/slicc_interface/AbstractController.cc M src/mem/ruby/slicc_interface/RubyRequest.hh M src/mem/ruby/system/GPUCoalescer.cc M src/mem/ruby/system/GPUCoalescer.hh M src/mem/ruby/system/GPUCoalescer.py M src/mem/ruby/system/VIPERCoalescer.hh M src/mem/ruby/system/VIPERCoalescer.py 86 files changed, 10,299 insertions(+), 3,734 deletions(-) Approvals: Jason Lowe-Power: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29912 To unsubscribe, or for help writing mail filters, visit 
https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254 Gerrit-Change-Number: 29912 Gerrit-PatchSet: 8 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Bradford Beckmann Gerrit-Reviewer: Gabe Black Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Matthew Poremba Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: Add codes for pure virtual functions for compilation
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/28409 ) Change subject: mem-ruby: Add codes for pure virtual functions for compilation .. mem-ruby: Add codes for pure virtual functions for compilation Change-Id: Ic34f9ccf10ec28d68eed236dc6246e2ae2ef1b89 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28409 Tested-by: kokoro Reviewed-by: Anthony Gutierrez Reviewed-by: Matt Sinclair Maintainer: Anthony Gutierrez --- M src/mem/ruby/system/VIPERCoalescer.cc M src/mem/ruby/system/VIPERCoalescer.hh 2 files changed, 13 insertions(+), 0 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved Matt Sinclair: Looks good to me, approved kokoro: Regressions pass diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index d8977ac..cdef2b1 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -76,6 +76,16 @@ { } +void +VIPERCoalescer::issueRequest(CoalescedRequest* crequest) +{ +} + +void +VIPERCoalescer::issueMemSyncRequest(PacketPtr pkt) +{ +} + // Places an uncoalesced packet in uncoalescedTable. If the packet is a // special type (MemFence, scoping, etc), it is issued immediately. 
RequestStatus diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index 2b6e86e..814166d 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -57,6 +57,9 @@ typedef VIPERCoalescerParams Params; VIPERCoalescer(const Params *); ~VIPERCoalescer(); + +void issueMemSyncRequest(PacketPtr pkt); +void issueRequest(CoalescedRequest* crequest) override; void wbCallback(Addr address); void invCallback(Addr address); RequestStatus makeRequest(PacketPtr pkt); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28409 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ic34f9ccf10ec28d68eed236dc6246e2ae2ef1b89 Gerrit-Change-Number: 28409 Gerrit-PatchSet: 6 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Alexandru Duțu Gerrit-Reviewer: Anthony Gutierrez Gerrit-Reviewer: Bradford Beckmann Gerrit-Reviewer: Jason Lowe-Power Gerrit-Reviewer: Matt Sinclair Gerrit-Reviewer: Matthew Poremba Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Tuan Ta Gerrit-Reviewer: Xianwei Zhang Gerrit-Reviewer: kokoro Gerrit-MessageType: merged ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: update memory interfaces to support GPU ISA
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/28408 ) Change subject: mem-ruby: update memory interfaces to support GPU ISA .. mem-ruby: update memory interfaces to support GPU ISA This patch deprecates HSA-based memory request types and adds new types that can be used by real ISA instructions. Change-Id: Ie107a69d8a35e9de0853f1407392ad01a8b3e930 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28408 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- M src/mem/packet.cc M src/mem/packet.hh M src/mem/request.hh M src/mem/ruby/slicc_interface/RubyRequest.hh 4 files changed, 45 insertions(+), 131 deletions(-) Approvals: Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 2d69ba2..1c1da21 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -181,6 +181,10 @@ { 0, InvalidCmd, "Deprecated_MessageResp" }, /* MemFenceReq -- for synchronization requests */ {SET2(IsRequest, NeedsResponse), MemFenceResp, "MemFenceReq"}, +/* MemSyncReq */ +{SET2(IsRequest, NeedsResponse), MemSyncResp, "MemSyncReq"}, +/* MemSyncResp */ +{SET1(IsResponse), InvalidCmd, "MemSyncResp"}, /* MemFenceResp -- for synchronization responses */ {SET1(IsResponse), InvalidCmd, "MemFenceResp"}, /* Cache Clean Request -- Update with the latest data all existing diff --git a/src/mem/packet.hh b/src/mem/packet.hh index d390c00..42d286a 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -110,6 +110,8 @@ SwapResp, // MessageReq and MessageResp are deprecated. 
MemFenceReq = SwapResp + 3, +MemSyncReq, // memory synchronization request (e.g., cache invalidate) +MemSyncResp, // memory synchronization response MemFenceResp, CleanSharedReq, CleanSharedResp, diff --git a/src/mem/request.hh b/src/mem/request.hh index 01252bf..4e0ba97 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -110,7 +110,7 @@ * STRICT_ORDER flag should be set if such reordering is * undesirable. */ -UNCACHEABLE= 0x0400, +UNCACHEABLE = 0x0400, /** * The request is required to be strictly ordered by CPU * models and is non-speculative. @@ -216,35 +216,30 @@ }; /** @} */ -typedef uint32_t MemSpaceConfigFlagsType; -typedef ::Flags MemSpaceConfigFlags; +typedef uint64_t CacheCoherenceFlagsType; +typedef ::Flags CacheCoherenceFlags; -enum : MemSpaceConfigFlagsType { -/** Has a synchronization scope been set? */ -SCOPE_VALID= 0x0001, -/** Access has Wavefront scope visibility */ -WAVEFRONT_SCOPE= 0x0002, -/** Access has Workgroup scope visibility */ -WORKGROUP_SCOPE= 0x0004, -/** Access has Device (e.g., GPU) scope visibility */ -DEVICE_SCOPE = 0x0008, -/** Access has System (e.g., CPU + GPU) scope visibility */ -SYSTEM_SCOPE = 0x0010, - -/** Global Segment */ -GLOBAL_SEGMENT = 0x0020, -/** Group Segment */ -GROUP_SEGMENT = 0x0040, -/** Private Segment */ -PRIVATE_SEGMENT= 0x0080, -/** Kergarg Segment */ -KERNARG_SEGMENT= 0x0100, -/** Readonly Segment */ -READONLY_SEGMENT = 0x0200, -/** Spill Segment */ -SPILL_SEGMENT = 0x0400, -/** Arg Segment */ -ARG_SEGMENT= 0x0800, +/** + * These bits are used to set the coherence policy + * for the GPU and are encoded in the GCN3 instructions. + * See the AMD GCN3 ISA Architecture Manual for more + * details. + * + * SLC: System Level Coherent. Accesses are forced to miss in + * the L2 cache and are coherent with system memory. + * + * GLC: Globally Coherent. Controls how reads and writes are + * handled by the L1 cache. 
Global here refers to the + * data being visible globally on the GPU (i.e., visible + * to all WGs). + * + * For atomics, the GLC bit is used to distinguish between + * atomic return/no-return operations. + */ +enum : CacheCoherenceFlagsType { +/** user-policy flags */ +SLC_BIT = 0x0080, +GLC_BIT = 0x0100, }; using LocalAccessor = @@ -305,8 +300,8 @@ /** Flag structure for the request. */ Flags _flags; -/** Memory space configuraiton flag structure for the request. */ -MemSpaceConfigFlags _memSpaceConfigFlags; +/** Flags that control
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: Change how waitcnts are implemented
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29973 to review the following change. Change subject: gpu-compute, arch-gcn3: Change how waitcnts are implemented .. gpu-compute, arch-gcn3: Change how waitcnts are implemented Use single counters per memory operation type and increment them upon issue, not execute. Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3 --- M src/arch/gcn3/insts/instructions.cc M src/gpu-compute/global_memory_pipeline.cc M src/gpu-compute/gpu_dyn_inst.cc M src/gpu-compute/local_memory_pipeline.cc M src/gpu-compute/scalar_memory_pipeline.cc M src/gpu-compute/schedule_stage.cc M src/gpu-compute/wavefront.cc M src/gpu-compute/wavefront.hh 8 files changed, 106 insertions(+), 18 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 9987fad..7c2cf0e 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32565,6 +32565,7 @@ vdst.write(); +wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); } // execute @@ -32635,6 +32636,7 @@ vdst.write(); +wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); } // execute @@ -39400,6 +39402,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39496,6 +39500,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39592,6 +39598,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39660,6 +39668,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); 
+wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39728,6 +39738,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39805,6 +39817,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; } @@ -39884,6 +39898,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -39952,6 +39968,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40021,6 +40039,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40090,6 +40110,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40159,6 +40181,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40237,6 +40261,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40325,6 +40351,8 @@ Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { +wf->decVMemInstsIssued(); +wf->decLGKMInstsIssued();
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Add pipeline stage interface classes
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29972 to review the following change. Change subject: gpu-compute: Add pipeline stage interface classes .. gpu-compute: Add pipeline stage interface classes This change separates the pipeline stage interfaces for the GPU's compute unit into their own classes with a well-defined interface. This helps to create a cleaner interface for users to extend the CU pipeline's capabilities and also helps consolidate all the pipeline communication code in one place in the source. Change-Id: I569d52bce84dc1b9fbf8f0f96d53a81a2b6773c6 --- M src/gpu-compute/SConscript A src/gpu-compute/comm.cc A src/gpu-compute/comm.hh M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/exec_stage.cc M src/gpu-compute/exec_stage.hh M src/gpu-compute/schedule_stage.cc M src/gpu-compute/schedule_stage.hh M src/gpu-compute/scoreboard_check_stage.cc M src/gpu-compute/scoreboard_check_stage.hh 11 files changed, 578 insertions(+), 308 deletions(-) diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 244791b..0f1afbc 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,6 +41,7 @@ SimObject('LdsState.py') SimObject('X86GPUTLB.py') +Source('comm.cc') Source('compute_unit.cc') Source('dispatcher.cc') Source('exec_stage.cc') diff --git a/src/gpu-compute/comm.cc b/src/gpu-compute/comm.cc new file mode 100644 index 000..b1dd031 --- /dev/null +++ b/src/gpu-compute/comm.cc @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +#include "gpu-compute/comm.hh" + +#include + +#include "gpu-compute/wavefront.hh" +#include "params/ComputeUnit.hh" + +/** + * Scoreboard/Schedule stage interface. 
+ */ +ScoreboardCheckToSchedule::ScoreboardCheckToSchedule(const ComputeUnitParams + *p) +{ +int num_func_units = p->num_SIMDs + p->num_scalar_cores ++ p->num_global_mem_pipes + p->num_shared_mem_pipes ++ p->num_scalar_mem_pipes; +_readyWFs.resize(num_func_units); + +for (auto &func_unit_wf_list : _readyWFs) { +func_unit_wf_list.reserve(p->n_wf); +} +} + +void +ScoreboardCheckToSchedule::reset() +{ +for (auto &func_unit_wf_list : _readyWFs) { +func_unit_wf_list.resize(0); +} +} + +void +ScoreboardCheckToSchedule::markWFReady(Wavefront *wf, int func_unit_id) +{ +_readyWFs[func_unit_id].push_back(wf); +} + +int +ScoreboardCheckToSchedule::numReadyLists() const +{ +return _readyWFs.size(); +} + +std::vector& +ScoreboardCheckToSchedule::readyWFs(int func_unit_id) +{ +return _readyWFs[func_unit_id]; +} + +/** + * Delete all wavefronts that have been marked as ready at scoreboard stage + * but are found to have empty instruction buffers at schedule stage. + */ +void +ScoreboardCheckToSchedule::updateReadyList(int func_unit_id) +{ +std::vector &func_unit_wf_list = _readyWFs[func_unit_id]; + +for (auto it = func_unit_wf_list.begin(); it != func_unit_wf_list.end();) { +if ((*it)->instructionBuffer.empty()) { +it = func_unit_wf_list.erase(it); +} else { +
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Use refs to CU in pipe stages/mem pipes
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29969 to review the following change. Change subject: gpu-compute: Use refs to CU in pipe stages/mem pipes .. gpu-compute: Use refs to CU in pipe stages/mem pipes The pipe stages and memory pipes are changed to store a reference to their parent CU as opposed to a pointer. These objects will never change which CU they belong to, and they are constructed by their parent CU. Change-Id: Ie5476e1e2e124a024c2efebceb28cb3a9baa78c1 --- M src/gpu-compute/compute_unit.cc M src/gpu-compute/exec_stage.cc M src/gpu-compute/exec_stage.hh M src/gpu-compute/fetch_stage.cc M src/gpu-compute/fetch_stage.hh M src/gpu-compute/fetch_unit.cc M src/gpu-compute/fetch_unit.hh M src/gpu-compute/global_memory_pipeline.cc M src/gpu-compute/global_memory_pipeline.hh M src/gpu-compute/local_memory_pipeline.cc M src/gpu-compute/local_memory_pipeline.hh M src/gpu-compute/scalar_memory_pipeline.cc M src/gpu-compute/scalar_memory_pipeline.hh M src/gpu-compute/schedule_stage.cc M src/gpu-compute/schedule_stage.hh M src/gpu-compute/scoreboard_check_stage.cc M src/gpu-compute/scoreboard_check_stage.hh 17 files changed, 191 insertions(+), 191 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 653c074..a59a7fd 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -67,13 +67,13 @@ vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), registerManager(p->register_manager), -fetchStage(p, this), -scoreboardCheckStage(p, this), -scheduleStage(p, this), -execStage(p, this), -globalMemoryPipe(p, this), -localMemoryPipe(p, this), -scalarMemoryPipe(p, this), +fetchStage(p, *this), +scoreboardCheckStage(p, *this), +scheduleStage(p, *this), +execStage(p, *this), +globalMemoryPipe(p, *this), +localMemoryPipe(p, *this), +scalarMemoryPipe(p, *this), 
tickEvent([this]{ exec(); }, "Compute unit tick event", false, Event::CPU_Tick_Pri), cu_id(p->cu_id), diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index 3c6aaad..1fc04f5 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -40,10 +40,10 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu) +ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu) : computeUnit(cu), lastTimeInstExecuted(false), thisTimeInstExecuted(false), instrExecuted (false), - executionResourcesUsed(0), _name(cu->name() + ".ExecStage") + executionResourcesUsed(0), _name(cu.name() + ".ExecStage") { numTransActiveIdle = 0; @@ -53,7 +53,7 @@ void ExecStage::init() { -dispatchList = &computeUnit->dispatchList; +dispatchList = &computeUnit.dispatchList; idle_dur = 0; } @@ -126,7 +126,7 @@ { std::stringstream ss; bool empty = true; -for (int i = 0; i < computeUnit->numExeUnits(); i++) { +for (int i = 0; i < computeUnit.numExeUnits(); i++) { DISPATCH_STATUS s = dispatchList->at(i).second; ss << i << ": " << dispStatusToStr(s); if (s != EMPTY) { @@ -150,7 +150,7 @@ if (Debug::GPUSched) { dumpDispList(); } -for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { +for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { DISPATCH_STATUS s = dispatchList->at(unitId).second; switch (s) { case EMPTY: @@ -167,7 +167,7 @@ (w->instructionBuffer.front())->disassemble()); DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId); dispatchList->at(unitId).first->exec(); -(computeUnit->scheduleStage).deleteFromSch(w); +(computeUnit.scheduleStage).deleteFromSch(w); dispatchList->at(unitId).second = EMPTY; dispatchList->at(unitId).first->freeResources(); dispatchList->at(unitId).first = nullptr; @@ -207,7 +207,7 @@ ; spc -.init(0, computeUnit->numExeUnits(), 1) +.init(0, computeUnit.numExeUnits(), 1) .name(name() + ".spc") 
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") ; @@ -219,26 +219,26 @@ ; numCyclesWithInstrTypeIssued -.init(computeUnit->numExeUnits()) +.init(computeUnit.numExeUnits()) .name(name() + ".num_cycles_issue_exec_rsrc") .desc("Number of cycles at least one instruction issued to " "execution resource type") ; numCyclesWithNoInstrTypeIssued -.init(computeUnit->numExeUnits()) +.init(computeUnit.numExeUnits(
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add case to op selector when operand is vcc_hi
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29971 to review the following change. Change subject: arch-gcn3: Add case to op selector when operand is vcc_hi .. arch-gcn3: Add case to op selector when operand is vcc_hi Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36 --- M src/arch/gcn3/registers.cc 1 file changed, 2 insertions(+), 0 deletions(-) diff --git a/src/arch/gcn3/registers.cc b/src/arch/gcn3/registers.cc index 016160f..d5c4903 100644 --- a/src/arch/gcn3/registers.cc +++ b/src/arch/gcn3/registers.cc @@ -141,6 +141,8 @@ * */ regIdx = numScalarRegs - 2; +} else if (idx == REG_VCC_HI) { +regIdx = numScalarRegs - 1; } else if (idx == REG_FLAT_SCRATCH_LO) { /** * the FLAT_SCRATCH register occupies the two SRF entries -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29971 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36 Gerrit-Change-Number: 29971 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: No RF scheduling in case of SKIP or EMPTY
Hello Tony Gutierrez, Alexandru Duțu, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29970 to review the following change. Change subject: gpu-compute: No RF scheduling in case of SKIP or EMPTY .. gpu-compute: No RF scheduling in case of SKIP or EMPTY In case of flat memory instructions the status for the LM pipe execution unit is set to SKIP or EMPTY, as the bus between the VRF and the GM and LM pipe is shared. The destination operands should not be scheduled for the LM pipe, even if the wave is in the dispatch list. This can lead to deadlock in the destination cache as DCEs are reused and the slotsAvailableForBank count gets artificially incremented. Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd --- M src/gpu-compute/schedule_stage.cc 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 0785aa0..e0600a6 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -236,9 +236,13 @@ ScheduleStage::scheduleRfDestOperands() { for (int j = 0; j < computeUnit.numExeUnits(); ++j) { -if (!dispatchList->at(j).first) { +if (dispatchList->at(j).second == EMPTY || +dispatchList->at(j).second == SKIP) { continue; } + +assert(dispatchList->at(j).first); + // get the wave on dispatch list and attempt to allocate write // resources in the RFs Wavefront *w = dispatchList->at(j).first; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29970 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd Gerrit-Change-Number: 29970 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Alexandru Duțu Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an 
email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix stride bug in buffer OOB detection logic
Hello Michael LeBeane, Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29968 to review the following change. Change subject: arch-gcn3: Fix stride bug in buffer OOB detection logic .. arch-gcn3: Fix stride bug in buffer OOB detection logic The out-of-range logic for buffer accesses is missing the top 4 bits of const_stride when dealing with scratch buffers. This can cause perfectly valid scratch accesses to be suppressed when const_stride is large. Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e --- M src/arch/gcn3/insts/op_encodings.hh 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 202dd1d..b35fb3d 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -651,7 +651,7 @@ * non-formatted accesses, this is done on a per-lane * basis. */ -if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) { +if (stride == 0 || !rsrc_desc.swizzleEn) { if (buf_off + stride * buf_idx >= rsrc_desc.numRecords - s_offset.rawData()) { DPRINTF(GCN3, "mubuf out-of-bounds condition 1: " @@ -659,13 +659,13 @@ "const_stride = %llx, " "const_num_records = %llx\n", lane, buf_off + stride * buf_idx, -rsrc_desc.stride, rsrc_desc.numRecords); +stride, rsrc_desc.numRecords); oobMask.set(lane); continue; } } -if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) { +if (stride != 0 && rsrc_desc.swizzleEn) { if (buf_idx >= rsrc_desc.numRecords || buf_off >= stride) { DPRINTF(GCN3, "mubuf out-of-bounds condition 2: " -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29968 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e Gerrit-Change-Number: 29968 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Michael 
LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Replace some instances of std::isnormal with std::fpclassify
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29967 to review the following change. Change subject: arch-gcn3: Replace some instances of std::isnormal with std::fpclassify .. arch-gcn3: Replace some instances of std::isnormal with std::fpclassify Affected instructions: V_DIV_SCALE_F64, V_CMP_CLASS_F64, V_CMPX_CLASS_F64 and their VOPC, VOP3, F32 variants. These instances of std::isnormal were being used to check for subnormal (denorms) values. std::isnormal is not specific enough. It returns true for normal values but false for NaN, Inf, 0.0, and subnormals. std::fpclassify returns macros for each category of floating point numbers. Now we only catch subnormals. Change-Id: I8d8f4452ff58de71e7c8e0b2b5e73467b532e196 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 302dad4..9987fad 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -9439,7 +9439,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9463,7 +9463,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9551,7 +9551,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9575,7 +9575,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9664,7 +9664,7 @@ } if (bits(src1[lane], 4)) { // is 
-denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9688,7 +9688,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9777,7 +9777,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -9801,7 +9801,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { vcc.setBit(lane, 1); continue; @@ -15550,7 +15550,7 @@ } if (bits(src1[lane], 4)) { // is -denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && std::signbit(src0[lane])) { sdst.setBit(lane, 1); continue; @@ -15574,7 +15574,7 @@ } if (bits(src1[lane], 7)) { // is +denormal -if (!std::isnormal(src0[lane]) +if (std::fpclassify(src0[lane]) == FP_SUBNORMAL && !std::signbit(src0[lane])) { sdst.setBit(lane, 1); continue; @@
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add memcpy condition when writing EXEC_LO
Hello Matthew Poremba, Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29961 to review the following change. Change subject: arch-gcn3: Add memcpy condition when writing EXEC_LO .. arch-gcn3: Add memcpy condition when writing EXEC_LO Some compilers emit an error on the operand template class when writing exec mask. Add a condition to explicitly set memcpy size argument to 32b or 64b based on the number of dwords. Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624 --- M src/arch/gcn3/operand.hh 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index 9d28deb..97c6310 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -437,8 +437,15 @@ if (_opIdx == REG_EXEC_LO) { ScalarRegU64 new_exec_mask_val = wf->execMask().to_ullong(); -std::memcpy((void*)&new_exec_mask_val, -(void*)srfData.data(), sizeof(srfData)); +if (NumDwords == 1) { +std::memcpy((void*)&new_exec_mask_val, +(void*)srfData.data(), sizeof(VecElemU32)); +} else if (NumDwords == 2) { +std::memcpy((void*)&new_exec_mask_val, +(void*)srfData.data(), sizeof(VecElemU64)); +} else { +panic("Trying to write more than 2 DWORDS to EXEC\n"); +} VectorMask new_exec_mask(new_exec_mask_val); wf->execMask() = new_exec_mask; DPRINTF(GPUSRF, "Write EXEC\n"); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29961 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624 Gerrit-Change-Number: 29961 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Matthew Poremba Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Remove invalid assert when reading EXEC_LO
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29960 to review the following change. Change subject: arch-gcn3: Remove invalid assert when reading EXEC_LO .. arch-gcn3: Remove invalid assert when reading EXEC_LO This assert assumed all reads to EXEC_LO would be 64b, that is, we would always read the entire EXEC mask. This is invalid as some kernels read only the low 32b of EXEC. The write to EXEC_LO is also updated to handle 32b writes. Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358 --- M src/arch/gcn3/operand.hh 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index 960d05e..9d28deb 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -435,9 +435,10 @@ if (!isScalarReg(_opIdx)) { if (_opIdx == REG_EXEC_LO) { -ScalarRegU64 new_exec_mask_val(0); +ScalarRegU64 new_exec_mask_val += wf->execMask().to_ullong(); std::memcpy((void*)&new_exec_mask_val, -(void*)srfData.data(), sizeof(new_exec_mask_val)); +(void*)srfData.data(), sizeof(srfData)); VectorMask new_exec_mask(new_exec_mask_val); wf->execMask() = new_exec_mask; DPRINTF(GPUSRF, "Write EXEC\n"); @@ -513,7 +514,6 @@ switch(_opIdx) { case REG_EXEC_LO: { -assert(NumDwords == 2); ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> execMask().to_ullong(); std::memcpy((void*)srfData.data(), (void*)&exec_mask, -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29960 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358 Gerrit-Change-Number: 29960 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org 
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fix Y-dimension ABI decode
Hello Michael LeBeane, Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29965 to review the following change. Change subject: gpu-compute: Fix Y-dimension ABI decode .. gpu-compute: Fix Y-dimension ABI decode We currently have a bug in decoding workitem ID from the kernel descriptor with multiple dimensions. The enable_vgpr_workitem_id bits are currently separated into y and z components, when they should be treated as a single 2-bit value, where y is enabled when it is > 0, and z is enabled when it is > 1. The current setup allows a kernel launch with vgprs reserved for the z dimension and not the y dimension, which is incorrect. Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323 --- M src/gpu-compute/hsa_queue_entry.hh M src/gpu-compute/kernel_code.hh 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index 5fc5e56..ea79869 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -417,8 +417,8 @@ * workitem Id in the X dimension is always initialized. 
*/ initialVgprState.set(WorkitemIdX, true); -initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y); -initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z); +initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0); +initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1); } // name of the kernel associated with the AQL entry diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh index b3560c7..680dd72 100644 --- a/src/gpu-compute/kernel_code.hh +++ b/src/gpu-compute/kernel_code.hh @@ -130,8 +130,7 @@ uint32_t enable_sgpr_workgroup_id_y : 1; uint32_t enable_sgpr_workgroup_id_z : 1; uint32_t enable_sgpr_workgroup_info : 1; -uint32_t enable_vgpr_workitem_id_y : 1; -uint32_t enable_vgpr_workitem_id_z : 1; +uint32_t enable_vgpr_workitem_id : 2; uint32_t enable_exception_address_watch : 1; uint32_t enable_exception_memory_violation : 1; uint32_t granulated_lds_size : 9; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29965 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323 Gerrit-Change-Number: 29965 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix VOP3 V_LDEXP_F64
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29966 to review the following change. Change subject: arch-gcn3: Fix VOP3 V_LDEXP_F64 .. arch-gcn3: Fix VOP3 V_LDEXP_F64 Replaced !std::isnormal with std::fpclassify because std::isnormal is not specific enough. !std::isnormal was incorrectly catching NaN, Inf, 0.0, and subnormals (aka denormals), whereas it was only supposed to catch subnormals. The return value and error handling spec of std::ldexp listed on cppreference.com appears to match up in nearly all cases after making these changes. If std::ldexp handled subnormals as described in the GCN3 2016 guide, we could have used vdst[lane] = std::ldexp and would not have needed to check for any corner cases. Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 2b992b1..302dad4 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -30282,10 +30282,11 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -if (std::isnan(src1[lane]) || std::isinf(src1[lane])) { +if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { vdst[lane] = src0[lane]; -} else if (!std::isnormal(src1[lane])) { -if (std::signbit(src1[lane])) { +} else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + || std::fpclassify(src0[lane]) == FP_ZERO) { +if (std::signbit(src0[lane])) { vdst[lane] = -0.0; } else { vdst[lane] = +0.0; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29966 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441 Gerrit-Change-Number: 29966 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Don't track vector store insts in CU's headTailMap
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29963 to review the following change. Change subject: gpu-compute: Don't track vector store insts in CU's headTailMap .. gpu-compute: Don't track vector store insts in CU's headTailMap This change fixes a memory leak due to live GPUDynInstPtr references to vector store insts being stored in the CU's headTailMap and never released. This happened because store insts are not supposed to have their head-tail latencies tracked by the headTailMap; instead they use timing information from the GPUCoalescer. When updating the headTailLatency stat via the headTailMap, only loads were considered and removed from the headTailMap, however when inserting into the headTailMap loads and stores were considered, thus leading to the memory leak. This change fixes the issue by only adding loads to the headTailMap. Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1 --- M src/gpu-compute/compute_unit.cc 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index f3387a7..653c074 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1389,9 +1389,11 @@ gpuDynInst->wfSlotId); } } else { -if (!compute_unit->headTailMap.count(gpuDynInst)) { -compute_unit->headTailMap.insert( -std::make_pair(gpuDynInst, curTick())); +if (pkt->isRead()) { +if (!compute_unit->headTailMap.count(gpuDynInst)) { +compute_unit->headTailMap +.insert(std::make_pair(gpuDynInst, curTick())); +} } } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29963 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1 Gerrit-Change-Number: 29963 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29964 to review the following change. Change subject: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32 .. arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32 roundNearestEven is an inst_util function that RNDNE_F64 and F32 call, including both VOP1 and VOP3 formats. IEEE 754 spec says this function should round inputs to the nearest integer but round ties to the nearest even integer. Prior to this patch it was rounding all inputs to nearest even, not just the ties. It was probably implemented this way originally because the language in the ISA manual is ambiguous although it provided the correct logic. Fixed roundNearestEven to use the semantics originally described in the GCN3 ISA manual. Change-Id: I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7 --- M src/arch/gcn3/insts/inst_util.hh 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh index b40e890..15ffe9a 100644 --- a/src/arch/gcn3/insts/inst_util.hh +++ b/src/arch/gcn3/insts/inst_util.hh @@ -258,7 +258,13 @@ template inline T roundNearestEven(T val) { -T nearest_round = std::round(val * 0.5) * 2.0; +T int_part = 0; +T nearest_round = std::floor(val + 0.5); +if ((int)std::floor(val) % 2 == 0 +&& std::modf(std::abs(val), &int_part) == 0.5) { + nearest_round = nearest_round - 1; +} + return nearest_round; } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29964 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7 Gerrit-Change-Number: 29964 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an 
email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add all s_buffer_load_dword instructions
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29962 to review the following change. Change subject: arch-gcn3: add all s_buffer_load_dword instructions .. arch-gcn3: add all s_buffer_load_dword instructions Adds the other s_buffer_load_dword* instruction implementations to f134a84. Change-Id: I8d97527278900dc68c32463ea1824409ccd04e1d --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 125 insertions(+), 8 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 002c4d5..2b992b1 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -4737,17 +4737,46 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + +if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDynInst, rsrcDesc, offset); + +gpuDynInst->computeUnit()->scalarMemoryPipe +.getGMReqFIFO().push(gpuDynInst); + +wf->scalarRdGmReqsInPipe--; +wf->scalarOutstandingReqsRdGm++; +gpuDynInst->wavefront()->outstandingReqs++; +gpuDynInst->wavefront()->validateRequestCounters(); +} // execute void Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<1>(gpuDynInst); } // initiateAcc void Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) { +// 1 request, size 32 +ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); +sdst.write(); } // completeAcc Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( @@ 
-4767,17 +4796,46 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + +if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDynInst, rsrcDesc, offset); + +gpuDynInst->computeUnit()->scalarMemoryPipe +.getGMReqFIFO().push(gpuDynInst); + +wf->scalarRdGmReqsInPipe--; +wf->scalarOutstandingReqsRdGm++; +gpuDynInst->wavefront()->outstandingReqs++; +gpuDynInst->wavefront()->validateRequestCounters(); +} // execute void Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<2>(gpuDynInst); } // initiateAcc void Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) { +// use U64 because 2 requests, each size 32 +ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); +sdst.write(); } // completeAcc Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( @@ -4797,17 +4855,46 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + +if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDynInst, rsrcDesc, offset); + 
+gpuDynInst->computeUnit()->scalarMemoryPipe +.getGMReqFIFO().push(gpuDynInst); + +wf->scalarRdGmReqsInPipe--; +wf->scalarOutstandingReqsRdGm++; +gpuDyn
[gem5-dev] Change in gem5/gem5[develop]: gpu_compute: Support loading BLIT kernels
Hello Michael LeBeane, Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29959 to review the following change. Change subject: gpu_compute: Support loading BLIT kernels .. gpu_compute: Support loading BLIT kernels The BLIT kernels used to implement DMA through the shaders don't fill out all of the standard fields in an amd_kernel_code_t object. This patch modifies the code object parsing logic to support these new kernels. BLIT kernels are used in APUs when using ROCm memcopies for certain size buffers, and are used for dGPUs when the SDMA engines are disabled. Change-Id: Id4e667474d05e311097dbec443def07dfad14a79 --- M src/gpu-compute/gpu_command_processor.cc M src/gpu-compute/hsa_queue_entry.hh 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index b5e9452..aee3e1b 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -100,11 +100,25 @@ machine_code_addr); Addr kern_name_addr(0); -virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10, -(uint8_t*)&kern_name_addr, 0x8); - std::string kernel_name; -virt_proxy.readString(kernel_name, kern_name_addr); + +/** + * BLIT kernels don't have symbol names. BLIT kernels are built-in compute + * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA + * hardware engines are unavailable or explicitly disabled. They can also + * be used to do copies that ROCm things would be better performed + * by the shader than the SDMA engines. They are also sometimes used on + * APUs to implement asynchronous memcopy operations from 2 pointers in + * host memory. I have no idea what BLIT stands for. 
+ * */ +if (akc.runtime_loader_kernel_symbol) { +virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10, +(uint8_t*)&kern_name_addr, 0x8); + +virt_proxy.readString(kernel_name, kern_name_addr); +} else { +kernel_name = "Blit kernel"; +} DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str()); diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index a6917db..5fc5e56 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -88,6 +88,19 @@ _globalWgId(0), dispatchComplete(false) { +// Precompiled BLIT kernels actually violate the spec a bit +// and don't set many of the required akc fields. For these kernels, +// we need to rip register usage from the resource registers. +// +// We can't get an exact number of registers from the resource +// registers because they round, but we can get an upper bound on it +if (!numVgprs) +numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; + +// TODO: Granularity changes for GFX9! +if (!numSgprs) +numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8; + initialVgprState.reset(); initialSgprState.reset(); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29959 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Id4e667474d05e311097dbec443def07dfad14a79 Gerrit-Change-Number: 29959 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Michael LeBeane Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29958 to review the following change. Change subject: arch-gcn3: Implement ds_swizzle .. arch-gcn3: Implement ds_swizzle Change-Id: I7d188388afa16932217ae207368666a724207c52 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 71efd8f..002c4d5 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32266,6 +32266,7 @@ Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_swizzle_b32") { + setFlag(Load); } // Inst_DS__DS_SWIZZLE_B32 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() @@ -32277,8 +32278,107 @@ void Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +wf->rdLmReqsInPipe--; +wf->validateRequestCounters(); + +if (gpuDynInst->exec_mask.none()) { +return; +} + +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); + +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.VDST); +/** + * The "DS pattern" is comprised of both offset fields. That is, the + * swizzle pattern between lanes. Bit 15 of the DS pattern dictates + * which swizzle mode to use. There are two different swizzle + * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use + * QDMode else use Bit-masks mode. The remaining bits dictate how to + * swizzle the lanes. + * + * QDMode: Chunks the lanes into 4s and swizzles among them. + * Bits 7:6 dictate where lane 3 (of the current chunk) + * gets its date, 5:4 lane 2, etc. + * + * Bit-mask:This mode breaks bits 14:0 into 3 equal-sized chunks. 
+ * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 + * is the and_mask. Each lane is swizzled by performing + * the appropriate operation using these masks. + */ +VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); + +data.read(); + +if (bits(ds_pattern, 15)) { +// QDMode +for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { +/** + * This operation allows data sharing between groups + * of four consecutive threads. Note the increment by + * 4 in the for loop. + */ +if (gpuDynInst->exec_mask[lane]) { +int index0 = lane + bits(ds_pattern, 1, 0); +panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index0); +vdst[lane] += gpuDynInst->exec_mask[index0] ? data[index0]: 0; +} +if (gpuDynInst->exec_mask[lane + 1]) { +int index1 = lane + bits(ds_pattern, 3, 2); +panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index1); +vdst[lane + 1] += gpuDynInst->exec_mask[index1] ? data[index1]: 0; +} +if (gpuDynInst->exec_mask[lane + 2]) { +int index2 = lane + bits(ds_pattern, 5, 4); +panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index2); +vdst[lane + 2] += gpuDynInst->exec_mask[index2] ? data[index2]: 0; +} +if (gpuDynInst->exec_mask[lane + 3]) { +int index3 = lane + bits(ds_pattern, 7, 6); +panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index3); +vdst[lane + 3] += gpuDynInst->exec_mask[index3] ? data[index3]: 0; +} +} +} else { +// Bit Mode +int and_mask = bits(ds_pa
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement s_buffer_load_dwordx16
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29957 to review the following change. Change subject: arch-gcn3: Implement s_buffer_load_dwordx16 .. arch-gcn3: Implement s_buffer_load_dwordx16 Change-Id: I25382dcae9bb55eaf035385fa925157f25d39c20 --- M src/arch/gcn3/insts/instructions.cc M src/arch/gcn3/insts/op_encodings.hh 2 files changed, 90 insertions(+), 31 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 567cc10..71efd8f 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -4857,17 +4857,45 @@ void Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); +ScalarRegU32 offset(0); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + +rsrcDesc.read(); + +if (instData.IMM) { +offset = extData.OFFSET; +} else { +ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); +off_sgpr.read(); +offset = off_sgpr.rawData(); +} + +calcAddr(gpuDynInst, rsrcDesc, offset); + +gpuDynInst->computeUnit()->scalarMemoryPipe +.getGMReqFIFO().push(gpuDynInst); + +wf->scalarRdGmReqsInPipe--; +wf->scalarOutstandingReqsRdGm++; +gpuDynInst->wavefront()->outstandingReqs++; +gpuDynInst->wavefront()->validateRequestCounters(); +} // execute void Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<16>(gpuDynInst); } // initiateAcc void Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) { +ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); +sdst.write(); } // completeAcc Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) diff --git a/src/arch/gcn3/insts/op_encodings.hh 
b/src/arch/gcn3/insts/op_encodings.hh index 4056f0a..202dd1d 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -46,6 +46,29 @@ namespace Gcn3ISA { +struct BufferRsrcDescriptor +{ +uint64_t baseAddr : 48; +uint32_t stride : 14; +uint32_t cacheSwizzle : 1; +uint32_t swizzleEn : 1; +uint32_t numRecords : 32; +uint32_t dstSelX : 3; +uint32_t dstSelY : 3; +uint32_t dstSelZ : 3; +uint32_t dstSelW : 3; +uint32_t numFmt : 3; +uint32_t dataFmt : 4; +uint32_t elemSize : 2; +uint32_t idxStride : 2; +uint32_t addTidEn : 1; +uint32_t atc : 1; +uint32_t hashEn : 1; +uint32_t heap : 1; +uint32_t mType : 3; +uint32_t type : 2; +}; + // --- purely virtual instruction classes --- class Inst_SOP2 : public GCN3GPUStaticInst @@ -197,14 +220,45 @@ MemCmd::WriteReq); } +/** + * For normal s_load_dword/s_store_dword instruction addresses. + */ void -calcAddr(GPUDynInstPtr gpuDynInst, ConstScalarOperandU64 &addr, -ScalarRegU32 offset) +calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr, + ScalarRegU32 offset) { -Addr vaddr = addr.rawData(); -vaddr += offset; -vaddr &= ~0x3; -gpuDynInst->scalarAddr = vaddr; +Addr vaddr = ((addr.rawData() + offset) & ~0x3); +gpu_dyn_inst->scalarAddr = vaddr; +} + +/** + * For s_buffer_load_dword/s_buffer_store_dword instruction addresses. + * The s_buffer instructions use the same buffer resource descriptor + * as the MUBUF instructions. + */ +void +calcAddr(GPUDynInstPtr gpu_dyn_inst, + ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset) +{ +BufferRsrcDescriptor rsrc_desc; +ScalarRegU32 clamped_offset(offset); +std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(), +sizeof(BufferRsrcDescriptor)); + +/** + * The address is clamped if: + * Stride is zero: clamp if offset >= num_records + * Stride is non-zero: clamp if offset > (stride * num_records) + */ +if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) { +clamped_offset = rsrc_desc.numRecords; +} else if (rsrc_de
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement instruction s_setreg_b32
Hello Tony Gutierrez, Xianwei Zhang, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29949 to review the following change. Change subject: arch-gcn3: implement instruction s_setreg_b32 .. arch-gcn3: implement instruction s_setreg_b32 Instruction s_setreg_b32 was unimplemented, but is used by hipified rodinia 'srad'. The instruction sets values of hardware internal registers. If the instruction is writing into MODE to control single-precision FP round and denorm modes, a simple warn will be printed; for all other cases (non-MODE hw register or other precisions), panic will happen. Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 27 insertions(+), 0 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 6ffd049..8b72e0d 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -1800,6 +1800,7 @@ Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) : Inst_SOPK(iFmt, "s_setreg_b32") { +setFlag(ALU); } // Inst_SOPK__S_SETREG_B32 Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() @@ -1813,6 +1814,32 @@ void Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) { +ScalarRegI16 simm16 = instData.SIMM16; +ScalarRegU32 hwregId = simm16 & 0x3f; +ScalarRegU32 offset = (simm16 >> 6) & 31; +ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + +ScalarOperandU32 hwreg(gpuDynInst, hwregId); +ScalarOperandU32 sdst(gpuDynInst, instData.SDST); +hwreg.read(); +sdst.read(); + +// Store value from SDST to part of the hardware register. 
+ScalarRegU32 mask = (((1U << size) - 1U) << offset); +hwreg = ((hwreg.rawData() & ~mask) +| ((sdst.rawData() << offset) & mask)); +hwreg.write(); + +// set MODE register to control the behavior of single precision +// floating-point numbers: denormal mode or round mode +if (hwregId==1 && size==2 +&& (offset==4 || offset==0)) { +warn_once("Be cautious that s_setreg_b32 has no real effect " +"on FP modes: %s\n", gpuDynInst->disassemble()); +return; +} + +// panic if not changing MODE of floating-point numbers panicUnimplemented(); } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29949 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce Gerrit-Change-Number: 29949 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-Reviewer: Xianwei Zhang Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fixed scale,fixup,fmas f64 ops
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29955 to review the following change. Change subject: arch-gcn3: fixed scale,fixup,fmas f64 ops .. arch-gcn3: fixed scale,fixup,fmas f64 ops Change-Id: Ie13794554db8a958fda1f7103ec18058fda2e66d --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 65 insertions(+), 17 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index a7b8923..a25ec17 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -28952,22 +28952,34 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -if (std::fpclassify(src1[lane]) == FP_ZERO) { -if (std::signbit(src1[lane])) { -vdst[lane] = -INFINITY; -} else { -vdst[lane] = +INFINITY; -} -} else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) { -vdst[lane] = NAN; -} else if (std::isinf(src1[lane])) { -if (std::signbit(src1[lane])) { -vdst[lane] = -INFINITY; -} else { -vdst[lane] = +INFINITY; -} +int signOut = std::signbit(src1[lane]) ^ + std::signbit(src2[lane]); +int exp1, exp2; +std::frexp(src1[lane],&exp1); +std::frexp(src2[lane],&exp2); +if (std::isnan(src2[lane])) { +vdst[lane] = src2[lane]; +} else if (std::isnan(src1[lane])) { +vdst[lane] = src1[lane]; +} else if (src1[lane] == 0.0 && src2[lane] == 0.0) { +vdst[lane] = -NAN; +} else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { +vdst[lane] = -NAN; +} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) { +vdst[lane] = signOut ? -INFINITY : +INFINITY; +} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) { +vdst[lane] = signOut ? 
-0.0 : +0.0; +} else if (exp2 - exp1 < -1075) { +warn_once("fixup_f64 unimplemented case:" + "exp2 - ex1 < -1075"); +vdst[lane] = src0[lane]; +} else if (exp1 == 2047) { +warn_once("fixup_f64 unimplemented case:" + "exp1 == 2047"); +vdst[lane] = src0[lane]; } else { -vdst[lane] = src2[lane] / src1[lane]; +vdst[lane] = ((uint64_t)signOut<<63) | +((uint64_t)src0[lane] & 0x7fffULL); } } } @@ -29077,8 +29089,37 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -vdst[lane] = src0[lane]; +int exp1, exp2; +std::frexp(src1[lane],&exp1); +std::frexp(src2[lane],&exp2); vcc.setBit(lane, 0); +if (src2[lane] == 0 || src1[lane] == 0) { +vdst[lane] = NAN; +} else if (exp2 - exp1 >= 768) { +vcc.setBit(lane, 1); +if (src0[lane] == src1[lane]) { +vdst[lane] = std::ldexp(src0[lane],128); +} +} else if (exp1 == 0) { +vdst[lane] = std::ldexp(src0[lane],128); +} else if (exp1 >= 0x7fd && exp2 - exp1 <= -768) { +vcc.setBit(lane, 1); +if (src0[lane] == src1[lane]) { +vdst[lane] = std::ldexp(src0[lane],-128); +} +} else if (exp1 >= 0x7fd) { +vdst[lane] = std::ldexp(src0[lane],-128); +} else if (exp2 - exp1 <= -768) { +vcc.setBit(lane, 1); +if (src0[lane] != src2[lane]) { +vdst[lane] = std::ldexp(src0[lane],128); +} +} else if (exp2 <= 53) { +vdst[lane] = std::ldexp(src0[lane],128); +} +else { +vdst[lane] = src0[lane]; +} } } @@ -29171,10 +29212,12 @@ ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); VecOperandF64 vdst(gpuDynInst, instData.VDST); +ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); src0.readSrc();
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fixup DIV instructions
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29956 to review the following change. Change subject: arch-gcn3: Fixup DIV instructions .. arch-gcn3: Fixup DIV instructions Adds support to handle the special cases for GCN3 DIV instructions. Change-Id: I18f91870e802407c93831f313ce76be053bc4230 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index a25ec17..567cc10 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -28952,34 +28952,35 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -int signOut = std::signbit(src1[lane]) ^ - std::signbit(src2[lane]); -int exp1, exp2; -std::frexp(src1[lane],&exp1); -std::frexp(src2[lane],&exp2); -if (std::isnan(src2[lane])) { -vdst[lane] = src2[lane]; -} else if (std::isnan(src1[lane])) { -vdst[lane] = src1[lane]; -} else if (src1[lane] == 0.0 && src2[lane] == 0.0) { -vdst[lane] = -NAN; +int sign_out = std::signbit(src1[lane]) + ^ std::signbit(src2[lane]); +int exp1(0); +int exp2(0); +std::frexp(src1[lane], &exp1); +std::frexp(src2[lane], &exp2); + +if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { +vdst[lane] = std::numeric_limits::quiet_NaN(); +} else if (std::fpclassify(src1[lane]) == FP_ZERO + && std::fpclassify(src2[lane]) == FP_ZERO) { +vdst[lane] += std::numeric_limits::signaling_NaN(); } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { -vdst[lane] = -NAN; -} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) { -vdst[lane] = signOut ? -INFINITY : +INFINITY; -} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) { -vdst[lane] = signOut ? -0.0 : +0.0; +vdst[lane] += std::numeric_limits::signaling_NaN(); +} else if (std::fpclassify(src1[lane]) == FP_ZERO + || std::isinf(src2[lane])) { +vdst[lane] = sign_out ? 
-INFINITY : +INFINITY; +} else if (std::isinf(src1[lane]) + || std::fpclassify(src2[lane]) == FP_ZERO) { +vdst[lane] = sign_out ? -0.0 : +0.0; } else if (exp2 - exp1 < -1075) { -warn_once("fixup_f64 unimplemented case:" - "exp2 - ex1 < -1075"); vdst[lane] = src0[lane]; } else if (exp1 == 2047) { -warn_once("fixup_f64 unimplemented case:" - "exp1 == 2047"); vdst[lane] = src0[lane]; } else { -vdst[lane] = ((uint64_t)signOut<<63) | -((uint64_t)src0[lane] & 0x7fffULL); +vdst[lane] = sign_out ? -std::fabs(src0[lane]) +: std::fabs(src0[lane]); } } } @@ -29089,36 +29090,37 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { -int exp1, exp2; -std::frexp(src1[lane],&exp1); -std::frexp(src2[lane],&exp2); +int exp1(0); +int exp2(0); +std::frexp(src1[lane], &exp1); +std::frexp(src2[lane], &exp2); vcc.setBit(lane, 0); -if (src2[lane] == 0 || src1[lane] == 0) { + +if (std::fpclassify(src1[lane]) == FP_ZERO +|| std::fpclassify(src2[lane]) == FP_ZERO) { vdst[lane] = NAN; } else if (exp2 - exp1 >= 768) { vcc.setBit(lane, 1); if (src0[lane] == src1[lane]) { -vdst[lane] = std::ldexp(src0[lane],128); +vdst[lane] = std::ldexp(src0[lane], 128); } -} else if (exp1 == 0) { -vdst[lane] = std::ldexp(src0[lane],128); -} else if (exp1 >= 0x7fd && exp2 - exp1 <= -768) { +} else if (!std::isnormal(src1[lane])) { +vdst[lane] = std::ldexp(src0[lane], 128); +} else if (!std::isnormal
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add handling for Inf/overflow in CVT insts
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29953 to review the following change. Change subject: arch-gcn3: Add handling for Inf/overflow in CVT insts .. arch-gcn3: Add handling for Inf/overflow in CVT insts Change-Id: I0fddffdeaebd9f45fe89f44d536f80a43de63ff5 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index e93278a..a7b8923 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -7260,8 +7260,16 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane]) || exp > 30) { +if (std::signbit(src[lane])) { +vdst[lane] = INT_MIN; +} else { +vdst[lane] = INT_MAX; +} } else { vdst[lane] = (VecElemI32)src[lane]; } @@ -7386,8 +7394,18 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane])) { +if (std::signbit(src[lane])) { +vdst[lane] = 0; +} else { +vdst[lane] = UINT_MAX; +} +} else if (exp > 31) { +vdst[lane] = UINT_MAX; } else { vdst[lane] = (VecElemU32)src[lane]; } @@ -7422,8 +7440,16 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane]) || exp > 30) { +if (std::signbit(src[lane])) { +vdst[lane] = INT_MIN; +} else { +vdst[lane] = INT_MAX; +} } else { vdst[lane] = (VecElemI32)src[lane]; } @@ -7772,8 +7798,18 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if 
(std::isinf(src[lane])) { +if (std::signbit(src[lane])) { +vdst[lane] = 0; +} else { +vdst[lane] = UINT_MAX; +} +} else if (exp > 31) { +vdst[lane] = UINT_MAX; } else { vdst[lane] = (VecElemU32)src[lane]; } @@ -25075,8 +25111,16 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane]) || exp > 30) { +if (std::signbit(src[lane])) { +vdst[lane] = INT_MIN; +} else { +vdst[lane] = INT_MAX; +} } else { vdst[lane] = (VecElemI32)src[lane]; } @@ -25235,8 +25279,18 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frexp(src[lane],&exp); if (std::isnan(src[lane])) { vdst[lane] = 0; +} else if (std::isinf(src[lane])) { +if (std::signbit(src[lane])) { +vdst[lane] = 0; +} else { +vdst[lane] = UINT_MAX; +} +} else if (exp > 31) { +vdst[lane] = UINT_MAX; } else { vdst[lane] = (VecElemU32)src[lane]; } @@ -25287,8 +25341,16 @@ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { +int exp; +std::frex
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix s_getpc operand information
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29954 to review the following change. Change subject: arch-gcn3: Fix s_getpc operand information .. arch-gcn3: Fix s_getpc operand information s_getpc was currently reporting only a single operand, and was only considering the SSRC operand. However, this instruction's source is implicitly the PC. Because its destination register was never tracked for dependence checking purposes, dependence violations are possible. Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6 --- M src/arch/gcn3/insts/instructions.hh M src/arch/gcn3/insts/op_encodings.cc 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.hh b/src/arch/gcn3/insts/instructions.hh index b0cc37e..f561043 100644 --- a/src/arch/gcn3/insts/instructions.hh +++ b/src/arch/gcn3/insts/instructions.hh @@ -5846,9 +5846,7 @@ getOperandSize(int opIdx) override { switch (opIdx) { - case 0: //ssrc -return 8; - case 1: //sdst + case 0: //sdst return 8; default: fatal("op idx %i out of bounds\n", opIdx); @@ -5860,9 +5858,7 @@ isSrcOperand(int opIdx) override { switch (opIdx) { - case 0: //ssrc -return true; - case 1: //sdst + case 0: //sdst return false; default: fatal("op idx %i out of bounds\n", opIdx); @@ -5874,9 +5870,7 @@ isDstOperand(int opIdx) override { switch (opIdx) { - case 0: //ssrc -return false; - case 1: //sdst + case 0: //sdst return true; default: fatal("op idx %i out of bounds\n", opIdx); diff --git a/src/arch/gcn3/insts/op_encodings.cc b/src/arch/gcn3/insts/op_encodings.cc index 22d0f48..997b22f 100644 --- a/src/arch/gcn3/insts/op_encodings.cc +++ b/src/arch/gcn3/insts/op_encodings.cc @@ -326,7 +326,12 @@ switch (opIdx) { case 0: - return isScalarReg(instData.SSRC0); +if (instData.OP == 0x1C) { +// Special case for s_getpc, which has no source reg. +// Instead, it implicitly reads the PC. 
+return isScalarReg(instData.SDST); +} +return isScalarReg(instData.SSRC0); case 1: return isScalarReg(instData.SDST); default: @@ -353,6 +358,12 @@ switch (opIdx) { case 0: +if (instData.OP == 0x1C) { +// Special case for s_getpc, which has no source reg. +// Instead, it implicitly reads the PC. +return opSelectorToRegIdx(instData.SDST, +gpuDynInst->wavefront()->reservedScalarRegs); +} return opSelectorToRegIdx(instData.SSRC0, gpuDynInst->wavefront()->reservedScalarRegs); case 1: -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29954 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6 Gerrit-Change-Number: 29954 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add ds_bpermute and ds_permute insts
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29952 to review the following change. Change subject: arch-gcn3: Add ds_bpermute and ds_permute insts .. arch-gcn3: Add ds_bpermute and ds_permute insts The implementation of these insts provided by this change is based on the description provided here: https://gpuopen.com/amd-gcn-assembly-cross-lane-operations/ Change-Id: Id63b6c34c9fdc6e0dbd445d859e7b209023f2874 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 113 insertions(+), 4 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 6e5ff42..e93278a 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32129,6 +32129,13 @@ Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_permute_b32") { +setFlag(MemoryRef); +/** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); } // Inst_DS__DS_PERMUTE_B32 Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() @@ -32139,12 +32146,66 @@ void Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); +ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.VDST); + +addr.read(); +data.read(); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +/** + * One of the offset fields can be used for the index. 
+ * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ +assert(!instData.OFFSET1); +/** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. + */ +int index = bits(addr[lane] + instData.OFFSET0, 7, 2); +panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); +/** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. + */ +if (wf->execMask(index)) { +vdst[index] = data[lane]; +} else { +vdst[index] = 0; +} +} +} + +vdst.write(); + +wf->rdLmReqsInPipe--; +wf->validateRequestCounters(); +} // execute +// --- Inst_DS__DS_BPERMUTE_B32 class methods --- Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_bpermute_b32") { +setFlag(MemoryRef); +/** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. 
+ */ +setFlag(Load); } // Inst_DS__DS_BPERMUTE_B32 Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() @@ -32155,8 +32216,56 @@ void Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); +ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.VDST); + +addr.read(); +data.read(); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +/** + * One of the offset field
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: convert vALU instruction counters from 32 to 64-bit
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29950 to review the following change. Change subject: arch-gcn3: convert vALU instruction counters from 32 to 64-bit .. arch-gcn3: convert vALU instruction counters from 32 to 64-bit The vALU instruction counters were previously 32 bits, but for some workloads this value wraps around and triggers an assert failure because the max vALU operations are reached. To resolve this, this commit increases the counter size to 64 bits. Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb --- M src/gpu-compute/shader.hh 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 238f6e0..3e2e569 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -258,8 +258,8 @@ Stats::Vector vectorInstDstOperand; void regStats(); -int max_valu_insts; -int total_valu_insts; +int64_t max_valu_insts; +int64_t total_valu_insts; Shader(const Params *p); ~Shader(); -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29950 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb Gerrit-Change-Number: 29950 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29948 to review the following change. Change subject: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo .. arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 79e7dda..6ffd049 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -30309,8 +30309,36 @@ void Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); +ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); +VecOperandU32 vdst(gpuDynInst, instData.VDST); +uint64_t threadMask = 0; + +src0.readSrc(); +src1.readSrc(); + +/** + * input modifiers are supported by FP operations only + */ +assert(!(instData.ABS & 0x1)); +assert(!(instData.ABS & 0x2)); +assert(!(instData.ABS & 0x4)); +assert(!(extData.NEG & 0x1)); +assert(!(extData.NEG & 0x2)); +assert(!(extData.NEG & 0x4)); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +threadMask = ((1LL << lane) - 1LL); +vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + + src1[lane]; +} +} + +vdst.write(); +} // execute +// --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( InFmt_VOP3 *iFmt) @@ -30330,8 +30358,36 @@ void Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); +ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); +VecOperandU32 vdst(gpuDynInst, instData.VDST); +uint64_t threadMask = 0; + 
+src0.readSrc(); +src1.readSrc(); + +/** + * input modifiers are supported by FP operations only + */ +assert(!(instData.ABS & 0x1)); +assert(!(instData.ABS & 0x2)); +assert(!(instData.ABS & 0x4)); +assert(!(extData.NEG & 0x1)); +assert(!(extData.NEG & 0x2)); +assert(!(extData.NEG & 0x4)); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (wf->execMask(lane)) { +threadMask = ((1LL << lane) - 1LL); +vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + + src1[lane]; +} +} + +vdst.write(); +} // execute +// --- Inst_VOP3__V_LSHLREV_B64 class methods --- Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3 *iFmt) : Inst_VOP3(iFmt, "v_lshlrev_b64", false) -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29948 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5 Gerrit-Change-Number: 29948 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: ds_read_u8 and ds_read_u16 fix
Hello Tony Gutierrez, Alexandru Duțu, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29951 to review the following change. Change subject: arch-gcn3: ds_read_u8 and ds_read_u16 fix .. arch-gcn3: ds_read_u8 and ds_read_u16 fix This changeset zero extends the destination register for ds_read_u8 and ds_read_u16 instructions. Change-Id: I193adadd68adf2572b59743b1504f18ad225f506 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 8b72e0d..6e5ff42 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32016,11 +32016,11 @@ void Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) { -VecOperandU8 vdst(gpuDynInst, extData.VDST); +VecOperandU32 vdst(gpuDynInst, extData.VDST); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { -vdst[lane] = (reinterpret_cast( +vdst[lane] = (VecElemU32)(reinterpret_cast( gpuDynInst->d_data))[lane]; } } @@ -32096,11 +32096,11 @@ void Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) { -VecOperandU16 vdst(gpuDynInst, extData.VDST); +VecOperandU32 vdst(gpuDynInst, extData.VDST); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { -vdst[lane] = (reinterpret_cast( +vdst[lane] = (VecElemU32)(reinterpret_cast( gpuDynInst->d_data))[lane]; } } -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29951 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I193adadd68adf2572b59743b1504f18ad225f506 Gerrit-Change-Number: 29951 Gerrit-PatchSet: 1 Gerrit-Owner: Anthony Gutierrez Gerrit-Reviewer: Alexandru Duțu Gerrit-Reviewer: Tony Gutierrez Gerrit-MessageType: newchange ___ gem5-dev mailing list -- 
gem5-dev@gem5.org To unsubscribe send an email to gem5-dev-le...@gem5.org %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement multi-dword buffer loads and stores
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29946 to review the following change. Change subject: arch-gcn3: implement multi-dword buffer loads and stores .. arch-gcn3: implement multi-dword buffer loads and stores Add support for all multi-dword buffer loads and stores: buffer_load_dword x2, x3, and x4 and buffer_store_dword x2, x3, and x4 Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e --- M src/arch/gcn3/insts/instructions.cc M src/arch/gcn3/insts/op_encodings.hh 2 files changed, 504 insertions(+), 18 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 817b339..b852281 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -34777,7 +34777,11 @@ { setFlag(MemoryRef); setFlag(Load); -setFlag(GlobalSegment); +if (instData.LDS) { +setFlag(GroupSegment); +} else { +setFlag(GlobalSegment); +} } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() @@ -34788,17 +34792,88 @@ void Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->exec_mask = wf->execMask(); +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + +ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); +ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); +ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); +ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + +rsrcDesc.read(); +offset.read(); + +int inst_offset = instData.OFFSET; + +if (!instData.IDXEN && !instData.OFFEN) { +calcAddr(gpuDynInst, +addr0, addr1, rsrcDesc, offset, inst_offset); +} else if (!instData.IDXEN && instData.OFFEN) { +addr0.read(); +calcAddr(gpuDynInst, +addr0, addr1, rsrcDesc, 
offset, inst_offset); +} else if (instData.IDXEN && !instData.OFFEN) { +addr0.read(); +calcAddr(gpuDynInst, +addr1, addr0, rsrcDesc, offset, inst_offset); +} else { +addr0.read(); +addr1.read(); +calcAddr(gpuDynInst, +addr1, addr0, rsrcDesc, offset, inst_offset); +} + +if (isLocalMem()) { +gpuDynInst->computeUnit()->localMemoryPipe +.issueRequest(gpuDynInst); +wf->rdLmReqsInPipe--; +wf->outstandingReqsRdLm++; +} else { +gpuDynInst->computeUnit()->globalMemoryPipe +.issueRequest(gpuDynInst); +wf->rdGmReqsInPipe--; +wf->outstandingReqsRdGm++; +} + +wf->outstandingReqs++; +wf->validateRequestCounters(); +} // execute void Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) { +initMemRead<2>(gpuDynInst); } // initiateAcc void Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) { +VecOperandU32 vdst0(gpuDynInst, extData.VDATA); +VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + +for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { +if (gpuDynInst->exec_mask[lane]) { +if (!oobMask[lane]) { +vdst0[lane] = (reinterpret_cast( +gpuDynInst->d_data))[lane * 2]; +vdst1[lane] = (reinterpret_cast( +gpuDynInst->d_data))[lane * 2 + 1]; +} else { +vdst0[lane] = 0; +vdst1[lane] = 0; +} +} +} + +vdst0.write(); +vdst1.write(); } // completeAcc Inst_MUBUF__BUFFER_LOAD_DWORDX3 @@ -34807,7 +34882,11 @@ { setFlag(MemoryRef); setFlag(Load); -setFlag(GlobalSegment); +if (instData.LDS) { +setFlag(GroupSegment); +} else { +setFlag(GlobalSegment); +} } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() @@ -34818,17 +34897,93 @@ void Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->exec_mask = wf->execMask(); +gpuDynInst->latency.init(gpuDynInst->comp
[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: refactor barriers
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29943 to review the following change. Change subject: gpu-compute, arch-gcn3: refactor barriers .. gpu-compute, arch-gcn3: refactor barriers Barriers were not modeled properly. Firstly, barriers were allocated to each WG that was launched, which is not correct, and the CU would provide an infinite number of barrier slots. There are a limited number of barrier slots per CU in reality. In addition, the CU will not allocate barrier slots to WGs with a single WF (nothing to sync if only one WF). Beyond modeling problems, there is also the issue of deadlock. The barrier could deadlock because not all WFs are freed from the barrier once it has been satisfied. Instead, we relied on the scoreboard stage to release them lazily, one-by-one. Under this implementation the scoreboard may not fully release all WFs participating in a barrier; this happens because the first WF to be freed from the barrier could reach an s_barrier instruction again, forever causing the barrier counts across WFs to be out-of-sync. This change refactors the barrier logic to: 1) Create a proper barrier slot implementation 2) Enforce (via a parameter) the number of barrier slots on the CU. 3) Simplify the logic and clean up the code (i.e., we no longer iterate through the entire WF list each time we check if a barrier is satisfied). 4) Fix deadlock issues. 
Change-Id: If53955b54931886baaae322640a7b9da7a1595e0 --- M src/arch/gcn3/insts/instructions.cc M src/gpu-compute/GPU.py M src/gpu-compute/compute_unit.cc M src/gpu-compute/compute_unit.hh M src/gpu-compute/scoreboard_check_stage.cc M src/gpu-compute/shader.cc M src/gpu-compute/wavefront.cc M src/gpu-compute/wavefront.hh 8 files changed, 386 insertions(+), 101 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 607e3c6..817b339 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -39,6 +39,7 @@ #include "arch/gcn3/insts/inst_util.hh" #include "debug/GCN3.hh" +#include "debug/GPUSync.hh" #include "gpu-compute/shader.hh" namespace Gcn3ISA @@ -3709,6 +3710,7 @@ Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst) { Wavefront *wf = gpuDynInst->wavefront(); +ComputeUnit *cu = gpuDynInst->computeUnit(); // delete extra instructions fetched for completed work-items wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1, @@ -3725,6 +3727,25 @@ int refCount = wf->computeUnit->getLds() .decreaseRefCounter(wf->dispatchId, wf->wgId); +/** + * The parent WF of this instruction is exiting, therefore + * it should not participate in this barrier any longer. This + * prevents possible deadlock issues if WFs exit early. + */ +int bar_id = WFBarrier::InvalidID; +if (wf->hasBarrier()) { +assert(wf->getStatus() != Wavefront::S_BARRIER); +bar_id = wf->barrierId(); +assert(bar_id != WFBarrier::InvalidID); +wf->releaseBarrier(); +cu->decMaxBarrierCnt(bar_id); +DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the " +"program and decrementing max barrier count for " +"barrier Id%d. 
New max count: %d.\n", cu->cu_id, +wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, +cu->maxBarrierCnt(bar_id)); +} + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", wf->computeUnit->cu_id, wf->wgId, refCount); @@ -3748,6 +3769,20 @@ wf->lastInstExec = 0; if (!refCount) { +/** + * If all WFs have finished, and hence the WG has finished, + * then we can free up the barrier belonging to the parent + * WG, but only if we actually used a barrier (i.e., more + * than one WF in the WG). + */ +if (bar_id != WFBarrier::InvalidID) { +DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " +"now complete. Releasing barrier Id%d.\n", cu->cu_id, +wf->simdId, wf->wfSlotId, wf->wfDynId, +wf->barrierId()); +cu->releaseBarrier(bar_id); +} + /** * Last wavefront of the workgroup has executed return. If the * workgroup is not the final one in the kernel, then simply @@ -4027,12 +4062,21 @@ Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) { Wavefront *wf = gpuDynInst->wavefront(); +ComputeUnit *cu = gpuDynInst->computeUnit(); -assert(wf->barrierCnt == wf->oldBarrierCnt); - -wf->barrierCnt = wf->oldBa