[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Create CU's ports in the standard way

2020-08-27 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/32836 )


Change subject: gpu-compute: Create CU's ports in the standard way
..

gpu-compute: Create CU's ports in the standard way

The CU would initialize its ports in getMasterPort(), which
is not desirable as getMasterPort() may be called several
times for the same port. This can lead to a fatal if the CU
expects to only create a single port of a given type, and may
lead to other issues where stat names are duplicated.

This change instantiates and initializes the CU's ports in the
CU constructor using the CU params.

The index field is also removed from the CU's ports because the
base class already has an ID field, which will be set to the
default value in the base class's constructor for scalar ports.

It doesn't make sense for scalar ports to take an index because
they are scalar, so we let the base class initialize the ID to
the invalid port ID.

Change-Id: Id18386f5f53800a6447d968380676d8fd9bac9df
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32836
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/fetch_unit.cc
M src/gpu-compute/shader.cc
4 files changed, 99 insertions(+), 126 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index 9a41233..2d64fa3 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -96,6 +96,11 @@
 resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
 _masterId(p->system->getMasterId(this, "ComputeUnit")),
 lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
+ldsPort(csprintf("%s-port", name()), this),
+scalarDataPort(csprintf("%s-port", name()), this),
+scalarDTLBPort(csprintf("%s-port", name()), this),
+sqcPort(csprintf("%s-port", name()), this),
+sqcTLBPort(csprintf("%s-port", name()), this),
 _cacheLineSize(p->system->cacheLineSize()),
 _numBarrierSlots(p->num_barrier_slots),
 globalSeqNum(0), wavefrontSize(p->wf_size),
@@ -169,16 +174,18 @@
 fatal("Invalid WF execution policy (CU)\n");
 }

-memPort.resize(wfSize());
+for (int i = 0; i < p->port_memory_port_connection_count; ++i) {
+memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
+}
+
+for (int i = 0; i < p->port_translation_port_connection_count; ++i) {
+tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
+}

 // Setup tokens for slave ports. The number of tokens in memSlaveTokens
 // is the total token count for the entire vector port (i.e., this CU).
 memPortTokens = new TokenManager(p->max_cu_tokens);

-// resize the tlbPort vectorArray
-int tlbPort_width = perLaneTLB ? wfSize() : 1;
-tlbPort.resize(tlbPort_width);
-
 registerExitCallback([this]() { exitCallback(); });

 lastExecCycle.resize(numVectorALUs, 0);
@@ -214,7 +221,6 @@
 lastVaddrSimd[j].clear();
 }
 lastVaddrCU.clear();
-delete ldsPort;
 }

 int
@@ -781,7 +787,7 @@
 // appropriate cycle to process the timing memory response
 // This delay represents the pipeline delay
 SenderState *sender_state = safe_cast(pkt->senderState);
-int index = sender_state->port_index;
+PortID index = sender_state->port_index;
 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
 GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();

@@ -886,7 +892,7 @@
 }

 EventFunctionWrapper *mem_resp_event =
-computeUnit->memPort[index]->createMemRespEvent(pkt);
+computeUnit->memPort[index].createMemRespEvent(pkt);

 DPRINTF(GPUPort,
 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x  
received!\n",

@@ -1007,7 +1013,7 @@
 }

 void
-ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr  
pkt)
+ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr  
pkt)

 {
 // There must be a way around this check to do the globalMemStart...
 Addr tmp_vaddr = pkt->req->getVaddr();
@@ -1039,7 +1045,7 @@
 tlbCycles -= curTick();
 ++tlbRequests;

-int tlbPort_index = perLaneTLB ? index : 0;
+PortID tlbPort_index = perLaneTLB ? index : 0;

 if (shader->timingSim) {
 if (debugSegFault) {
@@ -1074,7 +1080,7 @@
 pkt->senderState = translation_state;

 if (functionalTLB) {
-tlbPort[tlbPort_index]->sendFunctional(pkt);
+tlbPort[tlbPort_index].sendFunctional(pkt);

 // update the hitLevel distribution
 int hit_level = translation_state->hitLevel;
@@ -1117,33 +1123,33 @@
 // translation 

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Create CU's ports in the standard way

2020-08-18 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/32836

to review the following change.


Change subject: gpu-compute: Create CU's ports in the standard way
..

gpu-compute: Create CU's ports in the standard way

The CU would initialize its ports in getMasterPort(), which
is not desirable as getMasterPort() may be called several
times for the same port. This can lead to a fatal if the CU
expects to only create a single port of a given type, and may
lead to other issues where stat names are duplicated.

This change instantiates and initializes the CU's ports in the
CU constructor using the CU params.

The index field is also removed from the CU's ports because the
base class already has an ID field, which will be set to the
default value in the base class's constructor for scalar ports.

It doesn't make sense for scalar ports to take an index because
they are scalar, so we let the base class initialize the ID to
the invalid port ID.

Change-Id: Id18386f5f53800a6447d968380676d8fd9bac9df
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/fetch_unit.cc
3 files changed, 86 insertions(+), 103 deletions(-)



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index 7e0947f..b9f7dec 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -96,6 +96,11 @@
 resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
 _masterId(p->system->getMasterId(this, "ComputeUnit")),
 lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
+ldsPort(csprintf("%s-port", name()), this),
+scalarDataPort(csprintf("%s-port", name()), this),
+scalarDTLBPort(csprintf("%s-port", name()), this),
+sqcPort(csprintf("%s-port", name()), this),
+sqcTLBPort(csprintf("%s-port", name()), this),
 _cacheLineSize(p->system->cacheLineSize()),
 _numBarrierSlots(p->num_barrier_slots),
 globalSeqNum(0), wavefrontSize(p->wf_size),
@@ -169,16 +174,20 @@
 fatal("Invalid WF execution policy (CU)\n");
 }

-memPort.resize(wfSize());
+for (int i = 0; i < p->port_memory_port_connection_count; ++i) {
+memPort.push_back(
+new DataPort(csprintf("%s-port%d", name(), i), this, i));
+}
+
+for (int i = 0; i < p->port_translation_port_connection_count; ++i) {
+tlbPort.push_back(
+new DTLBPort(csprintf("%s-port%d", name(), i), this, i));
+}

 // Setup tokens for slave ports. The number of tokens in memSlaveTokens
 // is the total token count for the entire vector port (i.e., this CU).
 memPortTokens = new TokenManager(p->max_cu_tokens);

-// resize the tlbPort vectorArray
-int tlbPort_width = perLaneTLB ? wfSize() : 1;
-tlbPort.resize(tlbPort_width);
-
 registerExitCallback([this]() { exitCallback(); });

 lastExecCycle.resize(numVectorALUs, 0);
@@ -214,7 +223,14 @@
 lastVaddrSimd[j].clear();
 }
 lastVaddrCU.clear();
-delete ldsPort;
+
+for (auto mem_port : memPort) {
+delete mem_port;
+}
+
+for (auto tlb_port : tlbPort) {
+delete tlb_port;
+}
 }

 int
@@ -781,7 +797,7 @@
 // appropriate cycle to process the timing memory response
 // This delay represents the pipeline delay
 SenderState *sender_state = safe_cast(pkt->senderState);
-int index = sender_state->port_index;
+PortID index = sender_state->port_index;
 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
 GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();

@@ -1007,7 +1023,7 @@
 }

 void
-ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr  
pkt)
+ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr  
pkt)

 {
 // There must be a way around this check to do the globalMemStart...
 Addr tmp_vaddr = pkt->req->getVaddr();
@@ -1039,7 +1055,7 @@
 tlbCycles -= curTick();
 ++tlbRequests;

-int tlbPort_index = perLaneTLB ? index : 0;
+PortID tlbPort_index = perLaneTLB ? index : 0;

 if (shader->timingSim) {
 if (debugSegFault) {
@@ -1205,12 +1221,12 @@
 new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc,  
false,

  pkt->senderState);

-if (scalarDTLBPort->isStalled()) {
-assert(scalarDTLBPort->retries.size());
-scalarDTLBPort->retries.push_back(pkt);
-} else if (!scalarDTLBPort->sendTimingReq(pkt)) {
-scalarDTLBPort->stallPort();
-scalarDTLBPort->retries.push_back(pkt);
+if (scalarDTLBPort.isStalled()) {
+assert(scalarDTLBPort.retries.size());
+scalarDTLBPort.retries.push_back(pkt);
+} else if (!scalarDTLBPort.sendTimingReq(pkt)) {
+scalarDTLBPort.stallPort();
+scalarDTLBPort.retries.push_back(pkt)

[gem5-dev] Change in gem5/gem5[develop]: configs: Replace DirMem w/RubyDirectoryMemory, set addr_ranges

2020-08-18 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/32674 )


Change subject: configs: Replace DirMem w/RubyDirectoryMemory, set  
addr_ranges

..

configs: Replace DirMem w/RubyDirectoryMemory, set addr_ranges

This was originally from the GCN staging branch, which only had
GPU_VIPER.py, but the other GPU_VIPER configs had DirMem as well, so I
applied this change to all of them.

The patch replaces the Directory in DirCntrl from DirMem to
RubyDirectoryMemory. This fixes errors that DirMem caused relating to
setting class variables. It also generates and sets addr_ranges in
DirCntrl as RubyDirectoryMemory uses the parent object's addr_ranges
in its code.

The style checker complained about a line length in GPU_VIPER_Region,
so the patch also fixes that.

Change-Id: Icec96777a51d8a826b576fc752fae0f7f15427bc
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32674
Reviewed-by: Matt Sinclair 
Reviewed-by: Bradford Beckmann 
Maintainer: Bradford Beckmann 
Tested-by: kokoro 
---
M configs/ruby/GPU_VIPER.py
M configs/ruby/GPU_VIPER_Baseline.py
M configs/ruby/GPU_VIPER_Region.py
3 files changed, 69 insertions(+), 47 deletions(-)

Approvals:
  Bradford Beckmann: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve
  kokoro: Regressions pass



diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
index 92dcf5e..967b4d3 100644
--- a/configs/ruby/GPU_VIPER.py
+++ b/configs/ruby/GPU_VIPER.py
@@ -322,24 +322,14 @@
 self.probeToL3 = probe_to_l3
 self.respToL3 = resp_to_l3

-class DirMem(RubyDirectoryMemory, CntrlBase):
-def create(self, options, ruby_system, system):
-self.version = self.versionCount()
-
-phys_mem_size = AddrRange(options.mem_size).size()
-mem_module_size = phys_mem_size / options.num_dirs
-dir_size = MemorySize('0B')
-dir_size.value = mem_module_size
-self.size = dir_size
-
 class DirCntrl(Directory_Controller, CntrlBase):
-def create(self, options, ruby_system, system):
+def create(self, options, dir_ranges, ruby_system, system):
 self.version = self.versionCount()

 self.response_latency = 30

-self.directory = DirMem()
-self.directory.create(options, ruby_system, system)
+self.addr_ranges = dir_ranges
+self.directory = RubyDirectoryMemory()

 self.L3CacheMemory = L3Cache()
 self.L3CacheMemory.create(options, ruby_system, system)
@@ -441,6 +431,17 @@
 # Clusters
 crossbar_bw = None
 mainCluster = None
+
+if options.numa_high_bit:
+numa_bit = options.numa_high_bit
+else:
+# if the numa_bit is not specified, set the directory bits as the
+# lowest bits above the block offset bits, and the numa_bit as the
+# highest of those directory bits
+dir_bits = int(math.log(options.num_dirs, 2))
+block_size_bits = int(math.log(options.cacheline_size, 2))
+numa_bit = block_size_bits + dir_bits - 1
+
 if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
 #Assuming a 2GHz clock
 crossbar_bw = 16 * options.num_compute_units * options.bw_scalor
@@ -448,9 +449,16 @@
 else:
 mainCluster = Cluster(intBW=8) # 16 GB/s
 for i in range(options.num_dirs):
+dir_ranges = []
+for r in system.mem_ranges:
+addr_range = m5.objects.AddrRange(r.start, size = r.size(),
+  intlvHighBit = numa_bit,
+  intlvBits = dir_bits,
+  intlvMatch = i)
+dir_ranges.append(addr_range)

 dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits =  
TCC_bits)

-dir_cntrl.create(options, ruby_system, system)
+dir_cntrl.create(options, dir_ranges, ruby_system, system)
 dir_cntrl.number_of_TBEs = options.num_tbes
 dir_cntrl.useL3OnWT = options.use_L3_on_WT
 # the number_of_TBEs is inclusive of TBEs below
diff --git a/configs/ruby/GPU_VIPER_Baseline.py  
b/configs/ruby/GPU_VIPER_Baseline.py

index 5388a4e..5a3 100644
--- a/configs/ruby/GPU_VIPER_Baseline.py
+++ b/configs/ruby/GPU_VIPER_Baseline.py
@@ -301,22 +301,12 @@
 self.probeToL3 = probe_to_l3
 self.respToL3 = resp_to_l3

-class DirMem(RubyDirectoryMemory, CntrlBase):
-def create(self, options, ruby_system, system):
-self.version = self.versionCount()
-
-phys_mem_size = AddrRange(options.mem_size).size()
-mem_module_size = phys_mem_size / options.num_dirs
-dir_size = MemorySize('0B')
-dir_size.value = mem_module_size
-self.size = dir_size
-
 class DirCntrl(Directory_Controller, CntrlBase):
-def create(self, options, ruby_system, system):
+def c

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: Change how waitcnts are implemented

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29973 )


Change subject: gpu-compute, arch-gcn3: Change how waitcnts are implemented
..

gpu-compute, arch-gcn3: Change how waitcnts are implemented

Use single counters per memory operation type and increment
them upon issue, not execute.

Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29973
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/gpu_dyn_inst.cc
M src/gpu-compute/local_memory_pipeline.cc
M src/gpu-compute/scalar_memory_pipeline.cc
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/wavefront.cc
M src/gpu-compute/wavefront.hh
8 files changed, 106 insertions(+), 18 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 9987fad..7c2cf0e 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32565,6 +32565,7 @@

 vdst.write();

+wf->decLGKMInstsIssued();
 wf->rdLmReqsInPipe--;
 wf->validateRequestCounters();
 } // execute
@@ -32635,6 +32636,7 @@

 vdst.write();

+wf->decLGKMInstsIssued();
 wf->rdLmReqsInPipe--;
 wf->validateRequestCounters();
 } // execute
@@ -39400,6 +39402,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39496,6 +39500,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39592,6 +39598,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39660,6 +39668,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39728,6 +39738,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39805,6 +39817,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 }
@@ -39884,6 +39898,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -39952,6 +39968,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40021,6 +40039,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40090,6 +40110,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40159,6 +40181,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40237,6 +40261,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
  

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add case to op selector when operand is vcc_hi

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29971 )


Change subject: arch-gcn3: Add case to op selector when operand is vcc_hi
..

arch-gcn3: Add case to op selector when operand is vcc_hi

Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29971
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/registers.cc
1 file changed, 2 insertions(+), 0 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/registers.cc b/src/arch/gcn3/registers.cc
index 016160f..d5c4903 100644
--- a/src/arch/gcn3/registers.cc
+++ b/src/arch/gcn3/registers.cc
@@ -141,6 +141,8 @@
  *
  */
 regIdx = numScalarRegs - 2;
+} else if (idx == REG_VCC_HI) {
+regIdx = numScalarRegs - 1;
 } else if (idx == REG_FLAT_SCRATCH_LO) {
 /**
  * the FLAT_SCRATCH register occupies the two SRF entries

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29971
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36
Gerrit-Change-Number: 29971
Gerrit-PatchSet: 8
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Use refs to CU in pipe stages/mem pipes

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29969 )


Change subject: gpu-compute: Use refs to CU in pipe stages/mem pipes
..

gpu-compute: Use refs to CU in pipe stages/mem pipes

The pipe stages and memory pipes are changed to store
a reference to their parent CU as opposed to a pointer.
These objects will never change which CU they belong to,
and they are constructed by their parent CU.

Change-Id: Ie5476e1e2e124a024c2efebceb28cb3a9baa78c1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29969
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/exec_stage.cc
M src/gpu-compute/exec_stage.hh
M src/gpu-compute/fetch_stage.cc
M src/gpu-compute/fetch_stage.hh
M src/gpu-compute/fetch_unit.cc
M src/gpu-compute/fetch_unit.hh
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/global_memory_pipeline.hh
M src/gpu-compute/local_memory_pipeline.cc
M src/gpu-compute/local_memory_pipeline.hh
M src/gpu-compute/scalar_memory_pipeline.cc
M src/gpu-compute/scalar_memory_pipeline.hh
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/schedule_stage.hh
M src/gpu-compute/scoreboard_check_stage.cc
M src/gpu-compute/scoreboard_check_stage.hh
17 files changed, 193 insertions(+), 193 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index 653c074..a59a7fd 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -67,13 +67,13 @@
 vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
 coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
 registerManager(p->register_manager),
-fetchStage(p, this),
-scoreboardCheckStage(p, this),
-scheduleStage(p, this),
-execStage(p, this),
-globalMemoryPipe(p, this),
-localMemoryPipe(p, this),
-scalarMemoryPipe(p, this),
+fetchStage(p, *this),
+scoreboardCheckStage(p, *this),
+scheduleStage(p, *this),
+execStage(p, *this),
+globalMemoryPipe(p, *this),
+localMemoryPipe(p, *this),
+scalarMemoryPipe(p, *this),
 tickEvent([this]{ exec(); }, "Compute unit tick event",
   false, Event::CPU_Tick_Pri),
 cu_id(p->cu_id),
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc
index e420579..2b0a797 100644
--- a/src/gpu-compute/exec_stage.cc
+++ b/src/gpu-compute/exec_stage.cc
@@ -41,10 +41,10 @@
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"

-ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
+ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
 : computeUnit(cu), lastTimeInstExecuted(false),
   thisTimeInstExecuted(false), instrExecuted (false),
-  executionResourcesUsed(0), _name(cu->name() + ".ExecStage")
+  executionResourcesUsed(0), _name(cu.name() + ".ExecStage")

 {
 numTransActiveIdle = 0;
@@ -54,7 +54,7 @@
 void
 ExecStage::init()
 {
-dispatchList = &computeUnit->dispatchList;
+dispatchList = &computeUnit.dispatchList;
 idle_dur = 0;
 }

@@ -127,7 +127,7 @@
 {
 std::stringstream ss;
 bool empty = true;
-for (int i = 0; i < computeUnit->numExeUnits(); i++) {
+for (int i = 0; i < computeUnit.numExeUnits(); i++) {
 DISPATCH_STATUS s = dispatchList->at(i).second;
 ss << i << ": " << dispStatusToStr(s);
 if (s != EMPTY) {
@@ -151,7 +151,7 @@
 if (Debug::GPUSched) {
 dumpDispList();
 }
-for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
 DISPATCH_STATUS s = dispatchList->at(unitId).second;
 switch (s) {
 case EMPTY:
@@ -168,7 +168,7 @@
 (w->instructionBuffer.front())->disassemble());
 DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
 dispatchList->at(unitId).first->exec();
-(computeUnit->scheduleStage).deleteFromSch(w);
+(computeUnit.scheduleStage).deleteFromSch(w);
 dispatchList->at(unitId).second = EMPTY;
 dispatchList->at(unitId).first->freeResources();
 dispatchList->at(unitId).first = nullptr;
@@ -208,7 +208,7 @@
 ;

 spc
-.init(0, computeUnit->numExeUnits(), 1)
+.init(0, computeUnit.numExeUnits(), 1)
 .name(name() + ".spc")
 .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
 ;
@@ -220,26 +220,26 @@
 ;

 numCyclesWithInstrTypeIssued
-.init(computeUnit->numExeUnits())
+.init(computeUnit.numExeUnits())
 .name(name() + ".num_cycles_issue_exec_rsrc")
 .desc("Number of cycle

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: No RF scheduling in case of SKIP or EMPTY

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29970 )


Change subject: gpu-compute: No RF scheduling in case of SKIP or EMPTY
..

gpu-compute: No RF scheduling in case of SKIP or EMPTY

In case of flat memory instructions the status for the
LM pipe execution unit is set to SKIP or EMPTY, as the bus
between the VRF and the GM and LM pipe is shared. The
destination operands should not be scheduled for the LM pipe,
even if the wave is in the dispatch list. This can lead
to deadlock in the destination cache as DCEs are reused
and the slotsAvailableForBank count gets artificially
incremented.

Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29970
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/gpu-compute/schedule_stage.cc
1 file changed, 5 insertions(+), 1 deletion(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/schedule_stage.cc  
b/src/gpu-compute/schedule_stage.cc

index 0785aa0..e0600a6 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -236,9 +236,13 @@
 ScheduleStage::scheduleRfDestOperands()
 {
 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
-if (!dispatchList->at(j).first) {
+if (dispatchList->at(j).second == EMPTY ||
+dispatchList->at(j).second == SKIP) {
 continue;
 }
+
+assert(dispatchList->at(j).first);
+
 // get the wave on dispatch list and attempt to allocate write
 // resources in the RFs
 Wavefront *w = dispatchList->at(j).first;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29970
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd
Gerrit-Change-Number: 29970
Gerrit-PatchSet: 9
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Alexandru Duțu 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Replace some instances of std::isnormal with std::fpclassify

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29967 )


Change subject: arch-gcn3: Replace some instances of std::isnormal with  
std::fpclassify

..

arch-gcn3: Replace some instances of std::isnormal with std::fpclassify

Affected instructions: V_DIV_SCALE_F64, V_CMP_CLASS_F64,
V_CMPX_CLASS_F64 and their VOPC, VOP3, F32 variants.

These instances of std::isnormal were being used to check for
subnormal (denorms) values. std::isnormal is not specific enough.
It returns true for normal values but false for NaN, Inf, 0.0, and
subnormals. std::fpclassify returns macros for each category of
floating point numbers. Now we only catch subnormals.

Change-Id: I8d8f4452ff58de71e7c8e0b2b5e73467b532e196
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29967
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 23 insertions(+), 21 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 302dad4..9987fad 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -9439,7 +9439,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9463,7 +9463,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9551,7 +9551,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9575,7 +9575,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9664,7 +9664,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9688,7 +9688,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9777,7 +9777,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9801,7 +9801,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -15550,7 +15550,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 sdst.setBit(lane,  1);
 continue;
@@ -15574,7 +15574,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix stride bug in buffer OOB detection logic

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29968 )


Change subject: arch-gcn3: Fix stride bug in buffer OOB detection logic
..

arch-gcn3: Fix stride bug in buffer OOB detection logic

The out-of-range logic for buffer accesses is missing the top 4 bits of
const_stride when dealing with scratch buffers.  This can cause
perfectly valid scratch accesses to be suppressed when const_stride is
large.

Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29968
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/op_encodings.hh
1 file changed, 3 insertions(+), 3 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/op_encodings.hh  
b/src/arch/gcn3/insts/op_encodings.hh

index 202dd1d..b35fb3d 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -651,7 +651,7 @@
  * non-formatted accesses, this is done on a per-lane
  * basis.
  */
-if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
+if (stride == 0 || !rsrc_desc.swizzleEn) {
 if (buf_off + stride * buf_idx >=
 rsrc_desc.numRecords - s_offset.rawData()) {
 DPRINTF(GCN3, "mubuf out-of-bounds condition  
1: "

@@ -659,13 +659,13 @@
 "const_stride = %llx, "
 "const_num_records = %llx\n",
 lane, buf_off + stride * buf_idx,
-rsrc_desc.stride,  
rsrc_desc.numRecords);

+stride, rsrc_desc.numRecords);
 oobMask.set(lane);
 continue;
 }
 }

-if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
+if (stride != 0 && rsrc_desc.swizzleEn) {
 if (buf_idx >= rsrc_desc.numRecords ||
 buf_off >= stride) {
 DPRINTF(GCN3, "mubuf out-of-bounds condition  
2: "


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29968
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e
Gerrit-Change-Number: 29968
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix VOP3 V_LDEXP_F64

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29966 )


Change subject: arch-gcn3: Fix VOP3 V_LDEXP_F64
..

arch-gcn3: Fix VOP3 V_LDEXP_F64

Replaced !std::isnormal with std::fpclassify because std::isnormal
is not specific enough. !std::isnormal was incorrectly catching
NaN, Inf, 0.0, and subnormals (aka denormals), whereas it was only
supposed to catch subnormals.

The return value and error handling spec of std::ldexp listed on
cppreference.com appears to match up in nearly all cases after
making these changes. If std::ldexp handled subnormals as described
in the GCN3 2016 guide, we could have used vdst[lane] = std::ldexp
and not need to check for any corner cases.

Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29966
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 4 insertions(+), 3 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 2b992b1..302dad4 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -30282,10 +30282,11 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-if (std::isnan(src1[lane]) || std::isinf(src1[lane])) {
+if (std::isnan(src0[lane]) || std::isinf(src0[lane])) {
 vdst[lane] = src0[lane];
-} else if (!std::isnormal(src1[lane])) {
-if (std::signbit(src1[lane])) {
+} else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
+   || std::fpclassify(src0[lane]) == FP_ZERO) {
+if (std::signbit(src0[lane])) {
 vdst[lane] = -0.0;
 } else {
 vdst[lane] = +0.0;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29966
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441
Gerrit-Change-Number: 29966
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29964 )


Change subject: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and  
V_RNDNE_F32

..

arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32

roundNearestEven is an inst_util function that RNDNE_F64 and F32
call, including both VOP1 and VOP3 formats. IEEE 754 spec says this
function should round inputs to the nearest integer but round ties
to the nearest even integer. Prior to this patch it was rounding all
inputs to nearest even, not just the ties. It was probably implemented
this way originally because the language in the ISA manual is ambiguous
although it provided the correct logic.

Fixed roundNearestEven to use the semantics originally described in
the GCN3 ISA manual.

Change-Id: I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29964
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/inst_util.hh
1 file changed, 7 insertions(+), 1 deletion(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/inst_util.hh  
b/src/arch/gcn3/insts/inst_util.hh

index b40e890..15ffe9a 100644
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -258,7 +258,13 @@
 template 
 inline T roundNearestEven(T val)
 {
-T nearest_round = std::round(val * 0.5) * 2.0;
+T int_part = 0;
+T nearest_round = std::floor(val + 0.5);
+if ((int)std::floor(val) % 2 == 0
+&& std::modf(std::abs(val), &int_part) == 0.5) {
+  nearest_round = nearest_round - 1;
+}
+
 return nearest_round;
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29964
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7
Gerrit-Change-Number: 29964
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fix Y-dimension ABI decode

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29965 )


Change subject: gpu-compute: Fix Y-dimension ABI decode
..

gpu-compute: Fix Y-dimension ABI decode

We currently have a bug in decoding workitem ID from the kernel
descriptor with multiple dimensions.  The enable_vgpr_workitem_id bits
are currently separated into x and y components, when they should be
treated as a single 2 bit value, where y is enabled when it is > 0,
and z is enabled when it is > 1.  The current setup allows a kernel
launch with vgprs reserved for the z dimension and not the y dimension,
which is incorrect.

Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29965
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/gpu-compute/hsa_queue_entry.hh
M src/gpu-compute/kernel_code.hh
2 files changed, 3 insertions(+), 4 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/hsa_queue_entry.hh  
b/src/gpu-compute/hsa_queue_entry.hh

index 5fc5e56..ea79869 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -417,8 +417,8 @@
  * workitem Id in the X dimension is always initialized.
  */
 initialVgprState.set(WorkitemIdX, true);
-initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y);
-initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z);
+initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id >  
0);
+initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id >  
1);

 }

 // name of the kernel associated with the AQL entry
diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh
index b3560c7..680dd72 100644
--- a/src/gpu-compute/kernel_code.hh
+++ b/src/gpu-compute/kernel_code.hh
@@ -130,8 +130,7 @@
 uint32_t enable_sgpr_workgroup_id_y : 1;
 uint32_t enable_sgpr_workgroup_id_z : 1;
 uint32_t enable_sgpr_workgroup_info : 1;
-uint32_t enable_vgpr_workitem_id_y : 1;
-uint32_t enable_vgpr_workitem_id_z : 1;
+uint32_t enable_vgpr_workitem_id : 2;
 uint32_t enable_exception_address_watch : 1;
 uint32_t enable_exception_memory_violation : 1;
 uint32_t granulated_lds_size : 9;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29965
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323
Gerrit-Change-Number: 29965
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Don't track vector store insts in CU's headTailMap

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29963 )


Change subject: gpu-compute: Don't track vector store insts in CU's  
headTailMap

..

gpu-compute: Don't track vector store insts in CU's headTailMap

This change fixes a memory leak due to live GPUDynInstPtr references
to vector store insts being stored in the CU's headTailMap and never
released.

This happened because store insts are not supposed to have their
head-tail latencies tracked by the headTailMap; instead they use
timing information from the GPUCoalescer. When updating the
headTailLatency stat via the headTailMap, only loads were considered
and removed from the headTailMap, however when inserting into the
headTailMap loads and stores were considered, thus leading to the
memory leak.

This change fixes the issue by only adding loads to the headTailMap.

Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29963
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/gpu-compute/compute_unit.cc
1 file changed, 5 insertions(+), 3 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index f3387a7..653c074 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1389,9 +1389,11 @@
 gpuDynInst->wfSlotId);
 }
 } else {
-if (!compute_unit->headTailMap.count(gpuDynInst)) {
-compute_unit->headTailMap.insert(
-std::make_pair(gpuDynInst, curTick()));
+if (pkt->isRead()) {
+if (!compute_unit->headTailMap.count(gpuDynInst)) {
+compute_unit->headTailMap
+.insert(std::make_pair(gpuDynInst, curTick()));
+}
 }
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29963
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1
Gerrit-Change-Number: 29963
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add all s_buffer_load_dword instructions

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29962 )


Change subject: arch-gcn3: add all s_buffer_load_dword instructions
..

arch-gcn3: add all s_buffer_load_dword instructions

Adds the other s_buffer_load_dword* instruction implementations to
f134a84.

Change-Id: I8d97527278900dc68c32463ea1824409ccd04e1d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29962
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 125 insertions(+), 8 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 002c4d5..2b992b1 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -4737,17 +4737,46 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDynInst, rsrcDesc, offset);
+
+gpuDynInst->computeUnit()->scalarMemoryPipe
+.getGMReqFIFO().push(gpuDynInst);
+
+wf->scalarRdGmReqsInPipe--;
+wf->scalarOutstandingReqsRdGm++;
+gpuDynInst->wavefront()->outstandingReqs++;
+gpuDynInst->wavefront()->validateRequestCounters();
+} // execute

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
+initMemRead<1>(gpuDynInst);
 } // initiateAcc

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
 {
+// 1 request, size 32
+ScalarOperandU32 sdst(gpuDynInst, instData.SDATA);
+sdst.write();
 } // completeAcc

 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2(
@@ -4767,17 +4796,46 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDynInst, rsrcDesc, offset);
+
+gpuDynInst->computeUnit()->scalarMemoryPipe
+.getGMReqFIFO().push(gpuDynInst);
+
+wf->scalarRdGmReqsInPipe--;
+wf->scalarOutstandingReqsRdGm++;
+gpuDynInst->wavefront()->outstandingReqs++;
+gpuDynInst->wavefront()->validateRequestCounters();
+} // execute

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
+initMemRead<2>(gpuDynInst);
 } // initiateAcc

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
 {
+// use U64 because 2 requests, each size 32
+ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
+sdst.write();
 } // completeAcc

 Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4(
@@ -4797,17 +4855,46 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDyn

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add memcpy condition when writing EXEC_LO

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29961 )


Change subject: arch-gcn3: Add memcpy condition when writing EXEC_LO
..

arch-gcn3: Add memcpy condition when writing EXEC_LO

Some compilers emit an error on the operand template class when writing
exec mask. Add a condition to explicitly set memcpy size argument to
32b or 64b based on the number of dwords.

Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29961
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/operand.hh
1 file changed, 9 insertions(+), 2 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh
index 9d28deb..97c6310 100644
--- a/src/arch/gcn3/operand.hh
+++ b/src/arch/gcn3/operand.hh
@@ -437,8 +437,15 @@
 if (_opIdx == REG_EXEC_LO) {
 ScalarRegU64 new_exec_mask_val
 = wf->execMask().to_ullong();
-std::memcpy((void*)&new_exec_mask_val,
-(void*)srfData.data(), sizeof(srfData));
+if (NumDwords == 1) {
+std::memcpy((void*)&new_exec_mask_val,
+(void*)srfData.data(), sizeof(VecElemU32));
+} else if (NumDwords == 2) {
+std::memcpy((void*)&new_exec_mask_val,
+(void*)srfData.data(), sizeof(VecElemU64));
+} else {
+panic("Trying to write more than 2 DWORDS to  
EXEC\n");

+}
 VectorMask new_exec_mask(new_exec_mask_val);
 wf->execMask() = new_exec_mask;
 DPRINTF(GPUSRF, "Write EXEC\n");

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29961
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624
Gerrit-Change-Number: 29961
Gerrit-PatchSet: 8
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Matthew Poremba 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Remove invalid assert when reading EXEC_LO

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29960 )


Change subject: arch-gcn3: Remove invalid assert when reading EXEC_LO
..

arch-gcn3: Remove invalid assert when reading EXEC_LO

This assert assumed all reads to EXEC_LO would be
64b, that is, we would always read the entire EXEC
mask. This is invalid as some kernels read only
the low 32b of EXEC.

The write to EXEC_LO is also updated to handle 32b
writes.

Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29960
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/operand.hh
1 file changed, 3 insertions(+), 3 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh
index 960d05e..9d28deb 100644
--- a/src/arch/gcn3/operand.hh
+++ b/src/arch/gcn3/operand.hh
@@ -435,9 +435,10 @@

 if (!isScalarReg(_opIdx)) {
 if (_opIdx == REG_EXEC_LO) {
-ScalarRegU64 new_exec_mask_val(0);
+ScalarRegU64 new_exec_mask_val
+= wf->execMask().to_ullong();
 std::memcpy((void*)&new_exec_mask_val,
-(void*)srfData.data(), sizeof(new_exec_mask_val));
+(void*)srfData.data(), sizeof(srfData));
 VectorMask new_exec_mask(new_exec_mask_val);
 wf->execMask() = new_exec_mask;
 DPRINTF(GPUSRF, "Write EXEC\n");
@@ -513,7 +514,6 @@
 switch(_opIdx) {
   case REG_EXEC_LO:
 {
-assert(NumDwords == 2);
 ScalarRegU64 exec_mask = _gpuDynInst->wavefront()->
 execMask().to_ullong();
 std::memcpy((void*)srfData.data(), (void*)&exec_mask,

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29960
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358
Gerrit-Change-Number: 29960
Gerrit-PatchSet: 8
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu_compute: Support loading BLIT kernels

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29959 )


Change subject: gpu_compute: Support loading BLIT kernels
..

gpu_compute: Support loading BLIT kernels

The BLIT kernels used to implement DMA through the shaders don't fill
out all of the standard fields in an amd_kernel_code_t object.  This
patch modifies the code object parsing logic to support these new
kernels.

BLIT kernels are used in APUs when using ROCm memcopies for certain size
buffers, and are used for dGPUs when the SDMA engines are disabled.

Change-Id: Id4e667474d05e311097dbec443def07dfad14a79
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29959
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/gpu-compute/gpu_command_processor.cc
M src/gpu-compute/hsa_queue_entry.hh
2 files changed, 31 insertions(+), 4 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/gpu_command_processor.cc  
b/src/gpu-compute/gpu_command_processor.cc

index b6205ac..fccc035 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -100,11 +100,25 @@
 machine_code_addr);

 Addr kern_name_addr(0);
-virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
-(uint8_t*)&kern_name_addr, 0x8);
-
 std::string kernel_name;
-virt_proxy.readString(kernel_name, kern_name_addr);
+
+/**
+ * BLIT kernels don't have symbol names.  BLIT kernels are built-in  
compute

+ * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
+ * hardware engines are unavailable or explicitly disabled.  They can  
also

+ * be used to do copies that ROCm thinks would be better performed
+ * by the shader than the SDMA engines.  They are also sometimes used  
on

+ * APUs to implement asynchronous memcopy operations from 2 pointers in
+ * host memory.  I have no idea what BLIT stands for.
+ * */
+if (akc.runtime_loader_kernel_symbol) {
+virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
+(uint8_t*)&kern_name_addr, 0x8);
+
+virt_proxy.readString(kernel_name, kern_name_addr);
+} else {
+kernel_name = "Blit kernel";
+}

 DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

diff --git a/src/gpu-compute/hsa_queue_entry.hh  
b/src/gpu-compute/hsa_queue_entry.hh

index a6917db..5fc5e56 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -88,6 +88,19 @@
   _globalWgId(0), dispatchComplete(false)

 {
+// Precompiled BLIT kernels actually violate the spec a bit
+// and don't set many of the required akc fields.  For these  
kernels,

+// we need to rip register usage from the resource registers.
+//
+// We can't get an exact number of registers from the resource
+// registers because they round, but we can get an upper bound on  
it

+if (!numVgprs)
+numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+
+// TODO: Granularity changes for GFX9!
+if (!numSgprs)
+numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
+
 initialVgprState.reset();
 initialSgprState.reset();


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29959
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Id4e667474d05e311097dbec443def07dfad14a79
Gerrit-Change-Number: 29959
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29958 )


Change subject: arch-gcn3: Implement ds_swizzle
..

arch-gcn3: Implement ds_swizzle

Change-Id: I7d188388afa16932217ae207368666a724207c52
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29958
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 102 insertions(+), 2 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 71efd8f..002c4d5 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32266,6 +32266,7 @@
 Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_swizzle_b32")
 {
+ setFlag(Load);
 } // Inst_DS__DS_SWIZZLE_B32

 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()
@@ -32277,8 +32278,107 @@
 void
 Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+wf->rdLmReqsInPipe--;
+wf->validateRequestCounters();
+
+if (gpuDynInst->exec_mask.none()) {
+return;
+}
+
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+/**
+ * The "DS pattern" is comprised of both offset fields. That is,  
the

+ * swizzle pattern between lanes. Bit 15 of the DS pattern dictates
+ * which swizzle mode to use. There are two different swizzle
+ * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use
+ * QDMode else use Bit-masks mode. The remaining bits dictate how  
to

+ * swizzle the lanes.
+ *
+ * QDMode:  Chunks the lanes into 4s and swizzles among them.
+ *  Bits 7:6 dictate where lane 3 (of the current  
chunk)

+ *  gets its data, 5:4 lane 2, etc.
+ *
+ * Bit-mask:This mode breaks bits 14:0 into 3 equal-sized  
chunks.

+ *  14:10 is the xor_mask, 9:5 is the or_mask, and 4:0
+ *  is the and_mask. Each lane is swizzled by  
performing

+ *  the appropriate operation using these masks.
+ */
+VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) |  
instData.OFFSET0);

+
+data.read();
+
+if (bits(ds_pattern, 15)) {
+// QDMode
+for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) {
+/**
+ * This operation allows data sharing between groups
+ * of four consecutive threads. Note the increment by
+ * 4 in the for loop.
+ */
+if (gpuDynInst->exec_mask[lane]) {
+int index0 = lane + bits(ds_pattern, 1, 0);
+panic_if(index0 >= NumVecElemPerVecReg, "%s: index0  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index0);
+vdst[lane]
+= gpuDynInst->exec_mask[index0] ? data[index0]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 1]) {
+int index1 = lane + bits(ds_pattern, 3, 2);
+panic_if(index1 >= NumVecElemPerVecReg, "%s: index1  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index1);
+vdst[lane + 1]
+= gpuDynInst->exec_mask[index1] ? data[index1]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 2]) {
+int index2 = lane + bits(ds_pattern, 5, 4);
+panic_if(index2 >= NumVecElemPerVecReg, "%s: index2  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index2);
+vdst[lane + 2]
+= gpuDynInst->exec_mask[index2] ? data[index2]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 3]) {
+int index3 = lane + bits(ds_pattern, 7, 6);
+panic_if(index3 >= NumVecElemPerVecReg, "%s: index3  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index3);
+   

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement s_buffer_load_dwordx16

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29957 )


Change subject: arch-gcn3: Implement s_buffer_load_dwordx16
..

arch-gcn3: Implement s_buffer_load_dwordx16

Change-Id: I25382dcae9bb55eaf035385fa925157f25d39c20
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29957
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
2 files changed, 90 insertions(+), 31 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 567cc10..71efd8f 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -4857,17 +4857,45 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDynInst, rsrcDesc, offset);
+
+gpuDynInst->computeUnit()->scalarMemoryPipe
+.getGMReqFIFO().push(gpuDynInst);
+
+wf->scalarRdGmReqsInPipe--;
+wf->scalarOutstandingReqsRdGm++;
+gpuDynInst->wavefront()->outstandingReqs++;
+gpuDynInst->wavefront()->validateRequestCounters();
+} // execute

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr  
gpuDynInst)

 {
+initMemRead<16>(gpuDynInst);
 } // initiateAcc

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr  
gpuDynInst)

 {
+ScalarOperandU512 sdst(gpuDynInst, instData.SDATA);
+sdst.write();
 } // completeAcc

 Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt)
diff --git a/src/arch/gcn3/insts/op_encodings.hh  
b/src/arch/gcn3/insts/op_encodings.hh

index 4056f0a..202dd1d 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -46,6 +46,29 @@

 namespace Gcn3ISA
 {
+struct BufferRsrcDescriptor
+{
+uint64_t baseAddr : 48;
+uint32_t stride : 14;
+uint32_t cacheSwizzle : 1;
+uint32_t swizzleEn : 1;
+uint32_t numRecords : 32;
+uint32_t dstSelX : 3;
+uint32_t dstSelY : 3;
+uint32_t dstSelZ : 3;
+uint32_t dstSelW : 3;
+uint32_t numFmt : 3;
+uint32_t dataFmt : 4;
+uint32_t elemSize : 2;
+uint32_t idxStride : 2;
+uint32_t addTidEn : 1;
+uint32_t atc : 1;
+uint32_t hashEn : 1;
+uint32_t heap : 1;
+uint32_t mType : 3;
+uint32_t type : 2;
+};
+
 // --- purely virtual instruction classes ---

 class Inst_SOP2 : public GCN3GPUStaticInst
@@ -197,14 +220,45 @@
 MemCmd::WriteReq);
 }

+/**
+ * For normal s_load_dword/s_store_dword instruction addresses.
+ */
 void
-calcAddr(GPUDynInstPtr gpuDynInst, ConstScalarOperandU64 &addr,
-ScalarRegU32 offset)
+calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
+ ScalarRegU32 offset)
 {
-Addr vaddr = addr.rawData();
-vaddr += offset;
-vaddr &= ~0x3;
-gpuDynInst->scalarAddr = vaddr;
+Addr vaddr = ((addr.rawData() + offset) & ~0x3);
+gpu_dyn_inst->scalarAddr = vaddr;
+}
+
+/**
+ * For s_buffer_load_dword/s_buffer_store_dword instruction  
addresses.
+ * The s_buffer instructions use the same buffer resource  
descriptor

+ * as the MUBUF instructions.
+ */
+void
+calcAddr(GPUDynInstPtr gpu_dyn_inst,
+ ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
+{
+BufferRsrcDescriptor rsrc_desc;
+ScalarRegU32 clamped_offset(offset);
+std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
+sizeof(BufferRsrcDescriptor));
+
+/**
+ * The address is clamped if:
+ * Stride is zero: clamp if offset >= num_records
+ * Stride is non-zer

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fixup DIV instructions

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29956 )


Change subject: arch-gcn3: Fixup DIV instructions
..

arch-gcn3: Fixup DIV instructions

Adds support to handle the special cases
for GCN3 DIV instructions.

Change-Id: I18f91870e802407c93831f313ce76be053bc4230
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29956
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 44 insertions(+), 42 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index a25ec17..567cc10 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -28952,34 +28952,35 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-int signOut = std::signbit(src1[lane]) ^
-  std::signbit(src2[lane]);
-int exp1, exp2;
-std::frexp(src1[lane],&exp1);
-std::frexp(src2[lane],&exp2);
-if (std::isnan(src2[lane])) {
-vdst[lane] = src2[lane];
-} else if (std::isnan(src1[lane])) {
-vdst[lane] = src1[lane];
-} else if (src1[lane] == 0.0 && src2[lane] == 0.0) {
-vdst[lane] = -NAN;
+int sign_out = std::signbit(src1[lane])
+  ^ std::signbit(src2[lane]);
+int exp1(0);
+int exp2(0);
+std::frexp(src1[lane], &exp1);
+std::frexp(src2[lane], &exp2);
+
+if (std::isnan(src1[lane]) || std::isnan(src2[lane])) {
+vdst[lane] =  
std::numeric_limits::quiet_NaN();

+} else if (std::fpclassify(src1[lane]) == FP_ZERO
+   && std::fpclassify(src2[lane]) == FP_ZERO) {
+vdst[lane]
+= std::numeric_limits::signaling_NaN();
 } else if (std::isinf(src1[lane]) &&  
std::isinf(src2[lane])) {

-vdst[lane] = -NAN;
-} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) {
-vdst[lane] = signOut ? -INFINITY : +INFINITY;
-} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) {
-vdst[lane] = signOut ? -0.0 : +0.0;
+vdst[lane]
+= std::numeric_limits::signaling_NaN();
+} else if (std::fpclassify(src1[lane]) == FP_ZERO
+   || std::isinf(src2[lane])) {
+vdst[lane] = sign_out ? -INFINITY : +INFINITY;
+} else if (std::isinf(src1[lane])
+   || std::fpclassify(src2[lane]) == FP_ZERO) {
+vdst[lane] = sign_out ? -0.0 : +0.0;
 } else if (exp2 - exp1 < -1075) {
-warn_once("fixup_f64 unimplemented case:"
-  "exp2 - ex1 < -1075");
 vdst[lane] = src0[lane];
 } else if (exp1 == 2047) {
-warn_once("fixup_f64 unimplemented case:"
-  "exp1 == 2047");
 vdst[lane] = src0[lane];
 } else {
-vdst[lane] = ((uint64_t)signOut<<63) |
-((uint64_t)src0[lane] & 0x7fffULL);
+vdst[lane] = sign_out ? -std::fabs(src0[lane])
+: std::fabs(src0[lane]);
 }
 }
 }
@@ -29089,36 +29090,37 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-int exp1, exp2;
-std::frexp(src1[lane],&exp1);
-std::frexp(src2[lane],&exp2);
+int exp1(0);
+int exp2(0);
+std::frexp(src1[lane], &exp1);
+std::frexp(src2[lane], &exp2);
 vcc.setBit(lane, 0);
-if (src2[lane] == 0 || src1[lane] == 0) {
+
+if (std::fpclassify(src1[lane]) == FP_ZERO
+|| std::fpclassify(src2[lane]) == FP_ZERO) {
 vdst[lane] = NAN;
 } else if (exp2 - exp1 >= 768) {
 vcc.setBit(lane, 1);
 if (src0[lane] == src1[lane]) {
-vdst[lane] = std::ldexp(src0[lane],128);
+vdst[lane] = std::ldexp(src0[lane], 128);
 }
-} else if (exp1 == 0) {
- 

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add handling for Inf/overflow in CVT insts

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29953 )


Change subject: arch-gcn3: Add handling for Inf/overflow in CVT insts
..

arch-gcn3: Add handling for Inf/overflow in CVT insts

Change-Id: I0fddffdeaebd9f45fe89f44d536f80a43de63ff5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29953
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 77 insertions(+), 1 deletion(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index e93278a..a7b8923 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -7260,8 +7260,16 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane]) || exp > 30) {
+if (std::signbit(src[lane])) {
+vdst[lane] = INT_MIN;
+} else {
+vdst[lane] = INT_MAX;
+}
 } else {
 vdst[lane] = (VecElemI32)src[lane];
 }
@@ -7386,8 +7394,18 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane])) {
+if (std::signbit(src[lane])) {
+vdst[lane] = 0;
+} else {
+vdst[lane] = UINT_MAX;
+}
+} else if (exp > 31) {
+vdst[lane] = UINT_MAX;
 } else {
 vdst[lane] = (VecElemU32)src[lane];
 }
@@ -7422,8 +7440,16 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane]) || exp > 30) {
+if (std::signbit(src[lane])) {
+vdst[lane] = INT_MIN;
+} else {
+vdst[lane] = INT_MAX;
+}
 } else {
 vdst[lane] = (VecElemI32)src[lane];
 }
@@ -7772,8 +7798,18 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane])) {
+if (std::signbit(src[lane])) {
+vdst[lane] = 0;
+} else {
+vdst[lane] = UINT_MAX;
+}
+} else if (exp > 31) {
+vdst[lane] = UINT_MAX;
 } else {
 vdst[lane] = (VecElemU32)src[lane];
 }
@@ -25075,8 +25111,16 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane]) || exp > 30) {
+if (std::signbit(src[lane])) {
+vdst[lane] = INT_MIN;
+} else {
+vdst[lane] = INT_MAX;
+}
 } else {
 vdst[lane] = (VecElemI32)src[lane];
 }
@@ -25235,8 +25279,18 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane])) {
+if (std::signbit(src[lane])) {
+vdst[lane] = 0;
+} else {
+vdst[lane] = UINT_MAX;
+}
+} else if (exp > 31) {
+vdst[lane] = UINT_MAX;
 } else {
 vdst[lane] = (V

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix s_getpc operand information

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29954 )


Change subject: arch-gcn3: Fix s_getpc operand information
..

arch-gcn3: Fix s_getpc operand information

s_getpc was previously reporting only a single operand,
and was only considering the SSRC operand. However,
this instruction's source is implicitly the PC.
Because its destination register was never tracked for
dependence checking purposes, dependence violations
are possible.

Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29954
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.hh
M src/arch/gcn3/insts/op_encodings.cc
2 files changed, 15 insertions(+), 10 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.hh  
b/src/arch/gcn3/insts/instructions.hh

index b0cc37e..f561043 100644
--- a/src/arch/gcn3/insts/instructions.hh
+++ b/src/arch/gcn3/insts/instructions.hh
@@ -5846,9 +5846,7 @@
 getOperandSize(int opIdx) override
 {
 switch (opIdx) {
-  case 0: //ssrc
-return 8;
-  case 1: //sdst
+  case 0: //sdst
 return 8;
   default:
 fatal("op idx %i out of bounds\n", opIdx);
@@ -5860,9 +5858,7 @@
 isSrcOperand(int opIdx) override
 {
 switch (opIdx) {
-  case 0: //ssrc
-return true;
-  case 1: //sdst
+  case 0: //sdst
 return false;
   default:
 fatal("op idx %i out of bounds\n", opIdx);
@@ -5874,9 +5870,7 @@
 isDstOperand(int opIdx) override
 {
 switch (opIdx) {
-  case 0: //ssrc
-return false;
-  case 1: //sdst
+  case 0: //sdst
 return true;
   default:
 fatal("op idx %i out of bounds\n", opIdx);
diff --git a/src/arch/gcn3/insts/op_encodings.cc  
b/src/arch/gcn3/insts/op_encodings.cc

index 22d0f48..997b22f 100644
--- a/src/arch/gcn3/insts/op_encodings.cc
+++ b/src/arch/gcn3/insts/op_encodings.cc
@@ -326,7 +326,12 @@

 switch (opIdx) {
   case 0:
-  return isScalarReg(instData.SSRC0);
+if (instData.OP == 0x1C) {
+// Special case for s_getpc, which has no source reg.
+// Instead, it implicitly reads the PC.
+return isScalarReg(instData.SDST);
+}
+return isScalarReg(instData.SSRC0);
   case 1:
   return isScalarReg(instData.SDST);
   default:
@@ -353,6 +358,12 @@

 switch (opIdx) {
   case 0:
+if (instData.OP == 0x1C) {
+// Special case for s_getpc, which has no source reg.
+// Instead, it implicitly reads the PC.
+return opSelectorToRegIdx(instData.SDST,
+gpuDynInst->wavefront()->reservedScalarRegs);
+}
 return opSelectorToRegIdx(instData.SSRC0,
 gpuDynInst->wavefront()->reservedScalarRegs);
   case 1:

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29954
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6
Gerrit-Change-Number: 29954
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fixed scale,fixup,fmas f64 ops

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29955 )


Change subject: arch-gcn3: fixed scale,fixup,fmas f64 ops
..

arch-gcn3: fixed scale,fixup,fmas f64 ops

Change-Id: Ie13794554db8a958fda1f7103ec18058fda2e66d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29955
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 65 insertions(+), 17 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index a7b8923..a25ec17 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -28952,22 +28952,34 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-if (std::fpclassify(src1[lane]) == FP_ZERO) {
-if (std::signbit(src1[lane])) {
-vdst[lane] = -INFINITY;
-} else {
-vdst[lane] = +INFINITY;
-}
-} else if (std::isnan(src2[lane]) ||  
std::isnan(src1[lane])) {

-vdst[lane] = NAN;
-} else if (std::isinf(src1[lane])) {
-if (std::signbit(src1[lane])) {
-vdst[lane] = -INFINITY;
-} else {
-vdst[lane] = +INFINITY;
-}
+int signOut = std::signbit(src1[lane]) ^
+  std::signbit(src2[lane]);
+int exp1, exp2;
+std::frexp(src1[lane],&exp1);
+std::frexp(src2[lane],&exp2);
+if (std::isnan(src2[lane])) {
+vdst[lane] = src2[lane];
+} else if (std::isnan(src1[lane])) {
+vdst[lane] = src1[lane];
+} else if (src1[lane] == 0.0 && src2[lane] == 0.0) {
+vdst[lane] = -NAN;
+} else if (std::isinf(src1[lane]) &&  
std::isinf(src2[lane])) {

+vdst[lane] = -NAN;
+} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) {
+vdst[lane] = signOut ? -INFINITY : +INFINITY;
+} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) {
+vdst[lane] = signOut ? -0.0 : +0.0;
+} else if (exp2 - exp1 < -1075) {
+warn_once("fixup_f64 unimplemented case:"
+  "exp2 - ex1 < -1075");
+vdst[lane] = src0[lane];
+} else if (exp1 == 2047) {
+warn_once("fixup_f64 unimplemented case:"
+  "exp1 == 2047");
+vdst[lane] = src0[lane];
 } else {
-vdst[lane] = src2[lane] / src1[lane];
+vdst[lane] = ((uint64_t)signOut<<63) |
+((uint64_t)src0[lane] & 0x7fffULL);
 }
 }
 }
@@ -29077,8 +29089,37 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-vdst[lane] = src0[lane];
+int exp1, exp2;
+std::frexp(src1[lane],&exp1);
+std::frexp(src2[lane],&exp2);
 vcc.setBit(lane, 0);
+if (src2[lane] == 0 || src1[lane] == 0) {
+vdst[lane] = NAN;
+} else if (exp2 - exp1 >= 768) {
+vcc.setBit(lane, 1);
+if (src0[lane] == src1[lane]) {
+vdst[lane] = std::ldexp(src0[lane],128);
+}
+} else if (exp1 == 0) {
+vdst[lane] = std::ldexp(src0[lane],128);
+} else if (exp1 >= 0x7fd && exp2 - exp1 <= -768) {
+vcc.setBit(lane, 1);
+if (src0[lane] == src1[lane]) {
+vdst[lane] = std::ldexp(src0[lane],-128);
+}
+} else if (exp1 >= 0x7fd) {
+vdst[lane] = std::ldexp(src0[lane],-128);
+} else if (exp2 - exp1 <= -768) {
+vcc.setBit(lane, 1);
+if (src0[lane] != src2[lane]) {
+vdst[lane] = std::ldexp(src0[lane],128);
+}
+} else if (exp2 <= 53) {
+vdst[lane] = std::ldexp(src0[lane],128);
+}
+else {
+vdst[lane] = src0[lane];
+}
 }
 }

@@ -29171,10 +29212,12 @@
 ConstVecOperandF64 src1(gpuDynInst, ext

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: ds_read_u8 and ds_read_u16 fix

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29951 )


Change subject: arch-gcn3: ds_read_u8 and ds_read_u16 fix
..

arch-gcn3: ds_read_u8 and ds_read_u16 fix

This changeset zero extends the destination register
for ds_read_u8 and ds_read_u16 instructions.

Change-Id: I193adadd68adf2572b59743b1504f18ad225f506
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29951
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 4 insertions(+), 4 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 8b72e0d..6e5ff42 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32016,11 +32016,11 @@
 void
 Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst)
 {
-VecOperandU8 vdst(gpuDynInst, extData.VDST);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (gpuDynInst->exec_mask[lane]) {
-vdst[lane] = (reinterpret_cast(
+vdst[lane] = (VecElemU32)(reinterpret_cast(
 gpuDynInst->d_data))[lane];
 }
 }
@@ -32096,11 +32096,11 @@
 void
 Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst)
 {
-VecOperandU16 vdst(gpuDynInst, extData.VDST);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (gpuDynInst->exec_mask[lane]) {
-vdst[lane] = (reinterpret_cast(
+vdst[lane] = (VecElemU32)(reinterpret_cast(
 gpuDynInst->d_data))[lane];
 }
 }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29951
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I193adadd68adf2572b59743b1504f18ad225f506
Gerrit-Change-Number: 29951
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Alexandru Duțu 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add ds_bpermute and ds_permute insts

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29952 )


Change subject: arch-gcn3: Add ds_bpermute and ds_permute insts
..

arch-gcn3: Add ds_bpermute and ds_permute insts

The implementation of these insts provided by this
change is based on the description provided here:

https://gpuopen.com/amd-gcn-assembly-cross-lane-operations/

Change-Id: Id63b6c34c9fdc6e0dbd445d859e7b209023f2874
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29952
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 113 insertions(+), 4 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 6e5ff42..e93278a 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32129,6 +32129,13 @@
 Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_permute_b32")
 {
+setFlag(MemoryRef);
+/**
+ * While this operation doesn't actually use DS storage we classify
+ * it as a load here because it does a writeback to a VGPR, which
+ * fits in better with the LDS pipeline logic.
+ */
+ setFlag(Load);
 } // Inst_DS__DS_PERMUTE_B32

 Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32()
@@ -32139,12 +32146,66 @@
 void
 Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+addr.read();
+data.read();
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+/**
+ * One of the offset fields can be used for the index.
+ * It is assumed OFFSET0 would be used, as OFFSET1 is
+ * typically only used for DS ops that operate on two
+ * disparate pieces of data.
+ */
+assert(!instData.OFFSET1);
+/**
+ * The address provided is a byte address, but VGPRs are
+ * 4 bytes, so we must divide by 4 to get the actual VGPR
+ * index. Additionally, the index is calculated modulo the
+ * WF size, 64 in this case, so we simply extract bits 7-2.
+ */
+int index = bits(addr[lane] + instData.OFFSET0, 7, 2);
+panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is  
out "

+ "of bounds.\n", gpuDynInst->disassemble(), index);
+/**
+ * If the shuffled index corresponds to a lane that is
+ * inactive then this instruction writes a 0 to the active
+ * lane in VDST.
+ */
+if (wf->execMask(index)) {
+vdst[index] = data[lane];
+} else {
+vdst[index] = 0;
+}
+}
+}
+
+vdst.write();
+
+wf->rdLmReqsInPipe--;
+wf->validateRequestCounters();
+} // execute
+// --- Inst_DS__DS_BPERMUTE_B32 class methods ---

 Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_bpermute_b32")
 {
+setFlag(MemoryRef);
+/**
+ * While this operation doesn't actually use DS storage we classify
+ * it as a load here because it does a writeback to a VGPR, which
+ * fits in better with the LDS pipeline logic.
+ */
+setFlag(Load);
 } // Inst_DS__DS_BPERMUTE_B32

 Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32()
@@ -32155,8 +32216,56 @@
 void
 Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.V

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: convert vALU instruction counters from 32 to 64-bit

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29950 )


Change subject: arch-gcn3: convert vALU instruction counters from 32 to  
64-bit

..

arch-gcn3: convert vALU instruction counters from 32 to 64-bit

The vALU instruction counters were previously 32 bits, but for some
workloads this value wraps around and triggers an assert failure
because the max vALU operations are reached.  To resolve this, this
commit increases the counter size to 64 bits.

Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29950
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/gpu-compute/shader.hh
1 file changed, 2 insertions(+), 2 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 238f6e0..3e2e569 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -258,8 +258,8 @@
 Stats::Vector vectorInstDstOperand;
 void regStats();

-int max_valu_insts;
-int total_valu_insts;
+int64_t max_valu_insts;
+int64_t total_valu_insts;

 Shader(const Params *p);
 ~Shader();

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29950
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb
Gerrit-Change-Number: 29950
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29948 )


Change subject: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo
..

arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo

Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29948
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 60 insertions(+), 4 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 79e7dda..6ffd049 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -30309,8 +30309,36 @@
 void
 Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+VecOperandU32 vdst(gpuDynInst, instData.VDST);
+uint64_t threadMask = 0;
+
+src0.readSrc();
+src1.readSrc();
+
+/**
+ * input modifiers are supported by FP operations only
+ */
+assert(!(instData.ABS & 0x1));
+assert(!(instData.ABS & 0x2));
+assert(!(instData.ABS & 0x4));
+assert(!(extData.NEG & 0x1));
+assert(!(extData.NEG & 0x2));
+assert(!(extData.NEG & 0x4));
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+threadMask = ((1LL << lane) - 1LL);
+vdst[lane] = popCount(src0[lane] & bits(threadMask, 31,  
0)) +

+ src1[lane];
+}
+}
+
+vdst.write();
+} // execute
+// --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---

 Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32(
   InFmt_VOP3 *iFmt)
@@ -30330,8 +30358,36 @@
 void
 Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+VecOperandU32 vdst(gpuDynInst, instData.VDST);
+uint64_t threadMask = 0;
+
+src0.readSrc();
+src1.readSrc();
+
+/**
+ * input modifiers are supported by FP operations only
+ */
+assert(!(instData.ABS & 0x1));
+assert(!(instData.ABS & 0x2));
+assert(!(instData.ABS & 0x4));
+assert(!(extData.NEG & 0x1));
+assert(!(extData.NEG & 0x2));
+assert(!(extData.NEG & 0x4));
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+threadMask = ((1LL << lane) - 1LL);
+vdst[lane] = popCount(src0[lane] & bits(threadMask, 63,  
32)) +

+ src1[lane];
+}
+}
+
+vdst.write();
+} // execute
+// --- Inst_VOP3__V_LSHLREV_B64 class methods ---

 Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3 *iFmt)
 : Inst_VOP3(iFmt, "v_lshlrev_b64", false)

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29948
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5
Gerrit-Change-Number: 29948
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: refactor barriers

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29943 )


Change subject: gpu-compute, arch-gcn3: refactor barriers
..

gpu-compute, arch-gcn3: refactor barriers

Barriers were not modeled properly. Firstly, barriers were
allocated to each WG that was launched, which is not
correct, and the CU would provide an infinite number
of barrier slots. There are a limited number of barrier slots
per CU in reality. In addition, the CU will not allocate
barrier slots to WGs with a single WF (nothing to sync if
only one WF).

Beyond modeling problems, there is also the issue of deadlock.
The barrier could deadlock because not all WFs are freed
from the barrier once it has been satisfied. Instead, we
relied on the scoreboard stage to release them lazily,
one-by-one.

Under this implementation the scoreboard may not fully release
all WFs participating in a barrier; this happens because the
first WF to be freed from the barrier could reach an s_barrier
instruction again, forever causing the barrier counts across
WFs to be out-of-sync.

This change refactors the barrier logic to:

1) Create a proper barrier slot implementation

2) Enforce (via a parameter) the number of barrier
   slots on the CU.

3) Simplify the logic and clean up the code (i.e., we
   no longer iterate through the entire WF list each
   time we check if a barrier is satisfied).

4) Fix deadlock issues.

Change-Id: If53955b54931886baaae322640a7b9da7a1595e0
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29943
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
M src/gpu-compute/GPU.py
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/scoreboard_check_stage.cc
M src/gpu-compute/shader.cc
M src/gpu-compute/wavefront.cc
M src/gpu-compute/wavefront.hh
8 files changed, 386 insertions(+), 101 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 607e3c6..817b339 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -39,6 +39,7 @@

 #include "arch/gcn3/insts/inst_util.hh"
 #include "debug/GCN3.hh"
+#include "debug/GPUSync.hh"
 #include "gpu-compute/shader.hh"

 namespace Gcn3ISA
@@ -3709,6 +3710,7 @@
 Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
 {
 Wavefront *wf = gpuDynInst->wavefront();
+ComputeUnit *cu = gpuDynInst->computeUnit();

 // delete extra instructions fetched for completed work-items
 wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
@@ -3725,6 +3727,25 @@
 int refCount = wf->computeUnit->getLds()
 .decreaseRefCounter(wf->dispatchId, wf->wgId);

+/**
+ * The parent WF of this instruction is exiting, therefore
+ * it should not participate in this barrier any longer. This
+ * prevents possible deadlock issues if WFs exit early.
+ */
+int bar_id = WFBarrier::InvalidID;
+if (wf->hasBarrier()) {
+assert(wf->getStatus() != Wavefront::S_BARRIER);
+bar_id = wf->barrierId();
+assert(bar_id != WFBarrier::InvalidID);
+wf->releaseBarrier();
+cu->decMaxBarrierCnt(bar_id);
+DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
+"program and decrementing max barrier count for "
+"barrier Id%d. New max count: %d.\n", cu->cu_id,
+wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
+cu->maxBarrierCnt(bar_id));
+}
+
 DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
 wf->computeUnit->cu_id, wf->wgId, refCount);

@@ -3748,6 +3769,20 @@
 wf->lastInstExec = 0;

 if (!refCount) {
+/**
+ * If all WFs have finished, and hence the WG has finished,
+ * then we can free up the barrier belonging to the parent
+ * WG, but only if we actually used a barrier (i.e., more
+ * than one WF in the WG).
+ */
+if (bar_id != WFBarrier::InvalidID) {
+DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves  
are "
+"now complete. Releasing barrier Id%d.\n",  
cu->cu_id,

+wf->simdId, wf->wfSlotId, wf->wfDynId,
+wf->barrierId());
+cu->releaseBarrier(bar_id);
+}
+
/**
  * Last wavefront of the workgroup has executed return. If the
  * workgroup is not the final one in the kernel, then simply
@@ -4027,12 +4062,21 @@
 Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuD

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support of 64-bit SOPK instruction

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29942 )


Change subject: arch-gcn3: add support of 64-bit SOPK instruction
..

arch-gcn3: add support of 64-bit SOPK instruction

s_setreg_imm32_b32 is a 64-bit instruction, using a 32-bit literal
constant. Related functions are added to support decoding the second
dword.

Change-Id: I290f8578f726885c137dbfac3773035f814e0a3a
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29942
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Xianwei Zhang 
---
M src/arch/gcn3/insts/op_encodings.cc
M src/arch/gcn3/insts/op_encodings.hh
2 files changed, 43 insertions(+), 4 deletions(-)

Approvals:
  Xianwei Zhang: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/op_encodings.cc  
b/src/arch/gcn3/insts/op_encodings.cc

index fe501f2..22d0f48 100644
--- a/src/arch/gcn3/insts/op_encodings.cc
+++ b/src/arch/gcn3/insts/op_encodings.cc
@@ -160,6 +160,14 @@

 // copy first instruction DWORD
 instData = iFmt[0];
+if (hasSecondDword(iFmt)) {
+// copy second instruction DWORD into union
+extData = ((MachInst)iFmt)[1];
+_srcLiteral = *reinterpret_cast(&iFmt[1]);
+varSize = 4 + 4;
+} else {
+varSize = 4;
+} // if
 } // Inst_SOPK

 Inst_SOPK::~Inst_SOPK()
@@ -169,18 +177,43 @@
 int
 Inst_SOPK::instSize() const
 {
-return 4;
+return varSize;
 } // instSize

+bool
+Inst_SOPK::hasSecondDword(InFmt_SOPK *iFmt)
+{
+/*
+  SOPK can be a 64-bit instruction, i.e., have a second dword:
+  S_SETREG_IMM32_B32 writes some or all of the LSBs of a 32-bit
+  literal constant into a hardware register;
+  the way to detect such special case is to explicitly check the
+  opcode (20/0x14)
+*/
+if (iFmt->OP == 0x14)
+return true;
+
+return false;
+}
+
+
 void
 Inst_SOPK::generateDisassembly()
 {
 std::stringstream dis_stream;
 dis_stream << _opcode << " ";
-dis_stream << opSelectorToRegSym(instData.SDST) << ", ";

-dis_stream << "0x" << std::hex << std::setfill('0') <<  
std::setw(4)

-   << instData.SIMM16;
+// S_SETREG_IMM32_B32 is a 64-bit instruction, using a
+// 32-bit literal constant
+if (instData.OP == 0x14) {
+dis_stream << "0x" << std::hex << std::setfill('0')
+<< std::setw(8) << extData.imm_u32 << ", ";
+} else {
+dis_stream << opSelectorToRegSym(instData.SDST) << ", ";
+}
+
+dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(4)
+ << instData.SIMM16;

 disassembly = dis_stream.str();
 }
diff --git a/src/arch/gcn3/insts/op_encodings.hh  
b/src/arch/gcn3/insts/op_encodings.hh

index 22c146a..4f151b9 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -87,6 +87,12 @@
   protected:
 // first instruction DWORD
 InFmt_SOPK instData;
+// possible second DWORD
+InstFormat extData;
+uint32_t varSize;
+
+  private:
+bool hasSecondDword(InFmt_SOPK *);
 }; // Inst_SOPK

 class Inst_SOP1 : public GCN3GPUStaticInst

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29942
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I290f8578f726885c137dbfac3773035f814e0a3a
Gerrit-Change-Number: 29942
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement multi-dword buffer loads and stores

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29946 )


Change subject: arch-gcn3: implement multi-dword buffer loads and stores
..

arch-gcn3: implement multi-dword buffer loads and stores

Add support for all multi-dword buffer loads and stores:
buffer_load_dword x2, x3, and x4 and buffer_store_dword x2, x3, and x4

Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29946
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
2 files changed, 504 insertions(+), 18 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 817b339..b852281 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34777,7 +34777,11 @@
 {
 setFlag(MemoryRef);
 setFlag(Load);
-setFlag(GlobalSegment);
+if (instData.LDS) {
+setFlag(GroupSegment);
+} else {
+setFlag(GlobalSegment);
+}
 } // Inst_MUBUF__BUFFER_LOAD_DWORDX2

 Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2()
@@ -34788,17 +34792,88 @@
 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->exec_mask = wf->execMask();
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+rsrcDesc.read();
+offset.read();
+
+int inst_offset = instData.OFFSET;
+
+if (!instData.IDXEN && !instData.OFFEN) {
+calcAddr(gpuDynInst,
+addr0, addr1, rsrcDesc, offset, inst_offset);
+} else if (!instData.IDXEN && instData.OFFEN) {
+addr0.read();
+calcAddr(gpuDynInst,
+addr0, addr1, rsrcDesc, offset, inst_offset);
+} else if (instData.IDXEN && !instData.OFFEN) {
+addr0.read();
+calcAddr(gpuDynInst,
+addr1, addr0, rsrcDesc, offset, inst_offset);
+} else {
+addr0.read();
+addr1.read();
+calcAddr(gpuDynInst,
+addr1, addr0, rsrcDesc, offset, inst_offset);
+}
+
+if (isLocalMem()) {
+gpuDynInst->computeUnit()->localMemoryPipe
+.issueRequest(gpuDynInst);
+wf->rdLmReqsInPipe--;
+wf->outstandingReqsRdLm++;
+} else {
+gpuDynInst->computeUnit()->globalMemoryPipe
+.issueRequest(gpuDynInst);
+wf->rdGmReqsInPipe--;
+wf->outstandingReqsRdGm++;
+}
+
+wf->outstandingReqs++;
+wf->validateRequestCounters();
+} // execute

 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
+initMemRead<2>(gpuDynInst);
 } // initiateAcc

 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
 {
+VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+if (!oobMask[lane]) {
+vdst0[lane] = (reinterpret_cast(
+gpuDynInst->d_data))[lane * 2];
+vdst1[lane] = (reinterpret_cast(
+gpuDynInst->d_data))[lane * 2 + 1];
+} else {
+vdst0[lane] = 0;
+vdst1[lane] = 0;
+}
+}
+}
+
+vdst0.write();
+vdst1.write();
 } // completeAcc

 Inst_MUBUF__BUFFER_LOAD_DWORDX3
@@ -34807,7 +34882,11 @@
 {
 setFlag(MemoryRef);
 setFlag(Load);
-setFlag(GlobalSegment);
+if (instData.LDS) {
+setFlag(GroupSegment);
+} else {
+setFlag(GlobalSegment);
+}
 } // Inst_MUBUF__BUFFER_LOAD_DWORDX3

 Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3()
@@ -34818,17 +34897,93 @@
 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
 {
-panic

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add support for rd/wr EXEC_HI to operand class

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29944 )


Change subject: arch-gcn3: Add support for rd/wr EXEC_HI to operand class
..

arch-gcn3: Add support for rd/wr EXEC_HI to operand class

Change-Id: Ib22dd604f88ea56801964235082835002deffca1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29944
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/operand.hh
1 file changed, 35 insertions(+), 1 deletion(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh
index 7f70fab..960d05e 100644
--- a/src/arch/gcn3/operand.hh
+++ b/src/arch/gcn3/operand.hh
@@ -435,13 +435,30 @@

 if (!isScalarReg(_opIdx)) {
 if (_opIdx == REG_EXEC_LO) {
-uint64_t new_exec_mask_val(0);
+ScalarRegU64 new_exec_mask_val(0);
 std::memcpy((void*)&new_exec_mask_val,
 (void*)srfData.data(), sizeof(new_exec_mask_val));
 VectorMask new_exec_mask(new_exec_mask_val);
 wf->execMask() = new_exec_mask;
 DPRINTF(GPUSRF, "Write EXEC\n");
 DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
+} else if (_opIdx == REG_EXEC_HI) {
+/**
+ * If we're writing only the upper half of the EXEC  
mask

+ * this ought to be a single dword operand.
+ */
+assert(NumDwords == 1);
+ScalarRegU32 new_exec_mask_hi_val(0);
+ScalarRegU64 new_exec_mask_val
+= wf->execMask().to_ullong();
+std::memcpy((void*)&new_exec_mask_hi_val,
+(void*)srfData.data(),  
sizeof(new_exec_mask_hi_val));

+replaceBits(new_exec_mask_val, 63, 32,
+new_exec_mask_hi_val);
+VectorMask new_exec_mask(new_exec_mask_val);
+wf->execMask() = new_exec_mask;
+DPRINTF(GPUSRF, "Write EXEC\n");
+DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
 } else {
 _gpuDynInst->writeMiscReg(_opIdx, srfData[0]);
 }
@@ -505,6 +522,23 @@
 DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
 }
 break;
+  case REG_EXEC_HI:
+{
+/**
+ * If we're reading only the upper half of the EXEC  
mask

+ * this ought to be a single dword operand.
+ */
+assert(NumDwords == 1);
+ScalarRegU64 exec_mask = _gpuDynInst->wavefront()
+->execMask().to_ullong();
+
+ScalarRegU32 exec_mask_hi = bits(exec_mask, 63, 32);
+std::memcpy((void*)srfData.data(),  
(void*)&exec_mask_hi,

+sizeof(srfData));
+DPRINTF(GPUSRF, "Read EXEC_HI\n");
+DPRINTF(GPUSRF, "EXEC_HI = %#x\n", exec_mask_hi);
+}
+break;
   case REG_SRC_SWDA:
   case REG_SRC_DPP:
   case REG_SRC_LITERAL:

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29944
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ib22dd604f88ea56801964235082835002deffca1
Gerrit-Change-Number: 29944
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fix LDS out-of-bounds behavior

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29940 )


Change subject: gpu-compute: Fix LDS out-of-bounds behavior
..

gpu-compute: Fix LDS out-of-bounds behavior

The LDS is capable of handling out-of-bounds accesses,
that is, accesses that are outside the bounds of the
chunk allocated to a WG. Currently, the simulator asserts
on these accesses. This patch changes the behavior of the
LDS to return 0 for reads and to drop writes that are
out-of-bounds.

Change-Id: I5f467d0f52113e8565e1a3029e82fb89cc6f07ea
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29940
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/gpu-compute/lds_state.hh
1 file changed, 16 insertions(+), 6 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh
index 58171e3..d793f0f 100644
--- a/src/gpu-compute/lds_state.hh
+++ b/src/gpu-compute/lds_state.hh
@@ -69,9 +69,14 @@
 T
 read(const uint32_t index)
 {
-fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
-fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
-"chunk");
+/**
+ * For reads that are outside the bounds of the LDS
+ * chunk allocated to this WG we return 0.
+ */
+if (index >= chunk.size()) {
+return (T)0;
+}
+
 T *p0 = (T *) (&(chunk.at(index)));
 return *p0;
 }
@@ -83,9 +88,14 @@
 void
 write(const uint32_t index, const T value)
 {
-fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
-fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
-"chunk");
+/**
+ * Writes that are outside the bounds of the LDS
+ * chunk allocated to this WG are dropped.
+ */
+if (index >= chunk.size()) {
+return;
+}
+
 T *p0 = (T *) (&(chunk.at(index)));
 *p0 = value;
 }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29940
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I5f467d0f52113e8565e1a3029e82fb89cc6f07ea
Gerrit-Change-Number: 29940
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: ensure that atomics follow HSA conventions

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29941 )


Change subject: arch-gcn3: ensure that atomics follow HSA conventions
..

arch-gcn3: ensure that atomics follow HSA conventions

Add asserts to make sure atomics follow the HSA conventions:
atomics must be word aligned (i.e., can't be byte aligned)
and must not be misaligned such that a given lane's access
spans multiple cache lines.

Change-Id: Ia48758b9ed96764864234dc607f337e30e287d1c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29941
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/gpu_mem_helpers.hh
1 file changed, 6 insertions(+), 0 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/gpu_mem_helpers.hh  
b/src/arch/gcn3/gpu_mem_helpers.hh

index 40ca565..562158d 100644
--- a/src/arch/gcn3/gpu_mem_helpers.hh
+++ b/src/arch/gcn3/gpu_mem_helpers.hh
@@ -80,6 +80,12 @@
 misaligned_acc = split_addr > vaddr;

 if (is_atomic) {
+// make sure request is word aligned
+assert((vaddr & 0x3) == 0);
+
+// a given lane's atomic can't cross cache lines
+assert(!misaligned_acc);
+
 req = std::make_shared(vaddr, sizeof(T), 0,
 gpuDynInst->computeUnit()->masterId(), 0,
 gpuDynInst->wfDynId,

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29941
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ia48758b9ed96764864234dc607f337e30e287d1c
Gerrit-Change-Number: 29941
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement instruction s_setreg_b32

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29949 )


Change subject: arch-gcn3: implement instruction s_setreg_b32
..

arch-gcn3: implement instruction s_setreg_b32

Instruction s_setreg_b32 was unimplemented, but is used by hipified
rodinia 'srad'. The instruction sets values of hardware internal
registers. If the instruction is writing into MODE to control
single-precision FP round and denorm modes, a warning is
printed; in all other cases (non-MODE hw register or other
precisions), the simulator panics.

Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29949
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Xianwei Zhang 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 27 insertions(+), 0 deletions(-)

Approvals:
  Xianwei Zhang: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 6ffd049..8b72e0d 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -1800,6 +1800,7 @@
 Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt)
 : Inst_SOPK(iFmt, "s_setreg_b32")
 {
+setFlag(ALU);
 } // Inst_SOPK__S_SETREG_B32

 Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32()
@@ -1813,6 +1814,32 @@
 void
 Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst)
 {
+ScalarRegI16 simm16 = instData.SIMM16;
+ScalarRegU32 hwregId = simm16 & 0x3f;
+ScalarRegU32 offset = (simm16 >> 6) & 31;
+ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;
+
+ScalarOperandU32 hwreg(gpuDynInst, hwregId);
+ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+hwreg.read();
+sdst.read();
+
+// Store value from SDST to part of the hardware register.
+ScalarRegU32 mask = (((1U << size) - 1U) << offset);
+hwreg = ((hwreg.rawData() & ~mask)
+| ((sdst.rawData() << offset) & mask));
+hwreg.write();
+
+// set MODE register to control the behavior of single precision
+// floating-point numbers: denormal mode or round mode
+if (hwregId==1 && size==2
+&& (offset==4 || offset==0)) {
+warn_once("Be cautious that s_setreg_b32 has no real effect "
+"on FP modes: %s\n",  
gpuDynInst->disassemble());

+return;
+}
+
+// panic if not changing MODE of floating-point numbers
 panicUnimplemented();
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29949
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce
Gerrit-Change-Number: 29949
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Init CU object for pipe stages in their ctors

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29945 )


Change subject: gpu-compute: Init CU object for pipe stages in their ctors
..

gpu-compute: Init CU object for pipe stages in their ctors

This change updates the constructors of the CU's pipe
stages/memory pipelines to accept a pointer to their
parent CU. Because the CU creates these objects, and
can pass a pointer to itself to these objects via their
constructors, this is the safer way to initialize these
classes.

Change-Id: I0b3732ce7c03781ee15332dac7a21c097ad387a4
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29945
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/exec_stage.cc
M src/gpu-compute/exec_stage.hh
M src/gpu-compute/fetch_stage.cc
M src/gpu-compute/fetch_stage.hh
M src/gpu-compute/fetch_unit.cc
M src/gpu-compute/fetch_unit.hh
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/global_memory_pipeline.hh
M src/gpu-compute/local_memory_pipeline.cc
M src/gpu-compute/local_memory_pipeline.hh
M src/gpu-compute/scalar_memory_pipeline.cc
M src/gpu-compute/scalar_memory_pipeline.hh
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/schedule_stage.hh
M src/gpu-compute/scoreboard_check_stage.cc
M src/gpu-compute/scoreboard_check_stage.hh
17 files changed, 65 insertions(+), 80 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index 0fcbb1a..f3387a7 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -66,9 +66,14 @@
 numScalarALUs(p->num_scalar_cores),
 vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
 coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
-registerManager(p->register_manager), fetchStage(p),
-scoreboardCheckStage(p), scheduleStage(p, this), execStage(p),
-globalMemoryPipe(p), localMemoryPipe(p), scalarMemoryPipe(p),
+registerManager(p->register_manager),
+fetchStage(p, this),
+scoreboardCheckStage(p, this),
+scheduleStage(p, this),
+execStage(p, this),
+globalMemoryPipe(p, this),
+localMemoryPipe(p, this),
+scalarMemoryPipe(p, this),
 tickEvent([this]{ exec(); }, "Compute unit tick event",
   false, Event::CPU_Tick_Pri),
 cu_id(p->cu_id),
@@ -788,13 +793,11 @@
 dispatchList.push_back(std::make_pair(nullptr, EMPTY));
 }

-fetchStage.init(this);
-scoreboardCheckStage.init(this);
-scheduleStage.init(this);
-execStage.init(this);
-globalMemoryPipe.init(this);
-localMemoryPipe.init(this);
-scalarMemoryPipe.init(this);
+fetchStage.init();
+scoreboardCheckStage.init();
+scheduleStage.init();
+execStage.init();
+globalMemoryPipe.init();

 gmTokenPort.setTokenManager(memPortTokens);
 }
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc
index 2dece18..e420579 100644
--- a/src/gpu-compute/exec_stage.cc
+++ b/src/gpu-compute/exec_stage.cc
@@ -41,19 +41,19 @@
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"

-ExecStage::ExecStage(const ComputeUnitParams *p) :  
lastTimeInstExecuted(false),

-thisTimeInstExecuted(false), instrExecuted (false),
-executionResourcesUsed(0)
+ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
+: computeUnit(cu), lastTimeInstExecuted(false),
+  thisTimeInstExecuted(false), instrExecuted (false),
+  executionResourcesUsed(0), _name(cu->name() + ".ExecStage")
+
 {
 numTransActiveIdle = 0;
 idle_dur = 0;
 }

 void
-ExecStage::init(ComputeUnit *cu)
+ExecStage::init()
 {
-computeUnit = cu;
-_name = computeUnit->name() + ".ExecStage";
 dispatchList = &computeUnit->dispatchList;
 idle_dur = 0;
 }
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh
index 670252c..f984d72 100644
--- a/src/gpu-compute/exec_stage.hh
+++ b/src/gpu-compute/exec_stage.hh
@@ -69,9 +69,9 @@
 class ExecStage
 {
   public:
-ExecStage(const ComputeUnitParams* params);
+ExecStage(const ComputeUnitParams* p, ComputeUnit *cu);
 ~ExecStage() { }
-void init(ComputeUnit *cu);
+void init();
 void exec();

 std::string dispStatusToStr(int j);
diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc
index cf0b39e..b9df6ce 100644
--- a/src/gpu-compute/fetch_stage.cc
+++ b/src/gpu-compute/fetch_stage.cc
@@ -36,11 +36,12 @@
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/wavefront.hh"

-FetchStage::FetchStage(const ComputeUnitParams* p) :
-numVectorALUs(p->num_SIMDs), computeUnit(nullptr)
+FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit *cu)
+: numVectorALUs(p->num_SIMD

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fix bug with DPP support

2020-07-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29947 )


Change subject: arch-gcn3: fix bug with DPP support
..

arch-gcn3: fix bug with DPP support

Instructions that use the DPP field need to use the extra SRC0
register associated with the DPP instruction instead of the
"default" SRC0 register, since the default SRC0 register contains
the DPP information when DPP is being used.  This commit fixes
2735c3bb88 to take this into account.  Additionally, this commit
removes the write of the src register from the DPP helper functions,
to avoid overwriting any changes made to the destination register.
Finally, this change modifies the instructions that use DPP to
simplify the flow through the execute() functions.

Change-Id: I80fd0af1f131f287f18ff73b3c1c9122d8c60823
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29947
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/inst_util.hh
M src/arch/gcn3/insts/instructions.cc
2 files changed, 41 insertions(+), 20 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/inst_util.hh  
b/src/arch/gcn3/insts/inst_util.hh

index 433ccbe..b40e890 100644
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -505,7 +505,6 @@
 src0[lane] = 0;
 }

-src0.write();
 // reset for next iteration
 laneDisabled = false;
 }
diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index b852281..79e7dda 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -5296,8 +5296,12 @@
 VecOperandF32 src1(gpuDynInst, instData.VSRC1);
 VecOperandF32 vdst(gpuDynInst, instData.VDST);

+src0.readSrc();
+src1.read();
+
 if (isDPPInst()) {
 VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
+src0_dpp.read();

 DPRINTF(GCN3, "Handling V_ADD_F32 SRC DPP. SRC0: register  
v[%d], "

 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
@@ -5313,14 +5317,17 @@
 extData.iFmt_VOP_DPP.ROW_MASK);

 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
-}

-src0.readSrc();
-src1.read();
-
-for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-if (wf->execMask(lane)) {
-vdst[lane] = src0[lane] + src1[lane];
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+vdst[lane] = src0_dpp[lane] + src1[lane];
+}
+}
+} else {
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+vdst[lane] = src0[lane] + src1[lane];
+}
 }
 }

@@ -6164,6 +6171,7 @@

 if (isDPPInst()) {
 VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
+src0_dpp.read();

 DPRINTF(GCN3, "Handling V_MAC_F32 SRC DPP. SRC0: register  
v[%d], "

 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
@@ -6179,11 +6187,18 @@
 extData.iFmt_VOP_DPP.ROW_MASK);

 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
-}

-for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-if (wf->execMask(lane)) {
-vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+vdst[lane] = std::fma(src0_dpp[lane], src1[lane],
+  vdst[lane]);
+}
+}
+} else {
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+vdst[lane] = std::fma(src0[lane], src1[lane],  
vdst[lane]);

+}
 }
 }

@@ -7117,8 +7132,11 @@
 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
 VecOperandU32 vdst(gpuDynInst, instData.VDST);

+src.readSrc();
+
 if (isDPPInst()) {
-VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
+VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
+src_dpp.read();

 DPRINTF(GCN3, "Handling V_MOV_B32 SRC DPP. SRC0: register  
v[%d], "

 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
@@ -7137,14 +7155,18 @@
 // to negate it or take the absolute value of it
 assert(!e

[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: Add support for MemSync reqs in VIPER

2020-07-15 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29939 )


Change subject: mem-ruby: Add support for MemSync reqs in VIPER
..

mem-ruby: Add support for MemSync reqs in VIPER

Change-Id: Ib129e82be5348c641a8ae18093324bcedfb38abe
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29939
Reviewed-by: Jason Lowe-Power 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/mem/ruby/system/GPUCoalescer.cc
M src/mem/ruby/system/GPUCoalescer.hh
M src/mem/ruby/system/RubyPort.cc
3 files changed, 22 insertions(+), 21 deletions(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/mem/ruby/system/GPUCoalescer.cc  
b/src/mem/ruby/system/GPUCoalescer.cc

index d9793fa..80bc19a 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -553,24 +553,25 @@
 assert(pkt->req->hasInstSeqNum());

 if (pkt->cmd == MemCmd::MemSyncReq) {
-// let the child coalescer handle MemSyncReq because this is
-// cache coherence protocol specific
-return RequestStatus_Issued;
-}
-// otherwise, this must be either read or write command
-assert(pkt->isRead() || pkt->isWrite());
+// issue mem_sync requests immediately to the cache system without
+// going through uncoalescedTable like normal LD/ST/Atomic requests
+issueMemSyncRequest(pkt);
+} else {
+// otherwise, this must be either read or write command
+assert(pkt->isRead() || pkt->isWrite());

-// the pkt is temporarily stored in the uncoalesced table until
-// it's picked for coalescing process later in this cycle or in a
-// future cycle
-uncoalescedTable.insertPacket(pkt);
-DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
-pkt->getAddr());
+// the pkt is temporarily stored in the uncoalesced table until
+// it's picked for coalescing process later in this cycle or in a
+// future cycle
+uncoalescedTable.insertPacket(pkt);
+DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to  
uncoalescedTable\n",

+pkt->getAddr());

-// we schedule an issue event here to process the uncoalesced table
-// and try to issue Ruby request to cache system
-if (!issueEvent.scheduled()) {
-schedule(issueEvent, curTick());
+// we schedule an issue event here to process the uncoalesced table
+// and try to issue Ruby request to cache system
+if (!issueEvent.scheduled()) {
+schedule(issueEvent, curTick());
+}
 }

 // we always return RequestStatus_Issued in this coalescer
diff --git a/src/mem/ruby/system/GPUCoalescer.hh  
b/src/mem/ruby/system/GPUCoalescer.hh

index 74236cb..401f70b 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -367,7 +367,7 @@
 // since the two following issue functions are protocol-specific,
 // they must be implemented in a derived coalescer
 virtual void issueRequest(CoalescedRequest* crequest) = 0;
-//virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
+virtual void issueMemSyncRequest(PacketPtr pkt) {}

 void kernelCallback(int wavefront_id);

diff --git a/src/mem/ruby/system/RubyPort.cc  
b/src/mem/ruby/system/RubyPort.cc

index 0526e65..4510e3a 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -251,7 +251,7 @@
 }
 // Check for pio requests and directly send them to the dedicated
 // pio port.
-if (pkt->cmd != MemCmd::MemFenceReq) {
+if (pkt->cmd != MemCmd::MemSyncReq) {
 if (!isPhysMemAddress(pkt)) {
 assert(ruby_port->memMasterPort.isConnected());
 DPRINTF(RubyPort, "Request address %#x assumed to be a "
@@ -312,7 +312,7 @@

 // Check for pio requests and directly send them to the dedicated
 // pio port.
-if (pkt->cmd != MemCmd::MemFenceReq) {
+if (pkt->cmd != MemCmd::MemSyncReq) {
 if (!isPhysMemAddress(pkt)) {
 assert(ruby_port->memMasterPort.isConnected());
 DPRINTF(RubyPort, "Request address %#x assumed to be a "
@@ -539,7 +539,7 @@
 }

 // Flush, acquire, release requests don't access physical memory
-if (pkt->isFlush() || pkt->cmd == MemCmd::MemFenceReq) {
+if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) {
 accessPhysMem = false;
 }

@@ -649,4 +649,4 @@
 }
 }
 return num_written;
-}
\ No newline at end of file
+}

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29939
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ib129e82be5348c641a8ae18093324bcedfb38abe

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: enable kernel-end WB functionality

2020-07-13 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29938 )


Change subject: gpu-compute: enable kernel-end WB functionality
..

gpu-compute: enable kernel-end WB functionality

Change-Id: Ib17e1d700586d1aa04d408e7b924270f0de82efe
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29938
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Xianwei Zhang 
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/shader.cc
M src/mem/request.hh
3 files changed, 27 insertions(+), 18 deletions(-)

Approvals:
  Xianwei Zhang: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index b0616d6..178fd6e 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1218,23 +1218,25 @@

 schedule(mem_req_event, curTick() + req_tick_latency);
 } else {
-assert(gpuDynInst->isEndOfKernel());
+  // kernel end release must be enabled
+  assert(shader->impl_kern_end_rel);
+  assert(gpuDynInst->isEndOfKernel());

-req->setCacheCoherenceFlags(Request::RELEASE);
-req->setReqInstSeqNum(gpuDynInst->seqNum());
-req->setFlags(Request::KERNEL);
-pkt = new Packet(req, MemCmd::MemSyncReq);
-pkt->pushSenderState(
-   new ComputeUnit::DataPort::SenderState(gpuDynInst, 0,  
nullptr));

+  req->setCacheCoherenceFlags(Request::WB_L2);
+  req->setReqInstSeqNum(gpuDynInst->seqNum());
+  req->setFlags(Request::KERNEL);
+  pkt = new Packet(req, MemCmd::MemSyncReq);
+  pkt->pushSenderState(
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, 0,  
nullptr));


-EventFunctionWrapper *mem_req_event =
-  memPort[0]->createMemReqEvent(pkt);
+  EventFunctionWrapper *mem_req_event =
+memPort[0]->createMemReqEvent(pkt);

-DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x  
scheduling "

-"a release\n", cu_id, gpuDynInst->simdId,
-gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
+  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x  
scheduling "

+  "a release\n", cu_id, gpuDynInst->simdId,
+  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());

-schedule(mem_req_event, curTick() + req_tick_latency);
+  schedule(mem_req_event, curTick() + req_tick_latency);
 }
 } else {
 gpuDynInst->setRequestFlags(req);
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index f5e9444..59ce239 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -223,11 +223,11 @@
 // flush has never been started, performed only once at kernel end
 assert(_dispatcher.getOutstandingWbs(kernId) == 0);

-// iterate all cus, managed by the shader, to perform flush.
-for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
-_dispatcher.updateWbCounter(kernId, +1);
-cuList[i_cu]->doFlush(gpuDynInst);
-}
+// the first cu, managed by the shader, performs flush operation,
+// assuming that L2 cache is shared by all cus in the shader
+int i_cu = 0;
+_dispatcher.updateWbCounter(kernId, +1);
+cuList[i_cu]->doFlush(gpuDynInst);
 }

 bool
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 4e0ba97..718d5fa 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -225,6 +225,9 @@
  * See the AMD GCN3 ISA Architecture Manual for more
  * details.
  *
+ * INV_L1: L1 cache invalidation
+ * WB_L2: L2 cache writeback
+ *
  * SLC: System Level Coherent. Accesses are forced to miss in
  *  the L2 cache and are coherent with system memory.
  *
@@ -237,6 +240,10 @@
  * between atomic return/no-return operations.
  */
 enum : CacheCoherenceFlagsType {
+/** mem_sync_op flags */
+INV_L1  = 0x0001,
+WB_L2   = 0x0020,
+/** user-policy flags */
 /** user-policy flags */
 SLC_BIT = 0x0080,
 GLC_BIT = 0x0100,

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29938
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ib17e1d700586d1aa04d408e7b924270f0de82efe
Gerrit-Change-Number: 29938
Gerrit-PatchSet: 9
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implementation of flat atomic swap instruction

2020-07-13 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29937 )


Change subject: arch-gcn3: Implementation of flat atomic swap instruction
..

arch-gcn3: Implementation of flat atomic swap instruction

Change-Id: I9b9042899e65e8c9848b31c509eb2e3b13293e52
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29937
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/instructions.hh
2 files changed, 78 insertions(+), 4 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 2e39bf5..607e3c6 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -39231,8 +39231,80 @@
 void
 Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+
+if (wf->execMask().none()) {
+wf->wrGmReqsInPipe--;
+wf->rdGmReqsInPipe--;
+return;
+}
+
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->exec_mask = wf->execMask();
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+
+addr.read();
+
+calcAddr(gpuDynInst, addr);
+
+if (gpuDynInst->executedAs() == Enums::SC_GLOBAL ||
+gpuDynInst->executedAs() == Enums::SC_PRIVATE) {
+// TODO: additional address computation required for scratch
+panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE,
+ "Flats to private aperture not tested yet\n");
+gpuDynInst->computeUnit()->globalMemoryPipe.
+issueRequest(gpuDynInst);
+wf->wrGmReqsInPipe--;
+wf->outstandingReqsWrGm++;
+wf->rdGmReqsInPipe--;
+wf->outstandingReqsRdGm++;
+} else {
+fatal("Non global flat instructions not implemented yet.\n");
+}
+
+gpuDynInst->wavefront()->outstandingReqs++;
+gpuDynInst->wavefront()->validateRequestCounters();
+
+ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+data.read();
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+(reinterpret_cast(gpuDynInst->a_data))[lane]
+= data[lane];
+}
+}
+
+} // execute
+
+void
+Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst)
+{
+initAtomicAccess(gpuDynInst);
+} // initiateAcc
+
+void
+Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst)
+{
+if (isAtomicRet()) {
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+vdst[lane] = (reinterpret_cast(
+gpuDynInst->d_data))[lane];
+}
+}
+
+vdst.write();
+}
+} // completeAcc
+
+// --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods ---

 Inst_FLAT__FLAT_ATOMIC_CMPSWAP
 ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt)
diff --git a/src/arch/gcn3/insts/instructions.hh  
b/src/arch/gcn3/insts/instructions.hh

index ff0cfea..b0cc37e 100644
--- a/src/arch/gcn3/insts/instructions.hh
+++ b/src/arch/gcn3/insts/instructions.hh
@@ -79949,9 +79949,9 @@
   case 0: //vgpr_addr
 return 8;
   case 1: //vgpr_src
-return 32;
+return 4;
   case 2: //vgpr_dst
-return 32;
+return 4;
   default:
 fatal("op idx %i out of bounds\n", opIdx);
 return -1;
@@ -79991,6 +79991,8 @@
 } // isDstOperand

 void execute(GPUDynInstPtr) override;
+void initiateAcc(GPUDynInstPtr) override;
+void completeAcc(GPUDynInstPtr) override;
 }; // Inst_FLAT__FLAT_ATOMIC_SWAP

 class Inst_FLAT__FLAT_ATOMIC_CMPSWAP : public Inst_FLAT

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29937
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I9b9042899e65e8c9848b31c509eb2e3b13293e52
Gerrit-Change-Number: 29937
Gerrit-PatchSet: 9
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Alexandru Duțu 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Rev

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix VOP2 dissasembly prints

2020-07-13 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29936 )


Change subject: arch-gcn3: Fix VOP2 dissasembly prints
..

arch-gcn3: Fix VOP2 dissasembly prints

VOP2 prints VSRC1 register index as hex instead of decimal if the
instruction contains a literal operand.  This patch resets the
format specifiers in the stream to print the register correctly.

Change-Id: Icc7e6588b3c5af545be6590ce412460e72df253f
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29936
Tested-by: kokoro 
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
---
M src/arch/gcn3/insts/op_encodings.cc
1 file changed, 2 insertions(+), 1 deletion(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/op_encodings.cc  
b/src/arch/gcn3/insts/op_encodings.cc

index 2eb4042..fe501f2 100644
--- a/src/arch/gcn3/insts/op_encodings.cc
+++ b/src/arch/gcn3/insts/op_encodings.cc
@@ -763,7 +763,8 @@
<< extData.imm_u32 << ", ";
 }

-dis_stream << "v" << instData.VSRC1;
+dis_stream << std::resetiosflags(std::ios_base::basefield) << "v"
+<< instData.VSRC1;

 if (readsVCC())
 dis_stream << ", vcc";

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29936
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Icc7e6588b3c5af545be6590ce412460e72df253f
Gerrit-Change-Number: 29936
Gerrit-PatchSet: 9
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3, gpu-compute: Implement out-of-range accesses

2020-07-13 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29935 )


Change subject: arch-gcn3, gpu-compute: Implement out-of-range accesses
..

arch-gcn3, gpu-compute: Implement out-of-range accesses

Certain buffer out-of-range memory accesses should be special-cased
and not generate memory accesses. This patch implements
those special cases and suppresses lanes from accessing memory
when the calculated address falls in an ISA-specified out-of-range
condition.

Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29935
Tested-by: kokoro 
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
M src/gpu-compute/global_memory_pipeline.cc
3 files changed, 96 insertions(+), 6 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index b923eae..2e39bf5 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34453,8 +34453,12 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (gpuDynInst->exec_mask[lane]) {
-vdst[lane] = (VecElemU32)((reinterpret_cast(
-gpuDynInst->d_data))[lane]);
+if (!oobMask[lane]) {
+vdst[lane] =  
(VecElemU32)((reinterpret_cast(

+gpuDynInst->d_data))[lane]);
+} else {
+vdst[lane] = 0;
+}
 }
 }

@@ -34580,8 +34584,12 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (gpuDynInst->exec_mask[lane]) {
-vdst[lane] = (VecElemU32)((reinterpret_cast(
-gpuDynInst->d_data))[lane]);
+if (!oobMask[lane]) {
+vdst[lane] =  
(VecElemU32)((reinterpret_cast(

+gpuDynInst->d_data))[lane]);
+} else {
+vdst[lane] = 0;
+}
 }
 }

@@ -34707,8 +34715,12 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (gpuDynInst->exec_mask[lane]) {
-vdst[lane] = (reinterpret_cast(
-gpuDynInst->d_data))[lane];
+if (!oobMask[lane]) {
+vdst[lane] = (reinterpret_cast(
+gpuDynInst->d_data))[lane];
+} else {
+vdst[lane] = 0;
+}
 }
 }

diff --git a/src/arch/gcn3/insts/op_encodings.hh  
b/src/arch/gcn3/insts/op_encodings.hh

index 308560a..22c146a 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -40,6 +40,7 @@
 #include "arch/gcn3/gpu_mem_helpers.hh"
 #include "arch/gcn3/insts/gpu_static_inst.hh"
 #include "arch/gcn3/operand.hh"
+#include "debug/GCN3.hh"
 #include "debug/GPUExec.hh"
 #include "mem/ruby/system/RubySystem.hh"

@@ -489,14 +490,26 @@
 void
 initMemRead(GPUDynInstPtr gpuDynInst)
 {
+// temporarily modify exec_mask to supress memory accesses to  
oob
+// regions.  Only issue memory requests for lanes that have  
their

+// exec_mask set and are not out of bounds.
+VectorMask old_exec_mask = gpuDynInst->exec_mask;
+gpuDynInst->exec_mask &= ~oobMask;
 initMemReqHelper(gpuDynInst, MemCmd::ReadReq);
+gpuDynInst->exec_mask = old_exec_mask;
 }

 template
 void
 initMemWrite(GPUDynInstPtr gpuDynInst)
 {
+// temporarily modify exec_mask to supress memory accesses to  
oob
+// regions.  Only issue memory requests for lanes that have  
their

+// exec_mask set and are not out of bounds.
+VectorMask old_exec_mask = gpuDynInst->exec_mask;
+gpuDynInst->exec_mask &= ~oobMask;
 initMemReqHelper(gpuDynInst, MemCmd::WriteReq);
+gpuDynInst->exec_mask = old_exec_mask;
 }

 void
@@ -566,6 +579,42 @@

 buf_off = v_off[lane] + inst_offset;

+
+/**
+ * Range check behavior causes out of range accesses to
+ * to be treated differently. Out of range accesses  
return

+ * 0 for loads and are ignored for stores. For
+ * non-formatted accesses, this is done on a per-lane
+ * basis.
+ */
+if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleE

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix writelane src0,src1 usage

2020-07-13 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29933 )


Change subject: arch-gcn3: Fix writelane src0,src1 usage
..

arch-gcn3: Fix writelane src0,src1 usage

Src1 should only be used for lane select.  The data should come
from src0.

Change-Id: Ibe960df2e56d351a3819b40194104d2972a5cd4c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29933
Tested-by: kokoro 
Maintainer: Anthony Gutierrez 
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 0256d46..b923eae 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -30181,7 +30181,7 @@
 assert(!(extData.NEG & 0x2));
 assert(!(extData.NEG & 0x4));

-vdst[src1.rawData() & 0x3f] = src1.rawData();
+vdst[src1.rawData() & 0x3f] = src0.rawData();

 vdst.write();
 }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29933
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ibe960df2e56d351a3819b40194104d2972a5cd4c
Gerrit-Change-Number: 29933
Gerrit-PatchSet: 9
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Dropping fetchs when no entry is reserved in the buffer

2020-07-13 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29932 )


Change subject: gpu-compute: Dropping fetchs when no entry is reserved in the buffer
..

gpu-compute: Dropping fetchs when no entry is reserved in the buffer

This changeset drops fetches if there is no entry reserved in the
fetch buffer for that instruction. This can happen when a fetch is
attempted in the same cycle in which a branch instruction flushes
the fetch buffer while an ITLB or I-cache request is still pending.

Change-Id: I3b80dbd71af27ccf790b543bd5c034bb9b02624a
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29932
Tested-by: kokoro 
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Onur Kayıran 
Maintainer: Anthony Gutierrez 
---
M src/gpu-compute/fetch_unit.cc
M src/gpu-compute/fetch_unit.hh
2 files changed, 22 insertions(+), 0 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Onur Kayıran: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
index fb04cd2..447ff12 100644
--- a/src/gpu-compute/fetch_unit.cc
+++ b/src/gpu-compute/fetch_unit.cc
@@ -235,6 +235,16 @@
 delete oldPkt;

 /**
+ * if we have not reserved an entry in the fetch buffer,
+ * stop fetching. this can happen due to a branch instruction
+ * flushing the fetch buffer while an ITLB or I-cache request is still
+ * pending, in the same cycle another instruction is trying to fetch.
+ */
+if  
(!fetchBuf.at(wavefront->wfSlotId).isReserved(pkt->req->getVaddr())) {

+return;
+}
+
+/**
  * we should have reserved an entry in the fetch buffer
  * for this cache line. here we get the pointer to the
  * entry used to buffer this request's line data.
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh
index 2cfe3f0..798c264 100644
--- a/src/gpu-compute/fetch_unit.hh
+++ b/src/gpu-compute/fetch_unit.hh
@@ -120,6 +120,18 @@
 return reserved_pc->second;
 }

+/**
+ * returns true if there is an entry reserved for this address,
+ * and false otherwise
+ */
+bool
+isReserved(Addr vaddr) const
+{
+auto reserved_pc = reservedPCs.find(vaddr);
+bool is_reserved = (reserved_pc != reservedPCs.end());
+return is_reserved;
+}
+
 void fetchDone(Addr vaddr);

 /**

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29932
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I3b80dbd71af27ccf790b543bd5c034bb9b02624a
Gerrit-Change-Number: 29932
Gerrit-PatchSet: 9
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Onur Kayıran 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fix bits that SDWA selects

2020-07-13 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29931 )


Change subject: arch-gcn3: fix bits that SDWA selects
..

arch-gcn3: fix bits that SDWA selects

This commit fixes a bug in 200f2408 where the SDWA support was selecting
bits backwards.  As part of this commit, to help resolve this problem in
the future, I have added asserts in the helper functions in bitfield.hh
to ensure that the number of bits isn't negative.

Change-Id: I4b0ecb0e7c110600c0b5063101b75f9adcc512ac
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29931
Tested-by: kokoro 
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
---
M src/arch/gcn3/insts/inst_util.hh
M src/base/bitfield.hh
2 files changed, 37 insertions(+), 30 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/inst_util.hh  
b/src/arch/gcn3/insts/inst_util.hh

index 292e3ba..433ccbe 100644
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -551,7 +551,7 @@
  const SDWASelVals sel, const bool signExt)
 {
 // local variables
-int first_bit = 0, last_bit = 0;
+int low_bit = 0, high_bit = 0;
 bool signExt_local = signExt;
 T retVal = 0;

@@ -566,17 +566,19 @@
   of byte 0, or makes the bits of the selected byte be byte 0  
(and

   next either sign extends or zero's out upper bits).
 */
-first_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
-last_bit = first_bit + Gcn3ISA::MSB_PER_BYTE;
-retVal = bits(currOperVal, first_bit, last_bit);
+low_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
+high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
+retVal = bits(currOperVal, high_bit, low_bit);

 // make sure update propagated, since used next
-assert(bits(retVal, Gcn3ISA::MSB_PER_BYTE) ==
-   bits(origOperVal, (sel * Gcn3ISA::BITS_PER_BYTE) +
-Gcn3ISA::MSB_PER_BYTE));
+fatal_if(bits(retVal, Gcn3ISA::MSB_PER_BYTE) !=
+ bits(origOperVal, high_bit),
+ "ERROR: SDWA byte update not propagated: retVal: %d, "
+ "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE),
+ bits(origOperVal, high_bit));
 // sign extended value depends on upper-most bit of the new  
byte 0

 signExt_local = (signExt &&
- (bits(retVal, 0, Gcn3ISA::MSB_PER_BYTE) &  
0x80));
+ (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) &  
0x80));


 // process all other bytes -- if sign extending, make them 1,  
else

 // all 0's so leave as is
@@ -589,17 +591,20 @@
   of word 0, or makes the bits of the selected word be word 0  
(and

   next either sign extends or zero's out upper bits).
 */
-first_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
-last_bit = first_bit + Gcn3ISA::MSB_PER_WORD;
-retVal = bits(currOperVal, first_bit, last_bit);
+low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
+high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
+retVal = bits(currOperVal, high_bit, low_bit);

 // make sure update propagated, since used next
-assert(bits(retVal, Gcn3ISA::MSB_PER_WORD) ==
-   bits(origOperVal, ((sel & 1) * Gcn3ISA::BITS_PER_WORD) +
-Gcn3ISA::MSB_PER_WORD));
+fatal_if(bits(retVal, Gcn3ISA::MSB_PER_WORD) !=
+ bits(origOperVal, high_bit),
+ "ERROR: SDWA word update not propagated: retVal: %d, "
+ "orig: %d\n",
+ bits(retVal, Gcn3ISA::MSB_PER_WORD),
+ bits(origOperVal, high_bit));
 // sign extended value depends on upper-most bit of the new  
word 0

 signExt_local = (signExt &&
- (bits(retVal, 0, Gcn3ISA::MSB_PER_WORD) &
+ (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) &
   0x8000));

 // process other word -- if sign extending, make them 1, else  
all

@@ -659,7 +664,7 @@
  const SDWADstVals unusedBits_format)
 {
 // local variables
-int first_bit = 0, last_bit = 0;
+int low_bit = 0, high_bit = 0;
 bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
 //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
 bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
@@ -679,11 +684,11 @@

[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Initialized some variables

2020-07-10 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/31094 )


Change subject: arch-arm: Initialized some variables
..

arch-arm: Initialized some variables

Some of the variables in pauth_helpers.cc
are uninitialized in certain control paths
which causes a compiler warning. We initialize
these to false since they should be updated
to the correct value in all valid code paths.

Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/31094
Reviewed-by: Giacomo Travaglini 
Maintainer: Giacomo Travaglini 
Tested-by: kokoro 
---
M src/arch/arm/pauth_helpers.cc
1 file changed, 28 insertions(+), 28 deletions(-)

Approvals:
  Giacomo Travaglini: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/arm/pauth_helpers.cc b/src/arch/arm/pauth_helpers.cc
index c88795f..7424eb3 100644
--- a/src/arch/arm/pauth_helpers.cc
+++ b/src/arch/arm/pauth_helpers.cc
@@ -286,9 +286,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1);
@@ -354,9 +354,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1);
@@ -424,9 +424,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1);
@@ -498,9 +498,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1);
@@ -566,9 +566,9 @@
 Fault
 ArmISA::addPACDA(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out)
 {
-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1);
@@ -630,9 +630,9 @@
 Fault
 ArmISA::addPACDB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out)
 {
-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1);
@@ -691,8 +691,8 @@
 Fault
 ArmISA::addPACGA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t* out)
 {
-bool trapEL2;
-bool trapEL3;
+bool trapEL2 = false;
+bool trapEL3 = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APGAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APGAKeyLo_EL1);
@@ -738,9 +738,9 @@

 Fault
 ArmISA::addPACIA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t*  
out){

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1);
@@ -797,9 +797,9 @@

 Fault
 ArmISA::addPACIB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out){
-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2 = false;
+bool trapEL3 = false;
+bool enable = false;

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1);
@@ -859,8 +859,8 @@

 Fault
 ArmISA::stripPAC(ThreadContext* tc, uint64_t A, bool data, uint64_t* out){
-bool trapEL2;
-bool trapEL3;
+bool trapEL2 = false;
+bool trapEL3 = false;

 uint64_t ptr;


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/31094
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9
Gerrit-Change-Number: 31094
Gerrit-PatchSet: 3
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Ciro Santilli 
Gerrit-Reviewer: Giacomo Travaglini 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Tony Gutierrez 
Ger

[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Initialized some variables

2020-07-08 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/31094

to review the following change.


Change subject: arch-arm: Initialized some variables
..

arch-arm: Initialized some variables

Some of the variables in pauth_helpers.cc
are uninitialized in certain control paths
which causes a compiler warning. We initialize
these to false since they should be updated
to the correct value in all valid code paths.

Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9
---
M src/arch/arm/pauth_helpers.cc
1 file changed, 28 insertions(+), 28 deletions(-)



diff --git a/src/arch/arm/pauth_helpers.cc b/src/arch/arm/pauth_helpers.cc
index c88795f..e996fd5 100644
--- a/src/arch/arm/pauth_helpers.cc
+++ b/src/arch/arm/pauth_helpers.cc
@@ -286,9 +286,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1);
@@ -354,9 +354,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1);
@@ -424,9 +424,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1);
@@ -498,9 +498,9 @@
   using the same algorithm and key as AddPACDA().
 */

-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1);
@@ -566,9 +566,9 @@
 Fault
 ArmISA::addPACDA(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out)
 {
-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDAKeyLo_EL1);
@@ -630,9 +630,9 @@
 Fault
 ArmISA::addPACDB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out)
 {
-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APDBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APDBKeyLo_EL1);
@@ -691,8 +691,8 @@
 Fault
 ArmISA::addPACGA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t* out)
 {
-bool trapEL2;
-bool trapEL3;
+bool trapEL2(false);
+bool trapEL3(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APGAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APGAKeyLo_EL1);
@@ -738,9 +738,9 @@

 Fault
 ArmISA::addPACIA(ThreadContext * tc, uint64_t X, uint64_t Y, uint64_t* out){
-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIAKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIAKeyLo_EL1);
@@ -797,9 +797,9 @@

 Fault
 ArmISA::addPACIB(ThreadContext* tc, uint64_t X, uint64_t Y, uint64_t* out){
-bool trapEL2;
-bool trapEL3;
-bool enable;
+bool trapEL2(false);
+bool trapEL3(false);
+bool enable(false);

 uint64_t hi_key= tc->readMiscReg(MISCREG_APIBKeyHi_EL1);
 uint64_t lo_key= tc->readMiscReg(MISCREG_APIBKeyLo_EL1);
@@ -859,8 +859,8 @@

 Fault
 ArmISA::stripPAC(ThreadContext* tc, uint64_t A, bool data, uint64_t* out){
-bool trapEL2;
-bool trapEL3;
+bool trapEL2(false);
+bool trapEL3(false);

 uint64_t ptr;


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/31094
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: If34d7daaf2404c2cf014c7b4c0c2f979580f36b9
Gerrit-Change-Number: 31094
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: sim: Add M5_VAR_USED to var used in dprint

2020-07-06 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/30896 )


Change subject: sim: Add M5_VAR_USED to var used in dprint
..

sim: Add M5_VAR_USED to var used in dprint

Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30896
Reviewed-by: Jason Lowe-Power 
Reviewed-by: Andreas Sandberg 
Maintainer: Gabe Black 
Maintainer: Andreas Sandberg 
Tested-by: kokoro 
---
M src/sim/system.cc
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved
  Andreas Sandberg: Looks good to me, approved; Looks good to me, approved
  Gabe Black: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/sim/system.cc b/src/sim/system.cc
index 7057a97..7841ec0 100644
--- a/src/sim/system.cc
+++ b/src/sim/system.cc
@@ -179,7 +179,7 @@
 {
 auto &t = thread(id);
 #   if THE_ISA != NULL_ISA
-BaseCPU *cpu = t.context->getCpuPtr();
+BaseCPU M5_VAR_USED *cpu = t.context->getCpuPtr();
 DPRINTFS(Quiesce, cpu, "quiesce()\n");
 #   endif
 t.quiesce();

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30896
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1
Gerrit-Change-Number: 30896
Gerrit-PatchSet: 2
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Andreas Sandberg 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Ciro Santilli 
Gerrit-Reviewer: Gabe Black 
Gerrit-Reviewer: Giacomo Gabrielli 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Remove some unused vars from self_debug.hh

2020-07-06 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/30895 )


Change subject: arch-arm: Remove some unused vars from self_debug.hh
..

arch-arm: Remove some unused vars from self_debug.hh

Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30895
Reviewed-by: Jason Lowe-Power 
Reviewed-by: Andreas Sandberg 
Reviewed-by: Jordi Vaquero 
Maintainer: Andreas Sandberg 
Tested-by: kokoro 
---
M src/arch/arm/self_debug.hh
1 file changed, 1 insertion(+), 6 deletions(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved
  Andreas Sandberg: Looks good to me, but someone else must approve; Looks good to me, approved
  Jordi Vaquero: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/arm/self_debug.hh b/src/arch/arm/self_debug.hh
index 9739c77..67654d2 100644
--- a/src/arch/arm/self_debug.hh
+++ b/src/arch/arm/self_debug.hh
@@ -58,7 +58,6 @@
   private:
 MiscRegIndex ctrlRegIndex;
 MiscRegIndex valRegIndex;
-MiscRegIndex xRegIndex;
 SelfDebug * conf;
 bool isCntxtAware;
 bool VMID16enabled;
@@ -72,7 +71,7 @@
  MiscRegIndex _xIndex, SelfDebug* _conf, bool _ctxAw, bool lva,
  bool vmid16, bool aarch32):
 ctrlRegIndex(_ctrlIndex), valRegIndex(_valIndex),
-xRegIndex(_xIndex), conf(_conf), isCntxtAware(_ctxAw),
+conf(_conf), isCntxtAware(_ctxAw),
 VMID16enabled(vmid16), active_pc(0x0), enable(false)
 {
 maxAddrSize = lva ? 52: 48 ;
@@ -215,10 +214,6 @@
 bool prevSteppedLdx;
 bool cpsrD;

-bool ctrStepped;
-bool ctrActivate;
-
-
   public:
 SoftwareStep(SelfDebug* s): bSS(false), stateSS(INACTIVE_STATE),
 conf(s), steppedLdx(false) { }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30895
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94
Gerrit-Change-Number: 30895
Gerrit-PatchSet: 2
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Andreas Sandberg 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Ciro Santilli 
Gerrit-Reviewer: Giacomo Gabrielli 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Jordi Vaquero 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Add missing override to ARM faults

2020-07-06 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/30894 )


Change subject: arch-arm: Add missing override to ARM faults
..

arch-arm: Add missing override to ARM faults

Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30894
Reviewed-by: Jason Lowe-Power 
Reviewed-by: Andreas Sandberg 
Maintainer: Andreas Sandberg 
Tested-by: kokoro 
---
M src/arch/arm/faults.hh
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved
  Andreas Sandberg: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh
index a552757..7a2f69e 100644
--- a/src/arch/arm/faults.hh
+++ b/src/arch/arm/faults.hh
@@ -647,7 +647,7 @@
 bool routeToHyp(ThreadContext *tc) const override;
 uint32_t iss() const override;
 ExceptionClass ec(ThreadContext *tc) const override;
-void annotate(AnnotationIDs id, uint64_t val);
+void annotate(AnnotationIDs id, uint64_t val) override;
 };

 class SoftwareStepFault : public ArmFaultVals

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30894
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef
Gerrit-Change-Number: 30894
Gerrit-PatchSet: 2
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Andreas Sandberg 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Ciro Santilli 
Gerrit-Reviewer: Giacomo Gabrielli 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Add missing override to ARM faults

2020-07-02 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/30894

to review the following change.


Change subject: arch-arm: Add missing override to ARM faults
..

arch-arm: Add missing override to ARM faults

Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef
---
M src/arch/arm/faults.hh
1 file changed, 1 insertion(+), 1 deletion(-)



diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh
index 8a127ff..09513de 100644
--- a/src/arch/arm/faults.hh
+++ b/src/arch/arm/faults.hh
@@ -635,7 +635,7 @@
 bool routeToHyp(ThreadContext *tc) const override;
 uint32_t iss() const override;
 ExceptionClass ec(ThreadContext *tc) const override;
-void annotate(AnnotationIDs id, uint64_t val);
+void annotate(AnnotationIDs id, uint64_t val) override;
 };

 class SoftwareStepFault : public ArmFaultVals

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30894
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I7d64bdb4dfb0ba204e734f727b016bea168180ef
Gerrit-Change-Number: 30894
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-arm: Remove some unused vars from self_debug.hh

2020-07-02 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/30895

to review the following change.


Change subject: arch-arm: Remove some unused vars from self_debug.hh
..

arch-arm: Remove some unused vars from self_debug.hh

Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94
---
M src/arch/arm/self_debug.hh
1 file changed, 1 insertion(+), 6 deletions(-)



diff --git a/src/arch/arm/self_debug.hh b/src/arch/arm/self_debug.hh
index 48d784c..ce35a8b 100644
--- a/src/arch/arm/self_debug.hh
+++ b/src/arch/arm/self_debug.hh
@@ -58,7 +58,6 @@
   private:
 MiscRegIndex ctrlRegIndex;
 MiscRegIndex valRegIndex;
-MiscRegIndex xRegIndex;
 SelfDebug * conf;
 bool isCntxtAware;
 bool VMID16enabled;
@@ -72,7 +71,7 @@
  MiscRegIndex _xIndex, SelfDebug* _conf, bool _ctxAw, bool lva,
  bool vmid16, bool aarch32):
 ctrlRegIndex(_ctrlIndex), valRegIndex(_valIndex),
-xRegIndex(_xIndex), conf(_conf), isCntxtAware(_ctxAw),
+conf(_conf), isCntxtAware(_ctxAw),
 VMID16enabled(vmid16), active_pc(0x0), enable(false)
 {
 maxAddrSize = lva ? 52: 48 ;
@@ -215,10 +214,6 @@
 bool prevSteppedLdx;
 bool cpsrD;

-bool ctrStepped;
-bool ctrActivate;
-
-
   public:
 SoftwareStep(SelfDebug* s): bSS(false), stateSS(INACTIVE_STATE),
 conf(s), steppedLdx(false) { }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30895
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I68b4ddfe66a34a29c0abfd52a8448e0b8a5bbe94
Gerrit-Change-Number: 30895
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: sim: Add M5_VAR_USED to var used in dprint

2020-07-02 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/30896

to review the following change.


Change subject: sim: Add M5_VAR_USED to var used in dprint
..

sim: Add M5_VAR_USED to var used in dprint

Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1
---
M src/sim/system.cc
1 file changed, 1 insertion(+), 1 deletion(-)



diff --git a/src/sim/system.cc b/src/sim/system.cc
index 7057a97..7841ec0 100644
--- a/src/sim/system.cc
+++ b/src/sim/system.cc
@@ -179,7 +179,7 @@
 {
 auto &t = thread(id);
 #   if THE_ISA != NULL_ISA
-BaseCPU *cpu = t.context->getCpuPtr();
+BaseCPU M5_VAR_USED *cpu = t.context->getCpuPtr();
 DPRINTFS(Quiesce, cpu, "quiesce()\n");
 #   endif
 t.quiesce();

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30896
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8f8654b8546ee8df3d4acd1ccbc5080ad38764c1
Gerrit-Change-Number: 30896
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3, gpu-compute: Fix issue when reading const operands

2020-06-22 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29927 )


Change subject: arch-gcn3, gpu-compute: Fix issue when reading const operands
..

arch-gcn3, gpu-compute: Fix issue when reading const operands

Currently, when an instruction has an operand that reads a const
value, it goes through the same readMiscReg() API call as other
misc registers (real HW registers, not constant values). There
is an issue, however, when casting from the const values (which are
32b) to higher precision values, like 64b.

This change creates a separate, templated function call to the GPU's
ISA state that will return the correct type.

Change-Id: I41965ebeeed20bb70e919fce5ad94d957b3af802
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29927
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/gpu_isa.hh
M src/arch/gcn3/isa.cc
M src/arch/gcn3/operand.hh
M src/arch/gcn3/registers.cc
M src/arch/gcn3/registers.hh
M src/gpu-compute/gpu_exec_context.hh
6 files changed, 66 insertions(+), 17 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/gpu_isa.hh b/src/arch/gcn3/gpu_isa.hh
index 26b79c7..228c3fe 100644
--- a/src/arch/gcn3/gpu_isa.hh
+++ b/src/arch/gcn3/gpu_isa.hh
@@ -37,6 +37,7 @@
 #define __ARCH_GCN3_GPU_ISA_HH__

 #include <array>
+#include <type_traits>

 #include "arch/gcn3/registers.hh"
 #include "gpu-compute/dispatcher.hh"
@@ -52,6 +53,24 @@
   public:
 GPUISA(Wavefront &wf);

+template<typename T> T
+readConstVal(int opIdx) const
+{
+panic_if(!std::is_integral<T>::value, "Constant values must "
+ "be an integer.\n");
+T val(0);
+
+if (isPosConstVal(opIdx)) {
+val = (T)readPosConstReg(opIdx);
+}
+
+if (isNegConstVal(opIdx)) {
+val = (T)readNegConstReg(opIdx);
+}
+
+return val;
+}
+
 ScalarRegU32 readMiscReg(int opIdx) const;
 void writeMiscReg(int opIdx, ScalarRegU32 operandVal);
 bool hasScalarUnit() const { return true; }
@@ -63,10 +82,9 @@
 return posConstRegs[opIdx - REG_INT_CONST_POS_MIN];
 }

-ScalarRegU32 readNegConstReg(int opIdx) const
+ScalarRegI32 readNegConstReg(int opIdx) const
 {
-return *((ScalarRegU32*)
-&negConstRegs[opIdx - REG_INT_CONST_NEG_MIN]);
+return negConstRegs[opIdx - REG_INT_CONST_NEG_MIN];
 }

 static const std::array
diff --git a/src/arch/gcn3/isa.cc b/src/arch/gcn3/isa.cc
index 036c771..3bd122d 100644
--- a/src/arch/gcn3/isa.cc
+++ b/src/arch/gcn3/isa.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016-2018 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
@@ -49,14 +49,6 @@
 ScalarRegU32
 GPUISA::readMiscReg(int opIdx) const
 {
-if (opIdx >= REG_INT_CONST_POS_MIN && opIdx <= REG_INT_CONST_POS_MAX) {
-return readPosConstReg(opIdx);
-}
-
-if (opIdx >= REG_INT_CONST_NEG_MIN && opIdx <= REG_INT_CONST_NEG_MAX) {
-return readNegConstReg(opIdx);
-}
-
 switch (opIdx) {
   case REG_M0:
 return m0;
diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh
index 218faf8..7f70fab 100644
--- a/src/arch/gcn3/operand.hh
+++ b/src/arch/gcn3/operand.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
@@ -583,10 +583,15 @@
   default:
 {
 assert(sizeof(DataType) <= sizeof(srfData));
-DataType misc_val
-= (DataType)_gpuDynInst->readMiscReg(_opIdx);
+DataType misc_val(0);
+if (isConstVal(_opIdx)) {
+misc_val = (DataType)_gpuDynInst
+->readConstVal<DataType>(_opIdx);
+} else {
+misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx);
+}
 std::memcpy((void*)srfData.data(), (void*)&misc_val,
-sizeof(DataType));
+sizeof(DataType));
 }
 }
 }
diff --git a/src/arch/gcn3/registers.cc b/src/arch/gcn3/registers.cc
index 0872ff9..016160f 100644
--- a/src/arch/gcn3/registers.cc
+++ b/src/arch/gcn3/registers.cc
@@ -163,6 +163,31 @@
 }

 bool
+isPosConstVal(int opIdx)
+{
+bool is_pos_const_val = (opIdx >= REG_INT_CONST_POS_MIN
+  

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix V_MAD_I32_I24 sign extension

2020-06-22 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29928 )


Change subject: arch-gcn3: Fix V_MAD_I32_I24 sign extension
..

arch-gcn3: Fix V_MAD_I32_I24 sign extension

We are not properly sign extending the bits we hack off for
V_MAD_I32_I24.

This fixes rnn_fwdBwd 64 1 1 lstm pte assertion failure.

Change-Id: I2516e5715227cbd822e6a62630674f64f7a109e0
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29928
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 2 insertions(+), 2 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 32719ad..0256d46 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -27446,8 +27446,8 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0)
-+ src2[lane];
+vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
+* sext<24>(bits(src1[lane], 23, 0)) + src2[lane];
 }
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29928
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I2516e5715227cbd822e6a62630674f64f7a109e0
Gerrit-Change-Number: 29928
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Make headTailMap a std::unordered_map

2020-06-22 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29930 )


Change subject: gpu-compute: Make headTailMap a std::unordered_map
..

gpu-compute: Make headTailMap a std::unordered_map

There is no reason that the headTailMap needs to be
sorted, so let's use a std::unordered_map.

Change-Id: I18641b893352c18ec86e3775c8947a05a6c6547d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29930
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/gpu-compute/compute_unit.hh
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/compute_unit.hh  
b/src/gpu-compute/compute_unit.hh

index 187cbc9..110097e 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -981,7 +981,7 @@
 // hold the time of the arrival of the first cache block related to
 // a particular GPUDynInst. This is used to calculate the difference
 // between the first and last cache block arrival times.
-std::map<GPUDynInstPtr, Tick> headTailMap;
+std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
 };

 #endif // __COMPUTE_UNIT_HH__

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29930
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I18641b893352c18ec86e3775c8947a05a6c6547d
Gerrit-Change-Number: 29930
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: dev: add support for HSA's barrier bit kernel synchronization

2020-06-22 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29925 )


Change subject: dev: add support for HSA's barrier bit kernel synchronization
..

dev: add support for HSA's barrier bit kernel synchronization

This commit adds support for the HSA's barrier bit version of
synchronization.  This method of synchronization is used for all
HIP benchmarks, and thus is necessary to ensure that multiple
kernels from the same queue are synchronizing properly.

Change-Id: I64f2d311a3970b71194e0555e2b932800df65e98
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29925
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/dev/hsa/hsa_packet_processor.cc
M src/dev/hsa/hsa_packet_processor.hh
2 files changed, 39 insertions(+), 3 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/dev/hsa/hsa_packet_processor.cc  
b/src/dev/hsa/hsa_packet_processor.cc

index f9880e4..4143019 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -60,6 +60,11 @@
 #define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
 HSA_PACKET_HEADER_TYPE) & (HSA_PACKET_HEADER_WIDTH_TYPE - 1)))

+// checks if the barrier bit is set in the header -- shift the barrier bit
+// to LSB, then bitwise "and" to mask off all other bits
+#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
+HSA_PACKET_HEADER_BARRIER) & HSA_PACKET_HEADER_WIDTH_BARRIER))
+
 HSAPP_EVENT_DESCRIPTION_GENERATOR(UpdateReadDispIdDmaEvent)
 HSAPP_EVENT_DESCRIPTION_GENERATOR(CmdQueueCmdDmaEvent)
 HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
@@ -280,7 +285,7 @@
 HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
 {
 RQLEntry *queue = regdQList[rl_idx];
-if (!queue->aqlProcessEvent.scheduled()) {
+if (!queue->aqlProcessEvent.scheduled() && !queue->getBarrierBit()) {
 Tick processingTick = curTick() + pktProcessDelay;
 schedule(queue->aqlProcessEvent, processingTick);
 DPRINTF(HSAPacketProcessor, "AQL processing scheduled at  
tick: %d\n",

@@ -316,6 +321,16 @@
 // Submit packet to HSA device (dispatcher)
 hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx,  
host_pkt_addr);

 is_submitted = true;
+/*
+  If this packet is using the "barrier bit" to enforce ordering  
with

+  subsequent kernels, set the bit for this queue now, after
+  dispatching.
+*/
+if (IS_BARRIER(disp_pkt)) {
+DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for  
active" \

+" list ID = %d\n", __FUNCTION__, rl_idx);
+regdQList[rl_idx]->setBarrierBit(true);
+}
 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
 DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
 " active list ID = %d\n", __FUNCTION__, rl_idx);
@@ -631,6 +646,23 @@
 HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
 {
 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
+
+// if barrier bit was set, unset it here -- we assume that finishPkt is
+// only called after the completion of a kernel
+if (regdQList[rl_idx]->getBarrierBit()) {
+DPRINTF(HSAPacketProcessor,
+"Unset barrier bit for active list ID %d\n", rl_idx);
+regdQList[rl_idx]->setBarrierBit(false);
+// if pending kernels in the queue after this kernel, reschedule
+if (regdQList[rl_idx]->dispPending()) {
+DPRINTF(HSAPacketProcessor,
+"Rescheduling active list ID %d after unsetting  
barrier "

+"bit\n", rl_idx);
+schedAQLProcessing(rl_idx);
+}
+}
+
+// If set, then blocked schedule, so need to reschedule
 if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
 updateReadIndex(0, rl_idx);
 DPRINTF(HSAPacketProcessor,
diff --git a/src/dev/hsa/hsa_packet_processor.hh  
b/src/dev/hsa/hsa_packet_processor.hh

index 206d9ab..3ff7ad2 100644
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -168,11 +168,13 @@
 typedef struct QueueContext {
 HSAQueueDescriptor* qDesc;
 AQLRingBuffer* aqlBuf;
+// used for HSA packets that enforce synchronization with barrier bit
+bool barrierBit;
 QueueContext(HSAQueueDescriptor* q_desc,
  AQLRingBuffer* aql_buf)
- : qDesc(q_desc), aqlBuf(aql_buf)
+ : qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false)
 {}
-QueueContext() : qDesc(NULL), aqlBuf(NULL) {}
+QueueContext() : qDesc(NULL), aqlBuf(NULL), barrierBit(false) {}
 } QCntxt;

 class HSAPacketPr

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Remove unused function hostWakeUp from shader

2020-06-22 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29929 )


Change subject: gpu-compute: Remove unused function hostWakeUp from shader
..

gpu-compute: Remove unused function hostWakeUp from shader

Change-Id: Ib4415a7c5918da03bbd16fe9adb4dd593dcaa95c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29929
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/gpu-compute/shader.cc
M src/gpu-compute/shader.hh
2 files changed, 0 insertions(+), 14 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index aa7a6dd..f5e9444 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -153,19 +153,6 @@
 assert(gpuTc);
 }

-void
-Shader::hostWakeUp(BaseCPU *cpu) {
-if (cpuPointer == cpu) {
-if (gpuTc->status() == ThreadContext::Suspended)
-cpu->activateContext(gpuTc->threadId());
-} else {
-//Make sure both dispatcher and shader are trying to
-//wakeup same host. Hack here to enable kernel launch
-//from multiple CPUs
-panic("Dispatcher wants to wakeup a different host");
-}
-}
-
 Shader*
 ShaderParams::create()
 {
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index eeaf343..238f6e0 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -301,7 +301,6 @@
 Addr mmap(int length);
 void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
 void updateContext(int cid);
-void hostWakeUp(BaseCPU *cpu);
 void notifyCuSleep();
 };


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29929
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ib4415a7c5918da03bbd16fe9adb4dd593dcaa95c
Gerrit-Change-Number: 29929
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Updating implementation of atomics

2020-06-22 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29926 )


Change subject: arch-gcn3: Updating implementation of atomics
..

arch-gcn3: Updating implementation of atomics

This changeset moves the access of the data operand
from initiateAcc to the execute method of atomic instructions.

Change-Id: I1debae302f0b13f79ed2b7a9ed2f6b07fcec5128
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29926
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 45 insertions(+), 52 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 26af241..32719ad 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -39261,11 +39261,24 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

 ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+ConstVecOperandU32 cmp(gpuDynInst, extData.DATA + 1);

 addr.read();
+data.read();
+cmp.read();

 calcAddr(gpuDynInst, addr);

+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+(reinterpret_cast(gpuDynInst->x_data))[lane]
+= data[lane];
+(reinterpret_cast(gpuDynInst->a_data))[lane]
+= cmp[lane];
+}
+}
+
 if (gpuDynInst->executedAs() == Enums::SC_GLOBAL ||
 gpuDynInst->executedAs() == Enums::SC_PRIVATE) {
 /**
@@ -39293,21 +39306,6 @@
 void
 Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstVecOperandU32 data(gpuDynInst, extData.DATA);
-ConstVecOperandU32 cmp(gpuDynInst, extData.DATA + 1);
-
-data.read();
-cmp.read();
-
-for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-if (gpuDynInst->exec_mask[lane]) {
-(reinterpret_cast(gpuDynInst->x_data))[lane]
-= data[lane];
-(reinterpret_cast(gpuDynInst->a_data))[lane]
-= cmp[lane];
-}
-}
-
 initAtomicAccess(gpuDynInst);
 } // initiateAcc

@@ -39364,11 +39362,20 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

 ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ConstVecOperandU32 data(gpuDynInst, extData.DATA);

 addr.read();
+data.read();

 calcAddr(gpuDynInst, addr);

+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+(reinterpret_cast(gpuDynInst->a_data))[lane]
+= data[lane];
+}
+}
+
 if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
 gpuDynInst->computeUnit()->globalMemoryPipe.
 issueRequest(gpuDynInst);
@@ -39387,17 +39394,6 @@
 void
 Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
-ConstVecOperandU32 data(gpuDynInst, extData.DATA);
-
-data.read();
-
-for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-if (gpuDynInst->exec_mask[lane]) {
-(reinterpret_cast(gpuDynInst->a_data))[lane]
-= data[lane];
-}
-}
-
 initAtomicAccess(gpuDynInst);
 } // initiateAcc

@@ -39733,11 +39729,24 @@
 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

 ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+ConstVecOperandU64 data(gpuDynInst, extData.DATA);
+ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2);

 addr.read();
+data.read();
+cmp.read();

 calcAddr(gpuDynInst, addr);

+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+(reinterpret_cast(gpuDynInst->x_data))[lane]
+= data[lane];
+(reinterpret_cast(gpuDynInst->a_data))[lane]
+= cmp[lane];
+}
+}
+
 if (gpuDynInst->executedAs() == Enums::SC_GLOBAL ||
 gpuDynInst->executedAs() == Enums::SC_PRIVATE) {
 /**
@@ -39765,21 +39774,6 @@
 void
 Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr  
gpuDynInst)

 {
-ConstVecOperandU64 data(gpuDynInst, extData.DATA);
-ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2);
-
-data.read();
-cmp.read();
-
-for (int lane = 0; lane < NumVecElemPer

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement instruction v_div_fixup_f32

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29924 )


Change subject: arch-gcn3: Implement instruction v_div_fixup_f32
..

arch-gcn3: Implement instruction v_div_fixup_f32

Instruction v_div_fixup_f32 was unimplemented. The
implementation was added by mimicking v_div_fixup_f64.

Change-Id: I9306b198f327e9fde3414aa1bb2bec20503b1efd
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29924
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Reviewed-by: Xianwei Zhang 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 59 insertions(+), 3 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve
  Xianwei Zhang: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 308fd5d..26af241 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -28671,9 +28671,65 @@
 void
 Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
 {
-// Could not parse sq_uc.arch desc field
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+src0.readSrc();
+src1.readSrc();
+src2.readSrc();
+
+if (instData.ABS & 0x1) {
+src0.absModifier();
+}
+
+if (instData.ABS & 0x2) {
+src1.absModifier();
+}
+
+if (instData.ABS & 0x4) {
+src2.absModifier();
+}
+
+if (extData.NEG & 0x1) {
+src0.negModifier();
+}
+
+if (extData.NEG & 0x2) {
+src1.negModifier();
+}
+
+if (extData.NEG & 0x4) {
+src2.negModifier();
+}
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+if (std::fpclassify(src1[lane]) == FP_ZERO) {
+if (std::signbit(src1[lane])) {
+vdst[lane] = -INFINITY;
+} else {
+vdst[lane] = +INFINITY;
+}
+} else if (std::isnan(src2[lane]) ||  
std::isnan(src1[lane])) {

+vdst[lane] = NAN;
+} else if (std::isinf(src1[lane])) {
+if (std::signbit(src1[lane])) {
+vdst[lane] = -INFINITY;
+} else {
+vdst[lane] = +INFINITY;
+}
+} else {
+vdst[lane] = src2[lane] / src1[lane];
+}
+}
+}
+
+vdst.write();
+} // execute
+// --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---

 Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3  
*iFmt)

 : Inst_VOP3(iFmt, "v_div_fixup_f64", false)

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29924
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I9306b198f327e9fde3414aa1bb2bec20503b1efd
Gerrit-Change-Number: 29924
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement instruction v_div_fmas_f32

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29923 )


Change subject: arch-gcn3: Implement instruction v_div_fmas_f32
..

arch-gcn3: Implement instruction v_div_fmas_f32

Instruction v_div_fmas_f32 was unimplemented. The
implementation was added by mimicking v_div_fmas_f64.

Change-Id: I262820a7a66877d140eb99b538715c3cae4d1860
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29923
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Xianwei Zhang 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 43 insertions(+), 2 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Xianwei Zhang: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 2789f3e..308fd5d 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -28879,8 +28879,49 @@
 void
 Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+src0.readSrc();
+src1.readSrc();
+src2.readSrc();
+
+if (instData.ABS & 0x1) {
+src0.absModifier();
+}
+
+if (instData.ABS & 0x2) {
+src1.absModifier();
+}
+
+if (instData.ABS & 0x4) {
+src2.absModifier();
+}
+
+if (extData.NEG & 0x1) {
+src0.negModifier();
+}
+
+if (extData.NEG & 0x2) {
+src1.negModifier();
+}
+
+if (extData.NEG & 0x4) {
+src2.negModifier();
+}
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
+}
+}
+
+//vdst.write();
+} // execute
+// --- Inst_VOP3__V_DIV_FMAS_F64 class methods ---

 Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3 *iFmt)
 : Inst_VOP3(iFmt, "v_div_fmas_f64", false)

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29923
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I262820a7a66877d140eb99b538715c3cae4d1860
Gerrit-Change-Number: 29923
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fix bug with SDWA support

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29922 )


Change subject: arch-gcn3: fix bug with SDWA support
..

arch-gcn3: fix bug with SDWA support

Instructions that use the SDWA field need to use the extra SRC0
register associated with the SDWA instruction instead of the
"default" SRC0 register, since the default SRC0 register contains
the SDWA information when SDWA is being used.  This commit fixes
15de044c to take this into account.  Additionally, this commit
removes reads of the registers from the SDWA helper functions,
since they overwrite any changes made to the destination register.
Finally, this change modifies the instructions that use SDWA to
simplify the flow through the execute() functions.

Change-Id: I3bad83133808dfffc6a4c40bbd49c3d76599e669
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29922
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/inst_util.hh
M src/arch/gcn3/insts/instructions.cc
2 files changed, 133 insertions(+), 110 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/inst_util.hh  
b/src/arch/gcn3/insts/inst_util.hh

index a3b2f4a..292e3ba 100644
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -547,8 +547,8 @@
  * operations are done on it.
  */
 template
-T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel,
- bool signExt)
+T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
+ const SDWASelVals sel, const bool signExt)
 {
 // local variables
 int first_bit = 0, last_bit = 0;
@@ -635,16 +635,14 @@
  *   2.  if sign extend is set, then sign extend the value
  */
 template
-void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel,
- bool signExt)
+void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
+ const SDWASelVals sel, const bool signExt)
 {
 // iterate over all lanes, setting appropriate, selected value
-currOper.read();
-origCurrOper.read();
 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
-   origCurrOper[lane], sel,
-   signExt);
+origCurrOper[lane],  
sel,

+signExt);
 }
 }

@@ -656,8 +654,9 @@
  * operations are done on it.
  */
 template
-T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp,
- SDWASelVals sel, SDWADstVals  
unusedBits_format)

+T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
+ const bool clamp, const SDWASelVals sel,
+ const SDWADstVals unusedBits_format)
 {
 // local variables
 int first_bit = 0, last_bit = 0;
@@ -756,12 +755,11 @@
  *   2 (SDWA_UNUSED_PRESERVE): select data[31:0]
  */
 template
-void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp,
- SDWASelVals sel, SDWADstVals unusedBits_format)
+void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
+ const SDWASelVals sel,
+ const SDWADstVals unusedBits_format)
 {
 // iterate over all lanes, setting appropriate, selected value
-dstOper.read();
-origDstOper.read();
 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
origDstOper[lane],  
clamp,

@@ -779,8 +777,9 @@
  */
 template
 void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
-SDWASelVals src_sel, bool src_signExt,
-bool src_abs, bool src_neg)
+const SDWASelVals src_sel,
+const bool src_signExt, const bool src_abs,
+const bool src_neg)
 {
 /**
  * STEP 1: check if the absolute value (ABS) or negation (NEG) tags
@@ -812,14 +811,13 @@
  * processSDWA_src is called before the math.
  */
 template
-void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
- T & src0, T & origSrc0)
+void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & or

[gem5-dev] Change in gem5/gem5[develop]: tests: remove deprecated hsail gpu_hello

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29921 )


Change subject: tests: remove deprecated hsail gpu_hello
..

tests: remove deprecated hsail gpu_hello

Change-Id: I7e15075e7805af732e89c3269fdff9d65a144219
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29921
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Jason Lowe-Power 
Reviewed-by: Matt Sinclair 
Reviewed-by: Xianwei Zhang 
Maintainer: Anthony Gutierrez 
Maintainer: Jason Lowe-Power 
Tested-by: kokoro 
---
D tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello
D tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
D tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl
D tests/test-progs/gpu-hello/src/gpu-hello.cpp
4 files changed, 0 insertions(+), 420 deletions(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, but someone else must approve
  Xianwei Zhang: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello  
b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello

deleted file mode 100755
index de248ee..000
--- a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello
+++ /dev/null
Binary files differ
diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm  
b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm

deleted file mode 100644
index a4ad144..000
--- a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+++ /dev/null
Binary files differ
diff --git a/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl  
b/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl

deleted file mode 100755
index 496f9b5..000
--- a/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are  
met:

- *
- * 1. Redistributions of source code must retain the above copyright  
notice,

- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright  
notice,
- * this list of conditions and the following disclaimer in the  
documentation

- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its  
contributors

- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS  
IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  
THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR  
PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS  
BE

- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  
THE

- * POSSIBILITY OF SUCH DAMAGE.
- *
- * Author: Marc Orr
- */
-
-
-__kernel void read_kernel(size_t code_size,
-  __global char *code_in,
-  __global int *key_arr,
-  __global char *msg_out,
-  __global int *chars_decoded)
-{
-size_t gid = get_global_id(0);
-size_t my_idx = gid % code_size;
-bool decode = 0;
-__local atomic_int lcount;
-
-if (get_local_id(0) == 0) {
-lcount=0;
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-
-// read code
-char mycode = code_in[my_idx];
-
-// decode
-int my_key = key_arr[my_idx];
-if (my_key) {
-decode = 1;
-for (int n = 0; n < my_key; n++) {
-mycode++;
-}
-}
-
-// write out msg
-msg_out[gid] = mycode;
-
-if (decode) {
-atomic_fetch_add((atomic_int *)(&lcount), 1);
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-
-
-if (get_local_id(0) == 0) {
-int _lcount = atomic_load(&lcount);
-atomic_fetch_add((atomic_int *)chars_decoded, _lcount);
-}
-}
diff --git a/tests/test-progs/gpu-hello/src/gpu-hello.cpp  
b/tests/test-progs/gpu-hello/src/gpu-hello.cpp

deleted file mode 100755
index bdff074..000
--- a/tests/test-progs/gpu-hello/src/gpu-hello.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-/*

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support for unaligned accesses

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29920 )


Change subject: arch-gcn3: add support for unaligned accesses
..

arch-gcn3: add support for unaligned accesses

Previously, with HSAIL, we were guaranteed by the HSA specification
that the GPU would never issue unaligned accesses.  However, now
that we are directly running GCN, this is no longer true.
Accordingly, this commit adds support for unaligned accesses.
Moreover, to reduce the replication of nearly identical
code for the different request types, I also added new helper
functions that are called by all the different memory request
producing instruction types in op_encodings.hh.

Adding support for unaligned instructions requires changing
the statusBitVector used to track the status of the memory
requests for each lane from a bit per lane to an int per lane.
This is necessary because an unaligned access may span multiple
cache lines.  In the worst case, each lane may span multiple
cache lines.  There are corresponding changes in the files that
use the statusBitVector.

Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29920
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
A src/arch/gcn3/gpu_mem_helpers.hh
M src/arch/gcn3/insts/op_encodings.hh
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/gpu_dyn_inst.cc
M src/gpu-compute/gpu_dyn_inst.hh
M src/mem/ruby/common/DataBlock.cc
M src/mem/ruby/system/RubyPort.cc
7 files changed, 298 insertions(+), 242 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/gpu_mem_helpers.hh  
b/src/arch/gcn3/gpu_mem_helpers.hh

new file mode 100644
index 000..40ca565
--- /dev/null
+++ b/src/arch/gcn3/gpu_mem_helpers.hh
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are  
met:

+ *
+ * 1. Redistributions of source code must retain the above copyright  
notice,

+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright  
notice,
+ * this list of conditions and the following disclaimer in the  
documentation

+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from  
this

+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS  
IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  
THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR  
PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS  
BE

+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  
THE

+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Matt Sinclair
+ */
+
+#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
+#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__
+
+#include "arch/gcn3/insts/gpu_static_inst.hh"
+#include "arch/gcn3/insts/op_encodings.hh"
+#include "debug/GPUMem.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+
+/**
+ * Helper function for instructions declared in op_encodings.  This  
function
+ * takes in all of the arguments for a given memory request we are trying  
to

+ * initialize, then submits the request or requests depending on if the
+ * original request is aligned or unaligned.
+ */
+template
+inline void
+initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
+ bool is_atomic=false)
+{
+// local variables
+int req_size = N * sizeof(T);
+int block_size = gpuDynInst->computeUnit()->cacheLineSize();
+Addr vaddr = 0, split_addr = 0;
+bool misaligned_acc = false;
+RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
+PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
+
+gpuDynInst->resetEntireStatusVector();
+for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+vaddr = gpuDynInst->addr[lane];
+
+   

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement instruction v_div_scale_f32

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29919 )


Change subject: arch-gcn3: Implement instruction v_div_scale_f32
..

arch-gcn3: Implement instruction v_div_scale_f32

Instruction v_div_scale_f32 was unimplemented. The
implementation was added by mimicking v_div_scale_f64.

Change-Id: I89cdfd02ab01b5936de0e9f6c41e7f3fc4f10ae1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29919
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Xianwei Zhang 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 34 insertions(+), 2 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Xianwei Zhang: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 8d63296..bd6e4f4 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -28746,8 +28746,40 @@
 void
 Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
+VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+src0.readSrc();
+src1.readSrc();
+src2.readSrc();
+
+if (extData.NEG & 0x1) {
+src0.negModifier();
+}
+
+if (extData.NEG & 0x2) {
+src1.negModifier();
+}
+
+if (extData.NEG & 0x4) {
+src2.negModifier();
+}
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+vdst[lane] = src0[lane];
+vcc.setBit(lane, 0);
+}
+}
+
+vcc.write();
+vdst.write();
+} // execute
+// --- Inst_VOP3__V_DIV_SCALE_F64 class methods ---

 Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64(
   InFmt_VOP3_SDST_ENC *iFmt)

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29919
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I89cdfd02ab01b5936de0e9f6c41e7f3fc4f10ae1
Gerrit-Change-Number: 29919
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: config: fix settings of kernel boundary sync flags

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29918 )


Change subject: config: fix settings of kernel boundary sync flags
..

config: fix settings of kernel boundary sync flags

Change-Id: I58a8edc5d324bdcaa84e3d715e2712a43e8ede0d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29918
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Reviewed-by: Xianwei Zhang 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M configs/example/apu_se.py
1 file changed, 15 insertions(+), 5 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  Xianwei Zhang: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index fee85f0..0ff80d8 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -209,13 +209,23 @@
 # So, all GPU protocols other than GPU_RfO should make their writes
 # visible to the global memory and should read from global memory
 # during kernal boundary. The pipeline initiates(or do not initiate)
-# the acquire/release operation depending on this impl_kern_boundary_sync
-# flag. This flag=true means pipeline initiates a acquire/release operation
-# at kernel boundary.
+# the acquire/release operation depending on these impl_kern_launch_rel
+# and impl_kern_end_rel flags.  The flag=true means pipeline initiates
+# a acquire/release operation at kernel launch/end.
+# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
+# are write-through based, and thus only imple_kern_launch_acq needs to
+# set.
 if buildEnv['PROTOCOL'] == 'GPU_RfO':
-shader.impl_kern_boundary_sync = False
+shader.impl_kern_launch_acq = False
+shader.impl_kern_end_rel = False
+elif (buildEnv['PROTOCOL'] != 'GPU_VIPER' or
+buildEnv['PROTOCOL'] != 'GPU_VIPER_Region' or
+buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline'):
+shader.impl_kern_launch_acq = True
+shader.impl_kern_end_rel = False
 else:
-shader.impl_kern_boundary_sync = True
+shader.impl_kern_launch_acq = True
+shader.impl_kern_end_rel = True

 # Switching off per-lane TLB by default
 per_lane = False

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29918
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I58a8edc5d324bdcaa84e3d715e2712a43e8ede0d
Gerrit-Change-Number: 29918
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: enable flexible control of kernel boundary syncs

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29917 )


Change subject: gpu-compute: enable flexible control of kernel boundary syncs
..

gpu-compute: enable flexible control of kernel boundary syncs

Kernel end release was turned on for the VIPER protocol, which
is in fact write-through based and thus has no need for a
release operation. This changeset splits the option
'impl_kern_boundary_sync' into 'impl_kern_launch_acq'
and 'impl_kern_end_rel', and turns off release on VIPER.

Change-Id: I5490019b6765a25bd801cc78fb7445b90eb02a3d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29917
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Xianwei Zhang 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/arch/gcn3/insts/instructions.cc
M src/gpu-compute/GPU.py
M src/gpu-compute/dispatcher.cc
M src/gpu-compute/shader.cc
M src/gpu-compute/shader.hh
5 files changed, 20 insertions(+), 11 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Xianwei Zhang: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 7578694..8d63296 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -3759,9 +3759,13 @@
 // the last workgroup in the kernel).
 bool kernelEnd =
  
wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);

+// further check whether 'release @ kernel end' is needed
+bool relNeeded =
+wf->computeUnit->shader->impl_kern_end_rel;

-// if it is not a kernel end, then retire the workgroup  
directly

-if (!kernelEnd) {
+// if not a kernel end or no release needed, retire the  
workgroup

+// directly
+if (!kernelEnd || !relNeeded) {
 wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
 wf->setStatus(Wavefront::S_STOPPED);
 wf->computeUnit->completedWGs++;
@@ -3770,8 +3774,8 @@
 }

 /**
- * If it is a kernel end, inject a memory sync and retire the
- * workgroup after receving response.
+ * If a kernel end and release needed, inject a memory sync and
+ * retire the workgroup after receving all acks.
  */
 setFlag(MemSync);
 setFlag(GlobalSegment);
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 6b033f4..8a2ad81 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -213,8 +213,10 @@
 gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
 dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
 n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
-impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets  
into

- ruby at kernel boundaries""")
+impl_kern_launch_acq = Param.Bool(True, """Insert acq packet into
+ ruby at kernel launch""")
+impl_kern_end_rel = Param.Bool(False, """Insert rel packet into
+ ruby at kernel end""")
 globalmem = Param.MemorySize('64kB', 'Memory size')
 timing = Param.Bool(False, 'timing memory accesses')

diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 51f5e97..6a8242f 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -166,12 +166,12 @@
 auto task = hsaQueueEntries[exec_id];
 bool launched(false);

-// invalidate is needed before starting dispatch
-if (shader->impl_kern_boundary_sync) {
+// acq is needed before starting dispatch
+if (shader->impl_kern_launch_acq) {
 // try to invalidate cache
 shader->prepareInvalidate(task);
 } else {
-// kern boundary sync is not set, skip invalidate
+// kern launch acquire is not set, skip invalidate
 task->markInvDone();
 }

diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index 4be2fbf..aa7a6dd 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -56,7 +56,8 @@
 tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds  
event",

   false, Event::CPU_Tick_Pri),
 timingSim(p->timing), hsail_mode(SIMT),
-impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+impl_kern_launch_acq(p->impl_kern_launch_acq),
+impl_kern_end_rel(p->impl_kern_end_rel),
 coissue_return(1),
 trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
 globalMemSize(p->globalmem),
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 72063a4..eeaf343 10

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: remove recvToken from GM pipe exec

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29916 )


Change subject: gpu-compute: remove recvToken from GM pipe exec
..

gpu-compute: remove recvToken from GM pipe exec

Tokens were previously acquired in GM pipe exec, but token acquisition
has been moved to acqCoalescerToken. This removes the extraneous code which
was acquiring tokens twice, causing them to be depleted and triggering an
assertion.

Change-Id: Ic92de8f06cc85828b29c69790bdadde057ef1777
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29916
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matthew Poremba 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/gpu-compute/global_memory_pipeline.cc
1 file changed, 0 insertions(+), 6 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matthew Poremba: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/global_memory_pipeline.cc  
b/src/gpu-compute/global_memory_pipeline.cc

index 0bbacc4..c73184a 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -190,12 +190,6 @@

 DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
 mp->disassemble(), mp->seqNum());
-// Memfences will not return tokens and must be issued so we should
-// not request one as this will deplete the token count until  
deadlock

-if (!mp->isMemSync()) {
-assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
-mp->computeUnit()->getTokenManager()->acquireTokens(1);
-}
 mp->initiateAcc(mp);

 if (((mp->isMemSync() && !mp->isEndOfKernel()) | 
| !mp->isMemSync())) {


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29916
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ic92de8f06cc85828b29c69790bdadde057ef1777
Gerrit-Change-Number: 29916
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Matthew Poremba 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: Add DMA support to MOESI_AMD_Base-dir.sm

2020-06-19 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29914 )


Change subject: mem-ruby: Add DMA support to MOESI_AMD_Base-dir.sm
..

mem-ruby: Add DMA support to MOESI_AMD_Base-dir.sm

This change adds DMA support to MOESI_AMD_Base-dir.sm,
which is needed to support ROCm apps/GCN3 ISA in the VIPER
protocol. The DMA controller is copied from MOESI_hammer-dma.sm
with few modifications.

Change-Id: I56141436eee1c8f62c2a0915fa3b63b83bbcbc9a
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29914
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/mem/ruby/protocol/GPU_VIPER.slicc
M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
A src/mem/ruby/protocol/MOESI_AMD_Base-dma.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
4 files changed, 499 insertions(+), 6 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/mem/ruby/protocol/GPU_VIPER.slicc  
b/src/mem/ruby/protocol/GPU_VIPER.slicc

index 55ed671..196058b 100644
--- a/src/mem/ruby/protocol/GPU_VIPER.slicc
+++ b/src/mem/ruby/protocol/GPU_VIPER.slicc
@@ -2,6 +2,7 @@
 include "RubySlicc_interfaces.slicc";
 include "MOESI_AMD_Base-msg.sm";
 include "MOESI_AMD_Base-dir.sm";
+include "MOESI_AMD_Base-dma.sm";
 include "MOESI_AMD_Base-CorePair.sm";
 include "GPU_VIPER-msg.sm";
 include "GPU_VIPER-TCP.sm";
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm  
b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm

index efbffbd..c8dafd5 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -42,6 +42,10 @@
   bool useL3OnWT := "False";
   Cycles to_memory_controller_latency := 1;

+  // DMA
+  MessageBuffer * requestFromDMA, network="From", virtual_network="1",  
vnet_type="request";
+  MessageBuffer * responseToDMA, network="To", virtual_network="3",  
vnet_type="request";

+
   // From the Cores
   MessageBuffer * requestFromCores, network="From", virtual_network="0",  
vnet_type="request";
   MessageBuffer * responseFromCores, network="From", virtual_network="2",  
vnet_type="response";

@@ -63,13 +67,17 @@
 // BL is Busy because it's possible for the data only to be in the  
network
 // in the WB, L3 has sent it and gone on with its business in possibly  
I

 // state.
+BDR_M, AccessPermission:Backing_Store,  desc="DMA read, blocked  
waiting for memory";
 BS_M, AccessPermission:Backing_Store, desc="blocked  
waiting for memory";
 BM_M, AccessPermission:Backing_Store, desc="blocked  
waiting for memory";
 B_M, AccessPermission:Backing_Store, desc="blocked  
waiting for memory";
 BP, AccessPermission:Backing_Store, desc="blocked  
waiting for probes, no need for memory";
+BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked  
waiting for probes and memory";
 BS_PM, AccessPermission:Backing_Store,desc="blocked  
waiting for probes and Memory";
 BM_PM, AccessPermission:Backing_Store,desc="blocked  
waiting for probes and Memory";
 B_PM, AccessPermission:Backing_Store,desc="blocked  
waiting for probes and Memory";
+BDW_P, AccessPermission:Backing_Store, desc="DMA write, blocked  
waiting for probes, no need for memory";
+BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked  
waiting for probes, already got memory";
 BS_Pm, AccessPermission:Backing_Store,desc="blocked  
waiting for probes, already got memory";
 BM_Pm, AccessPermission:Backing_Store,desc="blocked  
waiting for probes, already got memory";
 B_Pm, AccessPermission:Backing_Store,desc="blocked  
waiting for probes, already got memory";

@@ -107,6 +115,10 @@
 UnblockWriteThrough,desc="Unblock because of writethrough request  
finishing";


 StaleVicDirty,desc="Core invalidated before VicDirty  
processed";

+
+// DMA
+DmaRead,desc="DMA read";
+DmaWrite,   desc="DMA write";
   }

   enumeration(RequestType, desc="To communicate stats from transitions to  
recordStats") {

@@ -148,6 +160,7 @@
 bool L3Hit, default="false", desc="Was this an L3 hit?";
 uint64_t probe_id,desc="probe id for lifetime profiling";
 WriteMask writeMask,desc="outstanding write through mask";
+int Len,desc="Length of memory request for DMA";
   }

   structure(TBETable, external="yes") {
@@ -266,6 +279,8 @@
   }

   // ** OUT_PORTS **
+  out_port(dmaResponseQueue_out, DMAResponseMsg, responseToDMA);
+
   out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore);
   out_port(responseNetwork_out, ResponseMsg, responseToCore);

@@ -276,6 +291,23 @@

   // ** IN_PORTS **

+  // DMA Ports
+ 

[gem5-dev] Change in gem5/gem5[develop]: arch, gpu-compute: Remove HSAIL related files

2020-06-16 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/28410 )


Change subject: arch, gpu-compute: Remove HSAIL related files
..

arch, gpu-compute: Remove HSAIL related files

Change-Id: Iefba0a38d62da7598bbfe3fe6ff46454d35144b1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28410
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M MAINTAINERS
M SConstruct
D build_opts/HSAIL_X86
M src/arch/gcn3/SConscript
D src/arch/hsail/Brig.h
D src/arch/hsail/Brig_new.hpp
D src/arch/hsail/SConscript
D src/arch/hsail/SConsopts
D src/arch/hsail/gen.py
D src/arch/hsail/gpu_decoder.hh
D src/arch/hsail/gpu_isa.hh
D src/arch/hsail/gpu_types.hh
D src/arch/hsail/insts/branch.cc
D src/arch/hsail/insts/branch.hh
D src/arch/hsail/insts/decl.hh
D src/arch/hsail/insts/gpu_static_inst.cc
D src/arch/hsail/insts/gpu_static_inst.hh
D src/arch/hsail/insts/main.cc
D src/arch/hsail/insts/mem.cc
D src/arch/hsail/insts/mem.hh
D src/arch/hsail/insts/mem_impl.hh
D src/arch/hsail/insts/pseudo_inst.cc
D src/arch/hsail/operand.cc
D src/arch/hsail/operand.hh
D src/gpu-compute/brig_object.cc
D src/gpu-compute/brig_object.hh
D src/gpu-compute/cl_driver.cc
D src/gpu-compute/cl_driver.hh
D src/gpu-compute/cl_event.hh
D src/gpu-compute/condition_register_state.cc
D src/gpu-compute/condition_register_state.hh
D src/gpu-compute/hsa_code.hh
D src/gpu-compute/hsa_kernel_info.hh
D src/gpu-compute/hsa_object.cc
D src/gpu-compute/hsa_object.hh
D src/gpu-compute/hsail_code.cc
D src/gpu-compute/hsail_code.hh
D src/gpu-compute/kernel_cfg.cc
D src/gpu-compute/kernel_cfg.hh
D src/gpu-compute/ndrange.hh
D src/gpu-compute/qstruct.hh
D src/gpu-compute/vector_register_state.cc
D src/gpu-compute/vector_register_state.hh
M util/git-commit-msg.py
M util/regress
45 files changed, 6 insertions(+), 12,854 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass




--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28410
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Iefba0a38d62da7598bbfe3fe6ff46454d35144b1
Gerrit-Change-Number: 28410
Gerrit-PatchSet: 10
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Alexandru Duțu 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Bradford Beckmann 
Gerrit-Reviewer: Gabe Black 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Matthew Poremba 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

2020-06-15 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29912 )


Change subject: gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU  
model

..

gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Jason Lowe-Power 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
A build_opts/GCN3_X86
M configs/common/GPUTLBConfig.py
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
M src/arch/gcn3/operand.hh
M src/dev/hsa/hsa_device.cc
M src/dev/hsa/hsa_driver.cc
M src/dev/hsa/hsa_driver.hh
M src/dev/hsa/hsa_packet_processor.cc
M src/dev/hsa/hw_scheduler.cc
M src/gpu-compute/GPU.py
M src/gpu-compute/GPUStaticInstFlags.py
M src/gpu-compute/SConscript
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/dispatcher.cc
M src/gpu-compute/dispatcher.hh
M src/gpu-compute/exec_stage.cc
M src/gpu-compute/exec_stage.hh
M src/gpu-compute/fetch_stage.cc
M src/gpu-compute/fetch_stage.hh
M src/gpu-compute/fetch_unit.cc
M src/gpu-compute/fetch_unit.hh
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/global_memory_pipeline.hh
A src/gpu-compute/gpu_command_processor.cc
A src/gpu-compute/gpu_command_processor.hh
A src/gpu-compute/gpu_compute_driver.cc
A src/gpu-compute/gpu_compute_driver.hh
M src/gpu-compute/gpu_dyn_inst.cc
M src/gpu-compute/gpu_dyn_inst.hh
M src/gpu-compute/gpu_exec_context.cc
M src/gpu-compute/gpu_static_inst.cc
M src/gpu-compute/gpu_static_inst.hh
M src/gpu-compute/gpu_tlb.cc
M src/gpu-compute/gpu_tlb.hh
A src/gpu-compute/hsa_queue_entry.hh
A src/gpu-compute/kernel_code.hh
M src/gpu-compute/lds_state.cc
M src/gpu-compute/lds_state.hh
M src/gpu-compute/local_memory_pipeline.cc
M src/gpu-compute/local_memory_pipeline.hh
M src/gpu-compute/misc.hh
M src/gpu-compute/pool_manager.cc
M src/gpu-compute/pool_manager.hh
A src/gpu-compute/register_file.cc
A src/gpu-compute/register_file.hh
A src/gpu-compute/register_manager.cc
A src/gpu-compute/register_manager.hh
A src/gpu-compute/register_manager_policy.hh
M src/gpu-compute/rr_scheduling_policy.hh
A src/gpu-compute/scalar_memory_pipeline.cc
A src/gpu-compute/scalar_memory_pipeline.hh
A src/gpu-compute/scalar_register_file.cc
A src/gpu-compute/scalar_register_file.hh
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/schedule_stage.hh
M src/gpu-compute/scoreboard_check_stage.cc
M src/gpu-compute/scoreboard_check_stage.hh
M src/gpu-compute/shader.cc
M src/gpu-compute/shader.hh
M src/gpu-compute/simple_pool_manager.cc
M src/gpu-compute/simple_pool_manager.hh
A src/gpu-compute/static_register_manager_policy.cc
A src/gpu-compute/static_register_manager_policy.hh
M src/gpu-compute/tlb_coalescer.cc
M src/gpu-compute/tlb_coalescer.hh
M src/gpu-compute/vector_register_file.cc
M src/gpu-compute/vector_register_file.hh
M src/gpu-compute/wavefront.cc
M src/gpu-compute/wavefront.hh
M src/mem/packet.cc
M src/mem/packet.hh
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
A src/mem/ruby/protocol/GPU_VIPER-msg.sm
M src/mem/ruby/protocol/GPU_VIPER.slicc
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_Exports.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/slicc_interface/AbstractController.cc
M src/mem/ruby/slicc_interface/RubyRequest.hh
M src/mem/ruby/system/GPUCoalescer.cc
M src/mem/ruby/system/GPUCoalescer.hh
M src/mem/ruby/system/GPUCoalescer.py
M src/mem/ruby/system/VIPERCoalescer.hh
M src/mem/ruby/system/VIPERCoalescer.py
86 files changed, 10,299 insertions(+), 3,734 deletions(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass




--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29912
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Gerrit-Change-Number: 29912
Gerrit-PatchSet: 8
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Bradford Beckmann 
Gerrit-Reviewer: Gabe Black 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Matthew Poremba 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: Add codes for pure virtual functions for compilation

2020-06-09 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/28409 )


Change subject: mem-ruby: Add codes for pure virtual functions for  
compilation

..

mem-ruby: Add codes for pure virtual functions for compilation

Change-Id: Ic34f9ccf10ec28d68eed236dc6246e2ae2ef1b89
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28409
Tested-by: kokoro 
Reviewed-by: Anthony Gutierrez 
Reviewed-by: Matt Sinclair 
Maintainer: Anthony Gutierrez 
---
M src/mem/ruby/system/VIPERCoalescer.cc
M src/mem/ruby/system/VIPERCoalescer.hh
2 files changed, 13 insertions(+), 0 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/mem/ruby/system/VIPERCoalescer.cc  
b/src/mem/ruby/system/VIPERCoalescer.cc

index d8977ac..cdef2b1 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -76,6 +76,16 @@
 {
 }

+void
+VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
+{
+}
+
+void
+VIPERCoalescer::issueMemSyncRequest(PacketPtr pkt)
+{
+}
+
 // Places an uncoalesced packet in uncoalescedTable. If the packet is a
 // special type (MemFence, scoping, etc), it is issued immediately.
 RequestStatus
diff --git a/src/mem/ruby/system/VIPERCoalescer.hh  
b/src/mem/ruby/system/VIPERCoalescer.hh

index 2b6e86e..814166d 100644
--- a/src/mem/ruby/system/VIPERCoalescer.hh
+++ b/src/mem/ruby/system/VIPERCoalescer.hh
@@ -57,6 +57,9 @@
 typedef VIPERCoalescerParams Params;
 VIPERCoalescer(const Params *);
 ~VIPERCoalescer();
+
+void issueMemSyncRequest(PacketPtr pkt);
+void issueRequest(CoalescedRequest* crequest) override;
 void wbCallback(Addr address);
 void invCallback(Addr address);
 RequestStatus makeRequest(PacketPtr pkt);

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28409
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ic34f9ccf10ec28d68eed236dc6246e2ae2ef1b89
Gerrit-Change-Number: 28409
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Alexandru Duțu 
Gerrit-Reviewer: Anthony Gutierrez 
Gerrit-Reviewer: Bradford Beckmann 
Gerrit-Reviewer: Jason Lowe-Power 
Gerrit-Reviewer: Matt Sinclair 
Gerrit-Reviewer: Matthew Poremba 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Tuan Ta 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-Reviewer: kokoro 
Gerrit-MessageType: merged
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: mem-ruby: update memory interfaces to support GPU ISA

2020-06-09 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/28408 )


Change subject: mem-ruby: update memory interfaces to support GPU ISA
..

mem-ruby: update memory interfaces to support GPU ISA

This patch deprecates HSA-based memory request types and adds new
types that can be used by real ISA instructions.

Change-Id: Ie107a69d8a35e9de0853f1407392ad01a8b3e930
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28408
Reviewed-by: Anthony Gutierrez 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/mem/packet.cc
M src/mem/packet.hh
M src/mem/request.hh
M src/mem/ruby/slicc_interface/RubyRequest.hh
4 files changed, 45 insertions(+), 131 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index 2d69ba2..1c1da21 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -181,6 +181,10 @@
 { 0, InvalidCmd, "Deprecated_MessageResp" },
 /* MemFenceReq -- for synchronization requests */
 {SET2(IsRequest, NeedsResponse), MemFenceResp, "MemFenceReq"},
+/* MemSyncReq */
+{SET2(IsRequest, NeedsResponse), MemSyncResp, "MemSyncReq"},
+/* MemSyncResp */
+{SET1(IsResponse), InvalidCmd, "MemSyncResp"},
 /* MemFenceResp -- for synchronization responses */
 {SET1(IsResponse), InvalidCmd, "MemFenceResp"},
 /* Cache Clean Request -- Update with the latest data all existing
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index d390c00..42d286a 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -110,6 +110,8 @@
 SwapResp,
 // MessageReq and MessageResp are deprecated.
 MemFenceReq = SwapResp + 3,
+MemSyncReq,  // memory synchronization request (e.g., cache  
invalidate)

+MemSyncResp, // memory synchronization response
 MemFenceResp,
 CleanSharedReq,
 CleanSharedResp,
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 01252bf..4e0ba97 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -110,7 +110,7 @@
  * STRICT_ORDER flag should be set if such reordering is
  * undesirable.
  */
-UNCACHEABLE= 0x0400,
+UNCACHEABLE = 0x0400,
 /**
  * The request is required to be strictly ordered by CPU
  * models and is non-speculative.
@@ -216,35 +216,30 @@
 };
 /** @} */

-typedef uint32_t MemSpaceConfigFlagsType;
-typedef ::Flags MemSpaceConfigFlags;
+typedef uint64_t CacheCoherenceFlagsType;
+typedef ::Flags CacheCoherenceFlags;

-enum : MemSpaceConfigFlagsType {
-/** Has a synchronization scope been set? */
-SCOPE_VALID= 0x0001,
-/** Access has Wavefront scope visibility */
-WAVEFRONT_SCOPE= 0x0002,
-/** Access has Workgroup scope visibility */
-WORKGROUP_SCOPE= 0x0004,
-/** Access has Device (e.g., GPU) scope visibility */
-DEVICE_SCOPE   = 0x0008,
-/** Access has System (e.g., CPU + GPU) scope visibility */
-SYSTEM_SCOPE   = 0x0010,
-
-/** Global Segment */
-GLOBAL_SEGMENT = 0x0020,
-/** Group Segment */
-GROUP_SEGMENT  = 0x0040,
-/** Private Segment */
-PRIVATE_SEGMENT= 0x0080,
-/** Kergarg Segment */
-KERNARG_SEGMENT= 0x0100,
-/** Readonly Segment */
-READONLY_SEGMENT   = 0x0200,
-/** Spill Segment */
-SPILL_SEGMENT  = 0x0400,
-/** Arg Segment */
-ARG_SEGMENT= 0x0800,
+/**
+ * These bits are used to set the coherence policy
+ * for the GPU and are encoded in the GCN3 instructions.
+ * See the AMD GCN3 ISA Architecture Manual for more
+ * details.
+ *
+ * SLC: System Level Coherent. Accesses are forced to miss in
+ *  the L2 cache and are coherent with system memory.
+ *
+ * GLC: Globally Coherent. Controls how reads and writes are
+ *  handled by the L1 cache. Global here referes to the
+ *  data being visible globally on the GPU (i.e., visible
+ *  to all WGs).
+ *
+ * For atomics, the GLC bit is used to distinguish between
+ * between atomic return/no-return operations.
+ */
+enum : CacheCoherenceFlagsType {
+/** user-policy flags */
+SLC_BIT = 0x0080,
+GLC_BIT = 0x0100,
 };

 using LocalAccessor =
@@ -305,8 +300,8 @@
 /** Flag structure for the request. */
 Flags _flags;

-/** Memory space configuraiton flag structure for the request. */
-MemSpaceConfigFlags _memSpaceConfigFlags;
+/** Flags that control

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: Change how waitcnts are implemented

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29973

to review the following change.


Change subject: gpu-compute, arch-gcn3: Change how waitcnts are implemented
..

gpu-compute, arch-gcn3: Change how waitcnts are implemented

Use single counters per memory operation type and increment
them upon issue, not execute.

Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3
---
M src/arch/gcn3/insts/instructions.cc
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/gpu_dyn_inst.cc
M src/gpu-compute/local_memory_pipeline.cc
M src/gpu-compute/scalar_memory_pipeline.cc
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/wavefront.cc
M src/gpu-compute/wavefront.hh
8 files changed, 106 insertions(+), 18 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 9987fad..7c2cf0e 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32565,6 +32565,7 @@

 vdst.write();

+wf->decLGKMInstsIssued();
 wf->rdLmReqsInPipe--;
 wf->validateRequestCounters();
 } // execute
@@ -32635,6 +32636,7 @@

 vdst.write();

+wf->decLGKMInstsIssued();
 wf->rdLmReqsInPipe--;
 wf->validateRequestCounters();
 } // execute
@@ -39400,6 +39402,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39496,6 +39500,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39592,6 +39598,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39660,6 +39668,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39728,6 +39738,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 return;
@@ -39805,6 +39817,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->rdGmReqsInPipe--;
 wf->rdLmReqsInPipe--;
 }
@@ -39884,6 +39898,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -39952,6 +39968,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40021,6 +40039,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40090,6 +40110,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40159,6 +40181,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40237,6 +40261,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 wf->wrGmReqsInPipe--;
 wf->wrLmReqsInPipe--;
 return;
@@ -40325,6 +40351,8 @@
 Wavefront *wf = gpuDynInst->wavefront();

 if (wf->execMask().none()) {
+wf->decVMemInstsIssued();
+wf->decLGKMInstsIssued();
 

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Add pipeline stage interface classes

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29972

to review the following change.


Change subject: gpu-compute: Add pipeline stage interface classes
..

gpu-compute: Add pipeline stage interface classes

This change separates the pipeline stage interfaces
for the GPU's compute unit into their own classes
with a well-defined interface. This helps to create
a cleaner interface for users to extend the CU
pipeline's capabilities and also helps consolidate
all the pipeline communication code in one place
in the source.

Change-Id: I569d52bce84dc1b9fbf8f0f96d53a81a2b6773c6
---
M src/gpu-compute/SConscript
A src/gpu-compute/comm.cc
A src/gpu-compute/comm.hh
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/exec_stage.cc
M src/gpu-compute/exec_stage.hh
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/schedule_stage.hh
M src/gpu-compute/scoreboard_check_stage.cc
M src/gpu-compute/scoreboard_check_stage.hh
11 files changed, 578 insertions(+), 308 deletions(-)



diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
index 244791b..0f1afbc 100644
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -41,6 +41,7 @@
 SimObject('LdsState.py')
 SimObject('X86GPUTLB.py')

+Source('comm.cc')
 Source('compute_unit.cc')
 Source('dispatcher.cc')
 Source('exec_stage.cc')
diff --git a/src/gpu-compute/comm.cc b/src/gpu-compute/comm.cc
new file mode 100644
index 000..b1dd031
--- /dev/null
+++ b/src/gpu-compute/comm.cc
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are  
met:

+ *
+ * 1. Redistributions of source code must retain the above copyright  
notice,

+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright  
notice,
+ * this list of conditions and the following disclaimer in the  
documentation

+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from  
this

+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS  
IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  
THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR  
PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS  
BE

+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  
THE

+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+#include "gpu-compute/comm.hh"
+
+#include 
+
+#include "gpu-compute/wavefront.hh"
+#include "params/ComputeUnit.hh"
+
+/**
+ * Scoreboard/Schedule stage interface.
+ */
+ScoreboardCheckToSchedule::ScoreboardCheckToSchedule(const  
ComputeUnitParams

+ *p)
+{
+int num_func_units = p->num_SIMDs + p->num_scalar_cores
++ p->num_global_mem_pipes + p->num_shared_mem_pipes
++ p->num_scalar_mem_pipes;
+_readyWFs.resize(num_func_units);
+
+for (auto &func_unit_wf_list : _readyWFs) {
+func_unit_wf_list.reserve(p->n_wf);
+}
+}
+
+void
+ScoreboardCheckToSchedule::reset()
+{
+for (auto &func_unit_wf_list : _readyWFs) {
+func_unit_wf_list.resize(0);
+}
+}
+
+void
+ScoreboardCheckToSchedule::markWFReady(Wavefront *wf, int func_unit_id)
+{
+_readyWFs[func_unit_id].push_back(wf);
+}
+
+int
+ScoreboardCheckToSchedule::numReadyLists() const
+{
+return _readyWFs.size();
+}
+
+std::vector<Wavefront*>&
+ScoreboardCheckToSchedule::readyWFs(int func_unit_id)
+{
+return _readyWFs[func_unit_id];
+}
+
+/**
+ * Delete all wavefronts that have been marked as ready at scoreboard stage
+ * but are found to have empty instruction buffers at schedule stage.
+ */
+void
+ScoreboardCheckToSchedule::updateReadyList(int func_unit_id)
+{
+std::vector<Wavefront*> &func_unit_wf_list = _readyWFs[func_unit_id];
+
+for (auto it = func_unit_wf_list.begin(); it !=  
func_unit_wf_list.end();) {

+if ((*it)->instructionBuffer.empty()) {
+it = func_unit_wf_list.erase(it);
+} else {
+  

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Use refs to CU in pipe stages/mem pipes

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29969

to review the following change.


Change subject: gpu-compute: Use refs to CU in pipe stages/mem pipes
..

gpu-compute: Use refs to CU in pipe stages/mem pipes

The pipe stages and memory pipes are changed to store
a reference to their parent CU as opposed to a pointer.
These objects will never change which CU they belong to,
and they are constructed by their parent CU.

Change-Id: Ie5476e1e2e124a024c2efebceb28cb3a9baa78c1
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/exec_stage.cc
M src/gpu-compute/exec_stage.hh
M src/gpu-compute/fetch_stage.cc
M src/gpu-compute/fetch_stage.hh
M src/gpu-compute/fetch_unit.cc
M src/gpu-compute/fetch_unit.hh
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/global_memory_pipeline.hh
M src/gpu-compute/local_memory_pipeline.cc
M src/gpu-compute/local_memory_pipeline.hh
M src/gpu-compute/scalar_memory_pipeline.cc
M src/gpu-compute/scalar_memory_pipeline.hh
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/schedule_stage.hh
M src/gpu-compute/scoreboard_check_stage.cc
M src/gpu-compute/scoreboard_check_stage.hh
17 files changed, 191 insertions(+), 191 deletions(-)



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index 653c074..a59a7fd 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -67,13 +67,13 @@
 vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
 coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
 registerManager(p->register_manager),
-fetchStage(p, this),
-scoreboardCheckStage(p, this),
-scheduleStage(p, this),
-execStage(p, this),
-globalMemoryPipe(p, this),
-localMemoryPipe(p, this),
-scalarMemoryPipe(p, this),
+fetchStage(p, *this),
+scoreboardCheckStage(p, *this),
+scheduleStage(p, *this),
+execStage(p, *this),
+globalMemoryPipe(p, *this),
+localMemoryPipe(p, *this),
+scalarMemoryPipe(p, *this),
 tickEvent([this]{ exec(); }, "Compute unit tick event",
   false, Event::CPU_Tick_Pri),
 cu_id(p->cu_id),
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc
index 3c6aaad..1fc04f5 100644
--- a/src/gpu-compute/exec_stage.cc
+++ b/src/gpu-compute/exec_stage.cc
@@ -40,10 +40,10 @@
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"

-ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
+ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
 : computeUnit(cu), lastTimeInstExecuted(false),
   thisTimeInstExecuted(false), instrExecuted (false),
-  executionResourcesUsed(0), _name(cu->name() + ".ExecStage")
+  executionResourcesUsed(0), _name(cu.name() + ".ExecStage")

 {
 numTransActiveIdle = 0;
@@ -53,7 +53,7 @@
 void
 ExecStage::init()
 {
-dispatchList = &computeUnit->dispatchList;
+dispatchList = &computeUnit.dispatchList;
 idle_dur = 0;
 }

@@ -126,7 +126,7 @@
 {
 std::stringstream ss;
 bool empty = true;
-for (int i = 0; i < computeUnit->numExeUnits(); i++) {
+for (int i = 0; i < computeUnit.numExeUnits(); i++) {
 DISPATCH_STATUS s = dispatchList->at(i).second;
 ss << i << ": " << dispStatusToStr(s);
 if (s != EMPTY) {
@@ -150,7 +150,7 @@
 if (Debug::GPUSched) {
 dumpDispList();
 }
-for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
 DISPATCH_STATUS s = dispatchList->at(unitId).second;
 switch (s) {
 case EMPTY:
@@ -167,7 +167,7 @@
 (w->instructionBuffer.front())->disassemble());
 DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
 dispatchList->at(unitId).first->exec();
-(computeUnit->scheduleStage).deleteFromSch(w);
+(computeUnit.scheduleStage).deleteFromSch(w);
 dispatchList->at(unitId).second = EMPTY;
 dispatchList->at(unitId).first->freeResources();
 dispatchList->at(unitId).first = nullptr;
@@ -207,7 +207,7 @@
 ;

 spc
-.init(0, computeUnit->numExeUnits(), 1)
+.init(0, computeUnit.numExeUnits(), 1)
 .name(name() + ".spc")
 .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
 ;
@@ -219,26 +219,26 @@
 ;

 numCyclesWithInstrTypeIssued
-.init(computeUnit->numExeUnits())
+.init(computeUnit.numExeUnits())
 .name(name() + ".num_cycles_issue_exec_rsrc")
 .desc("Number of cycles at least one instruction issued to "
   "execution resource type")
 ;

 numCyclesWithNoInstrTypeIssued
-.init(computeUnit->numExeUnits())
+.init(computeUnit.numExeUnits(

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add case to op selector when operand is vcc_hi

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29971

to review the following change.


Change subject: arch-gcn3: Add case to op selector when operand is vcc_hi
..

arch-gcn3: Add case to op selector when operand is vcc_hi

Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36
---
M src/arch/gcn3/registers.cc
1 file changed, 2 insertions(+), 0 deletions(-)



diff --git a/src/arch/gcn3/registers.cc b/src/arch/gcn3/registers.cc
index 016160f..d5c4903 100644
--- a/src/arch/gcn3/registers.cc
+++ b/src/arch/gcn3/registers.cc
@@ -141,6 +141,8 @@
  *
  */
 regIdx = numScalarRegs - 2;
+} else if (idx == REG_VCC_HI) {
+regIdx = numScalarRegs - 1;
 } else if (idx == REG_FLAT_SCRATCH_LO) {
 /**
  * the FLAT_SCRATCH register occupies the two SRF entries

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29971
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ib8846656e18aad04ccb8c9112bc629c69078fe36
Gerrit-Change-Number: 29971
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: No RF scheduling in case of SKIP or EMPTY

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez, Alexandru Duțu,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29970

to review the following change.


Change subject: gpu-compute: No RF scheduling in case of SKIP or EMPTY
..

gpu-compute: No RF scheduling in case of SKIP or EMPTY

In case of flat memory instructions the status for the
LM pipe execution unit is set to SKIP or EMPTY, as the bus
between the VRF and the GM and LM pipe is shared. The
destination operands should not be scheduled for the LM pipe,
even if the wave is in the dispatch list. This can lead
to deadlock in the destination cache as DCEs are reused
and the slotsAvailableForBank count gets artificially
incremented.

Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd
---
M src/gpu-compute/schedule_stage.cc
1 file changed, 5 insertions(+), 1 deletion(-)



diff --git a/src/gpu-compute/schedule_stage.cc  
b/src/gpu-compute/schedule_stage.cc

index 0785aa0..e0600a6 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -236,9 +236,13 @@
 ScheduleStage::scheduleRfDestOperands()
 {
 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
-if (!dispatchList->at(j).first) {
+if (dispatchList->at(j).second == EMPTY ||
+dispatchList->at(j).second == SKIP) {
 continue;
 }
+
+assert(dispatchList->at(j).first);
+
 // get the wave on dispatch list and attempt to allocate write
 // resources in the RFs
 Wavefront *w = dispatchList->at(j).first;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29970
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I2230c53e3bc1032d2cccbe00fab62c99ab8de6cd
Gerrit-Change-Number: 29970
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Alexandru Duțu 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix stride bug in buffer OOB detection logic

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Michael LeBeane, Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29968

to review the following change.


Change subject: arch-gcn3: Fix stride bug in buffer OOB detection logic
..

arch-gcn3: Fix stride bug in buffer OOB detection logic

The out-of-range logic for buffer accesses is missing the top 4 bits of
const_stride when dealing with scratch buffers.  This can cause
perfectly valid scratch accesses to be suppressed when const_stride is
large.

Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e
---
M src/arch/gcn3/insts/op_encodings.hh
1 file changed, 3 insertions(+), 3 deletions(-)



diff --git a/src/arch/gcn3/insts/op_encodings.hh  
b/src/arch/gcn3/insts/op_encodings.hh

index 202dd1d..b35fb3d 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -651,7 +651,7 @@
  * non-formatted accesses, this is done on a per-lane
  * basis.
  */
-if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
+if (stride == 0 || !rsrc_desc.swizzleEn) {
 if (buf_off + stride * buf_idx >=
 rsrc_desc.numRecords - s_offset.rawData()) {
 DPRINTF(GCN3, "mubuf out-of-bounds condition  
1: "

@@ -659,13 +659,13 @@
 "const_stride = %llx, "
 "const_num_records = %llx\n",
 lane, buf_off + stride * buf_idx,
-rsrc_desc.stride,  
rsrc_desc.numRecords);

+stride, rsrc_desc.numRecords);
 oobMask.set(lane);
 continue;
 }
 }

-if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
+if (stride != 0 && rsrc_desc.swizzleEn) {
 if (buf_idx >= rsrc_desc.numRecords ||
 buf_off >= stride) {
 DPRINTF(GCN3, "mubuf out-of-bounds condition  
2: "


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29968
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8f94d44c242fda26cf6dfb75db04fa3aca934b3e
Gerrit-Change-Number: 29968
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Replace some instances of std::isnormal with std::fpclassify

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29967

to review the following change.


Change subject: arch-gcn3: Replace some instances of std::isnormal with  
std::fpclassify

..

arch-gcn3: Replace some instances of std::isnormal with std::fpclassify

Affected instructions: V_DIV_SCALE_F64, V_CMP_CLASS_F64,
V_CMPX_CLASS_F64 and their VOPC, VOP3, F32 variants.

These instances of std::isnormal were being used to check for
subnormal (denorms) values. std::isnormal is not specific enough.
It returns true for normal values but false for NaN, Inf, 0.0, and
subnormals. std::fpclassify returns a distinct constant for each category
of floating point number, so now we only catch subnormals.

Change-Id: I8d8f4452ff58de71e7c8e0b2b5e73467b532e196
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 23 insertions(+), 21 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 302dad4..9987fad 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -9439,7 +9439,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9463,7 +9463,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9551,7 +9551,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9575,7 +9575,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9664,7 +9664,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9688,7 +9688,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9777,7 +9777,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -9801,7 +9801,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 vcc.setBit(lane, 1);
 continue;
@@ -15550,7 +15550,7 @@
 }
 if (bits(src1[lane], 4)) {
 // is -denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && std::signbit(src0[lane])) {
 sdst.setBit(lane,  1);
 continue;
@@ -15574,7 +15574,7 @@
 }
 if (bits(src1[lane], 7)) {
 // is +denormal
-if (!std::isnormal(src0[lane])
+if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
 && !std::signbit(src0[lane])) {
 sdst.setBit(lane,  1);
 continue;
@@ 

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add memcpy condition when writing EXEC_LO

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Matthew Poremba, Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29961

to review the following change.


Change subject: arch-gcn3: Add memcpy condition when writing EXEC_LO
..

arch-gcn3: Add memcpy condition when writing EXEC_LO

Some compilers emit an error on the operand template class when writing
exec mask. Add a condition to explicitly set memcpy size argument to
32b or 64b based on the number of dwords.

Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624
---
M src/arch/gcn3/operand.hh
1 file changed, 9 insertions(+), 2 deletions(-)



diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh
index 9d28deb..97c6310 100644
--- a/src/arch/gcn3/operand.hh
+++ b/src/arch/gcn3/operand.hh
@@ -437,8 +437,15 @@
 if (_opIdx == REG_EXEC_LO) {
 ScalarRegU64 new_exec_mask_val
 = wf->execMask().to_ullong();
-std::memcpy((void*)&new_exec_mask_val,
-(void*)srfData.data(), sizeof(srfData));
+if (NumDwords == 1) {
+std::memcpy((void*)&new_exec_mask_val,
+(void*)srfData.data(), sizeof(VecElemU32));
+} else if (NumDwords == 2) {
+std::memcpy((void*)&new_exec_mask_val,
+(void*)srfData.data(), sizeof(VecElemU64));
+} else {
+panic("Trying to write more than 2 DWORDS to  
EXEC\n");

+}
 VectorMask new_exec_mask(new_exec_mask_val);
 wf->execMask() = new_exec_mask;
 DPRINTF(GPUSRF, "Write EXEC\n");

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29961
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I49b0e4a1680283e772d0a5a8efd687b31d4f1624
Gerrit-Change-Number: 29961
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Matthew Poremba 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Remove invalid assert when reading EXEC_LO

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29960

to review the following change.


Change subject: arch-gcn3: Remove invalid assert when reading EXEC_LO
..

arch-gcn3: Remove invalid assert when reading EXEC_LO

This assert assumed all reads to EXEC_LO would be
64b, that is, we would always read the entire EXEC
mask. This is invalid as some kernels read only
the low 32b of EXEC.

The write to EXEC_LO is also updated to handle 32b
writes.

Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358
---
M src/arch/gcn3/operand.hh
1 file changed, 3 insertions(+), 3 deletions(-)



diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh
index 960d05e..9d28deb 100644
--- a/src/arch/gcn3/operand.hh
+++ b/src/arch/gcn3/operand.hh
@@ -435,9 +435,10 @@

 if (!isScalarReg(_opIdx)) {
 if (_opIdx == REG_EXEC_LO) {
-ScalarRegU64 new_exec_mask_val(0);
+ScalarRegU64 new_exec_mask_val
+= wf->execMask().to_ullong();
 std::memcpy((void*)&new_exec_mask_val,
-(void*)srfData.data(), sizeof(new_exec_mask_val));
+(void*)srfData.data(), sizeof(srfData));
 VectorMask new_exec_mask(new_exec_mask_val);
 wf->execMask() = new_exec_mask;
 DPRINTF(GPUSRF, "Write EXEC\n");
@@ -513,7 +514,6 @@
 switch(_opIdx) {
   case REG_EXEC_LO:
 {
-assert(NumDwords == 2);
 ScalarRegU64 exec_mask = _gpuDynInst->wavefront()->
 execMask().to_ullong();
 std::memcpy((void*)srfData.data(), (void*)&exec_mask,

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29960
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ifeb167578515bf112b1eab70bbf2201a5e936358
Gerrit-Change-Number: 29960
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fix Y-dimension ABI decode

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Michael LeBeane, Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29965

to review the following change.


Change subject: gpu-compute: Fix Y-dimension ABI decode
..

gpu-compute: Fix Y-dimension ABI decode

We currently have a bug in decoding workitem ID from the kernel
descriptor with multiple dimensions.  The enable_vgpr_workitem_id bits
are currently separated into y and z components, when they should be
treated as a single 2 bit value, where y is enabled when it is > 0,
and z is enabled when it is > 1.  The current setup allows a kernel
launch with vgprs reserved for the z dimension and not the y dimension,
which is incorrect.

Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323
---
M src/gpu-compute/hsa_queue_entry.hh
M src/gpu-compute/kernel_code.hh
2 files changed, 3 insertions(+), 4 deletions(-)



diff --git a/src/gpu-compute/hsa_queue_entry.hh  
b/src/gpu-compute/hsa_queue_entry.hh

index 5fc5e56..ea79869 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -417,8 +417,8 @@
  * workitem Id in the X dimension is always initialized.
  */
 initialVgprState.set(WorkitemIdX, true);
-initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y);
-initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z);
+initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id >  
0);
+initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id >  
1);

 }

 // name of the kernel associated with the AQL entry
diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh
index b3560c7..680dd72 100644
--- a/src/gpu-compute/kernel_code.hh
+++ b/src/gpu-compute/kernel_code.hh
@@ -130,8 +130,7 @@
 uint32_t enable_sgpr_workgroup_id_y : 1;
 uint32_t enable_sgpr_workgroup_id_z : 1;
 uint32_t enable_sgpr_workgroup_info : 1;
-uint32_t enable_vgpr_workitem_id_y : 1;
-uint32_t enable_vgpr_workitem_id_z : 1;
+uint32_t enable_vgpr_workitem_id : 2;
 uint32_t enable_exception_address_watch : 1;
 uint32_t enable_exception_memory_violation : 1;
 uint32_t granulated_lds_size : 9;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29965
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Iee64b207feb95bcf064898d5db33b8f201e25323
Gerrit-Change-Number: 29965
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix VOP3 V_LDEXP_F64

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29966

to review the following change.


Change subject: arch-gcn3: Fix VOP3 V_LDEXP_F64
..

arch-gcn3: Fix VOP3 V_LDEXP_F64

Replaced !std::isnormal with std::fpclassify because std::isnormal
is not specific enough. !std::isnormal was incorrectly catching
NaN, Inf, 0.0, and subnormals (aka denormals), whereas it was only
supposed to catch subnormals.

The return value and error handling spec of std::ldexp listed on
cppreference.com appears to match up in nearly all cases after
making these changes. If std::ldexp handled subnormals as described
in the GCN3 2016 guide, we could have used vdst[lane] = std::ldexp
and not need to check for any corner cases.

Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 4 insertions(+), 3 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 2b992b1..302dad4 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -30282,10 +30282,11 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-if (std::isnan(src1[lane]) || std::isinf(src1[lane])) {
+if (std::isnan(src0[lane]) || std::isinf(src0[lane])) {
 vdst[lane] = src0[lane];
-} else if (!std::isnormal(src1[lane])) {
-if (std::signbit(src1[lane])) {
+} else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
+   || std::fpclassify(src0[lane]) == FP_ZERO) {
+if (std::signbit(src0[lane])) {
 vdst[lane] = -0.0;
 } else {
 vdst[lane] = +0.0;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29966
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I4c77af77c3b7798f86d40442610cef1296a28441
Gerrit-Change-Number: 29966
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Don't track vector store insts in CU's headTailMap

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29963

to review the following change.


Change subject: gpu-compute: Don't track vector store insts in CU's  
headTailMap

..

gpu-compute: Don't track vector store insts in CU's headTailMap

This change fixes a memory leak due to live GPUDynInstPtr references
to vector store insts being stored in the CU's headTailMap and never
released.

This happened because store insts are not supposed to have their
head-tail latencies tracked by the headTailMap; instead they use
timing information from the GPUCoalescer. When updating the
headTailLatency stat via the headTailMap, only loads were considered
and removed from the headTailMap, however when inserting into the
headTailMap loads and stores were considered, thus leading to the
memory leak.

This change fixes the issue by only adding loads to the headTailMap.

Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1
---
M src/gpu-compute/compute_unit.cc
1 file changed, 5 insertions(+), 3 deletions(-)



diff --git a/src/gpu-compute/compute_unit.cc  
b/src/gpu-compute/compute_unit.cc

index f3387a7..653c074 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1389,9 +1389,11 @@
 gpuDynInst->wfSlotId);
 }
 } else {
-if (!compute_unit->headTailMap.count(gpuDynInst)) {
-compute_unit->headTailMap.insert(
-std::make_pair(gpuDynInst, curTick()));
+if (pkt->isRead()) {
+if (!compute_unit->headTailMap.count(gpuDynInst)) {
+compute_unit->headTailMap
+.insert(std::make_pair(gpuDynInst, curTick()));
+}
 }
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29963
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8a8f5b79f55e00481ae5e82519a9ed627a7ecbd1
Gerrit-Change-Number: 29963
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29964

to review the following change.


Change subject: arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and  
V_RNDNE_F32

..

arch-gcn3: Fix roundNearestEven for V_RNDNE_F64 and V_RNDNE_F32

roundNearestEven is an inst_util function that RNDNE_F64 and F32
call, including both VOP1 and VOP3 formats. IEEE 754 spec says this
function should round inputs to the nearest integer but round ties
to the nearest even integer. Prior to this patch it was rounding all
inputs to nearest even, not just the ties. It was probably implemented
this way originally because the language in the ISA manual is ambiguous
although it provided the correct logic.

Fixed roundNearestEven to use the semantics originally described in
the GCN3 ISA manual.

Change-Id: I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7
---
M src/arch/gcn3/insts/inst_util.hh
1 file changed, 7 insertions(+), 1 deletion(-)



diff --git a/src/arch/gcn3/insts/inst_util.hh  
b/src/arch/gcn3/insts/inst_util.hh

index b40e890..15ffe9a 100644
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -258,7 +258,13 @@
 template<typename T>
 inline T roundNearestEven(T val)
 {
-T nearest_round = std::round(val * 0.5) * 2.0;
+T int_part = 0;
+T nearest_round = std::floor(val + 0.5);
+if ((int)std::floor(val) % 2 == 0
+&& std::modf(std::abs(val), &int_part) == 0.5) {
+  nearest_round = nearest_round - 1;
+}
+
 return nearest_round;
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29964
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I83ecb1d516fcf5bdf17e54ddf409b447a129a9a7
Gerrit-Change-Number: 29964
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add all s_buffer_load_dword instructions

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29962

to review the following change.


Change subject: arch-gcn3: add all s_buffer_load_dword instructions
..

arch-gcn3: add all s_buffer_load_dword instructions

Adds the other s_buffer_load_dword* instruction implementations to
f134a84.

Change-Id: I8d97527278900dc68c32463ea1824409ccd04e1d
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 125 insertions(+), 8 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 002c4d5..2b992b1 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -4737,17 +4737,46 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDynInst, rsrcDesc, offset);
+
+gpuDynInst->computeUnit()->scalarMemoryPipe
+.getGMReqFIFO().push(gpuDynInst);
+
+wf->scalarRdGmReqsInPipe--;
+wf->scalarOutstandingReqsRdGm++;
+gpuDynInst->wavefront()->outstandingReqs++;
+gpuDynInst->wavefront()->validateRequestCounters();
+} // execute

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
+initMemRead<1>(gpuDynInst);
 } // initiateAcc

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
 {
+// 1 request, size 32
+ScalarOperandU32 sdst(gpuDynInst, instData.SDATA);
+sdst.write();
 } // completeAcc

 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2(
@@ -4767,17 +4796,46 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDynInst, rsrcDesc, offset);
+
+gpuDynInst->computeUnit()->scalarMemoryPipe
+.getGMReqFIFO().push(gpuDynInst);
+
+wf->scalarRdGmReqsInPipe--;
+wf->scalarOutstandingReqsRdGm++;
+gpuDynInst->wavefront()->outstandingReqs++;
+gpuDynInst->wavefront()->validateRequestCounters();
+} // execute

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
+initMemRead<2>(gpuDynInst);
 } // initiateAcc

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
 {
+// use U64 because 2 requests, each size 32
+ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
+sdst.write();
 } // completeAcc

 Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4(
@@ -4797,17 +4855,46 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDynInst, rsrcDesc, offset);
+
+gpuDynInst->computeUnit()->scalarMemoryPipe
+.getGMReqFIFO().push(gpuDynInst);
+
+wf->scalarRdGmReqsInPipe--;
+wf->scalarOutstandingReqsRdGm++;
+gpuDyn

[gem5-dev] Change in gem5/gem5[develop]: gpu_compute: Support loading BLIT kernels

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Michael LeBeane, Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29959

to review the following change.


Change subject: gpu_compute: Support loading BLIT kernels
..

gpu_compute: Support loading BLIT kernels

The BLIT kernels used to implement DMA through the shaders don't fill
out all of the standard fields in an amd_kernel_code_t object.  This
patch modifies the code object parsing logic to support these new
kernels.

BLIT kernels are used in APUs when using ROCm memcopies for certain size
buffers, and are used for dGPUs when the SDMA engines are disabled.

Change-Id: Id4e667474d05e311097dbec443def07dfad14a79
---
M src/gpu-compute/gpu_command_processor.cc
M src/gpu-compute/hsa_queue_entry.hh
2 files changed, 31 insertions(+), 4 deletions(-)



diff --git a/src/gpu-compute/gpu_command_processor.cc  
b/src/gpu-compute/gpu_command_processor.cc

index b5e9452..aee3e1b 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -100,11 +100,25 @@
 machine_code_addr);

 Addr kern_name_addr(0);
-virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
-(uint8_t*)&kern_name_addr, 0x8);
-
 std::string kernel_name;
-virt_proxy.readString(kernel_name, kern_name_addr);
+
+/**
+ * BLIT kernels don't have symbol names.  BLIT kernels are built-in compute
+ * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
+ * hardware engines are unavailable or explicitly disabled.  They can also
+ * be used to do copies that ROCm thinks would be better performed
+ * by the shader than the SDMA engines.  They are also sometimes used on
+ * APUs to implement asynchronous memcopy operations from 2 pointers in
+ * host memory.  I have no idea what BLIT stands for.
+ * */
+if (akc.runtime_loader_kernel_symbol) {
+virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
+(uint8_t*)&kern_name_addr, 0x8);
+
+virt_proxy.readString(kernel_name, kern_name_addr);
+} else {
+kernel_name = "Blit kernel";
+}

 DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

diff --git a/src/gpu-compute/hsa_queue_entry.hh  
b/src/gpu-compute/hsa_queue_entry.hh

index a6917db..5fc5e56 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -88,6 +88,19 @@
   _globalWgId(0), dispatchComplete(false)

 {
+// Precompiled BLIT kernels actually violate the spec a bit
+// and don't set many of the required akc fields.  For these kernels,
+// we need to rip register usage from the resource registers.
+//
+// We can't get an exact number of registers from the resource
+// registers because they round, but we can get an upper bound on it
+if (!numVgprs)
+numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+
+// TODO: Granularity changes for GFX9!
+if (!numSgprs)
+numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
+
 initialVgprState.reset();
 initialSgprState.reset();


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29959
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Id4e667474d05e311097dbec443def07dfad14a79
Gerrit-Change-Number: 29959
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Michael LeBeane 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29958

to review the following change.


Change subject: arch-gcn3: Implement ds_swizzle
..

arch-gcn3: Implement ds_swizzle

Change-Id: I7d188388afa16932217ae207368666a724207c52
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 102 insertions(+), 2 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 71efd8f..002c4d5 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32266,6 +32266,7 @@
 Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_swizzle_b32")
 {
+ setFlag(Load);
 } // Inst_DS__DS_SWIZZLE_B32

 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()
@@ -32277,8 +32278,107 @@
 void
 Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+wf->rdLmReqsInPipe--;
+wf->validateRequestCounters();
+
+if (gpuDynInst->exec_mask.none()) {
+return;
+}
+
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+/**
+ * The "DS pattern" is comprised of both offset fields. That is, the
+ * swizzle pattern between lanes. Bit 15 of the DS pattern dictates
+ * which swizzle mode to use. There are two different swizzle
+ * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use
+ * QDMode else use Bit-masks mode. The remaining bits dictate how to
+ * swizzle the lanes.
+ *
+ * QDMode:  Chunks the lanes into 4s and swizzles among them.
+ *  Bits 7:6 dictate where lane 3 (of the current chunk)
+ *  gets its data, 5:4 lane 2, etc.
+ *
+ * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks.
+ *  14:10 is the xor_mask, 9:5 is the or_mask, and 4:0
+ *  is the and_mask. Each lane is swizzled by performing
+ *  the appropriate operation using these masks.
+ */
+VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0);
+
+data.read();
+
+if (bits(ds_pattern, 15)) {
+// QDMode
+for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) {
+/**
+ * This operation allows data sharing between groups
+ * of four consecutive threads. Note the increment by
+ * 4 in the for loop.
+ */
+if (gpuDynInst->exec_mask[lane]) {
+int index0 = lane + bits(ds_pattern, 1, 0);
+panic_if(index0 >= NumVecElemPerVecReg, "%s: index0  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index0);
+vdst[lane]
+= gpuDynInst->exec_mask[index0] ? data[index0]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 1]) {
+int index1 = lane + bits(ds_pattern, 3, 2);
+panic_if(index1 >= NumVecElemPerVecReg, "%s: index1  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index1);
+vdst[lane + 1]
+= gpuDynInst->exec_mask[index1] ? data[index1]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 2]) {
+int index2 = lane + bits(ds_pattern, 5, 4);
+panic_if(index2 >= NumVecElemPerVecReg, "%s: index2  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index2);
+vdst[lane + 2]
+= gpuDynInst->exec_mask[index2] ? data[index2]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 3]) {
+int index3 = lane + bits(ds_pattern, 7, 6);
+panic_if(index3 >= NumVecElemPerVecReg, "%s: index3  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index3);
+vdst[lane + 3]
+= gpuDynInst->exec_mask[index3] ? data[index3]: 0;
+}
+}
+} else {
+// Bit Mode
+int and_mask = bits(ds_pa

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement s_buffer_load_dwordx16

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29957

to review the following change.


Change subject: arch-gcn3: Implement s_buffer_load_dwordx16
..

arch-gcn3: Implement s_buffer_load_dwordx16

Change-Id: I25382dcae9bb55eaf035385fa925157f25d39c20
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
2 files changed, 90 insertions(+), 31 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 567cc10..71efd8f 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -4857,17 +4857,45 @@
 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ScalarRegU32 offset(0);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);
+
+rsrcDesc.read();
+
+if (instData.IMM) {
+offset = extData.OFFSET;
+} else {
+ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
+off_sgpr.read();
+offset = off_sgpr.rawData();
+}
+
+calcAddr(gpuDynInst, rsrcDesc, offset);
+
+gpuDynInst->computeUnit()->scalarMemoryPipe
+.getGMReqFIFO().push(gpuDynInst);
+
+wf->scalarRdGmReqsInPipe--;
+wf->scalarOutstandingReqsRdGm++;
+gpuDynInst->wavefront()->outstandingReqs++;
+gpuDynInst->wavefront()->validateRequestCounters();
+} // execute

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr  
gpuDynInst)

 {
+initMemRead<16>(gpuDynInst);
 } // initiateAcc

 void
 Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr  
gpuDynInst)

 {
+ScalarOperandU512 sdst(gpuDynInst, instData.SDATA);
+sdst.write();
 } // completeAcc

 Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt)
diff --git a/src/arch/gcn3/insts/op_encodings.hh  
b/src/arch/gcn3/insts/op_encodings.hh

index 4056f0a..202dd1d 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -46,6 +46,29 @@

 namespace Gcn3ISA
 {
+struct BufferRsrcDescriptor
+{
+uint64_t baseAddr : 48;
+uint32_t stride : 14;
+uint32_t cacheSwizzle : 1;
+uint32_t swizzleEn : 1;
+uint32_t numRecords : 32;
+uint32_t dstSelX : 3;
+uint32_t dstSelY : 3;
+uint32_t dstSelZ : 3;
+uint32_t dstSelW : 3;
+uint32_t numFmt : 3;
+uint32_t dataFmt : 4;
+uint32_t elemSize : 2;
+uint32_t idxStride : 2;
+uint32_t addTidEn : 1;
+uint32_t atc : 1;
+uint32_t hashEn : 1;
+uint32_t heap : 1;
+uint32_t mType : 3;
+uint32_t type : 2;
+};
+
 // --- purely virtual instruction classes ---

 class Inst_SOP2 : public GCN3GPUStaticInst
@@ -197,14 +220,45 @@
 MemCmd::WriteReq);
 }

+/**
+ * For normal s_load_dword/s_store_dword instruction addresses.
+ */
 void
-calcAddr(GPUDynInstPtr gpuDynInst, ConstScalarOperandU64 &addr,
-ScalarRegU32 offset)
+calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
+ ScalarRegU32 offset)
 {
-Addr vaddr = addr.rawData();
-vaddr += offset;
-vaddr &= ~0x3;
-gpuDynInst->scalarAddr = vaddr;
+Addr vaddr = ((addr.rawData() + offset) & ~0x3);
+gpu_dyn_inst->scalarAddr = vaddr;
+}
+
+/**
+ * For s_buffer_load_dword/s_buffer_store_dword instruction addresses.
+ * The s_buffer instructions use the same buffer resource descriptor
+ * as the MUBUF instructions.
+ */
+void
+calcAddr(GPUDynInstPtr gpu_dyn_inst,
+ ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
+{
+BufferRsrcDescriptor rsrc_desc;
+ScalarRegU32 clamped_offset(offset);
+std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
+sizeof(BufferRsrcDescriptor));
+
+/**
+ * The address is clamped if:
+ * Stride is zero: clamp if offset >= num_records
+ * Stride is non-zero: clamp if offset > (stride * num_records)
+ */
+if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
+clamped_offset = rsrc_desc.numRecords;
+} else if (rsrc_de

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement instruction s_setreg_b32

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez, Xianwei Zhang,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29949

to review the following change.


Change subject: arch-gcn3: implement instruction s_setreg_b32
..

arch-gcn3: implement instruction s_setreg_b32

Instruction s_setreg_b32 was unimplemented, but is used by hipified
rodinia 'srad'. The instruction sets values of hardware internal
registers. If the instruction is writing into MODE to control
single-precision FP round and denorm modes, a simple warning will be
printed; for all other cases (non-MODE hw register or other
precisions), a panic will occur.

Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 27 insertions(+), 0 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 6ffd049..8b72e0d 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -1800,6 +1800,7 @@
 Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt)
 : Inst_SOPK(iFmt, "s_setreg_b32")
 {
+setFlag(ALU);
 } // Inst_SOPK__S_SETREG_B32

 Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32()
@@ -1813,6 +1814,32 @@
 void
 Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst)
 {
+ScalarRegI16 simm16 = instData.SIMM16;
+ScalarRegU32 hwregId = simm16 & 0x3f;
+ScalarRegU32 offset = (simm16 >> 6) & 31;
+ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;
+
+ScalarOperandU32 hwreg(gpuDynInst, hwregId);
+ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+hwreg.read();
+sdst.read();
+
+// Store value from SDST to part of the hardware register.
+ScalarRegU32 mask = (((1U << size) - 1U) << offset);
+hwreg = ((hwreg.rawData() & ~mask)
+| ((sdst.rawData() << offset) & mask));
+hwreg.write();
+
+// set MODE register to control the behavior of single precision
+// floating-point numbers: denormal mode or round mode
+if (hwregId==1 && size==2
+&& (offset==4 || offset==0)) {
+warn_once("Be cautious that s_setreg_b32 has no real effect "
+"on FP modes: %s\n",  
gpuDynInst->disassemble());

+return;
+}
+
+// panic if not changing MODE of floating-point numbers
 panicUnimplemented();
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29949
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Idb1cd5f60548a146bc980f1a27faff30259e74ce
Gerrit-Change-Number: 29949
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-Reviewer: Xianwei Zhang 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: fixed scale,fixup,fmas f64 ops

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29955

to review the following change.


Change subject: arch-gcn3: fixed scale,fixup,fmas f64 ops
..

arch-gcn3: fixed scale,fixup,fmas f64 ops

Change-Id: Ie13794554db8a958fda1f7103ec18058fda2e66d
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 65 insertions(+), 17 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index a7b8923..a25ec17 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -28952,22 +28952,34 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-if (std::fpclassify(src1[lane]) == FP_ZERO) {
-if (std::signbit(src1[lane])) {
-vdst[lane] = -INFINITY;
-} else {
-vdst[lane] = +INFINITY;
-}
-} else if (std::isnan(src2[lane]) ||  
std::isnan(src1[lane])) {

-vdst[lane] = NAN;
-} else if (std::isinf(src1[lane])) {
-if (std::signbit(src1[lane])) {
-vdst[lane] = -INFINITY;
-} else {
-vdst[lane] = +INFINITY;
-}
+int signOut = std::signbit(src1[lane]) ^
+  std::signbit(src2[lane]);
+int exp1, exp2;
+std::frexp(src1[lane],&exp1);
+std::frexp(src2[lane],&exp2);
+if (std::isnan(src2[lane])) {
+vdst[lane] = src2[lane];
+} else if (std::isnan(src1[lane])) {
+vdst[lane] = src1[lane];
+} else if (src1[lane] == 0.0 && src2[lane] == 0.0) {
+vdst[lane] = -NAN;
+} else if (std::isinf(src1[lane]) &&  
std::isinf(src2[lane])) {

+vdst[lane] = -NAN;
+} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) {
+vdst[lane] = signOut ? -INFINITY : +INFINITY;
+} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) {
+vdst[lane] = signOut ? -0.0 : +0.0;
+} else if (exp2 - exp1 < -1075) {
+warn_once("fixup_f64 unimplemented case:"
+  "exp2 - ex1 < -1075");
+vdst[lane] = src0[lane];
+} else if (exp1 == 2047) {
+warn_once("fixup_f64 unimplemented case:"
+  "exp1 == 2047");
+vdst[lane] = src0[lane];
 } else {
-vdst[lane] = src2[lane] / src1[lane];
+vdst[lane] = ((uint64_t)signOut<<63) |
+((uint64_t)src0[lane] & 0x7fffULL);
 }
 }
 }
@@ -29077,8 +29089,37 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-vdst[lane] = src0[lane];
+int exp1, exp2;
+std::frexp(src1[lane],&exp1);
+std::frexp(src2[lane],&exp2);
 vcc.setBit(lane, 0);
+if (src2[lane] == 0 || src1[lane] == 0) {
+vdst[lane] = NAN;
+} else if (exp2 - exp1 >= 768) {
+vcc.setBit(lane, 1);
+if (src0[lane] == src1[lane]) {
+vdst[lane] = std::ldexp(src0[lane],128);
+}
+} else if (exp1 == 0) {
+vdst[lane] = std::ldexp(src0[lane],128);
+} else if (exp1 >= 0x7fd && exp2 - exp1 <= -768) {
+vcc.setBit(lane, 1);
+if (src0[lane] == src1[lane]) {
+vdst[lane] = std::ldexp(src0[lane],-128);
+}
+} else if (exp1 >= 0x7fd) {
+vdst[lane] = std::ldexp(src0[lane],-128);
+} else if (exp2 - exp1 <= -768) {
+vcc.setBit(lane, 1);
+if (src0[lane] != src2[lane]) {
+vdst[lane] = std::ldexp(src0[lane],128);
+}
+} else if (exp2 <= 53) {
+vdst[lane] = std::ldexp(src0[lane],128);
+}
+else {
+vdst[lane] = src0[lane];
+}
 }
 }

@@ -29171,10 +29212,12 @@
 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
 VecOperandF64 vdst(gpuDynInst, instData.VDST);
+ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

 src0.readSrc();
 

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fixup DIV instructions

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29956

to review the following change.


Change subject: arch-gcn3: Fixup DIV instructions
..

arch-gcn3: Fixup DIV instructions

Adds support to handle the special cases
for GCN3 DIV instructions.

Change-Id: I18f91870e802407c93831f313ce76be053bc4230
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 44 insertions(+), 42 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index a25ec17..567cc10 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -28952,34 +28952,35 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-int signOut = std::signbit(src1[lane]) ^
-  std::signbit(src2[lane]);
-int exp1, exp2;
-std::frexp(src1[lane],&exp1);
-std::frexp(src2[lane],&exp2);
-if (std::isnan(src2[lane])) {
-vdst[lane] = src2[lane];
-} else if (std::isnan(src1[lane])) {
-vdst[lane] = src1[lane];
-} else if (src1[lane] == 0.0 && src2[lane] == 0.0) {
-vdst[lane] = -NAN;
+int sign_out = std::signbit(src1[lane])
+  ^ std::signbit(src2[lane]);
+int exp1(0);
+int exp2(0);
+std::frexp(src1[lane], &exp1);
+std::frexp(src2[lane], &exp2);
+
+if (std::isnan(src1[lane]) || std::isnan(src2[lane])) {
+vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN();
+} else if (std::fpclassify(src1[lane]) == FP_ZERO
+   && std::fpclassify(src2[lane]) == FP_ZERO) {
+vdst[lane]
+= std::numeric_limits<VecElemF64>::signaling_NaN();
 } else if (std::isinf(src1[lane]) &&  
std::isinf(src2[lane])) {

-vdst[lane] = -NAN;
-} else if (src1[lane] == 0.0 || std::isinf(src2[lane])) {
-vdst[lane] = signOut ? -INFINITY : +INFINITY;
-} else if (src2[lane] == 0.0 || std::isinf(src1[lane])) {
-vdst[lane] = signOut ? -0.0 : +0.0;
+vdst[lane]
+= std::numeric_limits<VecElemF64>::signaling_NaN();
+} else if (std::fpclassify(src1[lane]) == FP_ZERO
+   || std::isinf(src2[lane])) {
+vdst[lane] = sign_out ? -INFINITY : +INFINITY;
+} else if (std::isinf(src1[lane])
+   || std::fpclassify(src2[lane]) == FP_ZERO) {
+vdst[lane] = sign_out ? -0.0 : +0.0;
 } else if (exp2 - exp1 < -1075) {
-warn_once("fixup_f64 unimplemented case:"
-  "exp2 - ex1 < -1075");
 vdst[lane] = src0[lane];
 } else if (exp1 == 2047) {
-warn_once("fixup_f64 unimplemented case:"
-  "exp1 == 2047");
 vdst[lane] = src0[lane];
 } else {
-vdst[lane] = ((uint64_t)signOut<<63) |
-((uint64_t)src0[lane] & 0x7fffULL);
+vdst[lane] = sign_out ? -std::fabs(src0[lane])
+: std::fabs(src0[lane]);
 }
 }
 }
@@ -29089,36 +29090,37 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
-int exp1, exp2;
-std::frexp(src1[lane],&exp1);
-std::frexp(src2[lane],&exp2);
+int exp1(0);
+int exp2(0);
+std::frexp(src1[lane], &exp1);
+std::frexp(src2[lane], &exp2);
 vcc.setBit(lane, 0);
-if (src2[lane] == 0 || src1[lane] == 0) {
+
+if (std::fpclassify(src1[lane]) == FP_ZERO
+|| std::fpclassify(src2[lane]) == FP_ZERO) {
 vdst[lane] = NAN;
 } else if (exp2 - exp1 >= 768) {
 vcc.setBit(lane, 1);
 if (src0[lane] == src1[lane]) {
-vdst[lane] = std::ldexp(src0[lane],128);
+vdst[lane] = std::ldexp(src0[lane], 128);
 }
-} else if (exp1 == 0) {
-vdst[lane] = std::ldexp(src0[lane],128);
-} else if (exp1 >= 0x7fd && exp2 - exp1 <= -768) {
+} else if (!std::isnormal(src1[lane])) {
+vdst[lane] = std::ldexp(src0[lane], 128);
+} else if (!std::isnormal

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add handling for Inf/overflow in CVT insts

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29953

to review the following change.


Change subject: arch-gcn3: Add handling for Inf/overflow in CVT insts
..

arch-gcn3: Add handling for Inf/overflow in CVT insts

Change-Id: I0fddffdeaebd9f45fe89f44d536f80a43de63ff5
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 77 insertions(+), 1 deletion(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index e93278a..a7b8923 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -7260,8 +7260,16 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane]) || exp > 30) {
+if (std::signbit(src[lane])) {
+vdst[lane] = INT_MIN;
+} else {
+vdst[lane] = INT_MAX;
+}
 } else {
 vdst[lane] = (VecElemI32)src[lane];
 }
@@ -7386,8 +7394,18 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane])) {
+if (std::signbit(src[lane])) {
+vdst[lane] = 0;
+} else {
+vdst[lane] = UINT_MAX;
+}
+} else if (exp > 31) {
+vdst[lane] = UINT_MAX;
 } else {
 vdst[lane] = (VecElemU32)src[lane];
 }
@@ -7422,8 +7440,16 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane]) || exp > 30) {
+if (std::signbit(src[lane])) {
+vdst[lane] = INT_MIN;
+} else {
+vdst[lane] = INT_MAX;
+}
 } else {
 vdst[lane] = (VecElemI32)src[lane];
 }
@@ -7772,8 +7798,18 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane])) {
+if (std::signbit(src[lane])) {
+vdst[lane] = 0;
+} else {
+vdst[lane] = UINT_MAX;
+}
+} else if (exp > 31) {
+vdst[lane] = UINT_MAX;
 } else {
 vdst[lane] = (VecElemU32)src[lane];
 }
@@ -25075,8 +25111,16 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane]) || exp > 30) {
+if (std::signbit(src[lane])) {
+vdst[lane] = INT_MIN;
+} else {
+vdst[lane] = INT_MAX;
+}
 } else {
 vdst[lane] = (VecElemI32)src[lane];
 }
@@ -25235,8 +25279,18 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frexp(src[lane],&exp);
 if (std::isnan(src[lane])) {
 vdst[lane] = 0;
+} else if (std::isinf(src[lane])) {
+if (std::signbit(src[lane])) {
+vdst[lane] = 0;
+} else {
+vdst[lane] = UINT_MAX;
+}
+} else if (exp > 31) {
+vdst[lane] = UINT_MAX;
 } else {
 vdst[lane] = (VecElemU32)src[lane];
 }
@@ -25287,8 +25341,16 @@

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
+int exp;
+std::frex

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Fix s_getpc operand information

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29954

to review the following change.


Change subject: arch-gcn3: Fix s_getpc operand information
..

arch-gcn3: Fix s_getpc operand information

s_getpc was previously reporting only a single operand,
and was only considering the SSRC operand. However,
this instruction's source is implicitly the PC.
Because its destination register was never tracked for
dependence checking purposes, dependence violations
are possible.

Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6
---
M src/arch/gcn3/insts/instructions.hh
M src/arch/gcn3/insts/op_encodings.cc
2 files changed, 15 insertions(+), 10 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.hh  
b/src/arch/gcn3/insts/instructions.hh

index b0cc37e..f561043 100644
--- a/src/arch/gcn3/insts/instructions.hh
+++ b/src/arch/gcn3/insts/instructions.hh
@@ -5846,9 +5846,7 @@
 getOperandSize(int opIdx) override
 {
 switch (opIdx) {
-  case 0: //ssrc
-return 8;
-  case 1: //sdst
+  case 0: //sdst
 return 8;
   default:
 fatal("op idx %i out of bounds\n", opIdx);
@@ -5860,9 +5858,7 @@
 isSrcOperand(int opIdx) override
 {
 switch (opIdx) {
-  case 0: //ssrc
-return true;
-  case 1: //sdst
+  case 0: //sdst
 return false;
   default:
 fatal("op idx %i out of bounds\n", opIdx);
@@ -5874,9 +5870,7 @@
 isDstOperand(int opIdx) override
 {
 switch (opIdx) {
-  case 0: //ssrc
-return false;
-  case 1: //sdst
+  case 0: //sdst
 return true;
   default:
 fatal("op idx %i out of bounds\n", opIdx);
diff --git a/src/arch/gcn3/insts/op_encodings.cc  
b/src/arch/gcn3/insts/op_encodings.cc

index 22d0f48..997b22f 100644
--- a/src/arch/gcn3/insts/op_encodings.cc
+++ b/src/arch/gcn3/insts/op_encodings.cc
@@ -326,7 +326,12 @@

 switch (opIdx) {
   case 0:
-  return isScalarReg(instData.SSRC0);
+if (instData.OP == 0x1C) {
+// Special case for s_getpc, which has no source reg.
+// Instead, it implicitly reads the PC.
+return isScalarReg(instData.SDST);
+}
+return isScalarReg(instData.SSRC0);
   case 1:
   return isScalarReg(instData.SDST);
   default:
@@ -353,6 +358,12 @@

 switch (opIdx) {
   case 0:
+if (instData.OP == 0x1C) {
+// Special case for s_getpc, which has no source reg.
+// Instead, it implicitly reads the PC.
+return opSelectorToRegIdx(instData.SDST,
+gpuDynInst->wavefront()->reservedScalarRegs);
+}
 return opSelectorToRegIdx(instData.SSRC0,
 gpuDynInst->wavefront()->reservedScalarRegs);
   case 1:

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29954
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Ia80b8b3e24d5885f646a9ee41212a2cb35b9ffe6
Gerrit-Change-Number: 29954
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Add ds_bpermute and ds_permute insts

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29952

to review the following change.


Change subject: arch-gcn3: Add ds_bpermute and ds_permute insts
..

arch-gcn3: Add ds_bpermute and ds_permute insts

The implementation of these insts provided by this
change is based on the description provided here:

https://gpuopen.com/amd-gcn-assembly-cross-lane-operations/

Change-Id: Id63b6c34c9fdc6e0dbd445d859e7b209023f2874
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 113 insertions(+), 4 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 6e5ff42..e93278a 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32129,6 +32129,13 @@
 Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_permute_b32")
 {
+setFlag(MemoryRef);
+/**
+ * While this operation doesn't actually use DS storage we classify
+ * it as a load here because it does a writeback to a VGPR, which
+ * fits in better with the LDS pipeline logic.
+ */
+ setFlag(Load);
 } // Inst_DS__DS_PERMUTE_B32

 Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32()
@@ -32139,12 +32146,66 @@
 void
 Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+addr.read();
+data.read();
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+/**
+ * One of the offset fields can be used for the index.
+ * It is assumed OFFSET0 would be used, as OFFSET1 is
+ * typically only used for DS ops that operate on two
+ * disparate pieces of data.
+ */
+assert(!instData.OFFSET1);
+/**
+ * The address provided is a byte address, but VGPRs are
+ * 4 bytes, so we must divide by 4 to get the actual VGPR
+ * index. Additionally, the index is calculated modulo the
+ * WF size, 64 in this case, so we simply extract bits 7-2.
+ */
+int index = bits(addr[lane] + instData.OFFSET0, 7, 2);
+panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is  
out "

+ "of bounds.\n", gpuDynInst->disassemble(), index);
+/**
+ * If the shuffled index corresponds to a lane that is
+ * inactive then this instruction writes a 0 to the active
+ * lane in VDST.
+ */
+if (wf->execMask(index)) {
+vdst[index] = data[lane];
+} else {
+vdst[index] = 0;
+}
+}
+}
+
+vdst.write();
+
+wf->rdLmReqsInPipe--;
+wf->validateRequestCounters();
+} // execute
+// --- Inst_DS__DS_BPERMUTE_B32 class methods ---

 Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_bpermute_b32")
 {
+setFlag(MemoryRef);
+/**
+ * While this operation doesn't actually use DS storage we classify
+ * it as a load here because it does a writeback to a VGPR, which
+ * fits in better with the LDS pipeline logic.
+ */
+setFlag(Load);
 } // Inst_DS__DS_BPERMUTE_B32

 Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32()
@@ -32155,8 +32216,56 @@
 void
 Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+addr.read();
+data.read();
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+/**
+ * One of the offset field

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: convert vALU instruction counters from 32 to 64-bit

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29950

to review the following change.


Change subject: arch-gcn3: convert vALU instruction counters from 32 to 64-bit

..

arch-gcn3: convert vALU instruction counters from 32 to 64-bit

The vALU instruction counters were previously 32 bits, but for some
workloads this value wraps around and triggers an assert failure
because the max vALU operations are reached.  To resolve this, this
commit increases the counter size to 64 bits.

Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb
---
M src/gpu-compute/shader.hh
1 file changed, 2 insertions(+), 2 deletions(-)



diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 238f6e0..3e2e569 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -258,8 +258,8 @@
 Stats::Vector vectorInstDstOperand;
 void regStats();

-int max_valu_insts;
-int total_valu_insts;
+int64_t max_valu_insts;
+int64_t total_valu_insts;

 Shader(const Params *p);
 ~Shader();

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29950
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I90ed4514669485cfea7ccc37ba9d69665277bccb
Gerrit-Change-Number: 29950
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29948

to review the following change.


Change subject: arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo
..

arch-gcn3: add support for v_mbcnt_hi and v_mbcnt_lo

Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 60 insertions(+), 4 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 79e7dda..6ffd049 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -30309,8 +30309,36 @@
 void
 Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+VecOperandU32 vdst(gpuDynInst, instData.VDST);
+uint64_t threadMask = 0;
+
+src0.readSrc();
+src1.readSrc();
+
+/**
+ * input modifiers are supported by FP operations only
+ */
+assert(!(instData.ABS & 0x1));
+assert(!(instData.ABS & 0x2));
+assert(!(instData.ABS & 0x4));
+assert(!(extData.NEG & 0x1));
+assert(!(extData.NEG & 0x2));
+assert(!(extData.NEG & 0x4));
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+threadMask = ((1LL << lane) - 1LL);
+vdst[lane] = popCount(src0[lane] & bits(threadMask, 31,  
0)) +

+ src1[lane];
+}
+}
+
+vdst.write();
+} // execute
+// --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---

 Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32(
   InFmt_VOP3 *iFmt)
@@ -30330,8 +30358,36 @@
 void
 Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+VecOperandU32 vdst(gpuDynInst, instData.VDST);
+uint64_t threadMask = 0;
+
+src0.readSrc();
+src1.readSrc();
+
+/**
+ * input modifiers are supported by FP operations only
+ */
+assert(!(instData.ABS & 0x1));
+assert(!(instData.ABS & 0x2));
+assert(!(instData.ABS & 0x4));
+assert(!(extData.NEG & 0x1));
+assert(!(extData.NEG & 0x2));
+assert(!(extData.NEG & 0x4));
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (wf->execMask(lane)) {
+threadMask = ((1LL << lane) - 1LL);
+vdst[lane] = popCount(src0[lane] & bits(threadMask, 63,  
32)) +

+ src1[lane];
+}
+}
+
+vdst.write();
+} // execute
+// --- Inst_VOP3__V_LSHLREV_B64 class methods ---

 Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3 *iFmt)
 : Inst_VOP3(iFmt, "v_lshlrev_b64", false)

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29948
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I1c70fe693c904f1abd7d5a2b99220c74a075eae5
Gerrit-Change-Number: 29948
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s


[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: ds_read_u8 and ds_read_u16 fix

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez, Alexandru Duțu,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29951

to review the following change.


Change subject: arch-gcn3: ds_read_u8 and ds_read_u16 fix
..

arch-gcn3: ds_read_u8 and ds_read_u16 fix

This changeset zero-extends the destination register
for ds_read_u8 and ds_read_u16 instructions.

Change-Id: I193adadd68adf2572b59743b1504f18ad225f506
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 4 insertions(+), 4 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 8b72e0d..6e5ff42 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32016,11 +32016,11 @@
 void
 Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst)
 {
-VecOperandU8 vdst(gpuDynInst, extData.VDST);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (gpuDynInst->exec_mask[lane]) {
-vdst[lane] = (reinterpret_cast(
+vdst[lane] = (VecElemU32)(reinterpret_cast(
 gpuDynInst->d_data))[lane];
 }
 }
@@ -32096,11 +32096,11 @@
 void
 Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst)
 {
-VecOperandU16 vdst(gpuDynInst, extData.VDST);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);

 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (gpuDynInst->exec_mask[lane]) {
-vdst[lane] = (reinterpret_cast(
+vdst[lane] = (VecElemU32)(reinterpret_cast(
 gpuDynInst->d_data))[lane];
 }
 }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29951
To unsubscribe, or for help writing mail filters, visit  
https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I193adadd68adf2572b59743b1504f18ad225f506
Gerrit-Change-Number: 29951
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez 
Gerrit-Reviewer: Alexandru Duțu 
Gerrit-Reviewer: Tony Gutierrez 
Gerrit-MessageType: newchange
___
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: implement multi-dword buffer loads and stores

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29946

to review the following change.


Change subject: arch-gcn3: implement multi-dword buffer loads and stores
..

arch-gcn3: implement multi-dword buffer loads and stores

Add support for all multi-dword buffer loads and stores:
buffer_load_dword x2, x3, and x4 and buffer_store_dword x2, x3, and x4

Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
2 files changed, 504 insertions(+), 18 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 817b339..b852281 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34777,7 +34777,11 @@
 {
 setFlag(MemoryRef);
 setFlag(Load);
-setFlag(GlobalSegment);
+if (instData.LDS) {
+setFlag(GroupSegment);
+} else {
+setFlag(GlobalSegment);
+}
 } // Inst_MUBUF__BUFFER_LOAD_DWORDX2

 Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2()
@@ -34788,17 +34792,88 @@
 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->exec_mask = wf->execMask();
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+rsrcDesc.read();
+offset.read();
+
+int inst_offset = instData.OFFSET;
+
+if (!instData.IDXEN && !instData.OFFEN) {
+calcAddr(gpuDynInst,
+addr0, addr1, rsrcDesc, offset, inst_offset);
+} else if (!instData.IDXEN && instData.OFFEN) {
+addr0.read();
+calcAddr(gpuDynInst,
+addr0, addr1, rsrcDesc, offset, inst_offset);
+} else if (instData.IDXEN && !instData.OFFEN) {
+addr0.read();
+calcAddr(gpuDynInst,
+addr1, addr0, rsrcDesc, offset, inst_offset);
+} else {
+addr0.read();
+addr1.read();
+calcAddr(gpuDynInst,
+addr1, addr0, rsrcDesc, offset, inst_offset);
+}
+
+if (isLocalMem()) {
+gpuDynInst->computeUnit()->localMemoryPipe
+.issueRequest(gpuDynInst);
+wf->rdLmReqsInPipe--;
+wf->outstandingReqsRdLm++;
+} else {
+gpuDynInst->computeUnit()->globalMemoryPipe
+.issueRequest(gpuDynInst);
+wf->rdGmReqsInPipe--;
+wf->outstandingReqsRdGm++;
+}
+
+wf->outstandingReqs++;
+wf->validateRequestCounters();
+} // execute

 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
 {
+initMemRead<2>(gpuDynInst);
 } // initiateAcc

 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
 {
+VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+
+for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+if (gpuDynInst->exec_mask[lane]) {
+if (!oobMask[lane]) {
+vdst0[lane] = (reinterpret_cast(
+gpuDynInst->d_data))[lane * 2];
+vdst1[lane] = (reinterpret_cast(
+gpuDynInst->d_data))[lane * 2 + 1];
+} else {
+vdst0[lane] = 0;
+vdst1[lane] = 0;
+}
+}
+}
+
+vdst0.write();
+vdst1.write();
 } // completeAcc

 Inst_MUBUF__BUFFER_LOAD_DWORDX3
@@ -34807,7 +34882,11 @@
 {
 setFlag(MemoryRef);
 setFlag(Load);
-setFlag(GlobalSegment);
+if (instData.LDS) {
+setFlag(GroupSegment);
+} else {
+setFlag(GlobalSegment);
+}
 } // Inst_MUBUF__BUFFER_LOAD_DWORDX3

 Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3()
@@ -34818,17 +34897,93 @@
 void
 Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->exec_mask = wf->execMask();
+gpuDynInst->latency.init(gpuDynInst->comp

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute, arch-gcn3: refactor barriers

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29943

to review the following change.


Change subject: gpu-compute, arch-gcn3: refactor barriers
..

gpu-compute, arch-gcn3: refactor barriers

Barriers were not modeled properly. Firstly, barriers were
allocated to each WG that was launched, which is not
correct, and the CU would provide an infinite number
of barrier slots. There are a limited number of barrier slots
per CU in reality. In addition, the CU will not allocate
barrier slots to WGs with a single WF (nothing to sync if
only one WF).

Beyond modeling problems, there is also the issue of deadlock.
The barrier could deadlock because not all WFs are freed
from the barrier once it has been satisfied. Instead, we
relied on the scoreboard stage to release them lazily,
one-by-one.

Under this implementation the scoreboard may not fully release
all WFs participating in a barrier; this happens because the
first WF to be freed from the barrier could reach an s_barrier
instruction again, forever causing the barrier counts across
WFs to be out-of-sync.

This change refactors the barrier logic to:

1) Create a proper barrier slot implementation

2) Enforce (via a parameter) the number of barrier
   slots on the CU.

3) Simplify the logic and clean up the code (i.e., we
   no longer iterate through the entire WF list each
   time we check if a barrier is satisfied).

4) Fix deadlock issues.

Change-Id: If53955b54931886baaae322640a7b9da7a1595e0
---
M src/arch/gcn3/insts/instructions.cc
M src/gpu-compute/GPU.py
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/scoreboard_check_stage.cc
M src/gpu-compute/shader.cc
M src/gpu-compute/wavefront.cc
M src/gpu-compute/wavefront.hh
8 files changed, 386 insertions(+), 101 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 607e3c6..817b339 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -39,6 +39,7 @@

 #include "arch/gcn3/insts/inst_util.hh"
 #include "debug/GCN3.hh"
+#include "debug/GPUSync.hh"
 #include "gpu-compute/shader.hh"

 namespace Gcn3ISA
@@ -3709,6 +3710,7 @@
 Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
 {
 Wavefront *wf = gpuDynInst->wavefront();
+ComputeUnit *cu = gpuDynInst->computeUnit();

 // delete extra instructions fetched for completed work-items
 wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
@@ -3725,6 +3727,25 @@
 int refCount = wf->computeUnit->getLds()
 .decreaseRefCounter(wf->dispatchId, wf->wgId);

+/**
+ * The parent WF of this instruction is exiting, therefore
+ * it should not participate in this barrier any longer. This
+ * prevents possible deadlock issues if WFs exit early.
+ */
+int bar_id = WFBarrier::InvalidID;
+if (wf->hasBarrier()) {
+assert(wf->getStatus() != Wavefront::S_BARRIER);
+bar_id = wf->barrierId();
+assert(bar_id != WFBarrier::InvalidID);
+wf->releaseBarrier();
+cu->decMaxBarrierCnt(bar_id);
+DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
+"program and decrementing max barrier count for "
+"barrier Id%d. New max count: %d.\n", cu->cu_id,
+wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
+cu->maxBarrierCnt(bar_id));
+}
+
 DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
 wf->computeUnit->cu_id, wf->wgId, refCount);

@@ -3748,6 +3769,20 @@
 wf->lastInstExec = 0;

 if (!refCount) {
+/**
+ * If all WFs have finished, and hence the WG has finished,
+ * then we can free up the barrier belonging to the parent
+ * WG, but only if we actually used a barrier (i.e., more
+ * than one WF in the WG).
+ */
+if (bar_id != WFBarrier::InvalidID) {
+DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves  
are "
+"now complete. Releasing barrier Id%d.\n",  
cu->cu_id,

+wf->simdId, wf->wfSlotId, wf->wfDynId,
+wf->barrierId());
+cu->releaseBarrier(bar_id);
+}
+
/**
  * Last wavefront of the workgroup has executed return. If the
  * workgroup is not the final one in the kernel, then simply
@@ -4027,12 +4062,21 @@
 Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
 {
 Wavefront *wf = gpuDynInst->wavefront();
+ComputeUnit *cu = gpuDynInst->computeUnit();

-assert(wf->barrierCnt == wf->oldBarrierCnt);
-
-wf->barrierCnt = wf->oldBa

  1   2   >