Gabor Dozsa has uploaded this change for review.
( https://gem5-review.googlesource.com/c/public/gem5/+/16869 )
Change subject: cpu: Add non-faulting load support to Minor and O3
......................................................................
cpu: Add non-faulting load support to Minor and O3
Some architectures allow non-faulting memory loads
in specific circumstances (e.g. the first-faulting
and non-faulting loads of ARM SVE). This patch
adds support for such loads to the Minor and O3
CPU models.
Change-Id: Id5f6f0ee62b1d94d2dad8103ebfa6693967395a3
Signed-off-by: Gabor Dozsa <[email protected]>
---
M src/cpu/minor/dyn_inst.cc
M src/cpu/minor/dyn_inst.hh
M src/cpu/minor/exec_context.hh
M src/cpu/minor/execute.cc
M src/cpu/minor/lsq.cc
M src/cpu/minor/lsq.hh
M src/cpu/o3/lsq.hh
M src/cpu/o3/lsq_impl.hh
M src/cpu/o3/lsq_unit_impl.hh
9 files changed, 218 insertions(+), 98 deletions(-)
diff --git a/src/cpu/minor/dyn_inst.cc b/src/cpu/minor/dyn_inst.cc
index 3531637..087b718 100644
--- a/src/cpu/minor/dyn_inst.cc
+++ b/src/cpu/minor/dyn_inst.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013-2014, 2016 ARM Limited
+ * Copyright (c) 2013-2014, 2016,2018 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -108,6 +108,8 @@
os << "-";
else if (isFault())
os << "F;" << id;
+ else if (translationFault != NoFault)
+ os << "TF;" << id;
else
os << id;
}
@@ -120,6 +122,8 @@
if (inst.isFault())
os << "fault: \"" << inst.fault->name() << '"';
+ else if (inst.translationFault != NoFault)
+ os << "translation fault: \"" << inst.translationFault->name()
<< '"';
else if (inst.staticInst)
os << inst.staticInst->getName();
else
diff --git a/src/cpu/minor/dyn_inst.hh b/src/cpu/minor/dyn_inst.hh
index b2decb3..9a30c35 100644
--- a/src/cpu/minor/dyn_inst.hh
+++ b/src/cpu/minor/dyn_inst.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013-2014 ARM Limited
+ * Copyright (c) 2013-2014,2018 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -194,6 +194,9 @@
/** This instruction is in the LSQ, not a functional unit */
bool inLSQ;
+ /** Translation fault in case of a mem ref */
+ Fault translationFault;
+
/** The instruction has been sent to the store buffer */
bool inStoreBuffer;
@@ -226,8 +229,8 @@
staticInst(NULL), id(id_), traceData(NULL),
pc(TheISA::PCState(0)), fault(fault_),
triedToPredict(false), predictedTaken(false),
- fuIndex(0), inLSQ(false), inStoreBuffer(false),
- canEarlyIssue(false),
+ fuIndex(0), inLSQ(false), translationFault(NoFault),
+ inStoreBuffer(false), canEarlyIssue(false),
instToWaitFor(0), extraCommitDelay(Cycles(0)),
extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0))
{ }
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index 3560a7e..09e8b3e 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -110,10 +110,9 @@
const std::vector<bool>& byteEnable =
std::vector<bool>())
override
{
- execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
- size, addr, flags, nullptr,
- byteEnable);
- return NoFault;
+ return execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
+ size, addr, flags, nullptr,
+ byteEnable);
}
Fault
@@ -123,9 +122,8 @@
override
{
assert(byteEnable.empty() || byteEnable.size() == size);
- execute.getLSQ().pushRequest(inst, false /* store */, data,
- size, addr, flags, res, byteEnable);
- return NoFault;
+ return execute.getLSQ().pushRequest(inst, false /* store */, data,
+ size, addr, flags, res, byteEnable);
}
IntReg
diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index af244a4..b278c79 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -336,19 +336,19 @@
* context predicate, otherwise, it will be set to false */
bool use_context_predicate = true;
- if (response->fault != NoFault) {
+ if (inst->translationFault != NoFault) {
/* Invoke memory faults. */
DPRINTF(MinorMem, "Completing fault from DTLB access: %s\n",
- response->fault->name());
+ inst->translationFault->name());
if (inst->staticInst->isPrefetch()) {
DPRINTF(MinorMem, "Not taking fault on prefetch: %s\n",
- response->fault->name());
+ inst->translationFault->name());
/* Don't assign to fault */
} else {
/* Take the fault raised during the TLB/memory access */
- fault = response->fault;
+ fault = inst->translationFault;
fault->invoke(thread, inst->staticInst);
}
@@ -466,6 +466,18 @@
Fault init_fault = inst->staticInst->initiateAcc(&context,
inst->traceData);
+ if (inst->inLSQ) {
+ if (init_fault != NoFault) {
+ assert(inst->translationFault != NoFault);
+ // Translation faults are dealt with in handleMemResponse()
+ init_fault = NoFault;
+ } else {
+ // If we have a translation fault then it got suppressed
by
+ // initateAcc()
+ inst->translationFault = NoFault;
+ }
+ }
+
if (init_fault != NoFault) {
DPRINTF(MinorExecute, "Fault on memory inst: %s"
" initiateAcc: %s\n", *inst, init_fault->name());
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index 255bd29..cda7a9d 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -65,17 +65,53 @@
data(data_),
packet(NULL),
request(),
- fault(NoFault),
res(res_),
byteEnable(byteEnable_),
skipped(false),
issuedToMemory(false),
+ isTranslationDelayed(false),
state(NotIssued)
{
request = std::make_shared<Request>();
}
void
+LSQ::LSQRequest::tryToSuppressFault()
+{
+ SimpleThread &thread = *port.cpu.threads[inst->id.threadId];
+ TheISA::PCState old_pc = thread.pcState();
+ ExecContext context(port.cpu, thread, port.execute, inst);
+ Fault M5_VAR_USED fault = inst->translationFault;
+
+ // Give the instruction a chance to suppress a translation fault
+ inst->translationFault = inst->staticInst->initiateAcc(&context,
nullptr);
+ if (inst->translationFault == NoFault) {
+ DPRINTFS(MinorMem, (&port),
+ "Translation fault suppressed for inst:%s\n", *inst);
+ } else {
+ assert(inst->translationFault == fault);
+ }
+ thread.pcState(old_pc);
+}
+
+void
+LSQ::LSQRequest::completeDisabledMemAccess()
+{
+ DPRINTFS(MinorMem, (&port), "Complete disabled mem access for
inst:%s\n",
+ *inst);
+
+ SimpleThread &thread = *port.cpu.threads[inst->id.threadId];
+ TheISA::PCState old_pc = thread.pcState();
+
+ ExecContext context(port.cpu, thread, port.execute, inst);
+
+ context.setMemAccPredicate(false);
+ inst->staticInst->completeAcc(nullptr, &context, inst->traceData);
+
+ thread.pcState(old_pc);
+}
+
+void
LSQ::LSQRequest::disableMemAccess()
{
port.cpu.threads[inst->id.threadId]->setMemAccPredicate(false);
@@ -228,16 +264,26 @@
LSQ::SingleDataRequest::finish(const Fault &fault_, const RequestPtr
&request_,
ThreadContext *tc, BaseTLB::Mode mode)
{
- fault = fault_;
-
port.numAccessesInDTLB--;
DPRINTFS(MinorMem, (&port), "Received translation response for"
- " request: %s\n", *inst);
+ " request: %s delayed:%d %s\n", *inst, isTranslationDelayed,
+ fault_ != NoFault ? fault_->name() : "");
- makePacket();
-
- setState(Translated);
+ if (fault_ != NoFault) {
+ inst->translationFault = fault_;
+ if (isTranslationDelayed) {
+ tryToSuppressFault();
+ if (inst->translationFault == NoFault) {
+ completeDisabledMemAccess();
+ setState(Complete);
+ }
+ }
+ setState(Translated);
+ } else {
+ setState(Translated);
+ makePacket();
+ }
port.tryToSendToTransfers(this);
/* Let's try and wake up the processor for the next cycle */
@@ -282,8 +328,6 @@
LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr
&request_,
ThreadContext *tc, BaseTLB::Mode mode)
{
- fault = fault_;
-
port.numAccessesInDTLB--;
unsigned int M5_VAR_USED expected_fragment_index =
@@ -293,7 +337,9 @@
numTranslatedFragments++;
DPRINTFS(MinorMem, (&port), "Received translation response for
fragment"
- " %d of request: %s\n", expected_fragment_index, *inst);
+ " %d of request: %s delayed:%d %s\n", expected_fragment_index,
+ *inst, isTranslationDelayed,
+ fault_ != NoFault ? fault_->name() : "");
assert(request_ == fragmentRequests[expected_fragment_index]);
@@ -301,18 +347,33 @@
* tryToSendToTransfers does take */
port.cpu.wakeupOnEvent(Pipeline::ExecuteStageId);
- if (fault != NoFault) {
+ if (fault_ != NoFault) {
/* tryToSendToTransfers will handle the fault */
+ inst->translationFault = fault_;
DPRINTFS(MinorMem, (&port), "Faulting translation for fragment:"
" %d of request: %s\n",
expected_fragment_index, *inst);
- setState(Translated);
+ if (expected_fragment_index > 0 || isTranslationDelayed)
+ tryToSuppressFault();
+ if (expected_fragment_index == 0) {
+ if (isTranslationDelayed && inst->translationFault == NoFault)
{
+ completeDisabledMemAccess();
+ setState(Complete);
+ } else {
+ setState(Translated);
+ }
+ } else if (inst->translationFault == NoFault) {
+ setState(Translated);
+ numTranslatedFragments--;
+ makeFragmentPackets();
+ } else {
+ setState(Translated);
+ }
port.tryToSendToTransfers(this);
} else if (numTranslatedFragments == numFragments) {
makeFragmentPackets();
-
setState(Translated);
port.tryToSendToTransfers(this);
} else {
@@ -565,6 +626,7 @@
void
LSQ::SplitDataRequest::retireResponse(PacketPtr response)
{
+ assert(inst->translationFault == NoFault);
assert(numRetiredFragments < numTranslatedFragments);
DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d"
@@ -953,7 +1015,7 @@
return;
}
- if (request->fault != NoFault) {
+ if (request->inst->translationFault != NoFault) {
if (request->inst->staticInst->isPrefetch()) {
DPRINTF(MinorMem, "Not signalling fault for faulting
prefetch\n");
}
@@ -1509,58 +1571,62 @@
return ret;
}
-void
+Fault
LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
uint64_t *res, const std::vector<bool>& byteEnable)
{
- bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
- LSQRequestPtr request;
+ assert(inst->translationFault == NoFault || inst->inLSQ);
+ if (!inst->inLSQ) {
+ bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
+ LSQRequestPtr request;
- /* Copy given data into the request. The request will pass this to the
- * packet and then it will own the data */
- uint8_t *request_data = NULL;
+ /* Copy given data into the request. The request will pass this
to the
+ * packet and then it will own the data */
+ uint8_t *request_data = NULL;
- DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:"
- " 0x%x%s lineWidth : 0x%x\n",
- (isLoad ? "load" : "store"), addr, size, flags,
- (needs_burst ? " (needs burst)" : ""), lineWidth);
+ DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:"
+ " 0x%x%s lineWidth : 0x%x\n",
+ (isLoad ? "load" : "store"), addr, size, flags,
+ (needs_burst ? " (needs burst)" : ""), lineWidth);
- if (!isLoad) {
- /* request_data becomes the property of a ...DataRequest (see
below)
- * and destroyed by its destructor */
- request_data = new uint8_t[size];
- if (flags & Request::STORE_NO_DATA) {
- /* For cache zeroing, just use zeroed data */
- std::memset(request_data, 0, size);
- } else {
- std::memcpy(request_data, data, size);
+ if (!isLoad) {
+ /* request_data becomes the property of a ...DataRequest (see
+ * below) and destroyed by its destructor */
+ request_data = new uint8_t[size];
+ if (flags & Request::STORE_NO_DATA) {
+ /* For cache zeroing, just use zeroed data */
+ std::memset(request_data, 0, size);
+ } else {
+ std::memcpy(request_data, data, size);
+ }
}
- }
- if (needs_burst) {
- request = new SplitDataRequest(
- *this, inst, isLoad, request_data, res, byteEnable);
- } else {
- request = new SingleDataRequest(
- *this, inst, isLoad, request_data, res, byteEnable);
- }
+ if (needs_burst) {
+ request = new SplitDataRequest(
+ *this, inst, isLoad, request_data, res, byteEnable);
+ } else {
+ request = new SingleDataRequest(
+ *this, inst, isLoad, request_data, res, byteEnable);
+ }
- if (inst->traceData)
- inst->traceData->setMem(addr, size, flags);
+ if (inst->traceData)
+ inst->traceData->setMem(addr, size, flags);
- int cid = cpu.threads[inst->id.threadId]->getTC()->contextId();
- request->request->setContext(cid);
- request->request->setVirt(0 /* asid */,
- addr, size, flags, cpu.dataMasterId(),
- /* I've no idea why we need the PC, but give it */
- inst->pc.instAddr());
- if (!byteEnable.empty()) {
+ int cid = cpu.threads[inst->id.threadId]->getTC()->contextId();
+ request->request->setContext(cid);
+ request->request->setVirt(0 /* asid */,
+ addr, size, flags, cpu.dataMasterId(),
+ /* I've no idea why we need the PC, but
give
+ * it */
+ inst->pc.instAddr());
request->request->setByteEnable(byteEnable);
- }
- requests.push(request);
- request->startAddrTranslation();
+ requests.push(request);
+ inst->inLSQ = true;
+ request->startAddrTranslation();
+ }
+ return inst->translationFault;
}
void
@@ -1631,16 +1697,12 @@
void
LSQ::LSQRequest::makePacket()
{
+ assert(inst->translationFault == NoFault);
+
/* Make the function idempotent */
if (packet)
return;
- // if the translation faulted, do not create a packet
- if (fault != NoFault) {
- assert(packet == NULL);
- return;
- }
-
packet = makePacketForRequest(request, isLoad, this, data,
byteEnable);
/* Null the ret data so we know not to deallocate it when the
diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh
index 15c0e8e..cbea281 100644
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -145,9 +145,6 @@
/** The underlying request of this LSQRequest */
RequestPtr request;
- /** Fault generated performing this request */
- Fault fault;
-
/** Res from pushRequest */
uint64_t *res;
@@ -163,6 +160,9 @@
* that's visited the memory system */
bool issuedToMemory;
+ /** Address translation is delayed due to table walk */
+ bool isTranslationDelayed;
+
enum LSQRequestState
{
NotIssued, /* Newly created */
@@ -189,9 +189,14 @@
protected:
/** BaseTLB::Translation interface */
- void markDelayed() { }
+ void markDelayed() { isTranslationDelayed = true; }
+
+ /** Instructions may want to suppress translation faults (e.g.
+ * non-faulting vector loads).*/
+ void tryToSuppressFault();
void disableMemAccess();
+ void completeDisabledMemAccess();
public:
LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
@@ -707,7 +712,7 @@
/** Single interface for readMem/writeMem to issue requests into
* the LSQ */
- void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
+ Fault pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
uint64_t *res,
const std::vector<bool>& byteEnable = std::vector<bool>());
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index f3fdfc1..5ffac97 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -218,6 +218,7 @@
Complete,
Squashed,
Fault,
+ PartialFault,
};
State _state;
LSQSenderState* _senderState;
@@ -514,6 +515,19 @@
return flags[(int)Flag::Sent];
}
+ bool
+ isPartialFault()
+ {
+ return _state == State::PartialFault;
+ }
+
+ bool
+ isMemAccessRequired()
+ {
+ return (_state == State::Request ||
+ (isPartialFault() && isLoad()));
+ }
+
/**
* The LSQ entry is cleared
*/
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 11d2dc4..a374b15 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -712,7 +712,7 @@
/* This is the place were instructions get the effAddr. */
if (req->isTranslationComplete()) {
- if (inst->getFault() == NoFault) {
+ if (req->isMemAccessRequired()) {
inst->effAddr = req->getVaddr();
inst->effSize = size;
inst->effAddrValid(true);
@@ -720,10 +720,17 @@
if (cpu->checker) {
inst->reqToVerify =
std::make_shared<Request>(*req->request());
}
+ Fault fault;
if (isLoad)
- inst->getFault() = cpu->read(req, inst->lqIdx);
+ fault = cpu->read(req, inst->lqIdx);
else
- inst->getFault() = cpu->write(req, data, inst->sqIdx);
+ fault = cpu->write(req, data, inst->sqIdx);
+ // inst->getFault() may have the first-fault of a
+ // multi-access split request at this point.
+ // Overwrite that only if we got another type of fault
+ // (e.g. re-exec).
+ if (fault != NoFault)
+ inst->getFault() = fault;
} else if (isLoad) {
inst->setMemAccPredicate(false);
// Commit will have to clean up whatever happened. Set this
@@ -776,13 +783,16 @@
LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr
&req,
ThreadContext* tc, BaseTLB::Mode mode)
{
- _fault.push_back(fault);
- assert(req == _requests[numTranslatedFragments] || this->isDelayed());
+ int i;
+ for (i = 0; i < _requests.size() && _requests[i] != req; i++);
+ assert(i < _requests.size());
+ _fault[i] = fault;
numInTranslationFragments--;
numTranslatedFragments++;
- mainReq->setFlags(req->getFlags());
+ if (fault == NoFault)
+ mainReq->setFlags(req->getFlags());
if (numTranslatedFragments == _requests.size()) {
if (_inst->isSquashed()) {
@@ -790,27 +800,30 @@
} else {
_inst->strictlyOrdered(mainReq->isStrictlyOrdered());
flags[(int)Flag::TranslationFinished] = true;
- auto fault_it = _fault.begin();
- /* Ffwd to the first NoFault. */
- while (fault_it != _fault.end() && *fault_it == NoFault)
- fault_it++;
- /* If none of the fragments faulted: */
- if (fault_it == _fault.end()) {
- _inst->physEffAddr = request(0)->getPaddr();
+ _inst->translationCompleted(true);
+ for (i = 0; i < _fault.size() && _fault[i] == NoFault; i++);
+ if (i > 0) {
+ _inst->physEffAddr = request(0)->getPaddr();
_inst->memReqFlags = mainReq->getFlags();
if (mainReq->isCondSwap()) {
+ assert (i == _fault.size());
assert(_res);
mainReq->setExtraData(*_res);
}
- setState(State::Request);
- _inst->fault = NoFault;
+ if (i == _fault.size()) {
+ _inst->fault = NoFault;
+ setState(State::Request);
+ } else {
+ _inst->fault = _fault[i];
+ setState(State::PartialFault);
+ }
} else {
+ _inst->fault = _fault[0];
setState(State::Fault);
- _inst->fault = *fault_it;
}
- _inst->translationCompleted(true);
}
+
}
}
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 992c964..4236fd5 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -539,8 +539,7 @@
load_fault = inst->initiateAcc();
- if (!inst->readMemAccPredicate()) {
- assert(load_fault == NoFault);
+ if (load_fault == NoFault && !inst->readMemAccPredicate()) {
assert(inst->readPredicate());
inst->setExecuted();
inst->completeAcc(nullptr);
@@ -552,6 +551,16 @@
if (inst->isTranslationDelayed() && load_fault == NoFault)
return load_fault;
+ if (load_fault != NoFault && inst->translationCompleted() &&
+ inst->savedReq->isPartialFault() && !inst->savedReq->isComplete())
{
+ assert(inst->savedReq->isSplit());
+ // If we have a partial fault where the mem access is not complete
yet
+ // then the cache must have been blocked. This load will be
re-executed
+ // when the cache gets unblocked. We will handle the fault when the
+ // mem access is complete.
+ return NoFault;
+ }
+
// If the instruction faulted or predicated false, then we need to
send it
// along to commit without the instruction completing.
if (load_fault != NoFault || !inst->readPredicate()) {
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/16869
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: master
Gerrit-Change-Id: Id5f6f0ee62b1d94d2dad8103ebfa6693967395a3
Gerrit-Change-Number: 16869
Gerrit-PatchSet: 1
Gerrit-Owner: Gabor Dozsa <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/gem5-dev