[gem5-dev] Change in gem5/gem5[master]: cpu: Add non-faulting load support to Minor and O3

Gabor Dozsa (Gerrit) Thu, 28 Feb 2019 06:28:53 -0800

Gabor Dozsa has uploaded this change for review. (https://gem5-review.googlesource.com/c/public/gem5/+/16869)


Change subject: cpu: Add non-faulting load support to Minor and O3
......................................................................

cpu: Add non-faulting load support to Minor and O3

Some architectures allow non-faulting memory loads
in specific circumstances (e.g. first-faulting
and non-faulting loads of ARM SVE). This patch
adds support for such loads in the Minor and O3
CPU models.

Change-Id: Id5f6f0ee62b1d94d2dad8103ebfa6693967395a3
Signed-off-by: Gabor Dozsa <[email protected]>
---
M src/cpu/minor/dyn_inst.cc
M src/cpu/minor/dyn_inst.hh
M src/cpu/minor/exec_context.hh
M src/cpu/minor/execute.cc
M src/cpu/minor/lsq.cc
M src/cpu/minor/lsq.hh
M src/cpu/o3/lsq.hh
M src/cpu/o3/lsq_impl.hh
M src/cpu/o3/lsq_unit_impl.hh
9 files changed, 218 insertions(+), 98 deletions(-)



diff --git a/src/cpu/minor/dyn_inst.cc b/src/cpu/minor/dyn_inst.cc
index 3531637..087b718 100644
--- a/src/cpu/minor/dyn_inst.cc
+++ b/src/cpu/minor/dyn_inst.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, 2016 ARM Limited
+ * Copyright (c) 2013-2014, 2016,2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -108,6 +108,8 @@
         os << "-";
     else if (isFault())
         os << "F;" << id;
+    else if (translationFault != NoFault)
+        os << "TF;" << id;
     else
         os << id;
 }
@@ -120,6 +122,8 @@

     if (inst.isFault())
         os << "fault: \"" << inst.fault->name() << '"';
+    else if (inst.translationFault != NoFault)
+        os << "translation fault: \"" << inst.translationFault->name() << '"';
     else if (inst.staticInst)
         os << inst.staticInst->getName();
     else
diff --git a/src/cpu/minor/dyn_inst.hh b/src/cpu/minor/dyn_inst.hh
index b2decb3..9a30c35 100644
--- a/src/cpu/minor/dyn_inst.hh
+++ b/src/cpu/minor/dyn_inst.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014 ARM Limited
+ * Copyright (c) 2013-2014,2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -194,6 +194,9 @@
     /** This instruction is in the LSQ, not a functional unit */
     bool inLSQ;

+    /** Translation fault in case of a mem ref */
+    Fault translationFault;
+
     /** The instruction has been sent to the store buffer */
     bool inStoreBuffer;

@@ -226,8 +229,8 @@
         staticInst(NULL), id(id_), traceData(NULL),
         pc(TheISA::PCState(0)), fault(fault_),
         triedToPredict(false), predictedTaken(false),
-        fuIndex(0), inLSQ(false), inStoreBuffer(false),
-        canEarlyIssue(false),
+        fuIndex(0), inLSQ(false), translationFault(NoFault),
+        inStoreBuffer(false), canEarlyIssue(false),
         instToWaitFor(0), extraCommitDelay(Cycles(0)),
         extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0))
     { }
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index 3560a7e..09e8b3e 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -110,10 +110,9 @@
                     const std::vector<bool>& byteEnable = std::vector<bool>())
         override
     {
-        execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
-                                     size, addr, flags, nullptr,
-                                     byteEnable);
-        return NoFault;
+        return execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
+                                            size, addr, flags, nullptr,
+                                            byteEnable);
     }

     Fault
@@ -123,9 +122,8 @@
         override
     {
         assert(byteEnable.empty() || byteEnable.size() == size);
-        execute.getLSQ().pushRequest(inst, false /* store */, data,
-                                     size, addr, flags, res, byteEnable);
-        return NoFault;
+        return execute.getLSQ().pushRequest(inst, false /* store */, data,
+            size, addr, flags, res, byteEnable);
     }

     IntReg
diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index af244a4..b278c79 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -336,19 +336,19 @@
      *  context predicate, otherwise, it will be set to false */
     bool use_context_predicate = true;

-    if (response->fault != NoFault) {
+    if (inst->translationFault != NoFault) {
         /* Invoke memory faults. */
         DPRINTF(MinorMem, "Completing fault from DTLB access: %s\n",
-            response->fault->name());
+            inst->translationFault->name());

         if (inst->staticInst->isPrefetch()) {
             DPRINTF(MinorMem, "Not taking fault on prefetch: %s\n",
-                response->fault->name());
+                inst->translationFault->name());

             /* Don't assign to fault */
         } else {
             /* Take the fault raised during the TLB/memory access */
-            fault = response->fault;
+            fault = inst->translationFault;

             fault->invoke(thread, inst->staticInst);
         }
@@ -466,6 +466,18 @@
         Fault init_fault = inst->staticInst->initiateAcc(&context,
             inst->traceData);

+        if (inst->inLSQ) {
+            if (init_fault != NoFault) {
+                assert(inst->translationFault != NoFault);
+                // Translation faults are dealt with in handleMemResponse()
+                init_fault = NoFault;
+            } else {
+                // If we have a translation fault then it got suppressed by
+                // initiateAcc()
+                inst->translationFault = NoFault;
+            }
+        }
+
         if (init_fault != NoFault) {
             DPRINTF(MinorExecute, "Fault on memory inst: %s"
                 " initiateAcc: %s\n", *inst, init_fault->name());
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index 255bd29..cda7a9d 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -65,17 +65,53 @@
     data(data_),
     packet(NULL),
     request(),
-    fault(NoFault),
     res(res_),
     byteEnable(byteEnable_),
     skipped(false),
     issuedToMemory(false),
+    isTranslationDelayed(false),
     state(NotIssued)
 {
     request = std::make_shared<Request>();
 }

 void
+LSQ::LSQRequest::tryToSuppressFault()
+{
+    SimpleThread &thread = *port.cpu.threads[inst->id.threadId];
+    TheISA::PCState old_pc = thread.pcState();
+    ExecContext context(port.cpu, thread, port.execute, inst);
+    Fault M5_VAR_USED fault = inst->translationFault;
+
+    // Give the instruction a chance to suppress a translation fault
+    inst->translationFault = inst->staticInst->initiateAcc(&context, nullptr);
+    if (inst->translationFault == NoFault) {
+        DPRINTFS(MinorMem, (&port),
+                 "Translation fault suppressed for inst:%s\n", *inst);
+    } else {
+        assert(inst->translationFault == fault);
+    }
+    thread.pcState(old_pc);
+}
+
+void
+LSQ::LSQRequest::completeDisabledMemAccess()
+{
+    DPRINTFS(MinorMem, (&port), "Complete disabled mem access for inst:%s\n",
+             *inst);
+
+    SimpleThread &thread = *port.cpu.threads[inst->id.threadId];
+    TheISA::PCState old_pc = thread.pcState();
+
+    ExecContext context(port.cpu, thread, port.execute, inst);
+
+    context.setMemAccPredicate(false);
+    inst->staticInst->completeAcc(nullptr, &context, inst->traceData);
+
+    thread.pcState(old_pc);
+}
+
+void
 LSQ::LSQRequest::disableMemAccess()
 {
     port.cpu.threads[inst->id.threadId]->setMemAccPredicate(false);
@@ -228,16 +264,26 @@
 LSQ::SingleDataRequest::finish(const Fault &fault_, const RequestPtr &request_,
                                ThreadContext *tc, BaseTLB::Mode mode)
 {
-    fault = fault_;
-
     port.numAccessesInDTLB--;

     DPRINTFS(MinorMem, (&port), "Received translation response for"
-        " request: %s\n", *inst);
+             " request: %s delayed:%d %s\n", *inst, isTranslationDelayed,
+             fault_ != NoFault ? fault_->name() : "");

-    makePacket();
-
-    setState(Translated);
+    if (fault_ != NoFault) {
+        inst->translationFault = fault_;
+        if (isTranslationDelayed) {
+            tryToSuppressFault();
+            if (inst->translationFault == NoFault) {
+                completeDisabledMemAccess();
+                setState(Complete);
+            }
+        }
+        setState(Translated);
+    } else {
+        setState(Translated);
+        makePacket();
+    }
     port.tryToSendToTransfers(this);

     /* Let's try and wake up the processor for the next cycle */
@@ -282,8 +328,6 @@
 LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr &request_,
                               ThreadContext *tc, BaseTLB::Mode mode)
 {
-    fault = fault_;
-
     port.numAccessesInDTLB--;

     unsigned int M5_VAR_USED expected_fragment_index =
@@ -293,7 +337,9 @@
     numTranslatedFragments++;

     DPRINTFS(MinorMem, (&port), "Received translation response for fragment"
-        " %d of request: %s\n", expected_fragment_index, *inst);
+             " %d of request: %s delayed:%d %s\n", expected_fragment_index,
+             *inst, isTranslationDelayed,
+             fault_ != NoFault ? fault_->name() : "");

     assert(request_ == fragmentRequests[expected_fragment_index]);

@@ -301,18 +347,33 @@
      *  tryToSendToTransfers does take */
     port.cpu.wakeupOnEvent(Pipeline::ExecuteStageId);

-    if (fault != NoFault) {
+    if (fault_ != NoFault) {
         /* tryToSendToTransfers will handle the fault */
+        inst->translationFault = fault_;

         DPRINTFS(MinorMem, (&port), "Faulting translation for fragment:"
             " %d of request: %s\n",
             expected_fragment_index, *inst);

-        setState(Translated);
+        if (expected_fragment_index > 0 || isTranslationDelayed)
+            tryToSuppressFault();
+        if (expected_fragment_index == 0) {
+            if (isTranslationDelayed && inst->translationFault == NoFault) {
+                completeDisabledMemAccess();
+                setState(Complete);
+            } else {
+                setState(Translated);
+            }
+        } else if (inst->translationFault == NoFault) {
+            setState(Translated);
+            numTranslatedFragments--;
+            makeFragmentPackets();
+        } else {
+            setState(Translated);
+        }
         port.tryToSendToTransfers(this);
     } else if (numTranslatedFragments == numFragments) {
         makeFragmentPackets();
-
         setState(Translated);
         port.tryToSendToTransfers(this);
     } else {
@@ -565,6 +626,7 @@
 void
 LSQ::SplitDataRequest::retireResponse(PacketPtr response)
 {
+    assert(inst->translationFault == NoFault);
     assert(numRetiredFragments < numTranslatedFragments);

     DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d"
@@ -953,7 +1015,7 @@
         return;
     }

-    if (request->fault != NoFault) {
+    if (request->inst->translationFault != NoFault) {
         if (request->inst->staticInst->isPrefetch()) {
             DPRINTF(MinorMem, "Not signalling fault for faulting prefetch\n");
         }
@@ -1509,58 +1571,62 @@
     return ret;
 }

-void
+Fault
 LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                  unsigned int size, Addr addr, Request::Flags flags,
                  uint64_t *res, const std::vector<bool>& byteEnable)
 {
-    bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
-    LSQRequestPtr request;
+    assert(inst->translationFault == NoFault || inst->inLSQ);
+    if (!inst->inLSQ) {
+        bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
+        LSQRequestPtr request;

-    /* Copy given data into the request.  The request will pass this to the
-     *  packet and then it will own the data */
-    uint8_t *request_data = NULL;
+        /* Copy given data into the request.  The request will pass this to the
+         *  packet and then it will own the data */
+        uint8_t *request_data = NULL;

-    DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:"
-        " 0x%x%s lineWidth : 0x%x\n",
-        (isLoad ? "load" : "store"), addr, size, flags,
-            (needs_burst ? " (needs burst)" : ""), lineWidth);
+        DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:"
+                " 0x%x%s lineWidth : 0x%x\n",
+                (isLoad ? "load" : "store"), addr, size, flags,
+                (needs_burst ? " (needs burst)" : ""), lineWidth);

-    if (!isLoad) {
-        /* request_data becomes the property of a ...DataRequest (see below)
-         *  and destroyed by its destructor */
-        request_data = new uint8_t[size];
-        if (flags & Request::STORE_NO_DATA) {
-            /* For cache zeroing, just use zeroed data */
-            std::memset(request_data, 0, size);
-        } else {
-            std::memcpy(request_data, data, size);
+        if (!isLoad) {
+            /* request_data becomes the property of a ...DataRequest (see
+             * below) and destroyed by its destructor */
+            request_data = new uint8_t[size];
+            if (flags & Request::STORE_NO_DATA) {
+                /* For cache zeroing, just use zeroed data */
+                std::memset(request_data, 0, size);
+            } else {
+                std::memcpy(request_data, data, size);
+            }
         }
-    }

-    if (needs_burst) {
-        request = new SplitDataRequest(
-            *this, inst, isLoad, request_data, res, byteEnable);
-    } else {
-        request = new SingleDataRequest(
-            *this, inst, isLoad, request_data, res, byteEnable);
-    }
+        if (needs_burst) {
+            request = new SplitDataRequest(
+                *this, inst, isLoad, request_data, res, byteEnable);
+        } else {
+            request = new SingleDataRequest(
+                *this, inst, isLoad, request_data, res, byteEnable);
+        }

-    if (inst->traceData)
-        inst->traceData->setMem(addr, size, flags);
+        if (inst->traceData)
+            inst->traceData->setMem(addr, size, flags);

-    int cid = cpu.threads[inst->id.threadId]->getTC()->contextId();
-    request->request->setContext(cid);
-    request->request->setVirt(0 /* asid */,
-        addr, size, flags, cpu.dataMasterId(),
-        /* I've no idea why we need the PC, but give it */
-        inst->pc.instAddr());
-    if (!byteEnable.empty()) {
+        int cid = cpu.threads[inst->id.threadId]->getTC()->contextId();
+        request->request->setContext(cid);
+        request->request->setVirt(0 /* asid */,
+                                  addr, size, flags, cpu.dataMasterId(),
+                                  /* I've no idea why we need the PC, but give
+                                   * it */
+                                  inst->pc.instAddr());
         request->request->setByteEnable(byteEnable);
-    }

-    requests.push(request);
-    request->startAddrTranslation();
+        requests.push(request);
+        inst->inLSQ = true;
+        request->startAddrTranslation();
+    }
+    return inst->translationFault;
 }

 void
@@ -1631,16 +1697,12 @@
 void
 LSQ::LSQRequest::makePacket()
 {
+    assert(inst->translationFault == NoFault);
+
     /* Make the function idempotent */
     if (packet)
         return;

-    // if the translation faulted, do not create a packet
-    if (fault != NoFault) {
-        assert(packet == NULL);
-        return;
-    }
-
     packet = makePacketForRequest(request, isLoad, this, data,
                                   byteEnable);
     /* Null the ret data so we know not to deallocate it when the
diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh
index 15c0e8e..cbea281 100644
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -145,9 +145,6 @@
         /** The underlying request of this LSQRequest */
         RequestPtr request;

-        /** Fault generated performing this request */
-        Fault fault;
-
         /** Res from pushRequest */
         uint64_t *res;

@@ -163,6 +160,9 @@
          *  that's visited the memory system */
         bool issuedToMemory;

+        /** Address translation is delayed due to table walk */
+        bool isTranslationDelayed;
+
         enum LSQRequestState
         {
             NotIssued, /* Newly created */
@@ -189,9 +189,14 @@

       protected:
         /** BaseTLB::Translation interface */
-        void markDelayed() { }
+        void markDelayed() { isTranslationDelayed = true; }
+
+        /** Instructions may want to suppress translation faults (e.g.
+         *  non-faulting vector loads).*/
+        void tryToSuppressFault();

         void disableMemAccess();
+        void completeDisabledMemAccess();

       public:
         LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
@@ -707,7 +712,7 @@

     /** Single interface for readMem/writeMem to issue requests into
      *  the LSQ */
-    void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
+    Fault pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
             unsigned int size, Addr addr, Request::Flags flags,
             uint64_t *res,
             const std::vector<bool>& byteEnable = std::vector<bool>());
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index f3fdfc1..5ffac97 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -218,6 +218,7 @@
             Complete,
             Squashed,
             Fault,
+            PartialFault,
         };
         State _state;
         LSQSenderState* _senderState;
@@ -514,6 +515,19 @@
             return flags[(int)Flag::Sent];
         }

+        bool
+        isPartialFault()
+        {
+            return _state == State::PartialFault;
+        }
+
+        bool
+        isMemAccessRequired()
+        {
+            return (_state == State::Request ||
+                    (isPartialFault() && isLoad()));
+        }
+
         /**
          * The LSQ entry is cleared
          */
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 11d2dc4..a374b15 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -712,7 +712,7 @@

     /* This is the place were instructions get the effAddr. */
     if (req->isTranslationComplete()) {
-        if (inst->getFault() == NoFault) {
+        if (req->isMemAccessRequired()) {
             inst->effAddr = req->getVaddr();
             inst->effSize = size;
             inst->effAddrValid(true);
@@ -720,10 +720,17 @@
             if (cpu->checker) {
                 inst->reqToVerify = std::make_shared<Request>(*req->request());
             }
+            Fault fault;
             if (isLoad)
-                inst->getFault() = cpu->read(req, inst->lqIdx);
+                fault = cpu->read(req, inst->lqIdx);
             else
-                inst->getFault() = cpu->write(req, data, inst->sqIdx);
+                fault = cpu->write(req, data, inst->sqIdx);
+            // inst->getFault() may have the first-fault of a
+            // multi-access split request at this point.
+            // Overwrite that only if we got another type of fault
+            // (e.g. re-exec).
+            if (fault != NoFault)
+                inst->getFault() = fault;
         } else if (isLoad) {
             inst->setMemAccPredicate(false);
             // Commit will have to clean up whatever happened.  Set this
@@ -776,13 +783,16 @@
 LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
         ThreadContext* tc, BaseTLB::Mode mode)
 {
-    _fault.push_back(fault);
-    assert(req == _requests[numTranslatedFragments] || this->isDelayed());
+    int i;
+    for (i = 0; i < _requests.size() && _requests[i] != req; i++);
+    assert(i < _requests.size());
+    _fault[i] = fault;

     numInTranslationFragments--;
     numTranslatedFragments++;

-    mainReq->setFlags(req->getFlags());
+    if (fault == NoFault)
+        mainReq->setFlags(req->getFlags());

     if (numTranslatedFragments == _requests.size()) {
         if (_inst->isSquashed()) {
@@ -790,27 +800,30 @@
         } else {
             _inst->strictlyOrdered(mainReq->isStrictlyOrdered());
             flags[(int)Flag::TranslationFinished] = true;
-            auto fault_it = _fault.begin();
-            /* Ffwd to the first NoFault. */
-            while (fault_it != _fault.end() && *fault_it == NoFault)
-                fault_it++;
-            /* If none of the fragments faulted: */
-            if (fault_it == _fault.end()) {
-                _inst->physEffAddr = request(0)->getPaddr();
+            _inst->translationCompleted(true);

+            for (i = 0; i < _fault.size() && _fault[i] == NoFault; i++);
+            if (i > 0) {
+                _inst->physEffAddr = request(0)->getPaddr();
                 _inst->memReqFlags = mainReq->getFlags();
                 if (mainReq->isCondSwap()) {
+                    assert (i == _fault.size());
                     assert(_res);
                     mainReq->setExtraData(*_res);
                 }
-                setState(State::Request);
-                _inst->fault = NoFault;
+                if (i == _fault.size()) {
+                    _inst->fault = NoFault;
+                    setState(State::Request);
+                } else {
+                  _inst->fault = _fault[i];
+                  setState(State::PartialFault);
+                }
             } else {
+                _inst->fault = _fault[0];
                 setState(State::Fault);
-                _inst->fault = *fault_it;
             }
-            _inst->translationCompleted(true);
         }
+
     }
 }

diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 992c964..4236fd5 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -539,8 +539,7 @@

     load_fault = inst->initiateAcc();

-    if (!inst->readMemAccPredicate()) {
-        assert(load_fault == NoFault);
+    if (load_fault == NoFault && !inst->readMemAccPredicate()) {
         assert(inst->readPredicate());
         inst->setExecuted();
         inst->completeAcc(nullptr);
@@ -552,6 +551,16 @@
     if (inst->isTranslationDelayed() && load_fault == NoFault)
         return load_fault;

+    if (load_fault != NoFault && inst->translationCompleted() &&
+        inst->savedReq->isPartialFault() && !inst->savedReq->isComplete()) {
+        assert(inst->savedReq->isSplit());
+        // If we have a partial fault where the mem access is not complete yet
+        // then the cache must have been blocked. This load will be re-executed
+        // when the cache gets unblocked. We will handle the fault when the
+        // mem access is complete.
+        return NoFault;
+    }
+
     // If the instruction faulted or predicated false, then we need to send it
     // along to commit without the instruction completing.
     if (load_fault != NoFault || !inst->readPredicate()) {

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/16869

To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: master
Gerrit-Change-Id: Id5f6f0ee62b1d94d2dad8103ebfa6693967395a3
Gerrit-Change-Number: 16869
Gerrit-PatchSet: 1
Gerrit-Owner: Gabor Dozsa <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/gem5-dev

[gem5-dev] Change in gem5/gem5[master]: cpu: Add non-faulting load support to Minor and O3

Reply via email to