Tuan Ta has uploaded this change for review. ( https://gem5-review.googlesource.com/8188 )


Change subject: cpu: support atomic memory request type with AtomicOpFunctor
......................................................................

cpu: support atomic memory request type with AtomicOpFunctor

This patch enables all four CPU models (AtomicSimpleCPU, TimingSimpleCPU,
MinorCPU and DerivO3CPU) to issue atomic memory (AMO) requests to the
memory system.

Atomic memory instructions are treated as special store instructions in
all CPU models.
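
For reference, this is roughly the kind of functor an AMO request
carries (a minimal sketch against the AtomicOpFunctor interface in
src/mem/request.hh; the AmoAddFunctor name and the fixed 32-bit width
are illustrative, not part of this patch):

    // Illustrative only: a 32-bit fetch-and-add functor. The memory
    // system applies operator() to the target location in one step,
    // which is what makes the read-modify-write atomic.
    struct AmoAddFunctor : public AtomicOpFunctor
    {
        explicit AmoAddFunctor(uint32_t v) : a(v) {}

        void operator()(uint8_t *mem) override
        {
            *reinterpret_cast<uint32_t *>(mem) += a;
        }

        uint32_t a; // addend supplied by the instruction
    };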

In the simple CPUs, an AMO request with its associated AtomicOpFunctor is
simply sent to the L1 dcache.
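
An ISA implementation reaches these paths through the new ExecContext
hooks. A hypothetical micro-op body might look as follows; only the
initiateMemAMO() signature comes from this patch, while AmoAdd, the
register indices, the 4-byte size and the use of the existing
Request::ATOMIC_RETURN_OP flag are illustrative assumptions:

    Fault
    AmoAdd::initiateAcc(ExecContext *xc, Trace::InstRecord *trace) const
    {
        Addr addr = xc->readIntRegOperand(this, 0);
        // The request takes ownership of the functor and applies it
        // once the access reaches the cache.
        AtomicOpFunctor *amo_op =
            new AmoAddFunctor(xc->readIntRegOperand(this, 1));
        return xc->initiateMemAMO(addr, 4, Request::ATOMIC_RETURN_OP,
                                  amo_op);
    }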

In MinorCPU, an AMO request bypasses the store buffer but waits for any
conflicting store request(s) currently in the store buffer to retire
before it is sent to the cache. AMO requests are not buffered in the
store buffer, so their effects appear immediately in the cache.

In DerivO3CPU, an AMO request is inserted in the store buffer so that it
is delivered to the cache only after all previous stores have been
issued to the cache. Data forwarding between an outstanding AMO in the
store buffer and a subsequent load is not allowed, since the AMO request
does not hold valid data until it is executed in the cache.

This implementation assumes that a target ISA implementation inserts
enough memory fences, as micro-ops, around an atomic instruction to
enforce a correct order of memory instructions. Without extra memory
fences, this implementation can allow AMOs and other memory instructions
that do not conflict (i.e., do not target the same address) to be
reordered. For example, a RISC-V AMO with acquire-release semantics
would need a fence micro-op before the AMO (so that older accesses
complete first) and another one after it (so that younger accesses
wait).

This implementation also assumes that atomic instructions execute within
a cache-line boundary, since the cache is currently unable to execute an
operation on two different cache lines in a single step. Therefore, ISAs
like x86 that require multi-cache-line atomic instructions need to
either use a pair of locking load and unlocking store micro-ops or
change the cache implementation to guarantee the atomicity of an atomic
instruction.

Change-Id: Ib8a7c81868ac05b98d73afc7d16eb88486f8cf9a
---
M src/cpu/base.cc
M src/cpu/base_dyn_inst.hh
M src/cpu/checker/cpu.hh
M src/cpu/exec_context.hh
M src/cpu/minor/exec_context.hh
M src/cpu/minor/execute.cc
M src/cpu/minor/fetch2.cc
M src/cpu/minor/lsq.cc
M src/cpu/minor/lsq.hh
M src/cpu/o3/commit_impl.hh
M src/cpu/o3/iew_impl.hh
M src/cpu/o3/inst_queue_impl.hh
M src/cpu/o3/lsq_unit.hh
M src/cpu/o3/lsq_unit_impl.hh
M src/cpu/o3/mem_dep_unit_impl.hh
M src/cpu/o3/rename_impl.hh
M src/cpu/simple/atomic.cc
M src/cpu/simple/atomic.hh
M src/cpu/simple/base.cc
M src/cpu/simple/base.hh
M src/cpu/simple/exec_context.hh
M src/cpu/simple/timing.cc
M src/cpu/simple/timing.hh
23 files changed, 410 insertions(+), 60 deletions(-)



diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 4fd804b..48726e6 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -408,7 +408,7 @@
     if (inst->isLoad())
         ppRetiredLoads->notify(1);

-    if (inst->isStore())
+    if (inst->isStore() || inst->isAtomic())
         ppRetiredStores->notify(1);

     if (inst->isControl())
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index 2eeb773..89063ac 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -303,6 +303,15 @@
     Fault writeMem(uint8_t *data, unsigned size, Addr addr,
                    Request::Flags flags, uint64_t *res);

+    Fault amoMem(Addr addr, uint8_t *data, unsigned int size,
+                 Request::Flags flags, AtomicOpFunctor *amo_op)
+    {
+        panic("amoMem is used only in Atomic CPU model, not in O3CPU\n");
+    }
+
+    Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
+                         AtomicOpFunctor *amo_op);
+
     /** Splits a request in two if it crosses a dcache block. */
     void splitRequest(RequestPtr req, RequestPtr &sreqLow,
                       RequestPtr &sreqHigh);
@@ -989,6 +998,64 @@
 }

 template<class Impl>
+Fault
+BaseDynInst<Impl>::initiateMemAMO(Addr addr, unsigned size,
+                                  Request::Flags flags,
+                                  AtomicOpFunctor *amo_op)
+{
+    instFlags[ReqMade] = true;
+    Request *req = NULL;
+    Request *sreqLow = NULL;
+    Request *sreqHigh = NULL;
+
+    if (instFlags[ReqMade] && translationStarted()) {
+        req = savedReq;
+        sreqLow = savedSreqLow;
+        sreqHigh = savedSreqHigh;
+    } else {
+        req = new Request(asid, addr, size, flags, masterId(),
+                          this->pc.instAddr(), thread->contextId(), amo_op);
+
+        req->taskId(cpu->taskId());
+
+        // Only split the request if the ISA supports unaligned accesses.
+        if (TheISA::HasUnalignedMemAcc) {
+            splitRequest(req, sreqLow, sreqHigh);
+        }
+        initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Write);
+    }
+
+    // AMO requests that access across a cache line boundary are not
+    // allowed, since the cache does not guarantee that AMO ops are
+    // executed atomically in two cache lines.
+    // For ISAs such as x86 that require AMO operations to work on
+    // accesses that cross cache-line boundaries, the cache needs to be
+    // modified to support locking both cache lines to guarantee
+    // atomicity.
+    assert(!sreqLow && !sreqHigh);
+
+    if (fault == NoFault && translationCompleted()) {
+        effAddr = req->getVaddr();
+        effSize = size;
+        instFlags[EffAddrValid] = true;
+
+        if (cpu->checker) {
+            if (reqToVerify != NULL) {
+                delete reqToVerify;
+            }
+            reqToVerify = new Request(*req);
+        }
+
+        // AMO requests are treated as special store requests with no data
+        // included in the requests. They are pushed through the store
+        // queue to memory.
+        fault = cpu->write(req, sreqLow, sreqHigh, NULL, sqIdx);
+    }
+
+    return fault;
+}
+
+template<class Impl>
 inline void
 BaseDynInst<Impl>::splitRequest(RequestPtr req, RequestPtr &sreqLow,
                                 RequestPtr &sreqHigh)
diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh
index f79aa08..26673fb 100644
--- a/src/cpu/checker/cpu.hh
+++ b/src/cpu/checker/cpu.hh
@@ -506,9 +506,16 @@

     Fault readMem(Addr addr, uint8_t *data, unsigned size,
                   Request::Flags flags) override;
+
     Fault writeMem(uint8_t *data, unsigned size, Addr addr,
                    Request::Flags flags, uint64_t *res) override;

+    Fault amoMem(Addr addr, uint8_t* data, unsigned size,
+                 Request::Flags flags, AtomicOpFunctor *amo_op) override
+    {
+        panic("AMO is not supported yet in CPU checker\n");
+    }
+
     unsigned int readStCondFailures() const override {
         return thread->readStCondFailures();
     }
diff --git a/src/cpu/exec_context.hh b/src/cpu/exec_context.hh
index 59d7414..7515f87 100644
--- a/src/cpu/exec_context.hh
+++ b/src/cpu/exec_context.hh
@@ -256,6 +256,28 @@
                            Request::Flags flags, uint64_t *res) = 0;

     /**
+     * For atomic-mode contexts, perform an AMO (atomic
+     * read-modify-write memory operation).
+     */
+    virtual Fault amoMem(Addr addr, uint8_t *data, unsigned int size,
+                         Request::Flags flags,
+                         AtomicOpFunctor *amo_op)
+    {
+        panic("ExecContext::amoMem() should be overridden\n");
+    }
+
+    /**
+     * For timing-mode contexts, initiate an AMO (atomic
+     * read-modify-write memory operation).
+     */
+    virtual Fault initiateMemAMO(Addr addr, unsigned int size,
+                                 Request::Flags flags,
+                                 AtomicOpFunctor *amo_op)
+    {
+        panic("ExecContext::initiateMemAMO() should be overridden\n");
+    }
+
+    /**
      * Sets the number of consecutive store conditional failures.
      */
     virtual void setStCondFailures(unsigned int sc_failures) = 0;
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index 6ac0df5..121fc68 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -108,7 +108,7 @@
                     Request::Flags flags) override
     {
         execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
-            size, addr, flags, NULL);
+            size, addr, flags, NULL, nullptr);
         return NoFault;
     }

@@ -117,7 +117,17 @@
              Request::Flags flags, uint64_t *res) override
     {
         execute.getLSQ().pushRequest(inst, false /* store */, data,
-            size, addr, flags, res);
+            size, addr, flags, res, nullptr);
+        return NoFault;
+    }
+
+    Fault
+    initiateMemAMO(Addr addr, unsigned int size, Request::Flags flags,
+                   AtomicOpFunctor *amo_op) override
+    {
+        // AMO requests are pushed through the store path
+        execute.getLSQ().pushRequest(inst, false /* amo */, nullptr,
+            size, addr, flags, nullptr, amo_op);
         return NoFault;
     }

diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index 7b76ca2..3436f57 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -330,6 +330,7 @@

     bool is_load = inst->staticInst->isLoad();
     bool is_store = inst->staticInst->isStore();
+    bool is_atomic = inst->staticInst->isAtomic();
     bool is_prefetch = inst->staticInst->isDataPrefetch();

     /* If true, the trace's predicate value will be taken from the exec
@@ -361,7 +362,7 @@
             *inst);

         fatal("Received error response packet for inst: %s\n", *inst);
-    } else if (is_store || is_load || is_prefetch) {
+    } else if (is_store || is_load || is_prefetch || is_atomic) {
         assert(packet);

         DPRINTF(MinorMem, "Memory response inst: %s addr: 0x%x size: %d\n",
diff --git a/src/cpu/minor/fetch2.cc b/src/cpu/minor/fetch2.cc
index ba898d9..edc1bab 100644
--- a/src/cpu/minor/fetch2.cc
+++ b/src/cpu/minor/fetch2.cc
@@ -418,7 +418,8 @@
                     // Collect some basic inst class stats
                     if (decoded_inst->isLoad())
                         loadInstructions++;
-                    else if (decoded_inst->isStore())
+                    else if (decoded_inst->isStore() ||
+                             decoded_inst->isAtomic())
                         storeInstructions++;
                     else if (decoded_inst->isVector())
                         vecInstructions++;
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index cb0611b..1293bd5 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -679,9 +679,9 @@
     while (ret == NoAddrRangeCoverage && i != slots.rend()) {
         LSQRequestPtr slot = *i;

-        /* Cache maintenance instructions go down via the store path *
-         * but they carry no data and they shouldn't be considered for
-         * forwarding */
+        /* Cache maintenance instructions go down via the store path but
+         * they carry no data and they shouldn't be considered
+         * for forwarding */
         if (slot->packet &&
             slot->inst->id.threadId == request->inst->id.threadId &&
             !slot->packet->req->isCacheMaintenance()) {
@@ -934,8 +934,9 @@
     bool is_load = request->isLoad;
     bool is_llsc = request->request.isLLSC();
     bool is_swap = request->request.isSwap();
+    bool is_atomic = request->request.isAtomic();
     bool bufferable = !(request->request.isStrictlyOrdered() ||
-        is_llsc || is_swap);
+                        is_llsc || is_swap || is_atomic);

     if (is_load) {
         if (numStoresInTransfers != 0) {
@@ -968,9 +969,16 @@
         if (storeBuffer.canForwardDataToLoad(request, forwarding_slot) !=
             NoAddrRangeCoverage)
         {
+            // There's at least one other request targeting the same
+            // address that is still in the storeBuffer. Since our
+            // request is non-bufferable (e.g., strictly ordered or
+            // atomic), we must wait for that request to leave the
+            // storeBuffer before we can issue this non-bufferable
+            // request. This makes sure the two requests access the
+            // cache in the correct order.
             DPRINTF(MinorMem, "Memory access can receive forwarded data"
-                " from the store buffer, need to wait for store buffer to"
-                " drain\n");
+                " from the store buffer, but need to wait for store buffer"
+                " to drain\n");
             return;
         }
     }
@@ -1478,9 +1486,21 @@
 void
 LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                  unsigned int size, Addr addr, Request::Flags flags,
-                 uint64_t *res)
+                 uint64_t *res, AtomicOpFunctor *amo_op)
 {
     bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
+
+    if (needs_burst && inst->staticInst->isAtomic()) {
+        // AMO requests that access across a cache line boundary are not
+        // allowed, since the cache does not guarantee that AMO ops are
+        // executed atomically in two cache lines.
+        // For ISAs such as x86 that require AMO operations to work on
+        // accesses that cross cache-line boundaries, the cache needs to
+        // be modified to support locking both cache lines to guarantee
+        // atomicity.
+        panic("Do not expect cross-cache-line atomic memory request\n");
+    }
+
     LSQRequestPtr request;

     /* Copy given data into the request.  The request will pass this to the
@@ -1489,15 +1509,16 @@

     DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:"
         " 0x%x%s lineWidth : 0x%x\n",
-        (isLoad ? "load" : "store"), addr, size, flags,
+        (isLoad ? "load" : "store/atomic"), addr, size, flags,
             (needs_burst ? " (needs burst)" : ""), lineWidth);

     if (!isLoad) {
-        /* request_data becomes the property of a ...DataRequest (see below)
+        /* Request_data becomes the property of a ...DataRequest (see below)
          *  and destroyed by its destructor */
         request_data = new uint8_t[size];
-        if (flags & Request::STORE_NO_DATA) {
-            /* For cache zeroing, just use zeroed data */
+        if (inst->staticInst->isAtomic() ||
+            (flags & Request::STORE_NO_DATA)) {
+            /* For atomic or store-no-data, just use zeroed data */
             std::memset(request_data, 0, size);
         } else {
             std::memcpy(request_data, data, size);
@@ -1520,7 +1541,7 @@
     request->request.setVirt(0 /* asid */,
         addr, size, flags, cpu.dataMasterId(),
         /* I've no idea why we need the PC, but give it */
-        inst->pc.instAddr());
+        inst->pc.instAddr(), amo_op);

     requests.push(request);
     request->startAddrTranslation();
diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh
index d4973f5..9d042b2 100644
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -696,11 +696,11 @@
     void completeMemBarrierInst(MinorDynInstPtr inst,
         bool committed);

-    /** Single interface for readMem/writeMem to issue requests into
+    /** Single interface for readMem/writeMem/amoMem to issue requests into
      *  the LSQ */
     void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                      unsigned int size, Addr addr, Request::Flags flags,
-                     uint64_t *res);
+                     uint64_t *res, AtomicOpFunctor *amo_op);

     /** Push a predicate failed-representing request into the queues just
      *  to maintain commit order */
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index d32493c..dc34051 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -1152,8 +1152,9 @@
        // Make sure we are only trying to commit un-executed instructions we
        // think are possible.
        assert(head_inst->isNonSpeculative() || head_inst->isStoreConditional()
-               || head_inst->isMemBarrier() || head_inst->isWriteBarrier() ||
-               (head_inst->isLoad() && head_inst->strictlyOrdered()));
+               || head_inst->isMemBarrier() || head_inst->isWriteBarrier()
+               || head_inst->isAtomic()
+               || (head_inst->isLoad() && head_inst->strictlyOrdered()));

         DPRINTF(Commit, "Encountered a barrier or non-speculative "
                 "instruction [sn:%lli] at the head of the ROB, PC %s.\n",
@@ -1300,7 +1301,7 @@
 #endif

     // If this was a store, record it for this cycle.
-    if (head_inst->isStore())
+    if (head_inst->isStore() || head_inst->isAtomic())
         committedStores[tid] = true;

     // Return true to indicate that we have committed an instruction.
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index 80d7adc..bd20a79 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -464,7 +464,8 @@
         if (skidBuffer[tid].front()->isLoad()) {
             toRename->iewInfo[tid].dispatchedToLQ++;
         }
-        if (skidBuffer[tid].front()->isStore()) {
+        if (skidBuffer[tid].front()->isStore() ||
+            skidBuffer[tid].front()->isAtomic()) {
             toRename->iewInfo[tid].dispatchedToSQ++;
         }

@@ -855,7 +856,8 @@
         if (insts[tid].front()->isLoad()) {
             toRename->iewInfo[tid].dispatchedToLQ++;
         }
-        if (insts[tid].front()->isStore()) {
+        if (insts[tid].front()->isStore() ||
+            insts[tid].front()->isAtomic()) {
             toRename->iewInfo[tid].dispatchedToSQ++;
         }

@@ -997,7 +999,7 @@
             if (inst->isLoad()) {
                 toRename->iewInfo[tid].dispatchedToLQ++;
             }
-            if (inst->isStore()) {
+            if (inst->isStore() || inst->isAtomic()) {
                 toRename->iewInfo[tid].dispatchedToSQ++;
             }

@@ -1023,7 +1025,8 @@
         }

         // Check LSQ if inst is LD/ST
-        if ((inst->isLoad() && ldstQueue.lqFull(tid)) ||
+        if ((inst->isAtomic() && ldstQueue.sqFull(tid)) ||
+            (inst->isLoad() && ldstQueue.lqFull(tid)) ||
             (inst->isStore() && ldstQueue.sqFull(tid))) {
             DPRINTF(IEW, "[tid:%i]: Issue: %s has become full.\n",tid,
                     inst->isLoad() ? "LQ" : "SQ");
@@ -1041,7 +1044,25 @@
         }

         // Otherwise issue the instruction just fine.
-        if (inst->isLoad()) {
+        if (inst->isAtomic()) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction "
+                    "encountered, adding to LSQ.\n", tid);
+
+            ldstQueue.insertStore(inst);
+
+            ++iewDispStoreInsts;
+
+            // AMOs need to be marked as "canCommit()" so that commit
+            // can process them when they reach the head of the ROB.
+            inst->setCanCommit();
+            instQueue.insertNonSpec(inst);
+            add_to_iq = false;
+
+            ++iewDispNonSpecInsts;
+
+            toRename->iewInfo[tid].dispatchedToSQ++;
+        } else if (inst->isLoad()) {
             DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction "
                     "encountered, adding to LSQ.\n", tid);

@@ -1236,7 +1257,20 @@
                     "reference.\n");

            // Tell the LDSTQ to execute this instruction (if it is a load).
-            if (inst->isLoad()) {
+            if (inst->isAtomic()) {
+                // AMOs are treated like store requests
+                fault = ldstQueue.executeStore(inst);
+
+                if (inst->isTranslationDelayed() &&
+                    fault == NoFault) {
+                    // A hw page table walk is currently going on; the
+                    // instruction must be deferred.
+                    DPRINTF(IEW, "Execute: Delayed translation, deferring "
+                            "store.\n");
+                    instQueue.deferMemInst(inst);
+                    continue;
+                }
+            } else if (inst->isLoad()) {
                // Loads will mark themselves as executed, and their writeback
                 // event adds the instruction to the queue to commit
                 fault = ldstQueue.executeLoad(inst);
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index f70f662..b2542002 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -1262,13 +1262,15 @@

             bool is_acq_rel = squashed_inst->isMemBarrier() &&
                          (squashed_inst->isLoad() ||
-                           (squashed_inst->isStore() &&
+                          squashed_inst->isAtomic() ||
+                          (squashed_inst->isStore() &&
                              !squashed_inst->isStoreConditional()));

             // Remove the instruction from the dependency list.
             if (is_acq_rel ||
                 (!squashed_inst->isNonSpeculative() &&
                  !squashed_inst->isStoreConditional() &&
+                 !squashed_inst->isAtomic() &&
                  !squashed_inst->isMemBarrier() &&
                  !squashed_inst->isWriteBarrier())) {

diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index a2813b3..fdbbdd8 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -661,6 +661,9 @@

         assert(storeQueue[store_idx].inst->effAddrValid());

+        // is the store an Atomic?
+        bool isStoreAtomic = storeQueue[store_idx].inst->isAtomic();
+
         // Check if the store data is within the lower and upper bounds of
         // addresses that the request needs.
         bool store_has_lower_limit =
@@ -675,9 +678,10 @@
             (req->getVaddr() + req->getSize()) >
             storeQueue[store_idx].inst->effAddr;

-        // If the store's data has all of the data needed and the load isn't
-        // LLSC, we can forward.
-        if (store_has_lower_limit && store_has_upper_limit && !req->isLLSC()) {
+        // If the store's data has all of the data needed, the load isn't
+        // LLSC and the store isn't atomic, we can forward.
+        if (store_has_lower_limit && store_has_upper_limit &&
+            !req->isLLSC() && !isStoreAtomic) {
             // Get shift amount for offset into the store's data.
            int shift_amt = req->getVaddr() - storeQueue[store_idx].inst->effAddr;

@@ -719,11 +723,24 @@
                 (lower_load_has_store_part && upper_load_has_store_part))) ||
                 (req->isLLSC() &&
                  ((store_has_lower_limit || upper_load_has_store_part) &&
-                  (store_has_upper_limit || lower_load_has_store_part)))) {
-            // This is the partial store-load forwarding case where a store
-            // has only part of the load's data and the load isn't LLSC or
-            // the load is LLSC and the store has all or part of the load's
+                  (store_has_upper_limit || lower_load_has_store_part))) ||
+                (isStoreAtomic &&
+                 ((store_has_lower_limit || upper_load_has_store_part) &&
+                  (store_has_upper_limit || lower_load_has_store_part)))
+        ) {
+            // There are 3 scenarios here:
+            // (1) This is the partial store-load forwarding case where a
+            //     store has only part of the load's data and the load
+            //     isn't LLSC
+            // (2) The load is LLSC and the store has all or part of the
+            //     load's data
+            // (3) The store is atomic and has all or part of the load's
+            //     data
+            // If one of the 3 scenarios holds, we must check whether the
+            // store in the storeQueue has completed (i.e., its data has
+            // appeared in the cache). If not, we must stall this load to
+            // maintain correct ordering. In the third case, the atomic
+            // has not been executed yet, so no data forwarding is
+            // allowed.

             // If it's already been written back, then don't worry about
             // stalling on it.
@@ -908,8 +925,8 @@
         storeQueue[store_idx].isSplit = true;
     }

-    if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO) && \
-        !req->isCacheMaintenance())
+    if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO) &&
+        !req->isCacheMaintenance() && !req->isAtomic())
         memcpy(storeQueue[store_idx].data, data, size);

     // This function only writes the data to the store queue, so no fault
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index ca6a7f3..6a9c25a 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -116,9 +116,10 @@
     assert(!cpu->switchedOut());
     if (!inst->isSquashed()) {
         if (!state->noWB) {
-            // Only loads and store conditionals perform the writeback
+            // Only loads, store conditionals and AMOs perform the writeback
             // after receving the response from the memory
-            assert(inst->isLoad() || inst->isStoreConditional());
+            assert(inst->isLoad() || inst->isStoreConditional() ||
+                   inst->isAtomic());
             if (!TheISA::HasUnalignedMemAcc || !state->isSplit ||
                 !state->isLoad) {
                 writeback(inst, pkt);
@@ -127,7 +128,7 @@
             }
         }

-        if (inst->isStore()) {
+        if (inst->isStore() || inst->isAtomic()) {
             completeStore(state->idx);
         }
     }
@@ -345,7 +346,7 @@
 {
     assert(inst->isMemRef());

-    assert(inst->isLoad() || inst->isStore());
+    assert(inst->isLoad() || inst->isStore() || inst->isAtomic());

     if (inst->isLoad()) {
         insertLoad(inst);
@@ -695,8 +696,8 @@

     assert(store_fault == NoFault);

-    if (store_inst->isStoreConditional()) {
-        // Store conditionals need to set themselves as able to
+    if (store_inst->isStoreConditional() || store_inst->isAtomic()) {
+        // Store conditionals and Atomics need to set themselves as able to
         // writeback if we haven't had a fault by here.
         storeQueue[store_idx].canWB = true;

@@ -905,8 +906,10 @@
                 incrStIdx(storeWBIdx);
                 continue;
             }
-        } else {
-            // Non-store conditionals do not need a writeback.
+        } else if (!inst->isAtomic()) {
+            // Stores other than store conditionals and atomics do not
+            // need a writeback.
+            // For atomics, writeback is handled in completeDataAccess().
             state->noWB = true;
         }

diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh
index 376198f..74a5a6e 100644
--- a/src/cpu/o3/mem_dep_unit_impl.hh
+++ b/src/cpu/o3/mem_dep_unit_impl.hh
@@ -191,11 +191,11 @@
     // Check any barriers and the dependence predictor for any
     // producing memrefs/stores.
     InstSeqNum producing_store;
-    if (inst->isLoad() && loadBarrier) {
+    if ((inst->isLoad() || inst->isAtomic()) && loadBarrier) {
         DPRINTF(MemDepUnit, "Load barrier [sn:%lli] in flight\n",
                 loadBarrierSN);
         producing_store = loadBarrierSN;
-    } else if (inst->isStore() && storeBarrier) {
+    } else if ((inst->isStore() || inst->isAtomic()) && storeBarrier) {
         DPRINTF(MemDepUnit, "Store barrier [sn:%lli] in flight\n",
                 storeBarrierSN);
         producing_store = storeBarrierSN;
@@ -252,8 +252,8 @@
         }
     }

-    if (inst->isStore()) {
-        DPRINTF(MemDepUnit, "Inserting store PC %s [sn:%lli].\n",
+    if (inst->isStore() || inst->isAtomic()) {
+        DPRINTF(MemDepUnit, "Inserting store/atomic PC %s [sn:%lli].\n",
                 inst->pcState(), inst->seqNum);

        depPred.insertStore(inst->instAddr(), inst->seqNum, inst->threadNumber);
@@ -288,8 +288,8 @@

     // Might want to turn this part into an inline function or something.
     // It's shared between both insert functions.
-    if (inst->isStore()) {
-        DPRINTF(MemDepUnit, "Inserting store PC %s [sn:%lli].\n",
+    if (inst->isStore() || inst->isAtomic()) {
+        DPRINTF(MemDepUnit, "Inserting store/atomic PC %s [sn:%lli].\n",
                 inst->pcState(), inst->seqNum);

        depPred.insertStore(inst->instAddr(), inst->seqNum, inst->threadNumber);
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index bc024f6..9bc7ae7 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -611,7 +611,7 @@
             }
         }

-        if (inst->isStore()) {
+        if (inst->isStore() || inst->isAtomic()) {
             if (calcFreeSQEntries(tid) <= 0) {
                DPRINTF(Rename, "[tid:%u]: Cannot rename due to no free SQ\n");
                 source = SQ;
@@ -704,12 +704,12 @@

         renameDestRegs(inst, inst->threadNumber);

-        if (inst->isLoad()) {
-                loadsInProgress[tid]++;
+        if (inst->isAtomic() || inst->isStore()) {
+            storesInProgress[tid]++;
+        } else if (inst->isLoad()) {
+            loadsInProgress[tid]++;
         }
-        if (inst->isStore()) {
-                storesInProgress[tid]++;
-        }
+
         ++renamed_insts;
        // Notify potential listeners that source and destination registers for
         // this instruction have been renamed.
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index bc7670b..2499a9d 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -72,6 +72,7 @@
     ifetch_req.setContext(cid);
     data_read_req.setContext(cid);
     data_write_req.setContext(cid);
+    data_amo_req.setContext(cid);
 }

 AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
@@ -532,6 +533,80 @@
     }
 }

+Fault
+AtomicSimpleCPU::amoMem(Addr addr, uint8_t* data, unsigned size,
+                        Request::Flags flags, AtomicOpFunctor *amo_op)
+{
+    SimpleExecContext& t_info = *threadInfo[curThread];
+    SimpleThread* thread = t_info.thread;
+
+    // use the CPU's statically allocated AMO request object
+    Request *req = &data_amo_req;
+
+    if (traceData)
+        traceData->setMem(addr, size, flags);
+
+    // The address of the second part of this access if it needs to be
+    // split across a cache line boundary.
+    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
+
+    // AMO requests that access across a cache line boundary are not
+    // allowed, since the cache does not guarantee that AMO ops are
+    // executed atomically in two cache lines.
+    // For ISAs such as x86 that require AMO operations to work on
+    // accesses that cross cache-line boundaries, the cache needs to be
+    // modified to support locking both cache lines to guarantee
+    // atomicity.
+    assert(secondAddr <= addr);
+
+    dcache_latency = 0;
+
+    req->taskId(taskId());
+    req->setVirt(0, addr, size, flags, dataMasterId(),
+                 thread->pcState().instAddr(), amo_op);
+
+    // translate to physical address
+    Fault fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                               BaseTLB::Write);
+
+    // Now do the access.
+    if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) {
+        // We treat AMO accesses as write accesses with a SwapReq command;
+        // data will hold the return data of the AMO access.
+        Packet pkt(req, Packet::makeWriteCmd(req));
+        pkt.dataStatic(data);
+
+        if (req->isMmappedIpr())
+            dcache_latency += TheISA::handleIprRead(thread->getTC(), &pkt);
+        else {
+            if (fastmem && system->isMemAddr(pkt.getAddr()))
+                system->getPhysMem().access(&pkt);
+            else
+                dcache_latency += dcachePort.sendAtomic(&pkt);
+        }
+
+        dcache_access = true;
+
+        assert(!pkt.isError());
+        assert(!req->isLLSC());
+    }
+
+    if (fault != NoFault && req->isPrefetch()) {
+        return NoFault;
+    }
+
+    // If there's a fault and we're not doing a prefetch, return it.
+    return fault;
+}
+
+Fault
+AtomicSimpleCPU::initiateMemAMO(Addr addr, unsigned size,
+                                Request::Flags flags,
+                                AtomicOpFunctor *amo_op)
+{
+    panic("initiateMemAMO() is for timing accesses, and should "
+          "never be called on AtomicSimpleCPU.\n");
+}

 void
 AtomicSimpleCPU::tick()
@@ -548,6 +623,7 @@
         ifetch_req.setContext(cid);
         data_read_req.setContext(cid);
         data_write_req.setContext(cid);
+        data_amo_req.setContext(cid);
     }

     SimpleExecContext& t_info = *threadInfo[curThread];
diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh
index c9dd954..af057a2 100644
--- a/src/cpu/simple/atomic.hh
+++ b/src/cpu/simple/atomic.hh
@@ -162,6 +162,7 @@
     Request ifetch_req;
     Request data_read_req;
     Request data_write_req;
+    Request data_amo_req;

     bool dcache_access;
     Tick dcache_latency;
@@ -202,6 +203,12 @@
     Fault writeMem(uint8_t *data, unsigned size,
                    Addr addr, Request::Flags flags, uint64_t *res) override;

+    Fault amoMem(Addr addr, uint8_t* data, unsigned size,
+                 Request::Flags flags, AtomicOpFunctor *amo_op) override;
+
+    Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
+                         AtomicOpFunctor *amo_op) override;
+
     void regProbePoints() override;

     /**
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index 36a2cb0..e81c0f8 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -644,7 +644,7 @@
         t_info.numLoadInsts++;
     }

-    if (curStaticInst->isStore()){
+    if (curStaticInst->isStore() || curStaticInst->isAtomic()){
         t_info.numStoreInsts++;
     }
     /* End power model statistics */
diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh
index 15ab2ab..81de231 100644
--- a/src/cpu/simple/base.hh
+++ b/src/cpu/simple/base.hh
@@ -151,6 +151,14 @@
     virtual Fault writeMem(uint8_t* data, unsigned size, Addr addr,
                            Request::Flags flags, uint64_t* res) = 0;

+    virtual Fault amoMem(Addr addr, uint8_t* data, unsigned size,
+                         Request::Flags flags,
+                         AtomicOpFunctor *amo_op) = 0;
+
+    virtual Fault initiateMemAMO(Addr addr, unsigned size,
+                                 Request::Flags flags,
+                                 AtomicOpFunctor *amo_op) = 0;
+
     void countInst();
     Counter totalInsts() const override;
     Counter totalOps() const override;
diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh
index 13c44ac..969d80f 100644
--- a/src/cpu/simple/exec_context.hh
+++ b/src/cpu/simple/exec_context.hh
@@ -412,7 +412,6 @@
         thread->pcState(val);
     }

-
     Fault readMem(Addr addr, uint8_t *data, unsigned int size,
                   Request::Flags flags) override
     {
@@ -431,6 +430,19 @@
         return cpu->writeMem(data, size, addr, flags, res);
     }

+    Fault amoMem(Addr addr, uint8_t *data, unsigned int size,
+                 Request::Flags flags, AtomicOpFunctor *amo_op) override
+    {
+        return cpu->amoMem(addr, data, size, flags, amo_op);
+    }
+
+    Fault initiateMemAMO(Addr addr, unsigned int size,
+                         Request::Flags flags,
+                         AtomicOpFunctor *amo_op) override
+    {
+        return cpu->initiateMemAMO(addr, size, flags, amo_op);
+    }
+
     /**
      * Sets the number of consecutive store conditional failures.
      */
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index 083de2b..1404f27 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -293,6 +293,7 @@

     PacketPtr pkt = buildPacket(req, read);
     pkt->dataDynamic<uint8_t>(data);
+
     if (req->getFlags().isSet(Request::NO_ACCESS)) {
         assert(!dcache_pkt);
         pkt->makeResponse();
@@ -554,6 +555,60 @@
     return NoFault;
 }

+Fault
+TimingSimpleCPU::amoMem(Addr addr, uint8_t* data, unsigned size,
+                        Request::Flags flags, AtomicOpFunctor *amo_op)
+{
+    panic("amoMem() is for atomic accesses, and should "
+          "never be called on TimingSimpleCPU.\n");
+}
+
+Fault
+TimingSimpleCPU::initiateMemAMO(Addr addr, unsigned size,
+                                Request::Flags flags,
+                                AtomicOpFunctor *amo_op)
+{
+    SimpleExecContext &t_info = *threadInfo[curThread];
+    SimpleThread* thread = t_info.thread;
+
+    Fault fault;
+    const int asid = 0;
+    const Addr pc = thread->instAddr();
+    unsigned block_size = cacheLineSize();
+    BaseTLB::Mode mode = BaseTLB::Write;
+
+    if (traceData)
+        traceData->setMem(addr, size, flags);
+
+    RequestPtr req = new Request(asid, addr, size, flags, dataMasterId(),
+                                 pc, thread->contextId(), amo_op);
+
+    assert(req->hasAtomicOpFunctor());
+
+    req->taskId(taskId());
+
+    Addr split_addr = roundDown(addr + size - 1, block_size);
+
+    // AMO requests that access across a cache line boundary are not
+    // allowed, since the cache does not guarantee that AMO ops are
+    // executed atomically in two cache lines.
+    // For ISAs such as x86 that require AMO operations to work on
+    // accesses that cross cache-line boundaries, the cache needs to be
+    // modified to support locking both cache lines to guarantee
+    // atomicity.
+    assert(split_addr <= addr);
+
+    _status = DTBWaitResponse;
+
+    WholeTranslationState *state =
+        new WholeTranslationState(req, new uint8_t[size], NULL, mode);
+    DataTranslation<TimingSimpleCPU *> *translation
+        = new DataTranslation<TimingSimpleCPU *>(this, state);
+    thread->dtb->translateTiming(req, thread->getTC(), translation, mode);
+
+    return NoFault;
+}
+
 void
 TimingSimpleCPU::threadSnoop(PacketPtr pkt, ThreadID sender)
 {
diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh
index 8498630..94a238b 100644
--- a/src/cpu/simple/timing.hh
+++ b/src/cpu/simple/timing.hh
@@ -288,6 +288,12 @@
     Fault writeMem(uint8_t *data, unsigned size,
                    Addr addr, Request::Flags flags, uint64_t *res) override;

+    Fault amoMem(Addr addr, uint8_t* data, unsigned size,
+                 Request::Flags flags, AtomicOpFunctor *amo_op) override;
+
+    Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
+                         AtomicOpFunctor *amo_op) override;
+
     void fetch();
     void sendFetch(const Fault &fault, RequestPtr req, ThreadContext *tc);
     void completeIfetch(PacketPtr );

--
To view, visit https://gem5-review.googlesource.com/8188
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: master
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib8a7c81868ac05b98d73afc7d16eb88486f8cf9a
Gerrit-Change-Number: 8188
Gerrit-PatchSet: 1
Gerrit-Owner: Tuan Ta <q...@cornell.edu>