# HG changeset patch
# User Timothy M. Jones <[email protected]>
# Date 1257772288 0
# Node ID 1c63ee4b8afa271d3ae645419e37913bbb97fe6b
# Parent  da27e67385cca6cf4dd6d18cdead5cfd54559afb
O3CPU: Split loads and stores that cross cache line boundaries.

When each load or store is sent to the LSQ, we check whether it will cross a
cache line boundary and, if so, split it in two. This creates two TLB
translations and two memory requests. Care has to be taken if the first
packet of a split load is sent but the second blocks the cache. Similarly,
for a store, if the first packet cannot be sent, we must store the second
one somewhere to retry later.

This modifies the DataTranslation class to support split translations. It
also adds state into the LSQSenderState class to record both packets in a
split load or store.

diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -131,13 +131,21 @@
     template <class T>
     Fault write(T data, Addr addr, unsigned flags, uint64_t *res);
 
-    /** Initiate a DTB address translation. */
+    /** Splits a request in two if it crosses a dcache block. */
+    bool splitRequest(RequestPtr req, RequestPtr &sreqLow,
+                      RequestPtr &sreqHigh);
+
+    /** Initiates a DTB translation for a single request. */
     void initiateTranslation(RequestPtr req, uint64_t *res,
                              BaseTLB::Mode mode);
 
+    /** Initiates a DTB translation for a split request. */
+    void initiateSplitTranslation(RequestPtr req, RequestPtr sreqLow,
+                                  RequestPtr sreqHigh, uint64_t *res,
+                                  BaseTLB::Mode mode);
+
     /** Finish a DTB address translation. */
-    void finishTranslation(Fault translate_fault, RequestPtr &req,
-                           uint64_t *res, bool read);
+    void finishTranslation(WholeTranslationState *state);
 
     void prefetch(Addr addr, unsigned flags);
     void writeHint(Addr addr, int size, unsigned flags);
@@ -871,12 +879,24 @@
     Request *req = new Request(asid, addr, sizeof(T), flags, this->PC,
                                thread->contextId(), threadNumber);
 
-    initiateTranslation(req, NULL, BaseTLB::Read);
+    BaseTLB::Mode mode = BaseTLB::Read;
+    Request *sreqLow = NULL;
+    Request *sreqHigh = NULL;
+
+    bool isSplit = splitRequest(req, sreqLow, sreqHigh);
+    if (!isSplit) {
+        initiateTranslation(req, NULL, mode);
+    } else {
+        initiateSplitTranslation(req, sreqLow, sreqHigh, NULL, mode);
+    }
 
     effAddr = req->getVaddr();
     effAddrValid = true;
     if (fault == NoFault) {
-        cpu->read(req, data, lqIdx);
+        if (!isSplit)
+            cpu->read(req, data, lqIdx);
+        else
+            cpu->read(true, req, sreqLow, sreqHigh, data, lqIdx);
     } else {
 
         // Return a fixed value to keep simulation deterministic even
@@ -910,48 +930,103 @@
     Request *req = new Request(asid, addr, sizeof(T), flags, this->PC,
                                thread->contextId(), threadNumber);
 
-    initiateTranslation(req, res, BaseTLB::Write);
+    BaseTLB::Mode mode = BaseTLB::Write;
+    Request *sreqLow = NULL;
+    Request *sreqHigh = NULL;
+
+    bool isSplit = splitRequest(req, sreqLow, sreqHigh);
+    if (!isSplit) {
+        initiateTranslation(req, res, mode);
+    } else {
+        initiateSplitTranslation(req, sreqLow, sreqHigh, res, mode);
+    }
 
     effAddr = req->getVaddr();
     effAddrValid = true;
     if (fault == NoFault) {
-        cpu->write(req, data, sqIdx);
+        if (!isSplit)
+            cpu->write(req, data, sqIdx);
+        else
+            cpu->write(true, req, sreqLow, sreqHigh, data, sqIdx);
     }
 
     return fault;
 }
 
 template<class Impl>
+inline bool
+BaseDynInst<Impl>::splitRequest(RequestPtr req, RequestPtr &sreqLow,
+                                RequestPtr &sreqHigh)
+{
+    // Check to see if the request crosses the next level block boundary.
+    unsigned block_size = cpu->getDcachePort()->peerBlockSize();
+    Addr addr = req->getVaddr();
+    Addr split_addr = roundDown(addr + req->getSize() - 1, block_size);
+    assert(split_addr <= addr || split_addr - addr < block_size);
+
+    // Spans two blocks.
+    if (split_addr > addr) {
+        req->splitOnVaddr(split_addr, sreqLow, sreqHigh);
+        return true;
+    } else {
+        return false;
+    }
+}
+
+template<class Impl>
 inline void
 BaseDynInst<Impl>::initiateTranslation(RequestPtr req, uint64_t *res,
                                        BaseTLB::Mode mode)
 {
+    WholeTranslationState *state =
+        new WholeTranslationState(req, res, mode);
     DataTranslation<Impl> *trans =
-        new DataTranslation<Impl>(this, res, mode);
+        new DataTranslation<Impl>(this, state);
     cpu->dtb->translateTiming(req, thread->getTC(), trans, mode);
 }
 
 template<class Impl>
 inline void
-BaseDynInst<Impl>::finishTranslation(Fault translate_fault, RequestPtr &req,
-                                     uint64_t *res, bool read)
+BaseDynInst<Impl>::initiateSplitTranslation(RequestPtr req, RequestPtr sreqLow,
+                                            RequestPtr sreqHigh, uint64_t *res,
+                                            BaseTLB::Mode mode)
 {
-    fault = translate_fault;
-    if (req->isUncacheable())
+    // Set up the translation state.
+    WholeTranslationState *state =
+        new WholeTranslationState(req, sreqLow, sreqHigh, res, mode);
+    DataTranslation<Impl> *stransLow =
+        new DataTranslation<Impl>(this, state, 0);
+    DataTranslation<Impl> *stransHigh =
+        new DataTranslation<Impl>(this, state, 1);
+
+    // Perform the translation.
+    cpu->dtb->translateTiming(sreqLow, thread->getTC(), stransLow, mode);
+    cpu->dtb->translateTiming(sreqHigh, thread->getTC(), stransHigh, mode);
+}
+
+template<class Impl>
+inline void
+BaseDynInst<Impl>::finishTranslation(WholeTranslationState *state)
+{
+    fault = state->getFault();
+
+    if (state->isUncacheable())
         isUncacheable = true;
 
     if (fault == NoFault) {
-        physEffAddr = req->getPaddr();
-        memReqFlags = req->getFlags();
+        physEffAddr = state->getPaddr();
+        memReqFlags = state->getFlags();
 
-        if (req->isCondSwap()) {
-            assert(res);
-            req->setExtraData(*res);
+        if (state->mainReq->isCondSwap()) {
+            assert(!state->isSplit);
+            assert(state->res);
+            state->mainReq->setExtraData(*state->res);
         }
 
-    } else {
-        delete req;
+    } else {
+        state->deleteReqs();
     }
+    delete state;
 }
 
 #endif // __CPU_BASE_DYN_INST_HH__
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -709,6 +709,15 @@
         return this->iew.ldstQueue.read(req, data, load_idx);
     }
 
+    /** CPU split read function, forwards read to LSQ. */
+    template <class T>
+    Fault read(bool isSplit, RequestPtr &req, RequestPtr &sreqLow,
+               RequestPtr &sreqHigh, T &data, int load_idx)
+    {
+        return this->iew.ldstQueue.read(isSplit, req, sreqLow, sreqHigh,
+                                        data, load_idx);
+    }
+
     /** CPU write function, forwards write to LSQ. */
     template <class T>
     Fault write(RequestPtr &req, T &data, int store_idx)
@@ -716,6 +725,18 @@
         return this->iew.ldstQueue.write(req, data, store_idx);
     }
 
+    /** CPU split write function, forwards write to LSQ. */
+    template <class T>
+    Fault write(bool isSplit, RequestPtr &req, RequestPtr &sreqLow,
+                RequestPtr &sreqHigh, T &data, int store_idx)
+    {
+        return this->iew.ldstQueue.write(isSplit, req, sreqLow, sreqHigh,
+                                         data, store_idx);
+    }
+
+    /** Get the dcache port (used to find block size for translations). */
+    Port *getDcachePort() { return this->iew.ldstQueue.getDcachePort(); }
+
     Addr lockAddr;
 
     /** Temporary fix for the lock flag, works in the UP case. */
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -274,12 +274,26 @@
     template <class T>
     Fault read(RequestPtr req, T &data, int load_idx);
 
+    /** Executes a read operation, using the load specified at the load index
+     * with a request split into separate low and high requests.
+     */
+    template <class T>
+    Fault read(bool isSplit, RequestPtr req, RequestPtr sreqLow,
+               RequestPtr sreqHigh, T &data, int load_idx);
+
     /** Executes a store operation, using the store specified at the store
      *   index.
      */
     template <class T>
     Fault write(RequestPtr req, T &data, int store_idx);
 
+    /** Executes a store operation, using the store specified at the store
+     * index with a request split into separate low and high requests.
+     */
+    template <class T>
+    Fault write(bool isSplit, RequestPtr req, RequestPtr sreqLow,
+                RequestPtr sreqHigh, T &data, int store_idx);
+
     /** The CPU pointer. */
     O3CPU *cpu;
 
@@ -379,6 +393,17 @@
 template <class Impl>
 template <class T>
 Fault
+LSQ<Impl>::read(bool isSplit, RequestPtr req, RequestPtr sreqLow,
+                RequestPtr sreqHigh, T &data, int load_idx)
+{
+    ThreadID tid = req->threadId();
+
+    return thread[tid].read(isSplit, req, sreqLow, sreqHigh, data, load_idx);
+}
+
+template <class Impl>
+template <class T>
+Fault
 LSQ<Impl>::write(RequestPtr req, T &data, int store_idx)
 {
     ThreadID tid = req->threadId();
@@ -386,4 +411,15 @@
     return thread[tid].write(req, data, store_idx);
 }
 
+template <class Impl>
+template <class T>
+Fault
+LSQ<Impl>::write(bool isSplit, RequestPtr req, RequestPtr sreqLow,
+                 RequestPtr sreqHigh, T &data, int store_idx)
+{
+    ThreadID tid = req->threadId();
+
+    return thread[tid].write(isSplit, req, sreqLow, sreqHigh, data, store_idx);
+}
+
 #endif // __CPU_O3_LSQ_HH__
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -216,12 +216,18 @@
     /** Writes back the instruction, sending it to IEW. */
     void writeback(DynInstPtr &inst, PacketPtr pkt);
 
+    /** Writes back a store that couldn't be completed the previous cycle. */
+    void writebackPendingStore();
+
     /** Handles completing the send of a store to memory. */
     void storePostSend(PacketPtr pkt);
 
     /** Completes the store at the specified index. */
     void completeStore(int store_idx);
 
+    /** Attempts to send a store to the cache. */
+    bool sendStore(PacketPtr data_pkt);
+
     /** Increments the given store index (circular queue). */
     inline void incrStIdx(int &store_idx);
     /** Decrements the given store index (circular queue). */
@@ -254,7 +260,8 @@
       public:
         /** Default constructor. */
         LSQSenderState()
-            : noWB(false)
+            : noWB(false), isSplit(false), pktToSend(false), outstanding(1),
+              mainPkt(NULL), pendingPacket(NULL)
         { }
 
         /** Instruction who initiated the access to memory. */
@@ -265,6 +272,19 @@
         int idx;
         /** Whether or not the instruction will need to writeback. */
         bool noWB;
+        /** Whether or not this access is split in two. */
+        bool isSplit;
+        /** Whether or not there is a packet that needs sending. */
+        bool pktToSend;
+        /** Number of outstanding packets to complete. */
+        int outstanding;
+        /** The main packet from a split load, used during writeback. */
+        PacketPtr mainPkt;
+        /** A second packet from a split store that needs sending. */
+        PacketPtr pendingPacket;
+
+        /** Completes a packet and returns whether the access is finished. */
+        inline bool complete() { return --outstanding == 0; }
     };
 
     /** Writeback event, specifically for when stores forward data to loads. */
@@ -302,8 +322,8 @@
 
         /** Constructs a store queue entry for a given instruction. */
         SQEntry(DynInstPtr &_inst)
-            : inst(_inst), req(NULL), size(0),
-              canWB(0), committed(0), completed(0)
+            : inst(_inst), req(NULL), sreqLow(NULL), sreqHigh(NULL), size(0),
+              isSplit(0), canWB(0), committed(0), completed(0)
         {
             std::memset(data, 0, sizeof(data));
         }
@@ -312,10 +332,15 @@
         DynInstPtr inst;
         /** The request for the store. */
         RequestPtr req;
+        /** The split requests for the store. */
+        RequestPtr sreqLow;
+        RequestPtr sreqHigh;
         /** The size of the store. */
         int size;
         /** The store data. */
         char data[sizeof(IntReg)];
+        /** Whether or not the store is split into two requests. */
+        bool isSplit;
         /** Whether or not the store can writeback. */
         bool canWB;
         /** Whether or not the store is committed. */
@@ -406,6 +431,13 @@
     /** The oldest load that caused a memory ordering violation. */
     DynInstPtr memDepViolator;
 
+    /** Whether or not there is a packet that couldn't be sent because of
+     * a lack of cache ports. */
+    bool hasPendingPkt;
+
+    /** The packet that is pending free cache ports. */
+    PacketPtr pendingPkt;
+
     // Will also need how many read/write ports the Dcache has.  Or keep track
     // of that in stage that is one level up, and only call executeLoad/Store
     // the appropriate number of times.
@@ -445,10 +477,20 @@
     template <class T>
     Fault read(Request *req, T &data, int load_idx);
 
+    /** Executes the split load at the given index. */
+    template <class T>
+    Fault read(bool isSplit, Request *req, Request *sreqLow,
+               Request *sreqHigh, T &data, int load_idx);
+
     /** Executes the store at the given index. */
     template <class T>
     Fault write(Request *req, T &data, int store_idx);
 
+    /** Executes the split store at the given index. */
+    template <class T>
+    Fault write(bool isSplit, Request *req, Request *sreqLow,
+                Request *sreqHigh, T &data, int store_idx);
+
     /** Returns the index of the head load instruction. */
     int getLoadHead() { return loadHead; }
     /** Returns the sequence number of the head load instruction. */
@@ -484,6 +526,15 @@
 Fault
 LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
 {
+    return read(false, req, NULL, NULL, data, load_idx);
+}
+
+template <class Impl>
+template <class T>
+Fault
+LSQUnit<Impl>::read(bool isSplit, Request *req, Request *sreqLow,
+                    Request *sreqHigh, T &data, int load_idx)
+{
     DynInstPtr load_inst = loadQueue[load_idx];
 
     assert(load_inst);
@@ -503,6 +554,10 @@
         // memory.  This is quite ugly.  @todo: Figure out the proper
         // place to really handle request deletes.
         delete req;
+        if (isSplit) {
+            delete sreqLow;
+            delete sreqHigh;
+        }
         return TheISA::genMachineCheckFault();
     }
 
@@ -512,10 +567,12 @@
     int store_size = 0;
 
     DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
-            "storeHead: %i addr: %#x\n",
-            load_idx, store_idx, storeHead, req->getPaddr());
+            "storeHead: %i addr: %#x%s\n",
+            load_idx, store_idx, storeHead, req->getPaddr(),
+            isSplit ? " split" : "");
 
     if (req->isLLSC()) {
+        assert(!isSplit);
         // Disable recording the result temporarily.  Writing to misc
         // regs normally updates the result, but this is not the
         // desired behavior when handling store conditionals.
@@ -587,6 +644,12 @@
             // @todo: Need to make this a parameter.
             cpu->schedule(wb, curTick);
 
+            // Don't need to do anything special for split loads.
+            if (isSplit) {
+                delete sreqLow;
+                delete sreqHigh;
+            }
+
             ++lsqForwLoads;
             return NoFault;
         } else if ((store_has_lower_limit && lower_load_has_store_part) ||
@@ -630,6 +693,10 @@
             // memory.  This is quite ugly.  @todo: Figure out the
             // proper place to really handle request deletes.
             delete req;
+            if (isSplit) {
+                delete sreqLow;
+                delete sreqHigh;
+            }
 
             return NoFault;
         }
@@ -645,12 +712,14 @@
     ++usedPorts;
 
     // if we the cache is not blocked, do cache access
+    bool completedFirst = false;
     if (!lsq->cacheBlocked()) {
-        PacketPtr data_pkt =
-            new Packet(req,
-                       (req->isLLSC() ?
-                        MemCmd::LoadLockedReq : MemCmd::ReadReq),
-                       Packet::Broadcast);
+        MemCmd command =
+            req->isLLSC() ? MemCmd::LoadLockedReq : MemCmd::ReadReq;
+        PacketPtr data_pkt = new Packet(req, command, Packet::Broadcast);
+        PacketPtr fst_data_pkt = NULL;
+        PacketPtr snd_data_pkt = NULL;
+
         data_pkt->dataStatic(load_inst->memData);
 
         LSQSenderState *state = new LSQSenderState;
@@ -659,18 +728,66 @@
         state->inst = load_inst;
         data_pkt->senderState = state;
 
-        if (!dcachePort->sendTiming(data_pkt)) {
+        if (!isSplit) {
+
+            // Point the first packet at the main data packet.
+            fst_data_pkt = data_pkt;
+        } else {
+
+            // Create the split packets.
+            fst_data_pkt = new Packet(sreqLow, command, Packet::Broadcast);
+            snd_data_pkt = new Packet(sreqHigh, command, Packet::Broadcast);
+
+            fst_data_pkt->dataStatic(load_inst->memData);
+            snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize());
+
+            fst_data_pkt->senderState = state;
+            snd_data_pkt->senderState = state;
+
+            state->isSplit = true;
+            state->outstanding = 2;
+            state->mainPkt = data_pkt;
+        }
+
+        if (!dcachePort->sendTiming(fst_data_pkt)) {
             // Delete state and data packet because a load retry
             // initiates a pipeline restart; it does not retry.
             delete state;
             delete data_pkt->req;
             delete data_pkt;
+            if (isSplit) {
+                delete fst_data_pkt->req;
+                delete fst_data_pkt;
+                delete snd_data_pkt->req;
+                delete snd_data_pkt;
+            }
 
             req = NULL;
 
             // If the access didn't succeed, tell the LSQ by setting
             // the retry thread id.
             lsq->setRetryTid(lsqID);
+        } else if (isSplit) {
+            completedFirst = true;
+
+            // The first packet was sent without problems, so send this one
+            // too. If there is a problem with this packet then the whole
+            // load will be squashed, so indicate this to the state object.
+            // The first packet will return in completeDataAccess and be
+            // handled there.
+            ++usedPorts;
+            if (!dcachePort->sendTiming(snd_data_pkt)) {
+
+                // The main packet will be deleted in completeDataAccess.
+                delete snd_data_pkt->req;
+                delete snd_data_pkt;
+
+                state->complete();
+
+                req = NULL;
+
+                lsq->setRetryTid(lsqID);
+            }
         }
     }
 
@@ -679,6 +796,10 @@
     if (lsq->cacheBlocked()) {
         if (req)
             delete req;
+        if (isSplit && !completedFirst) {
+            delete sreqLow;
+            delete sreqHigh;
+        }
 
         ++lsqCacheBlocked;
 
@@ -705,6 +826,15 @@
 Fault
 LSQUnit<Impl>::write(Request *req, T &data, int store_idx)
 {
+    return write(false, req, NULL, NULL, data, store_idx);
+}
+
+template <class Impl>
+template <class T>
+Fault
+LSQUnit<Impl>::write(bool isSplit, Request *req, Request *sreqLow,
+                     Request *sreqHigh, T &data, int store_idx)
+{
     assert(storeQueue[store_idx].inst);
 
     DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x data %#x"
@@ -719,6 +849,10 @@
     T gData = htog(data);
     memcpy(storeQueue[store_idx].data, &gData, sizeof(T));
 
+    storeQueue[store_idx].isSplit = isSplit;
+    storeQueue[store_idx].sreqLow = sreqLow;
+    storeQueue[store_idx].sreqHigh = sreqHigh;
+
     // This function only writes the data to the store queue, so no fault
     // can happen here.
     return NoFault;
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -85,11 +85,22 @@
 
     assert(!pkt->wasNacked());
 
+    // If this is a split access, wait until all packets are received.
+    if (!state->complete()) {
+        delete pkt->req;
+        delete pkt;
+        return;
+    }
+
     if (isSwitchedOut() || inst->isSquashed()) {
         iewStage->decrWb(inst->seqNum);
     } else {
         if (!state->noWB) {
-            writeback(inst, pkt);
+            if (!state->isSplit || !state->isLoad) {
+                writeback(inst, pkt);
+            } else {
+                writeback(inst, state->mainPkt);
+            }
         }
 
         if (inst->isStore()) {
@@ -97,6 +108,10 @@
         }
     }
 
+    if (state->isSplit && state->isLoad) {
+        delete state->mainPkt->req;
+        delete state->mainPkt;
+    }
     delete state;
     delete pkt->req;
     delete pkt;
@@ -605,8 +620,28 @@
 
 template <class Impl>
 void
+LSQUnit<Impl>::writebackPendingStore()
+{
+    if (hasPendingPkt) {
+        assert(pendingPkt != NULL);
+
+        // If the cache is blocked, this will store the packet for retry.
+        if (sendStore(pendingPkt)) {
+            storePostSend(pendingPkt);
+        }
+        pendingPkt = NULL;
+        hasPendingPkt = false;
+    }
+}
+
+template <class Impl>
+void
 LSQUnit<Impl>::writebackStores()
 {
+    // First writeback the second packet from any split store that didn't
+    // complete last cycle because there weren't enough cache ports available.
+    writebackPendingStore();
+
     while (storesToWB > 0 &&
            storeWBIdx != storeTail &&
            storeQueue[storeWBIdx].inst &&
@@ -640,6 +675,11 @@
         assert(storeQueue[storeWBIdx].req);
         assert(!storeQueue[storeWBIdx].committed);
 
+        if (storeQueue[storeWBIdx].isSplit) {
+            assert(storeQueue[storeWBIdx].sreqLow);
+            assert(storeQueue[storeWBIdx].sreqHigh);
+        }
+
         DynInstPtr inst = storeQueue[storeWBIdx].inst;
 
         Request *req = storeQueue[storeWBIdx].req;
@@ -653,15 +693,41 @@
         MemCmd command =
             req->isSwap() ? MemCmd::SwapReq :
             (req->isLLSC() ? MemCmd::StoreCondReq : MemCmd::WriteReq);
-        PacketPtr data_pkt = new Packet(req, command,
-                                        Packet::Broadcast);
-        data_pkt->dataStatic(inst->memData);
+        PacketPtr data_pkt;
+        PacketPtr snd_data_pkt = NULL;
 
         LSQSenderState *state = new LSQSenderState;
         state->isLoad = false;
         state->idx = storeWBIdx;
         state->inst = inst;
-        data_pkt->senderState = state;
+
+        if (!storeQueue[storeWBIdx].isSplit) {
+
+            // Build a single data packet if the store isn't split.
+            data_pkt = new Packet(req, command, Packet::Broadcast);
+            data_pkt->dataStatic(inst->memData);
+            data_pkt->senderState = state;
+        } else {
+            RequestPtr sreqLow = storeQueue[storeWBIdx].sreqLow;
+            RequestPtr sreqHigh = storeQueue[storeWBIdx].sreqHigh;
+
+            // Create two packets if the store is split in two.
+            data_pkt = new Packet(sreqLow, command, Packet::Broadcast);
+            snd_data_pkt = new Packet(sreqHigh, command, Packet::Broadcast);
+
+            data_pkt->dataStatic(inst->memData);
+            snd_data_pkt->dataStatic(inst->memData + sreqLow->getSize());
+
+            data_pkt->senderState = state;
+            snd_data_pkt->senderState = state;
+
+            state->isSplit = true;
+            state->outstanding = 2;
+
+            // Can delete the main request now.
+            delete req;
+            req = sreqLow;
+        }
 
         DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x "
                 "to Addr:%#x, data:%#x [sn:%lli]\n",
@@ -671,6 +737,7 @@
 
         // @todo: Remove this SC hack once the memory system handles it.
         if (inst->isStoreConditional()) {
+            assert(!storeQueue[storeWBIdx].isSplit);
             // Disable recording the result temporarily.  Writing to
             // misc regs normally updates the result, but this is not
             // the desired behavior when handling store conditionals.
@@ -694,18 +761,44 @@
             state->noWB = true;
         }
 
-        if (!dcachePort->sendTiming(data_pkt)) {
-            // Need to handle becoming blocked on a store.
+        if (!sendStore(data_pkt)) {
             DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will"
                     "retry later\n",
                     inst->seqNum);
-            isStoreBlocked = true;
-            ++lsqCacheBlocked;
-            assert(retryPkt == NULL);
-            retryPkt = data_pkt;
-            lsq->setRetryTid(lsqID);
+
+            // Need to store the second packet, if split.
+            if (storeQueue[storeWBIdx].isSplit) {
+                state->pktToSend = true;
+                state->pendingPacket = snd_data_pkt;
+            }
         } else {
-            storePostSend(data_pkt);
+
+            // If split, try to send the second packet too
+            if (storeQueue[storeWBIdx].isSplit) {
+                assert(snd_data_pkt);
+
+                // Ensure there are enough ports to use.
+                if (usedPorts < cachePorts) {
+                    ++usedPorts;
+                    if (sendStore(snd_data_pkt)) {
+                        storePostSend(snd_data_pkt);
+                    } else {
+                        DPRINTF(IEW, "D-Cache became blocked when writing"
+                                " [sn:%lli] second packet, will retry later\n",
+                                inst->seqNum);
+                    }
+                } else {
+
+                    // Store the packet for when there's free ports.
+                    assert(pendingPkt == NULL);
+                    pendingPkt = snd_data_pkt;
+                    hasPendingPkt = true;
+                }
+            } else {
+
+                // Not a split store.
+                storePostSend(data_pkt);
+            }
         }
     }
 
@@ -808,6 +901,13 @@
         // memory.  This is quite ugly.  @todo: Figure out the proper
         // place to really handle request deletes.
         delete storeQueue[store_idx].req;
+        if (storeQueue[store_idx].isSplit) {
+            delete storeQueue[store_idx].sreqLow;
+            delete storeQueue[store_idx].sreqHigh;
+
+            storeQueue[store_idx].sreqLow = NULL;
+            storeQueue[store_idx].sreqHigh = NULL;
+        }
 
         storeQueue[store_idx].req = NULL;
         --stores;
@@ -927,6 +1027,22 @@
 }
 
 template <class Impl>
+bool
+LSQUnit<Impl>::sendStore(PacketPtr data_pkt)
+{
+    if (!dcachePort->sendTiming(data_pkt)) {
+        // Need to handle becoming blocked on a store.
+        isStoreBlocked = true;
+        ++lsqCacheBlocked;
+        assert(retryPkt == NULL);
+        retryPkt = data_pkt;
+        lsq->setRetryTid(lsqID);
+        return false;
+    }
+    return true;
+}
+
+template <class Impl>
 void
 LSQUnit<Impl>::recvRetry()
 {
@@ -935,10 +1051,24 @@
         assert(retryPkt != NULL);
 
         if (dcachePort->sendTiming(retryPkt)) {
-            storePostSend(retryPkt);
+            LSQSenderState *state =
+                dynamic_cast<LSQSenderState *>(retryPkt->senderState);
+
+            // Don't finish the store unless this is the last packet.
+            if (!state->pktToSend) {
+                storePostSend(retryPkt);
+            }
             retryPkt = NULL;
             isStoreBlocked = false;
             lsq->setRetryTid(InvalidThreadID);
+
+            // Send any outstanding packet.
+            if (state->pktToSend) {
+                assert(state->pendingPacket);
+                if (sendStore(state->pendingPacket)) {
+                    storePostSend(state->pendingPacket);
+                }
+            }
         } else {
             // Still blocked!
             ++lsqCacheBlocked;
diff --git a/src/cpu/translation.hh b/src/cpu/translation.hh
--- a/src/cpu/translation.hh
+++ b/src/cpu/translation.hh
@@ -39,6 +39,100 @@
 template <class Impl>
 class BaseDynInst;
 
+class WholeTranslationState
+{
+  protected:
+    int outstanding;
+    Fault faults[2];
+
+  public:
+    bool isSplit;
+    RequestPtr mainReq;
+    RequestPtr sreqLow;
+    RequestPtr sreqHigh;
+    uint64_t *res;
+    BaseTLB::Mode mode;
+
+    /** Single translation state. */
+    WholeTranslationState(RequestPtr _req, uint64_t *_res, BaseTLB::Mode _mode)
+        : outstanding(1), isSplit(false), mainReq(_req), sreqLow(NULL),
+          sreqHigh(NULL), res(_res), mode(_mode)
+    {
+        faults[0] = faults[1] = NoFault;
+        assert(mode == BaseTLB::Read || mode == BaseTLB::Write);
+    }
+
+    /** Split translation state. */
+    WholeTranslationState(RequestPtr _req, RequestPtr _sreqLow,
+                          RequestPtr _sreqHigh, uint64_t *_res,
+                          BaseTLB::Mode _mode)
+        : outstanding(2), isSplit(true), mainReq(_req), sreqLow(_sreqLow),
+          sreqHigh(_sreqHigh), res(_res), mode(_mode)
+    {
+        faults[0] = faults[1] = NoFault;
+        assert(mode == BaseTLB::Read || mode == BaseTLB::Write);
+    }
+
+    bool
+    finish(Fault fault, int index)
+    {
+        assert(outstanding);
+        faults[index] = fault;
+        outstanding--;
+        if (isSplit && outstanding == 0) {
+
+            // For ease later, we copy some state to the main request.
+            if (faults[0] == NoFault) {
+                mainReq->setPaddr(sreqLow->getPaddr());
+            }
+            mainReq->setFlags(sreqLow->getFlags());
+            mainReq->setFlags(sreqHigh->getFlags());
+        }
+        return outstanding == 0;
+    }
+
+    Fault
+    getFault() const
+    {
+        if (!isSplit)
+            return faults[0];
+        else if (faults[0] != NoFault)
+            return faults[0];
+        else if (faults[1] != NoFault)
+            return faults[1];
+        else
+            return NoFault;
+    }
+
+    bool
+    isUncacheable() const
+    {
+        return mainReq->isUncacheable();
+    }
+
+    Addr
+    getPaddr() const
+    {
+        return mainReq->getPaddr();
+    }
+
+    unsigned
+    getFlags()
+    {
+        return mainReq->getFlags();
+    }
+
+    void
+    deleteReqs()
+    {
+        delete mainReq;
+        if (isSplit) {
+            delete sreqLow;
+            delete sreqHigh;
+        }
+    }
+};
+
 template <class Impl>
 class DataTranslation : public BaseTLB::Translation
 {
@@ -47,22 +141,30 @@
     typedef RefCountingPtr<DynInst> DynInstPtr;
 
     DynInstPtr inst;
-    uint64_t *res;
-    BaseTLB::Mode mode;
+    WholeTranslationState *state;
+    int index;
 
   public:
-    DataTranslation(DynInstPtr _inst, uint64_t *_res, BaseTLB::Mode _mode)
-        : inst(_inst), res(_res), mode(_mode)
+    DataTranslation(DynInstPtr _inst, WholeTranslationState *_state)
+        : inst(_inst), state(_state), index(0)
     {
-        assert(mode == BaseTLB::Read || mode == BaseTLB::Write);
     }
 
-    void
+    DataTranslation(DynInstPtr _inst, WholeTranslationState *_state,
+                    int _index)
+        : inst(_inst), state(_state), index(_index)
+    {
+    }
+
+    virtual void
     finish(Fault fault, RequestPtr req, ThreadContext *tc,
            BaseTLB::Mode mode)
     {
-        assert(mode == this->mode);
-        inst->finishTranslation(fault, req, res, mode == BaseTLB::Read);
+        assert(state);
+        assert(mode == state->mode);
+        if (state->finish(fault, index)) {
+            inst->finishTranslation(state);
+        }
         delete this;
     }
 };

-- 
The University of Edinburgh is a charitable body, registered in
Scotland, with registration number SC005336.

_______________________________________________
m5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/m5-dev

Reply via email to