Hi Lisa,

I'll have a look into it, yes.  I did run all the regressions before  
submitting so there shouldn't have been any problems, but I'll check the  
results.

Tim

On Fri, 19 Feb 2010 16:35:34 -0500, Lisa Hsu <[email protected]> wrote:

> Timothy,
>
> Did you check this against ALPHA_FS/test/fast/long/10.linux-boot tests  
> under
> the O3 configurations?  I believe this changeset may have broken them.   
> I am
> getting ready to make a push, and I passed ALPHA_FS linux boot regression
> tests a few days ago, but since pulling the last 6 changesets, I've begun
> failing on this test, and this O3 change seems to be the most obvious
> breaker.  What happens is that it seems like no forward progress is made  
> in
> the program and it just gets hung and runs forever. This happens on a  
> clean
> tree after having removed my patches that I want to push.  Can you look  
> into
> it?
>
> Thanks,
> Lisa
>
> On Fri, Feb 12, 2010 at 12:52 PM, Timothy M. Jones  
> <[email protected]>wrote:
>
>> changeset 4d4903a3e7c5 in /z/repo/m5
>> details: http://repo.m5sim.org/m5?cmd=changeset;node=4d4903a3e7c5
>> description:
>>        O3PCU: Split loads and stores that cross cache line boundaries.
>>
>>        When each load or store is sent to the LSQ, we check whether it  
>> will
>> cross a
>>        cache line boundary and, if so, split it in two. This creates two
>> TLB
>>        translations and two memory requests. Care has to be taken if the
>> first
>>        packet of a split load is sent but the second blocks the cache.
>> Similarly,
>>        for a store, if the first packet cannot be sent, we must store  
>> the
>> second
>>        one somewhere to retry later.
>>
>>        This modifies the LSQSenderState class to record both packets in  
>> a
>> split
>>        load or store.
>>
>>        Finally, a new const variable, HasUnalignedMemAcc, is added to  
>> each
>> ISA
>>        to indicate whether unaligned memory accesses are allowed. This  
>> is
>> used
>>        throughout the changed code so that compiler can optimise away  
>> code
>> dealing
>>        with split requests for ISAs that don't need them.
>>
>> diffstat:
>>
>> 11 files changed, 378 insertions(+), 53 deletions(-)
>> src/arch/alpha/isa_traits.hh |    3
>> src/arch/arm/isa_traits.hh   |    3
>> src/arch/mips/isa_traits.hh  |    3
>> src/arch/power/isa_traits.hh |    3
>> src/arch/sparc/isa_traits.hh |    3
>> src/arch/x86/isa_traits.hh   |    3
>> src/cpu/base_dyn_inst.hh     |   75 ++++++++++++++++---
>> src/cpu/o3/cpu.hh            |   15 ++-
>> src/cpu/o3/lsq.hh            |   22 +++--
>> src/cpu/o3/lsq_unit.hh       |  138 +++++++++++++++++++++++++++++++----
>> src/cpu/o3/lsq_unit_impl.hh  |  163
>> ++++++++++++++++++++++++++++++++++++++----
>>
>> diffs (truncated from 831 to 300 lines):
>>
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/arch/alpha/isa_traits.hh
>> --- a/src/arch/alpha/isa_traits.hh      Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/arch/alpha/isa_traits.hh      Fri Feb 12 19:53:20 2010 +0000
>> @@ -131,6 +131,9 @@
>>  // Alpha UNOP (ldq_u r31,0(r0))
>>  const ExtMachInst NoopMachInst = 0x2ffe0000;
>>
>> +// Memory accesses cannot be unaligned
>> +const bool HasUnalignedMemAcc = false;
>> +
>>  } // namespace AlphaISA
>>
>>  #endif // __ARCH_ALPHA_ISA_TRAITS_HH__
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/arch/arm/isa_traits.hh
>> --- a/src/arch/arm/isa_traits.hh        Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/arch/arm/isa_traits.hh        Fri Feb 12 19:53:20 2010 +0000
>> @@ -106,6 +106,9 @@
>>     const int ByteBytes = 1;
>>
>>     const uint32_t HighVecs = 0xFFFF0000;
>> +
>> +    // Memory accesses cannot be unaligned
>> +    const bool HasUnalignedMemAcc = false;
>>  };
>>
>>  using namespace ArmISA;
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/arch/mips/isa_traits.hh
>> --- a/src/arch/mips/isa_traits.hh       Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/arch/mips/isa_traits.hh       Fri Feb 12 19:53:20 2010 +0000
>> @@ -164,6 +164,9 @@
>>  const int ANNOTE_NONE = 0;
>>  const uint32_t ITOUCH_ANNOTE = 0xffffffff;
>>
>> +// Memory accesses cannot be unaligned
>> +const bool HasUnalignedMemAcc = false;
>> +
>>  };
>>
>>  #endif // __ARCH_MIPS_ISA_TRAITS_HH__
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/arch/power/isa_traits.hh
>> --- a/src/arch/power/isa_traits.hh      Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/arch/power/isa_traits.hh      Fri Feb 12 19:53:20 2010 +0000
>> @@ -70,6 +70,9 @@
>>  // This is ori 0, 0, 0
>>  const ExtMachInst NoopMachInst = 0x60000000;
>>
>> +// Memory accesses can be unaligned
>> +const bool HasUnalignedMemAcc = true;
>> +
>>  } // PowerISA namespace
>>
>>  #endif // __ARCH_POWER_ISA_TRAITS_HH__
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/arch/sparc/isa_traits.hh
>> --- a/src/arch/sparc/isa_traits.hh      Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/arch/sparc/isa_traits.hh      Fri Feb 12 19:53:20 2010 +0000
>> @@ -98,6 +98,9 @@
>>     };
>>
>>  #endif
>> +
>> +// Memory accesses cannot be unaligned
>> +const bool HasUnalignedMemAcc = false;
>>  }
>>
>>  #endif // __ARCH_SPARC_ISA_TRAITS_HH__
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/arch/x86/isa_traits.hh
>> --- a/src/arch/x86/isa_traits.hh        Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/arch/x86/isa_traits.hh        Fri Feb 12 19:53:20 2010 +0000
>> @@ -91,6 +91,9 @@
>>     StaticInstPtr decodeInst(ExtMachInst);
>>
>>     const Addr LoadAddrMask = ULL(-1);
>> +
>> +    // Memory accesses can be unaligned
>> +    const bool HasUnalignedMemAcc = true;
>>  };
>>
>>  #endif // __ARCH_X86_ISATRAITS_HH__
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/cpu/base_dyn_inst.hh
>> --- a/src/cpu/base_dyn_inst.hh  Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/cpu/base_dyn_inst.hh  Fri Feb 12 19:53:20 2010 +0000
>> @@ -131,8 +131,13 @@
>>     template <class T>
>>     Fault write(T data, Addr addr, unsigned flags, uint64_t *res);
>>
>> +    /** Splits a request in two if it crosses a dcache block. */
>> +    void splitRequest(RequestPtr req, RequestPtr &sreqLow,
>> +                      RequestPtr &sreqHigh);
>> +
>>     /** Initiate a DTB address translation. */
>> -    void initiateTranslation(RequestPtr req, uint64_t *res,
>> +    void initiateTranslation(RequestPtr req, RequestPtr sreqLow,
>> +                             RequestPtr sreqHigh, uint64_t *res,
>>                              BaseTLB::Mode mode);
>>
>>     /** Finish a DTB address translation. */
>> @@ -870,12 +875,19 @@
>>     Request *req = new Request(asid, addr, sizeof(T), flags, this->PC,
>>                                thread->contextId(), threadNumber);
>>
>> -    initiateTranslation(req, NULL, BaseTLB::Read);
>> +    Request *sreqLow = NULL;
>> +    Request *sreqHigh = NULL;
>> +
>> +    // Only split the request if the ISA supports unaligned accesses.
>> +    if (TheISA::HasUnalignedMemAcc) {
>> +        splitRequest(req, sreqLow, sreqHigh);
>> +    }
>> +    initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read);
>>
>>     if (fault == NoFault) {
>>         effAddr = req->getVaddr();
>>         effAddrValid = true;
>> -        cpu->read(req, data, lqIdx);
>> +        cpu->read(req, sreqLow, sreqHigh, data, lqIdx);
>>     } else {
>>
>>         // Return a fixed value to keep simulation deterministic even
>> @@ -909,12 +921,19 @@
>>     Request *req = new Request(asid, addr, sizeof(T), flags, this->PC,
>>                                thread->contextId(), threadNumber);
>>
>> -    initiateTranslation(req, res, BaseTLB::Write);
>> +    Request *sreqLow = NULL;
>> +    Request *sreqHigh = NULL;
>> +
>> +    // Only split the request if the ISA supports unaligned accesses.
>> +    if (TheISA::HasUnalignedMemAcc) {
>> +        splitRequest(req, sreqLow, sreqHigh);
>> +    }
>> +    initiateTranslation(req, sreqLow, sreqHigh, res, BaseTLB::Write);
>>
>>     if (fault == NoFault) {
>>         effAddr = req->getVaddr();
>>         effAddrValid = true;
>> -        cpu->write(req, data, sqIdx);
>> +        cpu->write(req, sreqLow, sreqHigh, data, sqIdx);
>>     }
>>
>>     return fault;
>> @@ -922,14 +941,48 @@
>>
>>  template<class Impl>
>>  inline void
>> -BaseDynInst<Impl>::initiateTranslation(RequestPtr req, uint64_t *res,
>> +BaseDynInst<Impl>::splitRequest(RequestPtr req, RequestPtr &sreqLow,
>> +                                RequestPtr &sreqHigh)
>> +{
>> +    // Check to see if the request crosses the next level block  
>> boundary.
>> +    unsigned block_size = cpu->getDcachePort()->peerBlockSize();
>> +    Addr addr = req->getVaddr();
>> +    Addr split_addr = roundDown(addr + req->getSize() - 1, block_size);
>> +    assert(split_addr <= addr || split_addr - addr < block_size);
>> +
>> +    // Spans two blocks.
>> +    if (split_addr > addr) {
>> +        req->splitOnVaddr(split_addr, sreqLow, sreqHigh);
>> +    }
>> +}
>> +
>> +template<class Impl>
>> +inline void
>> +BaseDynInst<Impl>::initiateTranslation(RequestPtr req, RequestPtr  
>> sreqLow,
>> +                                       RequestPtr sreqHigh, uint64_t  
>> *res,
>>                                        BaseTLB::Mode mode)
>>  {
>> -    WholeTranslationState *state =
>> -        new WholeTranslationState(req, NULL, res, mode);
>> -    DataTranslation<BaseDynInst<Impl> > *trans =
>> -        new DataTranslation<BaseDynInst<Impl> >(this, state);
>> -    cpu->dtb->translateTiming(req, thread->getTC(), trans, mode);
>> +    if (!TheISA::HasUnalignedMemAcc || sreqLow == NULL) {
>> +        WholeTranslationState *state =
>> +            new WholeTranslationState(req, NULL, res, mode);
>> +
>> +        // One translation if the request isn't split.
>> +        DataTranslation<BaseDynInst<Impl> > *trans =
>> +            new DataTranslation<BaseDynInst<Impl> >(this, state);
>> +        cpu->dtb->translateTiming(req, thread->getTC(), trans, mode);
>> +    } else {
>> +        WholeTranslationState *state =
>> +            new WholeTranslationState(req, sreqLow, sreqHigh, NULL,  
>> res,
>> mode);
>> +
>> +        // Two translations when the request is split.
>> +        DataTranslation<BaseDynInst<Impl> > *stransLow =
>> +            new DataTranslation<BaseDynInst<Impl> >(this, state, 0);
>> +        DataTranslation<BaseDynInst<Impl> > *stransHigh =
>> +            new DataTranslation<BaseDynInst<Impl> >(this, state, 1);
>> +
>> +        cpu->dtb->translateTiming(sreqLow, thread->getTC(), stransLow,
>> mode);
>> +        cpu->dtb->translateTiming(sreqHigh, thread->getTC(),  
>> stransHigh,
>> mode);
>> +    }
>>  }
>>
>>  template<class Impl>
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/cpu/o3/cpu.hh
>> --- a/src/cpu/o3/cpu.hh Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/cpu/o3/cpu.hh Fri Feb 12 19:53:20 2010 +0000
>> @@ -703,18 +703,25 @@
>>
>>     /** CPU read function, forwards read to LSQ. */
>>     template <class T>
>> -    Fault read(RequestPtr &req, T &data, int load_idx)
>> +    Fault read(RequestPtr &req, RequestPtr &sreqLow, RequestPtr  
>> &sreqHigh,
>> +               T &data, int load_idx)
>>     {
>> -        return this->iew.ldstQueue.read(req, data, load_idx);
>> +        return this->iew.ldstQueue.read(req, sreqLow, sreqHigh,
>> +                                        data, load_idx);
>>     }
>>
>>     /** CPU write function, forwards write to LSQ. */
>>     template <class T>
>> -    Fault write(RequestPtr &req, T &data, int store_idx)
>> +    Fault write(RequestPtr &req, RequestPtr &sreqLow, RequestPtr
>> &sreqHigh,
>> +                T &data, int store_idx)
>>     {
>> -        return this->iew.ldstQueue.write(req, data, store_idx);
>> +        return this->iew.ldstQueue.write(req, sreqLow, sreqHigh,
>> +                                         data, store_idx);
>>     }
>>
>> +    /** Get the dcache port (used to find block size for  
>> translations). */
>> +    Port *getDcachePort() { return  
>> this->iew.ldstQueue.getDcachePort(); }
>> +
>>     Addr lockAddr;
>>
>>     /** Temporary fix for the lock flag, works in the UP case. */
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/cpu/o3/lsq.hh
>> --- a/src/cpu/o3/lsq.hh Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/cpu/o3/lsq.hh Fri Feb 12 19:53:20 2010 +0000
>> @@ -270,15 +270,19 @@
>>     void dumpInsts(ThreadID tid)
>>     { thread[tid].dumpInsts(); }
>>
>> -    /** Executes a read operation, using the load specified at the load
>> index. */
>> +    /** Executes a read operation, using the load specified at the load
>> +     * index.
>> +     */
>>     template <class T>
>> -    Fault read(RequestPtr req, T &data, int load_idx);
>> +    Fault read(RequestPtr req, RequestPtr sreqLow, RequestPtr sreqHigh,
>> +               T &data, int load_idx);
>>
>>     /** Executes a store operation, using the store specified at the  
>> store
>> -     *   index.
>> +     * index.
>>      */
>>     template <class T>
>> -    Fault write(RequestPtr req, T &data, int store_idx);
>> +    Fault write(RequestPtr req, RequestPtr sreqLow, RequestPtr  
>> sreqHigh,
>> +                T &data, int store_idx);
>>
>>     /** The CPU pointer. */
>>     O3CPU *cpu;
>> @@ -369,21 +373,23 @@
>>  template <class Impl>
>>  template <class T>
>>  Fault
>> -LSQ<Impl>::read(RequestPtr req, T &data, int load_idx)
>> +LSQ<Impl>::read(RequestPtr req, RequestPtr sreqLow, RequestPtr  
>> sreqHigh,
>> +                T &data, int load_idx)
>>  {
>>     ThreadID tid = req->threadId();
>>
>> -    return thread[tid].read(req, data, load_idx);
>> +    return thread[tid].read(req, sreqLow, sreqHigh, data, load_idx);
>>  }
>>
>>  template <class Impl>
>>  template <class T>
>>  Fault
>> -LSQ<Impl>::write(RequestPtr req, T &data, int store_idx)
>> +LSQ<Impl>::write(RequestPtr req, RequestPtr sreqLow, RequestPtr  
>> sreqHigh,
>> +                 T &data, int store_idx)
>>  {
>>     ThreadID tid = req->threadId();
>>
>> -    return thread[tid].write(req, data, store_idx);
>> +    return thread[tid].write(req, sreqLow, sreqHigh, data, store_idx);
>>  }
>>
>>  #endif // __CPU_O3_LSQ_HH__
>> diff -r a123bd350935 -r 4d4903a3e7c5 src/cpu/o3/lsq_unit.hh
>> --- a/src/cpu/o3/lsq_unit.hh    Fri Feb 12 19:53:19 2010 +0000
>> +++ b/src/cpu/o3/lsq_unit.hh    Fri Feb 12 19:53:20 2010 +0000
>> @@ -216,12 +216,18 @@
>>     /** Writes back the instruction, sending it to IEW. */
>>     void writeback(DynInstPtr &inst, PacketPtr pkt);
>>
>> +    /** Writes back a store that couldn't be completed the previous  
>> cycle.
>> */
>> +    void writebackPendingStore();
>> +
>>     /** Handles completing the send of a store to memory. */
>>     void storePostSend(PacketPtr pkt);
>>
>>     /** Completes the store at the specified index. */
>>     void completeStore(int store_idx);
>>
>> +    /** Attempts to send a store to the cache. */
>> _______________________________________________
>> m5-dev mailing list
>> [email protected]
>> http://m5sim.org/mailman/listinfo/m5-dev
>>
>>


-- 
The University of Edinburgh is a charitable body, registered in
Scotland, with registration number SC005336.

_______________________________________________
m5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/m5-dev

Reply via email to