aasorokiin commented on a change in pull request #32:
URL: https://github.com/apache/tvm-vta/pull/32#discussion_r692641717



##########
File path: src/dpi/module.cc
##########
@@ -180,36 +188,84 @@ void HostDevice::WaitPopResponse(HostResponse* r) {
   resp_.WaitPop(r);
 }
 
-void MemDevice::SetRequest(uint8_t opcode, uint64_t addr, uint32_t len) {
+void MemDevice::SetRequest(uint8_t rd_req_valid, uint64_t rd_req_addr, uint32_t rd_req_len,
+                           uint32_t rd_req_id, uint64_t wr_req_addr, uint32_t wr_req_len,
+                           uint8_t wr_req_valid) {
+
   std::lock_guard<std::mutex> lock(mutex_);
-  void * vaddr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(addr);
-
-  if (opcode == 1) {
-    wlen_ = len + 1;
-    waddr_ = reinterpret_cast<uint64_t*>(vaddr);
-  } else {
-    rlen_ = len + 1;
-    raddr_ = reinterpret_cast<uint64_t*>(vaddr);
-  }
+  if (rd_req_addr != 0) {
+    void* rd_vaddr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(rd_req_addr);
+    if (rd_req_valid == 1) {
+      rlen_ = rd_req_len + 1;
+      rid_ = rd_req_id;
+      raddr_ = reinterpret_cast<uint64_t*>(rd_vaddr);
+    }
+  }
+
+  if (wr_req_addr != 0) {
+    void* wr_vaddr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(wr_req_addr);
+    if (wr_req_valid == 1) {
+      wlen_ = wr_req_len + 1;
+      waddr_ = reinterpret_cast<uint64_t*>(wr_vaddr);
+    }
+  }
+
+ //    if(wr_req_addr != 0 && rd_req_addr!=0){

Review comment:
       Updated PR.

##########
File path: hardware/chisel/src/main/scala/core/TensorUtil.scala
##########
@@ -79,9 +80,68 @@ class TensorParams(tensorType: String = "none")(implicit p: Parameters) extends
     else
       p(CoreKey).outMemDepth
 
+  // the number of cycles Instruction write is delayed
+  // Idle state writes are not delayed
+  // inserted regs are used to physically deliver signal to memories
+  val writePipeLatency =
+    if (tensorType == "inp") {
+      0 // VME data load cmd write (per group)
+    } else if (tensorType == "wgt") {
+      0 // VME data load cmd write (per group)
+    } else if (tensorType == "acc") {
+      0 // VME data load cmd write (per group)
+    } else if (tensorType == "fetch") {
+      0
+    } else if (tensorType == "uop") {
+      0
+    } else if (tensorType == "out") {
+      0 // direct write from core
+    } else {
+      0
+    }
+
+  // the number of cycles Idle state data read is delayed
+  // inserted regs are used to physically deliver signal to memories
+  val readTensorLatency =
+    if (tensorType == "inp") {
+      0 // GEMM inp data read (per memsplit)
+    } else if (tensorType == "wgt") {
+      0
+    } else if (tensorType == "acc") {
+      0
+    } else if (tensorType == "fetch") {
+      0
+    } else if (tensorType == "uop") {
+      0
+    } else if (tensorType == "out") {
+      0
+    } else {
+      0
+    }
+  // the number of cycles vme data signals are delayed
+  // This is a global delay of VME data signals. One for all groups
+  //
+  val readVMEDataLatency =
+    if (tensorType == "inp") {
+      0 // VME data signals delay
+    } else if (tensorType == "wgt") {
+      0 // VME data signals delay
+    } else if (tensorType == "acc") {
+      0  // VME data signals delay
+    } else if (tensorType == "fetch") {
+      0
+    } else if (tensorType == "uop") {
+      0 // VME data signals delay
+    } else if (tensorType == "out") {
+      0
+    } else {
+      0
+    }
+
+
   // acc/wgt parts are grouped to form
   // a physically compact compute entity
-
+  //

Review comment:
       Updated PR.
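A side note on the structure of this hunk: since every branch currently returns 0, the three if/else chains could be collapsed into one per-tensor-type lookup. Below is a minimal plain-Scala sketch of that idea, assuming the same tensor-type strings and the all-zero defaults from the diff; LatencySketch and latencies are hypothetical names, not code from the PR.

    object LatencySketch {
      // (writePipeLatency, readTensorLatency, readVMEDataLatency) per tensor type;
      // all zeros today, matching the values in the diff above
      def latencies(tensorType: String): (Int, Int, Int) = tensorType match {
        case "inp" | "wgt" | "acc" => (0, 0, 0) // VME data load cmd write (per group)
        case "fetch" | "uop"       => (0, 0, 0)
        case "out"                 => (0, 0, 0) // direct write from core
        case _                     => (0, 0, 0)
      }

      def main(args: Array[String]): Unit = {
        val (wr, rdTensor, rdVME) = latencies("inp")
        println(s"inp: writePipe=$wr readTensor=$rdTensor readVMEData=$rdVME")
      }
    }

Keeping the three values in one table would make it harder for the per-type latencies to drift apart once they become nonzero.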

##########
File path: hardware/chisel/src/main/scala/core/TensorUtil.scala
##########
@@ -79,9 +80,68 @@ class TensorParams(tensorType: String = "none")(implicit p: Parameters) extends
     else
       p(CoreKey).outMemDepth
 
+  // the number of cycles Instruction write is delayed
+  // Idle state writes are not delayed
+  // inserted regs are used to physically deliver signal to memories
+  val writePipeLatency =
+    if (tensorType == "inp") {
+      0 // VME data load cmd write (per group)
+    } else if (tensorType == "wgt") {
+      0 // VME data load cmd write (per group)
+    } else if (tensorType == "acc") {
+      0 // VME data load cmd write (per group)
+    } else if (tensorType == "fetch") {
+      0
+    } else if (tensorType == "uop") {
+      0
+    } else if (tensorType == "out") {
+      0 // direct write from core
+    } else {
+      0
+    }
+
+  // the number of cycles Idle state data read is delayed
+  // inserted regs are used to physically deliver signal to memories
+  val readTensorLatency =
+    if (tensorType == "inp") {
+      0 // GEMM inp data read (per memsplit)
+    } else if (tensorType == "wgt") {
+      0
+    } else if (tensorType == "acc") {
+      0
+    } else if (tensorType == "fetch") {
+      0
+    } else if (tensorType == "uop") {
+      0
+    } else if (tensorType == "out") {
+      0
+    } else {
+      0
+    }
+  // the number of cycles vme data signals are delayed
+  // This is a global delay of VME data signals. One for all groups
+  //

Review comment:
       Updated PR.

##########
File path: hardware/chisel/src/main/scala/core/LoadUopSimple.scala
##########
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package vta.core
+
+import chisel3._
+import chisel3.util._
+import vta.util.config._
+import vta.shell._
+
+class LoadUopSimple(debug: Boolean = false)(implicit val p: Parameters) extends Module {
+  val mp = p(ShellKey).memParams
+  val io = IO(new Bundle {
+    val start = Input(Bool())
+    val done = Output(Bool())
+    val dec = Input(new MemDecode)
+    val baddr = Input(UInt(mp.addrBits.W))
+    val vme_rd = new VMEReadMaster
+    val uop = new UopClient
+  })
+  val uopsPerMemXfer = p(ShellKey).memParams.dataBits / p(CoreKey).uopBits
+  require(p(ShellKey).memParams.dataBits % p(CoreKey).uopBits == 0)
+  //require(uopsPerMemXfer == 1 || uopsPerMemXfer == 2)

Review comment:
       Updated PR.
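For reference, the uopsPerMemXfer computation and its require() in this hunk simply check that one memory beat carries a whole number of uops. A self-contained sketch with assumed widths follows; the 64-bit memParams.dataBits and 32-bit uopBits are illustrative values, not the PR's actual configuration.

    object UopXferSketch {
      def main(args: Array[String]): Unit = {
        val memDataBits = 64 // assumed p(ShellKey).memParams.dataBits
        val uopBits     = 32 // assumed p(CoreKey).uopBits
        // mirrors the require() in LoadUopSimple: a beat must hold whole uops
        require(memDataBits % uopBits == 0)
        val uopsPerMemXfer = memDataBits / uopBits
        println(s"uops per memory transfer: $uopsPerMemXfer") // prints 2
      }
    }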

##########
File path: hardware/chisel/src/main/scala/core/FetchWideVME.scala
##########
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package vta.core
+
+//import scala.math.pow
+
+import chisel3._
+import chisel3.util._
+import vta.util.config._
+import vta.shell._
+
+/** Fetch.
+ *
+ * The fetch unit reads instructions (tasks) from memory (i.e. DRAM), using the
+ * VTA Memory Engine (VME), and pushes them into an instruction queue called
+ * inst_q. Once the instruction queue is full, instructions are dispatched to
+ * the Load, Compute and Store module queues based on the instruction opcode.
+ * After draining the queue, the fetch unit checks if there are more instructions
+ * via the ins_count register, which is written by the host.
+ *
+ * Additionally, instructions are read in two chunks (see sReadLSB and sReadMSB)
+ * because we are using a DRAM payload of 8 bytes, or half of a VTA instruction.
+ * This should be configurable for larger payloads, i.e. 64 bytes, which can load
+ * more than one instruction at a time. Finally, the instruction queue is
+ * sized (entries_q) depending on the maximum burst allowed in the memory.
+ */
+class FetchWideVME(debug: Boolean = false)(implicit p: Parameters) extends Module {
+  val vp = p(ShellKey).vcrParams
+  val mp = p(ShellKey).memParams
+  val io = IO(new Bundle {
+    val launch = Input(Bool())
+    val ins_baddr = Input(UInt(mp.addrBits.W))
+    val ins_count = Input(UInt(vp.regBits.W))
+    val vme_rd = new VMEReadMaster
+    val inst = new Bundle {
+      val ld = Decoupled(UInt(INST_BITS.W))
+      val co = Decoupled(UInt(INST_BITS.W))
+      val st = Decoupled(UInt(INST_BITS.W))
+    }
+  })
+
+  val tp = new TensorParams("fetch")
+  val tensorsInClNb = tp.clSizeRatio
+  val tensorsInClNbWidth = log2Ceil(tensorsInClNb)
+  val inst_q = Seq.fill(tensorsInClNb) {
+    require((tp.memDepth/tensorsInClNb) * tensorsInClNb == tp.memDepth,
+      "-F- Unexpected queue depth to instructions in cacheline ratio")
+    SyncReadMem(tp.memDepth/tensorsInClNb, UInt(tp.tensorSizeBits.W))
+  }
+
+  //sample start
+  val s1_launch = RegNext(io.launch, init = false.B)
+  val start = io.launch & ~s1_launch
+
+
+  val xrem = Reg(chiselTypeOf(io.ins_count))
+  // fit instruction into 64bit chunks
+  val elemsInInstr = INST_BITS/64
+  val xsize = io.ins_count << log2Ceil(elemsInInstr)
+  // max size of transfer is limited by a buffer size
+  val xmax = (((1 << mp.lenBits) << log2Ceil(tp.clSizeRatio)).min(tp.memDepth)).U
+  val elemNb = Reg(xsize.cloneType)
+
+  val sIdle :: sRead :: sDrain :: Nil = Enum(3)
+  val state = RegInit(sIdle)
+  val isBusy = state === sRead
+
+  val vmeStart = start || (state === sRead && RegNext(state, init = sIdle) === sDrain)
+  val dramOffset  = RegInit(UInt(mp.addrBits.W), init = 0.U)
+  val vmeCmd = Module (new GenVMECmdWideFetch(debug))
+  vmeCmd.io.start := vmeStart
+  vmeCmd.io.isBusy := isBusy & ~vmeStart
+  vmeCmd.io.ins_baddr := Mux(start, io.ins_baddr, io.ins_baddr + (dramOffset << log2Ceil(tp.tensorSizeBits / 8)))
+  vmeCmd.io.vmeCmd <> io.vme_rd.cmd
+  val readLen = vmeCmd.io.readLen
+  val vmeCmdDone = vmeCmd.io.done & ~vmeStart
+
+  vmeCmd.io.xsize := elemNb
+  vmeCmd.io.sram_offset := 0.U // this is a queue we reload
+
+  io.vme_rd.data.ready := true.B
+  val pipeDelayQueueDeqV = RegNext(io.vme_rd.data.valid, init = false.B)
+  val pipeDelayQueueDeqF = pipeDelayQueueDeqV // fire()
+  val pipeDelayQueueDeqB = RegNext(io.vme_rd.data.bits)
+
+  // Number of CLs requested but not yet received.
+  val clCntIdxWdth = log2Ceil(tp.memDepth/tensorsInClNb) + 1
+  val clInFlight = Reg(UInt(clCntIdxWdth.W))
+  when(start) {
+    clInFlight := 0.U
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && !pipeDelayQueueDeqF) {
+    clInFlight := clInFlight + readLen
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && pipeDelayQueueDeqF) {
+    clInFlight := clInFlight + readLen - 1.U
+  }.elsewhen(isBusy && !io.vme_rd.cmd.fire() && pipeDelayQueueDeqF) {
+    assert(clInFlight > 0.U)
+    clInFlight := clInFlight - 1.U
+  }.otherwise {
+    clInFlight := clInFlight
+  }
+
+  // number of entries in a queue
+  val queueCount = Reg(UInt((tp.memAddrBits + 1).W))
+  val queueHead  = Wire(UInt(tp.memAddrBits.W))
+  val queueHeadNext  = Reg(UInt(tp.memAddrBits.W))
+  val forceRead  = Wire(Bool())
+  forceRead := false.B
+  // control
+  switch(state) {
+    is(sIdle) {
+      when(start) {
+        state := sRead
+        dramOffset := 0.U
+        when(xsize < xmax) {
+          elemNb := xsize
+          xrem := 0.U
+        }.otherwise {
+          elemNb := xmax
+          xrem := xsize - xmax
+        }
+      }
+    }
+    is(sRead) {
+      when(vmeCmdDone && clInFlight === 0.U) {
+        forceRead := true.B
+        state := sDrain
+      }
+    }
+    is(sDrain) {
+      when(queueCount === 0.U) {
+        dramOffset := dramOffset + elemNb
+        when(xrem === 0.U) {
+          state := sIdle
+        }.elsewhen(xrem < xmax) {
+          state := sRead
+          elemNb := xrem
+          xrem := 0.U
+        }.otherwise {
+          state := sRead
+          elemNb := xmax
+          xrem := xrem - xmax
+        }
+      }
+    }
+  }
+
+
+  //---------------------
+  //--- Read VME data ---
+  //---------------------
+
+  val readData = Module(new ReadVMEDataWide("fetch", debug))
+  readData.io.start := vmeStart
+  //io.vme_rd.data <> readData.io.vmeData
+  //pipeDelayQueueDeq <> readData.io.vmeData
+  readData.io.vmeData.valid := pipeDelayQueueDeqV
+  readData.io.vmeData.bits := pipeDelayQueueDeqB
+  assert(readData.io.vmeData.ready === true.B)
+
+  //--------------------
+  //--- Write memory ---
+  //--------------------
+
+  val wmask = readData.io.destMask
+  val wdata = readData.io.destData
+  val widx  = readData.io.destIdx
+
+  for (i <- 0 until tensorsInClNb) {
+    when(wmask(i) && pipeDelayQueueDeqF) {
+      inst_q(i).write(widx(i), wdata(i))
+    }
+  }
+  if (debug) {
+    when (io.vme_rd.data.fire()) {
+      printf(s"[TensorLoad] fetch data rdDataDestIdx:%x rdDataDestMask:%b\n",
+        widx.asUInt,
+        wmask.asUInt)
+    }
+  }
+
+  // read-from-sram
+  // queue head points to the first elem of instruction
+  val rIdx = queueHead >> tensorsInClNbWidth // SP idx
+  // rMask selects the first elem of instruction
+  val rMask = if (tensorsInClNbWidth > 0) {
+    UIntToOH(queueHead(tensorsInClNbWidth - 1, 0))
+  } else {
+    1.U
+  }
+
+  val deqElem = Wire(Bool())
+  val rdataVec = for (i <- 0 until tensorsInClNb) yield {
+    // expand mask to select all elems of instruction
+    val maskShift = i % elemsInInstr
+    inst_q(i).read(rIdx,
+      VecInit((rMask << maskShift).asTypeOf(rMask).toBools)(i) && (deqElem || forceRead))
+  }
+
+  // instruction is a elemsInInstr number of elements
+  // combine them into one instruction
+  val rdata = Wire(Vec(elemsInInstr, UInt((tp.tensorSizeBits).W)))
+  for (i <- 0 until elemsInInstr) {
+    // expand mask to select all elems of instruction
+    rdata(i) := Mux1H(RegNext((rMask << i).asTypeOf(rMask)), rdataVec)
+  }
+
+
+  val canRead = queueCount >= elemsInInstr.U && state === sDrain
+  // instruction queues
+
+  // use a 2-entry queue to create one pipe stage for the valid-ready interface
+  val readInstrPipe = Module(new Queue(UInt(INST_BITS.W), 2))
+
+  // decode
+  val dec = Module(new FetchDecode)
+  dec.io.inst := readInstrPipe.io.deq.bits
+  readInstrPipe.io.enq.valid := canRead
+  readInstrPipe.io.enq.bits := rdata.asTypeOf(UInt(INST_BITS.W))
+  deqElem := readInstrPipe.io.enq.fire()
+  readInstrPipe.io.deq.ready := (
+    (dec.io.isLoad & io.inst.ld.ready) ||
+    (dec.io.isCompute & io.inst.co.ready) ||
+    (dec.io.isStore & io.inst.st.ready))
+  io.inst.ld.valid := dec.io.isLoad & readInstrPipe.io.deq.valid
+  io.inst.co.valid := dec.io.isCompute & readInstrPipe.io.deq.valid
+  io.inst.st.valid := dec.io.isStore & readInstrPipe.io.deq.valid
+
+  io.inst.ld.bits := readInstrPipe.io.deq.bits
+  io.inst.co.bits := readInstrPipe.io.deq.bits
+  io.inst.st.bits := readInstrPipe.io.deq.bits
+
+  when(start) {
+    queueCount := 0.U
+  }.elsewhen(deqElem && pipeDelayQueueDeqF) {
+    assert(queueCount > 0.U, "-F- Decrement zero counter")
+    val readCount = PopCount(wmask)
+    assert(readCount > 0.U, "-F- Must push something")
+    queueCount := queueCount + readCount - elemsInInstr.U
+  }.elsewhen(deqElem) {
+    assert(queueCount > 0.U, "-F- Decrement zero counter")
+    queueCount := queueCount - elemsInInstr.U
+  }.elsewhen (pipeDelayQueueDeqF) {
+    val numLoaded = PopCount(wmask)
+    assert(tp.memDepth.U - numLoaded >= queueCount, "-F- Counter overflow")
+    queueCount := queueCount + PopCount(wmask)
+  }.otherwise {
+    queueCount := queueCount
+  }
+  when(start) {
+    queueHead := 0.U
+    queueHeadNext := 0.U
+  }.elsewhen(deqElem) {
+    queueHead := queueHeadNext + elemsInInstr.U // read ahead
+    when (queueCount - elemsInInstr.U === 0.U) {
+      queueHeadNext := 0.U
+    }.otherwise {
+      queueHeadNext := queueHeadNext + elemsInInstr.U
+    }
+  }.otherwise {
+    // check if queueCount === 0.U -> queueHeadNext === 0.U

Review comment:
       Updated PR.
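One aid for reading the banked SRAM addressing in this hunk: queueHead is decomposed into a per-bank row index (rIdx, the high bits) and a one-hot bank select (rMask, the low bits, via UIntToOH). Below is a plain-Scala model of that decomposition, assuming 4 banks; the real bank count is tp.clSizeRatio, and the names here are illustrative.

    object QueueHeadSketch {
      def main(args: Array[String]): Unit = {
        val tensorsInClNb = 4 // assumed tp.clSizeRatio; a power of two here
        val width = Integer.numberOfTrailingZeros(tensorsInClNb) // log2Ceil for powers of two
        for (queueHead <- 0 until 8) {
          val rIdx  = queueHead >> width                     // row shared by all banks
          val rMask = 1 << (queueHead & (tensorsInClNb - 1)) // one-hot bank select
          println(f"head=$queueHead%2d -> rIdx=$rIdx%d rMask=${rMask.toBinaryString}%4s")
        }
      }
    }

Running it shows consecutive heads walking across the banks before the shared row index advances.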

##########
File path: hardware/chisel/src/main/scala/core/FetchWideVME.scala
##########
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package vta.core
+
+//import scala.math.pow
+
+import chisel3._
+import chisel3.util._
+import vta.util.config._
+import vta.shell._
+
+/** Fetch.
+ *
+ * The fetch unit reads instructions (tasks) from memory (i.e. DRAM), using the
+ * VTA Memory Engine (VME), and pushes them into an instruction queue called
+ * inst_q. Once the instruction queue is full, instructions are dispatched to
+ * the Load, Compute and Store module queues based on the instruction opcode.
+ * After draining the queue, the fetch unit checks if there are more instructions
+ * via the ins_count register, which is written by the host.
+ *
+ * Additionally, instructions are read in two chunks (see sReadLSB and sReadMSB)
+ * because we are using a DRAM payload of 8 bytes, or half of a VTA instruction.
+ * This should be configurable for larger payloads, i.e. 64 bytes, which can load
+ * more than one instruction at a time. Finally, the instruction queue is
+ * sized (entries_q) depending on the maximum burst allowed in the memory.
+ */
+class FetchWideVME(debug: Boolean = false)(implicit p: Parameters) extends Module {
+  val vp = p(ShellKey).vcrParams
+  val mp = p(ShellKey).memParams
+  val io = IO(new Bundle {
+    val launch = Input(Bool())
+    val ins_baddr = Input(UInt(mp.addrBits.W))
+    val ins_count = Input(UInt(vp.regBits.W))
+    val vme_rd = new VMEReadMaster
+    val inst = new Bundle {
+      val ld = Decoupled(UInt(INST_BITS.W))
+      val co = Decoupled(UInt(INST_BITS.W))
+      val st = Decoupled(UInt(INST_BITS.W))
+    }
+  })
+
+  val tp = new TensorParams("fetch")
+  val tensorsInClNb = tp.clSizeRatio
+  val tensorsInClNbWidth = log2Ceil(tensorsInClNb)
+  val inst_q = Seq.fill(tensorsInClNb) {
+    require((tp.memDepth/tensorsInClNb) * tensorsInClNb == tp.memDepth,
+      "-F- Unexpected queue depth to instructions in cacheline ratio")
+    SyncReadMem(tp.memDepth/tensorsInClNb, UInt(tp.tensorSizeBits.W))
+  }
+
+  //sample start
+  val s1_launch = RegNext(io.launch, init = false.B)
+  val start = io.launch & ~s1_launch
+
+
+  val xrem = Reg(chiselTypeOf(io.ins_count))
+  // fit instruction into 64bit chunks
+  val elemsInInstr = INST_BITS/64
+  val xsize = io.ins_count << log2Ceil(elemsInInstr)
+  // max size of transfer is limited by a buffer size
+  val xmax = (((1 << mp.lenBits) << log2Ceil(tp.clSizeRatio)).min(tp.memDepth)).U
+  val elemNb = Reg(xsize.cloneType)
+
+  val sIdle :: sRead :: sDrain :: Nil = Enum(3)
+  val state = RegInit(sIdle)
+  val isBusy = state === sRead
+
+  val vmeStart = start || (state === sRead && RegNext(state, init = sIdle) === sDrain)
+  val dramOffset  = RegInit(UInt(mp.addrBits.W), init = 0.U)
+  val vmeCmd = Module (new GenVMECmdWideFetch(debug))
+  vmeCmd.io.start := vmeStart
+  vmeCmd.io.isBusy := isBusy & ~vmeStart
+  vmeCmd.io.ins_baddr := Mux(start, io.ins_baddr, io.ins_baddr + (dramOffset << log2Ceil(tp.tensorSizeBits / 8)))
+  vmeCmd.io.vmeCmd <> io.vme_rd.cmd
+  val readLen = vmeCmd.io.readLen
+  val vmeCmdDone = vmeCmd.io.done & ~vmeStart
+
+  vmeCmd.io.xsize := elemNb
+  vmeCmd.io.sram_offset := 0.U // this is a queue we reload
+
+  io.vme_rd.data.ready := true.B
+  val pipeDelayQueueDeqV = RegNext(io.vme_rd.data.valid, init = false.B)
+  val pipeDelayQueueDeqF = pipeDelayQueueDeqV // fire()
+  val pipeDelayQueueDeqB = RegNext(io.vme_rd.data.bits)
+
+  // Number of CLs requested but not yet received.
+  val clCntIdxWdth = log2Ceil(tp.memDepth/tensorsInClNb) + 1
+  val clInFlight = Reg(UInt(clCntIdxWdth.W))
+  when(start) {
+    clInFlight := 0.U
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && !pipeDelayQueueDeqF) {
+    clInFlight := clInFlight + readLen
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && pipeDelayQueueDeqF) {
+    clInFlight := clInFlight + readLen - 1.U
+  }.elsewhen(isBusy && !io.vme_rd.cmd.fire() && pipeDelayQueueDeqF) {
+    assert(clInFlight > 0.U)
+    clInFlight := clInFlight - 1.U
+  }.otherwise {
+    clInFlight := clInFlight
+  }
+
+  // number of entries in a queue
+  val queueCount = Reg(UInt((tp.memAddrBits + 1).W))
+  val queueHead  = Wire(UInt(tp.memAddrBits.W))
+  val queueHeadNext  = Reg(UInt(tp.memAddrBits.W))
+  val forceRead  = Wire(Bool())
+  forceRead := false.B
+  // control
+  switch(state) {
+    is(sIdle) {
+      when(start) {
+        state := sRead
+        dramOffset := 0.U
+        when(xsize < xmax) {
+          elemNb := xsize
+          xrem := 0.U
+        }.otherwise {
+          elemNb := xmax
+          xrem := xsize - xmax
+        }
+      }
+    }
+    is(sRead) {
+      when(vmeCmdDone && clInFlight === 0.U) {
+        forceRead := true.B
+        state := sDrain
+      }
+    }
+    is(sDrain) {
+      when(queueCount === 0.U) {
+        dramOffset := dramOffset + elemNb
+        when(xrem === 0.U) {
+          state := sIdle
+        }.elsewhen(xrem < xmax) {
+          state := sRead
+          elemNb := xrem
+          xrem := 0.U
+        }.otherwise {
+          state := sRead
+          elemNb := xmax
+          xrem := xrem - xmax
+        }
+      }
+    }
+  }
+
+
+  //---------------------
+  //--- Read VME data ---
+  //---------------------
+
+  val readData = Module(new ReadVMEDataWide("fetch", debug))
+  readData.io.start := vmeStart
+  //io.vme_rd.data <> readData.io.vmeData

Review comment:
       Updated PR.

##########
File path: hardware/chisel/src/main/scala/core/Compute.scala
##########
@@ -119,6 +122,7 @@ class Compute(debug: Boolean = false)(implicit val p: Parameters) extends Module
 
   // uop
   loadUop.io.start := state === sIdle & start & dec.io.isLoadUop
+  //loadUop.io.dec := inst_q.io.deq.bits.asTypeOf(new MemDecode)

Review comment:
       Updated PR.
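The commented-out line relies on Chisel's asTypeOf to reinterpret the raw instruction word from the queue as a MemDecode bundle. A hedged, self-contained sketch of that pattern follows; FakeDecode and its field layout are hypothetical stand-ins, not the actual MemDecode definition.

    import chisel3._

    // hypothetical stand-in for MemDecode; field widths sum to the raw width
    class FakeDecode extends Bundle {
      val dramOffset = UInt(32.W)
      val sramOffset = UInt(16.W)
      val xsize      = UInt(16.W)
    }

    class DecodeSketch extends Module {
      val io = IO(new Bundle {
        val raw   = Input(UInt(64.W))
        val xsize = Output(UInt(16.W))
      })
      // reinterpret the flat instruction bits as the bundle's fields (no logic added)
      val dec = io.raw.asTypeOf(new FakeDecode)
      io.xsize := dec.xsize
    }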




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@tvm.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

