Timothy Miller wrote:
> There's lots of stuff you can do earlier in the pipeline, such as
> detecting row misses and such.  One approach I've considered (which
> suffers from the evil two-level state machine problem) is something
> like what I describe below.  Some of the pipelining might seem
> excessive, but the idea is to get it to run at 200MHz.
>
What is the evil two-level state machine problem?

> First of all, since reads can come from multiple places and there's
> lots of pipeline latency, read requests are addresses, accompanied by
> a TAG.  This tag is used by upstream logic to figure out where the
> data came from and whom it's going to.
>
> Stage 0:
> Deal with memory controller internal busy signals.  Basically, you mux
> between the in-coming command and an in-coming command that was
> registered on an earlier cycle.  This way, the out-going busy signal
> is registered (very important).
>
> Stage 1:
> Look up the last row associated with the selected bank.  In that
> table, replace the row number with the new one.
> For a read, enqueue the tag.
>
> Stage 2:
> Compare the row address with the one looked up in the last stage.
>
> Stage 3:
> Split the command into a 1-hot encoding.
> Determine if the row is open (might need a precharge) or closed
> (definitely needs an activate).
> Determine if there is a row miss based on the compare from the last
> stage and whether or not the row is open.
>
> Stage 4:
> This stage determines if we're going to do the command issued, or
> we're going to do something else.  For instance, if there's a row
> miss, we need to hang onto the read/write command and issue precharge,
> then activate, then the read/write.
> This is the first state machine, which can be busy, holding up earlier
> stages.
> This stage may also detect the need for a refresh and insert
> appropriate commands ahead of anything else that might be coming in.
>
> Stage 5:
> This is the second state machine.
> Execute commands generated by previous stage.
> Issue commands directly to RAM chips.
> Manage counters to insert appropriate timing delays.
> This stage has its own busy signal, complicating things even further.
>
> The mechanics of dealing with CAS latency are handled in other logic,
> and that logic dequeues the tags when appropriate.
>
I made a sketch inspired by this. Your mention of 1-hot encoding (which
I had to look up) gave me the idea that it can also be used for counters,
as a simple mask with a 1-bit for each occupied cycle. A nice
side-effect is that the max-function which is needed at one point becomes
a simple bitwise OR. The sketch is not programmable yet, but instead
executes directly at stage 4. It would be good if you or someone else
could share an opinion on whether this could be pushed to 200 MHz, before
I spend more time on it.

// Sketch of a pipelined DDR SDRAM controller.
// Stages 0-3 classify each incoming request (row hit/miss, bank open or
// closed); stage 4 issues commands straight onto the DDR command bus.
// Timing constraints are tracked as one-hot shift masks ("tmasks"):
// bit 0 clear means the constrained command may issue this cycle, and
// OR-ing two masks yields the max of two delays as a plain bitwise or.
// NOTE(review): there is no reset handling -- busy, the tmasks and the
// open-row table all power up as X; presumably an init sequence is
// planned.  TODO confirm.
module ddr_ctl(clock, reset_, req, busy, rd_avail, rd_tag,
               ddr_clk, ddr_clk_, ddr_cke, ddr_bank, ddr_addr,
               ddr_cmd, ddr_dm, ddr_dq_io, ddr_dqs);
    parameter chip_count = 2;

    // Bus Widths
    parameter tag_width = 4;
    parameter cmd_width = 2;
    parameter addr_width = 24;
    parameter   col_width = 9;
    parameter   row_width = 13;
    parameter   bank_width = 2;
    parameter data_width = 16*chip_count;
    parameter mask_width = data_width/8;
    parameter bank_count = 1 << bank_width;

    // Commands accepted on the client request bus.
    parameter CMD_REFRESH = 2'b11;
    parameter CMD_READ = 2'b01;
    parameter CMD_WRITE = 2'b10;

    // The Request Format, packed LSB-first:
    // {tag, cmd, bank, row, col, data, mask}
    parameter req_width = cmd_width + tag_width + addr_width
                        + data_width + mask_width;
    parameter req_mask_offset = 0;
    parameter req_data_offset = req_mask_offset + mask_width;
    parameter req_addr_offset = req_data_offset + data_width;
    parameter   req_col_offset = req_addr_offset;
    parameter   req_row_offset = req_col_offset + col_width;
    parameter   req_bank_offset = req_row_offset + row_width;
    parameter req_cmd_offset = req_addr_offset + addr_width;
    parameter req_tag_offset = req_cmd_offset + cmd_width;

    // Client Connectors
    input clock;
    input reset_;
    input[req_width-1:0] req;
    output busy;
    output rd_avail;
    output[tag_width-1:0] rd_tag;

    // DDR Connectors
    // Some of the ports connect to both memories, for others the lines are
    // split between them.
    parameter ddr_addr_width = 12;
    // {RAS_, CAS_, WE_} encodings per the DDR command truth table.
    parameter DDR_CMD_NOOP      = 3'b111;
    parameter DDR_CMD_READ      = 3'b101;
    parameter DDR_CMD_WRITE     = 3'b100;
    parameter DDR_CMD_PRECHARGE = 3'b010;
    parameter DDR_CMD_ACTIVE    = 3'b011;
    output ddr_clk, ddr_clk_, ddr_cke;
    output[bank_width-1:0] ddr_bank;
    output[ddr_addr_width-1:0] ddr_addr;
    output[2:0] ddr_cmd; // = {ddr_ras_,ddr_cas_,ddr_we_}
    output[data_width/8-1:0] ddr_dm;
    inout[data_width-1:0] ddr_dq_io;
    inout[chip_count-1:0] ddr_dqs;

    // Maximum Configurable Timing Values (mask widths, in clock cycles)
    parameter t_poa_max = 90;
    parameter t_read_max = 30;
    parameter t_write_max = 30;
    parameter t_rw_max = t_read_max > t_write_max? t_read_max : t_write_max;

// Functions to De-Compose a Request
function[cmd_width-1:0] req_cmd;
        input[req_width-1:0] req;
    req_cmd = req[req_cmd_offset + cmd_width - 1 : req_cmd_offset];
endfunction
function[tag_width-1:0] req_tag;
        input[req_width-1:0] req;
    req_tag = req[req_tag_offset + tag_width - 1 : req_tag_offset];
endfunction
function[bank_width-1:0] req_bank;
        input[req_width-1:0] req;
    req_bank = req[req_bank_offset + bank_width - 1 : req_bank_offset];
endfunction
function[row_width-1:0] req_row;
        input[req_width-1:0] req;
    req_row = req[req_row_offset + row_width - 1 : req_row_offset];
endfunction
function[col_width-1:0] req_col;
        input[req_width-1:0] req;
    req_col = req[req_col_offset + col_width - 1 : req_col_offset];
endfunction
function[mask_width-1:0] req_mask;
        input[req_width-1:0] req;
    req_mask = req[req_mask_offset + mask_width - 1 : req_mask_offset];
endfunction
function[data_width-1:0] req_data;
        input[req_width-1:0] req;
    req_data = req[req_data_offset + data_width - 1 : req_data_offset];
endfunction


// Interface to DDR and dealing with DDR, CAS latency and tDQSS.
// A small shift-register delay line: stage 4 drops an entry in at slot
// T_CAS (reads) or T_DQSS (writes) and it pops out at slot 0 exactly
// when the data pins must be driven / sampled.
parameter DDR_IOQ_CMD_READ = 2'b01;
parameter DDR_IOQ_CMD_WRITE = 2'b10;
reg[1:0]            ddr_ioq_cmd;
reg[mask_width-1:0] ddr_ioq_mask;
reg[data_width-1:0] ddr_ioq_data; // data for write, or tag for read

integer i;
parameter T_CAS = 3;  // TODO make configurable
parameter T_DQSS = 1;
parameter DDR_IOQ_LENGTH = T_CAS > T_DQSS? T_CAS : T_DQSS;
parameter DDR_IOQ_CMDBIT_READ = data_width+mask_width;
parameter DDR_IOQ_CMDBIT_WRITE = data_width+mask_width+1;
reg[data_width+mask_width+1:0] ddr_ioq_arr[0:DDR_IOQ_LENGTH];
wire[data_width+mask_width+1:0] ddr_ioq_current;
always @(posedge clock) begin
    // Shift the whole delay line down one slot each cycle.
    // BUG FIX: the loop previously stopped at DDR_IOQ_LENGTH-1, so the
    // last slot was stranded and an entry written at index T_CAS
    // (== DDR_IOQ_LENGTH) never reached slot 0.
    for (i = 1; i <= DDR_IOQ_LENGTH; i = i + 1)
        ddr_ioq_arr[i - 1] <= ddr_ioq_arr[i];
    // BUG FIX: drain the tail to idle so an entry is not replayed.
    // (A conditional write below overrides this, last assignment wins.)
    ddr_ioq_arr[DDR_IOQ_LENGTH] <= 0;
    if (ddr_ioq_cmd[0]) // read: tag arrives at slot 0 after the CAS latency
        ddr_ioq_arr[T_CAS] <= {ddr_ioq_cmd,ddr_ioq_mask,ddr_ioq_data};
    if (ddr_ioq_cmd[1]) // write: data must appear tDQSS after the command
        ddr_ioq_arr[T_DQSS] <= {ddr_ioq_cmd,ddr_ioq_mask,ddr_ioq_data};
end
reg busy;
reg[2:0] ddr_cmd;
reg[bank_width-1:0] ddr_bank;
reg[ddr_addr_width-1:0] ddr_addr;
// Slot 0 of the delay line is what drives the pins this cycle.
// BUG FIX: ddr_ioq_current was declared but never driven.
assign ddr_ioq_current = ddr_ioq_arr[0];
// Drive the data bus only while a write entry is at the head.
// BUG FIX: the high-impedance constant was hard-coded to 32 bits; use
// data_width so other chip_count values work.
assign ddr_dq_io = ddr_ioq_current[DDR_IOQ_CMDBIT_WRITE]
                   ? ddr_ioq_current[data_width-1:0] : {data_width{1'bz}};
assign ddr_dm = ddr_ioq_current[data_width+mask_width-1:data_width];
assign rd_avail = ddr_ioq_current[DDR_IOQ_CMDBIT_READ];
// For reads the data field carries the tag, so it pops out here exactly
// when the chip returns the data.
assign rd_tag = ddr_ioq_current[tag_width-1:0];
// BUG FIX: the clock and clock-enable outputs were never driven.
assign ddr_clk = clock;
assign ddr_clk_ = ~clock;
assign ddr_cke = reset_; // enable the chips once out of (active-low) reset
// NOTE(review): ddr_dqs is left undriven in this sketch; generating the
// write strobes needs DDR I/O primitives outside this module.


// Stage 0
// Register the incoming request so the busy signal seen by the client
// stays registered.  (The mux with a previously-registered command
// described in the original posting is not implemented yet.)
reg[req_width-1:0] s0_req;
always @(posedge clock) begin
    if (!busy) begin
        s0_req <= req;
    end
end

// Stage 1
// Look up the row last opened in the requested bank, and record the new
// row as the one that will be open after this request completes.
reg[req_width-1:0] s1_req;
reg[row_width-1:0] s1_open_row;
reg[row_width-1:0] s1_open_row_arr[0:bank_count-1];
always @(posedge clock) begin
    if (!busy) begin
        // BUG FIX: was "s1_req <= s1_req", which never captured the
        // request; the stage now consumes s0_req from stage 0 (which was
        // otherwise dead) so the lookup and the pipelined request match.
        s1_req <= s0_req;
        s1_open_row <= s1_open_row_arr[req_bank(s0_req)];
        s1_open_row_arr[req_bank(s0_req)] <= req_row(s0_req);
    end
end

// Stage 2
// Compare the requested row against the row found in the table.
// "maybe" because the bank may turn out to be closed (checked in stage 3).
reg[req_width-1:0] s2_req;
reg s2_maybe_row_hit;
always @(posedge clock) begin
    if (!busy) begin
        s2_req <= s1_req;
        s2_maybe_row_hit <= req_row(s1_req) == s1_open_row;
    end
end

// Stage 3
// Decide what the request needs: a precharge and/or an activate before
// the read/write, and split the command into one-hot is_read/is_write.
reg[req_width-1:0] s3_req;
reg s3_row_is_open_arr[0:bank_count-1];
reg s3_row_hit;
reg s3_need_precharge;
reg s3_need_activate;
reg s3_is_read;
reg s3_is_write;
// Separate loop index: sharing "i" with the IOQ always block above would
// race in simulation.
integer j;
always @(posedge clock) begin
    if (!busy) begin
        s3_req <= s2_req;
        // A read/write leaves this bank open (the refresh arm below
        // overrides this and closes every bank; last assignment wins).
        s3_row_is_open_arr[req_bank(s2_req)] <= 1;
        s3_need_precharge <= 0;
        s3_need_activate <= 0;
        // BUG FIX: s3_is_read/s3_is_write used blocking "=" and were
        // never cleared, so every command after the first read/write
        // kept looking like one forever.  Non-blocking with defaults.
        s3_is_read <= 0;
        s3_is_write <= 0;
        case (req_cmd(s2_req))
            CMD_REFRESH: begin
                for (j = 0; j < bank_count; j = j + 1)
                    s3_row_is_open_arr[j] <= 0;
            end
            CMD_READ, CMD_WRITE:
                if (s3_row_is_open_arr[req_bank(s2_req)]) begin
                    // Bank open on a different row: close, then reopen.
                    if (!s2_maybe_row_hit) begin
                        s3_need_precharge <= 1;
                        s3_need_activate <= 1;
                    end
                end else begin
                    // Bank closed: just open the requested row.
                    s3_need_activate <= 1;
                end
        endcase
        if (req_cmd(s2_req) == CMD_READ) s3_is_read <= 1'b1;
        if (req_cmd(s2_req) == CMD_WRITE) s3_is_write <= 1'b1;
    end
end

// Stage 4
// Issue commands to the DDR bus, spacing them with the one-hot tmasks.
// NOTE(review): s3_need_precharge/s3_need_activate are assigned from
// both the stage-3 block above and this block -- fine in simulation
// when the writes never coincide, but most synthesis tools reject
// multiply-driven regs; the two blocks should eventually be merged.
reg[t_poa_max-1:0] s4_tmask_poa;
reg[t_read_max-1:0] s4_tmask_read;
reg[t_write_max-1:0] s4_tmask_write;
reg[req_width-1:0] s4_req;

// Timing configuration (TODO: add interface to change settings)
reg[t_poa_max-1:0]   tmask_precharge_to_activate;
reg[t_poa_max-1:0]   tmask_activate_to_precharge;
reg[t_poa_max-1:0]   tmask_read_to_precharge;
reg[t_poa_max-1:0]   tmask_write_to_precharge;
reg[t_rw_max-1:0]    tmask_activate_to_rw;
reg[t_write_max-1:0] tmask_read_to_write;
reg[t_read_max-1:0]  tmask_write_to_read;

always @(posedge clock) begin
    // Defaults: timers tick down, NOOP on the bus, pipeline stalled,
    // nothing enqueued toward the data pins.
    s4_tmask_poa   <= s4_tmask_poa   >> 1;
    s4_tmask_read  <= s4_tmask_read  >> 1;
    s4_tmask_write <= s4_tmask_write >> 1;

    ddr_cmd <= DDR_CMD_NOOP;
    // BUG FIX: ddr_ioq_cmd previously held its last value forever, so a
    // single read/write kept re-enqueueing into the IO queue every cycle.
    ddr_ioq_cmd <= 2'b00;
    busy <= 1;
    // BUG FIX: the branches below were four independent "if"s testing
    // the pre-clock flag/tmask values, so an ACTIVE could issue in the
    // same cycle as the PRECHARGE it must follow (overwriting ddr_cmd),
    // and a READ/WRITE could clobber an ACTIVE.  Precharge/activate are
    // now mutually exclusive, and read/write wait for both to be done.
    if (s3_need_precharge && !s4_tmask_poa[0]) begin
        ddr_cmd <= DDR_CMD_PRECHARGE;
        ddr_bank <= req_bank(s3_req);

        s3_need_precharge <= 0;
        s4_tmask_poa <= tmask_precharge_to_activate;
    end
    else if (s3_need_activate && !s4_tmask_poa[0]) begin
        ddr_cmd <= DDR_CMD_ACTIVE;
        ddr_bank <= req_bank(s3_req);
        ddr_addr <= req_row(s3_req);

        s3_need_activate <= 0;
        s4_tmask_poa <= tmask_activate_to_precharge;
        s4_tmask_write <= tmask_activate_to_rw;
        s4_tmask_read <= tmask_activate_to_rw;
    end
    if (s3_is_read && !s3_need_precharge && !s3_need_activate
        && !s4_tmask_read[0]) begin
        ddr_cmd <= DDR_CMD_READ;
        ddr_bank <= req_bank(s3_req);
        ddr_addr <= req_col(s3_req);
        ddr_ioq_cmd <= DDR_IOQ_CMD_READ;
        ddr_ioq_data <= req_tag(s3_req); // tag rides the data field
        // Request complete: let the pipeline advance one step.
        busy <= 0;
        // OR keeps the stricter (longer) of the running and new delays.
        s4_tmask_poa   <= (s4_tmask_poa >> 1)   | tmask_read_to_precharge;
        s4_tmask_write <= (s4_tmask_write >> 1) | tmask_read_to_write;
    end
    if (s3_is_write && !s3_need_precharge && !s3_need_activate
        && !s4_tmask_write[0]) begin
        ddr_cmd <= DDR_CMD_WRITE;
        ddr_bank <= req_bank(s3_req);
        ddr_addr <= req_col(s3_req);
        ddr_ioq_cmd <= DDR_IOQ_CMD_WRITE;
        ddr_ioq_mask <= req_mask(s3_req);
        ddr_ioq_data <= req_data(s3_req);
        // Request complete: let the pipeline advance one step.
        busy <= 0;
        s4_tmask_poa  <= (s4_tmask_poa >> 1)  | tmask_write_to_precharge;
        s4_tmask_read <= (s4_tmask_read >> 1) | tmask_write_to_read;
    end
end

endmodule
_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)

Reply via email to