From 1aeb760d093189486efbf5adf3292881eda94eb0 Mon Sep 17 00:00:00 2001
From: Julian Blake Kongslie
Date: Sun, 24 Jul 2022 14:59:03 -0700
Subject: Writeback cache using explicit altsyncram instead of inferred memory.

---
 .gitignore           |   1 +
 PLAN                 |  24 +++++++
 hdl/defs.svh         |   3 +-
 hdl/mem_broadcast.sv |   2 +
 hdl/mem_cache.sv     | 191 ++++++++++++++++++++++++++++++++++++++-------------
 hdl/top.sv           |   6 +-
 6 files changed, 175 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 298becb..1922d9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 /build
 /db
+/greybox_tmp
 /incremental_db
 /pdp8.*
diff --git a/PLAN b/PLAN
index e083e8e..60adf5f 100644
--- a/PLAN
+++ b/PLAN
@@ -1,4 +1,28 @@
 0. writeback cache
+    [✔️] Cache dirty bits
+    [✔️] Cache evicting dirty data on fills that would replace it
+    [✔️] Cache not immediately forwarding writes
+    [✔️] Fix mem_cache to actually instantiate memory correctly
+        [✔️] Run Quartus in Windows to generate a Verilog template for manual instantiation of RAM blocks
+            [X] Try to use asynchronous clears for reset instead of occupying a port for SETS cycles NOPE
+            [✔️] Need at least one port capable of read-before-write
+            [✔️] Maybe don't need a second port if the first port can make write optional
+        [✔️] We might need to split our accesses across two cycles
+            [X] If so, can we infer the correct logic without explicit instantiation of the megafunction? NOPE
+        [X] Can we do asynchronous clear without explicit instantiation of the megafunction? NOPE
+        [✔️] Copy from said template into mem_cache.sv instead of trying to use inference
+    ---
+    [ ] Arbiter sending snoops to caches in response to CLI writes
+    [ ] Cache updating itself to clean state for write snoops
+    ---
+    [ ] Arbiter sending snoops to caches in response to CLI reads
+    [ ] Arbiter waiting for snoop responses from caches for CLI reads
+    [ ] Arbiter sending correct data for CLI reads (snoop responses in preference over RAM response)
+    [ ] Cache sending snoop responses for read snoops
+    ---
+    [ ] Cache forwarding snoops upstream
+    [ ] Core updating itself for write snoops (no-op)
+    [ ] Core sending snoop responses for read snoops (always no data)
 1. pipelining that works with SMC / start working on minhdl version of the core
 2. write an SPI or I2C master on the FPGA to sample analog inputs
 3. support wider-than-single-word cache lines
diff --git a/hdl/defs.svh b/hdl/defs.svh
index 73fddaf..dc4a243 100644
--- a/hdl/defs.svh
+++ b/hdl/defs.svh
@@ -5,7 +5,7 @@
 
 `define PDP_ADDRESS_BITS    15
 
-`define NUM_PDPS            4
+`define NUM_PDPS            8
 
 `define UART_BYTE_BITS      8
 
@@ -56,5 +56,6 @@ typedef struct packed {
 typedef struct packed {
     pdp_line_address_t  address;
     bit                 snoop;
+    bit                 data_valid;
     ram_line_t          data;
 } mem_to_core_t;
diff --git a/hdl/mem_broadcast.sv b/hdl/mem_broadcast.sv
index 599be28..e86873e 100644
--- a/hdl/mem_broadcast.sv
+++ b/hdl/mem_broadcast.sv
@@ -48,6 +48,8 @@ module mem_broadcast
                     if (!pdp_valid[ram_data.tag-1]) begin
                         pdp_valid[ram_data.tag-1] = 1;
                         pdp_data[ram_data.tag-1].address = hold_data.address[`PDP_ADDRESS_BITS-1:$clog2(`RAM_LINE_WORDS)];
+                        pdp_data[ram_data.tag-1].snoop = 0;
+                        pdp_data[ram_data.tag-1].data_valid = 1;
                         pdp_data[ram_data.tag-1].data = hold_data.data;
                         hold_valid = 0;
                     end
diff --git a/hdl/mem_cache.sv b/hdl/mem_cache.sv
index 181d8d7..e7fcac7 100644
--- a/hdl/mem_cache.sv
+++ b/hdl/mem_cache.sv
@@ -6,8 +6,6 @@ module mem_cache
     (   input   bit clock
     ,   input   bit reset
 
-    ,   input   bit clear
-
     ,   output  bit             core_command_ready
     ,   input   bit             core_command_valid
     ,   input   core_to_mem_t   core_command_data
@@ -32,6 +30,7 @@ module mem_cache
 
     typedef struct packed {
         bit             valid;
+        bit             dirty;
         address_tag_t   address;
     } tag_t;
 
@@ -40,73 +39,171 @@ module mem_cache
         ram_line_t  data;
     } cache_entry_t;
 
-    (* ramstyle = "no_rw_check, M9K" *) cache_entry_t   cache   [(1<<SET_BITS)-1:0];
-
-    bit outstanding_fill;
+    struct packed { // Signals for one RAM port: cache[0] is wired to port A of cache_controller, cache[1] to port B.
+        set_t           address;
+        bit             read_enable;
+        bit             write_enable;
+        cache_entry_t   read_data; // Driven by the RAM's q output for this port.
+        cache_entry_t   write_data;
+    } cache[1:0];
+
+    altsyncram // Explicitly instantiated two-port block RAM that stores the cache entries (tag + data).
+        #(  .address_reg_b("CLOCK0")
+        ,   .clock_enable_input_a("BYPASS"), .clock_enable_input_b("BYPASS")
+        ,   .clock_enable_output_a("BYPASS"), .clock_enable_output_b("BYPASS")
+        ,   .indata_reg_b("CLOCK0")
+        ,   .numwords_a(1 << SET_BITS), .numwords_b(1 << SET_BITS) // One entry per set.
+        ,   .operation_mode("BIDIR_DUAL_PORT")
+        ,   .outdata_aclr_a("NONE"), .outdata_aclr_b("NONE")
+        ,   .outdata_reg_a("UNREGISTERED"), .outdata_reg_b("UNREGISTERED")
+        ,   .power_up_uninitialized("TRUE") // Contents are zeroed by the clear sweep in the always block below.
+        ,   .ram_block_type("M9K")
+        ,   .read_during_write_mode_mixed_ports("OLD_DATA")
+        ,   .read_during_write_mode_port_a("OLD_DATA"), .read_during_write_mode_port_b("OLD_DATA") // Read-before-write: the state machine reads the old entry in the same access that writes the new one.
+        ,   .widthad_a(SET_BITS), .widthad_b(SET_BITS)
+        ,   .width_a($bits(cache_entry_t)), .width_b($bits(cache_entry_t))
+        ,   .width_byteena_a(1), .width_byteena_b(1)
+        ,   .wrcontrol_wraddress_reg_b("CLOCK0")
+        )   cache_controller
+        (   .address_a(cache[0].address), .address_b(cache[1].address)
+        ,   .clock0(~clock) // The RAM runs on the inverted clock; NOTE(review): presumably so it samples the port signals half a cycle after the state machine updates them — confirm.
+        ,   .data_a(cache[0].write_data), .data_b(cache[1].write_data)
+        ,   .rden_a(cache[0].read_enable), .rden_b(cache[1].read_enable)
+        ,   .wren_a(cache[0].write_enable), .wren_b(cache[1].write_enable)
+        ,   .q_a(cache[0].read_data), .q_b(cache[1].read_data)
+        ,   .aclr0(1'b0), .aclr1(1'b0) // Asynchronous clears are tied off.
+        ,   .addressstall_a(1'b0), .addressstall_b(1'b0)
+        ,   .byteena_a(1'b1), .byteena_b(1'b1)
+        ,   .clock1(1'b1)
+        ,   .clocken0(1'b1), .clocken1(1'b1), .clocken2(1'b1), .clocken3(1'b1)
+        ,   .eccstatus()
+        );
 
     bit [SET_BITS:0]    reset_entry;
 
+    // "The" fill buffer
+    address_tag_t   working_tag;
+    set_t           working_set;
+
+    (* syn_encoding = "one-hot" *) enum int unsigned
+        {   AWAIT_CORE_COMMAND
+        ,   AWAIT_CACHE
+        ,   SEND_FILL_REQUEST
+        ,   AWAIT_RAM_RESPONSE
+        } state;
+
     always @(posedge clock) begin
         if (reset) begin
             core_command_ready = 0;
             ram_command_valid = 0;
             ram_response_ready = 0;
             core_response_valid = 0;
-            outstanding_fill = 0;
             reset_entry = 0;
+            cache[0].address = 0; cache[1].address = 0;
+            cache[0].read_enable = 0; cache[1].read_enable = 0;
+            cache[0].write_enable = 0; cache[1].write_enable = 0;
+            cache[0].write_data = 0; cache[1].write_data = 0;
+            state = state.first;
         end else begin
-            if (clear)
-                reset_entry = 0;
-
             if (ram_command_ready && ram_command_valid)
                 ram_command_valid = 0;
             if (core_response_ready && core_response_valid)
                 core_response_valid = 0;
 
-            if (!outstanding_fill && !reset_entry[SET_BITS]) begin
-                cache[reset_entry[SET_BITS-1:0]] = 0;
-                ++reset_entry;
-            end else if (ram_response_ready && ram_response_valid && outstanding_fill) begin
-                automatic address_tag_t tag;
-                automatic set_t set;
-                automatic cache_entry_t entry;
-                {tag, set} = ram_response_data.address;
-                entry.tag.valid = 1;
-                entry.tag.address = tag;
-                entry.data = ram_response_data.data;
-                cache[set] = entry;
-                core_response_valid = 1;
-                core_response_data = ram_response_data;
-                outstanding_fill = 0;
-            end else if (core_command_ready && core_command_valid) begin
-                automatic address_tag_t tag;
-                automatic set_t set;
-                {tag, set} = core_command_data.address;
-                if (core_command_data.write) begin
-                    automatic cache_entry_t entry;
-                    entry.tag.valid = 1;
-                    entry.tag.address = tag;
-                    // FIXME masked stores
-                    entry.data = core_command_data.data;
-                    cache[set] = entry;
-                    ram_command_valid = 1;
-                    ram_command_data = core_command_data;
-                end else begin
-                    automatic cache_entry_t entry = cache[set];
-                    if (entry.tag.valid && entry.tag.address == tag) begin
+            if (!reset_entry[SET_BITS]) begin
+                cache[0].address = reset_entry[SET_BITS-1:0];
+                cache[1].address = reset_entry[SET_BITS-1:0] + 1;
+                cache[0].read_enable = 0; cache[1].read_enable = 0;
+                cache[0].write_enable = 1; cache[1].write_enable = 1;
+                cache[0].write_data = 0; cache[1].write_data = 0;
+                reset_entry += 2;
+            end else begin
+                case (state)
+
+                AWAIT_CORE_COMMAND: begin // Idle state: both RAM ports quiescent until the core hands over a command.
+                    cache[0].read_enable = 0;
+                    cache[0].write_enable = 0; cache[1].write_enable = 0; // The clear sweep leaves port B's write enable set; release it here (first state after the sweep) so port B stops zeroing the last set.
+
+                    if (core_command_ready && core_command_valid) begin
+                        {working_tag, working_set} = core_command_data.address; // Remember the request address for the following states.
+                        cache[0].address = working_set;
+                        cache[0].read_enable = 1; // Always read the existing entry so AWAIT_CACHE can check its tag and dirty bit.
+                        cache[0].write_enable = core_command_data.write; // A core write replaces the entry in the same access.
+                        cache[0].write_data.tag.valid = 1;
+                        cache[0].write_data.tag.dirty = 1; // Entries written by the core are marked dirty.
+                        cache[0].write_data.tag.address = working_tag;
+                        cache[0].write_data.data = core_command_data.data;
+                        state = AWAIT_CACHE;
+                    end
+                end
+
+                AWAIT_CACHE: begin // Examine the entry read from working_set (cache[0].read_data) and decide how to finish the command.
+                    if (cache[0].read_data.tag.valid && cache[0].read_data.tag.dirty && cache[0].read_data.tag.address != working_tag) begin // Old entry is dirty and holds a different line: write it back to RAM.
+                        ram_command_valid = 1;
+                        ram_command_data.address = {cache[0].read_data.tag.address, working_set}; // Address of the evicted line, rebuilt from its stored tag.
+                        ram_command_data.write = 1;
+                        ram_command_data.snoop_response = 0;
+                        ram_command_data.data = cache[0].read_data.data;
+                        ram_command_data.mask = ~0;
+                        state = cache[0].write_enable ? AWAIT_CORE_COMMAND : SEND_FILL_REQUEST; // A core write is finished after the eviction; a core read still needs its fill request.
+                    end else if (cache[0].write_enable) begin // Core write with nothing to evict: done.
+                        core_command_ready = !core_response_valid && !ram_command_valid; // NOTE: overwritten by the core_command_ready assignment after the case statement.
+                        state = AWAIT_CORE_COMMAND;
+                    end else if (cache[0].read_data.tag.valid && cache[0].read_data.tag.address == working_tag) begin // Read hit: answer from the cache.
                         core_response_valid = 1;
-                        core_response_data.address = {tag, set};
-                        core_response_data.data = entry.data;
+                        core_response_data.address = {working_tag, working_set};
+                        core_response_data.snoop = 0;
+                        core_response_data.data_valid = 1;
+                        core_response_data.data = cache[0].read_data.data;
+                        state = AWAIT_CORE_COMMAND;
                     end else begin
                         ram_command_valid = 1;
-                        ram_command_data = core_command_data;
-                        outstanding_fill = 1;
+                        ram_command_data.address = {working_tag, working_set}; // Read miss with nothing to evict: request the line from RAM.
+                        ram_command_data.write = 0;
+                        ram_command_data.snoop_response = 0;
+                        state = AWAIT_RAM_RESPONSE;
                     end
+
+                    cache[0].read_enable = 0; // The access issued in AWAIT_CORE_COMMAND is one-shot: drop port A's enables.
+                    cache[0].write_enable = 0;
                 end
-            end
 
-            core_command_ready = reset_entry[SET_BITS] && !ram_command_valid && !core_response_valid && !outstanding_fill;
-            ram_response_ready = !core_response_valid;
+                SEND_FILL_REQUEST: begin // Entered from AWAIT_CACHE when a core read had to evict a dirty entry first.
+                    cache[0].read_enable = 0;
+                    cache[0].write_enable = 0;
+
+                    if (!ram_command_valid) begin // Wait until the pending RAM command (the eviction write) has been accepted.
+                        ram_command_valid = 1;
+                        ram_command_data.address = {working_tag, working_set};
+                        ram_command_data.write = 0; // Read request for the line the core asked for.
+                        ram_command_data.snoop_response = 0;
+                        state = AWAIT_RAM_RESPONSE;
+                    end
+                end
+
+                AWAIT_RAM_RESPONSE: begin // A fill read has been issued; wait for the response that carries the working address.
+                    cache[0].read_enable = 0;
+                    cache[0].write_enable = 0;
+
+                    if (ram_response_valid && ram_response_data.address == {working_tag, working_set} && ram_response_data.data_valid) begin // NOTE(review): ram_response_ready is not part of this condition — confirm that consuming a response without the ready handshake is intended.
+                        core_response_valid = 1; // Forward the fill data to the core...
+                        core_response_data = ram_response_data;
+                        cache[0].address = working_set; // ...and install it in the cache through port A.
+                        cache[0].read_enable = 0;
+                        cache[0].write_enable = 1;
+                        cache[0].write_data.tag.valid = 1;
+                        cache[0].write_data.tag.dirty = 0; // Freshly filled lines match RAM, so they start clean.
+                        cache[0].write_data.tag.address = working_tag;
+                        cache[0].write_data.data = ram_response_data.data;
+                        state = AWAIT_CORE_COMMAND;
+                    end
+                end
+
+                endcase
+
+                core_command_ready = state == AWAIT_CORE_COMMAND && !core_response_valid && !ram_command_valid;
+                ram_response_ready = state == AWAIT_RAM_RESPONSE && !core_response_valid && !ram_command_valid;
+            end
         end
     end
 
diff --git a/hdl/top.sv b/hdl/top.sv
index 96b7510..7d6ba8e 100644
--- a/hdl/top.sv
+++ b/hdl/top.sv
@@ -493,9 +493,7 @@ module top
 `else
             mem_cache cache
                 (   .clock(internal_clock)
-                ,   .reset(internal_reset)
-
-                ,   .clear(clear_caches)
+                ,   .reset(internal_reset || clear_caches)
 
                 ,   .core_command_ready(cache_command_ready)
                 ,   .core_command_valid(cache_command_valid)
@@ -517,7 +515,7 @@ module top
 
             core cpu
                 (   .clk(internal_clock)
-                ,   .reset(internal_reset)
+                ,   .reset(internal_reset || clear_caches)
 
                 ,   .uart_tx_ready(tx_ready)
                 ,   .uart_tx_valid(tx_valid)
-- 
cgit v1.2.3