From 6b7d1cdc98475f2f21002e7de5a2c563215cb279 Mon Sep 17 00:00:00 2001 From: Julian Blake Kongslie Date: Sun, 15 Jan 2023 14:24:17 -0800 Subject: Reduce stalling due to stores by using a global counter. --- uarch/core.cpp | 24 ++++++++++++------------ uarch/core.h | 7 ++++--- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'uarch') diff --git a/uarch/core.cpp b/uarch/core.cpp index 7304442..76af173 100644 --- a/uarch/core.cpp +++ b/uarch/core.cpp @@ -77,13 +77,7 @@ void decode_stage::clock() { speculative_stores_sent = r.stores_sent; } - if (c.decode_store_completep.can_read()) { - ++stores_done; - assert((int)(speculative_stores_sent - stores_done) >= 0); - c.decode_store_completep.discard(); - } - - if (speculative_stores_sent == stores_done && c.fetch_bundlep.can_read() && c.decode_mem_commandp.can_write() && c.indir_instp.can_write() && c.decode_to_exec_instp.can_write()) { + if (c.fetch_bundlep.can_read() && c.decode_mem_commandp.can_write() && c.indir_instp.can_write() && c.decode_to_exec_instp.can_write()) { auto b = c.fetch_bundlep.peek(); if (b.gen != c.gen) { @@ -106,6 +100,11 @@ void decode_stage::clock() { return; } + if (speculative_stores_sent != c.stores_done) { + pte(b.tr, "z"); + return; + } + inst_bundle i; i.tr = infra::pt::child(b.tr); @@ -190,7 +189,6 @@ void indir_stage::clock() { sr.mask.fill(false); sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; sr.write = true; - sr.responsep = &c.decode_store_completep; c.indir_mem_store_commandp.write(std::move(sr)); } else { pte(i.tr, "I"); @@ -238,9 +236,9 @@ void exec_stage::clock() { sr.mask.fill(false); sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; sr.write = true; - sr.responsep = &c.decode_store_completep; c.exec_mem_commandp.write(std::move(sr)); stores_sent += 2; // Original store sent by Indir stage plus unstore here + c.stores_done += 2; } c.decode_to_exec_instp.discard(); } else if (i.icount == c.icount) { @@ -261,9 +259,9 @@ void exec_stage::clock() { sr.mask.fill(false); sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; sr.write = true; - sr.responsep = &c.decode_store_completep; c.exec_mem_commandp.write(std::move(sr)); stores_sent += 2; // Original store sent by Indir stage plus unstore here + c.stores_done += 2; } c.indir_to_exec_instp.discard(); } else if (i.icount == c.icount) { @@ -305,8 +303,10 @@ void exec_stage::clock() { auto next_pc = inst.next_pc; - if (inst.need_autoinc_store) + if (inst.need_autoinc_store) { ++stores_sent; // It was sent by Indir stage + ++c.stores_done; + } if (inst.need_read_acc) inst.acc = acc; if (inst.need_read_link) @@ -338,9 +338,9 @@ void exec_stage::clock() { sr.mask.fill(false); sr.mask[*inst.final_address & memory::LINE_BYTE_OFFSET_MASK] = true; sr.write = true; - sr.responsep = &c.decode_store_completep; c.exec_mem_commandp.write(std::move(sr)); ++stores_sent; + ++c.stores_done; } assert(inst.next_pc == next_pc || inst.possibly_redirects); diff --git a/uarch/core.h b/uarch/core.h index 21725b3..b8473e6 100644 --- a/uarch/core.h +++ b/uarch/core.h @@ -59,7 +59,6 @@ struct decode_stage : public infra::sim { std::uint64_t icount; unsigned int speculative_stores_sent = 0; - unsigned int stores_done = 0; decode_stage(core &c); @@ -114,7 +113,6 @@ struct core { infra::port decode_mem_commandp; infra::port decode_mem_responsep; - infra::port decode_store_completep; infra::port decode_to_exec_instp; infra::port indir_instp; @@ -126,7 +124,10 @@ struct core { infra::port exec_mem_commandp; infra::port exec_mem_responsep; - // Construction order is execution order within a cycle, so this list should be back-to-front (for zero-cycle restarts) + // Global counters (should be Gray code in FPGA implementation, only do == comparisons) + unsigned int stores_done = 0; + + // Construction order is execution order within a cycle, so this list should be back-to-front (for zero-cycle restarts and store count propagation) exec_stage exec{*this}; indir_stage indir{*this}; decode_stage decode{*this}; -- cgit v1.2.3