diff options
| author | Julian Blake Kongslie | 2023-01-15 14:24:17 -0800 |
|---|---|---|
| committer | Julian Blake Kongslie | 2023-01-15 15:04:07 -0800 |
| commit | 6b7d1cdc98475f2f21002e7de5a2c563215cb279 (patch) | |
| tree | beb6399dae809aa298dd9a885de6fba752b551c6 | |
| parent | Stall decode after an instruction with stores until the stores are done. (diff) | |
| download | biggolf-6b7d1cdc98475f2f21002e7de5a2c563215cb279.tar.xz | |
Reduce stalling due to stores by using a global counter.
| -rw-r--r-- | uarch/core.cpp | 24 | ||||
| -rw-r--r-- | uarch/core.h | 7 |
2 files changed, 16 insertions, 15 deletions
diff --git a/uarch/core.cpp b/uarch/core.cpp index 7304442..76af173 100644 --- a/uarch/core.cpp +++ b/uarch/core.cpp | |||
| @@ -77,13 +77,7 @@ void decode_stage::clock() { | |||
| 77 | speculative_stores_sent = r.stores_sent; | 77 | speculative_stores_sent = r.stores_sent; |
| 78 | } | 78 | } |
| 79 | 79 | ||
| 80 | if (c.decode_store_completep.can_read()) { | 80 | if (c.fetch_bundlep.can_read() && c.decode_mem_commandp.can_write() && c.indir_instp.can_write() && c.decode_to_exec_instp.can_write()) { |
| 81 | ++stores_done; | ||
| 82 | assert((int)(speculative_stores_sent - stores_done) >= 0); | ||
| 83 | c.decode_store_completep.discard(); | ||
| 84 | } | ||
| 85 | |||
| 86 | if (speculative_stores_sent == stores_done && c.fetch_bundlep.can_read() && c.decode_mem_commandp.can_write() && c.indir_instp.can_write() && c.decode_to_exec_instp.can_write()) { | ||
| 87 | auto b = c.fetch_bundlep.peek(); | 81 | auto b = c.fetch_bundlep.peek(); |
| 88 | 82 | ||
| 89 | if (b.gen != c.gen) { | 83 | if (b.gen != c.gen) { |
| @@ -106,6 +100,11 @@ void decode_stage::clock() { | |||
| 106 | return; | 100 | return; |
| 107 | } | 101 | } |
| 108 | 102 | ||
| 103 | if (speculative_stores_sent != c.stores_done) { | ||
| 104 | pte(b.tr, "z"); | ||
| 105 | return; | ||
| 106 | } | ||
| 107 | |||
| 109 | inst_bundle i; | 108 | inst_bundle i; |
| 110 | 109 | ||
| 111 | i.tr = infra::pt::child(b.tr); | 110 | i.tr = infra::pt::child(b.tr); |
| @@ -190,7 +189,6 @@ void indir_stage::clock() { | |||
| 190 | sr.mask.fill(false); | 189 | sr.mask.fill(false); |
| 191 | sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; | 190 | sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; |
| 192 | sr.write = true; | 191 | sr.write = true; |
| 193 | sr.responsep = &c.decode_store_completep; | ||
| 194 | c.indir_mem_store_commandp.write(std::move(sr)); | 192 | c.indir_mem_store_commandp.write(std::move(sr)); |
| 195 | } else { | 193 | } else { |
| 196 | pte(i.tr, "I"); | 194 | pte(i.tr, "I"); |
| @@ -238,9 +236,9 @@ void exec_stage::clock() { | |||
| 238 | sr.mask.fill(false); | 236 | sr.mask.fill(false); |
| 239 | sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; | 237 | sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; |
| 240 | sr.write = true; | 238 | sr.write = true; |
| 241 | sr.responsep = &c.decode_store_completep; | ||
| 242 | c.exec_mem_commandp.write(std::move(sr)); | 239 | c.exec_mem_commandp.write(std::move(sr)); |
| 243 | stores_sent += 2; // Original store sent by Indir stage plus unstore here | 240 | stores_sent += 2; // Original store sent by Indir stage plus unstore here |
| 241 | c.stores_done += 2; | ||
| 244 | } | 242 | } |
| 245 | c.decode_to_exec_instp.discard(); | 243 | c.decode_to_exec_instp.discard(); |
| 246 | } else if (i.icount == c.icount) { | 244 | } else if (i.icount == c.icount) { |
| @@ -261,9 +259,9 @@ void exec_stage::clock() { | |||
| 261 | sr.mask.fill(false); | 259 | sr.mask.fill(false); |
| 262 | sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; | 260 | sr.mask[*i.inst.init_address & memory::LINE_BYTE_OFFSET_MASK] = true; |
| 263 | sr.write = true; | 261 | sr.write = true; |
| 264 | sr.responsep = &c.decode_store_completep; | ||
| 265 | c.exec_mem_commandp.write(std::move(sr)); | 262 | c.exec_mem_commandp.write(std::move(sr)); |
| 266 | stores_sent += 2; // Original store sent by Indir stage plus unstore here | 263 | stores_sent += 2; // Original store sent by Indir stage plus unstore here |
| 264 | c.stores_done += 2; | ||
| 267 | } | 265 | } |
| 268 | c.indir_to_exec_instp.discard(); | 266 | c.indir_to_exec_instp.discard(); |
| 269 | } else if (i.icount == c.icount) { | 267 | } else if (i.icount == c.icount) { |
| @@ -305,8 +303,10 @@ void exec_stage::clock() { | |||
| 305 | 303 | ||
| 306 | auto next_pc = inst.next_pc; | 304 | auto next_pc = inst.next_pc; |
| 307 | 305 | ||
| 308 | if (inst.need_autoinc_store) | 306 | if (inst.need_autoinc_store) { |
| 309 | ++stores_sent; // It was sent by Indir stage | 307 | ++stores_sent; // It was sent by Indir stage |
| 308 | ++c.stores_done; | ||
| 309 | } | ||
| 310 | if (inst.need_read_acc) | 310 | if (inst.need_read_acc) |
| 311 | inst.acc = acc; | 311 | inst.acc = acc; |
| 312 | if (inst.need_read_link) | 312 | if (inst.need_read_link) |
| @@ -338,9 +338,9 @@ void exec_stage::clock() { | |||
| 338 | sr.mask.fill(false); | 338 | sr.mask.fill(false); |
| 339 | sr.mask[*inst.final_address & memory::LINE_BYTE_OFFSET_MASK] = true; | 339 | sr.mask[*inst.final_address & memory::LINE_BYTE_OFFSET_MASK] = true; |
| 340 | sr.write = true; | 340 | sr.write = true; |
| 341 | sr.responsep = &c.decode_store_completep; | ||
| 342 | c.exec_mem_commandp.write(std::move(sr)); | 341 | c.exec_mem_commandp.write(std::move(sr)); |
| 343 | ++stores_sent; | 342 | ++stores_sent; |
| 343 | ++c.stores_done; | ||
| 344 | } | 344 | } |
| 345 | 345 | ||
| 346 | assert(inst.next_pc == next_pc || inst.possibly_redirects); | 346 | assert(inst.next_pc == next_pc || inst.possibly_redirects); |
diff --git a/uarch/core.h b/uarch/core.h index 21725b3..b8473e6 100644 --- a/uarch/core.h +++ b/uarch/core.h | |||
| @@ -59,7 +59,6 @@ struct decode_stage : public infra::sim { | |||
| 59 | std::uint64_t icount; | 59 | std::uint64_t icount; |
| 60 | 60 | ||
| 61 | unsigned int speculative_stores_sent = 0; | 61 | unsigned int speculative_stores_sent = 0; |
| 62 | unsigned int stores_done = 0; | ||
| 63 | 62 | ||
| 64 | decode_stage(core &c); | 63 | decode_stage(core &c); |
| 65 | 64 | ||
| @@ -114,7 +113,6 @@ struct core { | |||
| 114 | 113 | ||
| 115 | infra::port<memory::dram::command> decode_mem_commandp; | 114 | infra::port<memory::dram::command> decode_mem_commandp; |
| 116 | infra::port<memory::dram::response> decode_mem_responsep; | 115 | infra::port<memory::dram::response> decode_mem_responsep; |
| 117 | infra::port<memory::dram::response> decode_store_completep; | ||
| 118 | infra::port<inst_bundle> decode_to_exec_instp; | 116 | infra::port<inst_bundle> decode_to_exec_instp; |
| 119 | 117 | ||
| 120 | infra::port<inst_bundle> indir_instp; | 118 | infra::port<inst_bundle> indir_instp; |
| @@ -126,7 +124,10 @@ struct core { | |||
| 126 | infra::port<memory::dram::command> exec_mem_commandp; | 124 | infra::port<memory::dram::command> exec_mem_commandp; |
| 127 | infra::port<memory::dram::response> exec_mem_responsep; | 125 | infra::port<memory::dram::response> exec_mem_responsep; |
| 128 | 126 | ||
| 129 | // Construction order is execution order within a cycle, so this list should be back-to-front (for zero-cycle restarts) | 127 | // Global counters (should be Gray code in FPGA implementation, only do == comparisons) |
| 128 | unsigned int stores_done = 0; | ||
| 129 | |||
| 130 | // Construction order is execution order within a cycle, so this list should be back-to-front (for zero-cycle restarts and store count propagation) | ||
| 130 | exec_stage exec{*this}; | 131 | exec_stage exec{*this}; |
| 131 | indir_stage indir{*this}; | 132 | indir_stage indir{*this}; |
| 132 | decode_stage decode{*this}; | 133 | decode_stage decode{*this}; |
