From f02d950eee017515996f9f51dc9e34e0cdc56c10 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 3 Jun 2025 13:11:21 +0000 Subject: [PATCH 1/7] cpu: add top-down stats --- src/cpu/o3/cpu.cc | 70 ++++++++++++++++++++++++++++++++++++++++ src/cpu/o3/cpu.hh | 36 +++++++++++++++++++++ src/cpu/o3/decode.cc | 2 ++ src/cpu/o3/fu_pool.hh | 11 +++++++ src/cpu/o3/inst_queue.cc | 48 ++++++++++++++++++++++++++- src/cpu/o3/inst_queue.hh | 11 +++++++ src/cpu/o3/rename.cc | 11 +++++-- src/cpu/o3/rename.hh | 2 ++ 8 files changed, 188 insertions(+), 3 deletions(-) diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 568ca436637..5012b1189c2 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -368,6 +368,76 @@ CPU::CPUStats::CPUStats(CPU *cpu) .prereq(quiesceCycles); } +CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu) : + statistics::Group(cpu, "TopDownStats"), + topDownL1(cpu), + topDownFbL2(cpu), + topDownBbL2(cpu), + topDownBbMem(cpu){} + +CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) : + statistics::Group(cpu, "TopDownL1"), + ADD_STAT(frontendBound, statistics::units::Rate::get(), + "Frontend Bound, fraction of slots lost due to frontend undersupplying the backend"), + ADD_STAT(badSpeculation, statistics::units::Rate::get(), + "Bad Speculation, fraction of slots lost due to mispeculation"), + ADD_STAT(backendBound, statistics::units::Rate::get(), + "Backend Bound, fraction of slots lost due to backend resource constraints."), + ADD_STAT(retiring, statistics::units::Rate::get(), + "Retiring, fraction of slots successfully retired by the backend") +{ + // L1 + frontendBound = cpu->decode.getStats().fetchBubbles / (cpu->rename.getWidth() * cpu->baseStats.numCycles); + badSpeculation = (cpu->rename.getStats().renamedInsts - cpu->commit.getStats().committedInst + (cpu->commit.getStats().recoveryBubbles)) / (cpu->rename.getWidth() * cpu->baseStats.numCycles); + retiring = cpu->commit.getStats().committedInst / (cpu->rename.getWidth() * cpu->baseStats.numCycles); + backendBound = 1 - (frontendBound + badSpeculation + retiring); +} + +CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2(CPU *cpu) + : statistics::Group(cpu, "TopDownL2_FrontendBound"), + ADD_STAT(fetchLatency, statistics::units::Rate::get(), + "Fetch Latency Bound, frontend stalls due to instruction cache inefficiency"), + ADD_STAT(fetchBandwidth, statistics::units::Rate::get(), + "Fetch Bandwidth Bound, frontend stalls due to decoder inefficiency") +{ + // Frontend L2 + fetchLatency = cpu->decode.getStats().fetchBubblesMax / (cpu->baseStats.numCycles); + fetchBandwidth = cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; +} + +CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2(CPU *cpu) + : statistics::Group(cpu, "TopDownL2_BackendBound"), + ADD_STAT(memoryBound, statistics::units::Rate::get(), + "Memory Bound, backend stalls due to memory subsystem"), + ADD_STAT(coreBound, statistics::units::Rate::get(), + "Core Bound, backend stalls due to functional unit constraints") +{ + // Backend L2 + executionStalls = (cpu->iew.instQueue.getStats().numInstsExec0 - cpu->rename.getStats().idleCycles + cpu->iew.instQueue.getStats().numInstsExec1 + cpu->iew.instQueue.getStats().numInstsExec2) / (cpu->baseStats.numCycles); + // memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + cpu->rename.getStats().SQFullEvents) / (cpu->baseStats.numCycles); + memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); + coreBound = executionStalls - memoryBound; +} + +CPU::CPUStats::TopDownStats::TopDownBackendBoundL3::TopDownBackendBoundL3(CPU *cpu) : statistics::Group(cpu, "TopDownL3_BackendBound_MemoryBound"), + ADD_STAT(l1Bound, statistics::units::Rate::get(), + "L1 Cache Bound"), + ADD_STAT(l2Bound, statistics::units::Rate::get(), + "L2 Cache Bound"), + ADD_STAT(l3Bound, statistics::units::Rate::get(), + "L3 Cache Bound"), + ADD_STAT(extMemBound, statistics::units::Rate::get(), + "External Memory Bound"), + ADD_STAT(storeBound, statistics::units::Rate::get(), + "Store Bound") +{ + l1Bound = (cpu->iew.instQueue.getStats().loadStallCycles - cpu->iew.instQueue.getStats().L1miss) / (cpu->baseStats.numCycles); + l2Bound = (cpu->iew.instQueue.getStats().L1miss - cpu->iew.instQueue.getStats().L2miss) / (cpu->baseStats.numCycles); + l3Bound = (cpu->iew.instQueue.getStats().L2miss - cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles); + extMemBound = (cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles); + storeBound = (cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); +} + void CPU::tick() { diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index fcc34d0f986..9d0d507d13d 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -601,6 +601,42 @@ class CPU : public BaseCPU /** Stat for total number of cycles the CPU spends descheduled due to a * quiesce operation or waiting for an interrupt. */ statistics::Scalar quiesceCycles; + + struct TopDownStats : statistics::Group { + TopDownStats(CPU *cpu); + + struct TopDownL1 : statistics::Group{ + TopDownL1(CPU *cpu); + statistics::Formula frontendBound; + statistics::Formula badSpeculation; + statistics::Formula backendBound; + statistics::Formula retiring; + } topDownL1; + + struct TopDownFrontendBoundL2 : statistics::Group { + TopDownFrontendBoundL2(CPU *cpu); + statistics::Formula fetchLatency; + statistics::Formula fetchBandwidth; + } topDownFbL2; + + struct TopDownBackendBoundL2 : statistics::Group { + TopDownBackendBoundL2(CPU *cpu); + statistics::Formula executionStalls; + statistics::Formula memoryBound; + statistics::Formula coreBound; + } topDownBbL2; + + struct TopDownBackendBoundL3 : statistics::Group { + TopDownBackendBoundL3(CPU *cpu); + statistics::Formula l1Bound; + statistics::Formula l2Bound; + statistics::Formula l3Bound; + statistics::Formula extMemBound; + statistics::Formula storeBound; + } topDownBbMem; + + } topDownStats; + } cpuStats; public: diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index c5123663185..30d3b4c4654 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -565,6 +565,8 @@ Decode::tick() toRenameIndex = 0; + fetchBubbles = decodeWidth; + list::iterator threads = activeThreads->begin(); list::iterator end = activeThreads->end(); diff --git a/src/cpu/o3/fu_pool.hh b/src/cpu/o3/fu_pool.hh index f0f01c38d40..5df18115209 100644 --- a/src/cpu/o3/fu_pool.hh +++ b/src/cpu/o3/fu_pool.hh @@ -203,6 +203,17 @@ class FUPool : public SimObject /** Takes over from another CPU's thread. */ void takeOverFrom() {}; + + /** Returns the number of free FUs */ + int numBusyFUs() const + { + int busy = 0; + for (int i = 0; i < numFU; ++i) { + if (unitBusy[i]) + busy++; + } + return busy; + } }; } // namespace o3 diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index b3cf330c37e..853514f18c9 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -217,7 +217,21 @@ InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) ADD_STAT(fuBusy, statistics::units::Count::get(), "FU busy when requested"), ADD_STAT(fuBusyRate, statistics::units::Rate< statistics::units::Count, statistics::units::Count>::get(), - "FU busy rate (busy events/executed inst)") + "FU busy rate (busy events/executed inst)"), + ADD_STAT(numInstsExec0, statistics::units::Count::get(), + "0 instructions executed in a cycle"), + ADD_STAT(numInstsExec1, statistics::units::Count::get(), + "1 instruction executed in a cycle"), + ADD_STAT(numInstsExec2, statistics::units::Count::get(), + "2 instructions executed in a cycle"), + ADD_STAT(loadStallCycles, statistics::units::Cycle::get(), + "Top down, no uops executed and at least 1 in-flight load"), + ADD_STAT(L1miss, statistics::units::Cycle::get(), + "l1miss"), + ADD_STAT(L2miss, statistics::units::Cycle::get(), + "l2miss"), + ADD_STAT(L3miss, statistics::units::Cycle::get(), + "l1miss") { instsAdded .prereq(instsAdded); @@ -323,6 +337,14 @@ InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) .flags(statistics::total) ; fuBusyRate = fuBusy / instsIssued; + + numInstsExec0.prereq(numInstsExec0); + numInstsExec1.prereq(numInstsExec1); + numInstsExec2.prereq(numInstsExec2); + + L1miss.prereq(L1miss); + L2miss.prereq(L2miss); + L3miss.prereq(L3miss); } InstructionQueue::IQIOStats::IQIOStats(statistics::Group *parent) @@ -928,6 +950,30 @@ InstructionQueue::scheduleReadyInsts() } else { DPRINTF(IQ, "Not able to schedule any instructions.\n"); } + + int numBusyFUs = fuPool->numBusyFUs(); + + if (numBusyFUs == 0) + iqStats.numInstsExec0++; + else if (numBusyFUs == 1) + iqStats.numInstsExec1++; + else if (numBusyFUs == 2) + iqStats.numInstsExec2++; + + if (fuPool->isDrained() && iewStage->ldstQueue.numLoads()){ //numLoads returns for all threads, change it to a single thread + iqStats.loadStallCycles++; + if (iewStage->ldstQueue.anyCacheLevelMisses(3)) { + iqStats.L1miss++; + iqStats.L2miss++; + iqStats.L3miss++; + } + else if (iewStage->ldstQueue.anyCacheLevelMisses(2)) { + iqStats.L1miss++; + iqStats.L2miss++; + } + else if (iewStage->ldstQueue.anyCacheLevelMisses(1)) + iqStats.L1miss++; + } } void diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index 57928e74784..7e40de1ecb3 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -541,6 +541,17 @@ class InstructionQueue statistics::Vector fuBusy; /** Number of times the FU was busy per instruction issued. */ statistics::Formula fuBusyRate; + + /* Top down, cycles in which few ops are executed */ + statistics::Scalar numInstsExec0; + statistics::Scalar numInstsExec1; + statistics::Scalar numInstsExec2; + + /*Top down, MemStalls.AnyLoad*/ + statistics::Scalar loadStallCycles; + statistics::Scalar L1miss; + statistics::Scalar L2miss; + statistics::Scalar L3miss; } iqStats; public: diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index 83fd67f457d..f60565d1c86 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -45,6 +45,7 @@ #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/fu_pool.hh" #include "cpu/o3/limits.hh" #include "cpu/reg_class.hh" #include "debug/Activity.hh" @@ -149,8 +150,9 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) ADD_STAT(intReturned, statistics::units::Count::get(), "count of registers freed and written back to integer free list"), ADD_STAT(fpReturned, statistics::units::Count::get(), - "count of registers freed and written back to floating point free list") - + "count of registers freed and written back to floating point free list"), + ADD_STAT(storeStalls, statistics::units::Cycle::get(), + "Number of cycles with few uops executed and no more stores can be issued") { squashCycles.prereq(squashCycles); idleCycles.prereq(idleCycles); @@ -184,6 +186,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) intReturned.prereq(intReturned); fpReturned.prereq(fpReturned); + storeStalls.prereq(storeStalls); } void @@ -649,8 +652,12 @@ Rename::renameInsts(ThreadID tid) tid); source = SQ; incrFullStat(source); + if (iew_ptr->fuPool->isDrained()){ + stats.storeStalls++; + } break; } + } insts_to_rename.pop_front(); diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 0782645b29c..9cbafd07487 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -540,6 +540,8 @@ class Rename statistics::Scalar intReturned; /** Number of registers freed and written back to floating point free list*/ statistics::Scalar fpReturned; + /** Top Down, IEW stall while there is an in flight load */ + statistics::Scalar storeStalls; } stats; }; From 436917127044ffe16125f8be3812a27e45425d2e Mon Sep 17 00:00:00 2001 From: root Date: Tue, 3 Jun 2025 13:50:52 +0000 Subject: [PATCH 2/7] cpu: add top-down --- src/cpu/o3/commit.cc | 16 +++++++++++++++- src/cpu/o3/commit.hh | 10 ++++++++++ src/cpu/o3/cpu.cc | 3 ++- src/cpu/o3/decode.cc | 15 ++++++++++++++- src/cpu/o3/decode.hh | 10 ++++++++++ src/cpu/o3/fetch.hh | 3 +++ src/cpu/o3/iew.cc | 11 +++++++++++ src/cpu/o3/iew.hh | 13 ++++++++++++- src/cpu/o3/inst_queue.hh | 4 ++++ src/cpu/o3/lsq.cc | 14 ++++++++++++++ src/cpu/o3/lsq.hh | 2 ++ src/cpu/o3/rename.cc | 2 +- src/cpu/o3/rename.hh | 5 +++++ 13 files changed, 103 insertions(+), 5 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index ebd6e2a4d4c..4de7c36ada9 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -165,7 +165,11 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit) ADD_STAT(committedInstType, statistics::units::Count::get(), "Class of committed instruction"), ADD_STAT(commitEligibleSamples, statistics::units::Cycle::get(), - "number cycles where commit BW limit reached") + "number cycles where commit BW limit reached"), + ADD_STAT(committedInst, statistics::units::Count::get(), + "Required for Top-Down, number of committed instructions"), + ADD_STAT(recoveryBubbles, statistics::units::Count::get(), + "Required for Top-Down, recovery bubbles") { using namespace statistics; @@ -993,6 +997,15 @@ Commit::commitInsts() stats.committedInstType[tid][head_inst->opClass()]++; ppCommit->notify(head_inst); + if (ismispred) { + ismispred = false; + stats.recoveryBubbles += (cpu->curCycle() - lastCommitCycle) * renameWidth; + } + if (head_inst->mispredicted()) { + ismispred = true; + } + + lastCommitCycle = cpu->curCycle(); // hardware transactional memory // update nesting depth @@ -1104,6 +1117,7 @@ Commit::commitInsts() DPRINTF(CommitRate, "%i\n", num_committed); stats.numCommittedDist.sample(num_committed); + stats.committedInst += num_committed; if (num_committed == commitWidth) { stats.commitEligibleSamples++; diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 4fff9fe892d..9592b0f7676 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -490,7 +490,17 @@ class Commit /** Number of cycles where the commit bandwidth limit is reached. */ statistics::Scalar commitEligibleSamples; + /** TDM, Number of commited instructions*/ + statistics::Scalar committedInst; + /** TDM,Recovery bubbles*/ + statistics::Scalar recoveryBubbles; } stats; + + bool ismispred = false; uint64_t lastCommitCycle = 0; //TDM + + public: + const CommitStats& getStats() const { return stats; } + }; } // namespace o3 diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 5012b1189c2..295de6c9652 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -355,7 +355,8 @@ CPU::CPUStats::CPUStats(CPU *cpu) "to idling"), ADD_STAT(quiesceCycles, statistics::units::Cycle::get(), "Total number of cycles that CPU has spent quiesced or waiting " - "for an interrupt") + "for an interrupt"), + topDownStats(cpu) { // Register any of the O3CPU's stats here. timesIdled diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index 30d3b4c4654..7d203eddd6a 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -158,7 +158,11 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) ADD_STAT(decodedInsts, statistics::units::Count::get(), "Number of instructions handled by decode"), ADD_STAT(squashedInsts, statistics::units::Count::get(), - "Number of squashed instructions handled by decode") + "Number of squashed instructions handled by decode"), + ADD_STAT(fetchBubbles, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of instructions not delivered to backend"), + ADD_STAT(fetchBubblesMax, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of cycles in which no instructions are delivered to backend") { idleCycles.prereq(idleCycles); blockedCycles.prereq(blockedCycles); @@ -170,6 +174,8 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) controlMispred.prereq(controlMispred); decodedInsts.prereq(decodedInsts); squashedInsts.prereq(squashedInsts); + fetchBubbles.prereq(fetchBubbles); + fetchBubblesMax.prereq(fetchBubblesMax); } void @@ -580,6 +586,10 @@ Decode::tick() status_change = checkSignalsAndUpdate(tid) || status_change; decode(status_change, tid); + + stats.fetchBubbles += fetchBubbles; + if (fetchBubbles == decodeWidth) + stats.fetchBubblesMax++; } if (status_change) { @@ -604,8 +614,10 @@ Decode::decode(bool &status_change, ThreadID tid) // check if stall conditions have passed if (decodeStatus[tid] == Blocked) { + fetchBubbles -= decodeWidth; ++stats.blockedCycles; } else if (decodeStatus[tid] == Squashing) { + fetchBubbles -= decodeWidth; ++stats.squashCycles; } @@ -704,6 +716,7 @@ Decode::decodeInsts(ThreadID tid) ++toRenameIndex; ++stats.decodedInsts; --insts_available; + --fetchBubbles; #if TRACING_ON if (debug::O3PipeView) { diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index 6b0e20ea281..7f8735ff335 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -299,6 +299,8 @@ class Decode */ bool squashAfterDelaySlot[MaxThreads]; + unsigned fetchBubbles = 0; + struct DecodeStats : public statistics::Group { DecodeStats(CPU *cpu); @@ -325,7 +327,15 @@ class Decode statistics::Scalar decodedInsts; /** Stat for total number of squashed instructions. */ statistics::Scalar squashedInsts; + /** Stat for Top-Down Methodology, number of instructions not delivered to backend */ + statistics::Scalar fetchBubbles; + /** Stat for Top-Down Methodology, number of cycles in which no instructions are delivered to backend */ + statistics::Scalar fetchBubblesMax; } stats; + + public: + + const DecodeStats& getStats() const { return stats; } }; } // namespace o3 diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 05d92e37567..79a3618681f 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -592,6 +592,9 @@ class Fetch /** Rate of how often fetch was idle. */ statistics::Formula idleRate; } fetchStats; + + public: + const FetchStatGroup& getStats() const { return fetchStats; } }; } // namespace o3 diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index a01c6b9deca..5bf05773e15 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -171,6 +171,10 @@ IEW::IEWStats::IEWStats(CPU *cpu) "Number of branches that were predicted taken incorrectly"), ADD_STAT(predictedNotTakenIncorrect, statistics::units::Count::get(), "Number of branches that were predicted not taken incorrectly"), + ADD_STAT(totalSlots, statistics::units::Count::get(), + "Required for Top-Down methodology, total number of issue pipeline slots"), + ADD_STAT(recoveryBubbles, statistics::units::Count::get(), + "Required for Top-Down methodology, number of slots required for recovery"), ADD_STAT(branchMispredicts, statistics::units::Count::get(), "Number of branch mispredicts detected at execute", predictedTakenIncorrect + predictedNotTakenIncorrect), @@ -1288,6 +1292,8 @@ IEW::executeInsts() if (inst->mispredicted() && !loadNotExecuted) { fetchRedirect[tid] = true; + recovery = true; + recovery_started = true; DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " "Branch mispredict detected.\n", @@ -1425,6 +1431,8 @@ IEW::tick() wbNumInst = 0; wbCycle = 0; + iewStats.totalSlots += issueWidth; + wroteToTimeBuffer = false; updatedQueues = false; @@ -1438,6 +1446,9 @@ IEW::tick() std::list::iterator threads = activeThreads->begin(); std::list::iterator end = activeThreads->end(); + if (recovery) + iewStats.recoveryBubbles += issueWidth; + // Check stall and squash signals, dispatch any instructions. while (threads != end) { ThreadID tid = *threads++; diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 4fe8227dcc8..b42bff6f0bb 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -364,6 +364,11 @@ class IEW */ bool updateLSQNextCycle; + /** Required for Top-Down, determines if recovery is happening */ + bool recovery = false; + + bool recovery_started = false; + private: /** Records if there is a fetch redirect on this cycle for each thread. */ bool fetchRedirect[MaxThreads]; @@ -414,7 +419,6 @@ class IEW /** Maximum size of the skid buffer. */ unsigned skidBufferMax; - struct IEWStats : public statistics::Group { IEWStats(CPU *cpu); @@ -447,6 +451,10 @@ class IEW statistics::Scalar predictedTakenIncorrect; /** Stat for total number of incorrect predicted not taken branches. */ statistics::Scalar predictedNotTakenIncorrect; + /** Stat for Top-Down Methodology, total number of issue-pipeline slots */ + statistics::Scalar totalSlots; + /** Stat for Top-Down Methodology, number of cycles for recovery */ + statistics::Scalar recoveryBubbles; /** Stat for total number of mispredicted branches detected at * execute. */ statistics::Formula branchMispredicts; @@ -475,6 +483,9 @@ class IEW /** Average number of woken instructions per writeback. */ statistics::Formula wbFanout; } iewStats; + + public: + const IEWStats& getStats() const { return iewStats; } }; } // namespace o3 diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index 7e40de1ecb3..de4120c8913 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -572,6 +572,10 @@ class InstructionQueue statistics::Scalar fpAluAccesses; statistics::Scalar vecAluAccesses; } iqIOStats; + + public: + const IQStats& getStats() const { return iqStats; } + }; } // namespace o3 diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index ad63fef633c..02ea47d595e 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -452,6 +452,20 @@ LSQ::sendRetryResp() dcachePort.sendRetryResp(); } +bool LSQ::anyCacheLevelMisses(int level) +{ + for (LSQUnit& unit : thread) { + for (auto& entry : unit.loadQueue){ + if(entry.valid() && entry.hasRequest()){ + auto req = entry.request()->mainReq(); + if (req->getAccessDepth() == level) + return true; + } + } + } + return false; +} + bool LSQ::recvTimingResp(PacketPtr pkt) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index c208a9fd378..d88d1aae667 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -923,6 +923,8 @@ class LSQ void sendRetryResp(); + bool anyCacheLevelMisses(int level); + protected: /** D-cache is blocked */ bool _cacheBlocked; diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index f60565d1c86..eb4098b2e45 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -631,7 +631,7 @@ Rename::renameInsts(ThreadID tid) DynInstPtr inst = insts_to_rename.front(); - //For all kind of instructions, check ROB and IQ first For load + //For all kind of instructions, check ROB and IQ first. For load //instruction, check LQ size and take into account the inflight loads //For store instruction, check SQ size and take into account the //inflight stores diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 9cbafd07487..eb73fcf5599 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -543,6 +543,11 @@ class Rename /** Top Down, IEW stall while there is an in flight load */ statistics::Scalar storeStalls; } stats; + + public: + const RenameStats& getStats() const { return stats; } + + unsigned getWidth() const { return renameWidth; } }; } // namespace o3 From bc9b6dffddb74438bba2975ce9d7eeb5f000591a Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 Jun 2025 16:19:08 +0000 Subject: [PATCH 3/7] cpu: add formatting and l2 badspec --- src/cpu/o3/commit.cc | 63 ++++---- src/cpu/o3/commit.hh | 19 ++- src/cpu/o3/cpu.cc | 221 ++++++++++++++++--------- src/cpu/o3/cpu.hh | 8 +- src/cpu/o3/decode.cc | 43 ++--- src/cpu/o3/decode.hh | 11 +- src/cpu/o3/fetch.hh | 3 - src/cpu/o3/fu_pool.hh | 3 +- src/cpu/o3/iew.cc | 11 -- src/cpu/o3/iew.hh | 13 +- src/cpu/o3/inst_queue.cc | 338 ++++++++++++++++++--------------------- src/cpu/o3/inst_queue.hh | 5 +- src/cpu/o3/lsq.cc | 21 ++- src/cpu/o3/rename.cc | 87 +++++----- src/cpu/o3/rename.hh | 2 +- 15 files changed, 442 insertions(+), 406 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index 4de7c36ada9..ca7de6c506b 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -168,36 +168,28 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit) "number cycles where commit BW limit reached"), ADD_STAT(committedInst, statistics::units::Count::get(), "Required for Top-Down, number of committed instructions"), - ADD_STAT(recoveryBubbles, statistics::units::Count::get(), - "Required for Top-Down, recovery bubbles") -{ - using namespace statistics; + ADD_STAT(recoveryBubblesMissprediction, statistics::units::Cycle::get(), + "Required for Top-Down, recovery bubbles"), + ADD_STAT(recoveryBubblesMemoryNuke, statistics::units::Cycle::get(), + "Required for Top-Down, recovery bubbles") { + using namespace statistics; - commitSquashedInsts.prereq(commitSquashedInsts); - commitNonSpecStalls.prereq(commitNonSpecStalls); - branchMispredicts.prereq(branchMispredicts); + commitSquashedInsts.prereq(commitSquashedInsts); + commitNonSpecStalls.prereq(commitNonSpecStalls); + branchMispredicts.prereq(branchMispredicts); - numCommittedDist - .init(0,commit->commitWidth,1) - .flags(statistics::pdf); + numCommittedDist.init(0, commit->commitWidth, 1).flags(statistics::pdf); - amos - .init(cpu->numThreads) - .flags(total); + amos.init(cpu->numThreads).flags(total); - membars - .init(cpu->numThreads) - .flags(total); + membars.init(cpu->numThreads).flags(total); - functionCalls - .init(commit->numThreads) - .flags(total); + functionCalls.init(commit->numThreads).flags(total); - committedInstType - .init(commit->numThreads,enums::Num_OpClass) - .flags(total | pdf | dist); + committedInstType.init(commit->numThreads, enums::Num_OpClass) + .flags(total | pdf | dist); - committedInstType.ysubnames(enums::OpClassStrings); + committedInstType.ysubnames(enums::OpClassStrings); } void @@ -965,6 +957,12 @@ Commit::commitInsts() DPRINTF(Commit, "Retiring squashed instruction from " "ROB.\n"); + if (!isMissPredicted && !isMemoryViolation) { + stats.numMachineClear++; + isMemoryViolation = true; + recoveryBubbleStart = cpu->curCycle(); + } + rob->retireHead(commit_thread); ++stats.commitSquashedInsts; @@ -997,15 +995,22 @@ Commit::commitInsts() stats.committedInstType[tid][head_inst->opClass()]++; ppCommit->notify(head_inst); - if (ismispred) { - ismispred = false; - stats.recoveryBubbles += (cpu->curCycle() - lastCommitCycle) * renameWidth; + if (isMissPredicted) { + stats.recoveryBubblesMissprediction += + uint64_t(cpu->curCycle() - recoveryBubbleStart); + } else if (isMemoryViolation) { + stats.recoveryBubblesMemoryNuke += + uint64_t(cpu->curCycle() - recoveryBubbleStart); } + + isMemoryViolation = false; + isMissPredicted = false; + if (head_inst->mispredicted()) { - ismispred = true; + recoveryBubbleStart = cpu->curCycle(); + isMissPredicted = true; } - - lastCommitCycle = cpu->curCycle(); + // hardware transactional memory // update nesting depth diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 9592b0f7676..675ee4febc6 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -487,20 +487,25 @@ class Commit statistics::Vector functionCalls; /** Committed instructions by instruction type (OpClass) */ statistics::Vector2d committedInstType; - /** Number of cycles where the commit bandwidth limit is reached. */ statistics::Scalar commitEligibleSamples; - /** TDM, Number of commited instructions*/ + /** Top Down Methodology, Number of commited instructions*/ statistics::Scalar committedInst; - /** TDM,Recovery bubbles*/ - statistics::Scalar recoveryBubbles; + statistics::Scalar numMachineClear; + /** Top Down Methodology, Recovery bubbles, miss predictions*/ + statistics::Scalar recoveryBubblesMissprediction; + /** Top Down Methodology, Recovery bubbles, memory nukes */ + statistics::Scalar recoveryBubblesMemoryNuke; + } stats; - bool ismispred = false; uint64_t lastCommitCycle = 0; //TDM + // Top Down Methodology + Cycles recoveryBubbleStart; + bool isMissPredicted = false; + bool isMemoryViolation = false; public: - const CommitStats& getStats() const { return stats; } - + const CommitStats &getStats() const { return stats; } }; } // namespace o3 diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 295de6c9652..8eb1c113fcd 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -356,87 +356,160 @@ CPU::CPUStats::CPUStats(CPU *cpu) ADD_STAT(quiesceCycles, statistics::units::Cycle::get(), "Total number of cycles that CPU has spent quiesced or waiting " "for an interrupt"), - topDownStats(cpu) -{ - // Register any of the O3CPU's stats here. - timesIdled - .prereq(timesIdled); - - idleCycles - .prereq(idleCycles); - - quiesceCycles - .prereq(quiesceCycles); -} - -CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu) : - statistics::Group(cpu, "TopDownStats"), - topDownL1(cpu), - topDownFbL2(cpu), - topDownBbL2(cpu), - topDownBbMem(cpu){} - -CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) : - statistics::Group(cpu, "TopDownL1"), - ADD_STAT(frontendBound, statistics::units::Rate::get(), - "Frontend Bound, fraction of slots lost due to frontend undersupplying the backend"), - ADD_STAT(badSpeculation, statistics::units::Rate::get(), - "Bad Speculation, fraction of slots lost due to mispeculation"), - ADD_STAT(backendBound, statistics::units::Rate::get(), - "Backend Bound, fraction of slots lost due to backend resource constraints."), - ADD_STAT(retiring, statistics::units::Rate::get(), - "Retiring, fraction of slots successfully retired by the backend") -{ - // L1 - frontendBound = cpu->decode.getStats().fetchBubbles / (cpu->rename.getWidth() * cpu->baseStats.numCycles); - badSpeculation = (cpu->rename.getStats().renamedInsts - cpu->commit.getStats().committedInst + (cpu->commit.getStats().recoveryBubbles)) / (cpu->rename.getWidth() * cpu->baseStats.numCycles); - retiring = cpu->commit.getStats().committedInst / (cpu->rename.getWidth() * cpu->baseStats.numCycles); - backendBound = 1 - (frontendBound + badSpeculation + retiring); -} - -CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2(CPU *cpu) + topDownStats(cpu) { + // Register any of the O3CPU's stats here. + timesIdled.prereq(timesIdled); + + idleCycles.prereq(idleCycles); + + quiesceCycles.prereq(quiesceCycles); +} + +CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu) + : statistics::Group(cpu, "TopDownStats"), topDownL1(cpu), topDownFbL2(cpu), + topDownBbL2(cpu), topDownBbMem(cpu) {} + +CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) + : statistics::Group(cpu, "TopDownL1"), + ADD_STAT(frontendBound, + statistics::units::Rate::get(), + "Frontend Bound, fraction of slots lost due to frontend " + "undersupplying the backend"), + ADD_STAT(badSpeculation, + statistics::units::Rate::get(), + "Bad Speculation, fraction of slots lost due to mispeculation"), + ADD_STAT(backendBound, + statistics::units::Rate::get(), + "Backend Bound, fraction of slots lost due to backend resource " + "constraints."), + ADD_STAT( + retiring, + statistics::units::Rate::get(), + "Retiring, fraction of slots successfully retired by the backend") { + // L1 + frontendBound = cpu->decode.getStats().fetchBubbles / + (cpu->rename.getWidth() * cpu->baseStats.numCycles); + + badSpeculation = (cpu->rename.getStats().renamedInsts - + cpu->commit.getStats().committedInst + + (cpu->commit.getStats().recoveryBubblesMissprediction + + cpu->commit.getStats().recoveryBubblesMemoryNuke) + * cpu->rename.getWidth()) / + (cpu->rename.getWidth() * cpu->baseStats.numCycles); + + retiring = cpu->commit.getStats().committedInst / + (cpu->rename.getWidth() * cpu->baseStats.numCycles); + + backendBound = 1 - (frontendBound + badSpeculation + retiring); +} + +CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2( + CPU *cpu) : statistics::Group(cpu, "TopDownL2_FrontendBound"), - ADD_STAT(fetchLatency, statistics::units::Rate::get(), - "Fetch Latency Bound, frontend stalls due to instruction cache inefficiency"), - ADD_STAT(fetchBandwidth, statistics::units::Rate::get(), - "Fetch Bandwidth Bound, frontend stalls due to decoder inefficiency") -{ - // Frontend L2 - fetchLatency = cpu->decode.getStats().fetchBubblesMax / (cpu->baseStats.numCycles); - fetchBandwidth = cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; -} - -CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2(CPU *cpu) + ADD_STAT(fetchLatency, + statistics::units::Rate::get(), + "Fetch Latency Bound, frontend stalls due to instruction cache " + "inefficiency"), + ADD_STAT(fetchBandwidth, + statistics::units::Rate::get(), + "Fetch Bandwidth Bound, frontend stalls due to decoder " + "inefficiency") { + // Frontend L2 + fetchLatency = + cpu->decode.getStats().fetchBubblesMax / (cpu->baseStats.numCycles); + fetchBandwidth = + cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; +} + +// CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2(CPU +// *cpu) +// : statistics::Group(cpu, "TopDownL2_FrontendBound"), +// ADD_STAT(fetchLatency, +// statistics::units::Rate::get(), +// "Fetch Latency Bound, frontend stalls due to instruction cache +// inefficiency"), +// ADD_STAT(fetchBandwidth, +// statistics::units::Rate::get(), +// "Fetch Bandwidth Bound, frontend stalls due to decoder +// inefficiency") +// { +// // Frontend L2 +// fetchLatency = cpu->decode.getStats().fetchBubblesMax / +// (cpu->baseStats.numCycles); fetchBandwidth = +// cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; +// } + +CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2( + CPU *cpu) : statistics::Group(cpu, "TopDownL2_BackendBound"), - ADD_STAT(memoryBound, statistics::units::Rate::get(), + ADD_STAT(memoryBound, + statistics::units::Rate::get(), "Memory Bound, backend stalls due to memory subsystem"), - ADD_STAT(coreBound, statistics::units::Rate::get(), - "Core Bound, backend stalls due to functional unit constraints") -{ - // Backend L2 - executionStalls = (cpu->iew.instQueue.getStats().numInstsExec0 - cpu->rename.getStats().idleCycles + cpu->iew.instQueue.getStats().numInstsExec1 + cpu->iew.instQueue.getStats().numInstsExec2) / (cpu->baseStats.numCycles); - // memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + cpu->rename.getStats().SQFullEvents) / (cpu->baseStats.numCycles); - memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); - coreBound = executionStalls - memoryBound; -} - -CPU::CPUStats::TopDownStats::TopDownBackendBoundL3::TopDownBackendBoundL3(CPU *cpu) : statistics::Group(cpu, "TopDownL3_BackendBound_MemoryBound"), - ADD_STAT(l1Bound, statistics::units::Rate::get(), + ADD_STAT( + coreBound, + statistics::units::Rate::get(), + "Core Bound, backend stalls due to functional unit constraints") { + // Backend L2 + executionStalls = (cpu->iew.instQueue.getStats().numInstsExec0 - + cpu->rename.getStats().idleCycles + + cpu->iew.instQueue.getStats().numInstsExec1 + + cpu->iew.instQueue.getStats().numInstsExec2) / + (cpu->baseStats.numCycles); + // memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + + // cpu->rename.getStats().SQFullEvents) / (cpu->baseStats.numCycles); + memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + + cpu->rename.getStats().storeStalls) / + (cpu->baseStats.numCycles); + coreBound = executionStalls - memoryBound; +} + +CPU::CPUStats::TopDownStats::TopDownBackendBoundL3::TopDownBackendBoundL3( + CPU *cpu) + : statistics::Group(cpu, "TopDownL3_BackendBound_MemoryBound"), + ADD_STAT(l1Bound, + statistics::units::Rate::get(), "L1 Cache Bound"), - ADD_STAT(l2Bound, statistics::units::Rate::get(), + ADD_STAT(l2Bound, + statistics::units::Rate::get(), "L2 Cache Bound"), - ADD_STAT(l3Bound, statistics::units::Rate::get(), + ADD_STAT(l3Bound, + statistics::units::Rate::get(), "L3 Cache Bound"), - ADD_STAT(extMemBound, statistics::units::Rate::get(), + ADD_STAT(extMemBound, + statistics::units::Rate::get(), "External Memory Bound"), - ADD_STAT(storeBound, statistics::units::Rate::get(), - "Store Bound") -{ - l1Bound = (cpu->iew.instQueue.getStats().loadStallCycles - cpu->iew.instQueue.getStats().L1miss) / (cpu->baseStats.numCycles); - l2Bound = (cpu->iew.instQueue.getStats().L1miss - cpu->iew.instQueue.getStats().L2miss) / (cpu->baseStats.numCycles); - l3Bound = (cpu->iew.instQueue.getStats().L2miss - cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles); - extMemBound = (cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles); - storeBound = (cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); + ADD_STAT(storeBound, + statistics::units::Rate::get(), + "Store Bound") { + // Backend Bound / Memory Bound L3 + l1Bound = (cpu->iew.instQueue.getStats().loadStallCycles - + cpu->iew.instQueue.getStats().L1miss) / + (cpu->baseStats.numCycles); + l2Bound = (cpu->iew.instQueue.getStats().L1miss - + cpu->iew.instQueue.getStats().L2miss) / + (cpu->baseStats.numCycles); + l3Bound = (cpu->iew.instQueue.getStats().L2miss - + cpu->iew.instQueue.getStats().L3miss) / + (cpu->baseStats.numCycles); + extMemBound = + (cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles); + storeBound = + (cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); } void diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 9d0d507d13d..eec0f44bebf 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -605,7 +605,7 @@ class CPU : public BaseCPU struct TopDownStats : statistics::Group { TopDownStats(CPU *cpu); - struct TopDownL1 : statistics::Group{ + struct TopDownL1 : statistics::Group { TopDownL1(CPU *cpu); statistics::Formula frontendBound; statistics::Formula badSpeculation; @@ -619,6 +619,12 @@ class CPU : public BaseCPU statistics::Formula fetchBandwidth; } topDownFbL2; + // struct TopDownBadSpeculationL2 : statistics::Group{ + // TopDownBadSpeculationL2(CPU *cpu); + // statistics::Formula branchMissPredicts; + // statistics::Formula machineClears; + // } + struct TopDownBackendBoundL2 : statistics::Group { TopDownBackendBoundL2(CPU *cpu); statistics::Formula executionStalls; diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index 7d203eddd6a..f2f393f936d 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -160,22 +160,23 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) ADD_STAT(squashedInsts, statistics::units::Count::get(), "Number of squashed instructions handled by decode"), ADD_STAT(fetchBubbles, statistics::units::Count::get(), - "Stat for Top-Down Methodology, number of instructions not delivered to backend"), + "Stat for Top-Down Methodology, number of instructions not " + "delivered to backend"), ADD_STAT(fetchBubblesMax, statistics::units::Count::get(), - "Stat for Top-Down Methodology, number of cycles in which no instructions are delivered to backend") -{ - idleCycles.prereq(idleCycles); - blockedCycles.prereq(blockedCycles); - runCycles.prereq(runCycles); - unblockCycles.prereq(unblockCycles); - squashCycles.prereq(squashCycles); - branchResolved.prereq(branchResolved); - branchMispred.prereq(branchMispred); - controlMispred.prereq(controlMispred); - decodedInsts.prereq(decodedInsts); - squashedInsts.prereq(squashedInsts); - fetchBubbles.prereq(fetchBubbles); - fetchBubblesMax.prereq(fetchBubblesMax); + "Stat for Top-Down Methodology, number of cycles in which no " + "instructions are delivered to backend") { + idleCycles.prereq(idleCycles); + blockedCycles.prereq(blockedCycles); + runCycles.prereq(runCycles); + unblockCycles.prereq(unblockCycles); + squashCycles.prereq(squashCycles); + branchResolved.prereq(branchResolved); + branchMispred.prereq(branchMispred); + controlMispred.prereq(controlMispred); + decodedInsts.prereq(decodedInsts); + squashedInsts.prereq(squashedInsts); + fetchBubbles.prereq(fetchBubbles); + fetchBubblesMax.prereq(fetchBubblesMax); } void @@ -588,8 +589,8 @@ Decode::tick() decode(status_change, tid); stats.fetchBubbles += fetchBubbles; - if (fetchBubbles == decodeWidth) - stats.fetchBubblesMax++; + if (fetchBubbles == decodeWidth) + stats.fetchBubblesMax++; } if (status_change) { @@ -614,11 +615,11 @@ Decode::decode(bool &status_change, ThreadID tid) // check if stall conditions have passed if (decodeStatus[tid] == Blocked) { - fetchBubbles -= decodeWidth; - ++stats.blockedCycles; + fetchBubbles -= decodeWidth; + ++stats.blockedCycles; } else if (decodeStatus[tid] == Squashing) { - fetchBubbles -= decodeWidth; - ++stats.squashCycles; + fetchBubbles -= decodeWidth; + ++stats.squashCycles; } // Decode should try to decode as many instructions as its bandwidth diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index 7f8735ff335..7c07505c75e 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -327,15 +327,16 @@ class Decode statistics::Scalar decodedInsts; /** Stat for total number of squashed instructions. */ statistics::Scalar squashedInsts; - /** Stat for Top-Down Methodology, number of instructions not delivered to backend */ + /** Stat for Top-Down Methodology, number of instructions not delivered + * to backend */ statistics::Scalar fetchBubbles; - /** Stat for Top-Down Methodology, number of cycles in which no instructions are delivered to backend */ + /** Stat for Top-Down Methodology, number of cycles in which no + * instructions are delivered to backend */ statistics::Scalar fetchBubblesMax; } stats; - public: - - const DecodeStats& getStats() const { return stats; } + public: + const DecodeStats &getStats() const { return stats; } }; } // namespace o3 diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 79a3618681f..05d92e37567 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -592,9 +592,6 @@ class Fetch /** Rate of how often fetch was idle. */ statistics::Formula idleRate; } fetchStats; - - public: - const FetchStatGroup& getStats() const { return fetchStats; } }; } // namespace o3 diff --git a/src/cpu/o3/fu_pool.hh b/src/cpu/o3/fu_pool.hh index 5df18115209..7589ce18e72 100644 --- a/src/cpu/o3/fu_pool.hh +++ b/src/cpu/o3/fu_pool.hh @@ -205,8 +205,7 @@ class FUPool : public SimObject void takeOverFrom() {}; /** Returns the number of free FUs */ - int numBusyFUs() const - { + int numBusyFUs() const { int busy = 0; for (int i = 0; i < numFU; ++i) { if (unitBusy[i]) diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 5bf05773e15..a01c6b9deca 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -171,10 +171,6 @@ IEW::IEWStats::IEWStats(CPU *cpu) "Number of branches that were predicted taken incorrectly"), ADD_STAT(predictedNotTakenIncorrect, statistics::units::Count::get(), "Number of branches that were predicted not taken incorrectly"), - ADD_STAT(totalSlots, statistics::units::Count::get(), - "Required for Top-Down methodology, total number of issue pipeline slots"), - ADD_STAT(recoveryBubbles, statistics::units::Count::get(), - "Required for Top-Down methodology, number of slots required for recovery"), ADD_STAT(branchMispredicts, statistics::units::Count::get(), "Number of branch mispredicts detected at execute", predictedTakenIncorrect + predictedNotTakenIncorrect), @@ -1292,8 +1288,6 @@ IEW::executeInsts() if (inst->mispredicted() && !loadNotExecuted) { fetchRedirect[tid] = true; - recovery = true; - recovery_started = true; DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " "Branch mispredict detected.\n", @@ -1431,8 +1425,6 @@ IEW::tick() wbNumInst = 0; wbCycle = 0; - iewStats.totalSlots += issueWidth; - wroteToTimeBuffer = false; updatedQueues = false; @@ -1446,9 +1438,6 @@ IEW::tick() std::list::iterator threads = activeThreads->begin(); std::list::iterator end = activeThreads->end(); - if (recovery) - iewStats.recoveryBubbles += issueWidth; - // Check stall and squash signals, dispatch any instructions. while (threads != end) { ThreadID tid = *threads++; diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index b42bff6f0bb..4fe8227dcc8 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -364,11 +364,6 @@ class IEW */ bool updateLSQNextCycle; - /** Required for Top-Down, determines if recovery is happening */ - bool recovery = false; - - bool recovery_started = false; - private: /** Records if there is a fetch redirect on this cycle for each thread. */ bool fetchRedirect[MaxThreads]; @@ -419,6 +414,7 @@ class IEW /** Maximum size of the skid buffer. */ unsigned skidBufferMax; + struct IEWStats : public statistics::Group { IEWStats(CPU *cpu); @@ -451,10 +447,6 @@ class IEW statistics::Scalar predictedTakenIncorrect; /** Stat for total number of incorrect predicted not taken branches. */ statistics::Scalar predictedNotTakenIncorrect; - /** Stat for Top-Down Methodology, total number of issue-pipeline slots */ - statistics::Scalar totalSlots; - /** Stat for Top-Down Methodology, number of cycles for recovery */ - statistics::Scalar recoveryBubbles; /** Stat for total number of mispredicted branches detected at * execute. */ statistics::Formula branchMispredicts; @@ -483,9 +475,6 @@ class IEW /** Average number of woken instructions per writeback. */ statistics::Formula wbFanout; } iewStats; - - public: - const IEWStats& getStats() const { return iewStats; } }; } // namespace o3 diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index 853514f18c9..4cba6bad963 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -179,172 +179,144 @@ InstructionQueue::name() const InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) : statistics::Group(cpu), - ADD_STAT(instsAdded, statistics::units::Count::get(), - "Number of instructions added to the IQ (excludes non-spec)"), - ADD_STAT(nonSpecInstsAdded, statistics::units::Count::get(), - "Number of non-speculative instructions added to the IQ"), - ADD_STAT(instsIssued, statistics::units::Count::get(), - "Number of instructions issued"), - ADD_STAT(intInstsIssued, statistics::units::Count::get(), - "Number of integer instructions issued"), - ADD_STAT(floatInstsIssued, statistics::units::Count::get(), - "Number of float instructions issued"), - ADD_STAT(branchInstsIssued, statistics::units::Count::get(), - "Number of branch instructions issued"), - ADD_STAT(memInstsIssued, statistics::units::Count::get(), - "Number of memory instructions issued"), - ADD_STAT(miscInstsIssued, statistics::units::Count::get(), - "Number of miscellaneous instructions issued"), - ADD_STAT(squashedInstsIssued, statistics::units::Count::get(), - "Number of squashed instructions issued"), - ADD_STAT(squashedInstsExamined, statistics::units::Count::get(), - "Number of squashed instructions iterated over during squash; " - "mainly for profiling"), - ADD_STAT(squashedOperandsExamined, statistics::units::Count::get(), - "Number of squashed operands that are examined and possibly " - "removed from graph"), - ADD_STAT(squashedNonSpecRemoved, statistics::units::Count::get(), - "Number of squashed non-spec instructions that were removed"), - ADD_STAT(numIssuedDist, statistics::units::Count::get(), - "Number of insts issued each cycle"), - ADD_STAT(statFuBusy, statistics::units::Count::get(), - "attempts to use FU when none available"), - ADD_STAT(statIssuedInstType, statistics::units::Count::get(), - "Number of instructions issued per FU type, per thread"), - ADD_STAT(issueRate, statistics::units::Rate< - statistics::units::Count, statistics::units::Cycle>::get(), - "Inst issue rate", instsIssued / cpu->baseStats.numCycles), - ADD_STAT(fuBusy, statistics::units::Count::get(), "FU busy when requested"), - ADD_STAT(fuBusyRate, statistics::units::Rate< - statistics::units::Count, statistics::units::Count>::get(), - "FU busy rate (busy events/executed inst)"), - ADD_STAT(numInstsExec0, statistics::units::Count::get(), - "0 instructions executed in a cycle"), - ADD_STAT(numInstsExec1, statistics::units::Count::get(), - "1 instruction executed in a cycle"), - ADD_STAT(numInstsExec2, statistics::units::Count::get(), - "2 instructions executed in a cycle"), - ADD_STAT(loadStallCycles, statistics::units::Cycle::get(), - "Top down, no uops executed and at least 1 in-flight load"), - ADD_STAT(L1miss, statistics::units::Cycle::get(), - "l1miss"), - ADD_STAT(L2miss, statistics::units::Cycle::get(), - "l2miss"), - ADD_STAT(L3miss, statistics::units::Cycle::get(), - "l1miss") -{ - instsAdded - .prereq(instsAdded); - - nonSpecInstsAdded - .prereq(nonSpecInstsAdded); - - instsIssued - .prereq(instsIssued); - - intInstsIssued - .prereq(intInstsIssued); - - floatInstsIssued - .prereq(floatInstsIssued); - - branchInstsIssued - .prereq(branchInstsIssued); - - memInstsIssued - .prereq(memInstsIssued); - - miscInstsIssued - .prereq(miscInstsIssued); - - squashedInstsIssued - .prereq(squashedInstsIssued); - - squashedInstsExamined - .prereq(squashedInstsExamined); - - squashedOperandsExamined - .prereq(squashedOperandsExamined); - - squashedNonSpecRemoved - .prereq(squashedNonSpecRemoved); -/* - queueResDist - .init(Num_OpClasses, 0, 99, 2) - .name(name() + ".IQ:residence:") - .desc("cycles from dispatch to issue") - .flags(total | pdf | cdf ) - ; - for (int i = 0; i < Num_OpClasses; ++i) { - queueResDist.subname(i, opClassStrings[i]); - } -*/ - numIssuedDist - .init(0,total_width,1) - .flags(statistics::pdf) - ; -/* - dist_unissued - .init(Num_OpClasses+2) - .name(name() + ".unissued_cause") - .desc("Reason ready instruction not issued") - .flags(pdf | dist) - ; - for (int i=0; i < (Num_OpClasses + 2); ++i) { - dist_unissued.subname(i, unissued_names[i]); - } -*/ - statIssuedInstType - .init(cpu->numThreads,enums::Num_OpClass) - .flags(statistics::total | statistics::pdf | statistics::dist) - ; - statIssuedInstType.ysubnames(enums::OpClassStrings); - - // - // How long did instructions for a particular FU type wait prior to issue - // -/* - issueDelayDist - .init(Num_OpClasses,0,99,2) - .name(name() + ".") - .desc("cycles from operands ready to issue") - .flags(pdf | cdf) - ; - for (int i=0; inumThreads) - .flags(statistics::total) - ; - - fuBusyRate - .flags(statistics::total) - ; - fuBusyRate = fuBusy / instsIssued; - - numInstsExec0.prereq(numInstsExec0); - numInstsExec1.prereq(numInstsExec1); - numInstsExec2.prereq(numInstsExec2); - - L1miss.prereq(L1miss); - L2miss.prereq(L2miss); - L3miss.prereq(L3miss); + ADD_STAT(instsAdded, statistics::units::Count::get(), + "Number of instructions added to the IQ (excludes non-spec)"), + ADD_STAT(nonSpecInstsAdded, statistics::units::Count::get(), + "Number of non-speculative instructions added to the IQ"), + ADD_STAT(instsIssued, statistics::units::Count::get(), + "Number of instructions issued"), + ADD_STAT(intInstsIssued, statistics::units::Count::get(), + "Number of integer instructions issued"), + ADD_STAT(floatInstsIssued, statistics::units::Count::get(), + "Number of float instructions issued"), + ADD_STAT(branchInstsIssued, statistics::units::Count::get(), + "Number of branch instructions issued"), + ADD_STAT(memInstsIssued, statistics::units::Count::get(), + "Number of memory instructions issued"), + ADD_STAT(miscInstsIssued, statistics::units::Count::get(), + "Number of miscellaneous instructions issued"), + ADD_STAT(squashedInstsIssued, statistics::units::Count::get(), + "Number of squashed instructions issued"), + ADD_STAT(squashedInstsExamined, statistics::units::Count::get(), + "Number of squashed instructions iterated over during squash; " + "mainly for profiling"), + ADD_STAT(squashedOperandsExamined, statistics::units::Count::get(), + "Number of squashed operands that are examined and possibly " + "removed from graph"), + ADD_STAT(squashedNonSpecRemoved, statistics::units::Count::get(), + "Number of squashed non-spec instructions that were removed"), + ADD_STAT(numIssuedDist, statistics::units::Count::get(), + "Number of insts issued each cycle"), + ADD_STAT(statFuBusy, statistics::units::Count::get(), + "attempts to use FU when none available"), + ADD_STAT(statIssuedInstType, statistics::units::Count::get(), + "Number of instructions issued per FU type, per thread"), + ADD_STAT(issueRate, + statistics::units::Rate::get(), + "Inst issue rate", instsIssued / cpu->baseStats.numCycles), + ADD_STAT(fuBusy, statistics::units::Count::get(), + "FU busy when requested"), + ADD_STAT(fuBusyRate, + statistics::units::Rate::get(), + "FU busy rate (busy events/executed inst)"), + ADD_STAT(numInstsExec0, statistics::units::Count::get(), + "0 instructions executed in a cycle"), + ADD_STAT(numInstsExec1, statistics::units::Count::get(), + "1 instruction executed in a cycle"), + ADD_STAT(numInstsExec2, statistics::units::Count::get(), + "2 instructions executed in a cycle"), + ADD_STAT(loadStallCycles, statistics::units::Cycle::get(), + "Top down, no uops executed and at least 1 in-flight load"), + ADD_STAT(L1miss, statistics::units::Cycle::get(), "l1miss"), + ADD_STAT(L2miss, statistics::units::Cycle::get(), "l2miss"), + ADD_STAT(L3miss, statistics::units::Cycle::get(), "l1miss") { + instsAdded.prereq(instsAdded); + + nonSpecInstsAdded.prereq(nonSpecInstsAdded); + + instsIssued.prereq(instsIssued); + + intInstsIssued.prereq(intInstsIssued); + + floatInstsIssued.prereq(floatInstsIssued); + + branchInstsIssued.prereq(branchInstsIssued); + + memInstsIssued.prereq(memInstsIssued); + + miscInstsIssued.prereq(miscInstsIssued); + + squashedInstsIssued.prereq(squashedInstsIssued); + + squashedInstsExamined.prereq(squashedInstsExamined); + + squashedOperandsExamined.prereq(squashedOperandsExamined); + + squashedNonSpecRemoved.prereq(squashedNonSpecRemoved); + /* + queueResDist + .init(Num_OpClasses, 0, 99, 2) + .name(name() + ".IQ:residence:") + .desc("cycles from dispatch to issue") + .flags(total | pdf | cdf ) + ; + for (int i = 0; i < Num_OpClasses; ++i) { + queueResDist.subname(i, opClassStrings[i]); + } + */ + numIssuedDist.init(0, total_width, 1).flags(statistics::pdf); + /* + dist_unissued + .init(Num_OpClasses+2) + .name(name() + ".unissued_cause") + .desc("Reason ready instruction not issued") + .flags(pdf | dist) + ; + for (int i=0; i < (Num_OpClasses + 2); ++i) { + dist_unissued.subname(i, unissued_names[i]); + } + */ + statIssuedInstType.init(cpu->numThreads, enums::Num_OpClass) + .flags(statistics::total | statistics::pdf | statistics::dist); + statIssuedInstType.ysubnames(enums::OpClassStrings); + + // + // How long did instructions for a particular FU type wait prior to issue + // + /* + issueDelayDist + .init(Num_OpClasses,0,99,2) + .name(name() + ".") + .desc("cycles from operands ready to issue") + .flags(pdf | cdf) + ; + for (int i=0; inumThreads).flags(statistics::total); + + fuBusyRate.flags(statistics::total); + fuBusyRate = fuBusy / instsIssued; + + numInstsExec0.prereq(numInstsExec0); + numInstsExec1.prereq(numInstsExec1); + numInstsExec2.prereq(numInstsExec2); + + L1miss.prereq(L1miss); + L2miss.prereq(L2miss); + L3miss.prereq(L3miss); } InstructionQueue::IQIOStats::IQIOStats(statistics::Group *parent) @@ -954,25 +926,25 @@ InstructionQueue::scheduleReadyInsts() int numBusyFUs = fuPool->numBusyFUs(); if (numBusyFUs == 0) - iqStats.numInstsExec0++; + iqStats.numInstsExec0++; else if (numBusyFUs == 1) - iqStats.numInstsExec1++; + iqStats.numInstsExec1++; else if (numBusyFUs == 2) - iqStats.numInstsExec2++; - - if (fuPool->isDrained() && iewStage->ldstQueue.numLoads()){ //numLoads returns for all threads, change it to a single thread - iqStats.loadStallCycles++; - if (iewStage->ldstQueue.anyCacheLevelMisses(3)) { - iqStats.L1miss++; - iqStats.L2miss++; - iqStats.L3miss++; - } - else if (iewStage->ldstQueue.anyCacheLevelMisses(2)) { - iqStats.L1miss++; - iqStats.L2miss++; - } - else if (iewStage->ldstQueue.anyCacheLevelMisses(1)) - iqStats.L1miss++; + iqStats.numInstsExec2++; + + if (fuPool->isDrained() && + iewStage->ldstQueue.numLoads()) { // numLoads returns for all threads, + // change it to a single thread + iqStats.loadStallCycles++; + if (iewStage->ldstQueue.anyCacheLevelMisses(3)) { + iqStats.L1miss++; + iqStats.L2miss++; + iqStats.L3miss++; + } else if (iewStage->ldstQueue.anyCacheLevelMisses(2)) { + iqStats.L1miss++; + iqStats.L2miss++; + } else if (iewStage->ldstQueue.anyCacheLevelMisses(1)) + iqStats.L1miss++; } } diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index de4120c8913..7b50b2b5380 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -573,9 +573,8 @@ class InstructionQueue statistics::Scalar vecAluAccesses; } iqIOStats; - public: - const IQStats& getStats() const { return iqStats; } - + public: + const IQStats &getStats() const { return iqStats; } }; } // namespace o3 diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 02ea47d595e..65ee52979f4 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -452,18 +452,17 @@ LSQ::sendRetryResp() dcachePort.sendRetryResp(); } -bool LSQ::anyCacheLevelMisses(int level) -{ - for (LSQUnit& unit : thread) { - for (auto& entry : unit.loadQueue){ - if(entry.valid() && entry.hasRequest()){ - auto req = entry.request()->mainReq(); - if (req->getAccessDepth() == level) - return true; - } - } +bool LSQ::anyCacheLevelMisses(int level) { + for (LSQUnit &unit : thread) { + for (auto &entry : unit.loadQueue) { + if (entry.valid() && entry.hasRequest()) { + auto req = entry.request()->mainReq(); + if (req->getAccessDepth() == level) + return true; + } } - return false; + } + return false; } bool diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index eb4098b2e45..c4539234ac4 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -118,7 +118,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) ADD_STAT(IQFullEvents, statistics::units::Count::get(), "Number of times rename has blocked due to IQ full"), ADD_STAT(LQFullEvents, statistics::units::Count::get(), - "Number of times rename has blocked due to LQ full" ), + "Number of times rename has blocked due to LQ full"), ADD_STAT(SQFullEvents, statistics::units::Count::get(), "Number of times rename has blocked due to SQ full"), ADD_STAT(fullRegistersEvents, statistics::units::Count::get(), @@ -147,46 +147,48 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) "count of temporary serializing insts renamed"), ADD_STAT(skidInsts, statistics::units::Count::get(), "count of insts added to the skid buffer"), - ADD_STAT(intReturned, statistics::units::Count::get(), - "count of registers freed and written back to integer free list"), + ADD_STAT( + intReturned, statistics::units::Count::get(), + "count of registers freed and written back to integer free list"), ADD_STAT(fpReturned, statistics::units::Count::get(), - "count of registers freed and written back to floating point free list"), + "count of registers freed and written back to floating point " + "free list"), ADD_STAT(storeStalls, statistics::units::Cycle::get(), - "Number of cycles with few uops executed and no more stores can be issued") -{ - squashCycles.prereq(squashCycles); - idleCycles.prereq(idleCycles); - blockCycles.prereq(blockCycles); - serializeStallCycles.flags(statistics::total); - runCycles.prereq(idleCycles); - unblockCycles.prereq(unblockCycles); - - renamedInsts.prereq(renamedInsts); - squashedInsts.prereq(squashedInsts); - - ROBFullEvents.prereq(ROBFullEvents); - IQFullEvents.prereq(IQFullEvents); - LQFullEvents.prereq(LQFullEvents); - SQFullEvents.prereq(SQFullEvents); - fullRegistersEvents.prereq(fullRegistersEvents); - - renamedOperands.prereq(renamedOperands); - lookups.prereq(lookups); - intLookups.prereq(intLookups); - fpLookups.prereq(fpLookups); - vecLookups.prereq(vecLookups); - vecPredLookups.prereq(vecPredLookups); - matLookups.prereq(matLookups); - - committedMaps.prereq(committedMaps); - undoneMaps.prereq(undoneMaps); - serializing.flags(statistics::total); - tempSerializing.flags(statistics::total); - skidInsts.flags(statistics::total); - - intReturned.prereq(intReturned); - fpReturned.prereq(fpReturned); - storeStalls.prereq(storeStalls); + "Number of cycles with few uops executed and no more stores can " + "be issued") { + squashCycles.prereq(squashCycles); + idleCycles.prereq(idleCycles); + blockCycles.prereq(blockCycles); + serializeStallCycles.flags(statistics::total); + runCycles.prereq(idleCycles); + unblockCycles.prereq(unblockCycles); + + renamedInsts.prereq(renamedInsts); + squashedInsts.prereq(squashedInsts); + + ROBFullEvents.prereq(ROBFullEvents); + IQFullEvents.prereq(IQFullEvents); + LQFullEvents.prereq(LQFullEvents); + SQFullEvents.prereq(SQFullEvents); + fullRegistersEvents.prereq(fullRegistersEvents); + + renamedOperands.prereq(renamedOperands); + lookups.prereq(lookups); + intLookups.prereq(intLookups); + fpLookups.prereq(fpLookups); + vecLookups.prereq(vecLookups); + vecPredLookups.prereq(vecPredLookups); + matLookups.prereq(matLookups); + + committedMaps.prereq(committedMaps); + undoneMaps.prereq(undoneMaps); + serializing.flags(statistics::total); + tempSerializing.flags(statistics::total); + skidInsts.flags(statistics::total); + + intReturned.prereq(intReturned); + fpReturned.prereq(fpReturned); + storeStalls.prereq(storeStalls); } void @@ -631,7 +633,7 @@ Rename::renameInsts(ThreadID tid) DynInstPtr inst = insts_to_rename.front(); - //For all kind of instructions, check ROB and IQ first. For load + //For all kind of instructions, check ROB and IQ first For load //instruction, check LQ size and take into account the inflight loads //For store instruction, check SQ size and take into account the //inflight stores @@ -652,12 +654,11 @@ Rename::renameInsts(ThreadID tid) tid); source = SQ; incrFullStat(source); - if (iew_ptr->fuPool->isDrained()){ - stats.storeStalls++; + if (iew_ptr->fuPool->isDrained()) { + stats.storeStalls++; } break; } - } insts_to_rename.pop_front(); diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index eb73fcf5599..4cdec7e0bbd 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -545,7 +545,7 @@ class Rename } stats; public: - const RenameStats& getStats() const { return stats; } + const RenameStats &getStats() const { return stats; } unsigned getWidth() const { return renameWidth; } }; From b65ac3465b48ddddc2de50bc632c0f7f3b0ee344 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 Jun 2025 16:33:16 +0000 Subject: [PATCH 4/7] cpu: formatting issues to tdm --- src/cpu/o3/commit.cc | 30 ++++--- src/cpu/o3/cpu.cc | 14 +-- src/cpu/o3/inst_queue.cc | 179 ++++++++++++++++++++++----------------- src/cpu/o3/lsq.cc | 3 +- src/cpu/o3/rename.cc | 79 +++++++++-------- 5 files changed, 171 insertions(+), 134 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index ca7de6c506b..41a3f409f24 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -171,22 +171,32 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit) ADD_STAT(recoveryBubblesMissprediction, statistics::units::Cycle::get(), "Required for Top-Down, recovery bubbles"), ADD_STAT(recoveryBubblesMemoryNuke, statistics::units::Cycle::get(), - "Required for Top-Down, recovery bubbles") { - using namespace statistics; + "Required for Top-Down, recovery bubbles") +{ + using namespace statistics; - commitSquashedInsts.prereq(commitSquashedInsts); - commitNonSpecStalls.prereq(commitNonSpecStalls); - branchMispredicts.prereq(branchMispredicts); + commitSquashedInsts.prereq(commitSquashedInsts); + commitNonSpecStalls.prereq(commitNonSpecStalls); + branchMispredicts.prereq(branchMispredicts); - numCommittedDist.init(0, commit->commitWidth, 1).flags(statistics::pdf); + numCommittedDist + .init(0,commit->commitWidth,1) + .flags(statistics::pdf); - amos.init(cpu->numThreads).flags(total); + amos + .init(cpu->numThreads) + .flags(total); - membars.init(cpu->numThreads).flags(total); + membars + .init(cpu->numThreads) + .flags(total); - functionCalls.init(commit->numThreads).flags(total); + functionCalls + .init(commit->numThreads) + .flags(total); - committedInstType.init(commit->numThreads, enums::Num_OpClass) + committedInstType + .init(commit->numThreads,enums::Num_OpClass) .flags(total | pdf | dist); committedInstType.ysubnames(enums::OpClassStrings); diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 8eb1c113fcd..906109f6d30 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -356,13 +356,17 @@ CPU::CPUStats::CPUStats(CPU *cpu) ADD_STAT(quiesceCycles, statistics::units::Cycle::get(), "Total number of cycles that CPU has spent quiesced or waiting " "for an interrupt"), - topDownStats(cpu) { - // Register any of the O3CPU's stats here. - timesIdled.prereq(timesIdled); + topDownStats(cpu) +{ + // Register any of the O3CPU's stats here. + timesIdled + .prereq(timesIdled); - idleCycles.prereq(idleCycles); + idleCycles + .prereq(idleCycles); - quiesceCycles.prereq(quiesceCycles); + quiesceCycles + .prereq(quiesceCycles); } CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu) diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index 4cba6bad963..04ff7c94d08 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -179,82 +179,92 @@ InstructionQueue::name() const InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) : statistics::Group(cpu), - ADD_STAT(instsAdded, statistics::units::Count::get(), - "Number of instructions added to the IQ (excludes non-spec)"), - ADD_STAT(nonSpecInstsAdded, statistics::units::Count::get(), - "Number of non-speculative instructions added to the IQ"), - ADD_STAT(instsIssued, statistics::units::Count::get(), - "Number of instructions issued"), - ADD_STAT(intInstsIssued, statistics::units::Count::get(), - "Number of integer instructions issued"), - ADD_STAT(floatInstsIssued, statistics::units::Count::get(), - "Number of float instructions issued"), - ADD_STAT(branchInstsIssued, statistics::units::Count::get(), - "Number of branch instructions issued"), - ADD_STAT(memInstsIssued, statistics::units::Count::get(), - "Number of memory instructions issued"), - ADD_STAT(miscInstsIssued, statistics::units::Count::get(), - "Number of miscellaneous instructions issued"), - ADD_STAT(squashedInstsIssued, statistics::units::Count::get(), - "Number of squashed instructions issued"), - ADD_STAT(squashedInstsExamined, statistics::units::Count::get(), - "Number of squashed instructions iterated over during squash; " - "mainly for profiling"), - ADD_STAT(squashedOperandsExamined, statistics::units::Count::get(), - "Number of squashed operands that are examined and possibly " - "removed from graph"), - ADD_STAT(squashedNonSpecRemoved, statistics::units::Count::get(), - "Number of squashed non-spec instructions that were removed"), - ADD_STAT(numIssuedDist, statistics::units::Count::get(), - "Number of insts issued each cycle"), - ADD_STAT(statFuBusy, statistics::units::Count::get(), - "attempts to use FU when none available"), - ADD_STAT(statIssuedInstType, statistics::units::Count::get(), - "Number of instructions issued per FU type, per thread"), - ADD_STAT(issueRate, - statistics::units::Rate::get(), + ADD_STAT(instsAdded, statistics::units::Count::get(), + "Number of instructions added to the IQ (excludes non-spec)"), + ADD_STAT(nonSpecInstsAdded, statistics::units::Count::get(), + "Number of non-speculative instructions added to the IQ"), + ADD_STAT(instsIssued, statistics::units::Count::get(), + "Number of instructions issued"), + ADD_STAT(intInstsIssued, statistics::units::Count::get(), + "Number of integer instructions issued"), + ADD_STAT(floatInstsIssued, statistics::units::Count::get(), + "Number of float instructions issued"), + ADD_STAT(branchInstsIssued, statistics::units::Count::get(), + "Number of branch instructions issued"), + ADD_STAT(memInstsIssued, statistics::units::Count::get(), + "Number of memory instructions issued"), + ADD_STAT(miscInstsIssued, statistics::units::Count::get(), + "Number of miscellaneous instructions issued"), + ADD_STAT(squashedInstsIssued, statistics::units::Count::get(), + "Number of squashed instructions issued"), + ADD_STAT(squashedInstsExamined, statistics::units::Count::get(), + "Number of squashed instructions iterated over during squash; " + "mainly for profiling"), + ADD_STAT(squashedOperandsExamined, statistics::units::Count::get(), + "Number of squashed operands that are examined and possibly " + "removed from graph"), + ADD_STAT(squashedNonSpecRemoved, statistics::units::Count::get(), + "Number of squashed non-spec instructions that were removed"), + ADD_STAT(numIssuedDist, statistics::units::Count::get(), + "Number of insts issued each cycle"), + ADD_STAT(statFuBusy, statistics::units::Count::get(), + "attempts to use FU when none available"), + ADD_STAT(statIssuedInstType, statistics::units::Count::get(), + "Number of instructions issued per FU type, per thread"), + ADD_STAT(issueRate, statistics::units::Rate< + statistics::units::Count, statistics::units::Cycle>::get(), "Inst issue rate", instsIssued / cpu->baseStats.numCycles), - ADD_STAT(fuBusy, statistics::units::Count::get(), - "FU busy when requested"), - ADD_STAT(fuBusyRate, - statistics::units::Rate::get(), - "FU busy rate (busy events/executed inst)"), - ADD_STAT(numInstsExec0, statistics::units::Count::get(), - "0 instructions executed in a cycle"), - ADD_STAT(numInstsExec1, statistics::units::Count::get(), - "1 instruction executed in a cycle"), - ADD_STAT(numInstsExec2, statistics::units::Count::get(), - "2 instructions executed in a cycle"), - ADD_STAT(loadStallCycles, statistics::units::Cycle::get(), - "Top down, no uops executed and at least 1 in-flight load"), - ADD_STAT(L1miss, statistics::units::Cycle::get(), "l1miss"), - ADD_STAT(L2miss, statistics::units::Cycle::get(), "l2miss"), - ADD_STAT(L3miss, statistics::units::Cycle::get(), "l1miss") { - instsAdded.prereq(instsAdded); + ADD_STAT(fuBusy, statistics::units::Count::get(), "FU busy when requested"), + ADD_STAT(fuBusyRate, statistics::units::Rate< + statistics::units::Count, statistics::units::Count>::get(), + "FU busy rate (busy events/executed inst)"), + ADD_STAT(numInstsExec0, statistics::units::Count::get(), + "0 instructions executed in a cycle"), + ADD_STAT(numInstsExec1, statistics::units::Count::get(), + "1 instruction executed in a cycle"), + ADD_STAT(numInstsExec2, statistics::units::Count::get(), + "2 instructions executed in a cycle"), + ADD_STAT(loadStallCycles, statistics::units::Cycle::get(), + "Top down, no uops executed and at least 1 in-flight load"), + ADD_STAT(L1miss, statistics::units::Cycle::get(), "l1miss"), + ADD_STAT(L2miss, statistics::units::Cycle::get(), "l2miss"), + ADD_STAT(L3miss, statistics::units::Cycle::get(), "l1miss") +{ + instsAdded. + prereq(instsAdded); - nonSpecInstsAdded.prereq(nonSpecInstsAdded); + nonSpecInstsAdded + .prereq(nonSpecInstsAdded); - instsIssued.prereq(instsIssued); + instsIssued + .prereq(instsIssued); - intInstsIssued.prereq(intInstsIssued); + intInstsIssued + .prereq(intInstsIssued); - floatInstsIssued.prereq(floatInstsIssued); + floatInstsIssued + .prereq(floatInstsIssued); - branchInstsIssued.prereq(branchInstsIssued); + branchInstsIssued + .prereq(branchInstsIssued); - memInstsIssued.prereq(memInstsIssued); + memInstsIssued + .prereq(memInstsIssued); - miscInstsIssued.prereq(miscInstsIssued); + miscInstsIssued + .prereq(miscInstsIssued); - squashedInstsIssued.prereq(squashedInstsIssued); + squashedInstsIssued + .prereq(squashedInstsIssued); - squashedInstsExamined.prereq(squashedInstsExamined); + squashedInstsExamined + .prereq(squashedInstsExamined); - squashedOperandsExamined.prereq(squashedOperandsExamined); + squashedOperandsExamined + .prereq(squashedOperandsExamined); - squashedNonSpecRemoved.prereq(squashedNonSpecRemoved); + squashedNonSpecRemoved + .prereq(squashedNonSpecRemoved); /* queueResDist .init(Num_OpClasses, 0, 99, 2) @@ -266,7 +276,10 @@ InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) queueResDist.subname(i, opClassStrings[i]); } */ - numIssuedDist.init(0, total_width, 1).flags(statistics::pdf); + numIssuedDist + .init(0,total_width,1) + .flags(statistics::pdf) + ; /* dist_unissued .init(Num_OpClasses+2) @@ -298,25 +311,35 @@ InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) issueDelayDist.subname(i, subname.str()); } */ - issueRate.flags(statistics::total); - - statFuBusy.init(Num_OpClasses).flags(statistics::pdf | statistics::dist); - for (int i = 0; i < Num_OpClasses; ++i) { + issueRate + .flags(statistics::total) + ; + + statFuBusy + .init(Num_OpClasses) + .flags(statistics::pdf | statistics::dist) + ; + for (int i=0; i < Num_OpClasses; ++i) { statFuBusy.subname(i, enums::OpClassStrings[i]); } - fuBusy.init(cpu->numThreads).flags(statistics::total); + fuBusy + .init(cpu->numThreads) + .flags(statistics::total) + ; - fuBusyRate.flags(statistics::total); - fuBusyRate = fuBusy / instsIssued; + fuBusyRate + .flags(statistics::total) + ; + fuBusyRate = fuBusy / instsIssued; - numInstsExec0.prereq(numInstsExec0); - numInstsExec1.prereq(numInstsExec1); - numInstsExec2.prereq(numInstsExec2); + numInstsExec0.prereq(numInstsExec0); + numInstsExec1.prereq(numInstsExec1); + numInstsExec2.prereq(numInstsExec2); - L1miss.prereq(L1miss); - L2miss.prereq(L2miss); - L3miss.prereq(L3miss); + L1miss.prereq(L1miss); + L2miss.prereq(L2miss); + L3miss.prereq(L3miss); } InstructionQueue::IQIOStats::IQIOStats(statistics::Group *parent) diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 65ee52979f4..f81ead16cf2 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -452,7 +452,8 @@ LSQ::sendRetryResp() dcachePort.sendRetryResp(); } -bool LSQ::anyCacheLevelMisses(int level) { +bool +LSQ::anyCacheLevelMisses(int level) { for (LSQUnit &unit : thread) { for (auto &entry : unit.loadQueue) { if (entry.valid() && entry.hasRequest()) { diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index c4539234ac4..8ded3719324 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -118,7 +118,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) ADD_STAT(IQFullEvents, statistics::units::Count::get(), "Number of times rename has blocked due to IQ full"), ADD_STAT(LQFullEvents, statistics::units::Count::get(), - "Number of times rename has blocked due to LQ full"), + "Number of times rename has blocked due to LQ full" ), ADD_STAT(SQFullEvents, statistics::units::Count::get(), "Number of times rename has blocked due to SQ full"), ADD_STAT(fullRegistersEvents, statistics::units::Count::get(), @@ -147,48 +147,47 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) "count of temporary serializing insts renamed"), ADD_STAT(skidInsts, statistics::units::Count::get(), "count of insts added to the skid buffer"), - ADD_STAT( - intReturned, statistics::units::Count::get(), + ADD_STAT(intReturned, statistics::units::Count::get(), "count of registers freed and written back to integer free list"), ADD_STAT(fpReturned, statistics::units::Count::get(), - "count of registers freed and written back to floating point " - "free list"), + "count of registers freed and written back to floating point free list"), ADD_STAT(storeStalls, statistics::units::Cycle::get(), - "Number of cycles with few uops executed and no more stores can " - "be issued") { - squashCycles.prereq(squashCycles); - idleCycles.prereq(idleCycles); - blockCycles.prereq(blockCycles); - serializeStallCycles.flags(statistics::total); - runCycles.prereq(idleCycles); - unblockCycles.prereq(unblockCycles); - - renamedInsts.prereq(renamedInsts); - squashedInsts.prereq(squashedInsts); - - ROBFullEvents.prereq(ROBFullEvents); - IQFullEvents.prereq(IQFullEvents); - LQFullEvents.prereq(LQFullEvents); - SQFullEvents.prereq(SQFullEvents); - fullRegistersEvents.prereq(fullRegistersEvents); - - renamedOperands.prereq(renamedOperands); - lookups.prereq(lookups); - intLookups.prereq(intLookups); - fpLookups.prereq(fpLookups); - vecLookups.prereq(vecLookups); - vecPredLookups.prereq(vecPredLookups); - matLookups.prereq(matLookups); - - committedMaps.prereq(committedMaps); - undoneMaps.prereq(undoneMaps); - serializing.flags(statistics::total); - tempSerializing.flags(statistics::total); - skidInsts.flags(statistics::total); - - intReturned.prereq(intReturned); - fpReturned.prereq(fpReturned); - storeStalls.prereq(storeStalls); + "Number of cycles with few uops executed and no more stores" + "can be issued") +{ + squashCycles.prereq(squashCycles); + idleCycles.prereq(idleCycles); + blockCycles.prereq(blockCycles); + serializeStallCycles.flags(statistics::total); + runCycles.prereq(idleCycles); + unblockCycles.prereq(unblockCycles); + + renamedInsts.prereq(renamedInsts); + squashedInsts.prereq(squashedInsts); + + ROBFullEvents.prereq(ROBFullEvents); + IQFullEvents.prereq(IQFullEvents); + LQFullEvents.prereq(LQFullEvents); + SQFullEvents.prereq(SQFullEvents); + fullRegistersEvents.prereq(fullRegistersEvents); + + renamedOperands.prereq(renamedOperands); + lookups.prereq(lookups); + intLookups.prereq(intLookups); + fpLookups.prereq(fpLookups); + vecLookups.prereq(vecLookups); + vecPredLookups.prereq(vecPredLookups); + matLookups.prereq(matLookups); + + committedMaps.prereq(committedMaps); + undoneMaps.prereq(undoneMaps); + serializing.flags(statistics::total); + tempSerializing.flags(statistics::total); + skidInsts.flags(statistics::total); + + intReturned.prereq(intReturned); + fpReturned.prereq(fpReturned); + storeStalls.prereq(storeStalls); } void From 913ba5945f598a280890c2706de4d12a0dd2c2b4 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 10 Jun 2025 11:05:19 +0000 Subject: [PATCH 5/7] cpu: topdown l2 bad speculation l3 frontend --- src/cpu/o3/commit.hh | 1 + src/cpu/o3/cpu.cc | 77 +++++++++++++++++++++++++++++++------------- src/cpu/o3/cpu.hh | 18 ++++++++--- src/cpu/o3/decode.cc | 33 ++++++++++--------- src/cpu/o3/fetch.hh | 3 ++ 5 files changed, 90 insertions(+), 42 deletions(-) diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 675ee4febc6..2c365569595 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -487,6 +487,7 @@ class Commit statistics::Vector functionCalls; /** Committed instructions by instruction type (OpClass) */ statistics::Vector2d committedInstType; + /** Number of cycles where the commit bandwidth limit is reached. */ statistics::Scalar commitEligibleSamples; /** Top Down Methodology, Number of commited instructions*/ diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 906109f6d30..8f35a323689 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -371,7 +371,7 @@ CPU::CPUStats::CPUStats(CPU *cpu) CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu) : statistics::Group(cpu, "TopDownStats"), topDownL1(cpu), topDownFbL2(cpu), - topDownBbL2(cpu), topDownBbMem(cpu) {} + topDownBsL2(cpu), topDownBbL2(cpu), topDownBbMem(cpu), topDownFlL3(cpu) {} CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) : statistics::Group(cpu, "TopDownL1"), @@ -431,25 +431,60 @@ CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2( cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; } -// CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2(CPU -// *cpu) -// : statistics::Group(cpu, "TopDownL2_FrontendBound"), -// ADD_STAT(fetchLatency, -// statistics::units::Rate::get(), -// "Fetch Latency Bound, frontend stalls due to instruction cache -// inefficiency"), -// ADD_STAT(fetchBandwidth, -// statistics::units::Rate::get(), -// "Fetch Bandwidth Bound, frontend stalls due to decoder -// inefficiency") -// { -// // Frontend L2 -// fetchLatency = cpu->decode.getStats().fetchBubblesMax / -// (cpu->baseStats.numCycles); fetchBandwidth = -// cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; -// } +CPU::CPUStats::TopDownStats::TopDownBadSpeculationL2 +::TopDownBadSpeculationL2(CPU *cpu) + : statistics::Group(cpu, "TopDownL2_BadSpeculation"), + ADD_STAT(branchMissPredicts, + statistics::units::Rate::get(), + "Branch Miss Predicts"), + ADD_STAT(machineClears, + statistics::units::Rate::get(), + "Machine Clears") +{ + branchMissPredicts = cpu->commit.getStats().recoveryBubblesMissprediction + / (cpu->commit.getStats().recoveryBubblesMissprediction + + cpu->commit.getStats().recoveryBubblesMemoryNuke); + + machineClears = cpu->commit.getStats().recoveryBubblesMemoryNuke / + (cpu->commit.getStats().recoveryBubblesMissprediction + + cpu->commit.getStats().recoveryBubblesMemoryNuke); +} + +CPU::CPUStats::TopDownStats::TopDownFrontendBoundL3 +::TopDownFrontendBoundL3(CPU *cpu) + : statistics::Group(cpu, "TopDownL3_FrontendBound"), + ADD_STAT(iTlbMiss, + statistics::units::Rate::get(), + "Instruction TLB Miss Stalls"), + ADD_STAT(iCacheMiss, + statistics::units::Rate::get(), + "Instruction Cache Miss Stalls"), + ADD_STAT(branchResteer, + statistics::units::Rate::get(), + "Branch Resteer Stalls"), + ADD_STAT(others, + statistics::units::Rate::get(), + "Others") +{ + auto sum = cpu->fetchStats[0]->icacheStallCycles + + cpu->fetch.getStats().tlbCycles + cpu->fetch.getStats().ftqStallCycles + + cpu->fetch.getStats().miscStallCycles; + + iTlbMiss = cpu->fetch.getStats().tlbCycles / sum; + + //TODO: change 0 with tid? + iCacheMiss = cpu->fetchStats[0]->icacheStallCycles / sum; + + branchResteer = cpu->fetch.getStats().ftqStallCycles / sum; + + others = cpu->fetch.getStats().miscStallCycles / sum; +} CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2( CPU *cpu) @@ -469,8 +504,6 @@ CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2( cpu->iew.instQueue.getStats().numInstsExec1 + cpu->iew.instQueue.getStats().numInstsExec2) / (cpu->baseStats.numCycles); - // memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + - // cpu->rename.getStats().SQFullEvents) / (cpu->baseStats.numCycles); memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index eec0f44bebf..4f9b6fc1347 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -619,11 +619,11 @@ class CPU : public BaseCPU statistics::Formula fetchBandwidth; } topDownFbL2; - // struct TopDownBadSpeculationL2 : statistics::Group{ - // TopDownBadSpeculationL2(CPU *cpu); - // statistics::Formula branchMissPredicts; - // statistics::Formula machineClears; - // } + struct TopDownBadSpeculationL2 : statistics::Group{ + TopDownBadSpeculationL2(CPU *cpu); + statistics::Formula branchMissPredicts; + statistics::Formula machineClears; + } topDownBsL2; struct TopDownBackendBoundL2 : statistics::Group { TopDownBackendBoundL2(CPU *cpu); @@ -641,6 +641,14 @@ class CPU : public BaseCPU statistics::Formula storeBound; } topDownBbMem; + struct TopDownFrontendBoundL3 : statistics::Group { + TopDownFrontendBoundL3(CPU *cpu); + statistics::Formula iTlbMiss; + statistics::Formula iCacheMiss; + statistics::Formula branchResteer; + statistics::Formula others; + } topDownFlL3; + } topDownStats; } cpuStats; diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index f2f393f936d..22491c6286f 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -164,19 +164,20 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) "delivered to backend"), ADD_STAT(fetchBubblesMax, statistics::units::Count::get(), "Stat for Top-Down Methodology, number of cycles in which no " - "instructions are delivered to backend") { - idleCycles.prereq(idleCycles); - blockedCycles.prereq(blockedCycles); - runCycles.prereq(runCycles); - unblockCycles.prereq(unblockCycles); - squashCycles.prereq(squashCycles); - branchResolved.prereq(branchResolved); - branchMispred.prereq(branchMispred); - controlMispred.prereq(controlMispred); - decodedInsts.prereq(decodedInsts); - squashedInsts.prereq(squashedInsts); - fetchBubbles.prereq(fetchBubbles); - fetchBubblesMax.prereq(fetchBubblesMax); + "instructions are delivered to backend") +{ + idleCycles.prereq(idleCycles); + blockedCycles.prereq(blockedCycles); + runCycles.prereq(runCycles); + unblockCycles.prereq(unblockCycles); + squashCycles.prereq(squashCycles); + branchResolved.prereq(branchResolved); + branchMispred.prereq(branchMispred); + controlMispred.prereq(controlMispred); + decodedInsts.prereq(decodedInsts); + squashedInsts.prereq(squashedInsts); + fetchBubbles.prereq(fetchBubbles); + fetchBubblesMax.prereq(fetchBubblesMax); } void @@ -590,7 +591,7 @@ Decode::tick() stats.fetchBubbles += fetchBubbles; if (fetchBubbles == decodeWidth) - stats.fetchBubblesMax++; + stats.fetchBubblesMax++; } if (status_change) { @@ -663,7 +664,9 @@ Decode::decodeInsts(ThreadID tid) " early.\n",tid); // Should I change the status to idle? ++stats.idleCycles; - return; + // if (timeBuffer->access(tid)->fetchInfo[tid].squash) + // fetchBubbles -= decodeWidth; + // return; } else if (decodeStatus[tid] == Unblocking) { DPRINTF(Decode, "[tid:%i] Unblocking, removing insts from skid " "buffer.\n",tid); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 05d92e37567..0f1192d2f97 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -592,6 +592,9 @@ class Fetch /** Rate of how often fetch was idle. */ statistics::Formula idleRate; } fetchStats; + + public: + const FetchStatGroup &getStats() const { return fetchStats; } }; } // namespace o3 From 11935038ad3daa192cad9f9f8098d9d197ada963 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 10 Jun 2025 15:40:13 +0000 Subject: [PATCH 6/7] cpu: top down bad speculation fix --- src/cpu/o3/commit.cc | 28 +------------- src/cpu/o3/commit.hh | 11 ------ src/cpu/o3/cpu.cc | 92 ++++++++++++++++++-------------------------- src/cpu/o3/cpu.hh | 14 +++---- src/cpu/o3/decode.hh | 2 + src/cpu/o3/iew.cc | 1 + src/cpu/o3/iew.hh | 6 ++- src/cpu/o3/rename.hh | 2 + 8 files changed, 55 insertions(+), 101 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index 41a3f409f24..cfc6ae8e2b3 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -167,11 +167,7 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit) ADD_STAT(commitEligibleSamples, statistics::units::Cycle::get(), "number cycles where commit BW limit reached"), ADD_STAT(committedInst, statistics::units::Count::get(), - "Required for Top-Down, number of committed instructions"), - ADD_STAT(recoveryBubblesMissprediction, statistics::units::Cycle::get(), - "Required for Top-Down, recovery bubbles"), - ADD_STAT(recoveryBubblesMemoryNuke, statistics::units::Cycle::get(), - "Required for Top-Down, recovery bubbles") + "Required for Top-Down, number of committed instructions") { using namespace statistics; @@ -967,12 +963,6 @@ Commit::commitInsts() DPRINTF(Commit, "Retiring squashed instruction from " "ROB.\n"); - if (!isMissPredicted && !isMemoryViolation) { - stats.numMachineClear++; - isMemoryViolation = true; - recoveryBubbleStart = cpu->curCycle(); - } - rob->retireHead(commit_thread); ++stats.commitSquashedInsts; @@ -1005,22 +995,6 @@ Commit::commitInsts() stats.committedInstType[tid][head_inst->opClass()]++; ppCommit->notify(head_inst); - if (isMissPredicted) { - stats.recoveryBubblesMissprediction += - uint64_t(cpu->curCycle() - recoveryBubbleStart); - } else if (isMemoryViolation) { - stats.recoveryBubblesMemoryNuke += - uint64_t(cpu->curCycle() - recoveryBubbleStart); - } - - isMemoryViolation = false; - isMissPredicted = false; - - if (head_inst->mispredicted()) { - recoveryBubbleStart = cpu->curCycle(); - isMissPredicted = true; - } - // hardware transactional memory // update nesting depth diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 2c365569595..3f16f1adc87 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -492,19 +492,8 @@ class Commit statistics::Scalar commitEligibleSamples; /** Top Down Methodology, Number of commited instructions*/ statistics::Scalar committedInst; - statistics::Scalar numMachineClear; - /** Top Down Methodology, Recovery bubbles, miss predictions*/ - statistics::Scalar recoveryBubblesMissprediction; - /** Top Down Methodology, Recovery bubbles, memory nukes */ - statistics::Scalar recoveryBubblesMemoryNuke; - } stats; - // Top Down Methodology - Cycles recoveryBubbleStart; - bool isMissPredicted = false; - bool isMemoryViolation = false; - public: const CommitStats &getStats() const { return stats; } }; diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 8f35a323689..f3c8a7a938b 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -371,7 +371,7 @@ CPU::CPUStats::CPUStats(CPU *cpu) CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu) : statistics::Group(cpu, "TopDownStats"), topDownL1(cpu), topDownFbL2(cpu), - topDownBsL2(cpu), topDownBbL2(cpu), topDownBbMem(cpu), topDownFlL3(cpu) {} + topDownBsL2(cpu), topDownBbL2(cpu), topDownBbMem(cpu) {} CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) : statistics::Group(cpu, "TopDownL1"), @@ -387,28 +387,39 @@ CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) ADD_STAT(backendBound, statistics::units::Rate::get(), - "Backend Bound, fraction of slots lost due to backend resource " - "constraints."), + "Backend Bound, fraction of slots lost due to backend resource" + " constraints."), ADD_STAT( retiring, statistics::units::Rate::get(), "Retiring, fraction of slots successfully retired by the backend") { - // L1 - frontendBound = cpu->decode.getStats().fetchBubbles / - (cpu->rename.getWidth() * cpu->baseStats.numCycles); + // L1 + frontendBound = cpu->decode.getStats().fetchBubbles / + (cpu->rename.getWidth() * cpu->baseStats.numCycles); - badSpeculation = (cpu->rename.getStats().renamedInsts - - cpu->commit.getStats().committedInst + - (cpu->commit.getStats().recoveryBubblesMissprediction + - cpu->commit.getStats().recoveryBubblesMemoryNuke) - * cpu->rename.getWidth()) / - (cpu->rename.getWidth() * cpu->baseStats.numCycles); + int recoveryCycleToDecode = cpu->decode.getFetchToDecodeDelay(); - retiring = cpu->commit.getStats().committedInst / - (cpu->rename.getWidth() * cpu->baseStats.numCycles); - - backendBound = 1 - (frontendBound + badSpeculation + retiring); + int recoveryCycleToIEW = cpu->decode.getFetchToDecodeDelay() + + cpu->rename.getDecodeToRenameDelay() + cpu->iew.getRenameToIEWDelay(); + + auto wastedSlots = cpu->rename.getStats().renamedInsts - + cpu->commit.getStats().committedInst; + + auto decodeBranchMispred = (int)recoveryCycleToDecode + * cpu->decode.getStats().branchMispred; + + auto iewBranchMispred = (int)recoveryCycleToIEW + * cpu->iew.getStats().branchMispredicts; + + badSpeculation = (wastedSlots + (decodeBranchMispred + iewBranchMispred) + * cpu->rename.getWidth()) / + (cpu->rename.getWidth() * cpu->baseStats.numCycles); + + retiring = cpu->commit.getStats().committedInst / + (cpu->rename.getWidth() * cpu->baseStats.numCycles); + + backendBound = 1 - (frontendBound + badSpeculation + retiring); } CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2( @@ -441,51 +452,22 @@ ::TopDownBadSpeculationL2(CPU *cpu) ADD_STAT(machineClears, statistics::units::Rate::get(), - "Machine Clears") + "Memory Order Violations") { - branchMissPredicts = cpu->commit.getStats().recoveryBubblesMissprediction - / (cpu->commit.getStats().recoveryBubblesMissprediction + - cpu->commit.getStats().recoveryBubblesMemoryNuke); + auto &iewMissPred = cpu->iew.getStats().branchMispredicts; + auto &decodeMissPred = cpu->decode.getStats().branchMispred; - machineClears = cpu->commit.getStats().recoveryBubblesMemoryNuke / - (cpu->commit.getStats().recoveryBubblesMissprediction + - cpu->commit.getStats().recoveryBubblesMemoryNuke); -} + auto brMispredictFraction = (iewMissPred + decodeMissPred) / (iewMissPred + + decodeMissPred + cpu->iew.getStats().memOrderViolationEvents); -CPU::CPUStats::TopDownStats::TopDownFrontendBoundL3 -::TopDownFrontendBoundL3(CPU *cpu) - : statistics::Group(cpu, "TopDownL3_FrontendBound"), - ADD_STAT(iTlbMiss, - statistics::units::Rate::get(), - "Instruction TLB Miss Stalls"), - ADD_STAT(iCacheMiss, - statistics::units::Rate::get(), - "Instruction Cache Miss Stalls"), - ADD_STAT(branchResteer, - statistics::units::Rate::get(), - "Branch Resteer Stalls"), - ADD_STAT(others, - statistics::units::Rate::get(), - "Others") -{ - auto sum = cpu->fetchStats[0]->icacheStallCycles + - cpu->fetch.getStats().tlbCycles + cpu->fetch.getStats().ftqStallCycles - + cpu->fetch.getStats().miscStallCycles; - - iTlbMiss = cpu->fetch.getStats().tlbCycles / sum; - - //TODO: change 0 with tid? - iCacheMiss = cpu->fetchStats[0]->icacheStallCycles / sum; + branchMissPredicts = brMispredictFraction + * cpu->cpuStats.topDownStats.topDownL1.badSpeculation; - branchResteer = cpu->fetch.getStats().ftqStallCycles / sum; - - others = cpu->fetch.getStats().miscStallCycles / sum; + machineClears = cpu->cpuStats.topDownStats.topDownL1.badSpeculation + - branchMissPredicts; } + CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2( CPU *cpu) : statistics::Group(cpu, "TopDownL2_BackendBound"), diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 4f9b6fc1347..0b8d0e43135 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -641,13 +641,13 @@ class CPU : public BaseCPU statistics::Formula storeBound; } topDownBbMem; - struct TopDownFrontendBoundL3 : statistics::Group { - TopDownFrontendBoundL3(CPU *cpu); - statistics::Formula iTlbMiss; - statistics::Formula iCacheMiss; - statistics::Formula branchResteer; - statistics::Formula others; - } topDownFlL3; + // struct TopDownFrontendBoundL3 : statistics::Group { + // TopDownFrontendBoundL3(CPU *cpu); + // statistics::Formula iTlbMiss; + // statistics::Formula iCacheMiss; + // statistics::Formula branchResteer; + // statistics::Formula others; + // } topDownFlL3; } topDownStats; diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index 7c07505c75e..5dbab781f96 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -337,6 +337,8 @@ class Decode public: const DecodeStats &getStats() const { return stats; } + + Cycles getFetchToDecodeDelay() { return fetchToDecodeDelay; } }; } // namespace o3 diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index a01c6b9deca..db491315b40 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1331,6 +1331,7 @@ IEW::executeInsts() ++iewStats.memOrderViolationEvents; } + } else { // Reset any state associated with redirects that will not // be used. diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 4fe8227dcc8..f44021a1d61 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -414,7 +414,6 @@ class IEW /** Maximum size of the skid buffer. */ unsigned skidBufferMax; - struct IEWStats : public statistics::Group { IEWStats(CPU *cpu); @@ -475,6 +474,11 @@ class IEW /** Average number of woken instructions per writeback. */ statistics::Formula wbFanout; } iewStats; + + public: + const IEWStats &getStats() const { return iewStats; } + + Cycles getRenameToIEWDelay() { return renameToIEWDelay; } }; } // namespace o3 diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 4cdec7e0bbd..c8cfcb75416 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -548,6 +548,8 @@ class Rename const RenameStats &getStats() const { return stats; } unsigned getWidth() const { return renameWidth; } + + int getDecodeToRenameDelay() { return decodeToRenameDelay; } }; } // namespace o3 From 8ea14d5a5976f406c777400cdc9a965f828b3fce Mon Sep 17 00:00:00 2001 From: root Date: Sun, 15 Jun 2025 14:49:24 +0000 Subject: [PATCH 7/7] cpu: top-down branch calculation change --- src/cpu/o3/commit.cc | 30 +++----- src/cpu/o3/cpu.cc | 136 ++++++++++++++++++++++-------------- src/cpu/o3/decode.cc | 11 +-- src/cpu/o3/fetch.cc | 160 ++++++++++++++++++++++--------------------- src/cpu/o3/fetch.hh | 8 +++ src/cpu/o3/iew.hh | 2 +- 6 files changed, 189 insertions(+), 158 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index cfc6ae8e2b3..c5e0a4ebbd5 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -167,32 +167,22 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit) ADD_STAT(commitEligibleSamples, statistics::units::Cycle::get(), "number cycles where commit BW limit reached"), ADD_STAT(committedInst, statistics::units::Count::get(), - "Required for Top-Down, number of committed instructions") -{ - using namespace statistics; + "Required for Top-Down, number of committed instructions") { + using namespace statistics; - commitSquashedInsts.prereq(commitSquashedInsts); - commitNonSpecStalls.prereq(commitNonSpecStalls); - branchMispredicts.prereq(branchMispredicts); + commitSquashedInsts.prereq(commitSquashedInsts); + commitNonSpecStalls.prereq(commitNonSpecStalls); + branchMispredicts.prereq(branchMispredicts); - numCommittedDist - .init(0,commit->commitWidth,1) - .flags(statistics::pdf); + numCommittedDist.init(0, commit->commitWidth, 1).flags(statistics::pdf); - amos - .init(cpu->numThreads) - .flags(total); + amos.init(cpu->numThreads).flags(total); - membars - .init(cpu->numThreads) - .flags(total); + membars.init(cpu->numThreads).flags(total); - functionCalls - .init(commit->numThreads) - .flags(total); + functionCalls.init(commit->numThreads).flags(total); - committedInstType - .init(commit->numThreads,enums::Num_OpClass) + committedInstType.init(commit->numThreads, enums::Num_OpClass) .flags(total | pdf | dist); committedInstType.ysubnames(enums::OpClassStrings); diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index f3c8a7a938b..71ef8865779 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -394,32 +394,43 @@ CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) statistics::units::Rate::get(), "Retiring, fraction of slots successfully retired by the backend") { - // L1 - frontendBound = cpu->decode.getStats().fetchBubbles / - (cpu->rename.getWidth() * cpu->baseStats.numCycles); - int recoveryCycleToDecode = cpu->decode.getFetchToDecodeDelay(); + // Total Slots + statistics::Temp totalSlots = + cpu->rename.getWidth() * cpu->baseStats.numCycles; - int recoveryCycleToIEW = cpu->decode.getFetchToDecodeDelay() + - cpu->rename.getDecodeToRenameDelay() + cpu->iew.getRenameToIEWDelay(); + // L1 Frontend Bound + frontendBound = cpu->fetch.getStats().fetchBubbles / (totalSlots); - auto wastedSlots = cpu->rename.getStats().renamedInsts - - cpu->commit.getStats().committedInst; - - auto decodeBranchMispred = (int)recoveryCycleToDecode - * cpu->decode.getStats().branchMispred; + // L1 Bad Speculation + // Recovery cycles for mispredictions detected at Decode + int recoveryCycleToDecode = cpu->decode.getFetchToDecodeDelay(); - auto iewBranchMispred = (int)recoveryCycleToIEW - * cpu->iew.getStats().branchMispredicts; + auto decodeBranchMispred = + (int)recoveryCycleToDecode * cpu->decode.getStats().branchMispred; - badSpeculation = (wastedSlots + (decodeBranchMispred + iewBranchMispred) - * cpu->rename.getWidth()) / - (cpu->rename.getWidth() * cpu->baseStats.numCycles); + // Recovery cycles for mispredictions detected at IEW + int recoveryCycleToIEW = cpu->decode.getFetchToDecodeDelay() + + cpu->rename.getDecodeToRenameDelay() + + cpu->iew.getRenameToIEWDelay(); - retiring = cpu->commit.getStats().committedInst / - (cpu->rename.getWidth() * cpu->baseStats.numCycles); - - backendBound = 1 - (frontendBound + badSpeculation + retiring); + auto iewBadSpec = + (int)recoveryCycleToIEW * (cpu->iew.getStats().branchMispredicts + + cpu->iew.getStats().memOrderViolationEvents); + + // Number of wasted slots due to bad speculation + auto wastedSlots = cpu->rename.getStats().renamedInsts - + cpu->commit.getStats().committedInst; + + badSpeculation = (wastedSlots + (decodeBranchMispred + iewBadSpec) * + cpu->rename.getWidth()) / + (totalSlots); + + // L1 Retiring + retiring = cpu->commit.getStats().committedInst / (totalSlots); + + // L1 Backend Bound + backendBound = 1 - (frontendBound + badSpeculation + retiring); } CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2( @@ -437,37 +448,37 @@ CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2( "inefficiency") { // Frontend L2 fetchLatency = - cpu->decode.getStats().fetchBubblesMax / (cpu->baseStats.numCycles); + cpu->fetch.getStats().fetchBubblesMax / (cpu->baseStats.numCycles); fetchBandwidth = cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; } -CPU::CPUStats::TopDownStats::TopDownBadSpeculationL2 -::TopDownBadSpeculationL2(CPU *cpu) +CPU::CPUStats::TopDownStats::TopDownBadSpeculationL2 ::TopDownBadSpeculationL2( + CPU *cpu) : statistics::Group(cpu, "TopDownL2_BadSpeculation"), ADD_STAT(branchMissPredicts, - statistics::units::Rate::get(), + statistics::units::Rate::get(), "Branch Miss Predicts"), ADD_STAT(machineClears, - statistics::units::Rate::get(), - "Memory Order Violations") -{ - auto &iewMissPred = cpu->iew.getStats().branchMispredicts; - auto &decodeMissPred = cpu->decode.getStats().branchMispred; + statistics::units::Rate::get(), + "Memory Order Violations") { + auto &iewMissPred = cpu->iew.getStats().branchMispredicts; + auto &decodeMissPred = cpu->decode.getStats().branchMispred; + auto &memOrderViolations = cpu->iew.getStats().memOrderViolationEvents; - auto brMispredictFraction = (iewMissPred + decodeMissPred) / (iewMissPred - + decodeMissPred + cpu->iew.getStats().memOrderViolationEvents); + auto brMispredictFraction = + (iewMissPred + decodeMissPred) / + (iewMissPred + decodeMissPred + memOrderViolations); - branchMissPredicts = brMispredictFraction - * cpu->cpuStats.topDownStats.topDownL1.badSpeculation; + branchMissPredicts = brMispredictFraction * + cpu->cpuStats.topDownStats.topDownL1.badSpeculation; - machineClears = cpu->cpuStats.topDownStats.topDownL1.badSpeculation - - branchMissPredicts; + machineClears = + cpu->cpuStats.topDownStats.topDownL1.badSpeculation - branchMissPredicts; } - CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2( CPU *cpu) : statistics::Group(cpu, "TopDownL2_BackendBound"), @@ -486,10 +497,17 @@ CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2( cpu->iew.instQueue.getStats().numInstsExec1 + cpu->iew.instQueue.getStats().numInstsExec2) / (cpu->baseStats.numCycles); - memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles + - cpu->rename.getStats().storeStalls) / - (cpu->baseStats.numCycles); - coreBound = executionStalls - memoryBound; + auto memoryBoundRaw = (cpu->iew.instQueue.getStats().loadStallCycles + + cpu->rename.getStats().storeStalls) / + (cpu->baseStats.numCycles); + auto coreBoundRaw = executionStalls - memoryBoundRaw; + + auto &totalBackendBound = cpu->cpuStats.topDownStats.topDownL1.backendBound; + + memoryBound = + memoryBoundRaw / (memoryBoundRaw + coreBoundRaw) * (totalBackendBound); + coreBound = + coreBoundRaw / (memoryBoundRaw + coreBoundRaw) * (totalBackendBound); } CPU::CPUStats::TopDownStats::TopDownBackendBoundL3::TopDownBackendBoundL3( @@ -515,20 +533,32 @@ CPU::CPUStats::TopDownStats::TopDownBackendBoundL3::TopDownBackendBoundL3( statistics::units::Rate::get(), "Store Bound") { + + auto &totalBackendBound = cpu->cpuStats.topDownStats.topDownBbL2.memoryBound; + // Backend Bound / Memory Bound L3 - l1Bound = (cpu->iew.instQueue.getStats().loadStallCycles - - cpu->iew.instQueue.getStats().L1miss) / - (cpu->baseStats.numCycles); - l2Bound = (cpu->iew.instQueue.getStats().L1miss - - cpu->iew.instQueue.getStats().L2miss) / - (cpu->baseStats.numCycles); - l3Bound = (cpu->iew.instQueue.getStats().L2miss - - cpu->iew.instQueue.getStats().L3miss) / - (cpu->baseStats.numCycles); - extMemBound = + auto l1BoundRaw = (cpu->iew.instQueue.getStats().loadStallCycles - + cpu->iew.instQueue.getStats().L1miss) / + (cpu->baseStats.numCycles); + auto l2BoundRaw = (cpu->iew.instQueue.getStats().L1miss - + cpu->iew.instQueue.getStats().L2miss) / + (cpu->baseStats.numCycles); + auto l3BoundRaw = (cpu->iew.instQueue.getStats().L2miss - + cpu->iew.instQueue.getStats().L3miss) / + (cpu->baseStats.numCycles); + auto extMemBoundRaw = (cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles); - storeBound = + auto storeBoundRaw = (cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); + + auto totalMemoryBound = + l1BoundRaw + l2BoundRaw + l3BoundRaw + extMemBoundRaw + storeBoundRaw; + + l1Bound = l1BoundRaw / totalMemoryBound * totalBackendBound; + l2Bound = l2BoundRaw / totalMemoryBound * totalBackendBound; + l3Bound = l3BoundRaw / totalMemoryBound * totalBackendBound; + extMemBound = extMemBoundRaw / totalMemoryBound * totalBackendBound; + storeBound = storeBoundRaw / totalMemoryBound * totalBackendBound; } void diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index 22491c6286f..13176fc3715 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -589,9 +589,12 @@ Decode::tick() decode(status_change, tid); - stats.fetchBubbles += fetchBubbles; - if (fetchBubbles == decodeWidth) + // Check if branch missprediction is detected while decoding + if (!(decodeStatus[tid] == Squashing)) { + stats.fetchBubbles += fetchBubbles; + if (fetchBubbles == decodeWidth) stats.fetchBubblesMax++; + } } if (status_change) { @@ -664,9 +667,7 @@ Decode::decodeInsts(ThreadID tid) " early.\n",tid); // Should I change the status to idle? ++stats.idleCycles; - // if (timeBuffer->access(tid)->fetchInfo[tid].squash) - // fetchBubbles -= decodeWidth; - // return; + return; } else if (decodeStatus[tid] == Unblocking) { DPRINTF(Decode, "[tid:%i] Unblocking, removing insts from skid " "buffer.\n",tid); diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index c4e76fdaf4b..d51fe29f232 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -159,85 +159,73 @@ Fetch::regProbePoints() Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) : statistics::Group(cpu, "fetch"), - ADD_STAT(predictedBranches, statistics::units::Count::get(), - "Number of branches that fetch has predicted taken"), - ADD_STAT(cycles, statistics::units::Cycle::get(), - "Number of cycles fetch has run and was not squashing or " - "blocked"), - ADD_STAT(squashCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent squashing"), - ADD_STAT(tlbCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting for tlb"), - ADD_STAT(ftqStallCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting for FTQ to fill."), - ADD_STAT(idleCycles, statistics::units::Cycle::get(), - "Number of cycles fetch was idle"), - ADD_STAT(blockedCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent blocked"), - ADD_STAT(miscStallCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting on interrupts, or bad " - "addresses, or out of MSHRs"), - ADD_STAT(pendingDrainCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting on pipes to drain"), - ADD_STAT(noActiveThreadStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to no active thread to fetch from"), - ADD_STAT(pendingTrapStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to pending traps"), - ADD_STAT(pendingQuiesceStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to pending quiesce instructions"), - ADD_STAT(icacheWaitRetryStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to full MSHR"), - ADD_STAT(cacheLines, statistics::units::Count::get(), - "Number of cache lines fetched"), - ADD_STAT(icacheSquashes, statistics::units::Count::get(), - "Number of outstanding Icache misses that were squashed"), - ADD_STAT(tlbSquashes, statistics::units::Count::get(), - "Number of outstanding ITLB misses that were squashed"), - ADD_STAT(nisnDist, statistics::units::Count::get(), - "Number of instructions fetched each cycle (Total)"), - ADD_STAT(idleRate, statistics::units::Ratio::get(), - "Ratio of cycles fetch was idle", - idleCycles / cpu->baseStats.numCycles) -{ - predictedBranches - .prereq(predictedBranches); - cycles - .prereq(cycles); - squashCycles - .prereq(squashCycles); - tlbCycles - .prereq(tlbCycles); - ftqStallCycles - .prereq(ftqStallCycles); - idleCycles - .prereq(idleCycles); - blockedCycles - .prereq(blockedCycles); - cacheLines - .prereq(cacheLines); - miscStallCycles - .prereq(miscStallCycles); - pendingDrainCycles - .prereq(pendingDrainCycles); - noActiveThreadStallCycles - .prereq(noActiveThreadStallCycles); - pendingTrapStallCycles - .prereq(pendingTrapStallCycles); - pendingQuiesceStallCycles - .prereq(pendingQuiesceStallCycles); - icacheWaitRetryStallCycles - .prereq(icacheWaitRetryStallCycles); - icacheSquashes - .prereq(icacheSquashes); - tlbSquashes - .prereq(tlbSquashes); - nisnDist - .init(/* base value */ 0, - /* last value */ fetch->fetchWidth, - /* bucket size */ 1) - .flags(statistics::pdf); - idleRate - .prereq(idleRate); + ADD_STAT(predictedBranches, statistics::units::Count::get(), + "Number of branches that fetch has predicted taken"), + ADD_STAT(cycles, statistics::units::Cycle::get(), + "Number of cycles fetch has run and was not squashing or " + "blocked"), + ADD_STAT(squashCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent squashing"), + ADD_STAT(tlbCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting for tlb"), + ADD_STAT(ftqStallCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting for FTQ to fill."), + ADD_STAT(idleCycles, statistics::units::Cycle::get(), + "Number of cycles fetch was idle"), + ADD_STAT(blockedCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent blocked"), + ADD_STAT(miscStallCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting on interrupts, or bad " + "addresses, or out of MSHRs"), + ADD_STAT(pendingDrainCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting on pipes to drain"), + ADD_STAT(noActiveThreadStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to no active thread to fetch from"), + ADD_STAT(pendingTrapStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to pending traps"), + ADD_STAT(pendingQuiesceStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to pending quiesce instructions"), + ADD_STAT(icacheWaitRetryStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to full MSHR"), + ADD_STAT(cacheLines, statistics::units::Count::get(), + "Number of cache lines fetched"), + ADD_STAT(icacheSquashes, statistics::units::Count::get(), + "Number of outstanding Icache misses that were squashed"), + ADD_STAT(tlbSquashes, statistics::units::Count::get(), + "Number of outstanding ITLB misses that were squashed"), + ADD_STAT(nisnDist, statistics::units::Count::get(), + "Number of instructions fetched each cycle (Total)"), + ADD_STAT(idleRate, statistics::units::Ratio::get(), + "Ratio of cycles fetch was idle", + idleCycles / cpu->baseStats.numCycles), + ADD_STAT(fetchBubbles, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of instructions not " + "delivered to backend"), + ADD_STAT(fetchBubblesMax, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of cycles in which no " + "instructions are delivered to backend") { + predictedBranches.prereq(predictedBranches); + cycles.prereq(cycles); + squashCycles.prereq(squashCycles); + tlbCycles.prereq(tlbCycles); + ftqStallCycles.prereq(ftqStallCycles); + idleCycles.prereq(idleCycles); + blockedCycles.prereq(blockedCycles); + cacheLines.prereq(cacheLines); + miscStallCycles.prereq(miscStallCycles); + pendingDrainCycles.prereq(pendingDrainCycles); + noActiveThreadStallCycles.prereq(noActiveThreadStallCycles); + pendingTrapStallCycles.prereq(pendingTrapStallCycles); + pendingQuiesceStallCycles.prereq(pendingQuiesceStallCycles); + icacheWaitRetryStallCycles.prereq(icacheWaitRetryStallCycles); + icacheSquashes.prereq(icacheSquashes); + tlbSquashes.prereq(tlbSquashes); + nisnDist + .init(/* base value */ 0, + /* last value */ fetch->fetchWidth, + /* bucket size */ 1) + .flags(statistics::pdf); + idleRate.prereq(idleRate); } void Fetch::setTimeBuffer(TimeBuffer *time_buffer) @@ -913,6 +901,20 @@ Fetch::tick() tid_itr = activeThreads->begin(); } + bool backendStall = false; + + for (ThreadID i = 0; i < numThreads; ++i) { + if ((fetchStatus[i] == Squashing) || (stalls[i].decode) || + (fetchStatus[i] == Blocked)) + backendStall = true; + } + + if (!backendStall) { + fetchStats.fetchBubbles += (fetchWidth - insts_to_decode); + if (insts_to_decode == 0) + fetchStats.fetchBubblesMax++; + } + // If there was activity this cycle, inform the CPU of it. if (wroteToTimeBuffer) { DPRINTF(Activity, "Activity this cycle.\n"); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 0f1192d2f97..0ce07064df0 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -591,10 +591,18 @@ class Fetch statistics::Distribution nisnDist; /** Rate of how often fetch was idle. */ statistics::Formula idleRate; + /** Stat for Top-Down Methodology, number of instructions not delivered + * to backend */ + statistics::Scalar fetchBubbles; + /** Stat for Top-Down Methodology, number of cycles in which no + * instructions are delivered to backend */ + statistics::Scalar fetchBubblesMax; } fetchStats; public: const FetchStatGroup &getStats() const { return fetchStats; } + + ThreadStatus getStatus(int tid) { return fetchStatus[tid]; } }; } // namespace o3 diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index f44021a1d61..3b4b463e1d0 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -474,7 +474,7 @@ class IEW /** Average number of woken instructions per writeback. */ statistics::Formula wbFanout; } iewStats; - + public: const IEWStats &getStats() const { return iewStats; }