Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions src/cpu/o3/commit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,13 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit)
ADD_STAT(committedInstType, statistics::units::Count::get(),
"Class of committed instruction"),
ADD_STAT(commitEligibleSamples, statistics::units::Cycle::get(),
"number cycles where commit BW limit reached")
"number cycles where commit BW limit reached"),
ADD_STAT(committedInst, statistics::units::Count::get(),
"Required for Top-Down, number of committed instructions"),
ADD_STAT(recoveryBubblesMissprediction, statistics::units::Cycle::get(),
"Required for Top-Down, recovery bubbles"),
ADD_STAT(recoveryBubblesMemoryNuke, statistics::units::Cycle::get(),
"Required for Top-Down, recovery bubbles")
{
using namespace statistics;

Expand All @@ -191,9 +197,9 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit)

committedInstType
.init(commit->numThreads,enums::Num_OpClass)
.flags(total | pdf | dist);
.flags(total | pdf | dist);

committedInstType.ysubnames(enums::OpClassStrings);
committedInstType.ysubnames(enums::OpClassStrings);
}

void
Expand Down Expand Up @@ -961,6 +967,12 @@ Commit::commitInsts()
DPRINTF(Commit, "Retiring squashed instruction from "
"ROB.\n");

if (!isMissPredicted && !isMemoryViolation) {
stats.numMachineClear++;
isMemoryViolation = true;
recoveryBubbleStart = cpu->curCycle();
}

rob->retireHead(commit_thread);

++stats.commitSquashedInsts;
Expand Down Expand Up @@ -993,6 +1005,22 @@ Commit::commitInsts()
stats.committedInstType[tid][head_inst->opClass()]++;
ppCommit->notify(head_inst);

if (isMissPredicted) {
stats.recoveryBubblesMissprediction +=
uint64_t(cpu->curCycle() - recoveryBubbleStart);
} else if (isMemoryViolation) {
stats.recoveryBubblesMemoryNuke +=
uint64_t(cpu->curCycle() - recoveryBubbleStart);
}

isMemoryViolation = false;
isMissPredicted = false;

if (head_inst->mispredicted()) {
recoveryBubbleStart = cpu->curCycle();
isMissPredicted = true;
}

// hardware transactional memory

// update nesting depth
Expand Down Expand Up @@ -1104,6 +1132,7 @@ Commit::commitInsts()

DPRINTF(CommitRate, "%i\n", num_committed);
stats.numCommittedDist.sample(num_committed);
stats.committedInst += num_committed;

if (num_committed == commitWidth) {
stats.commitEligibleSamples++;
Expand Down
17 changes: 16 additions & 1 deletion src/cpu/o3/commit.hh
Original file line number Diff line number Diff line change
Expand Up @@ -487,10 +487,25 @@ class Commit
statistics::Vector functionCalls;
/** Committed instructions by instruction type (OpClass) */
statistics::Vector2d committedInstType;

/** Number of cycles where the commit bandwidth limit is reached. */
statistics::Scalar commitEligibleSamples;
/** Top-Down Methodology: number of committed instructions. */
statistics::Scalar committedInst;
statistics::Scalar numMachineClear;
/** Top-Down Methodology: recovery bubbles caused by mispredictions. */
statistics::Scalar recoveryBubblesMissprediction;
/** Top Down Methodology, Recovery bubbles, memory nukes */
statistics::Scalar recoveryBubblesMemoryNuke;

} stats;

// Top Down Methodology
Cycles recoveryBubbleStart;
bool isMissPredicted = false;
bool isMemoryViolation = false;

public:
/**
 * Read-only accessor for this stage's statistics group.
 *
 * @return A const reference to the `stats` member.
 */
const CommitStats &
getStats() const
{
    return stats;
}
};

} // namespace o3
Expand Down
150 changes: 149 additions & 1 deletion src/cpu/o3/cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,8 @@ CPU::CPUStats::CPUStats(CPU *cpu)
"to idling"),
ADD_STAT(quiesceCycles, statistics::units::Cycle::get(),
"Total number of cycles that CPU has spent quiesced or waiting "
"for an interrupt")
"for an interrupt"),
topDownStats(cpu)
{
// Register any of the O3CPU's stats here.
timesIdled
Expand All @@ -368,6 +369,153 @@ CPU::CPUStats::CPUStats(CPU *cpu)
.prereq(quiesceCycles);
}

/**
 * Build the "TopDownStats" statistics group and its nested sub-groups.
 *
 * Every sub-group receives the same CPU pointer, which it uses both as its
 * statistics parent and as the source of the counters its formulas read.
 *
 * @param cpu The CPU the group is registered under.
 */
CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu)
    : statistics::Group(cpu, "TopDownStats"),
      topDownL1(cpu),
      topDownFbL2(cpu),
      topDownBbL2(cpu),
      topDownBbMem(cpu)
{
}

CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu)
: statistics::Group(cpu, "TopDownL1"),
ADD_STAT(frontendBound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Frontend Bound, fraction of slots lost due to frontend "
"undersupplying the backend"),
ADD_STAT(badSpeculation,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Bad Speculation, fraction of slots lost due to mispeculation"),
ADD_STAT(backendBound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Backend Bound, fraction of slots lost due to backend resource "
"constraints."),
ADD_STAT(
retiring,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you format it as done above?

statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Retiring, fraction of slots successfully retired by the backend") {
// L1
frontendBound = cpu->decode.getStats().fetchBubbles /
(cpu->rename.getWidth() * cpu->baseStats.numCycles);

badSpeculation = (cpu->rename.getStats().renamedInsts -
cpu->commit.getStats().committedInst +
(cpu->commit.getStats().recoveryBubblesMissprediction +
cpu->commit.getStats().recoveryBubblesMemoryNuke)
* cpu->rename.getWidth()) /
(cpu->rename.getWidth() * cpu->baseStats.numCycles);

retiring = cpu->commit.getStats().committedInst /
(cpu->rename.getWidth() * cpu->baseStats.numCycles);

backendBound = 1 - (frontendBound + badSpeculation + retiring);
}

CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2(
CPU *cpu)
: statistics::Group(cpu, "TopDownL2_FrontendBound"),
ADD_STAT(fetchLatency,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Fetch Latency Bound, frontend stalls due to instruction cache "
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not only the instruction cache but also the TLB and BTB.

"inefficiency"),
ADD_STAT(fetchBandwidth,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Fetch Bandwidth Bound, frontend stalls due to decoder "
"inefficiency") {
// Frontend L2
fetchLatency =
cpu->decode.getStats().fetchBubblesMax / (cpu->baseStats.numCycles);
fetchBandwidth =
cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency;
}

// CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2(CPU
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this not ready yet, or why is it commented out?

// *cpu)
// : statistics::Group(cpu, "TopDownL2_FrontendBound"),
// ADD_STAT(fetchLatency,
// statistics::units::Rate<statistics::units::Count,
// statistics::units::Count>::get(),
// "Fetch Latency Bound, frontend stalls due to instruction cache
// inefficiency"),
// ADD_STAT(fetchBandwidth,
// statistics::units::Rate<statistics::units::Count,
// statistics::units::Count>::get(),
// "Fetch Bandwidth Bound, frontend stalls due to decoder
// inefficiency")
// {
// // Frontend L2
// fetchLatency = cpu->decode.getStats().fetchBubblesMax /
// (cpu->baseStats.numCycles); fetchBandwidth =
// cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency;
// }

CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2(
CPU *cpu)
: statistics::Group(cpu, "TopDownL2_BackendBound"),
ADD_STAT(memoryBound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Memory Bound, backend stalls due to memory subsystem"),
ADD_STAT(
coreBound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Core Bound, backend stalls due to functional unit constraints") {
// Backend L2
executionStalls = (cpu->iew.instQueue.getStats().numInstsExec0 -
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not properly indented. It should be consistent: 4 spaces.
The same applies to the lines below and to a lot of other places.

cpu->rename.getStats().idleCycles +
cpu->iew.instQueue.getStats().numInstsExec1 +
cpu->iew.instQueue.getStats().numInstsExec2) /
(cpu->baseStats.numCycles);
// memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles +
// cpu->rename.getStats().SQFullEvents) / (cpu->baseStats.numCycles);
memoryBound = (cpu->iew.instQueue.getStats().loadStallCycles +
cpu->rename.getStats().storeStalls) /
(cpu->baseStats.numCycles);
coreBound = executionStalls - memoryBound;
}

CPU::CPUStats::TopDownStats::TopDownBackendBoundL3::TopDownBackendBoundL3(
CPU *cpu)
: statistics::Group(cpu, "TopDownL3_BackendBound_MemoryBound"),
ADD_STAT(l1Bound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"L1 Cache Bound"),
ADD_STAT(l2Bound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"L2 Cache Bound"),
ADD_STAT(l3Bound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"L3 Cache Bound"),
ADD_STAT(extMemBound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"External Memory Bound"),
ADD_STAT(storeBound,
statistics::units::Rate<statistics::units::Count,
statistics::units::Count>::get(),
"Store Bound") {
// Backend Bound / Memory Bound L3
l1Bound = (cpu->iew.instQueue.getStats().loadStallCycles -
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation

cpu->iew.instQueue.getStats().L1miss) /
(cpu->baseStats.numCycles);
l2Bound = (cpu->iew.instQueue.getStats().L1miss -
cpu->iew.instQueue.getStats().L2miss) /
(cpu->baseStats.numCycles);
l3Bound = (cpu->iew.instQueue.getStats().L2miss -
cpu->iew.instQueue.getStats().L3miss) /
(cpu->baseStats.numCycles);
extMemBound =
(cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles);
storeBound =
(cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles);
}

void
CPU::tick()
{
Expand Down
42 changes: 42 additions & 0 deletions src/cpu/o3/cpu.hh
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,48 @@ class CPU : public BaseCPU
/** Stat for total number of cycles the CPU spends descheduled due to a
* quiesce operation or waiting for an interrupt. */
statistics::Scalar quiesceCycles;

/**
 * Statistics group holding the Top-Down formulas. Each nested group only
 * declares its formulas here; the expressions are assigned in the
 * corresponding constructors.
 *
 * The member declaration order matters: the TopDownFrontendBoundL2
 * constructor reads topDownL1.frontendBound, so topDownL1 has to be
 * constructed first.
 */
struct TopDownStats : public statistics::Group
{
    TopDownStats(CPU *cpu);

    /** Level 1: the four top-level categories. */
    struct TopDownL1 : public statistics::Group
    {
        TopDownL1(CPU *cpu);

        /** Decode fetch bubbles over (rename width * cycles). */
        statistics::Formula frontendBound;
        /**
         * Renamed-but-not-committed instructions plus recovery-bubble
         * slots, over (rename width * cycles).
         */
        statistics::Formula badSpeculation;
        /** 1 - (frontendBound + badSpeculation + retiring). */
        statistics::Formula backendBound;
        /** Committed instructions over (rename width * cycles). */
        statistics::Formula retiring;
    } topDownL1;

    /** Level 2 breakdown of frontendBound. */
    struct TopDownFrontendBoundL2 : public statistics::Group
    {
        TopDownFrontendBoundL2(CPU *cpu);

        /** Decode fetchBubblesMax over cycles. */
        statistics::Formula fetchLatency;
        /** Level 1 frontendBound minus fetchLatency. */
        statistics::Formula fetchBandwidth;
    } topDownFbL2;

    // Disabled for now: level 2 breakdown of badSpeculation.
    // struct TopDownBadSpeculationL2 : statistics::Group{
    //     TopDownBadSpeculationL2(CPU *cpu);
    //     statistics::Formula branchMissPredicts;
    //     statistics::Formula machineClears;
    // }

    /** Level 2 breakdown of backendBound. */
    struct TopDownBackendBoundL2 : public statistics::Group
    {
        TopDownBackendBoundL2(CPU *cpu);

        /**
         * Intermediate term that coreBound is derived from.
         * NOTE(review): the constructor's initializer list registers only
         * memoryBound and coreBound through ADD_STAT; confirm that leaving
         * this formula unnamed is intended.
         */
        statistics::Formula executionStalls;
        /** (Load stall cycles + store stalls) over cycles. */
        statistics::Formula memoryBound;
        /** executionStalls minus memoryBound. */
        statistics::Formula coreBound;
    } topDownBbL2;

    /** Level 3 breakdown of memoryBound by memory hierarchy level. */
    struct TopDownBackendBoundL3 : public statistics::Group
    {
        TopDownBackendBoundL3(CPU *cpu);

        /** (Load stall cycles - L1miss) over cycles. */
        statistics::Formula l1Bound;
        /** (L1miss - L2miss) over cycles. */
        statistics::Formula l2Bound;
        /** (L2miss - L3miss) over cycles. */
        statistics::Formula l3Bound;
        /** L3miss over cycles. */
        statistics::Formula extMemBound;
        /** Rename store stalls over cycles. */
        statistics::Formula storeBound;
    } topDownBbMem;

} topDownStats;

} cpuStats;

public:
Expand Down
44 changes: 30 additions & 14 deletions src/cpu/o3/decode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,18 +158,25 @@ Decode::DecodeStats::DecodeStats(CPU *cpu)
ADD_STAT(decodedInsts, statistics::units::Count::get(),
"Number of instructions handled by decode"),
ADD_STAT(squashedInsts, statistics::units::Count::get(),
"Number of squashed instructions handled by decode")
{
idleCycles.prereq(idleCycles);
blockedCycles.prereq(blockedCycles);
runCycles.prereq(runCycles);
unblockCycles.prereq(unblockCycles);
squashCycles.prereq(squashCycles);
branchResolved.prereq(branchResolved);
branchMispred.prereq(branchMispred);
controlMispred.prereq(controlMispred);
decodedInsts.prereq(decodedInsts);
squashedInsts.prereq(squashedInsts);
"Number of squashed instructions handled by decode"),
ADD_STAT(fetchBubbles, statistics::units::Count::get(),
"Stat for Top-Down Methodology, number of instructions not "
"delivered to backend"),
ADD_STAT(fetchBubblesMax, statistics::units::Count::get(),
"Stat for Top-Down Methodology, number of cycles in which no "
"instructions are delivered to backend") {
idleCycles.prereq(idleCycles);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation

blockedCycles.prereq(blockedCycles);
runCycles.prereq(runCycles);
unblockCycles.prereq(unblockCycles);
squashCycles.prereq(squashCycles);
branchResolved.prereq(branchResolved);
branchMispred.prereq(branchMispred);
controlMispred.prereq(controlMispred);
decodedInsts.prereq(decodedInsts);
squashedInsts.prereq(squashedInsts);
fetchBubbles.prereq(fetchBubbles);
fetchBubblesMax.prereq(fetchBubblesMax);
}

void
Expand Down Expand Up @@ -565,6 +572,8 @@ Decode::tick()

toRenameIndex = 0;

fetchBubbles = decodeWidth;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks incorrect.
Fetch bubbles are slots that were not delivered; this way it is always the full width.

Edit: I understand now what you are doing, but I find it very unintuitive. Actually, the variable is redundant.
Why not simply use insts_available? That is actually your fetch bubbles. If it is zero, you increment fetchBandwidth.


list<ThreadID>::iterator threads = activeThreads->begin();
list<ThreadID>::iterator end = activeThreads->end();

Expand All @@ -578,6 +587,10 @@ Decode::tick()
status_change = checkSignalsAndUpdate(tid) || status_change;

decode(status_change, tid);

stats.fetchBubbles += fetchBubbles;
if (fetchBubbles == decodeWidth)
stats.fetchBubblesMax++;
}

if (status_change) {
Expand All @@ -602,9 +615,11 @@ Decode::decode(bool &status_change, ThreadID tid)
// check if stall conditions have passed

if (decodeStatus[tid] == Blocked) {
++stats.blockedCycles;
fetchBubbles -= decodeWidth;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation

++stats.blockedCycles;
} else if (decodeStatus[tid] == Squashing) {
++stats.squashCycles;
fetchBubbles -= decodeWidth;
++stats.squashCycles;
}

// Decode should try to decode as many instructions as its bandwidth
Expand Down Expand Up @@ -702,6 +717,7 @@ Decode::decodeInsts(ThreadID tid)
++toRenameIndex;
++stats.decodedInsts;
--insts_available;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same code appears a few lines above. Shouldn't you also put it there?

--fetchBubbles;

#if TRACING_ON
if (debug::O3PipeView) {
Expand Down
Loading