2 changes: 2 additions & 0 deletions include/ddprof_worker_context.hpp
@@ -6,6 +6,7 @@
#pragma once

#include "live_allocation.hpp"
#include "live_sysallocations.hpp"
#include "pevent.hpp"
#include "proc_status.hpp"

@@ -38,4 +39,5 @@ struct DDProfWorkerContext {
uint32_t count_worker; // exports since last cache clear
std::array<uint64_t, MAX_TYPE_WATCHER> lost_events_per_watcher;
ddprof::LiveAllocation live_allocation;
ddprof::SystemAllocation sys_allocation;
};
98 changes: 98 additions & 0 deletions include/live_sysallocations.hpp
@@ -0,0 +1,98 @@
#pragma once

#include "ddprof_defs.hpp"
#include "logger.hpp"
#include "unwind_output.hpp"

#include <unordered_map>
#include <unordered_set>

#include <signal.h>

namespace ddprof {

class SystemAllocation {
private:
// Round a up to the next 4 KiB boundary, then shift down to a page index
template <class T> T to_page(T a) {
return ((a + T{4095ull}) & (~T{4095ull})) >> T{12ull};
}

public:
void add_allocs(const UnwindOutput &stack, uintptr_t addr, size_t size,
pid_t pid) {
StackMap &stack_map = _pid_map[pid];

// Convert [addr, addr + size) to a half-open page-index range; to_page
// rounds up, so page_end is one past the last page touched
uintptr_t page_start = to_page(addr);
uintptr_t page_end = to_page(addr + size);

for (auto i = page_start; i < page_end; ++i) {
stack_map[i] = stack;
}
_visited_recently.insert(pid);
}

void move_allocs(uintptr_t addr0, uintptr_t addr1, size_t size, pid_t pid) {
StackMap &stack_map = _pid_map[pid];

// Convert addr to page idx
uintptr_t page_start_0 = to_page(addr0);
uintptr_t page_end_0 = to_page(addr0 + size);
uintptr_t page_start_1 = to_page(addr1);
uintptr_t page_idx_max = page_end_0 - page_start_0;

// Copy then erase page by page: if the source and destination ranges
// could overlap, a bulk erase at the end would drop freshly moved entries
for (uintptr_t i = 0; i < page_idx_max; ++i) {
stack_map[page_start_1 + i] = stack_map[page_start_0 + i];
stack_map.erase(page_start_0 + i);
}
_visited_recently.insert(pid);
}

void del_allocs(uintptr_t addr, size_t size, pid_t pid) {
StackMap &stack_map = _pid_map[pid];

// Convert [addr, addr + size) to the same half-open page-index range
// used by add_allocs
uintptr_t page_start = to_page(addr);
uintptr_t page_end = to_page(addr + size);

for (auto i = page_start; i < page_end; ++i) {
stack_map.erase(i);
}
_visited_recently.insert(pid);
}

void do_mmap(const UnwindOutput &stack, uintptr_t addr, size_t size,
pid_t pid) {
add_allocs(stack, addr, size, pid);
}

void do_munmap(uintptr_t addr, size_t size, pid_t pid) {
del_allocs(addr, size, pid);
}

void do_madvise(uintptr_t addr, size_t size, int flags, pid_t pid) {
// Ignored for now: madvise only affects RSS, not the virtual allocations
// tracked here
}

void do_mremap(const UnwindOutput &stack, uintptr_t addr0, uintptr_t addr1,
size_t size0, size_t size1, pid_t pid) {
// The moved pages could be attributed to the original mmap stack or to
// the mremap stack; we choose the latter for now. Note that when
// addr0 == addr1 we delete and re-add the same pages, duplicating work.
del_allocs(addr0, size0, pid);
add_allocs(stack, addr1, size1, pid);
}

void clear_pid(pid_t pid) { _pid_map.erase(pid); }

using StackMap = std::unordered_map<uintptr_t, UnwindOutput>;
using PidMap = std::unordered_map<pid_t, StackMap>;

PidMap _pid_map;
std::unordered_set<pid_t> _visited_recently;
int watcher_pos = -1; // index into ctx->watchers, set in ddprof_context_set
};

} // namespace ddprof
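As a sanity check on the to_page arithmetic above, here is a standalone sketch, assuming the 4 KiB pages implied by the hardcoded 4095/12 constants: to_page rounds up to the next page boundary before shifting, so a page-aligned address maps to its own index and [addr, addr + size) corresponds to the half-open page range [to_page(addr), to_page(addr + size)).

#include <cassert>
#include <cstdint>

// Mirrors SystemAllocation::to_page: round up to the next 4 KiB boundary,
// then shift down to a page index.
constexpr uintptr_t to_page(uintptr_t a) {
  return ((a + uintptr_t{4095}) & ~uintptr_t{4095}) >> 12;
}

int main() {
  assert(to_page(4096) == 1); // aligned address: exact page index
  assert(to_page(4097) == 2); // mid-page address rounds up
  // One page mapped at 4096 covers exactly [1, 2): a single page entry.
  assert(to_page(4096 + 4096) - to_page(4096) == 1);
}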
9 changes: 9 additions & 0 deletions include/perf_watcher.hpp
@@ -24,6 +24,9 @@ struct PerfWatcherOptions {
uint8_t nb_frames_to_skip; // number of bottom frames to skip in stack trace
// (useful for allocation profiling to remove
// frames belonging to lib_ddprofiling.so)
bool is_overloaded; // Isn't actually needed, but makes it clear from this
// file that additional state is injected into the
// watcher in ddprof_cmdline.cc
};

struct PerfWatcher {
@@ -83,6 +86,7 @@ enum DDProfTypeId { kDDPROF_TYPE_CUSTOM = PERF_TYPE_MAX + 100 };

enum DDProfCustomCountId {
kDDPROF_COUNT_ALLOCATIONS = 0,
kDDPROF_COUNT_SYSALLOCATIONS,
};

// Kernel events are necessary to get a full accounting of CPU
@@ -106,6 +110,9 @@ enum DDProfCustomCountId {
#define SKIP_FRAMES \
{ .nb_frames_to_skip = NB_FRAMES_TO_SKIP }

#define IS_OVERLOADED \
{ .is_overloaded = true }

// Whereas tracepoints are dynamically configured and can be checked at runtime,
// we lack the ability to inspect events of type other than TYPE_TRACEPOINT.
// Accordingly, we maintain a list of events, even though the type of these
@@ -134,6 +141,8 @@ enum DDProfCustomCountId {
X(sALGN, "Align. Faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS, 99, DDPROF_PWT_TRACEPOINT, IS_FREQ) \
X(sEMU, "Emu. Faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS, 99, DDPROF_PWT_TRACEPOINT, IS_FREQ) \
X(sDUM, "Dummy", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY, 1, DDPROF_PWT_NOCOUNT, {}) \
X(tALLOCSYS1, "System Allocations", PERF_TYPE_TRACEPOINT, kDDPROF_COUNT_SYSALLOCATIONS, 1, DDPROF_PWT_ALLOC_SPACE, IS_OVERLOADED) \
X(tALLOCSYS2, "System Al. (heavy)", PERF_TYPE_TRACEPOINT, kDDPROF_COUNT_SYSALLOCATIONS, 1, DDPROF_PWT_ALLOC_SPACE, IS_OVERLOADED) \
X(sALLOC, "Allocations", kDDPROF_TYPE_CUSTOM, kDDPROF_COUNT_ALLOCATIONS, 524288, DDPROF_PWT_ALLOC_SPACE, SKIP_FRAMES)

// clang-format on
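For readers new to this pattern: the block above is an X-macro table, where each X(...) row carries (short name, display name, perf type, config, period, profile type, options). A hedged sketch of one typical expansion follows; the wrapper macro name here is an assumption, not the repo's real one, but the DDPROF_PWE_##name pattern matches the identifiers used in ddprof_cmdline.cc.

// Illustrative stand-in for the real event table above
#define EVENT_TABLE_SKETCH(X) \
  X(sDUM, "Dummy", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY, 1, DDPROF_PWT_NOCOUNT, {})

// Each row becomes a DDPROF_PWE_* enumerator, which is how names like
// DDPROF_PWE_tALLOCSYS1 arise. Only the first column is used here, so the
// other arguments need not be defined symbols.
#define X_ENUM(name, desc, type, config, period, pwt, opts) DDPROF_PWE_##name,
enum DDProfWatcherEventSketch { EVENT_TABLE_SKETCH(X_ENUM) DDPROF_PWE_SKETCH_LENGTH };
#undef X_ENUM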
27 changes: 26 additions & 1 deletion src/ddprof_cmdline.cc
@@ -131,10 +131,35 @@ bool watcher_from_str(const char *str, PerfWatcher *watcher) {
watcher->tracepoint_group = conf->groupname;
watcher->tracepoint_label = conf->label;

// The allocation watcher has an extra field to ensure we capture the address
// Certain watcher configs get additional event information
if (watcher->config == kDDPROF_COUNT_ALLOCATIONS) {
watcher->sample_type |= PERF_SAMPLE_ADDR;
}

// Some profiling types get lots of additional state transplanted here
if (watcher->options.is_overloaded) {
if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS1) {
// tALLOCSYS1 overrides perfopen to bind together many file descriptors
watcher->tracepoint_group = "syscalls";
watcher->tracepoint_label = "sys_exit_mmap";
watcher->instrument_self = true;
watcher->sample_stack_size /= 2; // Make this one smaller than normal

} else if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS2) {
// tALLOCSYS2 captures all syscalls; used to troubleshoot tALLOCSYS1
watcher->tracepoint_group = "raw_syscalls";
watcher->tracepoint_label = "sys_exit";
long id = ddprof::tracepoint_get_id("raw_syscalls", "sys_exit");
if (-1 == id) {
// We mutated the user's event, but it is invalid.
return false;
}
watcher->config = id;
}
watcher->sample_type |= PERF_SAMPLE_RAW;
watcher->options.use_kernel = PerfWatcherUseKernel::kTry;
}

return true;
}
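The tALLOCSYS2 branch leans on ddprof::tracepoint_get_id resolving raw_syscalls:sys_exit to a perf config value. Below is a minimal sketch of how such a lookup commonly works on Linux; the tracefs path and function shape are assumptions, not the repo's implementation.

#include <cstdio>
#include <string>

// Hypothetical stand-in for ddprof::tracepoint_get_id: read the numeric
// event id from tracefs, returning -1 on failure (matching the -1 check in
// watcher_from_str above).
long tracepoint_get_id_sketch(const std::string &group,
                              const std::string &label) {
  std::string path =
      "/sys/kernel/tracing/events/" + group + "/" + label + "/id";
  FILE *f = std::fopen(path.c_str(), "r");
  if (!f)
    return -1;
  long id = -1;
  if (std::fscanf(f, "%ld", &id) != 1)
    id = -1;
  std::fclose(f);
  return id;
}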
10 changes: 10 additions & 0 deletions src/ddprof_context_lib.cc
@@ -120,6 +120,16 @@ DDRes ddprof_context_set(DDProfInput *input, DDProfContext *ctx) {
}
ctx->num_watchers = nwatchers;

// Some profiling features, like system allocations, use ctx storage and
// need to associate exactly one watcher with that storage.
for (int i = 0; i < ctx->num_watchers; ++i) {
if (ctx->watchers[i].ddprof_event_type == DDPROF_PWE_tALLOCSYS1 ||
ctx->watchers[i].ddprof_event_type == DDPROF_PWE_tALLOCSYS2) {
ctx->worker_ctx.sys_allocation.watcher_pos = i;
break;
}
}

// Set defaults
ctx->params.upload_period = 60.0;

135 changes: 134 additions & 1 deletion src/ddprof_worker.cc
@@ -346,6 +346,77 @@ DDRes ddprof_pr_sample(DDProfContext *ctx, perf_event_sample *sample,
return {};
}

DDRes ddprof_pr_sysallocation_tracking(DDProfContext *ctx,
perf_event_sample *sample,
int watcher_pos, bool &clear_pid) {
clear_pid = false;
// Syscall parameters. The [[maybe_unused]] markers below suppress warnings:
// these register mappings are annoying to look up, and spinning out a new
// header just for this doesn't seem warranted.
// The raw tracepoint payload starts with an 8-byte common header; the
// syscall id follows it
int64_t id;
memcpy(&id, sample->data_raw + 8, sizeof(id));
auto &sysalloc = ctx->worker_ctx.sys_allocation;

#ifdef __x86_64__
[[maybe_unused]] uint64_t sc_ret = sample->regs[PAM_X86_RAX];
[[maybe_unused]] uint64_t sc_p1 = sample->regs[PAM_X86_RDI];
[[maybe_unused]] uint64_t sc_p2 = sample->regs[PAM_X86_RSI];
[[maybe_unused]] uint64_t sc_p3 = sample->regs[PAM_X86_RDX];
[[maybe_unused]] uint64_t sc_p4 = sample->regs[PAM_X86_R10];
[[maybe_unused]] uint64_t sc_p5 = sample->regs[PAM_X86_R8];
[[maybe_unused]] uint64_t sc_p6 = sample->regs[PAM_X86_R9];
#elif __aarch64__
// ARM is broken here: at the sys_exit tracepoint, x0 already holds the
// return value, so the syscall's original first argument is clobbered.
[[maybe_unused]] uint64_t sc_ret = sample->regs[PAM_ARM_X0];
[[maybe_unused]] uint64_t sc_p1 = sample->regs[PAM_ARM_X0];
[[maybe_unused]] uint64_t sc_p2 = sample->regs[PAM_ARM_X1];
[[maybe_unused]] uint64_t sc_p3 = sample->regs[PAM_ARM_X2];
[[maybe_unused]] uint64_t sc_p4 = sample->regs[PAM_ARM_X3];
[[maybe_unused]] uint64_t sc_p5 = sample->regs[PAM_ARM_X4];
[[maybe_unused]] uint64_t sc_p6 = sample->regs[PAM_ARM_X5];
#else
# error Architecture not supported
#endif
if (sc_ret > -4096UL) {
// If the syscall returned an error, it didn't mutate state. Skip!
// (Linux returns -errno; viewed as unsigned, the "high" values are errors)
return ddres_init();
}

// Only unwind if we will need to propagate unwinding information forward
DDRes res = {};
UnwindOutput *uwo = NULL;
// Unwind only for the syscalls whose stacks we keep: mmap (9), mremap (25)
if (id == 9 || id == 25) {
auto ticks0 = ddprof::get_tsc_cycles();
res = ddprof_unwind_sample(ctx, sample, watcher_pos);
auto unwind_ticks = ddprof::get_tsc_cycles();
ddprof_stats_add(STATS_UNWIND_AVG_TIME, unwind_ticks - ticks0, NULL);
uwo = &ctx->worker_ctx.us->output;

// TODO: propagate fatal
if (IsDDResFatal(res)) {
return ddres_init();
}
}

// Hardcoded x86_64 syscall numbers; note that the aarch64 numbering
// differs, which is part of why ARM is broken above
if (id == 9) {
sysalloc.do_mmap(*uwo, sc_ret, sc_p2, sample->pid);
} else if (id == 11) {
sysalloc.do_munmap(sc_p1, sc_p2, sample->pid);
} else if (id == 28) {
// madvise: deliberately unhandled (see do_madvise)
} else if (id == 25) {
// mremap: the old range (sc_p1, old size sc_p2) is released; the new
// range starts at the returned address with size sc_p3
sysalloc.do_mremap(*uwo, sc_p1, sc_ret, sc_p2, sc_p3, sample->pid);
} else if (id == 60 || id == 231 || id == 59 || id == 322 || id == 520 ||
id == 545) {
// Erase on exit or exec: exit (60), exit_group (231), execve (59),
// execveat (322), and the x32 exec variants (520, 545)
clear_pid = true;
}

return ddres_init();
}
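The sc_ret > -4096UL test above relies on the Linux convention that syscall return values in [-4095, -1], viewed as unsigned, are -errno error codes. A small standalone illustration:

#include <cassert>
#include <cstdint>

// Linux syscalls report failure by returning -errno; reinterpreted as
// uint64_t, those are exactly the values above -4096UL.
constexpr bool syscall_failed(uint64_t ret) { return ret > -4096UL; }

int main() {
  assert(syscall_failed(static_cast<uint64_t>(-12))); // -ENOMEM: error
  assert(!syscall_failed(0x7f1234560000));            // plausible mmap address
  assert(!syscall_failed(0));                         // success
}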

static void ddprof_reset_worker_stats() {
for (unsigned i = 0; i < std::size(s_cycled_stats); ++i) {
ddprof_stats_clear(s_cycled_stats[i]);
@@ -438,18 +509,63 @@ static DDRes aggregate_live_allocations(DDProfContext *ctx) {
return ddres_init();
}

static DDRes aggregate_sys_allocation_stack(const UnwindOutput *uw_output,
const SymbolHdr *symbol_hdr,
const PerfWatcher *watcher,
DDProfPProf *pprof) {
DDRES_CHECK_FWD(pprof_aggregate(uw_output, *symbol_hdr, get_page_size(), 1,
watcher, pprof));
return ddres_init();
}

static DDRes aggregate_sys_allocations_for_pid(DDProfContext *ctx, pid_t pid) {
struct UnwindState *us = ctx->worker_ctx.us;
SystemAllocation &sysallocs = ctx->worker_ctx.sys_allocation;
PerfWatcher *watcher = &ctx->watchers[sysallocs.watcher_pos];
int i_export = ctx->worker_ctx.i_current_pprof;
DDProfPProf *pprof = ctx->worker_ctx.pprof[i_export];
auto it = sysallocs._pid_map.find(pid);
if (it == sysallocs._pid_map.end()) {
return ddres_init(); // nothing recorded for this pid
}
const auto &stack_map = it->second;
for (const auto &page : stack_map) {
DDRES_CHECK_FWD(aggregate_sys_allocation_stack(
&page.second, &us->symbol_hdr, watcher, pprof));
}
return ddres_init();
}

static DDRes aggregate_sys_allocations(DDProfContext *ctx) {
struct UnwindState *us = ctx->worker_ctx.us;
SystemAllocation &sysallocs = ctx->worker_ctx.sys_allocation;
PerfWatcher *watcher = &ctx->watchers[sysallocs.watcher_pos];
int i_export = ctx->worker_ctx.i_current_pprof;
DDProfPProf *pprof = ctx->worker_ctx.pprof[i_export];

// Iterate through each PID's page map
for (auto &pid_entry : sysallocs._pid_map) {
// One aggregation call per page for now
// TODO: coalesce runs of pages into ranges; once per page is too much
// (see the sketch after this function)
for (const auto &page : pid_entry.second) {
DDRES_CHECK_FWD(aggregate_sys_allocation_stack(
&page.second, &us->symbol_hdr, watcher, pprof));
}
}
return ddres_init();
}
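One possible shape for the TODO in the loop above: sort the page indices, then emit one aggregation per run of consecutive pages that share a stack, scaling the value by the run length. A sketch under those assumptions; the equality predicate is supplied by the caller, since no comparison over UnwindOutput is defined here.

#include <algorithm>
#include <cstdint>
#include <vector>

// Calls emit(first_page, n_pages, stack) once per run of consecutive page
// indices whose stacks compare equal under same_stack. The caller would
// then aggregate with value get_page_size() * n_pages instead of once per
// page.
template <class StackMap, class Emit, class Eq>
void for_each_page_run(const StackMap &m, Emit &&emit, Eq &&same_stack) {
  std::vector<uintptr_t> pages;
  pages.reserve(m.size());
  for (const auto &kv : m)
    pages.push_back(kv.first);
  std::sort(pages.begin(), pages.end()); // unordered_map iterates unordered
  for (size_t i = 0; i < pages.size();) {
    size_t j = i + 1;
    while (j < pages.size() && pages[j] == pages[j - 1] + 1 &&
           same_stack(m.at(pages[j]), m.at(pages[i])))
      ++j;
    emit(pages[i], j - i, m.at(pages[i]));
    i = j;
  }
}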

static DDRes worker_pid_free(DDProfContext *ctx, pid_t el) {
DDRES_CHECK_FWD(aggregate_sys_allocations_for_pid(ctx, el));
DDRES_CHECK_FWD(aggregate_live_allocations_for_pid(ctx, el));
UnwindState *us = ctx->worker_ctx.us;
unwind_pid_free(us, el);
ctx->worker_ctx.sys_allocation.clear_pid(el);
ctx->worker_ctx.live_allocation.clear_pid(el);
return ddres_init();
}
#else
static DDRes worker_pid_free(DDProfContext *ctx, pid_t el) {
UnwindState *us = ctx->worker_ctx.us;
unwind_pid_free(us, el);
ctx->worker_ctx.sys_allocation.clear_pid(el);
ctx->worker_ctx.live_allocation.clear_pid(el);
return ddres_init();
}
@@ -473,6 +589,7 @@ DDRes ddprof_worker_cycle(DDProfContext *ctx, int64_t now,
DDRES_CHECK_FWD(clear_unvisited_pids(ctx));
#ifndef DDPROF_NATIVE_LIB
DDRES_CHECK_FWD(aggregate_live_allocations(ctx));
DDRES_CHECK_FWD(aggregate_sys_allocations(ctx));

// Take the current pprof contents and ship them to the backend. This also
// clears the pprof for reuse
@@ -738,8 +855,24 @@ DDRes ddprof_worker_process_event(const perf_event_header *hdr, int watcher_pos,
if (wpid->pid) {
uint64_t mask = watcher->sample_type;
perf_event_sample *sample = hdr2samp(hdr, mask);

if (sample) {
DDRES_CHECK_FWD(ddprof_pr_sample(ctx, sample, watcher_pos));
// Handle special profiling types first
if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS1 ||
watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS2) {
// For now, the mmap/munmap syscall watchers take a dedicated path
bool clear_pid = false;
DDRES_CHECK_FWD(ddprof_pr_sysallocation_tracking(
ctx, sample, watcher_pos, clear_pid));
if (clear_pid) {
LG_DBG("<%d>(SYSEXIT)%d", watcher_pos, wpid->pid);
// We could clear the pid here, but other event types for this
// pid may still arrive
}
} else {
DDRES_CHECK_FWD(ddprof_pr_sample(ctx, sample, watcher_pos));
}
}
}
break;