diff --git a/include/ddprof_worker_context.hpp b/include/ddprof_worker_context.hpp
index f7c969ace..344386c5a 100644
--- a/include/ddprof_worker_context.hpp
+++ b/include/ddprof_worker_context.hpp
@@ -6,6 +6,7 @@
 #pragma once
 
 #include "live_allocation.hpp"
+#include "live_sysallocations.hpp"
 #include "pevent.hpp"
 #include "proc_status.hpp"
 
@@ -38,4 +39,5 @@ struct DDProfWorkerContext {
   uint32_t count_worker; // exports since last cache clear
   std::array lost_events_per_watcher;
   ddprof::LiveAllocation live_allocation;
+  ddprof::SystemAllocation sys_allocation;
 };
diff --git a/include/live_sysallocations.hpp b/include/live_sysallocations.hpp
new file mode 100644
index 000000000..b6bde8896
--- /dev/null
+++ b/include/live_sysallocations.hpp
@@ -0,0 +1,98 @@
+#pragma once
+
+#include "ddprof_defs.hpp"
+#include "logger.hpp"
+#include "unwind_output.hpp"
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include <sys/types.h>
+
+namespace ddprof {
+
+class SystemAllocation {
+private:
+  template <typename T> T to_page(T a) {
+    return ((a + T{4095ull}) & (~T{4095ull})) >> T{12ull};
+  }
+
+public:
+  void add_allocs(const UnwindOutput &stack, uintptr_t addr, size_t size,
+                  pid_t pid) {
+    StackMap &stack_map = _pid_map[pid];
+
+    // Convert addr to page idx, then page-align size and decimate
+    uintptr_t page_start = to_page(addr);
+    uintptr_t page_end = to_page(addr + size);
+
+    for (auto i = page_start; i <= page_end; ++i) {
+      stack_map[i] = stack;
+    }
+    _visited_recently.insert(pid);
+  }
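+
+  // Worked example of the page math above (hypothetical values, 4 KiB pages):
+  //   add_allocs(stack, 0x1000, 5, pid) yields
+  //     page_start = to_page(0x1000) == 1  // 0x1000 is already page-aligned
+  //     page_end   = to_page(0x1005) == 2  // rounded up to 0x2000
+  //   so `stack` is recorded for page indices 1 and 2.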
+
+  void move_allocs(uintptr_t addr0, uintptr_t addr1, size_t size, pid_t pid) {
+    StackMap &stack_map = _pid_map[pid];
+
+    // Convert addr to page idx
+    uintptr_t page_start_0 = to_page(addr0);
+    uintptr_t page_end_0 = to_page(addr0 + size);
+    uintptr_t page_start_1 = to_page(addr1);
+    uintptr_t page_idx_max = page_end_0 - page_start_0;
+
+    // Can ranges overlap? Better not try to delete them all at the end...
+    // (inclusive bound keeps this consistent with add_allocs/del_allocs)
+    for (uintptr_t i = 0; i <= page_idx_max; ++i) {
+      stack_map[page_start_1 + i] = stack_map[page_start_0 + i];
+      stack_map.erase(page_start_0 + i);
+    }
+    _visited_recently.insert(pid);
+  }
+
+  void del_allocs(uintptr_t addr, size_t size, pid_t pid) {
+    StackMap &stack_map = _pid_map[pid];
+
+    // Convert addr to page idx, then page-align size and decimate
+    uintptr_t page_start = to_page(addr);
+    uintptr_t page_end = to_page(addr + size);
+
+    for (auto i = page_start; i <= page_end; ++i) {
+      stack_map.erase(i);
+    }
+    _visited_recently.insert(pid);
+  }
+
+  void do_mmap(const UnwindOutput &stack, uintptr_t addr, size_t size,
+               pid_t pid) {
+    add_allocs(stack, addr, size, pid);
+  }
+
+  void do_munmap(uintptr_t addr, size_t size, pid_t pid) {
+    del_allocs(addr, size, pid);
+  }
+
+  void do_madvise(uintptr_t addr, size_t size, int flags, pid_t pid) {
+    // No reason to worry about this yet, since it only affects RSS
+  }
+
+  void do_mremap(const UnwindOutput &stack, uintptr_t addr0, uintptr_t addr1,
+                 size_t size0, size_t size1, pid_t pid) {
+    // We could classify these pages as belonging either to the original mmap
+    // or to the mremap. We choose the latter for now.
+    // Note that we potentially duplicate a lot of work here in the case
+    // where addr0 == addr1
+    del_allocs(addr0, size0, pid);
+    add_allocs(stack, addr1, size1, pid);
+  }
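+
+  // A minimal usage sketch (hypothetical addresses; `uo` is an UnwindOutput
+  // previously filled in by the unwinder):
+  //   SystemAllocation sa;
+  //   sa.do_mmap(uo, 0x7f0000000000, 8192, pid);   // track the new pages
+  //   sa.do_mremap(uo, 0x7f0000000000, 0x7f0000100000, 8192, 16384, pid);
+  //   sa.do_munmap(0x7f0000100000, 16384, pid);    // drop the pages
+  //   sa.clear_pid(pid);                           // on exit/exec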
(heavy)", PERF_TYPE_TRACEPOINT, kDDPROF_COUNT_SYSALLOCATIONS, 1, DDPROF_PWT_ALLOC_SPACE, IS_OVERLOADED) \ X(sALLOC, "Allocations", kDDPROF_TYPE_CUSTOM, kDDPROF_COUNT_ALLOCATIONS, 524288, DDPROF_PWT_ALLOC_SPACE, SKIP_FRAMES) // clang-format on diff --git a/src/ddprof_cmdline.cc b/src/ddprof_cmdline.cc index 08662cc73..d616d3a82 100644 --- a/src/ddprof_cmdline.cc +++ b/src/ddprof_cmdline.cc @@ -131,10 +131,35 @@ bool watcher_from_str(const char *str, PerfWatcher *watcher) { watcher->tracepoint_group = conf->groupname; watcher->tracepoint_label = conf->label; - // Allocation watcher, has an extra field to ensure we capture address + // Certain watcher configs get additional event information if (watcher->config == kDDPROF_COUNT_ALLOCATIONS) { watcher->sample_type |= PERF_SAMPLE_ADDR; } + // Some profiling types get lots of additional state transplanted here + if (watcher->options.is_overloaded) { + if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS1) { + // tALLOCSY1 overrides perfopen to bind together many file descriptors + watcher->tracepoint_group = "syscalls"; + watcher->tracepoint_label = "sys_exit_mmap"; + watcher->instrument_self = true; + watcher->options.use_kernel = PerfWatcherUseKernel::kTry; + watcher->sample_stack_size /= 2; // Make this one smaller than normal + + } else if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS2) { + // tALLOCSYS2 captures all syscalls; used to troubleshoot 1 + watcher->tracepoint_group = "raw_syscalls"; + watcher->tracepoint_label = "sys_exit"; + long id = ddprof::tracepoint_get_id("raw_syscalls", "sys_exit"); + if (-1 == id) { + // We mutated the user's event, but it is invalid. + return false; + } + watcher->config = id; + } + watcher->sample_type |= PERF_SAMPLE_RAW; + watcher->options.use_kernel = PerfWatcherUseKernel::kTry; + } + return true; } diff --git a/src/ddprof_context_lib.cc b/src/ddprof_context_lib.cc index 37b64c335..9d61a4596 100644 --- a/src/ddprof_context_lib.cc +++ b/src/ddprof_context_lib.cc @@ -120,6 +120,16 @@ DDRes ddprof_context_set(DDProfInput *input, DDProfContext *ctx) { } ctx->num_watchers = nwatchers; + // Some profiling features, like system allocations, uses ctx storage and + // needs to associate a watcher (but only one watcher) to that storage. + for (int i = 0; i < ctx->num_watchers; ++i) { + if (ctx->watchers[i].ddprof_event_type == DDPROF_PWE_tALLOCSYS1 || + ctx->watchers[i].ddprof_event_type == DDPROF_PWE_tALLOCSYS2) { + ctx->worker_ctx.sys_allocation.watcher_pos = i; + break; + } + } + // Set defaults ctx->params.upload_period = 60.0; diff --git a/src/ddprof_worker.cc b/src/ddprof_worker.cc index ceaa9c9ea..534da4e91 100644 --- a/src/ddprof_worker.cc +++ b/src/ddprof_worker.cc @@ -346,6 +346,77 @@ DDRes ddprof_pr_sample(DDProfContext *ctx, perf_event_sample *sample, return {}; } +DDRes ddprof_pr_sysallocation_tracking(DDProfContext *ctx, + perf_event_sample *sample, + int watcher_pos, bool &clear_pid) { + clear_pid = false; + // Syscall parameters. 
+  int64_t id;
+  memcpy(&id, sample->data_raw + 8, sizeof(id));
+  auto &sysalloc = ctx->worker_ctx.sys_allocation;
+
+#ifdef __x86_64__
+  [[maybe_unused]] uint64_t sc_ret = sample->regs[PAM_X86_RAX];
+  [[maybe_unused]] uint64_t sc_p1 = sample->regs[PAM_X86_RDI];
+  [[maybe_unused]] uint64_t sc_p2 = sample->regs[PAM_X86_RSI];
+  [[maybe_unused]] uint64_t sc_p3 = sample->regs[PAM_X86_RDX];
+  [[maybe_unused]] uint64_t sc_p4 = sample->regs[PAM_X86_R10];
+  [[maybe_unused]] uint64_t sc_p5 = sample->regs[PAM_X86_R8];
+  [[maybe_unused]] uint64_t sc_p6 = sample->regs[PAM_X86_R9];
+#elif __aarch64__
+  // Obviously ARM is totally broken here: at syscall exit, x0 holds the
+  // return value, so the original first argument is already gone.
+  [[maybe_unused]] uint64_t sc_ret = sample->regs[PAM_ARM_X0];
+  [[maybe_unused]] uint64_t sc_p1 = sample->regs[PAM_ARM_X0];
+  [[maybe_unused]] uint64_t sc_p2 = sample->regs[PAM_ARM_X1];
+  [[maybe_unused]] uint64_t sc_p3 = sample->regs[PAM_ARM_X2];
+  [[maybe_unused]] uint64_t sc_p4 = sample->regs[PAM_ARM_X3];
+  [[maybe_unused]] uint64_t sc_p5 = sample->regs[PAM_ARM_X4];
+  [[maybe_unused]] uint64_t sc_p6 = sample->regs[PAM_ARM_X5];
+#else
+# error Architecture not supported
+#endif
+  if (sc_ret > -4096UL) {
+    // If the syscall returned an error, it didn't mutate state. Skip!
+    // (values in the top page are negated errnos, per kernel convention)
+    return ddres_init();
+  }
+
+  // Only unwind if we will need to propagate unwinding information forward
+  DDRes res = {};
+  UnwindOutput *uwo = NULL;
+  if (id == 9 || id == 25) { // mmap, mremap
+    auto ticks0 = ddprof::get_tsc_cycles();
+    res = ddprof_unwind_sample(ctx, sample, watcher_pos);
+    auto unwind_ticks = ddprof::get_tsc_cycles();
+    ddprof_stats_add(STATS_UNWIND_AVG_TIME, unwind_ticks - ticks0, NULL);
+    uwo = &ctx->worker_ctx.us->output;
+
+    // TODO: propagate fatal
+    if (IsDDResFatal(res)) {
+      return ddres_init();
+    }
+  }
+
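+  // For reference, the x86_64 numbers dispatched below (a possible cleanup,
+  // sketched here only as a comment, would be to name them):
+  //   enum SysNr : int64_t {
+  //     kMmap = 9, kMunmap = 11, kMremap = 25, kMadvise = 28,
+  //     kExecve = 59, kExit = 60, kExitGroup = 231, kExecveat = 322,
+  //     kX32Execve = 520, kX32Execveat = 545, // x32-ABI table entries
+  //   };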
+  // Hardcoded x86_64 syscall numbers (see the aarch64 caveat above; the
+  // numbers are not the same on other architectures)
+  if (id == 9) { // mmap
+    sysalloc.do_mmap(*uwo, sc_ret, sc_p2, sample->pid);
+  } else if (id == 11) { // munmap
+    sysalloc.do_munmap(sc_p1, sc_p2, sample->pid);
+  } else if (id == 28) { // madvise
+    // Unhandled, no need to handle
+  } else if (id == 25) { // mremap
+    // mremap(old_addr = p1, old_size = p2, new_size = p3, ...) returns the
+    // new address in ret
+    sysalloc.do_mremap(*uwo, sc_p1, sc_ret, sc_p2, sc_p3, sample->pid);
+  } else if (id == 60 || id == 231 || id == 59 || id == 322 || id == 520 ||
+             id == 545) {
+    // Erase upon exit or exec
+    clear_pid = true;
+  }
+
+  return ddres_init();
+}
+
 static void ddprof_reset_worker_stats() {
   for (unsigned i = 0; i < std::size(s_cycled_stats); ++i) {
     ddprof_stats_clear(s_cycled_stats[i]);
@@ -438,11 +509,55 @@ static DDRes aggregate_live_allocations(DDProfContext *ctx) {
   return ddres_init();
 }
 
+DDRes aggregate_sys_allocation_stack(const UnwindOutput *uw_output,
+                                     const SymbolHdr *symbol_hdr,
+                                     const PerfWatcher *watcher,
+                                     DDProfPProf *pprof) {
+  DDRES_CHECK_FWD(pprof_aggregate(uw_output, *symbol_hdr, get_page_size(), 1,
+                                  watcher, pprof));
+  return ddres_init();
+}
+
+static DDRes aggregate_sys_allocations_for_pid(DDProfContext *ctx, pid_t pid) {
+  struct UnwindState *us = ctx->worker_ctx.us;
+  SystemAllocation &sysallocs = ctx->worker_ctx.sys_allocation;
+  if (sysallocs.watcher_pos < 0) {
+    return ddres_init(); // no system-allocation watcher configured
+  }
+  PerfWatcher *watcher = &ctx->watchers[sysallocs.watcher_pos];
+  int i_export = ctx->worker_ctx.i_current_pprof;
+  DDProfPProf *pprof = ctx->worker_ctx.pprof[i_export];
+  auto it = sysallocs._pid_map.find(pid); // find() avoids inserting an
+  if (it == sysallocs._pid_map.end()) {   // empty map for unknown pids
+    return ddres_init();
+  }
+  for (const auto &page : it->second) {
+    DDRES_CHECK_FWD(aggregate_sys_allocation_stack(
+        &page.second, &us->symbol_hdr, watcher, pprof));
+  }
+  return ddres_init();
+}
+
+static DDRes aggregate_sys_allocations(DDProfContext *ctx) {
+  struct UnwindState *us = ctx->worker_ctx.us;
+  SystemAllocation &sysallocs = ctx->worker_ctx.sys_allocation;
+  if (sysallocs.watcher_pos < 0) {
+    return ddres_init(); // no system-allocation watcher configured
+  }
+  PerfWatcher *watcher = &ctx->watchers[sysallocs.watcher_pos];
+  int i_export = ctx->worker_ctx.i_current_pprof;
+  DDProfPProf *pprof = ctx->worker_ctx.pprof[i_export];
+
+  // Iterate through each PID
+  for (auto &pid_entry : sysallocs._pid_map) {
+    // Iterate through pages...
+    // TODO Probably aggregate into ranges of pages, since emitting one
+    // sample per page is just too much
+    for (const auto &page : pid_entry.second) {
+      DDRES_CHECK_FWD(aggregate_sys_allocation_stack(
+          &page.second, &us->symbol_hdr, watcher, pprof));
+    }
+  }
+  return ddres_init();
+}
+
 static DDRes worker_pid_free(DDProfContext *ctx, pid_t el) {
+  DDRES_CHECK_FWD(aggregate_sys_allocations_for_pid(ctx, el));
   DDRES_CHECK_FWD(aggregate_live_allocations_for_pid(ctx, el));
   UnwindState *us = ctx->worker_ctx.us;
   unwind_pid_free(us, el);
+  ctx->worker_ctx.sys_allocation.clear_pid(el);
   ctx->worker_ctx.live_allocation.clear_pid(el);
   return ddres_init();
 }
@@ -450,6 +565,7 @@ static DDRes worker_pid_free(DDProfContext *ctx, pid_t el) {
 static DDRes worker_pid_free(DDProfContext *ctx, pid_t el) {
   UnwindState *us = ctx->worker_ctx.us;
   unwind_pid_free(us, el);
+  ctx->worker_ctx.sys_allocation.clear_pid(el);
   ctx->worker_ctx.live_allocation.clear_pid(el);
   return ddres_init();
 }
@@ -473,6 +589,7 @@ DDRes ddprof_worker_cycle(DDProfContext *ctx, int64_t now,
   DDRES_CHECK_FWD(clear_unvisited_pids(ctx));
 #ifndef DDPROF_NATIVE_LIB
   DDRES_CHECK_FWD(aggregate_live_allocations(ctx));
+  DDRES_CHECK_FWD(aggregate_sys_allocations(ctx));
 
   // Take the current pprof contents and ship them to the backend. This also
   // clears the pprof for reuse
@@ -738,8 +855,24 @@ DDRes ddprof_worker_process_event(const perf_event_header *hdr, int watcher_pos,
     if (wpid->pid) {
       uint64_t mask = watcher->sample_type;
       perf_event_sample *sample = hdr2samp(hdr, mask);
+
       if (sample) {
-        DDRES_CHECK_FWD(ddprof_pr_sample(ctx, sample, watcher_pos));
+        // Handle special profiling types first. For now, the mmap/munmap
+        // syscall watchers take their own path.
+        if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS1 ||
+            watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS2) {
+          bool clear_pid = false;
+          DDRES_CHECK_FWD(ddprof_pr_sysallocation_tracking(
+              ctx, sample, watcher_pos, clear_pid));
+          if (clear_pid) {
+            LG_DBG("<%d>(SYSEXIT)%d", watcher_pos, wpid->pid);
+            // we could consider clearing the pid here, though we could
+            // still get other types of events for it
+          }
+        } else {
+          DDRES_CHECK_FWD(ddprof_pr_sample(ctx, sample, watcher_pos));
+        }
       }
     }
     break;
diff --git a/src/pevent_lib.cc b/src/pevent_lib.cc
index b164c3742..bd48df60f 100644
--- a/src/pevent_lib.cc
+++ b/src/pevent_lib.cc
@@ -65,6 +65,85 @@ static void pevent_set_info(int fd, int attr_idx, PEvent &pevent) {
   pevent.attr_idx = attr_idx;
 }
 
+static void pevent_add_child_fd(int child_fd, PEvent &pevent) {
+  pevent.child_fds[pevent.current_child_fd++] = child_fd;
+}
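+
+// A sketch of what tallocsys1_open (below) produces for a single CPU, with
+// hypothetical fd numbers. The first tracepoint opened becomes the pevent's
+// main fd; the others are parked as child fds (unordered_map iteration order
+// decides which one comes first):
+//   pevent.fd        = 12        // e.g. sys_exit_mmap
+//   pevent.child_fds = {13, 14}  // the remaining sys_exit_* tracepoints
+// Note that perf_event_open is called with group_fd = -1, so these are
+// independent events rather than a perf event group.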
+static DDRes tallocsys1_open(PerfWatcher *watcher, int watcher_idx, pid_t pid,
+                             int num_cpu, PEventHdr *pevent_hdr) {
+  PerfWatcher watcher_copy = *watcher;
+  PEvent *pes = pevent_hdr->pes;
+
+  struct talloc_conf {
+    long id; // tracepoint id, fed into perf_event_attr.config
+    bool enable_userstack;
+  };
+  std::unordered_map<std::string, talloc_conf> tracepoints{
+      {"sys_exit_mmap", {-1, true}},
+      {"sys_exit_munmap", {-1, false}},
+      {"sys_exit_mremap", {-1, true}}};
+
+  // Set the IDs
+  for (auto &tp : tracepoints) {
+    long id = ddprof::tracepoint_get_id("syscalls", tp.first);
+    if (-1 == id) {
+      DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFOPEN,
+                             "Error opening tracefs for tALLOCSYS1 on %s",
+                             tp.first.c_str());
+    }
+    tp.second.id = id;
+  }
+
+  // Iterate over CPUs, opening one fd per tracepoint on each
+  for (int cpu_idx = 0; cpu_idx < num_cpu; ++cpu_idx) {
+    int fd = -1;
+    // Create the pevent which will consolidate this watcher
+    size_t pevent_idx = -1;
+    DDRES_CHECK_FWD(pevent_create(pevent_hdr, watcher_idx, &pevent_idx));
+    perf_event_attr attr = {};
+    std::vector<int> cleanup_fds;
+
+    // This is very imperfect since failure leaves a dangling pevent
+    // TODO
+    for (auto &tp : tracepoints) {
+      watcher_copy.tracepoint_group = "syscalls";
+      watcher_copy.tracepoint_label = tp.first.c_str();
+      watcher_copy.config = tp.second.id;
+
+      // THIS IS WRONG
+      if (tp.second.enable_userstack) {
+        watcher_copy.sample_stack_size = watcher->sample_stack_size;
+      } else {
+        watcher_copy.sample_stack_size = 0;
+      }
+
+      attr = perf_config_from_watcher(&watcher_copy, true);
+      int fd_tmp =
+          perf_event_open(&attr, pid, cpu_idx, -1, PERF_FLAG_FD_CLOEXEC);
+
+      if (-1 == fd_tmp) {
+        for (auto cleanup_fd : cleanup_fds) {
+          close(cleanup_fd);
+        }
+        DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFOPEN,
+                               "Error calling perfopen for tALLOCSYS1 on %s",
+                               tp.first.c_str());
+      }
+      if (-1 != fd) {
+        pevent_add_child_fd(fd_tmp, pes[pevent_idx]);
+      } else {
+        fd = fd_tmp;
+      }
+      cleanup_fds.push_back(fd_tmp);
+    }
+    pevent_hdr->attrs[pevent_hdr->nb_attrs] = attr;
+    pevent_set_info(fd, pes[pevent_idx].attr_idx, pes[pevent_idx]);
+    ++pevent_hdr->nb_attrs;
+  }
+
+  return ddres_init();
+}
+
 static DDRes pevent_register_cpu_0(const PerfWatcher *watcher, int watcher_idx,
                                    pid_t pid, PEventHdr *pevent_hdr,
                                    size_t &pevent_idx) {
@@ -134,9 +213,16 @@ DDRes pevent_open(DDProfContext *ctx, pid_t pid, int num_cpu,
   assert(pevent_hdr->size == 0); // check for previous init
   for (int watcher_idx = 0; watcher_idx < ctx->num_watchers; ++watcher_idx) {
     PerfWatcher *watcher = &ctx->watchers[watcher_idx];
-    if (watcher->type < kDDPROF_TYPE_CUSTOM) {
-      DDRES_CHECK_FWD(pevent_open_all_cpus(
-          &ctx->watchers[watcher_idx], watcher_idx, pid, num_cpu, pevent_hdr));
+    if (watcher->instrument_self) {
+      // Here we inline a lookup for the specific handler, but in reality this
+      // should be defined at the level of the watcher
+      if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS1) {
+        DDRES_CHECK_FWD(
+            tallocsys1_open(watcher, watcher_idx, pid, num_cpu, pevent_hdr));
+      }
+    } else if (watcher->type < kDDPROF_TYPE_CUSTOM) {
+      DDRES_CHECK_FWD(
+          pevent_open_all_cpus(watcher, watcher_idx, pid, num_cpu, pevent_hdr));
     } else {
       // custom event, eg.allocation profiling
       size_t pevent_idx = 0;