2 changes: 2 additions & 0 deletions include/ddprof_worker_context.hpp
@@ -6,6 +6,7 @@
#pragma once

#include "live_allocation.hpp"
#include "live_sysallocations.hpp"
#include "pevent.hpp"
#include "proc_status.hpp"

@@ -38,4 +39,5 @@ struct DDProfWorkerContext {
uint32_t count_worker; // exports since last cache clear
std::array<uint64_t, MAX_TYPE_WATCHER> lost_events_per_watcher;
ddprof::LiveAllocation live_allocation;
ddprof::SystemAllocation sys_allocation;
};
98 changes: 98 additions & 0 deletions include/live_sysallocations.hpp
@@ -0,0 +1,98 @@
#pragma once

#include "ddprof_defs.hpp"
#include "logger.hpp"
#include "unwind_output.hpp"

#include <unordered_map>
#include <unordered_set>

#include <signal.h>

namespace ddprof {

class SystemAllocation {
private:
// Round a up to the next 4 KiB boundary, then shift down to a page index
template <class T> T to_page(T a) {
return ((a + T{4095ull}) & (~T{4095ull})) >> T{12ull};
}

public:
void add_allocs(const UnwindOutput &stack, uintptr_t addr, size_t size,
pid_t pid) {
StackMap &stack_map = _pid_map[pid];

// Convert [addr, addr + size) to a half-open page-index range; to_page
// rounds up, so page_end is one past the last page touched
uintptr_t page_start = to_page(addr);
uintptr_t page_end = to_page(addr + size);

for (auto i = page_start; i < page_end; ++i) {
stack_map[i] = stack;
}
_visited_recently.insert(pid);
}

void move_allocs(uintptr_t addr0, uintptr_t addr1, size_t size, pid_t pid) {
StackMap &stack_map = _pid_map[pid];

// Convert addr to page idx
uintptr_t page_start_0 = to_page(addr0);
uintptr_t page_end_0 = to_page(addr0 + size);
uintptr_t page_start_1 = to_page(addr1);
uintptr_t page_idx_max = page_end_0 - page_start_0;

// Copy then erase page by page: if the source and destination ranges
// could overlap, a bulk erase at the end would drop freshly moved entries
for (uintptr_t i = 0; i < page_idx_max; ++i) {
stack_map[page_start_1 + i] = stack_map[page_start_0 + i];
stack_map.erase(page_start_0 + i);
}
_visited_recently.insert(pid);
}

void del_allocs(uintptr_t addr, size_t size, pid_t pid) {
StackMap &stack_map = _pid_map[pid];

// Convert [addr, addr + size) to the same half-open page-index range
// used by add_allocs
uintptr_t page_start = to_page(addr);
uintptr_t page_end = to_page(addr + size);

for (auto i = page_start; i < page_end; ++i) {
stack_map.erase(i);
}
_visited_recently.insert(pid);
}

void do_mmap(const UnwindOutput &stack, uintptr_t addr, size_t size,
pid_t pid) {
add_allocs(stack, addr, size, pid);
}

void do_munmap(uintptr_t addr, size_t size, pid_t pid) {
del_allocs(addr, size, pid);
}

void do_madvise(uintptr_t addr, size_t size, int flags, pid_t pid) {
// Ignored for now: madvise only affects RSS, not the virtual allocations
// tracked here
}

void do_mremap(const UnwindOutput &stack, uintptr_t addr0, uintptr_t addr1,
size_t size0, size_t size1, pid_t pid) {
// The moved pages could be attributed to the original mmap stack or to
// the mremap stack; we choose the latter for now. Note that when
// addr0 == addr1 we delete and re-add the same pages, duplicating work.
del_allocs(addr0, size0, pid);
add_allocs(stack, addr1, size1, pid);
}

void clear_pid(pid_t pid) { _pid_map.erase(pid); }

using StackMap = std::unordered_map<uintptr_t, UnwindOutput>;
using PidMap = std::unordered_map<pid_t, StackMap>;

PidMap _pid_map;
std::unordered_set<pid_t> _visited_recently;
int watcher_pos = -1; // index into ctx->watchers, set in ddprof_context_set
};

} // namespace ddprof
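As a sanity check on the to_page arithmetic above, here is a standalone sketch, assuming the 4 KiB pages implied by the hardcoded 4095/12 constants: to_page rounds up to the next page boundary before shifting, so a page-aligned address maps to its own index and [addr, addr + size) corresponds to the half-open page range [to_page(addr), to_page(addr + size)).

#include <cassert>
#include <cstdint>

// Mirrors SystemAllocation::to_page: round up to the next 4 KiB boundary,
// then shift down to a page index.
constexpr uintptr_t to_page(uintptr_t a) {
  return ((a + uintptr_t{4095}) & ~uintptr_t{4095}) >> 12;
}

int main() {
  assert(to_page(4096) == 1); // aligned address: exact page index
  assert(to_page(4097) == 2); // mid-page address rounds up
  // One page mapped at 4096 covers exactly [1, 2): a single page entry.
  assert(to_page(4096 + 4096) - to_page(4096) == 1);
}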
9 changes: 9 additions & 0 deletions include/perf_watcher.hpp
@@ -24,6 +24,9 @@ struct PerfWatcherOptions {
uint8_t nb_frames_to_skip; // number of bottom frames to skip in stack trace
// (useful for allocation profiling to remove
// frames belonging to lib_ddprofiling.so)
bool is_overloaded; // Isn't actually needed, but makes it clear from this
// file that additional state is injected into the
// watcher in ddprof_cmdline.cc
};

struct PerfWatcher {
@@ -83,6 +86,7 @@ enum DDProfTypeId { kDDPROF_TYPE_CUSTOM = PERF_TYPE_MAX + 100 };

enum DDProfCustomCountId {
kDDPROF_COUNT_ALLOCATIONS = 0,
kDDPROF_COUNT_SYSALLOCATIONS,
};

// Kernel events are necessary to get a full accounting of CPU
@@ -106,6 +110,9 @@ enum DDProfCustomCountId {
#define SKIP_FRAMES \
{ .nb_frames_to_skip = NB_FRAMES_TO_SKIP }

#define IS_OVERLOADED \
{ .is_overloaded = true }

// Whereas tracepoints are dynamically configured and can be checked at runtime,
// we lack the ability to inspect events of type other than TYPE_TRACEPOINT.
// Accordingly, we maintain a list of events, even though the type of these
@@ -134,6 +141,8 @@ enum DDProfCustomCountId {
X(sALGN, "Align. Faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS, 99, DDPROF_PWT_TRACEPOINT, IS_FREQ) \
X(sEMU, "Emu. Faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS, 99, DDPROF_PWT_TRACEPOINT, IS_FREQ) \
X(sDUM, "Dummy", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY, 1, DDPROF_PWT_NOCOUNT, {}) \
X(tALLOCSYS1, "System Allocations", PERF_TYPE_TRACEPOINT, kDDPROF_COUNT_SYSALLOCATIONS, 1, DDPROF_PWT_ALLOC_SPACE, IS_OVERLOADED) \
X(tALLOCSYS2, "System Al. (heavy)", PERF_TYPE_TRACEPOINT, kDDPROF_COUNT_SYSALLOCATIONS, 1, DDPROF_PWT_ALLOC_SPACE, IS_OVERLOADED) \
X(sALLOC, "Allocations", kDDPROF_TYPE_CUSTOM, kDDPROF_COUNT_ALLOCATIONS, 524288, DDPROF_PWT_ALLOC_SPACE, SKIP_FRAMES)

// clang-format on
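For readers new to this pattern: the block above is an X-macro table, where each X(...) row carries (short name, display name, perf type, config, period, profile type, options). A hedged sketch of one typical expansion follows; the wrapper macro name here is an assumption, not the repo's real one, but the DDPROF_PWE_##name pattern matches the identifiers used in ddprof_cmdline.cc.

// Illustrative stand-in for the real event table above
#define EVENT_TABLE_SKETCH(X) \
  X(sDUM, "Dummy", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY, 1, DDPROF_PWT_NOCOUNT, {})

// Each row becomes a DDPROF_PWE_* enumerator, which is how names like
// DDPROF_PWE_tALLOCSYS1 arise. Only the first column is used here, so the
// other arguments need not be defined symbols.
#define X_ENUM(name, desc, type, config, period, pwt, opts) DDPROF_PWE_##name,
enum DDProfWatcherEventSketch { EVENT_TABLE_SKETCH(X_ENUM) DDPROF_PWE_SKETCH_LENGTH };
#undef X_ENUM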
27 changes: 26 additions & 1 deletion src/ddprof_cmdline.cc
@@ -131,10 +131,35 @@ bool watcher_from_str(const char *str, PerfWatcher *watcher) {
watcher->tracepoint_group = conf->groupname;
watcher->tracepoint_label = conf->label;

// The allocation watcher has an extra field to ensure we capture the address
// Certain watcher configs get additional event information
if (watcher->config == kDDPROF_COUNT_ALLOCATIONS) {
watcher->sample_type |= PERF_SAMPLE_ADDR;
}

// Some profiling types get lots of additional state transplanted here
if (watcher->options.is_overloaded) {
if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS1) {
// tALLOCSYS1 overrides perfopen to bind together many file descriptors
watcher->tracepoint_group = "syscalls";
watcher->tracepoint_label = "sys_exit_mmap";
watcher->instrument_self = true;
watcher->sample_stack_size /= 2; // Make this one smaller than normal

} else if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS2) {
// tALLOCSYS2 captures all syscalls; used to troubleshoot tALLOCSYS1
watcher->tracepoint_group = "raw_syscalls";
watcher->tracepoint_label = "sys_exit";
long id = ddprof::tracepoint_get_id("raw_syscalls", "sys_exit");
if (-1 == id) {
// We mutated the user's event, but it is invalid.
return false;
}
watcher->config = id;
}
watcher->sample_type |= PERF_SAMPLE_RAW;
watcher->options.use_kernel = PerfWatcherUseKernel::kTry;
}

return true;
}
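The tALLOCSYS2 branch leans on ddprof::tracepoint_get_id resolving raw_syscalls:sys_exit to a perf config value. Below is a minimal sketch of how such a lookup commonly works on Linux; the tracefs path and function shape are assumptions, not the repo's implementation.

#include <cstdio>
#include <string>

// Hypothetical stand-in for ddprof::tracepoint_get_id: read the numeric
// event id from tracefs, returning -1 on failure (matching the -1 check in
// watcher_from_str above).
long tracepoint_get_id_sketch(const std::string &group,
                              const std::string &label) {
  std::string path =
      "/sys/kernel/tracing/events/" + group + "/" + label + "/id";
  FILE *f = std::fopen(path.c_str(), "r");
  if (!f)
    return -1;
  long id = -1;
  if (std::fscanf(f, "%ld", &id) != 1)
    id = -1;
  std::fclose(f);
  return id;
}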
10 changes: 10 additions & 0 deletions src/ddprof_context_lib.cc
@@ -120,6 +120,16 @@ DDRes ddprof_context_set(DDProfInput *input, DDProfContext *ctx) {
}
ctx->num_watchers = nwatchers;

// Some profiling features, like system allocations, use ctx storage and
// need to associate exactly one watcher with that storage.
for (int i = 0; i < ctx->num_watchers; ++i) {
if (ctx->watchers[i].ddprof_event_type == DDPROF_PWE_tALLOCSYS1 ||
ctx->watchers[i].ddprof_event_type == DDPROF_PWE_tALLOCSYS2) {
ctx->worker_ctx.sys_allocation.watcher_pos = i;
break;
}
}

// Set defaults
ctx->params.upload_period = 60.0;

135 changes: 134 additions & 1 deletion src/ddprof_worker.cc
@@ -346,6 +346,77 @@ DDRes ddprof_pr_sample(DDProfContext *ctx, perf_event_sample *sample,
return {};
}

DDRes ddprof_pr_sysallocation_tracking(DDProfContext *ctx,
perf_event_sample *sample,
int watcher_pos, bool &clear_pid) {
clear_pid = false;
// Syscall parameters. The [[maybe_unused]] markers below suppress warnings:
// these register mappings are annoying to look up, and spinning out a new
// header just for this doesn't seem warranted.
// The raw tracepoint payload starts with an 8-byte common header; the
// syscall id follows it
int64_t id;
memcpy(&id, sample->data_raw + 8, sizeof(id));
auto &sysalloc = ctx->worker_ctx.sys_allocation;

#ifdef __x86_64__
[[maybe_unused]] uint64_t sc_ret = sample->regs[PAM_X86_RAX];
[[maybe_unused]] uint64_t sc_p1 = sample->regs[PAM_X86_RDI];
[[maybe_unused]] uint64_t sc_p2 = sample->regs[PAM_X86_RSI];
[[maybe_unused]] uint64_t sc_p3 = sample->regs[PAM_X86_RDX];
[[maybe_unused]] uint64_t sc_p4 = sample->regs[PAM_X86_R10];
[[maybe_unused]] uint64_t sc_p5 = sample->regs[PAM_X86_R8];
[[maybe_unused]] uint64_t sc_p6 = sample->regs[PAM_X86_R9];
#elif __aarch64__
// ARM is broken here: at the sys_exit tracepoint, x0 already holds the
// return value, so the syscall's original first argument is clobbered.
[[maybe_unused]] uint64_t sc_ret = sample->regs[PAM_ARM_X0];
[[maybe_unused]] uint64_t sc_p1 = sample->regs[PAM_ARM_X0];
[[maybe_unused]] uint64_t sc_p2 = sample->regs[PAM_ARM_X1];
[[maybe_unused]] uint64_t sc_p3 = sample->regs[PAM_ARM_X2];
[[maybe_unused]] uint64_t sc_p4 = sample->regs[PAM_ARM_X3];
[[maybe_unused]] uint64_t sc_p5 = sample->regs[PAM_ARM_X4];
[[maybe_unused]] uint64_t sc_p6 = sample->regs[PAM_ARM_X5];
#else
# error Architecture not supported
#endif
if (sc_ret > -4096UL) {
// If the syscall returned an error, it didn't mutate state. Skip!
// (Linux returns -errno; viewed as unsigned, the "high" values are errors)
return ddres_init();
}

// Only unwind if we will need to propagate unwinding information forward
DDRes res = {};
UnwindOutput *uwo = NULL;
// Unwind only for the syscalls whose stacks we keep: mmap (9), mremap (25)
if (id == 9 || id == 25) {
auto ticks0 = ddprof::get_tsc_cycles();
res = ddprof_unwind_sample(ctx, sample, watcher_pos);
auto unwind_ticks = ddprof::get_tsc_cycles();
ddprof_stats_add(STATS_UNWIND_AVG_TIME, unwind_ticks - ticks0, NULL);
uwo = &ctx->worker_ctx.us->output;

// TODO: propagate fatal
if (IsDDResFatal(res)) {
return ddres_init();
}
}

// Hardcoded x86_64 syscall numbers; note that the aarch64 numbering
// differs, which is part of why ARM is broken above
if (id == 9) {
sysalloc.do_mmap(*uwo, sc_ret, sc_p2, sample->pid);
} else if (id == 11) {
sysalloc.do_munmap(sc_p1, sc_p2, sample->pid);
} else if (id == 28) {
// madvise: deliberately unhandled (see do_madvise)
} else if (id == 25) {
// mremap: the old range (sc_p1, old size sc_p2) is released; the new
// range starts at the returned address with size sc_p3
sysalloc.do_mremap(*uwo, sc_p1, sc_ret, sc_p2, sc_p3, sample->pid);
} else if (id == 60 || id == 231 || id == 59 || id == 322 || id == 520 ||
id == 545) {
// Erase on exit or exec: exit (60), exit_group (231), execve (59),
// execveat (322), and the x32 exec variants (520, 545)
clear_pid = true;
}

return ddres_init();
}
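The sc_ret > -4096UL test above relies on the Linux convention that syscall return values in [-4095, -1], viewed as unsigned, are -errno error codes. A small standalone illustration:

#include <cassert>
#include <cstdint>

// Linux syscalls report failure by returning -errno; reinterpreted as
// uint64_t, those are exactly the values above -4096UL.
constexpr bool syscall_failed(uint64_t ret) { return ret > -4096UL; }

int main() {
  assert(syscall_failed(static_cast<uint64_t>(-12))); // -ENOMEM: error
  assert(!syscall_failed(0x7f1234560000));            // plausible mmap address
  assert(!syscall_failed(0));                         // success
}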

static void ddprof_reset_worker_stats() {
for (unsigned i = 0; i < std::size(s_cycled_stats); ++i) {
ddprof_stats_clear(s_cycled_stats[i]);
@@ -438,18 +509,63 @@ static DDRes aggregate_live_allocations(DDProfContext *ctx) {
return ddres_init();
}

static DDRes aggregate_sys_allocation_stack(const UnwindOutput *uw_output,
const SymbolHdr *symbol_hdr,
const PerfWatcher *watcher,
DDProfPProf *pprof) {
DDRES_CHECK_FWD(pprof_aggregate(uw_output, *symbol_hdr, get_page_size(), 1,
watcher, pprof));
return ddres_init();
}

static DDRes aggregate_sys_allocations_for_pid(DDProfContext *ctx, pid_t pid) {
struct UnwindState *us = ctx->worker_ctx.us;
SystemAllocation &sysallocs = ctx->worker_ctx.sys_allocation;
PerfWatcher *watcher = &ctx->watchers[sysallocs.watcher_pos];
int i_export = ctx->worker_ctx.i_current_pprof;
DDProfPProf *pprof = ctx->worker_ctx.pprof[i_export];
auto it = sysallocs._pid_map.find(pid);
if (it == sysallocs._pid_map.end()) {
return ddres_init(); // nothing recorded for this pid
}
const auto &stack_map = it->second;
for (const auto &page : stack_map) {
DDRES_CHECK_FWD(aggregate_sys_allocation_stack(
&page.second, &us->symbol_hdr, watcher, pprof));
}
return ddres_init();
}

static DDRes aggregate_sys_allocations(DDProfContext *ctx) {
struct UnwindState *us = ctx->worker_ctx.us;
SystemAllocation &sysallocs = ctx->worker_ctx.sys_allocation;
PerfWatcher *watcher = &ctx->watchers[sysallocs.watcher_pos];
int i_export = ctx->worker_ctx.i_current_pprof;
DDProfPProf *pprof = ctx->worker_ctx.pprof[i_export];

// Iterate through each PID's page map
for (auto &pid_entry : sysallocs._pid_map) {
// One aggregation call per page for now
// TODO: coalesce runs of pages into ranges; once per page is too much
// (see the sketch after this function)
for (const auto &page : pid_entry.second) {
DDRES_CHECK_FWD(aggregate_sys_allocation_stack(
&page.second, &us->symbol_hdr, watcher, pprof));
}
}
return ddres_init();
}
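One possible shape for the TODO in the loop above: sort the page indices, then emit one aggregation per run of consecutive pages that share a stack, scaling the value by the run length. A sketch under those assumptions; the equality predicate is supplied by the caller, since no comparison over UnwindOutput is defined here.

#include <algorithm>
#include <cstdint>
#include <vector>

// Calls emit(first_page, n_pages, stack) once per run of consecutive page
// indices whose stacks compare equal under same_stack. The caller would
// then aggregate with value get_page_size() * n_pages instead of once per
// page.
template <class StackMap, class Emit, class Eq>
void for_each_page_run(const StackMap &m, Emit &&emit, Eq &&same_stack) {
  std::vector<uintptr_t> pages;
  pages.reserve(m.size());
  for (const auto &kv : m)
    pages.push_back(kv.first);
  std::sort(pages.begin(), pages.end()); // unordered_map iterates unordered
  for (size_t i = 0; i < pages.size();) {
    size_t j = i + 1;
    while (j < pages.size() && pages[j] == pages[j - 1] + 1 &&
           same_stack(m.at(pages[j]), m.at(pages[i])))
      ++j;
    emit(pages[i], j - i, m.at(pages[i]));
    i = j;
  }
}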

static DDRes worker_pid_free(DDProfContext *ctx, pid_t el) {
DDRES_CHECK_FWD(aggregate_sys_allocations_for_pid(ctx, el));
DDRES_CHECK_FWD(aggregate_live_allocations_for_pid(ctx, el));
UnwindState *us = ctx->worker_ctx.us;
unwind_pid_free(us, el);
ctx->worker_ctx.sys_allocation.clear_pid(el);
ctx->worker_ctx.live_allocation.clear_pid(el);
return ddres_init();
}
#else
static DDRes worker_pid_free(DDProfContext *ctx, pid_t el) {
UnwindState *us = ctx->worker_ctx.us;
unwind_pid_free(us, el);
ctx->worker_ctx.sys_allocation.clear_pid(el);
ctx->worker_ctx.live_allocation.clear_pid(el);
return ddres_init();
}
@@ -473,6 +589,7 @@ DDRes ddprof_worker_cycle(DDProfContext *ctx, int64_t now,
DDRES_CHECK_FWD(clear_unvisited_pids(ctx));
#ifndef DDPROF_NATIVE_LIB
DDRES_CHECK_FWD(aggregate_live_allocations(ctx));
DDRES_CHECK_FWD(aggregate_sys_allocations(ctx));

// Take the current pprof contents and ship them to the backend. This also
// clears the pprof for reuse
@@ -738,8 +855,24 @@ DDRes ddprof_worker_process_event(const perf_event_header *hdr, int watcher_pos,
if (wpid->pid) {
uint64_t mask = watcher->sample_type;
perf_event_sample *sample = hdr2samp(hdr, mask);

if (sample) {
DDRES_CHECK_FWD(ddprof_pr_sample(ctx, sample, watcher_pos));
// Handle special profiling types first
if (watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS1 ||
watcher->ddprof_event_type == DDPROF_PWE_tALLOCSYS2) {
// For now, the mmap/munmap syscall watchers take a dedicated path
bool clear_pid = false;
DDRES_CHECK_FWD(ddprof_pr_sysallocation_tracking(
ctx, sample, watcher_pos, clear_pid));
if (clear_pid) {
LG_DBG("<%d>(SYSEXIT)%d", watcher_pos, wpid->pid);
// We could clear the pid here, but other event types for this
// pid may still arrive
}
} else {
DDRES_CHECK_FWD(ddprof_pr_sample(ctx, sample, watcher_pos));
}
}
}
break;