From 1be4551cb8c9f663de538f1d3c99cc4028ccf4be Mon Sep 17 00:00:00 2001
From: r1viollet
Date: Mon, 28 Aug 2023 15:15:58 +0200
Subject: [PATCH 01/16] Performance live heap profiling - deallocation code path

Add a bitset to track the addresses that are kept for heap profiling.
---
 CMakeLists.txt                     |  1 +
 include/lib/address_bitset.hpp     | 30 ++++++++++++++++
 include/lib/allocation_tracker.hpp |  5 ++-
 src/jit/jitdump.cc                 |  2 ++
 src/lib/address_bitset.cc          | 56 ++++++++++++++++++++++++++++
 src/lib/allocation_tracker.cc      | 46 +++++++++++++-----------
 test/CMakeLists.txt                |  2 ++
 7 files changed, 121 insertions(+), 21 deletions(-)
 create mode 100644 include/lib/address_bitset.hpp
 create mode 100644 src/lib/address_bitset.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04891fad8..0016eb17f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -198,6 +198,7 @@ set(DD_PROFILING_SOURCES
     src/ddprof_cmdline.cc
     src/ddres_list.cc
     src/ipc.cc
+    src/lib/address_bitset.cc
     src/lib/allocation_tracker.cc
     src/lib/dd_profiling.cc
     src/lib/elfutils.cc
diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp
new file mode 100644
index 000000000..790470140
--- /dev/null
+++ b/include/lib/address_bitset.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstdint>
+
+namespace ddprof {
+class AddressBitset {
+public:
+  // returns true if the element was inserted
+  bool set(uintptr_t addr);
+  // returns true if the element was removed
+  bool unset(uintptr_t addr);
+  void clear() {
+    std::fill(_address_bitset.begin(), _address_bitset.end(), 0);
+    _nb_elements = 0;
+  };
+  int nb_elements() const { return _nb_elements; }
+
+private:
+  // 1 Meg divided in uint64's size
+  constexpr static unsigned _nb_bits = 8 * 1024 * 1024;
+  constexpr static unsigned _k_nb_elements = (_nb_bits) / (64);
+  constexpr static unsigned _nb_bits_mask = _nb_bits - 1;
+  // We can not use an actual bitset (for atomicity reasons)
+  std::array<std::atomic<uint64_t>, _k_nb_elements> _address_bitset = {};
+  int _nb_elements = 0;
+};
+} // namespace ddprof
diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp
index 4ce24a3fb..6d3ce049f 100644
--- a/include/lib/allocation_tracker.hpp
+++ b/include/lib/allocation_tracker.hpp
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include "address_bitset.hpp"
 #include "allocation_tracker_tls.hpp"
 #include "ddprof_base.hpp"
 #include "ddres_def.hpp"
@@ -113,7 +114,9 @@ class AllocationTracker {
   uint32_t _stack_sample_size;
   PEvent _pevent;
   bool _deterministic_sampling;
-  AdressSet _address_set;
+
+  // AdressSet _address_set;
+  AddressBitset _dealloc_bitset;
 
   // These can not be tied to the internal state of the instance.
  // The creation of the instance depends on this
diff --git a/src/jit/jitdump.cc b/src/jit/jitdump.cc
index 0f105a417..dc5ce0679 100644
--- a/src/jit/jitdump.cc
+++ b/src/jit/jitdump.cc
@@ -7,6 +7,8 @@
 #include
 #include
 
+#define DEBUG
+
 // If we want to consider big endian, we will need this
 // has bswap_64/32/16
 // #include
diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc
new file mode 100644
index 000000000..7f2b4653f
--- /dev/null
+++ b/src/lib/address_bitset.cc
@@ -0,0 +1,56 @@
+#include "address_bitset.hpp"
+
+#include <cassert>
+#include <cstdlib>
+#include <functional>
+
+namespace ddprof {
+
+bool AddressBitset::set(uintptr_t addr) {
+  size_t hash_addr = std::hash<uintptr_t>().operator()(addr);
+  int significant_bits = hash_addr & _nb_bits_mask;
+  auto res = std::div(significant_bits, 64);
+  unsigned index_array = res.quot;
+  unsigned bit_offset = res.rem;
+  bool success = false;
+  int attempt = 0;
+  do {
+    uint64_t old_value = _address_bitset[index_array].load();
+    if (old_value & (static_cast<uint64_t>(1) << bit_offset)) {
+      // element is already set (collisions are expected)
+      return false;
+    }
+    uint64_t new_value = old_value;
+    new_value |= static_cast<uint64_t>(1) << bit_offset;
+    success = _address_bitset[index_array].compare_exchange_weak(old_value,
+                                                                 new_value);
+  } while (unlikely(!success && attempt++ < 3));
+  _nb_elements += success ? 1 : 0;
+  return success;
+}
+
+bool AddressBitset::unset(uintptr_t addr) {
+  size_t hash_addr = std::hash<uintptr_t>().operator()(addr);
+  int significant_bits = hash_addr & _nb_bits_mask;
+  auto res = std::div(significant_bits, 64);
+  unsigned index_array = res.quot;
+  unsigned bit_offset = res.rem;
+  bool success = false;
+  int attempt = 0;
+  do {
+    uint64_t old_value = _address_bitset[index_array].load();
+    if (!(old_value & (static_cast<uint64_t>(1) << bit_offset))) {
+      // element is not already set (unexpected?)
+      return false;
+    }
+    uint64_t new_value = old_value;
+    new_value ^= static_cast<uint64_t>(1) << bit_offset;
+    success = _address_bitset[index_array].compare_exchange_weak(old_value,
+                                                                 new_value);
+  } while (unlikely(!success && attempt++ < 3));
+  _nb_elements -= success ? 1 : 0;
+  assert(_nb_elements >= 0);
+  return success;
+}
+
+} // namespace ddprof
\ No newline at end of file
diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc
index d095445d0..01f35485d 100644
--- a/src/lib/allocation_tracker.cc
+++ b/src/lib/allocation_tracker.cc
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-
 #include
 
 namespace ddprof {
@@ -229,27 +228,34 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size,
   tl_state.remaining_bytes = remaining_bytes;
 
   uint64_t total_size = nsamples * sampling_interval;
-  bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state));
-  free_on_consecutive_failures(success);
-
-  if (success && _state.track_deallocations) {
-    // \fixme{r1viollet} adjust set to be lock free
-    // We should still lock the clear of live allocations (which is unlikely)
-    std::lock_guard lock{_state.mutex};
-
-    // ensure we track this dealloc if it occurs
-    _address_set.insert(addr);
-    if (unlikely(_address_set.size() > ddprof::liveallocation::kMaxTracked)) {
-      if (IsDDResOK(push_clear_live_allocation(tl_state))) {
-        _address_set.clear();
-      } else {
-        log_once(
-            "Error: %s",
-            "Stop allocation profiling. Unable to clear live allocation \n");
-        free();
-      }
+  if (_state.track_deallocations) {
+    if (_dealloc_bitset.set(addr)) {
+      if (unlikely(_dealloc_bitset.nb_elements() >
+                   ddprof::liveallocation::kMaxTracked)) {
+        // Check if we reached max number of elements
+        // Clear elements if we reach too many
+        // todo: should we just stop new live tracking (like java) ?
+        if (IsDDResOK(push_clear_live_allocation(tl_state))) {
+          _dealloc_bitset.clear();
+          // still set this as we are pushing the allocation to ddprof
+          _dealloc_bitset.set(addr);
+        } else {
+          log_once(
+              "Error: %s",
+              "Stop allocation profiling. Unable to clear live allocation \n");
+          free();
+        }
       }
+    } else {
+      // we won't track this allocation as we can't double count in the bitset
+      return;
     }
   }
+  bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state));
+  free_on_consecutive_failures(success);
+  if (unlikely(!success) && _state.track_deallocations) {
+    _dealloc_bitset.unset(addr);
+  }
 }
 
 void AllocationTracker::track_deallocation(uintptr_t addr,
@@ -264,7 +270,7 @@ void AllocationTracker::track_deallocation(uintptr_t addr,
   { // Grab the lock to check if this allocation was stored in the set
     std::lock_guard lock{_state.mutex};
-    if (!_state.track_deallocations || !_address_set.erase(addr)) {
+    if (!_state.track_deallocations || !_dealloc_bitset.unset(addr)) {
       return;
     }
   }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a1aad2f94..834d9145c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -255,6 +255,7 @@ add_unit_test(
   allocation_tracker-ut
   allocation_tracker-ut.cc
   ../src/lib/allocation_tracker.cc
+  ../src/lib/address_bitset.cc
   ../src/base_frame_symbol_lookup.cc
   ../src/build_id.cc
   ../src/container_id.cc
@@ -360,6 +361,7 @@ add_benchmark(timer-bench timer-bench.cc ../src/timer.cc ../src/perf.cc)
 add_benchmark(
   allocation_tracker-bench
   allocation_tracker-bench.cc
+  ../src/lib/address_bitset.cc
   ../src/lib/allocation_tracker.cc
   ../src/pevent_lib.cc
   ../src/perf.cc

From 329af4f940204c7def1cdc84f36d5ff1939c99fb Mon Sep 17 00:00:00 2001
From: r1viollet
Date: Wed, 30 Aug 2023 13:57:11 +0200
Subject: [PATCH 02/16] deallocation code path - bench fixes

Fix the missing TLS storage initializations
---
 include/lib/address_bitset.hpp   |  8 +++--
 include/lib/lib_logger.hpp       |  2 +-
 src/jit/jitdump.cc               |  2 --
 src/lib/address_bitset.cc        | 14 ++++++---
 src/lib/allocation_tracker.cc    |  3 ++
 test/CMakeLists.txt              |  2 ++
 test/address_bitset-ut.cc        | 37 ++++++++++++++++++++++
 test/allocation_tracker-bench.cc | 54 ++++++++++++++++++++++----------
 test/allocation_tracker-ut.cc    | 44 +++++++++-----------------
 9 files changed, 109 insertions(+), 57 deletions(-)
 create mode 100644 test/address_bitset-ut.cc

diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp
index 790470140..37aeb6873 100644
--- a/include/lib/address_bitset.hpp
+++ b/include/lib/address_bitset.hpp
@@ -13,8 +13,10 @@ class AddressBitset {
   // returns true if the element was removed
   bool unset(uintptr_t addr);
   void clear() {
-    std::fill(_address_bitset.begin(), _address_bitset.end(), 0);
-    _nb_elements = 0;
+    for (auto& element : _address_bitset) {
+      element.store(0, std::memory_order_relaxed);
+    }
+    _nb_elements.store(0);
   };
   int nb_elements() const { return _nb_elements; }
 
@@ -25,6 +27,6 @@ class AddressBitset {
   constexpr static unsigned _nb_bits_mask = _nb_bits - 1;
   // We can not use an actual bitset (for atomicity reasons)
   std::array<std::atomic<uint64_t>, _k_nb_elements> _address_bitset = {};
-  int _nb_elements = 0;
+  std::atomic<int> _nb_elements = 0;
 };
 } // namespace ddprof
diff --git a/include/lib/lib_logger.hpp b/include/lib/lib_logger.hpp
index 49529406d..5e2d8c51f 100644
--- a/include/lib/lib_logger.hpp
+++ b/include/lib/lib_logger.hpp
@@ -12,7 +12,7 @@ namespace ddprof {
 template <typename... Args>
 void log_once(char const *const format, Args... args) {
-#ifndef DEBUG
+#ifdef NDEBUG
   static std::once_flag flag;
   std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); });
 #else
diff --git a/src/jit/jitdump.cc b/src/jit/jitdump.cc
index dc5ce0679..0f105a417 100644
--- a/src/jit/jitdump.cc
+++ b/src/jit/jitdump.cc
@@ -7,8 +7,6 @@
 #include
 #include
 
-#define DEBUG
-
 // If we want to consider big endian, we will need this
 // has bswap_64/32/16
 // #include
diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc
index 7f2b4653f..4aac94d47 100644
--- a/src/lib/address_bitset.cc
+++ b/src/lib/address_bitset.cc
@@ -25,7 +25,9 @@ bool AddressBitset::set(uintptr_t addr) {
     success = _address_bitset[index_array].compare_exchange_weak(old_value,
                                                                  new_value);
   } while (unlikely(!success && attempt++ < 3));
-  _nb_elements += success ? 1 : 0;
+  if (success) {
+    ++_nb_elements;
+  }
   return success;
 }
 
@@ -41,15 +43,17 @@ bool AddressBitset::unset(uintptr_t addr) {
     uint64_t old_value = _address_bitset[index_array].load();
     if (!(old_value & (static_cast<uint64_t>(1) << bit_offset))) {
       // element is not already set (unexpected?)
-      return false;
+      break;
     }
     uint64_t new_value = old_value;
     new_value ^= static_cast<uint64_t>(1) << bit_offset;
     success = _address_bitset[index_array].compare_exchange_weak(old_value,
                                                                  new_value);
-  } while (unlikely(!success && attempt++ < 3));
-  _nb_elements -= success ? 1 : 0;
-  assert(_nb_elements >= 0);
+  } while (unlikely(!success) && attempt++ < 3);
+  if (success && _nb_elements.load(std::memory_order_relaxed) >= 0) {
+    // a reset could hit us, just prior to decrementing
+    --_nb_elements; // fetch_add - 1
+  }
   return success;
 }
 
diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc
index 01f35485d..e3b1bbf50 100644
--- a/src/lib/allocation_tracker.cc
+++ b/src/lib/allocation_tracker.cc
@@ -162,6 +162,9 @@ void AllocationTracker::allocation_tracking_free() {
   if (!instance) {
     return;
   }
+  if (instance->_state.track_deallocations) {
+    instance->_dealloc_bitset.clear();
+  }
   TrackerThreadLocalState *tl_state = get_tl_state();
   if (unlikely(!tl_state)) {
     log_once("Error: Unable to find tl_state during %s\n", __FUNCTION__);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 834d9145c..6e2669668 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -353,6 +353,8 @@
 
 add_unit_test(pthread_tls-ut pthread_tls-ut.cc)
 
+add_unit_test(address_bitset-ut address_bitset-ut.cc ../src/lib/address_bitset.cc)
+
 add_benchmark(savecontext-bench savecontext-bench.cc ../src/lib/pthread_fixes.cc
               ../src/lib/savecontext.cc ../src/lib/saveregisters.cc)
 
diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc
new file mode 100644
index 000000000..0ef27b385
--- /dev/null
+++ b/test/address_bitset-ut.cc
@@ -0,0 +1,37 @@
+#include <gtest/gtest.h>
+
+#include <random>
+
+#include "address_bitset.hpp"
+
+namespace ddprof {
+
+TEST(address_bitset, simple) {
+  AddressBitset address_bitset;
+  EXPECT_TRUE(address_bitset.set(0xbadbeef));
+  EXPECT_FALSE(address_bitset.set(0xbadbeef));
+  EXPECT_TRUE(address_bitset.unset(0xbadbeef));
+}
+
+TEST(address_bitset, many_addresses) {
+  AddressBitset address_bitset;
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<uintptr_t> dis(0,
+      std::numeric_limits<uintptr_t>::max());
+
+  std::vector<uintptr_t> addresses;
+  unsigned
nb_elements = 100000; + for (unsigned i = 0; i < nb_elements; ++i) { + uintptr_t addr = dis(gen); + if (address_bitset.set(addr)) { + addresses.push_back(addr); + } + } + EXPECT_TRUE(nb_elements - (nb_elements / 10) < addresses.size()); + for (auto addr : addresses) { + EXPECT_TRUE(address_bitset.unset(addr)); + } + EXPECT_EQ(0, address_bitset.nb_elements()); +} +} diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc index 5fa708737..3b6d59699 100644 --- a/test/allocation_tracker-bench.cc +++ b/test/allocation_tracker-bench.cc @@ -13,26 +13,34 @@ // Global bench settings // Activate live heap tracking -// #define LIVE_HEAP +#define LIVE_HEAP // Sampling rate: default rate is 524288 -static constexpr uint64_t k_rate = 20000; +static constexpr uint64_t k_rate = 200000; #define READER_THREAD std::atomic reader_continue{true}; +std::atomic error_in_reader{false}; // Reader worker thread function void read_buffer(ddprof::RingBufferHolder &holder) { int nb_samples = 0; + error_in_reader = false; while (reader_continue) { ddprof::MPSCRingBufferReader reader(holder.get_ring_buffer()); +// fprintf(stderr, "size = %lu! \n", reader.available_size()); auto buf = reader.read_sample(); if (!buf.empty()) { ++nb_samples; - // fprintf(stderr, "Yep, got sample ! \n"); +// fprintf(stderr, "Yep, got sample ! \n"); } std::chrono::microseconds(10000); } +#ifdef DEBUG fprintf(stderr, "Reader thread exit, nb_samples=%d\n", nb_samples); +#endif + if (nb_samples == 0) { + error_in_reader = true; + } } DDPROF_NOINLINE void my_malloc(size_t size, uintptr_t addr = 0xdeadbeef) { @@ -40,8 +48,8 @@ DDPROF_NOINLINE void my_malloc(size_t size, uintptr_t addr = 0xdeadbeef) { ddprof::AllocationTracker::get_tl_state(); if (tl_state) { // tl_state is null if we are not tracking allocations ddprof::AllocationTracker::track_allocation_s(addr, size, *tl_state); - DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION(); } + DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION(); } DDPROF_NOINLINE void my_free(uintptr_t addr) { @@ -62,14 +70,6 @@ void perform_memory_operations(bool track_allocations, ddprof::AllocationTracker::kTrackDeallocations; #endif - ddprof::RingBufferHolder ring_buffer{buf_size_order, - RingBufferType::kMPSCRingBuffer}; - - if (track_allocations) { - ddprof::AllocationTracker::allocation_tracking_init( - rate, flags, k_default_perf_stack_sample_size, - ring_buffer.get_buffer_info()); - } int nb_threads = 4; std::vector threads; @@ -77,13 +77,20 @@ void perform_memory_operations(bool track_allocations, size_t page_size = 0x1000; std::random_device rd; std::mt19937 gen(rd()); - + ddprof::RingBufferHolder ring_buffer{buf_size_order, + RingBufferType::kMPSCRingBuffer}; #ifdef READER_THREAD // create reader worker thread reader_continue = true; std::thread reader_thread{read_buffer, std::ref(ring_buffer)}; #endif + if (track_allocations) { + ddprof::AllocationTracker::allocation_tracking_init( + rate, flags, k_default_perf_stack_sample_size, + ring_buffer.get_buffer_info()); + } + for (auto _ : state) { // Initialize threads and clear addresses threads.clear(); @@ -91,8 +98,9 @@ void perform_memory_operations(bool track_allocations, for (int i = 0; i < nb_threads; ++i) { threads.emplace_back([&, i] { - std::uniform_int_distribution<> dis(i * page_size, - (i + 1) * page_size - 1); + ddprof::AllocationTracker::init_tl_state(); + std::uniform_int_distribution dis(i * page_size, + (i + 1) * page_size - 1); for (int j = 0; j < num_allocations; ++j) { uintptr_t addr = dis(gen); @@ -108,7 +116,9 @@ void 
perform_memory_operations(bool track_allocations, threads.clear(); for (int i = 0; i < nb_threads; ++i) { + ddprof::AllocationTracker::init_tl_state(); threads.emplace_back([&, i] { + ddprof::AllocationTracker::init_tl_state(); for (auto addr : thread_addresses[i]) { my_free(addr); } @@ -119,11 +129,19 @@ void perform_memory_operations(bool track_allocations, t.join(); } } + + + #ifdef READER_THREAD reader_continue = false; reader_thread.join(); #endif - ddprof::AllocationTracker::allocation_tracking_free(); + if (track_allocations) { + ddprof::AllocationTracker::allocation_tracking_free(); + if (error_in_reader) { + exit(1); + } + } } // Benchmark without allocation tracking @@ -143,6 +161,7 @@ class WorkerThread { WorkerThread() : stop(false), perform_task(false) { worker_thread = std::thread([this] { + ddprof::AllocationTracker::init_tl_state(); while (!stop) { std::unique_lock lock(mutex); cv.wait(lock, [this] { return perform_task || stop; }); @@ -225,7 +244,8 @@ void perform_memory_operations_2(bool track_allocations, std::mt19937 gen(rd()); for (int i = 0; i < nb_threads; ++i) { - std::uniform_int_distribution<> dis(i * page_size, (i + 1) * page_size - 1); + std::uniform_int_distribution dis(i * page_size, + (i + 1) * page_size - 1); for (int j = 0; j < num_allocations; ++j) { uintptr_t addr = dis(gen); thread_addresses[i].push_back(addr); diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc index 6bca2f83e..5e0bb5034 100644 --- a/test/allocation_tracker-ut.cc +++ b/test/allocation_tracker-ut.cc @@ -149,44 +149,30 @@ TEST(allocation_tracker, max_tracked_allocs) { k_default_perf_stack_sample_size, ring_buffer.get_buffer_info()); ASSERT_TRUE(ddprof::AllocationTracker::is_active()); - - for (int i = 0; i <= ddprof::liveallocation::kMaxTracked + 1; ++i) { + bool clear_found = false; + for (int i = 0; i <= ddprof::liveallocation::kMaxTracked * 2 + 1; ++i) { my_malloc(1, 0x1000 + i); ddprof::MPSCRingBufferReader reader{ring_buffer.get_ring_buffer()}; - if (i <= - ddprof::liveallocation::kMaxTracked) { // check that we get the relevant - // info for this allocation - ASSERT_GT(reader.available_size(), 0); + while (reader.available_size() > 0) { auto buf = reader.read_sample(); ASSERT_FALSE(buf.empty()); const perf_event_header *hdr = reinterpret_cast(buf.data()); - ASSERT_EQ(hdr->type, PERF_RECORD_SAMPLE); - - perf_event_sample *sample = - hdr2samp(hdr, perf_event_default_sample_type() | PERF_SAMPLE_ADDR); - - ASSERT_EQ(sample->period, 1); - ASSERT_EQ(sample->pid, getpid()); - ASSERT_EQ(sample->tid, ddprof::gettid()); - ASSERT_EQ(sample->addr, 0x1000 + i); - } else { - bool clear_found = false; - int nb_read = 0; - ddprof::ConstBuffer buf; - do { - buf = reader.read_sample(); - ++nb_read; - if (buf.empty()) - break; - const perf_event_header *hdr = - reinterpret_cast(buf.data()); + if (hdr->type == PERF_RECORD_SAMPLE) { + perf_event_sample *sample = + hdr2samp(hdr, perf_event_default_sample_type() | PERF_SAMPLE_ADDR); + ASSERT_EQ(sample->period, 1); + ASSERT_EQ(sample->pid, getpid()); + ASSERT_EQ(sample->tid, ddprof::gettid()); + ASSERT_EQ(sample->addr, 0x1000 + i); + } + else { if (hdr->type == PERF_CUSTOM_EVENT_CLEAR_LIVE_ALLOCATION) { + printf("Clear was found at iteration %d\n", i); clear_found = true; } - } while (true); - EXPECT_EQ(nb_read, 3); - EXPECT_TRUE(clear_found); + } } } + ASSERT_TRUE(clear_found); } From 52fb8c9fccd527883028d986d23552c9e449f19f Mon Sep 17 00:00:00 2001 From: r1viollet Date: Wed, 30 Aug 2023 14:53:51 +0200 Subject: [PATCH 03/16] 
Deallocation code path optimisations - Removal of the lock - Minor fixes around the allocation code paths --- include/lib/address_bitset.hpp | 11 ++++++++++- src/lib/address_bitset.cc | 3 ++- src/lib/allocation_tracker.cc | 7 ++----- test/address_bitset-ut.cc | 6 +++--- test/allocation_tracker-bench.cc | 17 +++++++---------- test/allocation_tracker-ut.cc | 6 ++---- 6 files changed, 26 insertions(+), 24 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index 37aeb6873..0501271db 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -7,13 +7,20 @@ namespace ddprof { class AddressBitset { + // Number of bits is the number of addresses we can store + // We have one address per individual bit). + // so lets say you have 1111, you can store 4 addresses. + // We hash the address to a number (to have an equal probability of using + // all bits). Then we use the mask to position this address in our bitset. + // Addr -> Hash -> Mask (to get useable bits) -> Position in the bitset + // Note: the hashing step might be bad for cache locality. public: // returns true if the element was inserted bool set(uintptr_t addr); // returns true if the element was removed bool unset(uintptr_t addr); void clear() { - for (auto& element : _address_bitset) { + for (auto &element : _address_bitset) { element.store(0, std::memory_order_relaxed); } _nb_elements.store(0); @@ -22,6 +29,8 @@ class AddressBitset { private: // 1 Meg divided in uint64's size + // The probability of collision is proportional to the number of elements + // already within the bitset constexpr static unsigned _nb_bits = 8 * 1024 * 1024; constexpr static unsigned _k_nb_elements = (_nb_bits) / (64); constexpr static unsigned _nb_bits_mask = _nb_bits - 1; diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index 4aac94d47..389ac23bb 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -49,7 +49,8 @@ bool AddressBitset::unset(uintptr_t addr) { new_value ^= static_cast(1) << bit_offset; success = _address_bitset[index_array].compare_exchange_weak(old_value, new_value); - } while (unlikely(!success) && attempt++ < 3); + } while (unlikely(!success) && ++attempt < 4); + assert(attempt < 4); // This can desync our live heap view (Should not happen) if (success && _nb_elements.load(std::memory_order_relaxed) >= 0) { // a reset could hit us, just prior to decrementing --_nb_elements; // fetch_add - 1 diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index e3b1bbf50..544d19293 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -271,11 +271,8 @@ void AllocationTracker::track_deallocation(uintptr_t addr, return; } - { // Grab the lock to check if this allocation was stored in the set - std::lock_guard lock{_state.mutex}; - if (!_state.track_deallocations || !_dealloc_bitset.unset(addr)) { - return; - } + if (!_state.track_deallocations || !_dealloc_bitset.unset(addr)) { + return; } bool success = IsDDResOK(push_dealloc_sample(addr, tl_state)); diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc index 0ef27b385..13935078d 100644 --- a/test/address_bitset-ut.cc +++ b/test/address_bitset-ut.cc @@ -17,8 +17,8 @@ TEST(address_bitset, many_addresses) { AddressBitset address_bitset; std::random_device rd; std::mt19937 gen(rd()); - std::uniform_int_distribution dis(0, - std::numeric_limits::max()); + std::uniform_int_distribution dis( + 0, std::numeric_limits::max()); std::vector addresses; 
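   // [editor's note, not in the original patch] a birthday-problem estimate
   // says only ~0.6% of 100k uniform inserts into the 8Mi-bit set collide,
   // so the test's 10% loss tolerance is generous.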
unsigned nb_elements = 100000; @@ -34,4 +34,4 @@ TEST(address_bitset, many_addresses) { } EXPECT_EQ(0, address_bitset.nb_elements()); } -} +} // namespace ddprof diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc index 3b6d59699..76194971e 100644 --- a/test/allocation_tracker-bench.cc +++ b/test/allocation_tracker-bench.cc @@ -17,7 +17,9 @@ // Sampling rate: default rate is 524288 static constexpr uint64_t k_rate = 200000; -#define READER_THREAD +// The reader thread is interesting, though it starts dominating the CPU +// the benchmark focuses on the capture of allocation events. +// #define READER_THREAD std::atomic reader_continue{true}; std::atomic error_in_reader{false}; @@ -27,17 +29,14 @@ void read_buffer(ddprof::RingBufferHolder &holder) { error_in_reader = false; while (reader_continue) { ddprof::MPSCRingBufferReader reader(holder.get_ring_buffer()); -// fprintf(stderr, "size = %lu! \n", reader.available_size()); + // fprintf(stderr, "size = %lu! \n", reader.available_size()); auto buf = reader.read_sample(); if (!buf.empty()) { ++nb_samples; -// fprintf(stderr, "Yep, got sample ! \n"); } std::chrono::microseconds(10000); } -#ifdef DEBUG fprintf(stderr, "Reader thread exit, nb_samples=%d\n", nb_samples); -#endif if (nb_samples == 0) { error_in_reader = true; } @@ -70,7 +69,6 @@ void perform_memory_operations(bool track_allocations, ddprof::AllocationTracker::kTrackDeallocations; #endif - int nb_threads = 4; std::vector threads; int num_allocations = 1000; @@ -98,9 +96,11 @@ void perform_memory_operations(bool track_allocations, for (int i = 0; i < nb_threads; ++i) { threads.emplace_back([&, i] { + // in theory we automatically hook in thread creation + // though in the benchmark we can not do this. ddprof::AllocationTracker::init_tl_state(); std::uniform_int_distribution dis(i * page_size, - (i + 1) * page_size - 1); + (i + 1) * page_size - 1); for (int j = 0; j < num_allocations; ++j) { uintptr_t addr = dis(gen); @@ -116,7 +116,6 @@ void perform_memory_operations(bool track_allocations, threads.clear(); for (int i = 0; i < nb_threads; ++i) { - ddprof::AllocationTracker::init_tl_state(); threads.emplace_back([&, i] { ddprof::AllocationTracker::init_tl_state(); for (auto addr : thread_addresses[i]) { @@ -130,8 +129,6 @@ void perform_memory_operations(bool track_allocations, } } - - #ifdef READER_THREAD reader_continue = false; reader_thread.join(); diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc index 5e0bb5034..5f2382506 100644 --- a/test/allocation_tracker-ut.cc +++ b/test/allocation_tracker-ut.cc @@ -150,7 +150,7 @@ TEST(allocation_tracker, max_tracked_allocs) { ASSERT_TRUE(ddprof::AllocationTracker::is_active()); bool clear_found = false; - for (int i = 0; i <= ddprof::liveallocation::kMaxTracked * 2 + 1; ++i) { + for (int i = 0; i <= ddprof::liveallocation::kMaxTracked + 10; ++i) { my_malloc(1, 0x1000 + i); ddprof::MPSCRingBufferReader reader{ring_buffer.get_ring_buffer()}; while (reader.available_size() > 0) { @@ -165,10 +165,8 @@ TEST(allocation_tracker, max_tracked_allocs) { ASSERT_EQ(sample->pid, getpid()); ASSERT_EQ(sample->tid, ddprof::gettid()); ASSERT_EQ(sample->addr, 0x1000 + i); - } - else { + } else { if (hdr->type == PERF_CUSTOM_EVENT_CLEAR_LIVE_ALLOCATION) { - printf("Clear was found at iteration %d\n", i); clear_found = true; } } From 60d9dd1d926f7af181f4f034e22f9d8108b5c34d Mon Sep 17 00:00:00 2001 From: r1viollet Date: Thu, 31 Aug 2023 09:09:19 +0200 Subject: [PATCH 04/16] Minor - add licensing headers --- 
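[Editor's note, not part of the patch series] The collision comments in
patches 03 and 05 can be made concrete with a birthday-problem estimate:
after n uniform inserts into a bitset of m bits, the expected number of
occupied bits is m * (1 - exp(-n / m)), and every insert beyond that count
collided. A minimal standalone sketch, using constants from the series
(the 8 Mi-bit set and the 1,000,000 samples of the COLLISION_TEST added
in patch 05):

  #include <cmath>
  #include <cstdio>

  int main() {
    constexpr double m = 8.0 * 1024 * 1024; // bits in the default bitset
    constexpr double n = 1000000;           // hashed values in the test
    // Expected occupied bits after n uniform inserts (birthday estimate).
    const double occupied = m * (1.0 - std::exp(-n / m));
    std::printf("expected collision rate ~%.1f%%\n",
                100.0 * (n - occupied) / n);
  }

This prints ~5.7%, in line with the rate later quoted in patch 08, and
motivates the 16x headroom (kMaxTracked * 16) chosen for the bitset size
in patch 05.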
include/lib/address_bitset.hpp | 4 ++++ include/lib/allocation_tracker.hpp | 1 - src/lib/address_bitset.cc | 4 ++++ test/address_bitset-ut.cc | 4 ++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index 0501271db..f29519552 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -1,3 +1,7 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. #pragma once #include diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index 6d3ce049f..db63ef344 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -115,7 +115,6 @@ class AllocationTracker { PEvent _pevent; bool _deterministic_sampling; - // AdressSet _address_set; AddressBitset _dealloc_bitset; // These can not be tied to the internal state of the instance. diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index 389ac23bb..fad3cfe38 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -1,3 +1,7 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. #include "address_bitset.hpp" #include diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc index 13935078d..d3baac0fa 100644 --- a/test/address_bitset-ut.cc +++ b/test/address_bitset-ut.cc @@ -1,3 +1,7 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. #include #include From ac8cb9e184e319c7c1aa1f152b5710be8dda41d5 Mon Sep 17 00:00:00 2001 From: r1viollet Date: Tue, 5 Sep 2023 09:38:01 +0200 Subject: [PATCH 05/16] Performances for deallocation tracking code path Adjust the usage of the address bitset. --- include/lib/address_bitset.hpp | 42 ++++++++++++++++------- include/lib/allocation_tracker.hpp | 5 +-- include/live_allocation-c.hpp | 2 +- src/lib/address_bitset.cc | 47 +++++++++++++++++-------- src/lib/allocation_tracker.cc | 26 +++++++------- test/address_bitset-ut.cc | 55 ++++++++++++++++++++++++++++-- 6 files changed, 132 insertions(+), 45 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index f29519552..0b3be9798 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -4,8 +4,8 @@ // Datadog, Inc. #pragma once -#include #include +#include #include #include @@ -19,27 +19,43 @@ class AddressBitset { // Addr -> Hash -> Mask (to get useable bits) -> Position in the bitset // Note: the hashing step might be bad for cache locality. 
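  // [editor's note, illustrative values only] with this patch's hash
  // (addr >> 8) and the default 8Mi-bit set (_nb_bits_mask = 0x7fffff):
  //   addr 0x7f1234569876 -> hash 0x7f12345698 -> bit index 0x345698
  //   -> word 0x345698 / 64 of _address_bitset, bit 0x345698 % 64.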
public: + // Publish 1 Meg as default + constexpr static unsigned _k_default_max_addresses = 8 * 1024 * 1024; + AddressBitset(unsigned max_addresses = 0) { init(max_addresses); } + + void init(unsigned max_addresses) { + if (_address_bitset) { + _address_bitset.reset(); + } + _nb_bits = max_addresses; + _k_nb_elements = (_nb_bits) / (_nb_bits_per_elt); + if (_nb_bits) { + _nb_bits_mask = _nb_bits - 1; + _address_bitset = + std::make_unique[]>(_k_nb_elements); + } + } + // returns true if the element was inserted bool set(uintptr_t addr); // returns true if the element was removed bool unset(uintptr_t addr); - void clear() { - for (auto &element : _address_bitset) { - element.store(0, std::memory_order_relaxed); - } - _nb_elements.store(0); - }; - int nb_elements() const { return _nb_elements; } + void clear(); + int nb_addresses() const { return _nb_addresses; } private: + constexpr static auto _k_max_write_attempts = 4; + // element type + using Elt_t = uint64_t; + constexpr static unsigned _nb_bits_per_elt = sizeof(Elt_t) * 8; // 1 Meg divided in uint64's size // The probability of collision is proportional to the number of elements // already within the bitset - constexpr static unsigned _nb_bits = 8 * 1024 * 1024; - constexpr static unsigned _k_nb_elements = (_nb_bits) / (64); - constexpr static unsigned _nb_bits_mask = _nb_bits - 1; + unsigned _nb_bits = {}; + unsigned _k_nb_elements = {}; + unsigned _nb_bits_mask = {}; // We can not use an actual bitset (for atomicity reasons) - std::array, _k_nb_elements> _address_bitset = {}; - std::atomic _nb_elements = 0; + std::unique_ptr[]> _address_bitset; + std::atomic _nb_addresses = 0; }; } // namespace ddprof diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index db63ef344..ae869f134 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -84,7 +84,8 @@ class AllocationTracker { uint64_t next_sample_interval(std::minstd_rand &gen); DDRes init(uint64_t mem_profile_interval, bool deterministic_sampling, - uint32_t stack_sample_size, const RingBufferInfo &ring_buffer); + bool track_deallocations, uint32_t stack_sample_size, + const RingBufferInfo &ring_buffer); void free(); static AllocationTracker *create_instance(); @@ -115,7 +116,7 @@ class AllocationTracker { PEvent _pevent; bool _deterministic_sampling; - AddressBitset _dealloc_bitset; + AddressBitset _allocated_address_set; // These can not be tied to the internal state of the instance. // The creation of the instance depends on this diff --git a/include/live_allocation-c.hpp b/include/live_allocation-c.hpp index 657e84b1b..5ccfc136e 100644 --- a/include/live_allocation-c.hpp +++ b/include/live_allocation-c.hpp @@ -11,7 +11,7 @@ namespace liveallocation { // build time override to reduce execution time of test static constexpr auto kMaxTracked = KMAX_TRACKED_ALLOCATIONS; #else -static constexpr auto kMaxTracked = 500000; +static constexpr auto kMaxTracked = 524288; // 2^19 #endif } // namespace liveallocation } // namespace ddprof diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index fad3cfe38..c05a82077 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -10,12 +10,23 @@ namespace ddprof { +namespace { +// instead of a hash function we just remove lower bits +// the assumption is that we sample, so we should have less address ranges +// that are close to each other. 
+// We can't use the sample period as we can still have addresses +// that are close due to sequences of allocations / frees +inline uint64_t remove_lower_bits(uintptr_t h1) { return h1 >> 8; } +} // namespace + bool AddressBitset::set(uintptr_t addr) { - size_t hash_addr = std::hash().operator()(addr); - int significant_bits = hash_addr & _nb_bits_mask; - auto res = std::div(significant_bits, 64); - unsigned index_array = res.quot; - unsigned bit_offset = res.rem; + uint64_t hash_addr = remove_lower_bits(addr); + unsigned significant_bits = hash_addr & _nb_bits_mask; + // As per nsavoire's comment, it is better to use separate operators + // than to use the div instruction which generates an extra function call + // Also, the usage of a power of two value allows for bit operations + unsigned index_array = significant_bits / 64; + unsigned bit_offset = significant_bits % 64; bool success = false; int attempt = 0; do { @@ -28,17 +39,17 @@ bool AddressBitset::set(uintptr_t addr) { new_value |= static_cast(1) << bit_offset; success = _address_bitset[index_array].compare_exchange_weak(old_value, new_value); - } while (unlikely(!success && attempt++ < 3)); + } while (unlikely(!success && ++attempt < _k_max_write_attempts)); if (success) { - ++_nb_elements; + ++_nb_addresses; } return success; } bool AddressBitset::unset(uintptr_t addr) { - size_t hash_addr = std::hash().operator()(addr); + uint64_t hash_addr = remove_lower_bits(addr); int significant_bits = hash_addr & _nb_bits_mask; - auto res = std::div(significant_bits, 64); + auto res = std::div(significant_bits, _nb_bits_per_elt); unsigned index_array = res.quot; unsigned bit_offset = res.rem; bool success = false; @@ -53,13 +64,21 @@ bool AddressBitset::unset(uintptr_t addr) { new_value ^= static_cast(1) << bit_offset; success = _address_bitset[index_array].compare_exchange_weak(old_value, new_value); - } while (unlikely(!success) && ++attempt < 4); - assert(attempt < 4); // This can desync our live heap view (Should not happen) - if (success && _nb_elements.load(std::memory_order_relaxed) >= 0) { + } while (unlikely(!success && ++attempt < _k_max_write_attempts)); + assert(attempt < _k_max_write_attempts); // This can desync our live heap view + // (Should not happen) + if (success && _nb_addresses.load(std::memory_order_relaxed) >= 0) { // a reset could hit us, just prior to decrementing - --_nb_elements; // fetch_add - 1 + --_nb_addresses; // fetch_add - 1 } return success; } -} // namespace ddprof \ No newline at end of file +void AddressBitset::clear() { + for (unsigned i = 0; i < _k_nb_elements; ++i) { + _address_bitset[i].store(0, std::memory_order_relaxed); + } + _nb_addresses.store(0); +} + +} // namespace ddprof diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index 544d19293..ee0058e0c 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -118,9 +118,9 @@ DDRes AllocationTracker::allocation_tracking_init( void *volatile p = ::malloc(1); ::free(p); - DDRES_CHECK_FWD(instance->init(allocation_profiling_rate, - flags & kDeterministicSampling, - stack_sample_size, ring_buffer)); + DDRES_CHECK_FWD(instance->init( + allocation_profiling_rate, flags & kDeterministicSampling, + flags & kTrackDeallocations, stack_sample_size, ring_buffer)); _instance = instance; state.init(true, flags & kTrackDeallocations); @@ -130,6 +130,7 @@ DDRes AllocationTracker::allocation_tracking_init( DDRes AllocationTracker::init(uint64_t mem_profile_interval, bool deterministic_sampling, + bool 
track_deallocations, uint32_t stack_sample_size, const RingBufferInfo &ring_buffer) { _sampling_interval = mem_profile_interval; @@ -139,6 +140,10 @@ DDRes AllocationTracker::init(uint64_t mem_profile_interval, static_cast(RingBufferType::kMPSCRingBuffer)) { return ddres_error(DD_WHAT_PERFRB); } + if (track_deallocations) { + // 16 times as we want to probability of collision to be low enough + _allocated_address_set.init(liveallocation::kMaxTracked * 16); + } return ddprof::ring_buffer_attach(ring_buffer, &_pevent); } @@ -162,9 +167,6 @@ void AllocationTracker::allocation_tracking_free() { if (!instance) { return; } - if (instance->_state.track_deallocations) { - instance->_dealloc_bitset.clear(); - } TrackerThreadLocalState *tl_state = get_tl_state(); if (unlikely(!tl_state)) { log_once("Error: Unable to find tl_state during %s\n", __FUNCTION__); @@ -232,16 +234,16 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, uint64_t total_size = nsamples * sampling_interval; if (_state.track_deallocations) { - if (_dealloc_bitset.set(addr)) { - if (unlikely(_dealloc_bitset.nb_elements() > + if (_allocated_address_set.set(addr)) { + if (unlikely(_allocated_address_set.nb_addresses() > ddprof::liveallocation::kMaxTracked)) { // Check if we reached max number of elements // Clear elements if we reach too many // todo: should we just stop new live tracking (like java) ? if (IsDDResOK(push_clear_live_allocation(tl_state))) { - _dealloc_bitset.clear(); + _allocated_address_set.clear(); // still set this as we are pushing the allocation to ddprof - _dealloc_bitset.set(addr); + _allocated_address_set.set(addr); } else { log_once( "Error: %s", @@ -257,7 +259,7 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state)); free_on_consecutive_failures(success); if (unlikely(!success) && _state.track_deallocations) { - _dealloc_bitset.unset(addr); + _allocated_address_set.unset(addr); } } @@ -271,7 +273,7 @@ void AllocationTracker::track_deallocation(uintptr_t addr, return; } - if (!_state.track_deallocations || !_dealloc_bitset.unset(addr)) { + if (!_state.track_deallocations || !_allocated_address_set.unset(addr)) { return; } diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc index d3baac0fa..248a4dfc5 100644 --- a/test/address_bitset-ut.cc +++ b/test/address_bitset-ut.cc @@ -4,21 +4,24 @@ // Datadog, Inc. 
#include +#include +#include #include +#include #include "address_bitset.hpp" namespace ddprof { TEST(address_bitset, simple) { - AddressBitset address_bitset; + AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); EXPECT_TRUE(address_bitset.set(0xbadbeef)); EXPECT_FALSE(address_bitset.set(0xbadbeef)); EXPECT_TRUE(address_bitset.unset(0xbadbeef)); } TEST(address_bitset, many_addresses) { - AddressBitset address_bitset; + AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dis( @@ -36,6 +39,52 @@ TEST(address_bitset, many_addresses) { for (auto addr : addresses) { EXPECT_TRUE(address_bitset.unset(addr)); } - EXPECT_EQ(0, address_bitset.nb_elements()); + EXPECT_EQ(0, address_bitset.nb_addresses()); } + +// This test is mostly to tune the hash approach (it would slow down CI) +#ifdef COLLISION_TEST +// Your hash function +# ifdef TEST_IDENTITY +inline uint64_t my_hash(uintptr_t h1) { return h1; } +# else +inline uint64_t my_hash(uintptr_t h1) { return h1 >> 8; } +# endif + +TEST(address_bitset, hash_function) { + // Number of values to hash + const int num_values = 1000000; + + // A large address range (33 bits) + const uintptr_t min_address = 0x100000000; + const uintptr_t max_address = 0x200000000; + + // Create an unordered set to store hashed values + std::unordered_set lower_bits_hashed_values; + + // Initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(min_address, + max_address); + + // Generate and hash random values within the address range + for (int i = 0; i < num_values; ++i) { + uintptr_t address = distribution(gen); + uint32_t lower_bits = my_hash(address) & (8 * 1024 * 1024 - 1); + // Insert the lower 16 bits into the set + lower_bits_hashed_values.insert(lower_bits); + } + + // Calculate collision statistics + int num_collisions = num_values - lower_bits_hashed_values.size(); + double collision_rate = + static_cast(num_collisions) / num_values * 100.0; + + std::cout << "Hash test results:" << std::endl; + std::cout << "Number of values: " << num_values << std::endl; + std::cout << "Number of collisions: " << num_collisions << std::endl; + std::cout << "Collision rate: " << collision_rate << "%" << std::endl; +} +#endif } // namespace ddprof From 770f5795e353291b12d11138117dc019b54c7444 Mon Sep 17 00:00:00 2001 From: r1viollet Date: Tue, 5 Sep 2023 11:21:14 +0200 Subject: [PATCH 06/16] Performance for deallocation code path - Adjust the hashing logic --- include/lib/address_bitset.hpp | 26 ++++------ src/lib/address_bitset.cc | 94 ++++++++++++++++++---------------- src/lib/allocation_tracker.cc | 3 +- test/CMakeLists.txt | 4 +- test/address_bitset-ut.cc | 4 +- test/allocation_tracker-ut.cc | 18 +++++-- 6 files changed, 81 insertions(+), 68 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index 0b3be9798..cb8edb1e1 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -21,21 +21,10 @@ class AddressBitset { public: // Publish 1 Meg as default constexpr static unsigned _k_default_max_addresses = 8 * 1024 * 1024; - AddressBitset(unsigned max_addresses = 0) { init(max_addresses); } - - void init(unsigned max_addresses) { - if (_address_bitset) { - _address_bitset.reset(); - } - _nb_bits = max_addresses; - _k_nb_elements = (_nb_bits) / (_nb_bits_per_elt); - if (_nb_bits) { - _nb_bits_mask = _nb_bits - 1; - _address_bitset = - 
std::make_unique[]>(_k_nb_elements); - } + AddressBitset(int sampling_period = 1, unsigned max_addresses = 0) { + init(sampling_period, max_addresses); } - + void init(int sampling_period, unsigned max_addresses); // returns true if the element was inserted bool set(uintptr_t addr); // returns true if the element was removed @@ -44,7 +33,8 @@ class AddressBitset { int nb_addresses() const { return _nb_addresses; } private: - constexpr static auto _k_max_write_attempts = 4; + static constexpr unsigned _k_max_bits_ignored = 8; + unsigned _lower_bits_ignored; // element type using Elt_t = uint64_t; constexpr static unsigned _nb_bits_per_elt = sizeof(Elt_t) * 8; @@ -57,5 +47,11 @@ class AddressBitset { // We can not use an actual bitset (for atomicity reasons) std::unique_ptr[]> _address_bitset; std::atomic _nb_addresses = 0; + // instead of a hash function we just remove lower bits + // the assumption is that we sample, so we should have less address ranges + // that are close to each other. + // We can't use the sample period as we can still have addresses + // that are close due to sequences of allocations / frees + uint64_t remove_lower_bits(uintptr_t h1) { return h1 >> _lower_bits_ignored; } }; } // namespace ddprof diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index c05a82077..e861d0632 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -4,21 +4,41 @@ // Datadog, Inc. #include "address_bitset.hpp" -#include +#include #include #include namespace ddprof { namespace { -// instead of a hash function we just remove lower bits -// the assumption is that we sample, so we should have less address ranges -// that are close to each other. -// We can't use the sample period as we can still have addresses -// that are close due to sequences of allocations / frees -inline uint64_t remove_lower_bits(uintptr_t h1) { return h1 >> 8; } +int most_significant_bit_pos(int n) { + if (n == 0) + return 0; + int msb = 0; + n = n / 2; + while (n != 0) { + n = n / 2; + msb++; + } + return msb; +} } // namespace +void AddressBitset::init(int sampling_period, unsigned max_addresses) { + _lower_bits_ignored = most_significant_bit_pos(sampling_period); + // we avoid ignoring too many of the lower bits + _lower_bits_ignored = std::min(_k_max_bits_ignored, _lower_bits_ignored); + if (_address_bitset) { + _address_bitset.reset(); + } + _nb_bits = max_addresses; + _k_nb_elements = (_nb_bits) / (_nb_bits_per_elt); + if (_nb_bits) { + _nb_bits_mask = _nb_bits - 1; + _address_bitset = std::make_unique[]>(_k_nb_elements); + } +} + bool AddressBitset::set(uintptr_t addr) { uint64_t hash_addr = remove_lower_bits(addr); unsigned significant_bits = hash_addr & _nb_bits_mask; @@ -27,51 +47,37 @@ bool AddressBitset::set(uintptr_t addr) { // Also, the usage of a power of two value allows for bit operations unsigned index_array = significant_bits / 64; unsigned bit_offset = significant_bits % 64; - bool success = false; - int attempt = 0; - do { - uint64_t old_value = _address_bitset[index_array].load(); - if (old_value & (static_cast(1) << bit_offset)) { - // element is already set (collisions are expected) - return false; - } - uint64_t new_value = old_value; - new_value |= static_cast(1) << bit_offset; - success = _address_bitset[index_array].compare_exchange_weak(old_value, - new_value); - } while (unlikely(!success && ++attempt < _k_max_write_attempts)); - if (success) { - ++_nb_addresses; + uint64_t bit_in_element = (1UL << bit_offset); + uint64_t old_value = 
_address_bitset[index_array].load(); + if (old_value & bit_in_element) { + // element is already set (collisions are expected) + return false; } - return success; + // there is a possible race between checking the value + // and setting it + _address_bitset[index_array].fetch_or(bit_in_element); + ++_nb_addresses; + return true; } bool AddressBitset::unset(uintptr_t addr) { uint64_t hash_addr = remove_lower_bits(addr); int significant_bits = hash_addr & _nb_bits_mask; - auto res = std::div(significant_bits, _nb_bits_per_elt); - unsigned index_array = res.quot; - unsigned bit_offset = res.rem; - bool success = false; - int attempt = 0; - do { - uint64_t old_value = _address_bitset[index_array].load(); - if (!(old_value & (static_cast(1) << bit_offset))) { - // element is not already set (unexpected?) - break; - } - uint64_t new_value = old_value; - new_value ^= static_cast(1) << bit_offset; - success = _address_bitset[index_array].compare_exchange_weak(old_value, - new_value); - } while (unlikely(!success && ++attempt < _k_max_write_attempts)); - assert(attempt < _k_max_write_attempts); // This can desync our live heap view - // (Should not happen) - if (success && _nb_addresses.load(std::memory_order_relaxed) >= 0) { - // a reset could hit us, just prior to decrementing + unsigned index_array = significant_bits / 64; + unsigned bit_offset = significant_bits % 64; + uint64_t bit_in_element = (1UL << bit_offset); + uint64_t old_value = _address_bitset[index_array].load(); + if (!(old_value & bit_in_element)) { + // element is not already unset de-sync + return false; + } + _address_bitset[index_array].fetch_xor(bit_in_element); + // a reset could hit us, just prior to decrementing + // How important is avoiding a negative value? (vs avoiding this check?) 
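+  // [editor's note] concretely: clear() stores 0 to _nb_addresses while a
+  // racing unset() that already flipped its bit can still decrement it
+  // afterwards, briefly driving the count negative; the check below only
+  // narrows that window, it does not close it.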
+ if (likely(_nb_addresses.load(std::memory_order_relaxed) >= 0)) { --_nb_addresses; // fetch_add - 1 } - return success; + return true; } void AddressBitset::clear() { diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index ee0058e0c..64de7453e 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -142,7 +142,8 @@ DDRes AllocationTracker::init(uint64_t mem_profile_interval, } if (track_deallocations) { // 16 times as we want to probability of collision to be low enough - _allocated_address_set.init(liveallocation::kMaxTracked * 16); + _allocated_address_set.init(_sampling_interval, + liveallocation::kMaxTracked * 16); } return ddprof::ring_buffer_attach(ring_buffer, &_pevent); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6e2669668..0f064998a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -297,7 +297,7 @@ add_unit_test( ../src/unwind_helpers.cc ../src/unwind_metrics.cc LIBRARIES ${ELFUTILS_LIBRARIES} llvm-demangle DDProf::Parser - DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=100) + DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=16384) add_unit_test(sys_utils-ut sys_utils-ut.cc ../src/sys_utils.cc) @@ -377,7 +377,7 @@ add_benchmark( ../src/user_override.cc ../src/sys_utils.cc LIBRARIES ${ELFUTILS_LIBRARIES} llvm-demangle DDProf::Parser - DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=10000) + DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=16384) if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug") add_exe( diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc index 248a4dfc5..f23875042 100644 --- a/test/address_bitset-ut.cc +++ b/test/address_bitset-ut.cc @@ -14,14 +14,14 @@ namespace ddprof { TEST(address_bitset, simple) { - AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); + AddressBitset address_bitset(1, AddressBitset::_k_default_max_addresses); EXPECT_TRUE(address_bitset.set(0xbadbeef)); EXPECT_FALSE(address_bitset.set(0xbadbeef)); EXPECT_TRUE(address_bitset.unset(0xbadbeef)); } TEST(address_bitset, many_addresses) { - AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); + AddressBitset address_bitset(1, AddressBitset::_k_default_max_addresses); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dis( diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc index 5f2382506..4875097a8 100644 --- a/test/allocation_tracker-ut.cc +++ b/test/allocation_tracker-ut.cc @@ -150,8 +150,15 @@ TEST(allocation_tracker, max_tracked_allocs) { ASSERT_TRUE(ddprof::AllocationTracker::is_active()); bool clear_found = false; - for (int i = 0; i <= ddprof::liveallocation::kMaxTracked + 10; ++i) { - my_malloc(1, 0x1000 + i); + uint64_t nb_samples = 0; + for (int i = 0; i <= ddprof::liveallocation::kMaxTracked + + ddprof::liveallocation::kMaxTracked / 10; + ++i) { + // We can find pathological allocation patterns that we track badly + // example : 0x1000 + (i*128); + // this will prevent us from using the first bits of the bitset + uintptr_t addr = 0x1000 + (i); + my_malloc(1, addr); ddprof::MPSCRingBufferReader reader{ring_buffer.get_ring_buffer()}; while (reader.available_size() > 0) { auto buf = reader.read_sample(); @@ -159,12 +166,13 @@ TEST(allocation_tracker, max_tracked_allocs) { const perf_event_header *hdr = reinterpret_cast(buf.data()); if (hdr->type == PERF_RECORD_SAMPLE) { + ++nb_samples; perf_event_sample *sample = hdr2samp(hdr, 
perf_event_default_sample_type() | PERF_SAMPLE_ADDR); ASSERT_EQ(sample->period, 1); ASSERT_EQ(sample->pid, getpid()); ASSERT_EQ(sample->tid, ddprof::gettid()); - ASSERT_EQ(sample->addr, 0x1000 + i); + ASSERT_EQ(sample->addr, addr); } else { if (hdr->type == PERF_CUSTOM_EVENT_CLEAR_LIVE_ALLOCATION) { clear_found = true; @@ -172,5 +180,7 @@ TEST(allocation_tracker, max_tracked_allocs) { } } } - ASSERT_TRUE(clear_found); + fprintf(stderr, "Number of found samples %lu (vs max = %d) \n", nb_samples, + ddprof::liveallocation::kMaxTracked); + EXPECT_TRUE(clear_found); } From 78e4c51394469d1e23f65175c64c6f19f679ab79 Mon Sep 17 00:00:00 2001 From: r1viollet Date: Fri, 8 Sep 2023 09:49:51 +0200 Subject: [PATCH 07/16] Allocation tracker performance code path Remove the dependency between bitset and sampling period. Simplify the hashing logic. --- include/lib/address_bitset.hpp | 26 +++++++++++++++----------- src/lib/address_bitset.cc | 25 +++++-------------------- src/lib/allocation_tracker.cc | 3 +-- test/address_bitset-ut.cc | 4 ++-- test/allocation_tracker-ut.cc | 5 +---- 5 files changed, 24 insertions(+), 39 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index cb8edb1e1..a83e81136 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -21,10 +21,8 @@ class AddressBitset { public: // Publish 1 Meg as default constexpr static unsigned _k_default_max_addresses = 8 * 1024 * 1024; - AddressBitset(int sampling_period = 1, unsigned max_addresses = 0) { - init(sampling_period, max_addresses); - } - void init(int sampling_period, unsigned max_addresses); + AddressBitset(unsigned max_addresses = 0) { init(max_addresses); } + void init(unsigned max_addresses); // returns true if the element was inserted bool set(uintptr_t addr); // returns true if the element was removed @@ -33,7 +31,7 @@ class AddressBitset { int nb_addresses() const { return _nb_addresses; } private: - static constexpr unsigned _k_max_bits_ignored = 8; + static constexpr unsigned _k_max_bits_ignored = 4; unsigned _lower_bits_ignored; // element type using Elt_t = uint64_t; @@ -47,11 +45,17 @@ class AddressBitset { // We can not use an actual bitset (for atomicity reasons) std::unique_ptr[]> _address_bitset; std::atomic _nb_addresses = 0; - // instead of a hash function we just remove lower bits - // the assumption is that we sample, so we should have less address ranges - // that are close to each other. 
- // We can't use the sample period as we can still have addresses - // that are close due to sequences of allocations / frees - uint64_t remove_lower_bits(uintptr_t h1) { return h1 >> _lower_bits_ignored; } + + // This is a kind of hash function + // We remove the lower bits (as the alignment constraints makes them useless) + // We fold the address + // Then we only keep the bits that matter for the order in the bitmap + uint32_t remove_lower_bits(uintptr_t h1) { + uint64_t intermediate = h1 >> _lower_bits_ignored; + uint32_t high = (uint32_t)(intermediate >> 32); + uint32_t low = (uint32_t)intermediate; + uint32_t res = high ^ low; + return res & _nb_bits_mask; + } }; } // namespace ddprof diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index e861d0632..b7790b3fe 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -10,24 +10,10 @@ namespace ddprof { -namespace { -int most_significant_bit_pos(int n) { - if (n == 0) - return 0; - int msb = 0; - n = n / 2; - while (n != 0) { - n = n / 2; - msb++; - } - return msb; -} -} // namespace - -void AddressBitset::init(int sampling_period, unsigned max_addresses) { - _lower_bits_ignored = most_significant_bit_pos(sampling_period); - // we avoid ignoring too many of the lower bits - _lower_bits_ignored = std::min(_k_max_bits_ignored, _lower_bits_ignored); +void AddressBitset::init(unsigned max_addresses) { + // Due to memory alignment, on 64 bits we can assume that the first 4 + // bits can be ignored + _lower_bits_ignored = _k_max_bits_ignored; if (_address_bitset) { _address_bitset.reset(); } @@ -40,8 +26,7 @@ void AddressBitset::init(int sampling_period, unsigned max_addresses) { } bool AddressBitset::set(uintptr_t addr) { - uint64_t hash_addr = remove_lower_bits(addr); - unsigned significant_bits = hash_addr & _nb_bits_mask; + uint32_t significant_bits = remove_lower_bits(addr); // As per nsavoire's comment, it is better to use separate operators // than to use the div instruction which generates an extra function call // Also, the usage of a power of two value allows for bit operations diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index 64de7453e..ee0058e0c 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -142,8 +142,7 @@ DDRes AllocationTracker::init(uint64_t mem_profile_interval, } if (track_deallocations) { // 16 times as we want to probability of collision to be low enough - _allocated_address_set.init(_sampling_interval, - liveallocation::kMaxTracked * 16); + _allocated_address_set.init(liveallocation::kMaxTracked * 16); } return ddprof::ring_buffer_attach(ring_buffer, &_pevent); } diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc index f23875042..248a4dfc5 100644 --- a/test/address_bitset-ut.cc +++ b/test/address_bitset-ut.cc @@ -14,14 +14,14 @@ namespace ddprof { TEST(address_bitset, simple) { - AddressBitset address_bitset(1, AddressBitset::_k_default_max_addresses); + AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); EXPECT_TRUE(address_bitset.set(0xbadbeef)); EXPECT_FALSE(address_bitset.set(0xbadbeef)); EXPECT_TRUE(address_bitset.unset(0xbadbeef)); } TEST(address_bitset, many_addresses) { - AddressBitset address_bitset(1, AddressBitset::_k_default_max_addresses); + AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dis( diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc index 
4875097a8..f85421086 100644
--- a/test/allocation_tracker-ut.cc
+++ b/test/allocation_tracker-ut.cc
@@ -154,10 +154,7 @@ TEST(allocation_tracker, max_tracked_allocs) {
   for (int i = 0; i <= ddprof::liveallocation::kMaxTracked +
            ddprof::liveallocation::kMaxTracked / 10;
        ++i) {
-    // We can find pathological allocation patterns that we track badly
-    // example : 0x1000 + (i*128);
-    // this will prevent us from using the first bits of the bitset
-    uintptr_t addr = 0x1000 + (i);
+    uintptr_t addr = 0x1000 + (i * 16);
     my_malloc(1, addr);
     ddprof::MPSCRingBufferReader reader{ring_buffer.get_ring_buffer()};
     while (reader.available_size() > 0) {

From 1499381d1f4d52bc1ccebaf38cf931f11ebf0191 Mon Sep 17 00:00:00 2001
From: r1viollet
Date: Fri, 8 Sep 2023 10:02:59 +0200
Subject: [PATCH 08/16] Deallocation code path performance - test hash function

Re-test the collision rate of the hash we use.
---
 test/address_bitset-ut.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc
index 248a4dfc5..9320b8820 100644
--- a/test/address_bitset-ut.cc
+++ b/test/address_bitset-ut.cc
@@ -42,13 +42,19 @@ TEST(address_bitset, many_addresses) {
   EXPECT_EQ(0, address_bitset.nb_addresses());
 }

-// This test is mostly to tune the hash approach (it would slow down CI)
+// This test is used to tune the hash approach
+// Collision rate is around 5.7%, which will have an impact on sampling
 #ifdef COLLISION_TEST
 // Your hash function
 # ifdef TEST_IDENTITY
 inline uint64_t my_hash(uintptr_t h1) { return h1; }
 # else
-inline uint64_t my_hash(uintptr_t h1) { return h1 >> 8; }
+inline uint64_t my_hash(uintptr_t h1) {
+  h1 = h1 >> 4;
+  uint32_t high = (uint32_t)(h1 >> 32);
+  uint32_t low = (uint32_t)h1;
+  return high ^ low;
+}
 # endif

 TEST(address_bitset, hash_function) {

From b72cb178218169885f1b65a6e2ebb2a96681e81b Mon Sep 17 00:00:00 2001
From: r1viollet
Date: Fri, 8 Sep 2023 11:31:06 +0200
Subject: [PATCH 09/16] Deallocation code path performance

Improve the bitset set and unset flow to have a single atomic operation.
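
To make the intent concrete, here is a minimal standalone sketch (illustrative names, not the patch itself) of the pattern this commit adopts: a single fetch_or / fetch_xor whose returned previous value tells us whether the bit was set, replacing the earlier load-then-update sequence.

    #include <atomic>
    #include <cstdint>

    std::atomic<uint64_t> word{0};

    // Insert: fetch_or returns the value *before* the OR, so a set bit
    // in the returned word means the address was already present.
    bool set_bit(unsigned bit) {
      uint64_t mask = uint64_t{1} << bit;
      return (word.fetch_or(mask) & mask) == 0;
    }

    // Remove: fetch_xor toggles the bit; the previous value tells us
    // whether there was anything to remove.
    bool unset_bit(unsigned bit) {
      uint64_t mask = uint64_t{1} << bit;
      return (word.fetch_xor(mask) & mask) != 0;
    }

One subtlety of the xor form: removing a bit that is not set will set it instead; a later patch in this series switches the removal to fetch_and with the inverted mask to rule that out.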
---
 src/lib/address_bitset.cc | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc
index b7790b3fe..e8b8a7401 100644
--- a/src/lib/address_bitset.cc
+++ b/src/lib/address_bitset.cc
@@ -33,16 +33,16 @@ bool AddressBitset::set(uintptr_t addr) {
   unsigned index_array = significant_bits / 64;
   unsigned bit_offset = significant_bits % 64;
   uint64_t bit_in_element = (1UL << bit_offset);
-  uint64_t old_value = _address_bitset[index_array].load();
-  if (old_value & bit_in_element) {
-    // element is already set (collisions are expected)
-    return false;
-  }
   // there is a possible race between checking the value
   // and setting it
-  _address_bitset[index_array].fetch_or(bit_in_element);
-  ++_nb_addresses;
-  return true;
+  if (!(_address_bitset[index_array].fetch_or(bit_in_element) &
+        bit_in_element)) {
+    // check that the element was not already set
+    ++_nb_addresses;
+    return true;
+  }
+  // Collision, element was already set
+  return false;
 }

 bool AddressBitset::unset(uintptr_t addr) {
@@ -51,18 +51,13 @@
   uint64_t bit_in_element = (1UL << bit_offset);
-  uint64_t old_value = _address_bitset[index_array].load();
-  if (!(old_value & bit_in_element)) {
-    // element is not already unset de-sync
-    return false;
-  }
-  _address_bitset[index_array].fetch_xor(bit_in_element);
-  // a reset could hit us, just prior to decrementing
-  // How important is avoiding a negative value? (vs avoiding this check?)
-  if (likely(_nb_addresses.load(std::memory_order_relaxed) >= 0)) {
+  if ((_address_bitset[index_array].fetch_xor(bit_in_element) &
+       bit_in_element) &&
+      likely(_nb_addresses.load(std::memory_order_relaxed) >= 0)) {
     --_nb_addresses; // fetch_add - 1
+    return true;
   }
-  return true;
+  return false;
 }

 void AddressBitset::clear() {

From e432f0b6302186b3f1a81315442ddf44bb022684 Mon Sep 17 00:00:00 2001
From: r1viollet
Date: Wed, 13 Sep 2023 10:28:20 +0200
Subject: [PATCH 10/16] Deallocation code path performance - minor adjustments

- Enforce a power-of-two size for the bitset
- Change the log_once API to take the call site into account
- Naming updates
---
 include/lib/address_bitset.hpp | 16 +++++----
 include/lib/lib_logger.hpp | 21 ++++++++---
 src/lib/address_bitset.cc | 66 +++++++++++++++++++++++++---------
 src/lib/allocation_tracker.cc | 20 +++++------
 src/lib/dd_profiling.cc | 5 ++-
 test/CMakeLists.txt | 2 ++
 test/address_bitset-ut.cc | 16 ++++-----
 test/lib_logger-ut.cc | 22 ++++++++++++
 test/reentry_guard-ut.cc | 4 +--
 9 files changed, 120 insertions(+), 52 deletions(-)
 create mode 100644 test/lib_logger-ut.cc

diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp
index a83e81136..f9ffae4d8 100644
--- a/include/lib/address_bitset.hpp
+++ b/include/lib/address_bitset.hpp
@@ -20,22 +20,22 @@ class AddressBitset {
   // Note: the hashing step might be bad for cache locality.
public: // Publish 1 Meg as default - constexpr static unsigned _k_default_max_addresses = 8 * 1024 * 1024; + constexpr static unsigned _k_default_bitset_size = 8 * 1024 * 1024; AddressBitset(unsigned max_addresses = 0) { init(max_addresses); } void init(unsigned max_addresses); // returns true if the element was inserted - bool set(uintptr_t addr); + bool add(uintptr_t addr); // returns true if the element was removed - bool unset(uintptr_t addr); + bool remove(uintptr_t addr); void clear(); - int nb_addresses() const { return _nb_addresses; } + int count() const { return _nb_addresses; } private: static constexpr unsigned _k_max_bits_ignored = 4; unsigned _lower_bits_ignored; // element type - using Elt_t = uint64_t; - constexpr static unsigned _nb_bits_per_elt = sizeof(Elt_t) * 8; + using Word_t = uint64_t; + constexpr static unsigned _nb_bits_per_elt = sizeof(Word_t) * 8; // 1 Meg divided in uint64's size // The probability of collision is proportional to the number of elements // already within the bitset @@ -46,11 +46,13 @@ class AddressBitset { std::unique_ptr[]> _address_bitset; std::atomic _nb_addresses = 0; + static unsigned int count_set_bits(Word_t w); + // This is a kind of hash function // We remove the lower bits (as the alignment constraints makes them useless) // We fold the address // Then we only keep the bits that matter for the order in the bitmap - uint32_t remove_lower_bits(uintptr_t h1) { + uint32_t hash_significant_bits(uintptr_t h1) { uint64_t intermediate = h1 >> _lower_bits_ignored; uint32_t high = (uint32_t)(intermediate >> 32); uint32_t low = (uint32_t)intermediate; diff --git a/include/lib/lib_logger.hpp b/include/lib/lib_logger.hpp index 5e2d8c51f..c27fab22e 100644 --- a/include/lib/lib_logger.hpp +++ b/include/lib/lib_logger.hpp @@ -10,13 +10,24 @@ #include namespace ddprof { -template -void log_once(char const *const format, Args... args) { + #ifdef NDEBUG - static std::once_flag flag; - std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); }); +template +void log_once_helper(std::once_flag &flag, Func &&func) { + std::call_once(flag, std::forward(func)); #else - fprintf(stderr, format, args...); +template void log_once_helper(std::once_flag &, Func &&func) { + func(); #endif } + +// create a once flag for the line and file where this is called: +#define LOG_ONCE(format, ...) 
\ + do { \ + static std::once_flag UNIQUE_ONCE_FLAG_##__LINE__##__FILE__; \ + ddprof::log_once_helper(UNIQUE_ONCE_FLAG_##__LINE__##__FILE__, [&]() { \ + fprintf(stderr, (format), ##__VA_ARGS__); \ + }); \ + } while (0) + } // namespace ddprof diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index e8b8a7401..3f601968a 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -10,6 +10,25 @@ namespace ddprof { +namespace { +unsigned round_to_power_of_two(unsigned num) { + if (num == 0) { + return num; + } + // If max_addresses is already a power of two + if ((num & (num - 1)) == 0) { + return num; + } + // not a power of two + unsigned count = 0; + while (num) { + num >>= 1; + count++; + } + return 1 << count; +} +} // namespace + void AddressBitset::init(unsigned max_addresses) { // Due to memory alignment, on 64 bits we can assume that the first 4 // bits can be ignored @@ -17,7 +36,7 @@ void AddressBitset::init(unsigned max_addresses) { if (_address_bitset) { _address_bitset.reset(); } - _nb_bits = max_addresses; + _nb_bits = round_to_power_of_two(max_addresses); _k_nb_elements = (_nb_bits) / (_nb_bits_per_elt); if (_nb_bits) { _nb_bits_mask = _nb_bits - 1; @@ -25,14 +44,14 @@ void AddressBitset::init(unsigned max_addresses) { } } -bool AddressBitset::set(uintptr_t addr) { - uint32_t significant_bits = remove_lower_bits(addr); +bool AddressBitset::add(uintptr_t addr) { + uint32_t significant_bits = hash_significant_bits(addr); // As per nsavoire's comment, it is better to use separate operators // than to use the div instruction which generates an extra function call // Also, the usage of a power of two value allows for bit operations - unsigned index_array = significant_bits / 64; - unsigned bit_offset = significant_bits % 64; - uint64_t bit_in_element = (1UL << bit_offset); + unsigned index_array = significant_bits / sizeof(Word_t); + unsigned bit_offset = significant_bits % sizeof(Word_t); + Word_t bit_in_element = (1UL << bit_offset); // there is a possible race between checking the value // and setting it if (!(_address_bitset[index_array].fetch_or(bit_in_element) & @@ -45,26 +64,39 @@ bool AddressBitset::set(uintptr_t addr) { return false; } -bool AddressBitset::unset(uintptr_t addr) { - uint64_t hash_addr = remove_lower_bits(addr); - int significant_bits = hash_addr & _nb_bits_mask; - unsigned index_array = significant_bits / 64; - unsigned bit_offset = significant_bits % 64; - uint64_t bit_in_element = (1UL << bit_offset); +bool AddressBitset::remove(uintptr_t addr) { + int significant_bits = hash_significant_bits(addr); + unsigned index_array = significant_bits / sizeof(Word_t); + unsigned bit_offset = significant_bits % sizeof(Word_t); + Word_t bit_in_element = (1UL << bit_offset); if ((_address_bitset[index_array].fetch_xor(bit_in_element) & - bit_in_element) && - likely(_nb_addresses.load(std::memory_order_relaxed) >= 0)) { - --_nb_addresses; // fetch_add - 1 + bit_in_element)) { + _nb_addresses.fetch_sub(1, std::memory_order_relaxed); + // in the unlikely event of a clear right at the wrong time, we could + // have a negative number of elements (though count desyncs are acceptable) return true; } return false; } +unsigned int AddressBitset::count_set_bits(Word_t w) { + unsigned int count = 0; + while (w) { + count += w & 1; + w >>= 1; + } + return count; +} + void AddressBitset::clear() { for (unsigned i = 0; i < _k_nb_elements; ++i) { - _address_bitset[i].store(0, std::memory_order_relaxed); + Word_t original_value = 
_address_bitset[i].exchange(0); + // Count number of set bits in original_value + int num_set_bits = count_set_bits(original_value); + if (num_set_bits > 0) { + _nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed); + } } - _nb_addresses.store(0); } } // namespace ddprof diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index ee0058e0c..e3c117bf6 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -67,7 +67,7 @@ TrackerThreadLocalState *AllocationTracker::init_tl_state() { if (res_set) { // should return 0 - log_once("Error: Unable to store tl_state. error %d \n", res_set); + LOG_ONCE("Error: Unable to store tl_state. error %d \n", res_set); delete tl_state; tl_state = nullptr; } @@ -169,7 +169,7 @@ void AllocationTracker::allocation_tracking_free() { } TrackerThreadLocalState *tl_state = get_tl_state(); if (unlikely(!tl_state)) { - log_once("Error: Unable to find tl_state during %s\n", __FUNCTION__); + LOG_ONCE("Error: Unable to find tl_state during %s\n", __FUNCTION__); instance->free(); return; } @@ -234,8 +234,8 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, uint64_t total_size = nsamples * sampling_interval; if (_state.track_deallocations) { - if (_allocated_address_set.set(addr)) { - if (unlikely(_allocated_address_set.nb_addresses() > + if (_allocated_address_set.add(addr)) { + if (unlikely(_allocated_address_set.count() > ddprof::liveallocation::kMaxTracked)) { // Check if we reached max number of elements // Clear elements if we reach too many @@ -243,9 +243,9 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, if (IsDDResOK(push_clear_live_allocation(tl_state))) { _allocated_address_set.clear(); // still set this as we are pushing the allocation to ddprof - _allocated_address_set.set(addr); + _allocated_address_set.add(addr); } else { - log_once( + LOG_ONCE( "Error: %s", "Stop allocation profiling. Unable to clear live allocation \n"); free(); @@ -259,7 +259,7 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state)); free_on_consecutive_failures(success); if (unlikely(!success) && _state.track_deallocations) { - _allocated_address_set.unset(addr); + _allocated_address_set.remove(addr); } } @@ -273,7 +273,7 @@ void AllocationTracker::track_deallocation(uintptr_t addr, return; } - if (!_state.track_deallocations || !_allocated_address_set.unset(addr)) { + if (!_state.track_deallocations || !_allocated_address_set.remove(addr)) { return; } @@ -502,7 +502,7 @@ void AllocationTracker::notify_thread_start() { if (unlikely(!tl_state)) { tl_state = init_tl_state(); if (!tl_state) { - log_once("Error: Unable to start allocation profiling on thread %d", + LOG_ONCE("Error: Unable to start allocation profiling on thread %d", ddprof::gettid()); return; } @@ -522,7 +522,7 @@ void AllocationTracker::notify_fork() { if (unlikely(!tl_state)) { // The state should already exist if we forked. // This would mean that we were not able to create the state before forking - log_once("Error: Unable to retrieve tl state after fork thread %d", + LOG_ONCE("Error: Unable to retrieve tl state after fork thread %d", ddprof::gettid()); return; } else { diff --git a/src/lib/dd_profiling.cc b/src/lib/dd_profiling.cc index b66544621..880c276d3 100644 --- a/src/lib/dd_profiling.cc +++ b/src/lib/dd_profiling.cc @@ -277,8 +277,7 @@ int ddprof_start_profiling_internal() { // fails ? 
g_state.allocation_profiling_started = true; } else { - ddprof::log_once("Error: %s", - "Failure to start allocation profiling\n"); + LOG_ONCE("Error: %s", "Failure to start allocation profiling\n"); } } } catch (const ddprof::DDException &e) { return -1; } @@ -286,7 +285,7 @@ int ddprof_start_profiling_internal() { if (g_state.allocation_profiling_started) { int res = pthread_atfork(nullptr, nullptr, notify_fork); if (res) { - ddprof::log_once("Error:%s", "Unable to setup notify fork"); + LOG_ONCE("Error:%s", "Unable to setup notify fork"); assert(0); } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0f064998a..1ffc85d6c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -355,6 +355,8 @@ add_unit_test(pthread_tls-ut pthread_tls-ut.cc) add_unit_test(address_bitset-ut address_bitset-ut.cc ../src/lib/address_bitset.cc) +add_unit_test(lib_logger-ut ./lib_logger-ut.cc) + add_benchmark(savecontext-bench savecontext-bench.cc ../src/lib/pthread_fixes.cc ../src/lib/savecontext.cc ../src/lib/saveregisters.cc) diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc index 9320b8820..59c872fb1 100644 --- a/test/address_bitset-ut.cc +++ b/test/address_bitset-ut.cc @@ -14,14 +14,14 @@ namespace ddprof { TEST(address_bitset, simple) { - AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); - EXPECT_TRUE(address_bitset.set(0xbadbeef)); - EXPECT_FALSE(address_bitset.set(0xbadbeef)); - EXPECT_TRUE(address_bitset.unset(0xbadbeef)); + AddressBitset address_bitset(AddressBitset::_k_default_bitset_size); + EXPECT_TRUE(address_bitset.add(0xbadbeef)); + EXPECT_FALSE(address_bitset.add(0xbadbeef)); + EXPECT_TRUE(address_bitset.remove(0xbadbeef)); } TEST(address_bitset, many_addresses) { - AddressBitset address_bitset(AddressBitset::_k_default_max_addresses); + AddressBitset address_bitset(AddressBitset::_k_default_bitset_size); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dis( @@ -31,15 +31,15 @@ TEST(address_bitset, many_addresses) { unsigned nb_elements = 100000; for (unsigned i = 0; i < nb_elements; ++i) { uintptr_t addr = dis(gen); - if (address_bitset.set(addr)) { + if (address_bitset.add(addr)) { addresses.push_back(addr); } } EXPECT_TRUE(nb_elements - (nb_elements / 10) < addresses.size()); for (auto addr : addresses) { - EXPECT_TRUE(address_bitset.unset(addr)); + EXPECT_TRUE(address_bitset.remove(addr)); } - EXPECT_EQ(0, address_bitset.nb_addresses()); + EXPECT_EQ(0, address_bitset.count()); } // This test to tune the hash approach diff --git a/test/lib_logger-ut.cc b/test/lib_logger-ut.cc new file mode 100644 index 000000000..cfcf5f598 --- /dev/null +++ b/test/lib_logger-ut.cc @@ -0,0 +1,22 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. 
+ +#include + +#include "lib/lib_logger.hpp" + +namespace ddprof { + +void foo_log(const std::string &str) { LOG_ONCE("%s\n", str.c_str()); } + +TEST(LibLogger, simple) { + LOG_ONCE("something\n"); + LOG_ONCE("else\n"); + foo_log("one"); // this shows + foo_log("two"); // this does not show + const char *some_string = "some_string"; + LOG_ONCE("Some string = %s\n", some_string); +} +} // namespace ddprof diff --git a/test/reentry_guard-ut.cc b/test/reentry_guard-ut.cc index 4b733581e..1c0f6f0aa 100644 --- a/test/reentry_guard-ut.cc +++ b/test/reentry_guard-ut.cc @@ -26,8 +26,8 @@ TEST(ReentryGuardTest, null_init) { guard.register_guard(&reentry_guard); EXPECT_TRUE(guard); { - ddprof::ReentryGuard guard(&reentry_guard); - EXPECT_FALSE(guard); + ddprof::ReentryGuard guard2(&reentry_guard); + EXPECT_FALSE(guard2); } } } From e051a2310e1eb9253d2ade6d4bbf694ea5abe068 Mon Sep 17 00:00:00 2001 From: r1viollet Date: Wed, 13 Sep 2023 10:47:21 +0200 Subject: [PATCH 11/16] Library logger - unit test Improve the unit test for the LOG_ONCE API --- include/lib/address_bitset.hpp | 2 +- test/lib_logger-ut.cc | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index f9ffae4d8..c0b88fcb2 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -21,7 +21,7 @@ class AddressBitset { public: // Publish 1 Meg as default constexpr static unsigned _k_default_bitset_size = 8 * 1024 * 1024; - AddressBitset(unsigned max_addresses = 0) { init(max_addresses); } + explicit AddressBitset(unsigned max_addresses = 0) { init(max_addresses); } void init(unsigned max_addresses); // returns true if the element was inserted bool add(uintptr_t addr); diff --git a/test/lib_logger-ut.cc b/test/lib_logger-ut.cc index cfcf5f598..1fee22033 100644 --- a/test/lib_logger-ut.cc +++ b/test/lib_logger-ut.cc @@ -9,14 +9,22 @@ namespace ddprof { -void foo_log(const std::string &str) { LOG_ONCE("%s\n", str.c_str()); } +void foo_log(const std::string &str) { LOG_ONCE("%s", str.c_str()); } TEST(LibLogger, simple) { - LOG_ONCE("something\n"); - LOG_ONCE("else\n"); - foo_log("one"); // this shows - foo_log("two"); // this does not show + testing::internal::CaptureStderr(); + LOG_ONCE("something "); + LOG_ONCE("else "); + foo_log("one "); // this shows + foo_log("two "); // this does not show const char *some_string = "some_string"; - LOG_ONCE("Some string = %s\n", some_string); + LOG_ONCE("three %s\n", some_string); + std::string output = testing::internal::GetCapturedStderr(); + fprintf(stderr, "%s", output.c_str()); +#ifdef NDEBUG + EXPECT_EQ(output, "something else one three some_string\n"); +#else + EXPECT_EQ(output, "something else one two three some_string\n"); +#endif } } // namespace ddprof From eb4760a71db151e08955defa65c9b1f64528857f Mon Sep 17 00:00:00 2001 From: r1viollet Date: Wed, 13 Sep 2023 15:25:23 +0200 Subject: [PATCH 12/16] Address bitset - minor fix While refactoring I introduced a regression on how indexes were computed --- include/lib/address_bitset.hpp | 4 ++-- src/lib/address_bitset.cc | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index c0b88fcb2..c14cca5c0 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -35,12 +35,12 @@ class AddressBitset { unsigned _lower_bits_ignored; // element type using Word_t = uint64_t; - constexpr static unsigned _nb_bits_per_elt = 
sizeof(Word_t) * 8; + constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8; // 1 Meg divided in uint64's size // The probability of collision is proportional to the number of elements // already within the bitset unsigned _nb_bits = {}; - unsigned _k_nb_elements = {}; + unsigned _k_nb_words = {}; unsigned _nb_bits_mask = {}; // We can not use an actual bitset (for atomicity reasons) std::unique_ptr[]> _address_bitset; diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index 3f601968a..9a4b0eda0 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -37,10 +37,10 @@ void AddressBitset::init(unsigned max_addresses) { _address_bitset.reset(); } _nb_bits = round_to_power_of_two(max_addresses); - _k_nb_elements = (_nb_bits) / (_nb_bits_per_elt); + _k_nb_words = (_nb_bits) / (_nb_bits_per_word); if (_nb_bits) { _nb_bits_mask = _nb_bits - 1; - _address_bitset = std::make_unique[]>(_k_nb_elements); + _address_bitset = std::make_unique[]>(_k_nb_words); } } @@ -49,8 +49,8 @@ bool AddressBitset::add(uintptr_t addr) { // As per nsavoire's comment, it is better to use separate operators // than to use the div instruction which generates an extra function call // Also, the usage of a power of two value allows for bit operations - unsigned index_array = significant_bits / sizeof(Word_t); - unsigned bit_offset = significant_bits % sizeof(Word_t); + unsigned index_array = significant_bits / _nb_bits_per_word; + unsigned bit_offset = significant_bits % _nb_bits_per_word; Word_t bit_in_element = (1UL << bit_offset); // there is a possible race between checking the value // and setting it @@ -66,8 +66,8 @@ bool AddressBitset::add(uintptr_t addr) { bool AddressBitset::remove(uintptr_t addr) { int significant_bits = hash_significant_bits(addr); - unsigned index_array = significant_bits / sizeof(Word_t); - unsigned bit_offset = significant_bits % sizeof(Word_t); + unsigned index_array = significant_bits / _nb_bits_per_word; + unsigned bit_offset = significant_bits % _nb_bits_per_word; Word_t bit_in_element = (1UL << bit_offset); if ((_address_bitset[index_array].fetch_xor(bit_in_element) & bit_in_element)) { @@ -89,7 +89,7 @@ unsigned int AddressBitset::count_set_bits(Word_t w) { } void AddressBitset::clear() { - for (unsigned i = 0; i < _k_nb_elements; ++i) { + for (unsigned i = 0; i < _k_nb_words; ++i) { Word_t original_value = _address_bitset[i].exchange(0); // Count number of set bits in original_value int num_set_bits = count_set_bits(original_value); From f7828c916cc3e282eb664cf196ff9fe44afa476b Mon Sep 17 00:00:00 2001 From: r1viollet Date: Thu, 14 Sep 2023 14:04:30 +0200 Subject: [PATCH 13/16] cppcheck fix Rename local variable to avoid conflict with function name. 
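
Since count_set_bits is the function touched by this rename, an aside: the manual loop it uses is equivalent to std::popcount (C++20), which a later patch in this series adopts and which typically lowers to a single instruction. A minimal sketch, with illustrative names:

    #include <bit>     // std::popcount, C++20
    #include <cstdint>

    // Portable loop, equivalent to what count_set_bits() computes.
    unsigned count_bits_loop(uint64_t w) {
      unsigned set_bits = 0;
      while (w) {
        set_bits += w & 1u;
        w >>= 1;
      }
      return set_bits;
    }

    // C++20 equivalent; usually compiles down to a popcnt instruction.
    unsigned count_bits_std(uint64_t w) { return std::popcount(w); }

    static_assert(std::popcount(uint64_t{0xF0F0}) == 8);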
--- src/lib/address_bitset.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index 9a4b0eda0..8a9372c49 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -80,12 +80,12 @@ bool AddressBitset::remove(uintptr_t addr) { } unsigned int AddressBitset::count_set_bits(Word_t w) { - unsigned int count = 0; + unsigned int set_bits = 0; while (w) { - count += w & 1; + set_bits += w & 1; w >>= 1; } - return count; + return set_bits; } void AddressBitset::clear() { From e557dd15dc5432062042647dcc76883243e85f60 Mon Sep 17 00:00:00 2001 From: r1viollet Date: Sat, 16 Sep 2023 17:49:22 +0200 Subject: [PATCH 14/16] Live allocation - performances Ensure an allocation sample is pushed even if we encounter a collision in the bitset. --- include/lib/address_bitset.hpp | 14 ++++++--- include/lib/lib_logger.hpp | 4 +-- src/ddprof_worker.cc | 3 +- src/lib/address_bitset.cc | 52 +++++++++++++++++++++++----------- src/lib/allocation_tracker.cc | 10 ++++--- 5 files changed, 55 insertions(+), 28 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index c14cca5c0..d9bdbf10f 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -21,8 +21,13 @@ class AddressBitset { public: // Publish 1 Meg as default constexpr static unsigned _k_default_bitset_size = 8 * 1024 * 1024; - explicit AddressBitset(unsigned max_addresses = 0) { init(max_addresses); } - void init(unsigned max_addresses); + explicit AddressBitset(unsigned bitset_size = 0) { init(bitset_size); } + AddressBitset(AddressBitset &&other) noexcept; + AddressBitset &operator=(AddressBitset &&other) noexcept; + + AddressBitset(AddressBitset &other) = delete; + AddressBitset &operator=(AddressBitset &other) = delete; + // returns true if the element was inserted bool add(uintptr_t addr); // returns true if the element was removed @@ -39,15 +44,16 @@ class AddressBitset { // 1 Meg divided in uint64's size // The probability of collision is proportional to the number of elements // already within the bitset - unsigned _nb_bits = {}; + unsigned _bitset_size = {}; unsigned _k_nb_words = {}; unsigned _nb_bits_mask = {}; // We can not use an actual bitset (for atomicity reasons) std::unique_ptr[]> _address_bitset; std::atomic _nb_addresses = 0; - static unsigned int count_set_bits(Word_t w); + void init(unsigned bitset_size); + void move_from(AddressBitset &other) noexcept; // This is a kind of hash function // We remove the lower bits (as the alignment constraints makes them useless) // We fold the address diff --git a/include/lib/lib_logger.hpp b/include/lib/lib_logger.hpp index c27fab22e..eddf3d7e3 100644 --- a/include/lib/lib_logger.hpp +++ b/include/lib/lib_logger.hpp @@ -24,8 +24,8 @@ template void log_once_helper(std::once_flag &, Func &&func) { // create a once flag for the line and file where this is called: #define LOG_ONCE(format, ...) 
\ do { \ - static std::once_flag UNIQUE_ONCE_FLAG_##__LINE__##__FILE__; \ - ddprof::log_once_helper(UNIQUE_ONCE_FLAG_##__LINE__##__FILE__, [&]() { \ + static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__; \ + ddprof::log_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() { \ fprintf(stderr, (format), ##__VA_ARGS__); \ }); \ } while (0) diff --git a/src/ddprof_worker.cc b/src/ddprof_worker.cc index da10aca4f..4d32bb063 100644 --- a/src/ddprof_worker.cc +++ b/src/ddprof_worker.cc @@ -306,7 +306,8 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample, // Aggregate if unwinding went well (todo : fatal error propagation) if (!IsDDResFatal(res)) { struct UnwindState *us = ctx.worker_ctx.us; - if (Any(EventConfMode::kLiveCallgraph & watcher->output_mode)) { + if (Any(EventConfMode::kLiveCallgraph & watcher->output_mode) && + sample->addr) { // null address means we should not account it // Live callgraph mode // for now we hard code the live aggregation mode ctx.worker_ctx.live_allocation.register_allocation( diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index 8a9372c49..00a96497c 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -5,17 +5,18 @@ #include "address_bitset.hpp" #include +#include #include #include namespace ddprof { namespace { -unsigned round_to_power_of_two(unsigned num) { +unsigned round_up_to_power_of_two(unsigned num) { if (num == 0) { return num; } - // If max_addresses is already a power of two + // If num is already a power of two if ((num & (num - 1)) == 0) { return num; } @@ -29,17 +30,43 @@ unsigned round_to_power_of_two(unsigned num) { } } // namespace -void AddressBitset::init(unsigned max_addresses) { +AddressBitset::AddressBitset(AddressBitset &&other) noexcept { + move_from(other); +} + +AddressBitset &AddressBitset::operator=(AddressBitset &&other) noexcept { + if (this != &other) { + move_from(other); + } + return *this; +} + +void AddressBitset::move_from(AddressBitset &other) noexcept { + _lower_bits_ignored = other._lower_bits_ignored; + _bitset_size = other._bitset_size; + _k_nb_words = other._k_nb_words; + _nb_bits_mask = other._nb_bits_mask; + _address_bitset = std::move(other._address_bitset); + _nb_addresses.store(other._nb_addresses.load()); + + // Reset the state of 'other' + other._bitset_size = 0; + other._k_nb_words = 0; + other._nb_bits_mask = 0; + other._nb_addresses = 0; +} + +void AddressBitset::init(unsigned bitset_size) { // Due to memory alignment, on 64 bits we can assume that the first 4 // bits can be ignored _lower_bits_ignored = _k_max_bits_ignored; if (_address_bitset) { _address_bitset.reset(); } - _nb_bits = round_to_power_of_two(max_addresses); - _k_nb_words = (_nb_bits) / (_nb_bits_per_word); - if (_nb_bits) { - _nb_bits_mask = _nb_bits - 1; + _bitset_size = round_up_to_power_of_two(bitset_size); + _k_nb_words = (_bitset_size) / (_nb_bits_per_word); + if (_bitset_size) { + _nb_bits_mask = _bitset_size - 1; _address_bitset = std::make_unique[]>(_k_nb_words); } } @@ -79,20 +106,11 @@ bool AddressBitset::remove(uintptr_t addr) { return false; } -unsigned int AddressBitset::count_set_bits(Word_t w) { - unsigned int set_bits = 0; - while (w) { - set_bits += w & 1; - w >>= 1; - } - return set_bits; -} - void AddressBitset::clear() { for (unsigned i = 0; i < _k_nb_words; ++i) { Word_t original_value = _address_bitset[i].exchange(0); // Count number of set bits in original_value - int num_set_bits = count_set_bits(original_value); + int num_set_bits = std::popcount(original_value); 
if (num_set_bits > 0) { _nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed); } diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index e3c117bf6..cb28362f3 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -142,7 +142,7 @@ DDRes AllocationTracker::init(uint64_t mem_profile_interval, } if (track_deallocations) { // 16 times as we want to probability of collision to be low enough - _allocated_address_set.init(liveallocation::kMaxTracked * 16); + _allocated_address_set = AddressBitset(liveallocation::kMaxTracked * 16); } return ddprof::ring_buffer_attach(ring_buffer, &_pevent); } @@ -252,13 +252,15 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, } } } else { - // we won't track this allocation as we can't double count in the bitset - return; + // null the address to avoid using this for live heap profiling + // pushing a sample is still good to have a good representation + // of the allocations. + addr = 0; } } bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state)); free_on_consecutive_failures(success); - if (unlikely(!success) && _state.track_deallocations) { + if (unlikely(!success) && _state.track_deallocations && addr) { _allocated_address_set.remove(addr); } } From 98227f6cdd14e60acbfe8d17a62e83645616c753 Mon Sep 17 00:00:00 2001 From: r1viollet Date: Mon, 18 Sep 2023 12:22:32 +0200 Subject: [PATCH 15/16] Performance of deallocation code paths Implement a tree like bitset This ensures we account for all possible addresses without collisions --- include/lib/address_bitset.hpp | 51 +++----- include/lib/allocation_tracker.hpp | 2 +- src/ddprof_worker.cc | 1 + src/lib/address_bitset.cc | 192 +++++++++++++++++------------ src/lib/allocation_tracker.cc | 26 +--- test/address_bitset-ut.cc | 14 ++- test/allocation_tracker-ut.cc | 13 +- 7 files changed, 151 insertions(+), 148 deletions(-) diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index d9bdbf10f..b5644e32f 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -11,20 +11,9 @@ namespace ddprof { class AddressBitset { - // Number of bits is the number of addresses we can store - // We have one address per individual bit). - // so lets say you have 1111, you can store 4 addresses. - // We hash the address to a number (to have an equal probability of using - // all bits). Then we use the mask to position this address in our bitset. - // Addr -> Hash -> Mask (to get useable bits) -> Position in the bitset - // Note: the hashing step might be bad for cache locality. 
public: - // Publish 1 Meg as default - constexpr static unsigned _k_default_bitset_size = 8 * 1024 * 1024; - explicit AddressBitset(unsigned bitset_size = 0) { init(bitset_size); } - AddressBitset(AddressBitset &&other) noexcept; - AddressBitset &operator=(AddressBitset &&other) noexcept; - + explicit AddressBitset() {} + ~AddressBitset(); AddressBitset(AddressBitset &other) = delete; AddressBitset &operator=(AddressBitset &other) = delete; @@ -36,34 +25,22 @@ class AddressBitset { int count() const { return _nb_addresses; } private: - static constexpr unsigned _k_max_bits_ignored = 4; - unsigned _lower_bits_ignored; + static constexpr unsigned _lower_bits_ignored = 4; // element type using Word_t = uint64_t; constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8; - // 1 Meg divided in uint64's size - // The probability of collision is proportional to the number of elements - // already within the bitset - unsigned _bitset_size = {}; - unsigned _k_nb_words = {}; - unsigned _nb_bits_mask = {}; - // We can not use an actual bitset (for atomicity reasons) - std::unique_ptr[]> _address_bitset; - std::atomic _nb_addresses = 0; + static constexpr unsigned _k_nb_words = 4096 / 64; // (64) + static constexpr unsigned _nb_entries_per_level = 65536; // 2^16 + + struct LeafLevel { + std::atomic leaf[_k_nb_words]; + }; - void init(unsigned bitset_size); + struct MidLevel { + std::atomic mid[_nb_entries_per_level]; + }; - void move_from(AddressBitset &other) noexcept; - // This is a kind of hash function - // We remove the lower bits (as the alignment constraints makes them useless) - // We fold the address - // Then we only keep the bits that matter for the order in the bitmap - uint32_t hash_significant_bits(uintptr_t h1) { - uint64_t intermediate = h1 >> _lower_bits_ignored; - uint32_t high = (uint32_t)(intermediate >> 32); - uint32_t low = (uint32_t)intermediate; - uint32_t res = high ^ low; - return res & _nb_bits_mask; - } + std::atomic _top_level[_nb_entries_per_level]; + std::atomic _nb_addresses; }; } // namespace ddprof diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index ae869f134..5b4a9eccd 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -116,7 +116,7 @@ class AllocationTracker { PEvent _pevent; bool _deterministic_sampling; - AddressBitset _allocated_address_set; + std::unique_ptr _allocated_address_set; // These can not be tied to the internal state of the instance. 
// The creation of the instance depends on this diff --git a/src/ddprof_worker.cc b/src/ddprof_worker.cc index 4d32bb063..292a86d6a 100644 --- a/src/ddprof_worker.cc +++ b/src/ddprof_worker.cc @@ -306,6 +306,7 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample, // Aggregate if unwinding went well (todo : fatal error propagation) if (!IsDDResFatal(res)) { struct UnwindState *us = ctx.worker_ctx.us; + LG_DBG("sample addr %lx", sample->addr); if (Any(EventConfMode::kLiveCallgraph & watcher->output_mode) && sample->addr) { // null address means we should not account it // Live callgraph mode diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index 00a96497c..ab9a43176 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -8,113 +8,143 @@ #include #include #include +#include namespace ddprof { -namespace { -unsigned round_up_to_power_of_two(unsigned num) { - if (num == 0) { - return num; - } - // If num is already a power of two - if ((num & (num - 1)) == 0) { - return num; - } - // not a power of two - unsigned count = 0; - while (num) { - num >>= 1; - count++; - } - return 1 << count; -} -} // namespace - -AddressBitset::AddressBitset(AddressBitset &&other) noexcept { - move_from(other); -} +bool AddressBitset::add(uintptr_t addr) { + // Extract top 16 bits for top-level index + unsigned top_index = (addr >> 32) & 0xFFFF; -AddressBitset &AddressBitset::operator=(AddressBitset &&other) noexcept { - if (this != &other) { - move_from(other); + // If the entry at this index is null, allocate and try to atomically set it + MidLevel* expected_mid = _top_level[top_index].load(std::memory_order_relaxed); + if (!expected_mid) { + expected_mid = new MidLevel(); + fprintf(stderr, "Added new mid at %u (addr=%lx) \n", top_index, addr); + MidLevel *old_mid = nullptr; + if(!_top_level[top_index].compare_exchange_strong(old_mid, expected_mid)) { + delete expected_mid; + expected_mid = old_mid; + } + assert(expected_mid); + if (!expected_mid) { + // something wend wrong + return false; + } } - return *this; -} -void AddressBitset::move_from(AddressBitset &other) noexcept { - _lower_bits_ignored = other._lower_bits_ignored; - _bitset_size = other._bitset_size; - _k_nb_words = other._k_nb_words; - _nb_bits_mask = other._nb_bits_mask; - _address_bitset = std::move(other._address_bitset); - _nb_addresses.store(other._nb_addresses.load()); - - // Reset the state of 'other' - other._bitset_size = 0; - other._k_nb_words = 0; - other._nb_bits_mask = 0; - other._nb_addresses = 0; -} + // Extract middle 16 bits for mid-level index + unsigned mid_index = (addr >> 16) & 0xFFFF; -void AddressBitset::init(unsigned bitset_size) { - // Due to memory alignment, on 64 bits we can assume that the first 4 - // bits can be ignored - _lower_bits_ignored = _k_max_bits_ignored; - if (_address_bitset) { - _address_bitset.reset(); - } - _bitset_size = round_up_to_power_of_two(bitset_size); - _k_nb_words = (_bitset_size) / (_nb_bits_per_word); - if (_bitset_size) { - _nb_bits_mask = _bitset_size - 1; - _address_bitset = std::make_unique[]>(_k_nb_words); + // If the entry at this mid-level index is null, allocate and try to atomically set it + LeafLevel* expected_leaf = expected_mid->mid[mid_index].load(std::memory_order_acquire); + if (!expected_leaf) { + expected_leaf = new LeafLevel(); + LeafLevel *old_leaf = nullptr; + if (!expected_mid->mid[mid_index].compare_exchange_strong(old_leaf, expected_leaf)) { + delete expected_leaf; + expected_leaf = old_leaf; + } + 
assert(expected_leaf); + if (!expected_leaf) { + // something went wrong + return false; + } } -} -bool AddressBitset::add(uintptr_t addr) { - uint32_t significant_bits = hash_significant_bits(addr); - // As per nsavoire's comment, it is better to use separate operators - // than to use the div instruction which generates an extra function call - // Also, the usage of a power of two value allows for bit operations - unsigned index_array = significant_bits / _nb_bits_per_word; - unsigned bit_offset = significant_bits % _nb_bits_per_word; + // Extract lower 16 bits and ignore lower bits (12 remain) + unsigned leaf_index = (addr & 0xFFFF) >> _lower_bits_ignored; + unsigned index_array = leaf_index / _nb_bits_per_word; + assert(index_array < _k_nb_words); + unsigned bit_offset = leaf_index % _nb_bits_per_word; Word_t bit_in_element = (1UL << bit_offset); - // there is a possible race between checking the value - // and setting it - if (!(_address_bitset[index_array].fetch_or(bit_in_element) & - bit_in_element)) { - // check that the element was not already set + + if (!(expected_leaf->leaf[index_array].fetch_or(bit_in_element) + & bit_in_element)) { ++_nb_addresses; return true; } - // Collision, element was already set - return false; + return false; // Collision } + bool AddressBitset::remove(uintptr_t addr) { - int significant_bits = hash_significant_bits(addr); - unsigned index_array = significant_bits / _nb_bits_per_word; - unsigned bit_offset = significant_bits % _nb_bits_per_word; + // Extract top 16 bits for top-level index + unsigned top_index = (addr >> 32) & 0xFFFF; + + // Try to get the mid-level pointer. If it's null, return false. + MidLevel* mid = _top_level[top_index].load(std::memory_order_acquire); + if (!mid) { + return false; + } + + // Extract middle 16 bits for mid-level index + unsigned mid_index = (addr >> 16) & 0xFFFF; + + // Try to get the leaf-level pointer from the mid-level. If it's null, return false. 
+ LeafLevel* leaf = mid->mid[mid_index].load(std::memory_order_acquire); + if (!leaf) { + return false; + } + + // Extract lower 16 bits and ignore lower bits (12 remain) + unsigned leaf_index = (addr & 0xFFFF) >> _lower_bits_ignored; + unsigned index_array = leaf_index / _nb_bits_per_word; + assert(index_array < _k_nb_words); + unsigned bit_offset = leaf_index % _nb_bits_per_word; Word_t bit_in_element = (1UL << bit_offset); - if ((_address_bitset[index_array].fetch_xor(bit_in_element) & - bit_in_element)) { + + // Use fetch_xor to toggle the bit and check if it was set before toggling + if (leaf->leaf[index_array].fetch_xor(bit_in_element) & bit_in_element) { _nb_addresses.fetch_sub(1, std::memory_order_relaxed); - // in the unlikely event of a clear right at the wrong time, we could - // have a negative number of elements (though count desyncs are acceptable) return true; } + // Otherwise, the bit wasn't set to begin with, so return false return false; } +// TODO: the performance of this clear is horrible +// For now we will avoid calling it void AddressBitset::clear() { - for (unsigned i = 0; i < _k_nb_words; ++i) { - Word_t original_value = _address_bitset[i].exchange(0); - // Count number of set bits in original_value - int num_set_bits = std::popcount(original_value); - if (num_set_bits > 0) { - _nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed); + for (unsigned t = 0; t < _nb_entries_per_level; ++t) { + MidLevel* mid = _top_level[t].load(std::memory_order_acquire); + if (mid) { // if mid-level exists + for (unsigned m = 0; m < _nb_entries_per_level; ++m) { + LeafLevel* leaf = mid->mid[m].load(std::memory_order_acquire); + if (leaf) { // if leaf-level exists + for (unsigned l = 0; l < _k_nb_words; ++l) { + Word_t original_value = leaf->leaf[l].exchange(0); + // Count number of set bits in original_value + int num_set_bits = std::popcount(original_value); + if (num_set_bits > 0) { + _nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed); + } + } + } + } + } + } +} + +AddressBitset::~AddressBitset() { + unsigned mid_count = 0; + unsigned leaf_count = 0; + for (unsigned t = 0; t < _nb_entries_per_level; ++t) { + MidLevel* mid = _top_level[t].load(std::memory_order_acquire); + if (mid) { // if mid-level exists + ++mid_count; + for (unsigned m = 0; m < _nb_entries_per_level; ++m) { + LeafLevel* leaf = mid->mid[m].load(std::memory_order_acquire); + if (leaf) { // if leaf-level exists + ++leaf_count; + delete leaf; + } + } + delete mid; } } + fprintf(stderr, "Mid count = %u \n", mid_count); + fprintf(stderr, "Leaf count = %u \n", leaf_count); } } // namespace ddprof diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index cb28362f3..1da713a9b 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -142,7 +142,7 @@ DDRes AllocationTracker::init(uint64_t mem_profile_interval, } if (track_deallocations) { // 16 times as we want to probability of collision to be low enough - _allocated_address_set = AddressBitset(liveallocation::kMaxTracked * 16); + _allocated_address_set = std::make_unique(); } return ddprof::ring_buffer_attach(ring_buffer, &_pevent); } @@ -234,23 +234,9 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, uint64_t total_size = nsamples * sampling_interval; if (_state.track_deallocations) { - if (_allocated_address_set.add(addr)) { - if (unlikely(_allocated_address_set.count() > - ddprof::liveallocation::kMaxTracked)) { - // Check if we reached max number of elements - // Clear 
elements if we reach too many - // todo: should we just stop new live tracking (like java) ? - if (IsDDResOK(push_clear_live_allocation(tl_state))) { - _allocated_address_set.clear(); - // still set this as we are pushing the allocation to ddprof - _allocated_address_set.add(addr); - } else { - LOG_ONCE( - "Error: %s", - "Stop allocation profiling. Unable to clear live allocation \n"); - free(); - } - } + if (_allocated_address_set->count() < + ddprof::liveallocation::kMaxTracked && + _allocated_address_set->add(addr)) { } else { // null the address to avoid using this for live heap profiling // pushing a sample is still good to have a good representation @@ -261,7 +247,7 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state)); free_on_consecutive_failures(success); if (unlikely(!success) && _state.track_deallocations && addr) { - _allocated_address_set.remove(addr); + _allocated_address_set->remove(addr); } } @@ -275,7 +261,7 @@ void AllocationTracker::track_deallocation(uintptr_t addr, return; } - if (!_state.track_deallocations || !_allocated_address_set.remove(addr)) { + if (!_state.track_deallocations || !_allocated_address_set->remove(addr)) { return; } diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc index 59c872fb1..41c1b747d 100644 --- a/test/address_bitset-ut.cc +++ b/test/address_bitset-ut.cc @@ -14,19 +14,27 @@ namespace ddprof { TEST(address_bitset, simple) { - AddressBitset address_bitset(AddressBitset::_k_default_bitset_size); + AddressBitset address_bitset{}; EXPECT_TRUE(address_bitset.add(0xbadbeef)); EXPECT_FALSE(address_bitset.add(0xbadbeef)); EXPECT_TRUE(address_bitset.remove(0xbadbeef)); } + TEST(address_bitset, many_addresses) { - AddressBitset address_bitset(AddressBitset::_k_default_bitset_size); + AddressBitset address_bitset{}; std::random_device rd; std::mt19937 gen(rd()); +#ifdef WORST_CASE + // if addresses are too spread out, we just can not + // keep track of everything std::uniform_int_distribution dis( 0, std::numeric_limits::max()); - +#else + // with more reasonable ranges, we are OK + std::uniform_int_distribution dis( + 0x1000, 0x100000); +#endif std::vector addresses; unsigned nb_elements = 100000; for (unsigned i = 0; i < nb_elements; ++i) { diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc index f85421086..3bc4a6207 100644 --- a/test/allocation_tracker-ut.cc +++ b/test/allocation_tracker-ut.cc @@ -149,7 +149,7 @@ TEST(allocation_tracker, max_tracked_allocs) { k_default_perf_stack_sample_size, ring_buffer.get_buffer_info()); ASSERT_TRUE(ddprof::AllocationTracker::is_active()); - bool clear_found = false; + bool max_reached = false; uint64_t nb_samples = 0; for (int i = 0; i <= ddprof::liveallocation::kMaxTracked + ddprof::liveallocation::kMaxTracked / 10; @@ -169,15 +169,16 @@ TEST(allocation_tracker, max_tracked_allocs) { ASSERT_EQ(sample->period, 1); ASSERT_EQ(sample->pid, getpid()); ASSERT_EQ(sample->tid, ddprof::gettid()); - ASSERT_EQ(sample->addr, addr); - } else { - if (hdr->type == PERF_CUSTOM_EVENT_CLEAR_LIVE_ALLOCATION) { - clear_found = true; + if (sample->addr == 0) { + max_reached = true; + } + else { + EXPECT_EQ(sample->addr, addr); } } } } fprintf(stderr, "Number of found samples %lu (vs max = %d) \n", nb_samples, ddprof::liveallocation::kMaxTracked); - EXPECT_TRUE(clear_found); + EXPECT_TRUE(max_reached); } From 0dc37a2c081e43f94450249479c15e0604a25cfb Mon Sep 17 00:00:00 2001 From: r1viollet Date: Thu, 21 
Sep 2023 10:02:46 +0200 Subject: [PATCH 16/16] Deallocation code path - Fix the logic - Ensure we have a bounded number of mid level elements --- include/ddprof_perf_event.hpp | 2 + include/lib/address_bitset.hpp | 14 ++--- include/lib/lib_logger.hpp | 13 +++++ src/ddprof_worker.cc | 1 - src/lib/address_bitset.cc | 74 ++++++++++++++++---------- src/lib/allocation_tracker.cc | 3 +- test/address_bitset-ut.cc | 89 ++++++++++++-------------------- test/allocation_tracker-bench.cc | 40 +++++++++++--- test/allocation_tracker-ut.cc | 3 +- 9 files changed, 138 insertions(+), 101 deletions(-) diff --git a/include/ddprof_perf_event.hpp b/include/ddprof_perf_event.hpp index ceda11982..ecd4f8c73 100644 --- a/include/ddprof_perf_event.hpp +++ b/include/ddprof_perf_event.hpp @@ -8,6 +8,8 @@ #include #include +#include "perf.hpp" + // Extend the perf event types // There are <30 different perf events (starting at 1000 seems safe) enum : uint32_t { diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp index b5644e32f..c07c7ead1 100644 --- a/include/lib/address_bitset.hpp +++ b/include/lib/address_bitset.hpp @@ -26,21 +26,23 @@ class AddressBitset { private: static constexpr unsigned _lower_bits_ignored = 4; + static constexpr int _k_default_max_mid_levels = 10; // element type using Word_t = uint64_t; constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8; - static constexpr unsigned _k_nb_words = 4096 / 64; // (64) - static constexpr unsigned _nb_entries_per_level = 65536; // 2^16 + static constexpr unsigned _k_nb_words = 4096 / 64; // (64) + static constexpr unsigned _nb_entries_per_level = 65536; // 2^16 struct LeafLevel { - std::atomic leaf[_k_nb_words]; + std::atomic leaf[_k_nb_words] = {}; }; struct MidLevel { - std::atomic mid[_nb_entries_per_level]; + std::atomic mid[_nb_entries_per_level] = {}; }; - std::atomic _top_level[_nb_entries_per_level]; - std::atomic _nb_addresses; + std::atomic _top_level[_nb_entries_per_level] = {}; + std::atomic _nb_addresses = 0; + std::atomic _nb_mid_levels = 0; }; } // namespace ddprof diff --git a/include/lib/lib_logger.hpp b/include/lib/lib_logger.hpp index eddf3d7e3..c8c83f1e6 100644 --- a/include/lib/lib_logger.hpp +++ b/include/lib/lib_logger.hpp @@ -21,6 +21,19 @@ template void log_once_helper(std::once_flag &, Func &&func) { #endif } +template +void log_always_once_helper(std::once_flag &flag, Func &&func) { + std::call_once(flag, std::forward(func)); +} + +#define LOG_ALWAYS_ONCE(format, ...) \ + do { \ + static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__; \ + ddprof::log_always_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() { \ + fprintf(stderr, (format), ##__VA_ARGS__); \ + }); \ + } while (0) + // create a once flag for the line and file where this is called: #define LOG_ONCE(format, ...) 
\ do { \ diff --git a/src/ddprof_worker.cc b/src/ddprof_worker.cc index 292a86d6a..4d32bb063 100644 --- a/src/ddprof_worker.cc +++ b/src/ddprof_worker.cc @@ -306,7 +306,6 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample, // Aggregate if unwinding went well (todo : fatal error propagation) if (!IsDDResFatal(res)) { struct UnwindState *us = ctx.worker_ctx.us; - LG_DBG("sample addr %lx", sample->addr); if (Any(EventConfMode::kLiveCallgraph & watcher->output_mode) && sample->addr) { // null address means we should not account it // Live callgraph mode diff --git a/src/lib/address_bitset.cc b/src/lib/address_bitset.cc index ab9a43176..041b1bf1e 100644 --- a/src/lib/address_bitset.cc +++ b/src/lib/address_bitset.cc @@ -6,9 +6,10 @@ #include #include +#include #include +#include #include -#include namespace ddprof { @@ -17,18 +18,26 @@ bool AddressBitset::add(uintptr_t addr) { unsigned top_index = (addr >> 32) & 0xFFFF; // If the entry at this index is null, allocate and try to atomically set it - MidLevel* expected_mid = _top_level[top_index].load(std::memory_order_relaxed); + MidLevel *expected_mid = + _top_level[top_index].load(std::memory_order_relaxed); if (!expected_mid) { + if (_nb_mid_levels >= _k_default_max_mid_levels) { + // Every new level adds half a meg of overhead + LOG_ALWAYS_ONCE( + " ddprof: Address bitset reached maximum number of mid levels\n"); + return false; + } expected_mid = new MidLevel(); - fprintf(stderr, "Added new mid at %u (addr=%lx) \n", top_index, addr); MidLevel *old_mid = nullptr; - if(!_top_level[top_index].compare_exchange_strong(old_mid, expected_mid)) { + if (!_top_level[top_index].compare_exchange_strong(old_mid, expected_mid)) { delete expected_mid; expected_mid = old_mid; + } else { + ++_nb_mid_levels; } assert(expected_mid); if (!expected_mid) { - // something wend wrong + // something went wrong return false; } } @@ -36,12 +45,15 @@ bool AddressBitset::add(uintptr_t addr) { // Extract middle 16 bits for mid-level index unsigned mid_index = (addr >> 16) & 0xFFFF; - // If the entry at this mid-level index is null, allocate and try to atomically set it - LeafLevel* expected_leaf = expected_mid->mid[mid_index].load(std::memory_order_acquire); + // If the entry at this mid-level index is null, allocate and try to + // atomically set it + LeafLevel *expected_leaf = + expected_mid->mid[mid_index].load(std::memory_order_acquire); if (!expected_leaf) { expected_leaf = new LeafLevel(); LeafLevel *old_leaf = nullptr; - if (!expected_mid->mid[mid_index].compare_exchange_strong(old_leaf, expected_leaf)) { + if (!expected_mid->mid[mid_index].compare_exchange_strong(old_leaf, + expected_leaf)) { delete expected_leaf; expected_leaf = old_leaf; } @@ -59,31 +71,31 @@ bool AddressBitset::add(uintptr_t addr) { unsigned bit_offset = leaf_index % _nb_bits_per_word; Word_t bit_in_element = (1UL << bit_offset); - if (!(expected_leaf->leaf[index_array].fetch_or(bit_in_element) - & bit_in_element)) { + if (!(expected_leaf->leaf[index_array].fetch_or(bit_in_element) & + bit_in_element)) { ++_nb_addresses; return true; } - return false; // Collision + return false; // Collision } - bool AddressBitset::remove(uintptr_t addr) { // Extract top 16 bits for top-level index unsigned top_index = (addr >> 32) & 0xFFFF; // Try to get the mid-level pointer. If it's null, return false. 
- MidLevel* mid = _top_level[top_index].load(std::memory_order_acquire); - if (!mid) { + MidLevel *mid = _top_level[top_index].load(std::memory_order_acquire); + if (unlikely(!mid)) { return false; } // Extract middle 16 bits for mid-level index unsigned mid_index = (addr >> 16) & 0xFFFF; - // Try to get the leaf-level pointer from the mid-level. If it's null, return false. - LeafLevel* leaf = mid->mid[mid_index].load(std::memory_order_acquire); - if (!leaf) { + // Try to get the leaf-level pointer from the mid-level. If it's null, return + // false. + LeafLevel *leaf = mid->mid[mid_index].load(std::memory_order_acquire); + if (unlikely(!leaf)) { return false; } @@ -94,8 +106,8 @@ bool AddressBitset::remove(uintptr_t addr) { unsigned bit_offset = leaf_index % _nb_bits_per_word; Word_t bit_in_element = (1UL << bit_offset); - // Use fetch_xor to toggle the bit and check if it was set before toggling - if (leaf->leaf[index_array].fetch_xor(bit_in_element) & bit_in_element) { + // Use fetch_and to zero the bit + if (leaf->leaf[index_array].fetch_and(~bit_in_element) & bit_in_element) { _nb_addresses.fetch_sub(1, std::memory_order_relaxed); return true; } @@ -107,11 +119,11 @@ bool AddressBitset::remove(uintptr_t addr) { // For now we will avoid calling it void AddressBitset::clear() { for (unsigned t = 0; t < _nb_entries_per_level; ++t) { - MidLevel* mid = _top_level[t].load(std::memory_order_acquire); - if (mid) { // if mid-level exists + MidLevel *mid = _top_level[t].load(std::memory_order_acquire); + if (mid) { // if mid-level exists for (unsigned m = 0; m < _nb_entries_per_level; ++m) { - LeafLevel* leaf = mid->mid[m].load(std::memory_order_acquire); - if (leaf) { // if leaf-level exists + LeafLevel *leaf = mid->mid[m].load(std::memory_order_acquire); + if (leaf) { // if leaf-level exists for (unsigned l = 0; l < _k_nb_words; ++l) { Word_t original_value = leaf->leaf[l].exchange(0); // Count number of set bits in original_value @@ -127,24 +139,32 @@ void AddressBitset::clear() { } AddressBitset::~AddressBitset() { +#ifdef DEBUG unsigned mid_count = 0; unsigned leaf_count = 0; +#endif for (unsigned t = 0; t < _nb_entries_per_level; ++t) { - MidLevel* mid = _top_level[t].load(std::memory_order_acquire); - if (mid) { // if mid-level exists + MidLevel *mid = _top_level[t].load(std::memory_order_acquire); + if (mid) { // if mid-level exists +#ifdef DEBUG ++mid_count; +#endif for (unsigned m = 0; m < _nb_entries_per_level; ++m) { - LeafLevel* leaf = mid->mid[m].load(std::memory_order_acquire); - if (leaf) { // if leaf-level exists + LeafLevel *leaf = mid->mid[m].load(std::memory_order_acquire); + if (leaf) { // if leaf-level exists +#ifdef DEBUG ++leaf_count; +#endif delete leaf; } } delete mid; } } +#ifdef DEBUG fprintf(stderr, "Mid count = %u \n", mid_count); fprintf(stderr, "Leaf count = %u \n", leaf_count); +#endif } } // namespace ddprof diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index 1da713a9b..53fcf2228 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -234,8 +234,7 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, uint64_t total_size = nsamples * sampling_interval; if (_state.track_deallocations) { - if (_allocated_address_set->count() < - ddprof::liveallocation::kMaxTracked && + if (_allocated_address_set->count() < ddprof::liveallocation::kMaxTracked && _allocated_address_set->add(addr)) { } else { // null the address to avoid using this for live heap profiling diff --git a/test/address_bitset-ut.cc 
b/test/address_bitset-ut.cc
index 41c1b747d..aa7259a99 100644
--- a/test/address_bitset-ut.cc
+++ b/test/address_bitset-ut.cc
@@ -4,13 +4,13 @@
 // Datadog, Inc.
 
 #include <gtest/gtest.h>
+#include "address_bitset.hpp"
 #include <iostream>
 #include <limits>
 #include <random>
+#include <set>
 #include <unordered_set>
 
-#include "address_bitset.hpp"
-
 namespace ddprof {
 
 TEST(address_bitset, simple) {
@@ -20,11 +20,19 @@ TEST(address_bitset, simple) {
   EXPECT_TRUE(address_bitset.remove(0xbadbeef));
 }
 
+TEST(address_bitset, triple_remove) {
+  AddressBitset address_bitset{};
+  EXPECT_TRUE(address_bitset.add(0xbadbeef));
+  EXPECT_TRUE(address_bitset.remove(0xbadbeef));
+  EXPECT_FALSE(address_bitset.remove(0xbadbeef));
+  EXPECT_FALSE(address_bitset.remove(0xbadbeef));
+}
 TEST(address_bitset, many_addresses) {
   AddressBitset address_bitset{};
   std::random_device rd;
   std::mt19937 gen(rd());
 
+// #define WORST_CASE
 #ifdef WORST_CASE
   // if addresses are too spread out, we just can not
   // keep track of everything
   std::uniform_int_distribution<uintptr_t> dis(
       0, std::numeric_limits<uintptr_t>::max());
 #else
   // with more reasonable ranges, we are OK
-  std::uniform_int_distribution<uintptr_t> dis(
-      0x1000, 0x100000);
+  std::uniform_int_distribution<uintptr_t> dis(0x1000, 0x1000000);
 #endif
-  std::vector<uintptr_t> addresses;
+  std::set<uintptr_t> addresses;
   unsigned nb_elements = 100000;
   for (unsigned i = 0; i < nb_elements; ++i) {
-    uintptr_t addr = dis(gen);
-    if (address_bitset.add(addr)) {
-      addresses.push_back(addr);
+    // avoid allocations that are too close to each other
+    uintptr_t addr = (dis(gen) << 4);
+    auto res = addresses.insert(addr);
+    if (res.second) {
+      EXPECT_TRUE(address_bitset.add(addr));
     }
   }
-  EXPECT_TRUE(nb_elements - (nb_elements / 10) < addresses.size());
   for (auto addr : addresses) {
     EXPECT_TRUE(address_bitset.remove(addr));
   }
   EXPECT_EQ(0, address_bitset.count());
+  for (auto addr : addresses) {
+    EXPECT_FALSE(address_bitset.remove(addr));
+  }
 }
 
-// This test to tune the hash approach
-// Collision rate is around 5.7%, which will have an impact on sampling
-#ifdef COLLISION_TEST
-// Your hash function
-# ifdef TEST_IDENTITY
-inline uint64_t my_hash(uintptr_t h1) { return h1; }
-# else
-inline uint64_t my_hash(uintptr_t h1) {
-  h1 = h1 >> 4;
-  uint32_t high = (uint32_t)(h1 >> 32);
-  uint32_t low = (uint32_t)h1;
-  return high ^ low;
-}
-# endif
-
-TEST(address_bitset, hash_function) {
-  // Number of values to hash
-  const int num_values = 1000000;
-
-  // A large address range (33 bits)
-  const uintptr_t min_address = 0x100000000;
-  const uintptr_t max_address = 0x200000000;
-
-  // Create an unordered set to store hashed values
-  std::unordered_set<uint32_t> lower_bits_hashed_values;
-
-  // Initialize random number generator
+// the aim of this test is just to not blow up
+TEST(address_bitset, sparse_addressing) {
+  AddressBitset address_bitset{};
   std::random_device rd;
   std::mt19937 gen(rd());
-  std::uniform_int_distribution<uintptr_t> distribution(min_address,
-                                                        max_address);
-
-  // Generate and hash random values within the address range
-  for (int i = 0; i < num_values; ++i) {
-    uintptr_t address = distribution(gen);
-    uint32_t lower_bits = my_hash(address) & (8 * 1024 * 1024 - 1);
-    // Insert the lower 16 bits into the set
-    lower_bits_hashed_values.insert(lower_bits);
+  // if addresses are too spread out, we just can not
+  // keep track of everything
+  std::uniform_int_distribution<uintptr_t> dis(
+      0, std::numeric_limits<uintptr_t>::max());
+  unsigned nb_elements = 300000;
+  for (unsigned i = 0; i < nb_elements; ++i) {
+    // avoid allocations that are too close to each other
+    uintptr_t addr = (dis(gen) << 4);
+    address_bitset.add(addr);
   }
-
-  // Calculate collision statistics
-  int num_collisions = num_values - lower_bits_hashed_values.size();
-  double collision_rate =
-      static_cast<double>(num_collisions) / num_values * 100.0;
-
-  std::cout << "Hash test results:" << std::endl;
-  std::cout << "Number of values: " << num_values << std::endl;
-  std::cout << "Number of collisions: " << num_collisions << std::endl;
-  std::cout << "Collision rate: " << collision_rate << "%" << std::endl;
 }
-#endif
+
 } // namespace ddprof
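The triple_remove and many_addresses tests above pin down the semantics the tracker depends on: add() reports whether the bit was newly set, and a second remove() of the same address returns false. For reference, here is a minimal single-level sketch of the lock-free idiom these tests exercise; the class name, sizes, and the >> 4 alignment shift are illustrative assumptions, and the patched AddressBitset spreads the same fetch_or/fetch_and logic across its multi-level layout:

#include <array>
#include <atomic>
#include <cstdint>
#include <functional>

// Illustrative single-level variant; roughly 128 KiB per instance, so it
// belongs in static or heap storage rather than on the stack.
class FlatAddressBitset {
public:
  // returns true if the address was not tracked before this call
  bool add(uintptr_t addr) {
    unsigned bit = index_of(addr);
    uint64_t mask = uint64_t{1} << (bit % 64);
    // fetch_or returns the previous word: we inserted only if the bit was 0
    if (!(_words[bit / 64].fetch_or(mask) & mask)) {
      _count.fetch_add(1, std::memory_order_relaxed);
      return true;
    }
    return false; // duplicate or hash collision
  }
  // returns true if the address was tracked; a second remove returns false
  bool remove(uintptr_t addr) {
    unsigned bit = index_of(addr);
    uint64_t mask = uint64_t{1} << (bit % 64);
    // fetch_and clears the bit; the previous word tells us if it was set
    if (_words[bit / 64].fetch_and(~mask) & mask) {
      _count.fetch_sub(1, std::memory_order_relaxed);
      return true;
    }
    return false;
  }
  int count() const { return _count.load(std::memory_order_relaxed); }

private:
  static constexpr unsigned _nb_bits = 1 << 20; // power of two
  static unsigned index_of(uintptr_t addr) {
    // drop the low 4 bits: allocations are assumed at least 16-byte aligned
    return std::hash<uintptr_t>{}(addr >> 4) & (_nb_bits - 1);
  }
  std::array<std::atomic<uint64_t>, _nb_bits / 64> _words{};
  std::atomic<int> _count{0};
};

Because both paths are a single atomic read-modify-write, concurrent allocation and deallocation threads never contend on a mutex; the trade-off, as the WORST_CASE branch above illustrates, is that hash collisions silently drop addresses.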
diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc
index 76194971e..f98477695 100644
--- a/test/allocation_tracker-bench.cc
+++ b/test/allocation_tracker-bench.cc
@@ -8,6 +8,7 @@
 #include <benchmark/benchmark.h>
 
 #include "allocation_tracker.hpp"
+#include "ddprof_perf_event.hpp"
 #include "loghandle.hpp"
 #include "ringbuffer_holder.hpp"
 
@@ -19,25 +20,44 @@ static constexpr uint64_t k_rate = 200000;
 
 // The reader thread is interesting, though it starts dominating the CPU;
 // the benchmark focuses on the capture of allocation events.
-// #define READER_THREAD
+#define READER_THREAD
 
 std::atomic<bool> reader_continue{true};
 std::atomic<bool> error_in_reader{false};
 
 // Reader worker thread function
 void read_buffer(ddprof::RingBufferHolder &holder) {
-  int nb_samples = 0;
+  int nb_alloc_samples = 0;
+  int nb_dealloc_samples = 0;
+  int nb_unknown_samples = 0;
+  error_in_reader = false;
   while (reader_continue) {
     ddprof::MPSCRingBufferReader reader(holder.get_ring_buffer());
     // fprintf(stderr, "size = %lu! \n", reader.available_size());
     auto buf = reader.read_sample();
     if (!buf.empty()) {
-      ++nb_samples;
+      const perf_event_header *hdr =
+          reinterpret_cast<const perf_event_header *>(buf.data());
+      if (hdr->type == PERF_RECORD_SAMPLE) {
+        ++nb_alloc_samples;
+      } else if (hdr->type == PERF_CUSTOM_EVENT_DEALLOCATION) {
+        ++nb_dealloc_samples;
+      } else {
+        ++nb_unknown_samples;
+      }
     }
-    std::chrono::microseconds(10000);
+    std::this_thread::sleep_for(std::chrono::microseconds(10000));
   }
-  fprintf(stderr, "Reader thread exit, nb_samples=%d\n", nb_samples);
-  if (nb_samples == 0) {
+  fprintf(stderr,
+          "Reader thread exit, "
+          "nb_alloc_samples=%d, "
+          "nb_dealloc_samples=%d, "
+          "nb_unknown_samples=%d\n",
+          nb_alloc_samples, nb_dealloc_samples, nb_unknown_samples);
+  if (nb_alloc_samples == 0) {
     error_in_reader = true;
   }
 }
@@ -103,7 +123,7 @@ void perform_memory_operations(bool track_allocations,
                                     (i + 1) * page_size - 1);
 
     for (int j = 0; j < num_allocations; ++j) {
-      uintptr_t addr = dis(gen);
+      uintptr_t addr = dis(gen) << 4;
       my_malloc(1024, addr);
       thread_addresses[i].push_back(addr);
     }
@@ -129,6 +149,9 @@ void perform_memory_operations(bool track_allocations,
     }
   }
 
+  // wait for the reader thread to receive all samples
+  std::this_thread::sleep_for(std::chrono::microseconds(5000));
+
 #ifdef READER_THREAD
   reader_continue = false;
   reader_thread.join();
@@ -244,7 +267,7 @@ void perform_memory_operations_2(bool track_allocations,
   std::uniform_int_distribution<uintptr_t> dis(i * page_size,
                                                (i + 1) * page_size - 1);
   for (int j = 0; j < num_allocations; ++j) {
-    uintptr_t addr = dis(gen);
+    uintptr_t addr = dis(gen) << 4;
     thread_addresses[i].push_back(addr);
   }
 }
@@ -265,6 +288,9 @@
     std::this_thread::sleep_for(std::chrono::microseconds(100));
   }
 
+  // wait for the reader thread to receive all samples
+  std::this_thread::sleep_for(std::chrono::microseconds(1000));
+
 #ifdef READER_THREAD
   reader_continue = false;
   reader_thread.join();
diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc
index 3bc4a6207..fc1282e9f 100644
--- a/test/allocation_tracker-ut.cc
+++
b/test/allocation_tracker-ut.cc @@ -171,8 +171,7 @@ TEST(allocation_tracker, max_tracked_allocs) { ASSERT_EQ(sample->tid, ddprof::gettid()); if (sample->addr == 0) { max_reached = true; - } - else { + } else { EXPECT_EQ(sample->addr, addr); } }
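One piece these hunks rely on but do not show is the add() path that installs the mid- and leaf-level nodes which remove(), clear(), and the destructor later walk with load(std::memory_order_acquire). The following is a sketch of a race-free publication step under those assumptions: the node names mirror the patch, the 16-bit fan-out matches the index extraction in remove(), and the leaf size is illustrative.

#include <atomic>
#include <cstdint>

struct LeafLevel {
  std::atomic<uint64_t> leaf[1024]{}; // leaf size is an assumption
};
struct MidLevel {
  std::atomic<LeafLevel *> mid[65536]{}; // 16-bit fan-out, as in remove()
};

// Returns the leaf for this slot, creating it on first touch. Two racing
// threads may both allocate; compare_exchange_strong keeps exactly one and
// the loser deletes its copy, so readers doing load(acquire) always observe
// either nullptr or a fully constructed node.
LeafLevel *get_or_create_leaf(MidLevel &mid_node, unsigned mid_index) {
  LeafLevel *leaf = mid_node.mid[mid_index].load(std::memory_order_acquire);
  if (leaf) {
    return leaf;
  }
  auto *fresh = new LeafLevel{};
  LeafLevel *expected = nullptr;
  if (mid_node.mid[mid_index].compare_exchange_strong(
          expected, fresh, std::memory_order_acq_rel,
          std::memory_order_acquire)) {
    return fresh; // we published our node
  }
  delete fresh;    // another thread won the race
  return expected; // the failed CAS loaded the winning pointer into expected
}

The losing thread deletes its own copy, so a published node is never replaced; that invariant is what makes the unsynchronized acquire-loads in remove() and the plain traversal in the destructor safe.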