Skip to content

Commit 20acd88

Browse files
yunwei37Copilot
andcommitted
example: add cpu and gpu together (#454)
* docs: Enhance GPU README with detailed explanations of observability challenges and bpftime's approach * docs: Refine GPU README to clarify observability challenges and enhance explanations of profiling limitations * refactor: Rename NV GPU map types to PERGPUTREAD and GPU_RINGBUF for consistency * refactor: Rename BPF_MAP_TYPE_PERGPUTREAD_ARRAY_MAP to BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP for consistency * refactor: Update README and example files for clarity and consistency; add CUDA probe implementation * refactor: Update .gitignore and README for launchlate example; add eBPF probe for cudaLaunchKernel * refactor: Add clock calibration and offset storage in BPF map for accurate timing * refactor: Update histogram implementation for CUDA kernel launch latency tracking * refactor: Update README sections for clarity and consistency * refactor: Enhance README with detailed examples for kernel exit tracing and execution count histogram * Update example/gpu/README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent ecaaefb commit 20acd88

File tree

17 files changed

+927
-57
lines changed

17 files changed

+927
-57
lines changed

attach/nv_attach_impl/trampoline/default_trampoline.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ struct CommSharedMem {
8080
uint64_t time_sum[8];
8181
};
8282

83-
const int BPF_MAP_TYPE_NV_GPU_ARRAY_MAP = 1502;
84-
const int BPF_MAP_TYPE_NV_GPU_RINGBUF_MAP = 1527;
83+
const int BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP = 1502;
84+
const int BPF_MAP_TYPE_GPU_RINGBUF_MAP = 1527;
8585

8686
struct MapBasicInfo {
8787
bool enabled;
@@ -188,7 +188,7 @@ extern "C" __noinline__ __device__ uint64_t _bpf_helper_ext_0001(
188188
auto &req = global_data->req;
189189
// CallRequest req;
190190
const auto &map_info = ::map_info[map];
191-
if (map_info.map_type == BPF_MAP_TYPE_NV_GPU_ARRAY_MAP) {
191+
if (map_info.map_type == BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP) {
192192
auto real_key = *(uint32_t *)(uintptr_t)key;
193193
auto offset = array_map_offset(real_key, map_info);
194194
return (uint64_t)offset;
@@ -210,7 +210,7 @@ extern "C" __noinline__ __device__ uint64_t _bpf_helper_ext_0002(
210210
CommSharedMem *global_data = (CommSharedMem *)constData;
211211
auto &req = global_data->req;
212212
const auto &map_info = ::map_info[map];
213-
if (map_info.map_type == BPF_MAP_TYPE_NV_GPU_ARRAY_MAP) {
213+
if (map_info.map_type == BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP) {
214214
auto real_key = *(uint32_t *)(uintptr_t)key;
215215
auto offset = array_map_offset(real_key, map_info);
216216
simple_memcpy(offset, (void *)(uintptr_t)value,
@@ -285,7 +285,7 @@ _bpf_helper_ext_0025(uint64_t ctx, uint64_t map, uint64_t flags, uint64_t data,
285285
uint64_t data_size)
286286
{
287287
const auto &map_info = ::map_info[map];
288-
if (map_info.map_type == BPF_MAP_TYPE_NV_GPU_RINGBUF_MAP) {
288+
if (map_info.map_type == BPF_MAP_TYPE_GPU_RINGBUF_MAP) {
289289
// printf("Starting perf output, value size=%d, max entries = %d\n",
290290
// map_info.value_size, map_info.max_entries);
291291
auto entry_size = sizeof(ringbuf_header) +

example/gpu/README.md

Lines changed: 171 additions & 21 deletions
Large diffs are not rendered by default.

example/gpu/cuda-counter/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ You need to start two processes:
4848
### 1. Launch the eBPF Program (Server)
4949

5050
```bash
51-
LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so \
51+
BPFTIME_LOG_OUTPUT=console LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so \
5252
example/gpu/cuda-counter/cuda_probe
5353
```
5454

example/gpu/kernelretsnoop/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ The challenge is that GPU hardware and traditional profilers only show you aggre
2222

2323
This simple data unlocks powerful insights into your kernel's behavior.
2424

25-
## Why This Matters: Real Performance Stories
25+
## Why This Matters
2626

2727
### The Case of the Divergent Warp
2828

example/gpu/kernelretsnoop/kernelretsnoop.bpf.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
#include <bpf/bpf_helpers.h>
44
#include <bpf/bpf_tracing.h>
55

6-
#define BPF_MAP_TYPE_NV_GPU_ARRAY_MAP 1502
7-
#define BPF_MAP_TYPE_NV_GPU_RINGBUF_MAP 1527
6+
#define BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP 1502
7+
#define BPF_MAP_TYPE_GPU_RINGBUF_MAP 1527
88

99
struct big_struct {
1010
char s[1024];
1111
};
1212

1313
struct {
14-
__uint(type, BPF_MAP_TYPE_NV_GPU_RINGBUF_MAP);
14+
__uint(type, BPF_MAP_TYPE_GPU_RINGBUF_MAP);
1515
__uint(max_entries, 16);
1616
__type(key, u32);
1717
__type(value, struct big_struct);

example/gpu/launchlate/.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
/launchlate
2+
/.output
3+
/victim*
4+
/vec_add.cpp
5+
/vec_add
6+
/vec_add-new.cpp
7+
launchlate

example/gpu/launchlate/Makefile

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2+
OUTPUT := .output
3+
CLANG ?= clang
4+
LIBBPF_SRC := $(abspath ../../../third_party/libbpf/src)
5+
BPFTOOL_SRC := $(abspath ../../../third_party/bpftool/src)
6+
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
7+
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
8+
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
9+
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
10+
| sed 's/arm.*/arm/' \
11+
| sed 's/aarch64/arm64/' \
12+
| sed 's/ppc64le/powerpc/' \
13+
| sed 's/mips.*/mips/' \
14+
| sed 's/riscv64/riscv/' \
15+
| sed 's/loongarch64/loongarch/')
16+
VMLINUX := ../../../third_party/vmlinux/$(ARCH)/vmlinux.h
17+
# Use our own libbpf API headers and Linux UAPI headers distributed with
18+
# libbpf to avoid dependency on system-wide headers, which could be missing or
19+
# outdated
20+
INCLUDES := -I$(OUTPUT) -I../../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
21+
CFLAGS := -g -Wall
22+
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
23+
24+
APPS = launchlate # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
25+
26+
CARGO ?= $(shell which cargo)
27+
ifeq ($(strip $(CARGO)),)
28+
BZS_APPS :=
29+
else
30+
BZS_APPS := # profile
31+
APPS += $(BZS_APPS)
32+
# Required by libblazesym
33+
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
34+
endif
35+
36+
# Get Clang's default includes on this system. We'll explicitly add these dirs
37+
# to the includes list when compiling with `-target bpf` because otherwise some
38+
# architecture-specific dirs will be "missing" on some architectures/distros -
39+
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
40+
# sys/cdefs.h etc. might be missing.
41+
#
42+
# Use '-idirafter': Don't interfere with include mechanics except where the
43+
# build would have failed anyways.
44+
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
45+
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
46+
47+
ifeq ($(V),1)
48+
Q =
49+
msg =
50+
else
51+
Q = @
52+
msg = @printf ' %-8s %s%s\n' \
53+
"$(1)" \
54+
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
55+
"$(if $(3), $(3))";
56+
MAKEFLAGS += --no-print-directory
57+
endif
58+
59+
define allow-override
60+
$(if $(or $(findstring environment,$(origin $(1))),\
61+
$(findstring command line,$(origin $(1)))),,\
62+
$(eval $(1) = $(2)))
63+
endef
64+
65+
$(call allow-override,CC,$(CROSS_COMPILE)cc)
66+
$(call allow-override,LD,$(CROSS_COMPILE)ld)
67+
68+
.PHONY: all
69+
all: $(APPS) vec_add
70+
71+
vec_add: vec_add.cu
72+
@if command -v nvcc >/dev/null 2>&1; then \
73+
nvcc -cudart shared vec_add.cu -o vec_add -g; \
74+
else \
75+
echo "Warning: CUDA not found, skipping vec_add build"; \
76+
fi
77+
78+
.PHONY: clean
79+
clean:
80+
$(call msg,CLEAN)
81+
$(Q)rm -rf $(OUTPUT) $(APPS) vec_add
82+
83+
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
84+
$(call msg,MKDIR,$@)
85+
$(Q)mkdir -p $@
86+
87+
# Build libbpf
88+
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
89+
$(call msg,LIB,$@)
90+
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
91+
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
92+
INCLUDEDIR= LIBDIR= UAPIDIR= \
93+
install
94+
95+
# Build bpftool
96+
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
97+
$(call msg,BPFTOOL,$@)
98+
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
99+
100+
101+
$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
102+
$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
103+
104+
$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
105+
$(call msg,LIB, $@)
106+
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
107+
108+
$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
109+
$(call msg,LIB,$@)
110+
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
111+
112+
# Build BPF code
113+
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
114+
$(call msg,BPF,$@)
115+
$(Q)$(CLANG) -Xlinker --export-dynamic -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
116+
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
117+
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
118+
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
119+
120+
# Generate BPF skeletons
121+
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
122+
$(call msg,GEN-SKEL,$@)
123+
$(Q)$(BPFTOOL) gen skeleton $< > $@
124+
125+
# Build user-space code
126+
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
127+
128+
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
129+
$(call msg,CC,$@)
130+
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
131+
132+
$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
133+
134+
$(BZS_APPS): $(LIBBLAZESYM_OBJ)
135+
136+
# Build application binary
137+
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
138+
$(call msg,BINARY,$@)
139+
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
140+
141+
# delete failed targets
142+
.DELETE_ON_ERROR:
143+
144+
# keep intermediate (.skel.h, .bpf.o, etc) targets
145+
.SECONDARY:

0 commit comments

Comments
 (0)