
Commit 12a4f1e

yunwei37 and Copilot authored

Enhance CUDA benchmarking and add memtrace (#453)
* docs: Revise GPU README to enhance clarity on eBPF observability challenges and bpftime's unified approach
* docs: Update installation instructions to include new CUDA/GPU attach support and additional build options
* docs: Update NVBit installation instructions and streamline Makefile for auto-download
* docs: Update Makefile and README examples for CUDA benchmarks and correct paths
* Add CUDA vector addition benchmark with eBPF and NVBit instrumentation
  - Introduced a new benchmark for CUDA vector addition that utilizes eBPF and NVBit for performance monitoring.
  - Created Makefile for building the benchmark and its components.
  - Implemented eBPF program (cuda_probe.bpf.c) to trace CUDA kernel invocations.
  - Developed userspace loader (cuda_probe.c) to load and attach the eBPF program.
  - Added NVBit instrumentation (nvbit_vec_add.cu) to measure kernel execution time.
  - Included vector addition kernel (vec_add.cu) for benchmarking.
  - Updated README.md with instructions for building and running the benchmark.
  - Added mem_trace example to trace CUDA kernel invocations using eBPF.
  - Created Makefile and README.md for mem_trace example.
  - Implemented eBPF program (mem_trace.bpf.c) to monitor CUDA kernel invocations.
  - Developed userspace loader (mem_trace.c) for the mem_trace example.
  - Added vector addition kernel (vec_add.cu) for the mem_trace example.
* feat: Add CUDA benchmark runner script for performance comparison
* Update benchmark/gpu/run_cuda_bench.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent edf049a commit 12a4f1e
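The commit adds a benchmark runner (`run_cuda_bench.py`) for comparing kernel timings with and without instrumentation. The core of such a comparison can be sketched in a few lines of Python; the function names and sample numbers below are illustrative, not the script's actual API:

```python
import statistics

def summarize(times_ms):
    """Mean and standard deviation of per-run kernel timings, in ms."""
    return statistics.mean(times_ms), statistics.stdev(times_ms)

def overhead_ratio(baseline_ms, instrumented_ms):
    """How many times slower the instrumented runs are than the baseline."""
    return summarize(instrumented_ms)[0] / summarize(baseline_ms)[0]

# Hypothetical timings: plain vec_add vs. the same kernel under an NVBit tool.
baseline = [1.0, 1.1, 0.9, 1.0]
instrumented = [10.2, 9.8, 10.0, 10.4]
print(f"overhead: {overhead_ratio(baseline, instrumented):.1f}x")  # → overhead: 10.1x
```

A real runner would collect these samples by invoking the benchmark binaries under each configuration rather than hard-coding them.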

File tree

23 files changed: +948 −341 lines

.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -61,3 +61,4 @@ example/attach_implementation/benchmark/wasm-micro-runtime/
 
 build
 target
+
```

README.md

Lines changed: 5 additions & 18 deletions

```diff
@@ -27,8 +27,8 @@ bpftime is not `userspace eBPF VM`, it's a userspace runtime framework includes
 
 ## Key Features
 
-- **Dynamic Binary rewriting**: Run eBPF programs in userspace, attaching them to `Uprobes` and `Syscall tracepoints`: **No manual instrumentation or restart required!**. It can `trace` or `change` the execution of a function, `hook` or `filter` all syscalls of a process safely, and efficiently with an eBPF userspace runtime. Can inject eBPF runtime into any running process without the need for a restart or manual recompilation.
-- **Performance**: Experience up to a `10x` speedup in Uprobe overhead compared to kernel uprobe and uretprobe. Read/Write userspace memory is also faster than kernel eBPF.
+- **Dynamic Binary rewriting**: Run eBPF programs in userspace, attaching them to `Uprobes`, `Syscall tracepoints` and inside `GPU` kernel: **No manual instrumentation or restart required!**. It can `trace` or `change` the execution of a function, `hook` or `filter` all syscalls of a process safely, and efficiently with an eBPF userspace runtime. Can inject eBPF runtime into any running process without the need for a restart or manual recompilation.
+- **Performance**: Experience up to a `10x` speedup in Uprobe overhead compared to kernel uprobe and uretprobe, up to a 10x faster than `NVbit`. Read/Write userspace memory is also faster than kernel eBPF.
 - **Interprocess eBPF Maps**: Implement userspace `eBPF maps` in shared userspace memory for summary aggregation or control plane communication.
 - **Compatibility**: use `existing eBPF toolchains` like clang, libbpf and bpftrace to develop userspace eBPF application without any modifications. Supporting CO-RE via BTF, and offering userspace `ufunc` access.
 - **Multi JIT Support**: Support [llvmbpf](https://github.com/eunomia-bpf/llvmbpf), a high-speed `JIT/AOT` compiler powered by LLVM, or using `ubpf JIT` and INTERPRETER. The vm can be built as `a standalone library` like ubpf.
@@ -112,20 +112,7 @@ See [eunomia.dev/bpftime/documents/usage](https://eunomia.dev/bpftime/documents/
 
 ## Examples & Use Cases
 
-For more examples and details, please refer to [eunomia.dev/bpftime/documents/examples/](https://eunomia.dev/bpftime/documents/examples/) webpage.
-
-Examples including:
-
-- [Minimal examples](https://github.com/eunomia-bpf/bpftime/tree/master/example/minimal) of eBPF programs.
-- eBPF `Uprobe/USDT` tracing and `syscall tracing`:
-  - [sslsniff](https://github.com/eunomia-bpf/bpftime/tree/master/example/sslsniff) for trace SSL/TLS unencrypted data.
-  - [opensnoop](https://github.com/eunomia-bpf/bpftime/tree/master/example/opensnoop) for trace file open syscalls.
-  - More [bcc/libbpf-tools](https://github.com/eunomia-bpf/bpftime/tree/master/example/libbpf-tools).
-  - Run with [bpftrace](https://github.com/eunomia-bpf/bpftime/tree/master/example/bpftrace) commands or scripts.
-- [error injection](https://github.com/eunomia-bpf/bpftime/tree/master/example/error-inject): change function behavior with `bpf_override_return`.
-- Use the eBPF LLVM JIT/AOT vm as [a standalone library](https://github.com/eunomia-bpf/llvmbpf/tree/main/example).
-- Userspace [XDP with DPDK and AF_XDP](https://github.com/userspace-xdp/userspace-xdp)
-- [CUDA eBPF Probe/Retprobe Example](https://github.com/eunomia-bpf/bpftime/tree/master/example/cuda-counter)
+For more examples and details, please refer to [eunomia.dev/bpftime/documents/examples/](https://eunomia.dev/bpftime/documents/examples/) webpage and [example](https://github.com/eunomia-bpf/bpftime/tree/master/example/) dir.
 
 ## In-Depth
 
@@ -153,7 +140,7 @@ Current hook implementation is based on binary rewriting and the underly techniq
 
 - Userspace function hook: [frida-gum](https://github.com/frida/frida-gum)
 - Syscall hooks: [zpoline](https://www.usenix.org/conference/atc23/presentation/yasukata) and [pmem/syscall_intercept](https://github.com/pmem/syscall_intercept).
-- GPU hooks: our new implement by convert eBPF into PTX and inject into GPU kernel. See [attach/nv_attach_impl](https://github.com/eunomia-bpf/bpftime/tree/master/attach/nv_attach_impl) for more details.
+- GPU hooks: our new implementation by converting eBPF into PTX and injecting into GPU kernels. See [attach/nv_attach_impl](https://github.com/eunomia-bpf/bpftime/tree/master/attach/nv_attach_impl) for more details.
 - XDP with DPDK. See the [uXDP paper](https://dl.acm.org/doi/10.1145/3748355.3748360) for more details.
 
 The hook can be easily replaced with other DBI methods or frameworks, to make it a general extension framework. See our OSDI '25 paper [Extending Applications Safely and Efficiently](https://www.usenix.org/conference/osdi25/presentation/zheng-yusheng) for details.
@@ -180,7 +167,7 @@ This project is licensed under the MIT License.
 
 ## Contact and citations
 
-Have any questions or suggestions on future development? Free free to open an issue or contact
+Have any questions or suggestions on future development? Feel free to open an issue or contact
 <yunwei356@gmail.com> !
 
 Our OSDI '25 paper: <https://www.usenix.org/conference/osdi25/presentation/zheng-yusheng>
```
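The README's "Interprocess eBPF Maps" feature describes maps kept in shared userspace memory so that multiple processes can aggregate counters into them. A conceptual analogue of that pattern in Python follows; this is not bpftime's implementation, and a real map would use atomic updates rather than this unsynchronized read-modify-write:

```python
from multiprocessing import shared_memory
import struct

# A tiny "array map": NSLOTS unsigned 64-bit counters in a named
# shared-memory segment that any cooperating process could open by name.
NSLOTS = 4
shm = shared_memory.SharedMemory(create=True, size=8 * NSLOTS)

def bump(slot):
    """Increment one counter, as a probe would on each traced event."""
    off = 8 * slot
    (val,) = struct.unpack_from("<Q", shm.buf, off)
    struct.pack_into("<Q", shm.buf, off, val + 1)

def read(slot):
    (val,) = struct.unpack_from("<Q", shm.buf, 8 * slot)
    return val

for _ in range(3):
    bump(0)
count = read(0)
print(count)  # → 3

shm.close()
shm.unlink()
```

A second process would attach with `shared_memory.SharedMemory(name=...)` and see the same counters, which is the essence of using a shared map for summary aggregation.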

benchmark/cuda/README.md

Lines changed: 0 additions & 244 deletions
This file was deleted.
File renamed without changes.

benchmark/cuda/Makefile renamed to benchmark/gpu/Makefile

Lines changed: 24 additions & 8 deletions

```diff
@@ -81,8 +81,10 @@ else
 NVCC_OPT=-O3
 endif
 
-# Path to NVBit - adjust this to your system's NVBit location
-NVBIT_PATH=$(HOME)/yunwei37/nvbit_release_x86_64/core
+# Path to NVBit - NVBit is NOT part of CUDA Toolkit, will be auto-downloaded if needed
+# Adjust this to your system's NVBit installation location
+NVBIT_PATH?=$(HOME)/nvbit_release_x86_64/core
+NVBIT_RELEASE_DIR?=$(HOME)/nvbit_release_x86_64
 NVBIT_INCLUDES=-I$(NVBIT_PATH)
 
 NVBIT_LIBS=-L$(NVBIT_PATH) -lnvbit
@@ -92,11 +94,23 @@ NVBIT_SOURCES=nvbit_vec_add.cu nvbit_timing_funcs.cu
 NVBIT_OBJECTS=$(NVBIT_SOURCES:.cu=.o)
 CUDA_ARCH?=all
 
-nvbit_vec_add.so: $(NVBIT_OBJECTS) $(NVBIT_PATH)/libnvbit.a
-	$(NVCC) -arch=$(CUDA_ARCH) $(NVCC_OPT) $(NVBIT_OBJECTS) $(NVBIT_LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
+# Auto-download and install NVBit if not found
+$(NVBIT_PATH)/libnvbit.a:
+	@echo "NVBit not found, downloading version 1.7.6 with SM_120 support..."
+	@mkdir -p $(HOME)
+	@cd $(HOME) && \
+		rm -rf nvbit_release_x86_64 && \
+		wget -q https://github.com/NVlabs/NVBit/releases/download/v1.7.6/nvbit-Linux-x86_64-1.7.6.tar.bz2 && \
+		tar xjf nvbit-Linux-x86_64-1.7.6.tar.bz2 && \
+		rm nvbit-Linux-x86_64-1.7.6.tar.bz2
+	@echo "NVBit 1.7.6 installed to $(NVBIT_RELEASE_DIR)"
 
-%.o: %.cu
-	$(NVCC) -dc -c -std=c++11 $(NVBIT_INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(CUDA_ARCH) $(NVCC_OPT) -Xcompiler -fPIC $< -o $@
+nvbit_vec_add.so: vec_add nvbit_vec_add.o nvbit_timing_funcs.o $(NVBIT_PATH)/libnvbit.a
+	g++ -shared -fPIC nvbit_vec_add.o nvbit_timing_funcs.o \
+		$(NVBIT_LIBS) $(NVCC_PATH) -lcuda -lcudart_static -lpthread -ldl -o $@
+
+nvbit_vec_add.o: nvbit_vec_add.cu
+	$(NVCC) -c -std=c++11 $(NVBIT_INCLUDES) -Xcompiler -Wall $(NVCC_OPT) -Xcompiler -fPIC $< -o $@
 
 nvbit_timing_funcs.o: nvbit_timing_funcs.cu
 	$(NVCC) $(NVBIT_INCLUDES) -Xptxas -astoolspatch --keep-device-functions -arch=$(CUDA_ARCH) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@
@@ -166,11 +180,13 @@ $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
 
 # Run the CUDA vector addition benchmark with NVBit instrumentation
 run_nvbit: nvbit_vec_add.so
-	CUDA_VISIBLE_DEVICES=0 LD_PRELOAD=./nvbit_vec_add.so ./vec_add 100
+	CUDA_VISIBLE_DEVICES=0 LD_PRELOAD=./nvbit_vec_add.so ./vec_add
 
 # Run with verbose output
 run_nvbit_verbose: nvbit_vec_add.so
-	CUDA_VISIBLE_DEVICES=0 LD_PRELOAD=./nvbit_vec_add.so TOOL_VERBOSE=1 ./vec_add 100
+	CUDA_VISIBLE_DEVICES=0 LD_PRELOAD=./nvbit_vec_add.so TOOL_VERBOSE=1 ./vec_add
+
+nvbit: nvbit_vec_add.so
 
 # delete failed targets
 .DELETE_ON_ERROR:
```
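The `run_nvbit` target above amounts to launching the benchmark binary with the NVBit tool library preloaded. A runner script can assemble the same environment programmatically; this is a sketch, and the helper name `nvbit_env` is hypothetical:

```python
import os

def nvbit_env(tool_so="./nvbit_vec_add.so", device="0", verbose=False):
    """Environment for running a CUDA binary under an NVBit tool:
    LD_PRELOAD makes the dynamic loader inject the tool at startup."""
    env = dict(os.environ)
    env["CUDA_VISIBLE_DEVICES"] = device  # pin the benchmark to one GPU
    env["LD_PRELOAD"] = tool_so
    if verbose:
        env["TOOL_VERBOSE"] = "1"  # matches the run_nvbit_verbose target
    return env

env = nvbit_env(verbose=True)
print(env["LD_PRELOAD"], env["TOOL_VERBOSE"])  # → ./nvbit_vec_add.so 1
```

Passing this to `subprocess.run(["./vec_add"], env=nvbit_env())` mirrors what `make run_nvbit` does with `LD_PRELOAD=./nvbit_vec_add.so ./vec_add`.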
