Add tracer program for CUDA GPU kernel launches (#1523)

grafana · Jan 14, 2025 · da5252f · da5252f
1 parent 3baaf5c
commit da5252f
Show file tree

Hide file tree

Showing 79 changed files with 224,832 additions and 207,967 deletions.
diff --git a/Makefile b/Makefile
@@ -392,3 +392,8 @@ check-ebpf-integrity: docker-generate
 # Runs protoc inside the $(PROTOC_IMAGE) container, with the current directory mounted at /work,
 # writing the Go and Go-gRPC output for proto/informer.proto under pkg/kubecache.
 .PHONY: protoc-gen
 protoc-gen:
 	docker run --rm -v $(PWD):/work -w /work $(PROTOC_IMAGE) protoc --go_out=pkg/kubecache --go-grpc_out=pkg/kubecache proto/informer.proto
+
+# Formats every C source and header under ./bpf in place with clang-format.
+.PHONY: clang-format
+clang-format:
+	find ./bpf -type f \( -name "*.c" -o -name "*.h" \) -exec clang-format -i {} +
diff --git a/NOTICE b/NOTICE
@@ -35,6 +35,10 @@ The Initial Developer of some parts of the product, which are copied from, deriv
 inspired by the New Relic Infrastructure Agent (https://github.com/newrelic/infrastructure-agent).
 Copyright New Relic.
 
+The Initial Developer of some parts of the product, which are copied from, derived from, or
+inspired by Meta strobelight (https://github.com/facebookincubator/strobelight).
+Copyright Meta.
+
 Grafana Beyla uses third-party libraries or other resources that may be
 distributed under licenses different than the Grafana Beyla software. The licenses for 
 these third-party libraries are listed in the attached third_party_licenses.csv file 

diff --git a/README.md b/README.md
@@ -198,7 +198,7 @@ All of them are enforced on pull requests as part of the Beyla github workflows.
 - `make lint` invokes `golangci-lint` on the Go code
 - `make clang-tidy` invokes `clang-tidy` on the C/eBPF code
 
-`clang-format` is invoked automatically as a `pre-commit` git hook, so there is no explicit `Makefile` target for it.
+`clang-format` is invoked automatically as a `pre-commit` git hook; you can also run it manually with the `clang-format` `Makefile` target (`make clang-format`).
 
 #### Running VM tests
 

diff --git a/bpf/gpuevent.c b/bpf/gpuevent.c
@@ -0,0 +1,122 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// Copyright Grafana Labs
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "vmlinux.h"
+#include "bpf_core_read.h"
+#include "bpf_helpers.h"
+#include "bpf_tracing.h"
+#include "pid.h"
+#include "bpf_dbg.h"
+#include "gpuevent.h"
+
+// License string placed in the object's "license" section.
+char LICENSE[] SEC("license") = "Dual MIT/GPL";
+
+// Unused pointers to the two event types emitted by this program.
+// NOTE(review): presumably these exist so the type definitions are retained in
+// the object's type information for the user-space bindings — confirm.
+const gpu_kernel_launch_t *unused_gpu __attribute__((unused));
+const gpu_malloc_t *unused_gpu1 __attribute__((unused));
+
+// Values stored in the leading `flags` byte of each event so the reader can
+// tell which event type it received.
+#define EVENT_GPU_KERNEL_LAUNCH 1
+#define EVENT_GPU_MALLOC 2
+
+// Ring buffer that both probes in this file reserve and submit events on.
+// NOTE(review): no max_entries is declared here; presumably the size is
+// configured from user space before the map is created — confirm.
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+} rb SEC(".maps");
+
+// Read-only switches consulted by handle_cuda_launch.
+const volatile struct {
+    bool capture_args;  // when true, the kernel launch argument values are read
+    bool capture_stack; // when true, the user stack is captured with bpf_get_stack
+} prog_cfg = {
+    // These defaults will be overridden from user space
+    .capture_args = true,
+    .capture_stack = true,
+};
+
+// The caller uses registers to pass the first 6 arguments to the callee.  Given
+// the arguments in left-to-right order, the order of registers used is: %rdi,
+// %rsi, %rdx, %rcx, %r8, and %r9. Any remaining arguments are passed on the
+// stack in reverse order so that they can be popped off the stack in order.
+//
+// SP_OFFSET(n) yields the address `n` 8-byte slots above the stack pointer
+// stored in `ctx` (which must be in scope at the point of use). The argument
+// and the whole expansion are parenthesized so the macro composes safely with
+// arbitrary expressions, and the arithmetic is done on the integer register
+// value rather than on a `void *`.
+#define SP_OFFSET(offset) ((void *)(PT_REGS_SP(ctx) + ((offset) * 8)))
+
+// Uprobe on cudaLaunchKernel: publishes one gpu_kernel_launch_t event on `rb`
+// for every launch made by a process accepted by valid_pid().
+//
+// The parameters are the six register-passed arguments of the traced call:
+//   func_off  - stored verbatim in the event as kern_func_off
+//   grid_xy   - grid x in the low 32 bits, grid y in the high 32 bits
+//   grid_z    - grid z in the low 32 bits
+//   block_xy  - block x in the low 32 bits, block y in the high 32 bits
+//   block_z   - block z in the low 32 bits
+//   argv      - user-space address of an array of pointers to the kernel arguments
+//
+// Always returns 0.
+SEC("uprobe/cudaLaunchKernel")
+int BPF_KPROBE(handle_cuda_launch,
+               u64 func_off,
+               u64 grid_xy,
+               u64 grid_z,
+               u64 block_xy,
+               u64 block_z,
+               uintptr_t argv) {
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!valid_pid(id)) {
+        return 0;
+    }
+
+    bpf_dbg_printk("=== cudaLaunchKernel %llx ===", id);
+
+    gpu_kernel_launch_t *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+    if (!e) {
+        bpf_dbg_printk("Failed to allocate ringbuf entry");
+        return 0;
+    }
+
+    e->flags = EVENT_GPU_KERNEL_LAUNCH;
+    task_pid(&e->pid_info);
+
+    e->kern_func_off = func_off;
+    e->grid_x = (u32)grid_xy;
+    e->grid_y = (u32)(grid_xy >> 32);
+    e->grid_z = (u32)grid_z;
+    e->block_x = (u32)block_xy;
+    e->block_y = (u32)(block_xy >> 32);
+    e->block_z = (u32)block_z;
+
+    // The stream is not one of the six register arguments; it is read from
+    // the second 8-byte slot above the stack pointer.
+    bpf_probe_read_user(&e->stream, sizeof(uintptr_t), SP_OFFSET(2));
+
+    if (prog_cfg.capture_args) {
+        // Read the Cuda Kernel Launch Arguments
+        for (int i = 0; i < MAX_GPUKERN_ARGS; i++) {
+            const void *arg_addr;
+            // We don't know how many arguments this kernel has until we parse the
+            // signature, so we always attempt to read the maximum number of args,
+            // even if some of these arg values are not valid.
+            bpf_probe_read_user(&arg_addr, sizeof(u64), (const void *)(argv + (i * sizeof(u64))));
+
+            bpf_probe_read_user(&e->args[i], sizeof(arg_addr), arg_addr);
+        }
+    } else {
+        // The reserved record is not guaranteed to be zeroed, so clear the
+        // slots explicitly instead of handing stale bytes to user space.
+        __builtin_memset(e->args, 0, sizeof(e->args));
+    }
+
+    // Default to an empty stack so the reader never sees an uninitialized
+    // length when capture is disabled or fails.
+    e->ustack_sz = 0;
+    if (prog_cfg.capture_stack) {
+        // Read the Cuda Kernel Launch Stack. bpf_get_stack returns the number
+        // of bytes copied or a negative error; dividing a negative value by the
+        // unsigned sizeof would yield a huge bogus entry count.
+        const long stack_bytes =
+            bpf_get_stack(ctx, e->ustack, sizeof(e->ustack), BPF_F_USER_STACK);
+        if (stack_bytes > 0) {
+            e->ustack_sz = (size_t)stack_bytes / sizeof(uint64_t);
+        }
+    }
+
+    bpf_ringbuf_submit(e, 0);
+    return 0;
+}
+
+// Uprobe on cudaMalloc: publishes one gpu_malloc_t event on `rb` carrying the
+// requested allocation size. `devPtr` is not read. Always returns 0.
+SEC("uprobe/cudaMalloc")
+int BPF_KPROBE(handle_cuda_malloc, void **devPtr, size_t size) {
+    const u64 pid_tgid = bpf_get_current_pid_tgid();
+
+    // Nothing is emitted for processes rejected by valid_pid().
+    if (!valid_pid(pid_tgid)) {
+        return 0;
+    }
+
+    bpf_dbg_printk("=== cudaMalloc %llx ===", pid_tgid);
+
+    gpu_malloc_t *event = bpf_ringbuf_reserve(&rb, sizeof(gpu_malloc_t), 0);
+    if (!event) {
+        bpf_dbg_printk("Failed to allocate ringbuf entry");
+        return 0;
+    }
+
+    // Fill in the record: type tag first, then the process info and the size.
+    event->flags = EVENT_GPU_MALLOC;
+    task_pid(&event->pid_info);
+    event->size = (u64)size;
+
+    bpf_ringbuf_submit(event, 0);
+    return 0;
+}
diff --git a/bpf/gpuevent.h b/bpf/gpuevent.h
@@ -0,0 +1,40 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// Copyright Grafana Labs
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef GPU_EVENT_H
+#define GPU_EVENT_H
+
+#include "pid_types.h"
+
+// The GPU_EVENT_H include guard already protects this header against multiple
+// inclusion, so no additional `#pragma once` is needed here.
+#define TASK_COMM_LEN 16
+// Number of 64-bit slots in gpu_kernel_launch.args.
+#define MAX_GPUKERN_ARGS 16
+
+// Number of 64-bit entries in a stack_trace_t; can be overridden by defining
+// MAX_STACK_DEPTH before this header is included.
+#ifndef MAX_STACK_DEPTH
+#define MAX_STACK_DEPTH 128
+#endif
+
+// Fixed-size buffer of MAX_STACK_DEPTH 64-bit entries used to hold a captured stack.
+typedef uint64_t stack_trace_t[MAX_STACK_DEPTH];
+
+// This is the struct that will be serialized on the ring buffer and sent to user space
+// for each traced cudaLaunchKernel call. It is declared packed, so the reader must
+// decode it with the same field order and no padding.
+typedef struct gpu_kernel_launch {
+    u8 flags; // Must be first, we use it to tell what kind of packet we have on the ring buffer
+    pid_info pid_info;              // filled in by task_pid() in the probe
+    uint64_t kern_func_off;         // first argument of the traced call, stored verbatim
+    int grid_x, grid_y, grid_z;     // grid dimensions unpacked from the call's register arguments
+    int block_x, block_y, block_z;  // block dimensions unpacked from the call's register arguments
+    uint64_t stream;                // value read from the caller's stack by the probe
+    uint64_t args[MAX_GPUKERN_ARGS]; // 8 bytes read through each of the first MAX_GPUKERN_ARGS argument pointers
+    size_t ustack_sz;               // number of 64-bit entries stored in ustack
+    stack_trace_t ustack;           // user stack captured with bpf_get_stack
+} __attribute__((packed)) gpu_kernel_launch_t;
+
+// Event serialized on the ring buffer for each traced cudaMalloc call. Packed, so
+// the reader must decode it with the same field order and no padding. Note that
+// `size` precedes `pid_info` here, unlike in gpu_kernel_launch.
+typedef struct gpu_malloc {
+    u8 flags; // Must be first, we use it to tell what kind of packet we have on the ring buffer
+    u64 size;          // the `size` argument of the traced call
+    pid_info pid_info; // filled in by task_pid() in the probe
+} __attribute__((packed)) gpu_malloc_t;
+
+#endif