Skip to content

Commit

Permalink
Add tracer program for CUDA GPU kernel launches (#1523)
Browse files Browse the repository at this point in the history
  • Loading branch information
grcevski authored Jan 14, 2025
1 parent 3baaf5c commit da5252f
Show file tree
Hide file tree
Showing 79 changed files with 224,832 additions and 207,967 deletions.
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,8 @@ check-ebpf-integrity: docker-generate
# Runs protoc inside the $(PROTOC_IMAGE) container, with the current directory
# mounted at /work, to generate the Go and Go gRPC code for proto/informer.proto
# under pkg/kubecache.
.PHONY: protoc-gen
protoc-gen:
docker run --rm -v $(PWD):/work -w /work $(PROTOC_IMAGE) protoc --go_out=pkg/kubecache --go-grpc_out=pkg/kubecache proto/informer.proto

# Formats every C source (*.c) and header (*.h) under ./bpf in place.
# A single `find ... -exec ... {} +` is used instead of `find | xargs` so that
# paths containing whitespace are passed intact and clang-format is not invoked
# at all when no file matches.
.PHONY: clang-format
clang-format:
	find ./bpf -type f \( -name "*.c" -o -name "*.h" \) -exec clang-format -i {} +
4 changes: 4 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ The Initial Developer of some parts of the product, which are copied from, deriv
inspired by the New Relic Infrastructure Agent (https://github.com/newrelic/infrastructure-agent).
Copyright New Relic.

The Initial Developer of some parts of the product, which are copied from, derived from, or
inspired by Meta strobelight (https://github.com/facebookincubator/strobelight).
Copyright Meta.

Grafana Beyla uses third-party libraries or other resources that may be
distributed under licenses different than the Grafana Beyla software. The licenses for
these third-party libraries are listed in the attached third_party_licenses.csv file
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ All of them are enforced on pull requests as part of the Beyla github workflows.
- `make lint` invokes `golangci-lint` on the Go code
- `make clang-tidy` invokes `clang-tidy` on the C/eBPF code

`clang-format` is invoked automatically as a `pre-commit` git hook, so there is no explicit `Makefile` target for it.
`clang-format` is invoked automatically as a `pre-commit` git hook; you can also run it directly with the `clang-format` `Makefile` target.

#### Running VM tests

Expand Down
122 changes: 122 additions & 0 deletions bpf/gpuevent.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// Copyright Grafana Labs
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#include "vmlinux.h"
#include "bpf_core_read.h"
#include "bpf_helpers.h"
#include "bpf_tracing.h"
#include "pid.h"
#include "bpf_dbg.h"
#include "gpuevent.h"

char LICENSE[] SEC("license") = "Dual MIT/GPL";

// These two pointers are not referenced anywhere else in this file (both are
// marked unused).
// NOTE(review): presumably they exist only so that the event struct types are
// kept in the object's type information for the user-space bindings
// generator -- confirm against the build tooling.
const gpu_kernel_launch_t *unused_gpu __attribute__((unused));
const gpu_malloc_t *unused_gpu1 __attribute__((unused));

// Record type tags. Each probe stores one of these in the leading `flags`
// byte of the record it submits to the ring buffer, so the reader can tell a
// gpu_kernel_launch_t record from a gpu_malloc_t record.
#define EVENT_GPU_KERNEL_LAUNCH 1
#define EVENT_GPU_MALLOC 2

// Ring buffer map shared by both probes in this file; every GPU event record
// is reserved from and submitted to it.
// NOTE(review): no max_entries is declared here -- presumably the buffer size
// is set by the loader before the map is created; confirm.
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
} rb SEC(".maps");

// Read-only program configuration consulted by handle_cuda_launch:
//   capture_args  - when true, the kernel launch argument values are read.
//   capture_stack - when true, the user-space stack of the caller is captured.
const volatile struct {
bool capture_args;
bool capture_stack;
} prog_cfg = {
// These defaults will be overridden from user space
.capture_args = true,
.capture_stack = true,
};

// The caller uses registers to pass the first 6 arguments to the callee. Given
// the arguments in left-to-right order, the order of registers used is: %rdi,
// %rsi, %rdx, %rcx, %r8, and %r9. Any remaining arguments are passed on the
// stack in reverse order so that they can be popped off the stack in order.
//
// SP_OFFSET(offset) yields the user-space address of the 8-byte stack slot
// `offset` slots above the stack pointer saved in `ctx`. It requires a
// variable named `ctx` (the probe's register context) to be in scope.
// The argument and the whole expansion are parenthesized so that compound
// arguments such as SP_OFFSET(n + 1) and surrounding operators bind correctly.
#define SP_OFFSET(offset) ((void *)(PT_REGS_SP(ctx) + ((offset) * 8)))

// Uprobe attached to cudaLaunchKernel. For every launch made by a process that
// passes valid_pid(), one gpu_kernel_launch_t record tagged
// EVENT_GPU_KERNEL_LAUNCH is submitted to the `rb` ring buffer.
//
// Probe arguments (the six register-passed arguments of the probed call):
//   func_off  - stored verbatim in kern_func_off.
//   grid_xy   - grid x in the low 32 bits, grid y in the high 32 bits.
//   grid_z    - grid z in the low 32 bits.
//   block_xy  - block x in the low 32 bits, block y in the high 32 bits.
//   block_z   - block z in the low 32 bits.
//   argv      - user-space address of an array of pointers, each pointing at
//               one kernel argument value.
// The stream handle is not in a register; it is read from stack slot 2.
//
// Always returns 0. If the ring buffer has no room the event is dropped.
//
// Memory handed out by bpf_ringbuf_reserve is not guaranteed to be zeroed, so
// every field the consumer reads is written explicitly on every path, including
// when argument or stack capture is disabled or a user-memory read fails.
SEC("uprobe/cudaLaunchKernel")
int BPF_KPROBE(handle_cuda_launch,
               u64 func_off,
               u64 grid_xy,
               u64 grid_z,
               u64 block_xy,
               u64 block_z,
               uintptr_t argv) {
    u64 id = bpf_get_current_pid_tgid();

    if (!valid_pid(id)) {
        return 0;
    }

    bpf_dbg_printk("=== cudaLaunchKernel %llx ===", id);

    gpu_kernel_launch_t *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
    if (!e) {
        bpf_dbg_printk("Failed to allocate ringbuf entry");
        return 0;
    }

    e->flags = EVENT_GPU_KERNEL_LAUNCH;
    task_pid(&e->pid_info);

    e->kern_func_off = func_off;
    e->grid_x = (u32)grid_xy;
    e->grid_y = (u32)(grid_xy >> 32);
    e->grid_z = (u32)grid_z;
    e->block_x = (u32)block_xy;
    e->block_y = (u32)(block_xy >> 32);
    e->block_z = (u32)block_z;

    // The stream is passed on the stack; report 0 if the slot cannot be read.
    if (bpf_probe_read_user(&e->stream, sizeof(e->stream), SP_OFFSET(2)) != 0) {
        e->stream = 0;
    }

    if (prog_cfg.capture_args) {
        // Read the Cuda Kernel Launch Arguments
        for (int i = 0; i < MAX_GPUKERN_ARGS; i++) {
            const void *arg_addr = NULL;

            // We don't know how many arguments this kernel has until we parse
            // the signature, so we always attempt to read the maximum number
            // of args, even if some of these arg values are not valid. Slots
            // that cannot be read are reported as 0.
            e->args[i] = 0;

            if (bpf_probe_read_user(&arg_addr,
                                    sizeof(arg_addr),
                                    (const void *)(argv + (i * sizeof(u64)))) != 0) {
                continue;
            }

            if (bpf_probe_read_user(&e->args[i], sizeof(e->args[i]), arg_addr) != 0) {
                e->args[i] = 0;
            }
        }
    } else {
        // Argument capture is disabled: do not leak stale ring buffer bytes.
        for (int i = 0; i < MAX_GPUKERN_ARGS; i++) {
            e->args[i] = 0;
        }
    }

    // Number of valid entries in e->ustack; stays 0 when stack capture is
    // disabled or fails. Entries past ustack_sz are not initialized.
    e->ustack_sz = 0;
    if (prog_cfg.capture_stack) {
        // Read the Cuda Kernel Launch Stack. bpf_get_stack returns the number
        // of bytes written, or a negative error, which must not be converted
        // into an (enormous) unsigned entry count.
        long stack_bytes = bpf_get_stack(ctx, e->ustack, sizeof(e->ustack), BPF_F_USER_STACK);
        if (stack_bytes > 0) {
            e->ustack_sz = (size_t)stack_bytes / sizeof(uint64_t);
        }
    }

    bpf_ringbuf_submit(e, 0);
    return 0;
}

// Uprobe attached to cudaMalloc. For every call made by a process that passes
// valid_pid(), one gpu_malloc_t record tagged EVENT_GPU_MALLOC and carrying the
// requested allocation size is submitted to the `rb` ring buffer.
//
//   devPtr - first argument of the probed call; not used by this probe.
//   size   - requested allocation size, stored in the record as a u64.
//
// Always returns 0. If the ring buffer has no room the event is dropped.
SEC("uprobe/cudaMalloc")
int BPF_KPROBE(handle_cuda_malloc, void **devPtr, size_t size) {
    u64 pid_tgid = bpf_get_current_pid_tgid();

    if (valid_pid(pid_tgid)) {
        bpf_dbg_printk("=== cudaMalloc %llx ===", pid_tgid);

        gpu_malloc_t *event = bpf_ringbuf_reserve(&rb, sizeof(gpu_malloc_t), 0);
        if (event) {
            // The type tag goes first so the reader can identify the record.
            event->flags = EVENT_GPU_MALLOC;
            task_pid(&event->pid_info);
            event->size = (u64)size;

            bpf_ringbuf_submit(event, 0);
        } else {
            bpf_dbg_printk("Failed to allocate ringbuf entry");
        }
    }

    return 0;
}
40 changes: 40 additions & 0 deletions bpf/gpuevent.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// Copyright Grafana Labs
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_EVENT_H
#define GPU_EVENT_H

#include "pid_types.h"

#pragma once
// NOTE(review): TASK_COMM_LEN is not referenced by any declaration in this
// header -- confirm whether it is still needed.
#define TASK_COMM_LEN 16
// Number of 8-byte argument slots in gpu_kernel_launch.args.
#define MAX_GPUKERN_ARGS 16

// Number of 64-bit entries in a stack_trace_t. May be overridden by defining
// MAX_STACK_DEPTH before this header is included.
#ifndef MAX_STACK_DEPTH
#define MAX_STACK_DEPTH 128
#endif

// Fixed-size buffer holding up to MAX_STACK_DEPTH 64-bit stack entries.
typedef uint64_t stack_trace_t[MAX_STACK_DEPTH];

// Record describing one cudaLaunchKernel call. It is serialized on the ring
// buffer and sent to user space. The struct is packed, so fields are laid out
// back to back in declaration order with no padding.
// NOTE(review): the grid/block fields are declared int, while the producer in
// gpuevent.c assigns them from u32 casts -- confirm the intended signedness.
typedef struct gpu_kernel_launch {
u8 flags; // Must be first, we use it to tell what kind of packet we have on the ring buffer
// Process information filled in by task_pid().
pid_info pid_info;
// First argument of the probed cudaLaunchKernel call, stored verbatim.
uint64_t kern_func_off;
// Grid dimensions of the launch.
int grid_x, grid_y, grid_z;
// Block dimensions of the launch.
int block_x, block_y, block_z;
// 8-byte stream value read from the caller's stack.
uint64_t stream;
// Up to MAX_GPUKERN_ARGS 8-byte values, each read through the corresponding
// pointer in the launch's argument array.
uint64_t args[MAX_GPUKERN_ARGS];
// Number of 8-byte entries captured into ustack (the byte count returned by
// bpf_get_stack divided by sizeof(uint64_t)).
size_t ustack_sz;
// User-space stack of the caller.
stack_trace_t ustack;
} __attribute__((packed)) gpu_kernel_launch_t;

// Record describing one cudaMalloc call. It is serialized on the ring buffer
// and sent to user space. The struct is packed, so fields are laid out back to
// back in declaration order with no padding.
typedef struct gpu_malloc {
u8 flags; // Must be first, we use it to tell what kind of packet we have on the ring buffer
// Requested allocation size (the size argument of the probed call).
u64 size;
// Process information filled in by task_pid().
pid_info pid_info;
} __attribute__((packed)) gpu_malloc_t;

#endif
Loading

0 comments on commit da5252f

Please sign in to comment.