Skip to content

Commit

Permalink
tests/ignition/kdump: add a remote NFS kdump test
Browse files Browse the repository at this point in the history
This way we have good coverage of most-used kdump features.
Some context on the NFS kdump configuration:
coreos/fedora-coreos-tracker#1729

This was previously merged in [1] then reverted in [2] because the nfs
server container was not multi-arch, causing the pipeline to trip on it.

It's also not functioning on systemd 256 (so anything f41 and above),
see [3]

This requires coreos#3917 for
the multi-arch container, and
coreos#3921

[1] coreos@b10d8dc
[2] coreos@af1468c
[3] rhkdump/kdump-utils#52
  • Loading branch information
jbtrystram committed Nov 3, 2024
1 parent 4e45505 commit 3750b7d
Showing 1 changed file with 159 additions and 25 deletions.
184 changes: 159 additions & 25 deletions mantle/kola/tests/ignition/kdump.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,63 @@ func init() {
Tags: []string{"kdump", kola.SkipBaseChecksTag, kola.NeedsInternetTag},
Platforms: []string{"qemu"},
})
register.RegisterTest(&register.Test{
Run: kdumpNFSTest,
ClusterSize: 0,
Name: `kdump.crash.nfs`,
Description: "Verifies kdump logs are exported to NFS destination",
Tags: []string{"kdump", kola.SkipBaseChecksTag, kola.NeedsInternetTag},
Platforms: []string{"qemu"},
})
}

// testRemoteKdump tests the remote kdump feature by:
//   - making sure kdump.service is active on kdump_machine
//   - crashing the kernel of kdump_machine
//   - polling remote_machine until a vmcore file shows up under crash_path
//
// Any failure or timeout aborts the test through c.Fatalf.
func testRemoteKdump(c cluster.TestCluster, kdump_machine platform.Machine, remote_machine platform.Machine, crash_path string) {
	// Wait for kdump to become active.
	// 3 minutes should be enough to generate the kdump initramfs.
	err := util.Retry(12, 15*time.Second, func() error {
		kdump_status, err := c.SSH(kdump_machine, "systemctl is-active kdump.service")
		if err != nil {
			return err
		}
		if string(kdump_status) == "inactive" {
			return fmt.Errorf("Kdump.service is not ready: %s.", string(kdump_status))
		}
		return nil
	})
	if err != nil {
		c.Fatalf("Timed out while waiting for kdump.service to be ready: %v", err)
	}

	// Crash the kernel.
	// Use systemd-run because directly calling `echo c > ...` will always
	// throw an error as the kernel immediately hangs.
	_, err = c.SSH(kdump_machine, "sudo systemd-run sh -c 'sleep 5 && echo c > /proc/sysrq-trigger'")
	if err != nil {
		c.Fatalf("failed to queue kernel crash: %v", err)
	}

	// Wait for kdump to create the vmcore dump on the remote host.
	err = util.Retry(8, 10*time.Second, func() error {
		// Look for the crash files created on the remote machine.
		// The pattern is single-quoted so the remote shell hands it to find
		// verbatim instead of expanding it against its working directory.
		logs, err := c.SSH(remote_machine, fmt.Sprintf("find %s -type f -name 'vmcore*'", crash_path))
		if err != nil {
			return fmt.Errorf("failed to search for vmcore: %w", err)
		}
		// Test the length rather than nil-ness: an empty but non-nil output
		// also means that find matched nothing.
		if len(logs) == 0 {
			return fmt.Errorf("No vmcore created on remote host")
		}
		return nil
	})
	if err != nil {
		c.Fatalf("Timed out while waiting for kdump to create vmcore files: %v", err)
	}
}

// The destination VM for kdump logs
Expand Down Expand Up @@ -180,45 +237,122 @@ kernel_arguments:
c.Fatalf("Unable to create test machine: %v", err)
}

// Wait for kdump to become active
// 3 minutes should be enough to generate the kdump initramfs
err = util.Retry(12, 15*time.Second, func() error {
testRemoteKdump(c, kdump_machine, ssh_host.Machine, "/home/core/crash")
}

kdump_status, err := c.SSH(kdump_machine, "systemctl is-active kdump.service")
// NfsServer describes the destination VM for kdump logs over NFS.
type NfsServer struct {
// Machine is the server VM; the NFS test runs SSH commands against it.
Machine platform.Machine
// MachineAddress is the address substituted into the `nfs` line of the
// kdump.conf written for the crashing machine.
MachineAddress string
}

if err != nil {
return err
} else if string(kdump_status) == "inactive" {
return fmt.Errorf(fmt.Sprintf("Kdump.service is not ready: %s.", string(kdump_status)))
}
return nil
})
// setupNFSMachine starts a QEMU machine from a Butane config that installs
// /etc/containers/systemd/nfs.container and creates /var/nfs/crash, and
// returns it wrapped in an NfsServer.
//
// It panics when the cluster is not a *qemu.Cluster.
func setupNFSMachine(c cluster.TestCluster) NfsServer {
var m platform.Machine
var err error

// Forward both SSH and NFS from the host to the guest.
options := platform.QemuMachineOptions{
HostForwardPorts: []platform.HostForwardPort{
{Service: "ssh", HostPort: 0, GuestPort: 22},
// Kdump NFS option does not allow a custom port
{Service: "nfs", HostPort: 2049, GuestPort: 2049},
},
}

// The container mounts /var/nfs at /export, uses host networking and runs
// privileged.
nfs_server_butane := conf.Butane(`variant: fcos
version: 1.5.0
storage:
files:
- path: /etc/containers/systemd/nfs.container
overwrite: true
contents:
inline: |
[Container]
Image=quay.io/coreos-assembler/nfs
Volume=/var/nfs:/export
Network=host
PodmanArgs=--privileged
[Install]
WantedBy=default.target
directories:
- path: /var/nfs/crash`)

// start the machine
switch c := c.Cluster.(type) {
// These cases have to be separated because when put together to the same case statement
// the golang compiler no longer checks that the individual types in the case have the
// NewMachineWithQemuOptions function, but rather whether platform.Cluster
// does which fails
case *qemu.Cluster:
m, err = c.NewMachineWithQemuOptions(nfs_server_butane, options)
default:
panic("unreachable")
}
if err != nil {
// NOTE(review): this message mentions kdump.service although the error comes
// from machine creation; it looks like a removed-side diff line — confirm.
c.Fatalf("Timed out while waiting for kdump.service to be ready: %v", err)
c.Fatal(err)
}

// NOTE(review): kdump_machine is not declared in this function; the crash
// command below looks like removed-side diff residue — confirm.
// crash the kernel
// use systemd-run because direclty calling `echo c...` will alaways
// throw an error as the kernel immediately hangs.
_, err = c.SSH(kdump_machine, "sudo systemd-run sh -c 'sleep 5 && echo c > /proc/sysrq-trigger'")
return NfsServer{
Machine: m,
// presumably the host address as seen from QEMU user-mode networking, so
// that guests reach the forwarded port 2049 — TODO confirm
MachineAddress: "10.0.2.2",
}
}

// kdumpNFSTest starts the NFS server machine, boots a second machine whose
// kdump.conf points at that server, waits for nfs.service on the server and
// then calls testRemoteKdump with /var/nfs/crash as the expected vmcore
// location.
//
// NOTE(review): as shown, this body mixes in statements that reference
// ssh_host and logs, which are not declared here; they look like removed-side
// diff lines — confirm before relying on this listing.
func kdumpNFSTest(c cluster.TestCluster) {
nfs_host := setupNFSMachine(c)

// The single %s in the config is filled with the NFS server address.
butane := conf.Butane(fmt.Sprintf(`variant: fcos
version: 1.5.0
storage:
files:
- path: /etc/kdump.conf
overwrite: true
contents:
inline: |
nfs %s:/
path /crash
core_collector makedumpfile -l --message-level 1 -d 31
extra_bins /sbin/mount.nfs
extra_modules nfs nfsv3 nfs_layout_nfsv41_files blocklayoutdriver nfs_layout_flexfiles nfs_layout_nfsv41_files
systemd:
units:
- name: kdump.service
enabled: true
dropins:
- name: debug.conf
contents: |
[Service]
Environment="debug=1"
kernel_arguments:
should_exist:
- crashkernel=512M`,
nfs_host.MachineAddress))

opts := platform.MachineOptions{
MinMemory: 2048,
}

kdump_machine, err := c.NewMachineWithOptions(butane, opts)
if err != nil {
c.Fatalf("failed to queue kernel crash: %v", err)
c.Fatalf("Unable to create test machine: %v", err)
}

// Wait for kdump to create vmcore dump on the remote host
err = util.Retry(5, 10*time.Second, func() error {
// XXX Refactor this
// Wait for nfs server to become active
// 1 minutes should be enough to pull the container image
err = util.Retry(4, 15*time.Second, func() error {

// Look for the crash files created on the SSH machine
logs, err := c.SSH(ssh_host.Machine, "find /home/core/crash -type f -name vmcore*")
nfs_status, err := c.SSH(nfs_host.Machine, "systemctl is-active nfs.service")

if err != nil {
return fmt.Errorf("failed to search for vmcore: %w", err)
} else if logs == nil {
return fmt.Errorf("No vmcore created on remote SSH host")
return err
} else if string(nfs_status) == "inactive" {
return fmt.Errorf("nfs.service is not ready: %s.", string(nfs_status))
}
return nil
})
if err != nil {
c.Fatalf("Timed out while waiting for kdump to create vmcore files: %v", err)
c.Fatalf("Timed out while waiting for nfs.service to be ready: %v", err)
}

// The NFS server stores dumps under /var/nfs/crash: /var/nfs is the exported
// volume and kdump.conf above sets `path /crash`.
testRemoteKdump(c, kdump_machine, nfs_host.Machine, "/var/nfs/crash")
}

0 comments on commit 3750b7d

Please sign in to comment.