Skip to content

Commit 491c5d8

Browse files
authored
fix(bootstrap,server): persist sandbox state across gateway stop/start cycles (#739)
1 parent eea495e commit 491c5d8

File tree

8 files changed

+544
-61
lines changed

8 files changed

+544
-61
lines changed

architecture/gateway-single-node.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ For the target daemon (local or remote):
185185

186186
After the container starts:
187187

188-
1. **Clean stale nodes**: `clean_stale_nodes()` finds `NotReady` nodes via `kubectl get nodes` and deletes them. This is needed when a container is recreated but reuses the persistent volume -- k3s registers a new node (using the container ID as hostname) while old node entries persist in etcd. Non-fatal on error; returns the count of removed nodes.
188+
1. **Clean stale nodes**: `clean_stale_nodes()` finds nodes whose name does not match the deterministic k3s `--node-name` and deletes them. That node name is derived from the gateway name but normalized to a Kubernetes-safe lowercase form so existing gateway names that contain `_`, `.`, or uppercase characters still produce a valid node identity. This cleanup is needed when a container is recreated but reuses the persistent volume -- old node entries can persist in etcd. Non-fatal on error; returns the count of removed nodes.
189189
2. **Push local images** (optional, local deploy only): If `OPENSHELL_PUSH_IMAGES` is set, the comma-separated image refs are exported from the local Docker daemon as a single tar, uploaded into the container via `docker put_archive`, and imported into containerd via `ctr images import` in the `k8s.io` namespace. After import, `kubectl rollout restart deployment/openshell openshell` is run, followed by `kubectl rollout status --timeout=180s` to wait for completion. See `crates/openshell-bootstrap/src/push.rs`.
190190
3. **Wait for gateway health**: `wait_for_gateway_ready()` polls the Docker HEALTHCHECK status up to 180 times, 2 seconds apart (6 min total). A background task streams container logs during this wait. Failure modes:
191191
- Container exits during polling: error includes recent log lines.

architecture/gateway.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ The Helm chart template is at `deploy/helm/openshell/templates/statefulset.yaml`
501501

502502
`SandboxClient` (`crates/openshell-server/src/sandbox/mod.rs`) manages `agents.x-k8s.io/v1alpha1/Sandbox` CRDs.
503503

504-
- **Create**: Translates a `Sandbox` proto into a Kubernetes `DynamicObject` with labels (`openshell.ai/sandbox-id`, `openshell.ai/managed-by: openshell`) and a spec that includes the pod template, environment variables, and gateway-required env vars (`OPENSHELL_SANDBOX_ID`, `OPENSHELL_ENDPOINT`, `OPENSHELL_SSH_LISTEN_ADDR`, etc.).
504+
- **Create**: Translates a `Sandbox` proto into a Kubernetes `DynamicObject` with labels (`openshell.ai/sandbox-id`, `openshell.ai/managed-by: openshell`) and a spec that includes the pod template, environment variables, and gateway-required env vars (`OPENSHELL_SANDBOX_ID`, `OPENSHELL_ENDPOINT`, `OPENSHELL_SSH_LISTEN_ADDR`, etc.). When callers do not provide custom `volumeClaimTemplates`, the server injects a default `workspace` PVC and mounts it at `/sandbox` so the default sandbox home/workdir survives pod rescheduling.
505505
- **Delete**: Calls the Kubernetes API to delete the CRD by name. Returns `false` if already gone (404).
506506
- **Pod IP resolution**: `agent_pod_ip()` fetches the agent pod and reads `status.podIP`.
507507

crates/openshell-bootstrap/src/constants.rs

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,100 @@ pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca";
1313
pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls";
1414
/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods).
1515
pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake";
16+
const NODE_NAME_PREFIX: &str = "openshell-";
17+
const NODE_NAME_FALLBACK_SUFFIX: &str = "gateway";
18+
const KUBERNETES_MAX_NAME_LEN: usize = 253;
1619

1720
pub fn container_name(name: &str) -> String {
1821
format!("openshell-cluster-{name}")
1922
}
2023

24+
/// Deterministic k3s node name derived from the gateway name.
25+
///
26+
/// k3s defaults to using the container hostname (= Docker container ID) as
27+
/// the node name. When the container is recreated (e.g. after an image
28+
/// upgrade), the container ID changes, creating a new k3s node. The
29+
/// `clean_stale_nodes` function then deletes PVCs whose backing PVs have
30+
/// node affinity for the old node — wiping the server database and any
31+
/// sandbox persistent volumes.
32+
///
33+
/// By passing a deterministic `--node-name` to k3s, the node identity
34+
/// survives container recreation, and PVCs are never orphaned.
35+
///
36+
/// Gateway names allow Docker-friendly separators and uppercase characters,
37+
/// but Kubernetes node names must be DNS-safe. Normalize the gateway name into
38+
/// a single lowercase RFC 1123 label so previously accepted names such as
39+
/// `prod_us` or `Prod.US` still deploy successfully.
40+
pub fn node_name(name: &str) -> String {
41+
format!("{NODE_NAME_PREFIX}{}", normalize_node_name_suffix(name))
42+
}
43+
44+
fn normalize_node_name_suffix(name: &str) -> String {
45+
let mut normalized = String::with_capacity(name.len());
46+
let mut last_was_separator = false;
47+
48+
for ch in name.chars() {
49+
if ch.is_ascii_alphanumeric() {
50+
normalized.push(ch.to_ascii_lowercase());
51+
last_was_separator = false;
52+
} else if !last_was_separator {
53+
normalized.push('-');
54+
last_was_separator = true;
55+
}
56+
}
57+
58+
let mut normalized = normalized.trim_matches('-').to_string();
59+
if normalized.is_empty() {
60+
normalized.push_str(NODE_NAME_FALLBACK_SUFFIX);
61+
}
62+
63+
let max_suffix_len = KUBERNETES_MAX_NAME_LEN.saturating_sub(NODE_NAME_PREFIX.len());
64+
if normalized.len() > max_suffix_len {
65+
normalized.truncate(max_suffix_len);
66+
normalized.truncate(normalized.trim_end_matches('-').len());
67+
}
68+
69+
if normalized.is_empty() {
70+
normalized.push_str(NODE_NAME_FALLBACK_SUFFIX);
71+
}
72+
73+
normalized
74+
}
75+
2176
pub fn volume_name(name: &str) -> String {
2277
format!("openshell-cluster-{name}")
2378
}
2479

2580
pub fn network_name(name: &str) -> String {
2681
format!("openshell-cluster-{name}")
2782
}
83+
84+
#[cfg(test)]
85+
mod tests {
86+
use super::*;
87+
88+
#[test]
89+
fn node_name_normalizes_uppercase_and_underscores() {
90+
assert_eq!(node_name("Prod_US"), "openshell-prod-us");
91+
}
92+
93+
#[test]
94+
fn node_name_collapses_and_trims_separator_runs() {
95+
assert_eq!(node_name("._Prod..__-Gateway-."), "openshell-prod-gateway");
96+
}
97+
98+
#[test]
99+
fn node_name_falls_back_when_gateway_name_has_no_alphanumerics() {
100+
assert_eq!(node_name("...___---"), "openshell-gateway");
101+
}
102+
103+
#[test]
104+
fn node_name_truncates_to_kubernetes_name_limit() {
105+
let gateway_name = "A".repeat(400);
106+
let node_name = node_name(&gateway_name);
107+
108+
assert!(node_name.len() <= KUBERNETES_MAX_NAME_LEN);
109+
assert!(node_name.starts_with(NODE_NAME_PREFIX));
110+
assert!(node_name.ends_with('a'));
111+
}
112+
}

crates/openshell-bootstrap/src/docker.rs

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
use crate::RemoteOptions;
5-
use crate::constants::{container_name, network_name, volume_name};
5+
use crate::constants::{container_name, network_name, node_name, volume_name};
66
use crate::image::{self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, parse_image_ref};
77
use bollard::API_DEFAULT_VERSION;
88
use bollard::Docker;
@@ -482,6 +482,7 @@ pub async fn ensure_container(
482482
registry_username: Option<&str>,
483483
registry_token: Option<&str>,
484484
device_ids: &[String],
485+
resume: bool,
485486
) -> Result<u16> {
486487
let container_name = container_name(name);
487488

@@ -491,25 +492,34 @@ pub async fn ensure_container(
491492
.await
492493
{
493494
Ok(info) => {
494-
// Container exists — verify it is using the expected image.
495-
// Resolve the desired image ref to its content-addressable ID so we
496-
// can compare against the container's image field (which Docker
497-
// stores as an ID).
498-
let desired_id = docker
499-
.inspect_image(image_ref)
500-
.await
501-
.ok()
502-
.and_then(|img| img.id);
495+
// On resume we always reuse the existing container — the persistent
496+
// volume holds k3s etcd state, and recreating the container with
497+
// different env vars would cause the entrypoint to rewrite the
498+
// HelmChart manifest, triggering a Helm upgrade that changes the
499+
// StatefulSet image reference while the old pod still runs with the
500+
// previous image. Reusing the container avoids this entirely.
501+
//
502+
// On a non-resume path we check whether the image changed and
503+
// recreate only when necessary.
504+
let reuse = if resume {
505+
true
506+
} else {
507+
let desired_id = docker
508+
.inspect_image(image_ref)
509+
.await
510+
.ok()
511+
.and_then(|img| img.id);
503512

504-
let container_image_id = info.image;
513+
let container_image_id = info.image.clone();
505514

506-
let image_matches = match (&desired_id, &container_image_id) {
507-
(Some(desired), Some(current)) => desired == current,
508-
_ => false,
515+
match (&desired_id, &container_image_id) {
516+
(Some(desired), Some(current)) => desired == current,
517+
_ => false,
518+
}
509519
};
510520

511-
if image_matches {
512-
// The container exists with the correct image, but its network
521+
if reuse {
522+
// The container exists and should be reused. Its network
513523
// attachment may be stale. When the gateway is resumed after a
514524
// container kill, `ensure_network` destroys and recreates the
515525
// Docker network (giving it a new ID). The stopped container
@@ -543,8 +553,8 @@ pub async fn ensure_container(
543553
tracing::info!(
544554
"Container {} exists but uses a different image (container={}, desired={}), recreating",
545555
container_name,
546-
container_image_id.as_deref().map_or("unknown", truncate_id),
547-
desired_id.as_deref().map_or("unknown", truncate_id),
556+
info.image.as_deref().map_or("unknown", truncate_id),
557+
image_ref,
548558
);
549559

550560
let _ = docker.stop_container(&container_name, None).await;
@@ -684,6 +694,11 @@ pub async fn ensure_container(
684694
format!("REGISTRY_HOST={registry_host}"),
685695
format!("REGISTRY_INSECURE={registry_insecure}"),
686696
format!("IMAGE_REPO_BASE={image_repo_base}"),
697+
// Deterministic k3s node name so the node identity survives container
698+
// recreation (e.g. after an image upgrade). Without this, k3s uses
699+
// the container ID as the hostname/node name, which changes on every
700+
// container recreate and triggers stale-node PVC cleanup.
701+
format!("OPENSHELL_NODE_NAME={}", node_name(name)),
687702
];
688703
if let Some(endpoint) = registry_endpoint {
689704
env_vars.push(format!("REGISTRY_ENDPOINT={endpoint}"));
@@ -753,6 +768,14 @@ pub async fn ensure_container(
753768

754769
let config = ContainerCreateBody {
755770
image: Some(image_ref.to_string()),
771+
// Set the container hostname to the deterministic node name.
772+
// k3s uses the container hostname as its default node name. Without
773+
// this, Docker defaults to the container ID (first 12 hex chars),
774+
// which changes on every container recreation and can cause
775+
// `clean_stale_nodes` to delete the wrong node on resume. The
776+
// hostname persists across container stop/start cycles, ensuring a
777+
// stable node identity.
778+
hostname: Some(node_name(name)),
756779
cmd: Some(cmd),
757780
env,
758781
exposed_ports: Some(exposed_ports),

crates/openshell-bootstrap/src/lib.rs

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -314,46 +314,59 @@ where
314314
// idempotent and will reuse the volume, create a container if needed,
315315
// and start it)
316316
let mut resume = false;
317+
let mut resume_container_exists = false;
317318
if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
318319
if recreate {
319320
log("[status] Removing existing gateway".to_string());
320321
destroy_gateway_resources(&target_docker, &name).await?;
321322
} else if existing.container_running {
322323
log("[status] Gateway is already running".to_string());
323324
resume = true;
325+
resume_container_exists = true;
324326
} else {
325327
log("[status] Resuming gateway from existing state".to_string());
326328
resume = true;
329+
resume_container_exists = existing.container_exists;
327330
}
328331
}
329332

330-
// Ensure the image is available on the target Docker daemon
331-
if remote_opts.is_some() {
332-
log("[status] Downloading gateway".to_string());
333-
let on_log_clone = Arc::clone(&on_log);
334-
let progress_cb = move |msg: String| {
335-
if let Ok(mut f) = on_log_clone.lock() {
336-
f(msg);
337-
}
338-
};
339-
image::pull_remote_image(
340-
&target_docker,
341-
&image_ref,
342-
registry_username.as_deref(),
343-
registry_token.as_deref(),
344-
progress_cb,
345-
)
346-
.await?;
347-
} else {
348-
// Local deployment: ensure image exists (pull if needed)
349-
log("[status] Downloading gateway".to_string());
350-
ensure_image(
351-
&target_docker,
352-
&image_ref,
353-
registry_username.as_deref(),
354-
registry_token.as_deref(),
355-
)
356-
.await?;
333+
// Ensure the image is available on the target Docker daemon.
334+
// When both the container and volume exist we can skip the pull entirely
335+
// — the container already references a valid local image. This avoids
336+
// failures when the original image tag (e.g. a local-only
337+
// `openshell/cluster:dev`) is not available from the default registry.
338+
//
339+
// When only the volume survives (container was removed), we still need
340+
// the image to recreate the container, so the pull must happen.
341+
let need_image = !resume || !resume_container_exists;
342+
if need_image {
343+
if remote_opts.is_some() {
344+
log("[status] Downloading gateway".to_string());
345+
let on_log_clone = Arc::clone(&on_log);
346+
let progress_cb = move |msg: String| {
347+
if let Ok(mut f) = on_log_clone.lock() {
348+
f(msg);
349+
}
350+
};
351+
image::pull_remote_image(
352+
&target_docker,
353+
&image_ref,
354+
registry_username.as_deref(),
355+
registry_token.as_deref(),
356+
progress_cb,
357+
)
358+
.await?;
359+
} else {
360+
// Local deployment: ensure image exists (pull if needed)
361+
log("[status] Downloading gateway".to_string());
362+
ensure_image(
363+
&target_docker,
364+
&image_ref,
365+
registry_username.as_deref(),
366+
registry_token.as_deref(),
367+
)
368+
.await?;
369+
}
357370
}
358371

359372
// All subsequent operations use the target Docker (remote or local)
@@ -444,6 +457,7 @@ where
444457
registry_username.as_deref(),
445458
registry_token.as_deref(),
446459
&device_ids,
460+
resume,
447461
)
448462
.await?;
449463
let port = actual_port;

crates/openshell-bootstrap/src/runtime.rs

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
// SPDX-License-Identifier: Apache-2.0
33

4-
use crate::constants::{KUBECONFIG_PATH, container_name};
4+
use crate::constants::{KUBECONFIG_PATH, container_name, node_name};
55
use bollard::Docker;
66
use bollard::container::LogOutput;
77
use bollard::exec::CreateExecOptions;
@@ -385,11 +385,19 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result<usize> {
385385
let container_name = container_name(name);
386386
let mut stale_nodes: Vec<String> = Vec::new();
387387

388+
// Determine the current node name. With the deterministic `--node-name`
389+
// entrypoint change the k3s node is `openshell-{gateway}`. However, older
390+
// cluster images (built before that change) still use the container hostname
391+
// (= Docker container ID) as the node name. We must handle both:
392+
//
393+
// 1. If the expected deterministic name appears in the node list, use it.
394+
// 2. Otherwise fall back to the container hostname (old behaviour).
395+
//
396+
// This ensures backward compatibility during upgrades where the bootstrap
397+
// CLI is newer than the cluster image.
398+
let deterministic_node = node_name(name);
399+
388400
for attempt in 1..=MAX_ATTEMPTS {
389-
// List ALL node names and the container's own hostname. Any node that
390-
// is not the current container is stale — we cannot rely on the Ready
391-
// condition because k3s may not have marked the old node NotReady yet
392-
// when this runs shortly after container start.
393401
let (output, exit_code) = exec_capture_with_exit(
394402
docker,
395403
&container_name,
@@ -406,16 +414,27 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result<usize> {
406414
.await?;
407415

408416
if exit_code == 0 {
409-
// Determine the current node name (container hostname).
410-
let (hostname_out, _) =
411-
exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()])
412-
.await?;
413-
let current_hostname = hostname_out.trim().to_string();
414-
415-
stale_nodes = output
417+
let all_nodes: Vec<&str> = output
416418
.lines()
417419
.map(str::trim)
418-
.filter(|l| !l.is_empty() && *l != current_hostname)
420+
.filter(|l| !l.is_empty())
421+
.collect();
422+
423+
// Pick the current node identity: prefer the deterministic name,
424+
// fall back to the container hostname for older cluster images.
425+
let current_node = if all_nodes.contains(&deterministic_node.as_str()) {
426+
deterministic_node.clone()
427+
} else {
428+
// Older cluster image without --node-name: read hostname.
429+
let (hostname_out, _) =
430+
exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()])
431+
.await?;
432+
hostname_out.trim().to_string()
433+
};
434+
435+
stale_nodes = all_nodes
436+
.into_iter()
437+
.filter(|n| *n != current_node)
419438
.map(ToString::to_string)
420439
.collect();
421440
break;

0 commit comments

Comments
 (0)