Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions codex-rs/app-server/tests/common/models_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ fn preset_to_info(preset: &ModelPreset, priority: i32) -> ModelInfo {
supports_search_tool: false,
auto_review_model_override: None,
tool_mode: None,
multi_agent_version: None,
}
}

Expand Down
1 change: 1 addition & 0 deletions codex-rs/codex-api/tests/models_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ async fn models_client_hits_models_endpoint() {
supports_search_tool: false,
auto_review_model_override: None,
tool_mode: None,
multi_agent_version: None,
}],
};

Expand Down
87 changes: 87 additions & 0 deletions codex-rs/core/src/guardian/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,17 @@ use codex_model_provider::create_model_provider;
use codex_model_provider_info::AMAZON_BEDROCK_GPT_5_4_MODEL_ID;
use codex_model_provider_info::AMAZON_BEDROCK_PROVIDER_ID;
use codex_model_provider_info::ModelProviderInfo;
use codex_models_manager::manager::RefreshStrategy;
use codex_models_manager::model_info::model_info_from_slug;
use codex_network_proxy::NetworkProxyConfig;
use codex_protocol::ThreadId;
use codex_protocol::approvals::NetworkApprovalProtocol;
use codex_protocol::config_types::ApprovalsReviewer;
use codex_protocol::models::ContentItem;
use codex_protocol::models::PermissionProfile;
use codex_protocol::models::ResponseItem;
use codex_protocol::openai_models::ModelVisibility;
use codex_protocol::openai_models::ModelsResponse;
use codex_protocol::openai_models::ReasoningEffort;
use codex_protocol::permissions::FileSystemAccessMode;
use codex_protocol::permissions::FileSystemPath;
Expand All @@ -45,6 +49,7 @@ use codex_protocol::protocol::GranularApprovalConfig;
use codex_protocol::protocol::GuardianAssessmentStatus;
use codex_protocol::protocol::GuardianRiskLevel;
use codex_protocol::protocol::GuardianUserAuthorization;
use codex_protocol::protocol::MultiAgentVersion;
use codex_protocol::protocol::ReviewDecision;
use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::TurnCompleteEvent;
Expand All @@ -55,6 +60,7 @@ use core_test_support::context_snapshot::ContextSnapshotOptions;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_response_created;
use core_test_support::responses::mount_models_once;
use core_test_support::responses::mount_response_once;
use core_test_support::responses::mount_sse_once;
use core_test_support::responses::mount_sse_sequence;
Expand All @@ -67,6 +73,7 @@ use core_test_support::test_path_buf;
use insta::Settings;
use insta::assert_snapshot;
use pretty_assertions::assert_eq;
use serde_json::Value;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::sync::Arc;
Expand Down Expand Up @@ -1377,6 +1384,86 @@ async fn guardian_review_uses_preferred_review_model_without_model_catalog_overr
Ok(())
}

/// Runs a guardian review whose review model advertises `multi_agent_version: V2`
/// in the mocked model catalog and checks the single request that the review sends:
/// it must target the guardian model and must not list a `spawn_agent` tool.
///
/// The test also pins that the mocked models endpoint was requested exactly once.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn guardian_stays_disabled_when_model_selects_multi_agent_v2() -> anyhow::Result<()> {
// NOTE(review): presumably returns `Ok(())` early when no network is available — confirm in the macro.
skip_if_no_network!(Ok(()));

let server = start_mock_server().await;
let guardian_model = "guardian-multi-agent-v2";
// Start from slug-derived metadata, then override the fields this test cares about;
// the V2 selector on the review model is the condition under test.
let mut model = model_info_from_slug(guardian_model);
model.visibility = ModelVisibility::List;
model.used_fallback_model_metadata = false;
model.multi_agent_version = Some(MultiAgentVersion::V2);
// The catalog mock is kept so the number of catalog requests can be asserted below.
let models_mock = mount_models_once(
&server,
ModelsResponse {
models: vec![model],
},
)
.await;
// Canned guardian response; `request_log` is read back below to inspect the request body.
let request_log = mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-guardian"),
ev_assistant_message("msg-guardian", "{\"outcome\":\"allow\"}"),
ev_completed("resp-guardian"),
]),
)
.await;

let (session, mut turn) = guardian_test_session_and_turn(&server).await;
// The returned list is deliberately discarded; the call's effect is checked through
// `models_mock.requests().len()` in the final assertion.
let _ = session
.services
.models_manager
.list_models(RefreshStrategy::Online)
.await;
// `Arc::get_mut` only succeeds while this is the sole handle to the turn, so the
// override has to be applied before the turn is shared.
Arc::get_mut(&mut turn)
.expect("turn should be unique")
.model_info
.auto_review_model_override = Some(guardian_model.to_string());
seed_guardian_parent_history(&session, &turn).await;

let outcome = run_guardian_review_session_for_test(
Arc::clone(&session),
turn,
GuardianApprovalRequest::Shell {
id: "shell-1".to_string(),
command: vec!["git".to_string(), "push".to_string()],
cwd: test_path_buf("/repo/codex-rs/core").abs(),
sandbox_permissions: crate::sandboxing::SandboxPermissions::UseDefault,
additional_permissions: None,
justification: None,
},
Some("Sandbox denied outbound git push to github.com.".to_string()),
guardian_output_schema(),
/*external_cancel*/ None,
)
.await;
// Any outcome other than a completed assessment fails the test outright.
let (GuardianReviewOutcome::Completed(_), _) = outcome else {
panic!("expected guardian assessment");
};
let request_body = request_log.single_request().body_json();
// A missing or non-array `tools` field counts as "no spawn_agent tool".
let has_spawn_agent = request_body
.get("tools")
.and_then(Value::as_array)
.is_some_and(|tools| {
tools
.iter()
.any(|tool| tool.get("name").and_then(Value::as_str) == Some("spawn_agent"))
});

// Compared as one tuple so a failure shows all three observations together.
assert_eq!(
(
models_mock.requests().len(),
request_body.get("model").and_then(Value::as_str),
has_spawn_agent,
),
(1, Some(guardian_model), false)
);

Ok(())
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn guardian_review_request_layout_matches_model_visible_request_snapshot()
-> anyhow::Result<()> {
Expand Down
3 changes: 3 additions & 0 deletions codex-rs/core/src/session/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -428,11 +428,13 @@ pub(crate) struct CodexSpawnArgs {
/// Picks the multi-agent version for a session, taking the first source that yields a value:
///
/// 1. the version returned by `conversation_history.get_multi_agent_version()`,
/// 2. `inherited_multi_agent_version`,
/// 3. `model_info.multi_agent_version`,
/// 4. `config.multi_agent_version_from_features()` (only evaluated if the others are `None`).
///
/// Returns `None` when every source is `None`.
///
/// NOTE(review): when the history carries no version and nothing is inherited, the model
/// metadata is consulted before the feature-derived value — confirm this ordering is the
/// intended behaviour for resumed sessions.
fn resolve_multi_agent_version(
conversation_history: &InitialHistory,
inherited_multi_agent_version: Option<MultiAgentVersion>,
model_info: &ModelInfo,
config: &Config,
) -> Option<MultiAgentVersion> {
conversation_history
.get_multi_agent_version()
.or(inherited_multi_agent_version)
.or(model_info.multi_agent_version)
Comment thread
aibrahim-oai marked this conversation as resolved.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Keep legacy rollouts on their original runtime

When resuming a thread whose rollout predates persisted SessionMeta.multi_agent_version, conversation_history.get_multi_agent_version() is None, so this newly added catalog fallback can lock the thread to whatever /models returns today. If the backend starts advertising multi_agent_version: v2 for a model, an older thread that was created with the V1 multi_agent_v1 tool surface will resume as V2 and expose the plain V2 tools, even though its existing context can contain V1 tool calls/outputs; for these legacy rollouts the safer fallback is the feature-derived runtime that was in effect before model metadata existed, or another compatibility inference from the stored history.

Useful? React with 👍 / 👎.

.or_else(|| config.multi_agent_version_from_features())
}

Expand Down Expand Up @@ -554,6 +556,7 @@ impl Codex {
let multi_agent_version = resolve_multi_agent_version(
&conversation_history,
inherited_multi_agent_version,
&model_info,
&config,
);
let _ = config
Expand Down
1 change: 1 addition & 0 deletions codex-rs/core/tests/suite/auto_review.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ fn remote_model_with_auto_review_override(slug: &str, review_model: &str) -> Mod
supports_search_tool: false,
auto_review_model_override: Some(review_model.to_string()),
tool_mode: None,
multi_agent_version: None,
priority: 1,
additional_speed_tiers: Vec::new(),
service_tiers: Vec::new(),
Expand Down
144 changes: 144 additions & 0 deletions codex-rs/core/tests/suite/model_runtime_selectors.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use anyhow::Result;
use anyhow::bail;
use codex_core::config::Config;
use codex_features::Feature;
use codex_login::CodexAuth;
Expand All @@ -11,25 +12,37 @@ use codex_protocol::openai_models::ModelVisibility;
use codex_protocol::openai_models::ModelsResponse;
use codex_protocol::openai_models::ToolMode;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::MultiAgentVersion;
use codex_protocol::protocol::Op;
use codex_protocol::protocol::ThreadSettingsOverrides;
use codex_protocol::user_input::UserInput;
use core_test_support::responses;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_function_call;
use core_test_support::responses::ev_response_created;
use core_test_support::responses::mount_models_once;
use core_test_support::responses::mount_sse_once;
use core_test_support::responses::mount_sse_once_match;
use core_test_support::responses::sse;
use core_test_support::skip_if_no_network;
use core_test_support::submit_thread_settings;
use core_test_support::test_codex::test_codex;
use core_test_support::wait_for_event;
use pretty_assertions::assert_eq;
use serde_json::Value;
use serde_json::json;
use std::io::Cursor;
use tokio::time::Duration;
use tokio::time::Instant;
use tokio::time::sleep;
use wiremock::Request;

/// Message handed to `spawn_agent`; request matchers also look for it to recognise the child's request.
const CHILD_PROMPT: &str = "inspect the child runtime";
/// Slug of the catalog entry requested for the spawned child.
const CHILD_MODEL: &str = "test-multi-agent-child";
/// Slug of the catalog entry configured as the root session's model.
const ROOT_MODEL: &str = "test-multi-agent-root";
/// Prompt submitted as the root turn; request matchers look for it to recognise the root's first request.
const ROOT_PROMPT: &str = "spawn a child";
/// Call id attached to the mocked `spawn_agent` function call; used to tell the root follow-up request apart.
const SPAWN_CALL_ID: &str = "spawn-call-1";

fn remote_model(slug: &str) -> ModelInfo {
ModelInfo {
Expand All @@ -39,6 +52,26 @@ fn remote_model(slug: &str) -> ModelInfo {
}
}

/// Reports whether the request body, read as UTF-8, contains `text`.
///
/// When the `content-encoding` header lists `zstd` (comma-separated, compared
/// case-insensitively after trimming), the body is decompressed first. A header
/// that is not valid text is treated as "not zstd". Decompression failures and
/// non-UTF-8 bodies yield `false`.
fn body_contains(req: &Request, text: &str) -> bool {
    // Decide whether the payload needs to be decompressed before searching it.
    let compressed = match req.headers.get("content-encoding") {
        Some(header) => match header.to_str() {
            Ok(encodings) => encodings
                .split(',')
                .map(str::trim)
                .any(|encoding| encoding.eq_ignore_ascii_case("zstd")),
            Err(_) => false,
        },
        None => false,
    };

    let raw = if compressed {
        let Ok(decoded) = zstd::stream::decode_all(Cursor::new(&req.body)) else {
            return false;
        };
        decoded
    } else {
        req.body.clone()
    };

    match String::from_utf8(raw) {
        Ok(payload) => payload.contains(text),
        Err(_) => false,
    }
}

fn tool_names(body: &Value) -> Vec<String> {
body.get("tools")
.and_then(Value::as_array)
Expand Down Expand Up @@ -171,3 +204,114 @@ async fn remote_tool_mode_selector_overrides_feature_flags() -> Result<()> {

Ok(())
}

/// Builds a root session on a catalog model marked `multi_agent_version: V2`, lets it
/// spawn a child that asks for a catalog model marked V1, and asserts that both the root
/// and the child report V2 while the child still runs on `CHILD_MODEL`.
///
/// The test also pins that the mocked models endpoint is requested exactly once across
/// the whole flow.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn remote_multi_agent_selector_overrides_features_and_child_model_info() -> Result<()> {
// NOTE(review): presumably returns `Ok(())` early when no network is available — confirm in the macro.
skip_if_no_network!(Ok(()));

let server = wiremock::MockServer::start().await;
// The two catalog entries deliberately disagree: root says V2, child says V1.
let mut root_model = remote_model(ROOT_MODEL);
root_model.multi_agent_version = Some(MultiAgentVersion::V2);
let mut child_model = remote_model(CHILD_MODEL);
child_model.multi_agent_version = Some(MultiAgentVersion::V1);
let models_mock = mount_models_once(
&server,
ModelsResponse {
models: vec![root_model, child_model],
},
)
.await;
// Arguments for the mocked `spawn_agent` call; the child is asked to use `CHILD_MODEL`.
let spawn_args = serde_json::to_string(&json!({
"message": CHILD_PROMPT,
"task_name": "worker",
"model": CHILD_MODEL,
"fork_turns": "none",
}))?;
// Root's first request (body mentions the root prompt): answer with the spawn call.
mount_sse_once_match(
&server,
|req: &Request| body_contains(req, ROOT_PROMPT),
sse(vec![
ev_response_created("resp-root-1"),
ev_function_call(SPAWN_CALL_ID, "spawn_agent", &spawn_args),
ev_completed("resp-root-1"),
]),
)
.await;
// Child request: mentions the child prompt but not the spawn call id, which keeps it
// distinct from the root follow-up matcher below.
mount_sse_once_match(
&server,
|req: &Request| body_contains(req, CHILD_PROMPT) && !body_contains(req, SPAWN_CALL_ID),
sse(vec![
ev_response_created("resp-child-1"),
ev_assistant_message("msg-child-1", "child done"),
ev_completed("resp-child-1"),
]),
)
.await;
// Root follow-up request (body mentions the spawn call id); the handle is kept so the
// timeout diagnostic can report the spawn call's output.
let root_followup_mock = mount_sse_once_match(
&server,
|req: &Request| body_contains(req, SPAWN_CALL_ID),
sse(vec![
ev_response_created("resp-root-2"),
ev_assistant_message("msg-root-2", "root done"),
ev_completed("resp-root-2"),
]),
)
.await;

let mut builder = test_codex()
.with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
.with_config(|config| {
config
.features
.enable(Feature::Collab)
.expect("test config should allow feature update");
config.model = Some(ROOT_MODEL.to_string());
});
let test = builder.build(&server).await?;
// Right after the build: one catalog fetch, and the root already reports V2.
assert_eq!(
(
models_mock.requests().len(),
test.codex.multi_agent_version(),
),
(1, Some(MultiAgentVersion::V2))
);
test.submit_turn(ROOT_PROMPT).await?;
// Poll every 10 ms, for at most 2 s, until a thread other than the root shows up.
let deadline = Instant::now() + Duration::from_secs(2);
let child_id = loop {
if let Some(child_id) = test
.thread_manager
.list_thread_ids()
.await
.into_iter()
.find(|thread_id| *thread_id != test.session_configured.thread_id)
{
break child_id;
}
if Instant::now() >= deadline {
// Include the root's version and the spawn call output to make the timeout debuggable.
bail!(
"timed out waiting for spawn_agent to create a child thread: root lock {:?}, spawn output {:?}",
test.codex.multi_agent_version(),
root_followup_mock.function_call_output_text(SPAWN_CALL_ID),
);
}
sleep(Duration::from_millis(10)).await;
};
let child = test.thread_manager.get_thread(child_id).await?;

// The child runs on `CHILD_MODEL` yet reports V2 rather than its catalog entry's V1.
// NOTE(review): presumably because the version is inherited from the parent — confirm.
// The catalog request count is still 1, i.e. no further catalog fetch was observed.
assert_eq!(
(
models_mock.requests().len(),
test.codex.multi_agent_version(),
child.config_snapshot().await.model,
child.multi_agent_version(),
),
(
1,
Some(MultiAgentVersion::V2),
CHILD_MODEL.to_string(),
Some(MultiAgentVersion::V2),
)
);

Ok(())
}
2 changes: 2 additions & 0 deletions codex-rs/core/tests/suite/model_switching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ fn test_model_info(
supports_search_tool: false,
auto_review_model_override: None,
tool_mode: None,
multi_agent_version: None,
priority: 1,
additional_speed_tiers: Vec::new(),
service_tiers: Vec::new(),
Expand Down Expand Up @@ -933,6 +934,7 @@ async fn model_switch_to_smaller_model_updates_token_context_window() -> Result<
supports_search_tool: false,
auto_review_model_override: None,
tool_mode: None,
multi_agent_version: None,
priority: 1,
additional_speed_tiers: Vec::new(),
service_tiers: Vec::new(),
Expand Down
1 change: 1 addition & 0 deletions codex-rs/core/tests/suite/models_cache_ttl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,5 +372,6 @@ fn test_remote_model(slug: &str, priority: i32) -> ModelInfo {
supports_search_tool: false,
auto_review_model_override: None,
tool_mode: None,
multi_agent_version: None,
}
}
2 changes: 2 additions & 0 deletions codex-rs/core/tests/suite/personality.rs
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@ async fn remote_model_friendly_personality_instructions_with_feature() -> anyhow
supports_search_tool: false,
auto_review_model_override: None,
tool_mode: None,
multi_agent_version: None,
};

let _models_mock = mount_models_once(
Expand Down Expand Up @@ -706,6 +707,7 @@ async fn user_turn_personality_remote_model_template_includes_update_message() -
supports_search_tool: false,
auto_review_model_override: None,
tool_mode: None,
multi_agent_version: None,
};

let _models_mock = mount_models_once(
Expand Down
Loading
Loading