Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .github/workflows/config/.secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,15 @@
"line_number": 42
}
],
"tests/functional_tests/tests_inframework/test_deploy_query_vlm_ray.py": [
{
"type": "Base64 High Entropy String",
"filename": "tests/functional_tests/tests_inframework/test_deploy_query_vlm_ray.py",
"hashed_secret": "06d6cd87162dc2700a70b39c5fa328961aa254b1",
"is_verified": false,
"line_number": 66
}
],
"tutorials/onnx_tensorrt/embedding/llama_embedding.ipynb": [
{
"type": "Hex High Entropy String",
Expand Down Expand Up @@ -167,5 +176,5 @@
}
]
},
"generated_at": "2026-02-23T23:04:01Z"
"generated_at": "2026-03-17T20:45:09Z"
}
2 changes: 1 addition & 1 deletion nemo_deploy/multimodal/megatron_multimodal_deployable.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def generate(
tokenizer=self.processor.tokenizer,
image_processor=self.processor.image_processor,
prompts=prompts,
images=images,
images=images if images and len(images) > 0 else None,
processor=self.processor,
random_seed=random_seed,
sampling_params=inference_params,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ def __init__(
**model_config_kwargs,
)
self.rank = rank
LOGGER.warning(
f"Replica {replica_id} - Inference context type: {type(self.model.inference_wrapped_model.inference_context)}"
)
except Exception as e:
LOGGER.error(f"Replica {replica_id} - Failed to initialize multimodal model for rank {rank}: {str(e)}")
raise
Expand Down Expand Up @@ -217,6 +220,11 @@ async def chat_completions(self, request: Dict[Any, Any]):
if not isinstance(messages, list):
prompts = [messages]

# Normalize content: "content" as string -> list of one text part
for message in prompts:
if isinstance(message.get("content"), str):
message["content"] = [{"type": "text", "text": message["content"]}]

# Normalize image_url format to image format for consistent processing
for message in prompts:
for content in message["content"]:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ override-dependencies = [
"transformer-engine-cu13>=2.12.0a0,<2.15.0; sys_platform != 'darwin'",
"transformer-engine-cu12; sys_platform == 'never'",
"mamba-ssm>=2.3.0,<2.4.0",
"transformers>=5.0.0",
"transformers>=5.0.0,<=5.2.0",
"protobuf~=6.33.5",
"opencv-python-headless; sys_platform == 'never'",
"cryptography>=43.0.0,<47",
Expand Down
159 changes: 159 additions & 0 deletions tests/functional_tests/tests_inframework/test_deploy_query_vlm_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import subprocess
import time

import requests

# Configure root logging so deployment progress is visible in CI logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Data URL wrapping a base64-encoded 1x1 JPEG. Used to exercise the
# image_url path of /v1/chat/completions/ without any external image file.
# (This string is listed in .secrets.baseline as a known false positive.)
BASE64_IMAGE = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAAQABADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooA//9k="


def query_ray_chat_with_base64_image(
    host: str, port: int, model_id: str, messages: list, max_tokens: int = 16, timeout: float = 60
) -> str:
    """Query /v1/chat/completions/ with a messages payload (e.g. containing a base64 image).

    Best-effort: on any non-200 response the error is logged and "" is
    returned instead of raising, so callers can simply assert on the output.

    Args:
        host: Hostname or IP address of the Ray deployment.
        port: Port the deployment's HTTP server listens on.
        model_id: Model identifier routed by the deployment.
        messages: OpenAI-style chat messages; content parts may embed
            base64 ``image_url`` entries.
        max_tokens: Maximum number of tokens to generate.
        timeout: Request timeout in seconds (previously hard-coded to 60).

    Returns:
        The generated message content, or "" on HTTP error or a malformed
        response body.
    """
    url = f"http://{host}:{port}/v1/chat/completions/"
    payload = {
        "model": model_id,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "top_p": 1.0,
    }
    response = requests.post(url, json=payload, timeout=timeout)
    if response.status_code == 200:
        result = response.json()
        # `or [{}]` also covers an explicitly empty "choices" list, which the
        # plain dict.get default would not (it would raise IndexError).
        choices = result.get("choices") or [{}]
        return choices[0].get("message", {}).get("content", "")
    logger.error(f"Chat completions error: {response.status_code} - {response.text}")
    return ""


from tests.functional_tests.utils.ray_test_utils import (
query_ray_deployment,
terminate_deployment_process,
wait_for_deployment_ready,
)


class TestDeployRayVLM:
    """Functional test: deploy a Megatron VLM checkpoint via Ray and query it
    through completions, chat completions, and chat with a base64 image."""

    def setup_method(self):
        """Setup for each test method."""
        # Handle to the deployment subprocess; set once the test launches it.
        self.deploy_proc = None

    def teardown_method(self):
        """Cleanup after each test method."""
        if self.deploy_proc is not None:
            terminate_deployment_process(self.deploy_proc)
            # Avoid double termination in case test used finally to clean up
            self.deploy_proc = None

    def test_deploy_ray(self):
        """Deploy the VLM under coverage and run three queries against it."""
        vlm_checkpoint_path = "/home/TestData/megatron_bridge/checkpoints/qwen25-vl-3b"

        try:
            # Run Ray deployment for Megatron multimodal (VLM) model under
            # `coverage` so this functional run contributes coverage data.
            self.deploy_proc = subprocess.Popen(
                [
                    "coverage",
                    "run",
                    "--data-file=/workspace/.coverage",
                    "--source=/workspace/",
                    "--parallel-mode",
                    "scripts/deploy/multimodal/deploy_ray_inframework.py",
                    "--megatron_checkpoint",
                    vlm_checkpoint_path,
                    "--model_id",
                    "megatron-multimodal",
                    "--num_gpus",
                    "1",
                    "--host",
                    "0.0.0.0",
                    "--port",
                    "8000",
                    "--cuda_visible_devices",
                    "0",
                ]
            )
            # Use the module logger rather than the root logger for consistency.
            logger.info("Deployment started. Waiting for it to be ready...")

            # Assert the readiness condition directly instead of
            # `if not ...: assert False` (flake8-bugbear B011; `assert False`
            # is also stripped under `python -O`).
            assert wait_for_deployment_ready(host="0.0.0.0", port=8000, max_wait_time=180), (
                "Deployment failed to become ready within timeout"
            )

            # Extra settle time: the HTTP server can report ready while model
            # weights are still loading — TODO confirm whether this can shrink.
            time.sleep(120)

            # 1) Text-only completions (no images)
            output = query_ray_deployment(
                host="0.0.0.0",
                port=8000,
                model_id="megatron-multimodal",
                prompt="What is the color of a banana?",
                max_tokens=5,
            )
            print(output)
            assert output != "", "First prediction is empty"

            # 2) Text-only chat completions (no images)
            chat_messages = [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": "Hello, how are you?"}],
                }
            ]
            output_chat = query_ray_deployment(
                host="0.0.0.0",
                port=8000,
                model_id="megatron-multimodal",
                prompt=chat_messages,
                max_tokens=5,
                use_chat=True,
            )
            print(output_chat)
            assert output_chat != "", "Second prediction (chat) is empty"

            # 3) Chat completions with a base64-encoded image
            messages_with_image = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": BASE64_IMAGE},
                        },
                        {"type": "text", "text": "Describe the image:"},
                    ],
                }
            ]
            output_image = query_ray_chat_with_base64_image(
                host="0.0.0.0",
                port=8000,
                model_id="megatron-multimodal",
                messages=messages_with_image,
                max_tokens=5,
            )
            print(output_image)
            assert output_image != "", "Chat with base64 image returned empty"
        finally:
            # Ensure the deployment is terminated as soon as queries complete or on failure
            if self.deploy_proc is not None:
                terminate_deployment_process(self.deploy_proc)
                self.deploy_proc = None
23 changes: 23 additions & 0 deletions tests/unit_tests/deploy/test_megatron_multimodal_deployable.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,29 @@ def test_generate_empty_inputs(self, deployable):
results = deployable.generate(prompts=[], images=[])
assert len(results) == 0

def test_generate_with_empty_images(self, deployable):
"""Test generate with images=[] passes None to bridge generate (text-only)."""
prompts = ["Text-only prompt"]
images = []

with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate:
mock_generate.return_value = [MockResult("Generated from text only")]

results = deployable.generate(prompts=prompts, images=images)

mock_generate.assert_called_once_with(
wrapped_model=deployable.inference_wrapped_model,
tokenizer=deployable.processor.tokenizer,
image_processor=deployable.processor.image_processor,
prompts=prompts,
images=None,
processor=deployable.processor,
random_seed=None,
sampling_params=None,
)
assert len(results) == 1
assert results[0].generated_text == "Generated from text only"

def test_generate_mismatched_inputs(self, deployable, sample_image):
"""Test generate method with mismatched prompt and image counts."""
prompts = ["prompt1", "prompt2"]
Expand Down
26 changes: 26 additions & 0 deletions tests/unit_tests/deploy/test_megatron_multimodal_deployable_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,3 +397,29 @@ def test_initialization_with_custom_inference_params(

health_response = requests.get("http://127.0.0.1:8000/v1/health", timeout=10).json()
assert health_response["status"] == "healthy"

def test_chat_completions_with_string_content(self):
"""Test chat_completions when message content is a plain string (normalized to text part)."""
actual_class = MegatronMultimodalRayDeployable.func_or_class
deployment = MagicMock()
deployment.model_id = "test-chat-string-content"
deployment.workers = [MagicMock()]

request = {
"model": "test-chat-string-content",
"messages": [
{"role": "user", "content": "What is the color of the sky?"},
],
"max_tokens": 10,
}

with patch("nemo_deploy.multimodal.megatron_multimodal_deployable_ray.ray.get") as mock_ray_get:
mock_ray_get.return_value = {"sentences": ["Generated multimodal response"]}

result = run_async(actual_class.chat_completions(deployment, request))

assert "choices" in result
assert len(result["choices"]) >= 1
assert "message" in result["choices"][0]
assert "content" in result["choices"][0]["message"]
assert result["choices"][0]["message"]["content"] == "Generated multimodal response"
Loading
Loading