Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .github/workflows/config/.secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,15 @@
"line_number": 42
}
],
"tests/functional_tests/tests_inframework/test_deploy_query_vlm_ray.py": [
{
"type": "Base64 High Entropy String",
"filename": "tests/functional_tests/tests_inframework/test_deploy_query_vlm_ray.py",
"hashed_secret": "06d6cd87162dc2700a70b39c5fa328961aa254b1",
"is_verified": false,
"line_number": 66
}
],
"tutorials/onnx_tensorrt/embedding/llama_embedding.ipynb": [
{
"type": "Hex High Entropy String",
Expand Down Expand Up @@ -167,5 +176,5 @@
}
]
},
"generated_at": "2026-02-23T23:04:01Z"
"generated_at": "2026-03-17T20:45:09Z"
}
2 changes: 1 addition & 1 deletion nemo_deploy/multimodal/megatron_multimodal_deployable.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def generate(
tokenizer=self.processor.tokenizer,
image_processor=self.processor.image_processor,
prompts=prompts,
images=images,
images=images if images and len(images) > 0 else None,
processor=self.processor,
random_seed=random_seed,
sampling_params=inference_params,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ def __init__(
**model_config_kwargs,
)
self.rank = rank
LOGGER.warning(
f"Replica {replica_id} - Inference context type: {type(self.model.inference_wrapped_model.inference_context)}"
)
except Exception as e:
LOGGER.error(f"Replica {replica_id} - Failed to initialize multimodal model for rank {rank}: {str(e)}")
raise
Expand Down Expand Up @@ -217,6 +220,11 @@ async def chat_completions(self, request: Dict[Any, Any]):
if not isinstance(messages, list):
prompts = [messages]

# Normalize content: "content" as string -> list of one text part
for message in prompts:
if isinstance(message.get("content"), str):
message["content"] = [{"type": "text", "text": message["content"]}]

# Normalize image_url format to image format for consistent processing
for message in prompts:
for content in message["content"]:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ override-dependencies = [
"transformer-engine-cu13>=2.12.0a0,<2.15.0; sys_platform != 'darwin'",
"transformer-engine-cu12; sys_platform == 'never'",
"mamba-ssm>=2.3.0,<2.4.0",
"transformers>=5.0.0",
"transformers>=5.0.0,<=5.2.0",
"protobuf~=6.33.5",
"opencv-python-headless; sys_platform == 'never'",
"cryptography>=43.0.0,<47",
Expand Down
159 changes: 159 additions & 0 deletions tests/functional_tests/tests_inframework/test_deploy_query_vlm_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import subprocess
import time

import requests

# Configure root logging so deployment progress is visible in CI logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Data URL wrapping a base64-encoded 1x1 JPEG. Used to exercise the
# image_url path of /v1/chat/completions/ without any external image file.
# (This string is listed in .secrets.baseline as a known false positive.)
BASE64_IMAGE = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAAQABADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooA//9k="


def query_ray_chat_with_base64_image(
    host: str, port: int, model_id: str, messages: list, max_tokens: int = 16, timeout: float = 60
) -> str:
    """Query /v1/chat/completions/ with a messages payload (e.g. containing a base64 image).

    Best-effort: on any non-200 response the error is logged and "" is
    returned instead of raising, so callers can simply assert on the output.

    Args:
        host: Hostname or IP address of the Ray deployment.
        port: Port the deployment's HTTP server listens on.
        model_id: Model identifier routed by the deployment.
        messages: OpenAI-style chat messages; content parts may embed
            base64 ``image_url`` entries.
        max_tokens: Maximum number of tokens to generate.
        timeout: Request timeout in seconds (previously hard-coded to 60).

    Returns:
        The generated message content, or "" on HTTP error or a malformed
        response body.
    """
    url = f"http://{host}:{port}/v1/chat/completions/"
    payload = {
        "model": model_id,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "top_p": 1.0,
    }
    response = requests.post(url, json=payload, timeout=timeout)
    if response.status_code == 200:
        result = response.json()
        # `or [{}]` also covers an explicitly empty "choices" list, which the
        # plain dict.get default would not (it would raise IndexError).
        choices = result.get("choices") or [{}]
        return choices[0].get("message", {}).get("content", "")
    logger.error(f"Chat completions error: {response.status_code} - {response.text}")
    return ""


from tests.functional_tests.utils.ray_test_utils import (
query_ray_deployment,
terminate_deployment_process,
wait_for_deployment_ready,
)


class TestDeployRayVLM:
    """Functional test: deploy a Megatron VLM checkpoint via Ray and query it
    through completions, chat completions, and chat with a base64 image."""

    def setup_method(self):
        """Setup for each test method."""
        # Handle to the deployment subprocess; set once the test launches it.
        self.deploy_proc = None

    def teardown_method(self):
        """Cleanup after each test method."""
        if self.deploy_proc is not None:
            terminate_deployment_process(self.deploy_proc)
            # Avoid double termination in case test used finally to clean up
            self.deploy_proc = None

    def test_deploy_ray(self):
        """Deploy the VLM under coverage and run three queries against it."""
        vlm_checkpoint_path = "/home/TestData/megatron_bridge/checkpoints/qwen25-vl-3b"

        try:
            # Run Ray deployment for Megatron multimodal (VLM) model under
            # `coverage` so this functional run contributes coverage data.
            self.deploy_proc = subprocess.Popen(
                [
                    "coverage",
                    "run",
                    "--data-file=/workspace/.coverage",
                    "--source=/workspace/",
                    "--parallel-mode",
                    "scripts/deploy/multimodal/deploy_ray_inframework.py",
                    "--megatron_checkpoint",
                    vlm_checkpoint_path,
                    "--model_id",
                    "megatron-multimodal",
                    "--num_gpus",
                    "1",
                    "--host",
                    "0.0.0.0",
                    "--port",
                    "8000",
                    "--cuda_visible_devices",
                    "0",
                ]
            )
            # Use the module logger rather than the root logger for consistency.
            logger.info("Deployment started. Waiting for it to be ready...")

            # Assert the readiness condition directly instead of
            # `if not ...: assert False` (flake8-bugbear B011; `assert False`
            # is also stripped under `python -O`).
            assert wait_for_deployment_ready(host="0.0.0.0", port=8000, max_wait_time=180), (
                "Deployment failed to become ready within timeout"
            )

            # Extra settle time: the HTTP server can report ready while model
            # weights are still loading — TODO confirm whether this can shrink.
            time.sleep(120)

            # 1) Text-only completions (no images)
            output = query_ray_deployment(
                host="0.0.0.0",
                port=8000,
                model_id="megatron-multimodal",
                prompt="What is the color of a banana?",
                max_tokens=5,
            )
            print(output)
            assert output != "", "First prediction is empty"

            # 2) Text-only chat completions (no images)
            chat_messages = [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": "Hello, how are you?"}],
                }
            ]
            output_chat = query_ray_deployment(
                host="0.0.0.0",
                port=8000,
                model_id="megatron-multimodal",
                prompt=chat_messages,
                max_tokens=5,
                use_chat=True,
            )
            print(output_chat)
            assert output_chat != "", "Second prediction (chat) is empty"

            # 3) Chat completions with a base64-encoded image
            messages_with_image = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": BASE64_IMAGE},
                        },
                        {"type": "text", "text": "Describe the image:"},
                    ],
                }
            ]
            output_image = query_ray_chat_with_base64_image(
                host="0.0.0.0",
                port=8000,
                model_id="megatron-multimodal",
                messages=messages_with_image,
                max_tokens=5,
            )
            print(output_image)
            assert output_image != "", "Chat with base64 image returned empty"
        finally:
            # Ensure the deployment is terminated as soon as queries complete or on failure
            if self.deploy_proc is not None:
                terminate_deployment_process(self.deploy_proc)
                self.deploy_proc = None
23 changes: 23 additions & 0 deletions tests/unit_tests/deploy/test_megatron_multimodal_deployable.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,29 @@ def test_generate_empty_inputs(self, deployable):
results = deployable.generate(prompts=[], images=[])
assert len(results) == 0

def test_generate_with_empty_images(self, deployable):
"""Test generate with images=[] passes None to bridge generate (text-only)."""
prompts = ["Text-only prompt"]
images = []

with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate:
mock_generate.return_value = [MockResult("Generated from text only")]

results = deployable.generate(prompts=prompts, images=images)

mock_generate.assert_called_once_with(
wrapped_model=deployable.inference_wrapped_model,
tokenizer=deployable.processor.tokenizer,
image_processor=deployable.processor.image_processor,
prompts=prompts,
images=None,
processor=deployable.processor,
random_seed=None,
sampling_params=None,
)
assert len(results) == 1
assert results[0].generated_text == "Generated from text only"

def test_generate_mismatched_inputs(self, deployable, sample_image):
"""Test generate method with mismatched prompt and image counts."""
prompts = ["prompt1", "prompt2"]
Expand Down
26 changes: 26 additions & 0 deletions tests/unit_tests/deploy/test_megatron_multimodal_deployable_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,3 +397,29 @@ def test_initialization_with_custom_inference_params(

health_response = requests.get("http://127.0.0.1:8000/v1/health", timeout=10).json()
assert health_response["status"] == "healthy"

def test_chat_completions_with_string_content(self):
"""Test chat_completions when message content is a plain string (normalized to text part)."""
actual_class = MegatronMultimodalRayDeployable.func_or_class
deployment = MagicMock()
deployment.model_id = "test-chat-string-content"
deployment.workers = [MagicMock()]

request = {
"model": "test-chat-string-content",
"messages": [
{"role": "user", "content": "What is the color of the sky?"},
],
"max_tokens": 10,
}

with patch("nemo_deploy.multimodal.megatron_multimodal_deployable_ray.ray.get") as mock_ray_get:
mock_ray_get.return_value = {"sentences": ["Generated multimodal response"]}

result = run_async(actual_class.chat_completions(deployment, request))

assert "choices" in result
assert len(result["choices"]) >= 1
assert "message" in result["choices"][0]
assert "content" in result["choices"][0]["message"]
assert result["choices"][0]["message"]["content"] == "Generated multimodal response"
Loading
Loading