diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index fca6d6e72..65735bf1c 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -70,12 +70,99 @@ jobs: - name: Install NVidia GPU operator for KinD uses: ./common/github-actions/nvidia-gpu-operator + - name: Verify GPU availability in KinD + run: | + echo "Checking for available GPUs in the KinD cluster..." + + # Wait for GPU operator pods to be ready (with timeout) + echo "Waiting for GPU operator pods to be ready..." + TIMEOUT=300 # 5 minutes timeout + END=$((SECONDS + TIMEOUT)) + + while [ $SECONDS -lt $END ]; do + # Get total number of pods in the namespace + TOTAL_PODS=$(kubectl get pods -n gpu-operator --no-headers | wc -l) + + # Count pods that are either running and ready or completed successfully + # Exclude pods that are still initializing + READY_PODS=$(kubectl get pods -n gpu-operator --no-headers | grep -E 'Running|Completed' | grep -v 'PodInitializing' | wc -l) + + if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then + echo "All GPU operator pods are ready or completed successfully!" + break + fi + + echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)" + echo "Pod status:" + kubectl get pods -n gpu-operator + sleep 10 + done + + if [ $SECONDS -ge $END ]; then + echo "::error::Timeout waiting for GPU operator pods to be ready" + echo "GPU operator pod status:" + kubectl get pods -n gpu-operator -o wide + echo "GPU operator pod logs:" + kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator + echo "GPU operator pod events:" + kubectl get events -n gpu-operator + exit 1 + fi + + echo "Node details:" + kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:' + + # Check if GPU operator has labeled nodes + GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu") + if [ "$GPU_LABELS" -eq 0 ]; then + echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly." + echo "Full node descriptions for debugging:" + kubectl describe nodes + exit 1 + fi + + # Check if GPUs are actually allocatable + GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | grep -v '^$' | wc -l) + if [ "$GPU_ALLOCATABLE" -eq 0 ]; then + echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs." + echo "Checking GPU operator pods:" + kubectl get pods -n gpu-operator -o wide + echo "GPU operator pod logs:" + kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator + echo "GPU operator pod events:" + kubectl get events -n gpu-operator + echo "GPU operator pod descriptions:" + kubectl describe pods -n gpu-operator + exit 1 + fi + + echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster." + - name: Deploy CodeFlare stack id: deploy run: | cd codeflare-operator echo Setting up CodeFlare stack make setup-e2e + + # Create ConfigMap to disable mTLS + echo "Creating ConfigMap to disable mTLS..." + cat <= END: + logger.error("Timeout waiting for pods to be ready or discovered") + if not head_pod_name or not worker_pod_name: + logger.error( + "Could not discover head and/or worker pods by name substring. 
Listing all pods in namespace for debugging:" + ) + try: + all_pods_result = subprocess.run( + [ + "kubectl", + "get", + "pods", + "-n", + self.namespace, + "-o", + "wide", + ], + capture_output=True, + text=True, + check=False, + ) + logger.error( + f"Pods in namespace '{self.namespace}':\\n{all_pods_result.stdout}" + ) + if all_pods_result.stderr: + logger.error( + f"Error listing pods: {all_pods_result.stderr}" + ) + except Exception as e_pods: + logger.error( + f"Exception while trying to list all pods: {e_pods}" + ) + + if head_pod_name: + logger.error( + f"Final head pod ({head_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, head_pod_name)}" + ) + else: + logger.error( + f"Final head pod status: Not Discovered by searching for '{cluster_name}' and 'head' in pod names." + ) + + if worker_pod_name: + logger.error( + f"Final worker pod ({worker_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, worker_pod_name)}" + ) + else: + logger.error( + f"Final worker pod status: Not Discovered by searching for '{cluster_name}' and 'worker' in pod names." + ) + raise TimeoutError( + "Pods did not become ready (or were not discovered by name substring) within the timeout period" + ) + + generate_cert.generate_tls_cert(cluster_name, self.namespace) + generate_cert.export_env(cluster_name, self.namespace) + + # Start port forwarding + local_port = "20001" + ray_client_port = "10001" + head_service_name = f"{cluster_name}-head-svc" + + port_forward_cmd = [ + "kubectl", + "port-forward", + "-n", + self.namespace, + f"svc/{head_service_name}", + f"{local_port}:{ray_client_port}", + ] + logger.info(f"Starting port-forward: {' '.join(port_forward_cmd)}") + # Using preexec_fn=os.setsid to create a new session, so we can kill the whole process group later if needed. + # However, os.setsid is not available on Windows. For simplicity in a test, direct Popen is used. + # Proper cross-platform process group management can be more complex. 
+ self.port_forward_process = subprocess.Popen( + port_forward_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + logger.info( + f"Port-forward process started with PID: {self.port_forward_process.pid}" ) - return sum(results) + time.sleep(5) # Give port-forward a few seconds to establish - ref = heavy_calculation.remote(3000) - result = ray.get(ref) - assert result == 1789.4644387076714 - ray.cancel(ref) - ray.shutdown() + client_url = f"ray://localhost:{local_port}" + # client_url = cluster.local_client_url() # Original line, now replaced + cluster.status() + + logger.info(f"Attempting to connect to Ray client at: {client_url}") + logger.info("Initializing Ray connection...") + try: + ray.init( + address=client_url, logging_level="INFO" + ) # Removed local_mode=True + logger.info("Ray initialization successful") + except Exception as e: + logger.error(f"Ray initialization failed: {str(e)}") + logger.error(f"Error type: {type(e)}") + # Log port-forward stdout/stderr if connection fails + if self.port_forward_process: + stdout, stderr = self.port_forward_process.communicate( + timeout=5 + ) # attempt to read + logger.error( + f"Port-forward stdout: {stdout.decode(errors='ignore')}" + ) + logger.error( + f"Port-forward stderr: {stderr.decode(errors='ignore')}" + ) + raise + + @ray.remote(num_gpus=number_of_gpus / 2) + def heavy_calculation_part(num_iterations): + result = 0.0 + for i in range(num_iterations): + for j in range(num_iterations): + for k in range(num_iterations): + result += math.sin(i) * math.cos(j) * math.tan(k) + return result + + @ray.remote(num_gpus=number_of_gpus / 2) + def heavy_calculation(num_iterations): + results = ray.get( + [ + heavy_calculation_part.remote(num_iterations // 30) + for _ in range(30) + ] + ) + return sum(results) + + ref = heavy_calculation.remote(3000) + + try: + result = ray.get(ref) + logger.info(f"Calculation completed with result: {result}") + assert result == 1789.4644387076714 + logger.info("Result assertion passed") + except Exception as e: + logger.error(f"Error during calculation: {str(e)}") + raise + finally: + logger.info("Cancelling task reference...") + ray.cancel(ref) + logger.info("Task cancelled") + + ray.shutdown() + # Port-forward process is stopped in finally block or teardown_method - cluster.down() + finally: + if self.port_forward_process: + logger.info( + f"Stopping port-forward process (PID: {self.port_forward_process.pid}) in finally block..." + ) + self.port_forward_process.terminate() + try: + self.port_forward_process.wait(timeout=10) + logger.info( + f"Port-forward process (PID: {self.port_forward_process.pid}) terminated from finally." + ) + except subprocess.TimeoutExpired: + logger.warning( + f"Port-forward process (PID: {self.port_forward_process.pid}) did not terminate in time from finally, killing..." + ) + self.port_forward_process.kill() + self.port_forward_process.wait() + logger.info( + f"Port-forward process (PID: {self.port_forward_process.pid}) killed from finally." 
+ ) + self.port_forward_process = None + cluster.down() diff --git a/tests/e2e/support.py b/tests/e2e/support.py index d7bee8054..4063a8eaf 100644 --- a/tests/e2e/support.py +++ b/tests/e2e/support.py @@ -9,6 +9,7 @@ from codeflare_sdk.common.kubernetes_cluster.kube_api_helpers import ( _kube_api_error_handling, ) +import time def get_ray_cluster(cluster_name, namespace): @@ -299,31 +300,38 @@ def create_kueue_resources( def delete_kueue_resources(self): - # Delete if given cluster-queue exists - for cq in self.cluster_queues: - try: - self.custom_api.delete_cluster_custom_object( - group="kueue.x-k8s.io", - plural="clusterqueues", - version="v1beta1", - name=cq, - ) - print(f"\n'{cq}' cluster-queue deleted") - except Exception as e: - print(f"\nError deleting cluster-queue '{cq}' : {e}") - - # Delete if given resource-flavor exists - for flavor in self.resource_flavors: - try: - self.custom_api.delete_cluster_custom_object( - group="kueue.x-k8s.io", - plural="resourceflavors", - version="v1beta1", - name=flavor, - ) - print(f"'{flavor}' resource-flavor deleted") - except Exception as e: - print(f"\nError deleting resource-flavor '{flavor}': {e}") + try: + # Delete if given cluster-queue exists + for cq in getattr(self, "cluster_queues", []): + try: + self.custom_api.delete_cluster_custom_object( + group="kueue.x-k8s.io", + plural="clusterqueues", + version="v1beta1", + name=cq, + ) + print(f"\n'{cq}' cluster-queue deleted") + except Exception as e: + print(f"\nError deleting cluster-queue '{cq}' : {e}") + + # Delete if given resource-flavor exists + for flavor in getattr(self, "resource_flavors", []): + try: + self.custom_api.delete_cluster_custom_object( + group="kueue.x-k8s.io", + plural="resourceflavors", + version="v1beta1", + name=flavor, + ) + print(f"'{flavor}' resource-flavor deleted") + except Exception as e: + print(f"\nError deleting resource-flavor '{flavor}': {e}") + + # Wait for resources to be cleaned up + time.sleep(5) + except Exception as e: + print(f"Error during Kueue resource cleanup: {e}") + raise def get_pod_node(self, namespace, name): @@ -407,3 +415,326 @@ def assert_get_cluster_and_jobsubmit( assert job_list[0].submission_id == submission_id cluster.down() + + +def kubectl_get_pod_status(namespace, pod_name): + """Get the status of a pod.""" + try: + # First check if the pod exists + result = subprocess.run( + ["kubectl", "get", "pod", pod_name, "-n", namespace], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + print(f"Pod {pod_name} not found in namespace {namespace}") + print(f"kubectl error output: {result.stderr}") + # Try to get events in the namespace to see if there are any issues + events = subprocess.run( + [ + "kubectl", + "get", + "events", + "-n", + namespace, + "--sort-by='.lastTimestamp'", + ], + capture_output=True, + text=True, + check=False, + ) + if events.returncode == 0: + print(f"Recent events in namespace {namespace}:") + print(events.stdout) + return "NotFound" + + # Get the pod phase + result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.phase}'", + ], + capture_output=True, + text=True, + check=True, + ) + status = result.stdout.strip("'") + + # Get pod conditions for more detailed status + conditions = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.conditions}'", + ], + capture_output=True, + text=True, + check=True, + ) + + return status + except 
subprocess.CalledProcessError as e: + print(f"Error getting pod status for {pod_name}: {e.stderr}") + return "Error" + + +def kubectl_get_pod_ready(namespace, pod_name): + """Check if all containers in a pod are ready.""" + try: + # Get container statuses + result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses}'", + ], + capture_output=True, + text=True, + check=True, + ) + + # Get ready status + result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].ready}'", + ], + capture_output=True, + text=True, + check=True, + ) + statuses = result.stdout.strip("'").split() + ready = all(status == "true" for status in statuses) + + if not ready: + # Get container names and their ready status + names_result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].name}'", + ], + capture_output=True, + text=True, + check=True, + ) + container_names = names_result.stdout.strip("'").split() + + # Get container states for more detailed status + states_result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].state}'", + ], + capture_output=True, + text=True, + check=True, + ) + states = states_result.stdout.strip("'").split() + + # Get container reasons if not ready + reasons_result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].state.waiting.reason}'", + ], + capture_output=True, + text=True, + check=True, + ) + reasons = reasons_result.stdout.strip("'").split() + + for name, status, state, reason in zip( + container_names, statuses, states, reasons + ): + print(f"Container {name}:") + print(f" Ready status: {status}") + print(f" State: {state}") + if reason and reason != "": + print(f" Reason: {reason}") + + return ready + except subprocess.CalledProcessError as e: + print(f"Error checking pod readiness for {pod_name}: {e.stderr}") + return False + + +def kubectl_get_pod_container_status(namespace, pod_name): + """Get detailed container status for a pod.""" + try: + # Get container names + names_result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].name}'", + ], + capture_output=True, + text=True, + check=True, + ) + container_names = names_result.stdout.strip("'").split() + + # Get container states + states_result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].state}'", + ], + capture_output=True, + text=True, + check=True, + ) + states = states_result.stdout.strip("'").split() + + # Get container reasons if waiting + reasons_result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].state.waiting.reason}'", + ], + capture_output=True, + text=True, + check=True, + ) + reasons = reasons_result.stdout.strip("'").split() + + # Get container messages if waiting + messages_result = subprocess.run( + [ + "kubectl", + "get", + "pod", + pod_name, + "-n", + namespace, + "-o", + "jsonpath='{.status.containerStatuses[*].state.waiting.message}'", + ], + capture_output=True, + text=True, + check=True, + ) + messages = 
messages_result.stdout.strip("'").split() + + # Combine all information + status = {} + for name, state, reason, message in zip( + container_names, states, reasons, messages + ): + status[name] = { + "state": state, + "reason": reason if reason != "" else None, + "message": message if message != "" else None, + } + + return status + except subprocess.CalledProcessError as e: + print(f"Error getting container status for {pod_name}: {e.stderr}") + return "Error" + + +def kubectl_get_pod_name_by_substring(namespace, cluster_name_part, type_substring): + """Get the name of the first pod in the namespace that contains both cluster_name_part and type_substring in its name.""" + try: + command = [ + "kubectl", + "get", + "pods", + "-n", + namespace, + "-o", + "jsonpath={.items[*].metadata.name}", + ] + result = subprocess.run(command, capture_output=True, text=True, check=False) + + if result.returncode != 0: + print( + f"kubectl command failed to list pods in {namespace}. stderr: {result.stderr}" + ) + return None + + pod_names_str = result.stdout.strip().strip("'") + if not pod_names_str: + # print(f"No pods found in namespace {namespace}") # Uncomment for debugging + return None + + pod_names = pod_names_str.split() + # print(f"Pods found in namespace {namespace}: {pod_names}") # Uncomment for debugging + + for pod_name in pod_names: + # Ensure both parts are present. Using lower() for case-insensitive matching of type_substring (e.g. Head vs head) + if ( + cluster_name_part.lower() in pod_name.lower() + and type_substring.lower() in pod_name.lower() + ): + # print(f"Found matching pod: {pod_name} for cluster part '{cluster_name_part}' and type '{type_substring}'") # Uncomment for debugging + return pod_name + + # print(f"No pod found containing '{cluster_name_part}' and '{type_substring}' in namespace {namespace}") # Uncomment for debugging + return None + except subprocess.CalledProcessError as e: + print( + f"Error listing pods in namespace {namespace} to find by substring: {e.stderr}" + ) + return None + except Exception as e: + print(f"An unexpected error occurred while getting pod name by substring: {e}") + return None
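
For reference, the workflow's GPU verification step boils down to counting allocatable `nvidia.com/gpu` resources on the cluster nodes. Below is a minimal sketch of that same check done with the official `kubernetes` Python client instead of `kubectl` plus jsonpath; the function name is illustrative, it assumes a reachable kubeconfig, and it is not part of this patch:

```python
# Illustrative helper, not part of this patch: count allocatable GPUs the same
# way the workflow's jsonpath query does, via the kubernetes Python client.
from kubernetes import client, config


def count_allocatable_gpus() -> int:
    """Sum allocatable nvidia.com/gpu across all nodes."""
    config.load_kube_config()  # assumes the same kubeconfig the KinD job uses
    v1 = client.CoreV1Api()
    total = 0
    for node in v1.list_node().items:
        allocatable = node.status.allocatable or {}
        # GPU quantities are plain integers (e.g. "1"); a missing key means no GPUs.
        total += int(allocatable.get("nvidia.com/gpu", "0"))
    return total


if __name__ == "__main__":
    gpus = count_allocatable_gpus()
    if gpus == 0:
        raise SystemExit("No allocatable nvidia.com/gpu resources found")
    print(f"Found {gpus} allocatable GPU(s)")
```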
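The test now opens its own `kubectl port-forward` to the head service and connects with `ray://localhost:20001` instead of using `cluster.local_client_url()`. A minimal sketch of that pattern as a context manager, so the forward is torn down even when `ray.init()` or the workload fails; the helper name, namespace, and ports are illustrative rather than taken from the patch:

```python
# Illustrative sketch, not part of this patch: kubectl port-forward wrapped in a
# context manager so the subprocess is always terminated.
import subprocess
import time
from contextlib import contextmanager


@contextmanager
def port_forward(namespace, service, local_port=20001, remote_port=10001):
    proc = subprocess.Popen(
        [
            "kubectl", "port-forward", "-n", namespace,
            f"svc/{service}", f"{local_port}:{remote_port}",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    try:
        time.sleep(5)  # crude wait for the tunnel, mirroring the test's own sleep
        yield f"ray://localhost:{local_port}"
    finally:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()


# Usage (cluster and namespace names are placeholders):
# with port_forward("default", "mycluster-head-svc") as url:
#     ray.init(address=url)
#     ...
#     ray.shutdown()
```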
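`delete_kueue_resources` now waits a fixed five seconds after issuing the deletes. A sketch of a stricter variant that polls until the ClusterQueues actually return 404, assuming the same `CustomObjectsApi` instance used elsewhere in `support.py`; the helper name is hypothetical and not part of this patch:

```python
# Hypothetical refinement, not part of this patch: poll for ClusterQueue deletion
# instead of a fixed sleep.
import time

from kubernetes.client.rest import ApiException


def wait_for_cluster_queue_deletion(custom_api, names, timeout=60):
    """Block until every named ClusterQueue returns 404, or raise on timeout."""
    deadline = time.time() + timeout
    remaining = set(names)
    while remaining and time.time() < deadline:
        for name in list(remaining):
            try:
                custom_api.get_cluster_custom_object(
                    group="kueue.x-k8s.io",
                    version="v1beta1",
                    plural="clusterqueues",
                    name=name,
                )
            except ApiException as e:
                if e.status == 404:  # object is gone
                    remaining.discard(name)
                else:
                    raise
        if remaining:
            time.sleep(2)
    if remaining:
        raise TimeoutError(f"ClusterQueues not deleted in time: {sorted(remaining)}")
```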
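The new `kubectl_get_pod_status`, `kubectl_get_pod_ready`, and `kubectl_get_pod_container_status` helpers shell out to `kubectl` and parse quoted jsonpath output. For comparison, a minimal sketch of the same readiness check through the kubernetes Python client, which returns typed container statuses directly; the function name is illustrative and not part of this patch:

```python
# Illustrative alternative, not part of this patch: per-container readiness via the
# kubernetes client instead of repeated kubectl jsonpath calls.
from kubernetes import client, config


def pod_is_ready(namespace: str, pod_name: str) -> bool:
    """Return True when every container in the pod reports ready=True."""
    config.load_kube_config()
    v1 = client.CoreV1Api()
    pod = v1.read_namespaced_pod(name=pod_name, namespace=namespace)
    statuses = pod.status.container_statuses or []
    if not statuses:
        return False
    for cs in statuses:
        if not cs.ready:
            waiting = cs.state.waiting
            reason = waiting.reason if waiting else None
            print(f"Container {cs.name} not ready (reason: {reason})")
            return False
    return True
```

The same client would also cover `kubectl_get_pod_name_by_substring`: iterating `v1.list_namespaced_pod(namespace).items` yields pod names to match case-insensitively against the cluster name and the "head"/"worker" substring.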