Merge pull request kruize#1336 from bharathappali/gpu-support-pr-10
Add tests for validating recommendation json for accelerator values
dinogun authored Oct 15, 2024
2 parents 4db31e3 + 2c55e0a commit 08e92cb
Showing 3 changed files with 225 additions and 1 deletion.
59 changes: 59 additions & 0 deletions tests/scripts/helpers/utils.py
@@ -254,6 +254,8 @@
"memoryLimit_sum", "memoryLimit_avg", "memoryUsage_sum", "memoryUsage_max", "memoryUsage_avg",
"memoryUsage_min", "memoryRSS_sum", "memoryRSS_max", "memoryRSS_avg", "memoryRSS_min"]

# Matches MIG device plugin resource names such as nvidia.com/mig-1g.5gb or nvidia.com/mig-3g.20gb
MIG_PATTERN = r"nvidia\.com/mig-[1-47]g\.(5|10|20|40|80)gb"


def generate_test_data(csvfile, test_data, api_name):
if os.path.isfile(csvfile):
@@ -1526,3 +1528,60 @@ def validate_local_monitoring_recommendation_data_present(recommendations_json):
for i in range(list_reco_containers_length):
assert recommendations_json[0]['kubernetes_objects'][0]['containers'][i]['recommendations']['data'], "Recommendations data is expected, but not present."
assert recommendations_json[0]['kubernetes_objects'][0]['containers'][i]['recommendations']['notifications'][NOTIFICATION_CODE_FOR_RECOMMENDATIONS_AVAILABLE]['message'] == RECOMMENDATIONS_AVAILABLE, "Recommendations notification is expected, but not present."


def validate_limits_map_for_accelerator(limits: dict):
for resource, resource_obj in limits.items():
# Check if the key contains "nvidia" and matches the MIG pattern
if "nvidia" in resource:
# Assert that the key matches the expected MIG pattern
assert re.match(MIG_PATTERN, resource), f"Resource '{resource}' does not match the expected MIG pattern."

# Assert that the amount is 1.0 and format is "cores"
assert resource_obj.get("amount") == 1.0, f"Resource '{resource}' has an invalid amount: {resource_obj.get('amount')}"
assert resource_obj.get("format") == "cores", f"Resource '{resource}' has an invalid format: {resource_obj.get('format')}"



def validate_accelerator_recommendations_for_container(recommendations_json):
if 'experiment_type' in recommendations_json[0]:
assert recommendations_json[0]['experiment_type'] == CONTAINER_EXPERIMENT_TYPE, "Test is only applicable for container experiment type"

assert recommendations_json[0]['kubernetes_objects'], "Kubernetes objects expected"

# Test needs to be changed if we support multiple kubernetes objects
kubernetes_obj = recommendations_json[0]['kubernetes_objects'][0]
assert kubernetes_obj["containers"], "Containers array expected"

containers = kubernetes_obj["containers"]
    assert len(containers) > 0, "Expecting at least one container"

for container in containers:
assert container['recommendations'], "Recommendations object expected"
recommendations = container['recommendations']

assert recommendations["data"], "Data object expected"
data = recommendations["data"]

assert len(data) > 0, "Data object cannot be empty"

for timestamp, interval_recommendation_obj in data.items():
assert interval_recommendation_obj["recommendation_terms"], "Term based recommendations expected"
terms = interval_recommendation_obj["recommendation_terms"]

            assert len(terms) > 0, "At least one term is expected"

for term_name, term_obj in terms.items():
term_notifications = term_obj["notifications"]

if NOTIFICATION_CODE_FOR_COST_RECOMMENDATIONS_AVAILABLE in term_notifications:
cost_limits_map = term_obj["recommendation_engines"]["cost"]["config"]["limits"]
validate_limits_map_for_accelerator(cost_limits_map)

if NOTIFICATION_CODE_FOR_PERFORMANCE_RECOMMENDATIONS_AVAILABLE in term_notifications:
perf_limits_map = term_obj["recommendation_engines"]["performance"]["config"]["limits"]
validate_limits_map_for_accelerator(perf_limits_map)
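
For reference, below is a minimal sketch (hypothetical values) of the kind of limits map this helper accepts. The MIG resource key and the amount/format values mirror the assertions in validate_limits_map_for_accelerator; the cpu and memory entries are illustrative only.

# Hypothetical limits map accepted by validate_limits_map_for_accelerator.
# Non-nvidia keys are skipped by the helper; the nvidia key must match
# MIG_PATTERN and carry amount 1.0 with format "cores".
sample_limits = {
    "cpu": {"amount": 2.0, "format": "cores"},
    "memory": {"amount": 500.0, "format": "MiB"},
    "nvidia.com/mig-1g.5gb": {"amount": 1.0, "format": "cores"},
}

validate_limits_map_for_accelerator(sample_limits)  # passes all assertions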




18 changes: 18 additions & 0 deletions tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
@@ -164,3 +164,21 @@ Local monitoring tests can also be run without using the test_autotune.sh. To do

Note: You can check the report.html for the results as it provides better readability


### Accelerator Test:

Kruize 0.1 supports Accelerator Recommendations, which provide a right-sized MIG config as the recommendation.

The test `test_list_recommendations.py::test_accelerator_recommendation_if_exists` checks that the accelerator recommendations are in the expected format.

#### Prerequisites to run the test:

In addition to the prerequisites mentioned above, make sure that a workload named `human-eval-benchmark` is running in the `unpartitioned` namespace and has accelerator usage data.

See [How to run the human eval benchmark?](https://github.com/kruize/benchmarks/tree/master/human-eval-benchmark) for instructions on running the benchmark.

Alternatively, you can change the workload name and namespace in the test to match your workload.

Note: The test will fail if run as-is when no matching workload is found; the result can be ignored for non-GPU workloads.
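
A minimal sketch of invoking just this test from Python is shown below; it assumes the suite's pytest conftest exposes a `--cluster_type` option (the flag name is an assumption, not confirmed by this change) and uses pytest's keyword filtering.

# Hypothetical invocation: run only the accelerator test via pytest's Python API.
# The --cluster_type option is assumed to come from the suite's conftest.
import pytest

pytest.main([
    "-m", "sanity",
    "-k", "test_accelerator_recommendation_if_exists",
    "--cluster_type", "openshift",
])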

@@ -171,4 +171,151 @@ def test_list_recommendations_namespace_single_result(test_name, expected_status
# Delete experiment
response = delete_experiment(input_json_file)
print("delete exp = ", response.status_code)
assert response.status_code == SUCCESS_STATUS_CODE
assert response.status_code == SUCCESS_STATUS_CODE

@pytest.mark.sanity
@pytest.mark.parametrize(
"test_name, expected_status_code, version, experiment_name, cluster_name, performance_profile, mode, target_cluster, datasource, experiment_type, kubernetes_obj_type, name, namespace, namespace_name, container_image_name, container_name, measurement_duration, threshold",
[
("list_accelerator_recommendations", SUCCESS_STATUS_CODE, "v2.0", "human_eval_exp", "cluster-1", "resource-optimization-local-monitoring", "monitor", "local", "prometheus-1", "container", "statefulset", "human-eval-benchmark", "unpartitioned", None, None, "human-eval-benchmark", "15min", "0.1"),
]
)
def test_accelerator_recommendation_if_exists(
test_name,
expected_status_code,
version,
experiment_name,
cluster_name,
performance_profile,
mode,
target_cluster,
datasource,
experiment_type,
kubernetes_obj_type,
name,
namespace,
namespace_name,
container_image_name,
container_name,
measurement_duration,
threshold,
cluster_type):
"""
Test Description: This test validates listRecommendations by passing a valid
container experiment name which has gpu usage
"""
# Generate a temporary JSON filename
tmp_json_file = "/tmp/create_exp_" + test_name + ".json"
print("tmp_json_file = ", tmp_json_file)

# Load the Jinja2 template
environment = Environment(loader=FileSystemLoader("../json_files/"))
template = environment.get_template("create_exp_template.json")

# Render the JSON content from the template
content = template.render(
version=version,
experiment_name=experiment_name,
cluster_name=cluster_name,
performance_profile=performance_profile,
mode=mode,
target_cluster=target_cluster,
datasource=datasource,
experiment_type=experiment_type,
kubernetes_obj_type=kubernetes_obj_type,
name=name,
namespace=namespace,
namespace_name=namespace_name,
container_image_name=container_image_name,
container_name=container_name,
measurement_duration=measurement_duration,
threshold=threshold
)

# Convert rendered content to a dictionary
json_content = json.loads(content)

if json_content[0]["kubernetes_objects"][0]["type"] == "None":
json_content[0]["kubernetes_objects"][0].pop("type")
if json_content[0]["kubernetes_objects"][0]["namespaces"]["namespace_name"] == "None":
json_content[0]["kubernetes_objects"][0].pop("namespaces")
if json_content[0]["kubernetes_objects"][0]["containers"][0]["container_name"] == "None":
json_content[0]["kubernetes_objects"][0].pop("containers")

# Write the final JSON to the temp file
with open(tmp_json_file, mode="w", encoding="utf-8") as message:
json.dump(json_content, message, indent=4)

input_json_file = tmp_json_file

form_kruize_url(cluster_type)
response = delete_experiment(input_json_file)
print("delete exp = ", response.status_code)

    # Install default metric profile
if cluster_type == "minikube":
metric_profile_json_file = metric_profile_dir / 'resource_optimization_local_monitoring_norecordingrules.json'

if cluster_type == "openshift":
metric_profile_json_file = metric_profile_dir / 'resource_optimization_local_monitoring.json'

response = delete_metric_profile(metric_profile_json_file)
print("delete metric profile = ", response.status_code)

# Create metric profile using the specified json
response = create_metric_profile(metric_profile_json_file)

data = response.json()
print(data['message'])

assert response.status_code == SUCCESS_STATUS_CODE
assert data['status'] == SUCCESS_STATUS

json_file = open(metric_profile_json_file, "r")
input_json = json.loads(json_file.read())
metric_profile_name = input_json['metadata']['name']
assert data['message'] == CREATE_METRIC_PROFILE_SUCCESS_MSG % metric_profile_name

response = list_metric_profiles(name=metric_profile_name, logging=False)
metric_profile_json = response.json()

assert response.status_code == SUCCESS_200_STATUS_CODE

# Validate the json against the json schema
errorMsg = validate_list_metric_profiles_json(metric_profile_json, list_metric_profiles_schema)
assert errorMsg == ""

    # Create container experiment using the specified json
response = create_experiment(input_json_file)

data = response.json()
print(data['message'])

assert response.status_code == SUCCESS_STATUS_CODE
assert data['status'] == SUCCESS_STATUS
assert data['message'] == CREATE_EXP_SUCCESS_MSG

# generate recommendations
json_file = open(input_json_file, "r")
input_json = json.loads(json_file.read())
exp_name = input_json[0]['experiment_name']

response = generate_recommendations(exp_name)
assert response.status_code == SUCCESS_STATUS_CODE

# Invoke list recommendations for the specified experiment
response = list_recommendations(exp_name)
assert response.status_code == SUCCESS_200_STATUS_CODE
list_reco_json = response.json()

# Validate the json against the json schema
errorMsg = validate_list_reco_json(list_reco_json, list_reco_json_local_monitoring_schema)
assert errorMsg == ""

# Validate accelerator info
validate_accelerator_recommendations_for_container(list_reco_json)

# Delete experiment
response = delete_experiment(input_json_file)
print("delete exp = ", response.status_code)
assert response.status_code == SUCCESS_STATUS_CODE
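
For orientation, a minimal, hypothetical fragment of the listRecommendations response that validate_accelerator_recommendations_for_container walks is sketched below. The field names follow the assertions in utils.py above; the timestamp, term name, and notification-code placeholder are illustrative only.

# Hypothetical response fragment; only the fields touched by
# validate_accelerator_recommendations_for_container are shown.
# COST_CODE stands in for NOTIFICATION_CODE_FOR_COST_RECOMMENDATIONS_AVAILABLE.
COST_CODE = "<cost-recommendations-available-code>"

sample_reco_json = [{
    "experiment_type": "container",
    "kubernetes_objects": [{
        "containers": [{
            "recommendations": {
                "data": {
                    "2024-10-15T06:45:00.000Z": {
                        "recommendation_terms": {
                            "short_term": {
                                "notifications": {COST_CODE: {}},
                                "recommendation_engines": {
                                    "cost": {"config": {"limits": {
                                        "nvidia.com/mig-1g.5gb": {"amount": 1.0, "format": "cores"}
                                    }}}
                                }
                            }
                        }
                    }
                }
            }
        }]
    }]
}]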
