Merge pull request kruize#1336 from bharathappali/gpu-support-pr-10
Add tests for validating recommendation json for accelerator values
dinogun authored Oct 15, 2024
2 parents 4db31e3 + 2c55e0a commit 08e92cb
Showing 3 changed files with 225 additions and 1 deletion.
59 changes: 59 additions & 0 deletions tests/scripts/helpers/utils.py
@@ -254,6 +254,8 @@
"memoryLimit_sum", "memoryLimit_avg", "memoryUsage_sum", "memoryUsage_max", "memoryUsage_avg",
"memoryUsage_min", "memoryRSS_sum", "memoryRSS_max", "memoryRSS_avg", "memoryRSS_min"]

# Matches MIG device plugin resource names such as nvidia.com/mig-1g.5gb or nvidia.com/mig-3g.20gb
MIG_PATTERN = r"nvidia\.com/mig-[1-47]g\.(5|10|20|40|80)gb"


def generate_test_data(csvfile, test_data, api_name):
if os.path.isfile(csvfile):
@@ -1526,3 +1528,60 @@ def validate_local_monitoring_recommendation_data_present(recommendations_json):
for i in range(list_reco_containers_length):
assert recommendations_json[0]['kubernetes_objects'][0]['containers'][i]['recommendations']['data'], "Recommendations data is expected, but not present."
assert recommendations_json[0]['kubernetes_objects'][0]['containers'][i]['recommendations']['notifications'][NOTIFICATION_CODE_FOR_RECOMMENDATIONS_AVAILABLE]['message'] == RECOMMENDATIONS_AVAILABLE, "Recommendations notification is expected, but not present."


def validate_limits_map_for_accelerator(limits: dict):
for resource, resource_obj in limits.items():
# Check if the key contains "nvidia" and matches the MIG pattern
if "nvidia" in resource:
# Assert that the key matches the expected MIG pattern
assert re.match(MIG_PATTERN, resource), f"Resource '{resource}' does not match the expected MIG pattern."

# Assert that the amount is 1.0 and format is "cores"
assert resource_obj.get("amount") == 1.0, f"Resource '{resource}' has an invalid amount: {resource_obj.get('amount')}"
assert resource_obj.get("format") == "cores", f"Resource '{resource}' has an invalid format: {resource_obj.get('format')}"



def validate_accelerator_recommendations_for_container(recommendations_json):
if 'experiment_type' in recommendations_json[0]:
assert recommendations_json[0]['experiment_type'] == CONTAINER_EXPERIMENT_TYPE, "Test is only applicable for container experiment type"

assert recommendations_json[0]['kubernetes_objects'], "Kubernetes objects expected"

# Test needs to be changed if we support multiple kubernetes objects
kubernetes_obj = recommendations_json[0]['kubernetes_objects'][0]
assert kubernetes_obj["containers"], "Containers array expected"

containers = kubernetes_obj["containers"]
    assert len(containers) > 0, "Expecting at least one container"

for container in containers:
assert container['recommendations'], "Recommendations object expected"
recommendations = container['recommendations']

assert recommendations["data"], "Data object expected"
data = recommendations["data"]

assert len(data) > 0, "Data object cannot be empty"

for timestamp, interval_recommendation_obj in data.items():
assert interval_recommendation_obj["recommendation_terms"], "Term based recommendations expected"
terms = interval_recommendation_obj["recommendation_terms"]

            assert len(terms) > 0, "At least one term is expected"

for term_name, term_obj in terms.items():
term_notifications = term_obj["notifications"]

if NOTIFICATION_CODE_FOR_COST_RECOMMENDATIONS_AVAILABLE in term_notifications:
cost_limits_map = term_obj["recommendation_engines"]["cost"]["config"]["limits"]
validate_limits_map_for_accelerator(cost_limits_map)

if NOTIFICATION_CODE_FOR_PERFORMANCE_RECOMMENDATIONS_AVAILABLE in term_notifications:
perf_limits_map = term_obj["recommendation_engines"]["performance"]["config"]["limits"]
validate_limits_map_for_accelerator(perf_limits_map)
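
For reference, below is a minimal sketch (hypothetical values) of the kind of limits map this helper accepts. The MIG resource key and the amount/format values mirror the assertions in validate_limits_map_for_accelerator; the cpu and memory entries are illustrative only.

# Hypothetical limits map accepted by validate_limits_map_for_accelerator.
# Non-nvidia keys are skipped by the helper; the nvidia key must match
# MIG_PATTERN and carry amount 1.0 with format "cores".
sample_limits = {
    "cpu": {"amount": 2.0, "format": "cores"},
    "memory": {"amount": 500.0, "format": "MiB"},
    "nvidia.com/mig-1g.5gb": {"amount": 1.0, "format": "cores"},
}

validate_limits_map_for_accelerator(sample_limits)  # passes all assertions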




18 changes: 18 additions & 0 deletions tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
@@ -164,3 +164,21 @@ Local monitoring tests can also be run without using the test_autotune.sh. To do

Note: You can check the report.html for the results as it provides better readability


### Accelerator Test:

Kruize 0.1 supports Accelerator Recommendations, which provide a right-sized MIG config as the recommendation.

The test `test_list_recommendations.py::test_accelerator_recommendation_if_exists` checks that the accelerator recommendations are in the expected format.

#### Prerequisites to run the test:

In addition to the prerequisites mentioned above, make sure that a workload named `human-eval-benchmark` is running in the `unpartitioned` namespace and has accelerator usage data.

See [How to run the human eval benchmark?](https://github.com/kruize/benchmarks/tree/master/human-eval-benchmark) for instructions on running the benchmark.

Alternatively, you can change the workload name and namespace in the test to match your workload.

Note: The test will fail if run as-is when no matching workload is found; the result can be ignored for non-GPU workloads.
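
A minimal sketch of invoking just this test from Python is shown below; it assumes the suite's pytest conftest exposes a `--cluster_type` option (the flag name is an assumption, not confirmed by this change) and uses pytest's keyword filtering.

# Hypothetical invocation: run only the accelerator test via pytest's Python API.
# The --cluster_type option is assumed to come from the suite's conftest.
import pytest

pytest.main([
    "-m", "sanity",
    "-k", "test_accelerator_recommendation_if_exists",
    "--cluster_type", "openshift",
])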

@@ -171,4 +171,151 @@ def test_list_recommendations_namespace_single_result(test_name, expected_status
# Delete experiment
response = delete_experiment(input_json_file)
print("delete exp = ", response.status_code)
assert response.status_code == SUCCESS_STATUS_CODE
assert response.status_code == SUCCESS_STATUS_CODE

@pytest.mark.sanity
@pytest.mark.parametrize(
"test_name, expected_status_code, version, experiment_name, cluster_name, performance_profile, mode, target_cluster, datasource, experiment_type, kubernetes_obj_type, name, namespace, namespace_name, container_image_name, container_name, measurement_duration, threshold",
[
("list_accelerator_recommendations", SUCCESS_STATUS_CODE, "v2.0", "human_eval_exp", "cluster-1", "resource-optimization-local-monitoring", "monitor", "local", "prometheus-1", "container", "statefulset", "human-eval-benchmark", "unpartitioned", None, None, "human-eval-benchmark", "15min", "0.1"),
]
)
def test_accelerator_recommendation_if_exists(
test_name,
expected_status_code,
version,
experiment_name,
cluster_name,
performance_profile,
mode,
target_cluster,
datasource,
experiment_type,
kubernetes_obj_type,
name,
namespace,
namespace_name,
container_image_name,
container_name,
measurement_duration,
threshold,
cluster_type):
"""
Test Description: This test validates listRecommendations by passing a valid
container experiment name which has gpu usage
"""
# Generate a temporary JSON filename
tmp_json_file = "/tmp/create_exp_" + test_name + ".json"
print("tmp_json_file = ", tmp_json_file)

# Load the Jinja2 template
environment = Environment(loader=FileSystemLoader("../json_files/"))
template = environment.get_template("create_exp_template.json")

# Render the JSON content from the template
content = template.render(
version=version,
experiment_name=experiment_name,
cluster_name=cluster_name,
performance_profile=performance_profile,
mode=mode,
target_cluster=target_cluster,
datasource=datasource,
experiment_type=experiment_type,
kubernetes_obj_type=kubernetes_obj_type,
name=name,
namespace=namespace,
namespace_name=namespace_name,
container_image_name=container_image_name,
container_name=container_name,
measurement_duration=measurement_duration,
threshold=threshold
)

# Convert rendered content to a dictionary
json_content = json.loads(content)

if json_content[0]["kubernetes_objects"][0]["type"] == "None":
json_content[0]["kubernetes_objects"][0].pop("type")
if json_content[0]["kubernetes_objects"][0]["namespaces"]["namespace_name"] == "None":
json_content[0]["kubernetes_objects"][0].pop("namespaces")
if json_content[0]["kubernetes_objects"][0]["containers"][0]["container_name"] == "None":
json_content[0]["kubernetes_objects"][0].pop("containers")

# Write the final JSON to the temp file
with open(tmp_json_file, mode="w", encoding="utf-8") as message:
json.dump(json_content, message, indent=4)

input_json_file = tmp_json_file

form_kruize_url(cluster_type)
response = delete_experiment(input_json_file)
print("delete exp = ", response.status_code)

    # Install default metric profile
if cluster_type == "minikube":
metric_profile_json_file = metric_profile_dir / 'resource_optimization_local_monitoring_norecordingrules.json'

if cluster_type == "openshift":
metric_profile_json_file = metric_profile_dir / 'resource_optimization_local_monitoring.json'

response = delete_metric_profile(metric_profile_json_file)
print("delete metric profile = ", response.status_code)

# Create metric profile using the specified json
response = create_metric_profile(metric_profile_json_file)

data = response.json()
print(data['message'])

assert response.status_code == SUCCESS_STATUS_CODE
assert data['status'] == SUCCESS_STATUS

json_file = open(metric_profile_json_file, "r")
input_json = json.loads(json_file.read())
metric_profile_name = input_json['metadata']['name']
assert data['message'] == CREATE_METRIC_PROFILE_SUCCESS_MSG % metric_profile_name

response = list_metric_profiles(name=metric_profile_name, logging=False)
metric_profile_json = response.json()

assert response.status_code == SUCCESS_200_STATUS_CODE

# Validate the json against the json schema
errorMsg = validate_list_metric_profiles_json(metric_profile_json, list_metric_profiles_schema)
assert errorMsg == ""

    # Create container experiment using the specified json
response = create_experiment(input_json_file)

data = response.json()
print(data['message'])

assert response.status_code == SUCCESS_STATUS_CODE
assert data['status'] == SUCCESS_STATUS
assert data['message'] == CREATE_EXP_SUCCESS_MSG

# generate recommendations
json_file = open(input_json_file, "r")
input_json = json.loads(json_file.read())
exp_name = input_json[0]['experiment_name']

response = generate_recommendations(exp_name)
assert response.status_code == SUCCESS_STATUS_CODE

# Invoke list recommendations for the specified experiment
response = list_recommendations(exp_name)
assert response.status_code == SUCCESS_200_STATUS_CODE
list_reco_json = response.json()

# Validate the json against the json schema
errorMsg = validate_list_reco_json(list_reco_json, list_reco_json_local_monitoring_schema)
assert errorMsg == ""

# Validate accelerator info
validate_accelerator_recommendations_for_container(list_reco_json)

# Delete experiment
response = delete_experiment(input_json_file)
print("delete exp = ", response.status_code)
assert response.status_code == SUCCESS_STATUS_CODE
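
For orientation, a minimal, hypothetical fragment of the listRecommendations response that validate_accelerator_recommendations_for_container walks is sketched below. The field names follow the assertions in utils.py above; the timestamp, term name, and notification-code placeholder are illustrative only.

# Hypothetical response fragment; only the fields touched by
# validate_accelerator_recommendations_for_container are shown.
# COST_CODE stands in for NOTIFICATION_CODE_FOR_COST_RECOMMENDATIONS_AVAILABLE.
COST_CODE = "<cost-recommendations-available-code>"

sample_reco_json = [{
    "experiment_type": "container",
    "kubernetes_objects": [{
        "containers": [{
            "recommendations": {
                "data": {
                    "2024-10-15T06:45:00.000Z": {
                        "recommendation_terms": {
                            "short_term": {
                                "notifications": {COST_CODE: {}},
                                "recommendation_engines": {
                                    "cost": {"config": {"limits": {
                                        "nvidia.com/mig-1g.5gb": {"amount": 1.0, "format": "cores"}
                                    }}}
                                }
                            }
                        }
                    }
                }
            }
        }]
    }]
}]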
