From 31266fbc3ec803f3ba701dbbdfcdc62542d5422b Mon Sep 17 00:00:00 2001 From: Mudit Verma Date: Wed, 29 May 2024 21:43:11 +0530 Subject: [PATCH] support for node limits --- kraken/chaos_recommender/prometheus.py | 54 ++++++++++++++++++++------ 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/kraken/chaos_recommender/prometheus.py b/kraken/chaos_recommender/prometheus.py index dc76aeda..723b6d5e 100644 --- a/kraken/chaos_recommender/prometheus.py +++ b/kraken/chaos_recommender/prometheus.py @@ -17,16 +17,41 @@ def convert_data_to_dataframe(data, label): def convert_data(data, service): + result = {} + for entry in data: + pod_name = entry['metric']['pod'] + value = entry['value'][1] + result[pod_name] = value + return result.get(service) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value + +def convert_data_limits(data, node_data, service, prometheus): result = {} for entry in data: pod_name = entry['metric']['pod'] value = entry['value'][1] result[pod_name] = value - return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value + return result.get(service, get_node_capacity(node_data, service, prometheus)) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value + +def get_node_capacity(node_data, pod_name, prometheus ): + + # Get the node name on which the pod is running + query = f'kube_pod_info{{pod="{pod_name}"}}' + result = prometheus.custom_query(query) + if not result: + return None + node_name = result[0]['metric']['node'] + + for item in node_data: + if item['metric']['node'] == node_name: + return item['value'][1] + + return '1000000000' + + +def save_utilization_to_file(utilization, filename, prometheus): -def save_utilization_to_file(utilization, filename): merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK']) for namespace in utilization: # Loading utilization_data[] for namespace @@ -41,9 +66,9 @@ def save_utilization_to_file(utilization, filename): new_row_df = pd.DataFrame({ "namespace": namespace, "service": s, "CPU": convert_data(utilization_data[0], s), - "CPU_LIMITS": convert_data(utilization_data[1], s), + "CPU_LIMITS": convert_data_limits(utilization_data[1],utilization_data[5], s, prometheus), "MEM": convert_data(utilization_data[2], s), - "MEM_LIMITS": convert_data(utilization_data[3], s), + "MEM_LIMITS": convert_data_limits(utilization_data[3], utilization_data[6], s, prometheus), "NETWORK": convert_data(utilization_data[4], s)}, index=[0]) merged_df = pd.concat([merged_df, new_row_df], ignore_index=True) @@ -55,11 +80,11 @@ def save_utilization_to_file(utilization, filename): merged_df['NETWORK'] = merged_df['NETWORK'].astype(str) # Extract integer part before the decimal point - merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0] - merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0] - merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0] - merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0] - merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0] + #merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0] + #merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0] + #merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0] + #merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0] + #merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0] merged_df.to_csv(filename, sep='\t', index=False) @@ -84,20 +109,27 @@ def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace) cpu_limits_result = prometheus.custom_query(cpu_limits_query) + node_cpu_limits_query = 'kube_node_status_capacity{resource="cpu", unit="core"}*1000' + node_cpu_limits_result = prometheus.custom_query(node_cpu_limits_query) + mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration) mem_result = prometheus.custom_query(mem_query) mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace) mem_limits_result = prometheus.custom_query(mem_limits_query) + node_mem_limits_query = 'kube_node_status_capacity{resource="memory", unit="byte"}' + node_mem_limits_result = prometheus.custom_query(node_mem_limits_query) + network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \ (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration) network_result = prometheus.custom_query(network_query) - utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result] + utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, node_cpu_limits_result, node_mem_limits_result ] queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query) - save_utilization_to_file(utilization, saved_metrics_path) + save_utilization_to_file(utilization, saved_metrics_path, prometheus) + return saved_metrics_path, queries