From 3d5c9de586a7c3ed8295fed30eaa5341364388f3 Mon Sep 17 00:00:00 2001
From: Shreya
Date: Thu, 22 Aug 2024 15:04:01 +0530
Subject: [PATCH 1/4] Add metric query label constants

---
 .../com/autotune/analyzer/utils/AnalyzerConstants.java | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
index c91f600ea..dd63a11be 100644
--- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
+++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
@@ -63,6 +63,10 @@ public class AnalyzerConstants {
     public static final String NONE = "none";
     public static final String POD_VARIABLE = "$POD$";
     public static final String NAMESPACE_VARIABLE = "$NAMESPACE$";
+    public static final String CONTAINER_VARIABLE = "$CONTAINER_NAME$";
+    public static final String MEASUREMENT_DURATION_IN_MIN_VARAIBLE = "$MEASUREMENT_DURATION_IN_MIN$";
+    public static final String WORKLOAD_VARIABLE = "$WORKLOAD$";
+    public static final String WORKLOAD_TYPE_VARIABLE = "$WORKLOAD_TYPE$";
     public static final String API_VERSION = "apiVersion";
     public static final String KIND = "kind";
     public static final String RESOURCE_VERSION = "resourceVersion";
@@ -159,7 +163,8 @@ public enum MetricName {
         memoryRequest,
         memoryLimit,
         memoryUsage,
-        memoryRSS
+        memoryRSS,
+        maxDate
     }
 
     public enum K8S_OBJECT_TYPES {
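Patch 1 only defines the placeholder tokens; the recommendation engine added in patch 2 substitutes them into the PromQL templates supplied by a metric profile, and patch 4 puts those placeholders into the manifest queries. A minimal, self-contained sketch of that substitution, assuming a hypothetical template string and sample namespace/container values (the constants are redefined locally so the sketch compiles on its own; nothing here is copied verbatim from the repository):

    public class QueryTemplateDemo {
        // Placeholder tokens mirroring the AnalyzerConstants additions above.
        static final String NAMESPACE_VARIABLE = "$NAMESPACE$";
        static final String CONTAINER_VARIABLE = "$CONTAINER_NAME$";
        static final String MEASUREMENT_DURATION_IN_MIN_VARAIBLE = "$MEASUREMENT_DURATION_IN_MIN$";

        public static void main(String[] args) {
            // Hypothetical template in the style of the metric-profile queries.
            String template = "avg by(container, namespace) (avg_over_time(container_memory_working_set_bytes{"
                    + "namespace=\"" + NAMESPACE_VARIABLE + "\", container=\"" + CONTAINER_VARIABLE + "\"}"
                    + "[" + MEASUREMENT_DURATION_IN_MIN_VARAIBLE + "m]))";

            // Same replace-chain style the engine uses; "default" and "tfb-server" are made-up values.
            String resolved = template
                    .replace(NAMESPACE_VARIABLE, "default")
                    .replace(CONTAINER_VARIABLE, "tfb-server")
                    .replace(MEASUREMENT_DURATION_IN_MIN_VARAIBLE, "15");

            System.out.println(resolved);
        }
    }

The resolved string is what ultimately gets URL-encoded and sent to the Prometheus endpoint in patch 2.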
From 630771860c87fe844d64b4a9654d0aa20e9417df Mon Sep 17 00:00:00 2001
From: Shreya
Date: Thu, 22 Aug 2024 15:23:13 +0530
Subject: [PATCH 2/4] Add fetchMetricsBasedOnProfileAndDatasource to fetch metric data using MetricProfile

---
 .../engine/RecommendationEngine.java | 218 +++++++++++++++++-
 1 file changed, 216 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
index ad06a3d69..b6da95d29 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
@@ -2,6 +2,7 @@
 import com.autotune.analyzer.kruizeObject.KruizeObject;
 import com.autotune.analyzer.kruizeObject.RecommendationSettings;
+import com.autotune.analyzer.performanceProfiles.PerformanceProfile;
 import com.autotune.analyzer.plots.PlotManager;
 import com.autotune.analyzer.recommendations.ContainerRecommendations;
 import com.autotune.analyzer.recommendations.RecommendationConfigItem;
@@ -19,6 +20,8 @@
 import com.autotune.analyzer.utils.AnalyzerErrorConstants;
 import com.autotune.common.data.ValidationOutputData;
 import com.autotune.common.data.dataSourceQueries.PromQLDataSourceQueries;
+import com.autotune.common.data.metrics.AggregationFunctions;
+import com.autotune.common.data.metrics.Metric;
 import com.autotune.common.data.metrics.MetricAggregationInfoResults;
 import com.autotune.common.data.metrics.MetricResults;
 import com.autotune.common.data.result.ContainerData;
@@ -1423,8 +1426,8 @@ private String getResults(Map<String, KruizeObject> mainKruizeExperimentMAP, Kru
             if (dataSourceInfo == null) {
                 throw new DataSourceNotExist(KruizeConstants.DataSourceConstants.DataSourceErrorMsgs.MISSING_DATASOURCE_INFO);
             }
-            // Fetch metrics based on the datasource
-            fetchMetricsBasedOnDatasource(kruizeObject, interval_end_time, intervalStartTime, dataSourceInfo);
+            // Fetch metrics dynamically from the metric profile based on the datasource
+            fetchMetricsBasedOnProfileAndDatasource(kruizeObject, interval_end_time, intervalStartTime, dataSourceInfo);
         }
         return errorMsg;
     }
@@ -1608,4 +1611,215 @@ public void fetchMetricsBasedOnDatasource(KruizeObject kruizeObject, Timestamp i
             throw new Exception(AnalyzerErrorConstants.APIErrors.UpdateRecommendationsAPI.METRIC_EXCEPTION + e.getMessage());
         }
     }
+
+    /**
+     * Fetches metrics based on the specified datasource using queries from the metric profile for the given time interval.
+     *
+     * @param kruizeObject        KruizeObject
+     * @param interval_end_time   The end time of the interval in the format yyyy-MM-ddTHH:mm:sssZ
+     * @param interval_start_time The start time of the interval in the format yyyy-MM-ddTHH:mm:sssZ
+     * @param dataSourceInfo      DataSource object
+     * @throws Exception
+     */
+    public void fetchMetricsBasedOnProfileAndDatasource(KruizeObject kruizeObject, Timestamp interval_end_time, Timestamp interval_start_time, DataSourceInfo dataSourceInfo) throws Exception {
+        try {
+            long interval_end_time_epoc = 0;
+            long interval_start_time_epoc = 0;
+            SimpleDateFormat sdf = new SimpleDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT, Locale.ROOT);
+
+            String metricProfileName = kruizeObject.getPerformanceProfile();
+            PerformanceProfile metricProfile = MetricProfileCollection.getInstance().getMetricProfileCollection().get(metricProfileName);
+            if (null == metricProfile) {
+                LOGGER.error("MetricProfile does not exist or is not valid: {}", metricProfileName);
+                return;
+            }
+
+            String maxDateQuery = null;
+            List<Metric> metrics = metricProfile.getSloInfo().getFunctionVariables();
+            for (Metric metric : metrics) {
+                String name = metric.getName();
+                if (name.equals("maxDate")) {
+                    String query = metric.getAggregationFunctionsMap().get("max").getQuery();
+                    maxDateQuery = query;
+                    break;
+                }
+            }
+
+            Double measurementDurationMinutesInDouble = kruizeObject.getTrial_settings().getMeasurement_durationMinutes_inDouble();
+            List<K8sObject> kubernetes_objects = kruizeObject.getKubernetes_objects();
+
+            // Iterate over Kubernetes objects
+            for (K8sObject k8sObject : kubernetes_objects) {
+                String namespace = k8sObject.getNamespace();
+                String workload = k8sObject.getName();
+                String workload_type = k8sObject.getType();
+                HashMap<String, ContainerData> containerDataMap = k8sObject.getContainerDataMap();
+                // Iterate over containers
+                for (Map.Entry<String, ContainerData> entry : containerDataMap.entrySet()) {
+                    ContainerData containerData = entry.getValue();
+                    String containerName = containerData.getContainer_name();
+                    if (null == interval_end_time) {
+                        LOGGER.info(KruizeConstants.APIMessages.CONTAINER_USAGE_INFO);
+                        String queryToEncode;
+                        if (null != maxDateQuery) {
+                            LOGGER.info("maxDateQuery: {}", maxDateQuery);
+                            queryToEncode = maxDateQuery
+                                    .replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
+                                    .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
+                                    .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
+                                    .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
+                        } else {
+                            queryToEncode = String.format(PromQLDataSourceQueries.MAX_DATE, containerName, namespace);
+                        }
+                        String dateMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATE_ENDPOINT_WITH_QUERY,
+                                dataSourceInfo.getUrl(),
+                                URLEncoder.encode(queryToEncode, CHARACTER_ENCODING)
+                        );
+                        LOGGER.info(dateMetricsUrl);
+                        JSONObject genericJsonObject = new GenericRestApiClient(dateMetricsUrl).fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
+                        JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
+                        JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
+                        // Process fetched metrics
+                        if (null != resultArray && !resultArray.isEmpty()) {
+                            resultArray = resultArray.get(0)
+                                    .getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.VALUE);
+                            long epochTime = resultArray.get(0).getAsLong();
+                            String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
+                            Date date = sdf.parse(timestamp);
+                            Timestamp dateTS = new Timestamp(date.getTime());
+                            interval_end_time_epoc = dateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+                                    - ((long) dateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+                            int maxDay = Terms.getMaxDays(kruizeObject.getTerms());
+                            LOGGER.info(KruizeConstants.APIMessages.MAX_DAY, maxDay);
+                            Timestamp startDateTS = Timestamp.valueOf(Objects.requireNonNull(dateTS).toLocalDateTime().minusDays(maxDay));
+                            interval_start_time_epoc = startDateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+                                    - ((long) startDateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+                        }
+                    } else {
+                        // Convert timestamps to epoch time
+                        interval_end_time_epoc = interval_end_time.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+                                - ((long) interval_end_time.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+                        interval_start_time_epoc = interval_start_time.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+                                - ((long) interval_start_time.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+                    }
+                    HashMap<Timestamp, IntervalResults> containerDataResults = new HashMap<>();
+                    IntervalResults intervalResults;
+                    HashMap<AnalyzerConstants.MetricName, MetricResults> resMap;
+                    HashMap<String, MetricResults> resultMap;
+                    MetricResults metricResults;
+                    MetricAggregationInfoResults metricAggregationInfoResults;
+
+                    List<Metric> metricList = metricProfile.getSloInfo().getFunctionVariables();
+
+                    // Iterate over metrics and aggregation functions
+                    for (Metric metricEntry : metricList) {
+                        HashMap<String, AggregationFunctions> aggregationFunctions = metricEntry.getAggregationFunctionsMap();
+                        for (Map.Entry<String, AggregationFunctions> aggregationFunctionsEntry : aggregationFunctions.entrySet()) {
+                            // Determine the PromQL query for this metric and aggregation function
+                            String metricQuery = aggregationFunctionsEntry.getValue().getQuery();
+                            String promQL = metricQuery;
+                            String format = null;
+
+                            // Determine format based on metric type - TODO: move this to the metric profile
+                            List<String> cpuFunction = Arrays.asList(AnalyzerConstants.MetricName.cpuUsage.toString(), AnalyzerConstants.MetricName.cpuThrottle.toString(), AnalyzerConstants.MetricName.cpuLimit.toString(), AnalyzerConstants.MetricName.cpuRequest.toString());
+                            List<String> memFunction = Arrays.asList(AnalyzerConstants.MetricName.memoryLimit.toString(), AnalyzerConstants.MetricName.memoryRequest.toString(), AnalyzerConstants.MetricName.memoryRSS.toString(), AnalyzerConstants.MetricName.memoryUsage.toString());
+                            if (cpuFunction.contains(metricEntry.getName())) {
+                                format = KruizeConstants.JSONKeys.CORES;
+                            } else if (memFunction.contains(metricEntry.getName())) {
+                                format = KruizeConstants.JSONKeys.BYTES;
+                            }
+
+                            promQL = promQL
+                                    .replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
+                                    .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
+                                    .replace(AnalyzerConstants.MEASUREMENT_DURATION_IN_MIN_VARAIBLE, Integer.toString(measurementDurationMinutesInDouble.intValue()))
+                                    .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
+                                    .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
+
+                            // If promQL is determined, fetch metrics from the datasource
+                            if (promQL != null) {
+                                LOGGER.info(promQL);
+                                String podMetricsUrl;
+                                try {
+                                    podMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATASOURCE_ENDPOINT_WITH_QUERY,
+                                            dataSourceInfo.getUrl(),
+                                            URLEncoder.encode(promQL, CHARACTER_ENCODING),
+                                            interval_start_time_epoc,
+                                            interval_end_time_epoc,
+                                            measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+                                    LOGGER.info(podMetricsUrl);
+                                    JSONObject genericJsonObject = new GenericRestApiClient(podMetricsUrl).fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
+                                    JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
+                                    JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
+                                    // Process fetched metrics
+                                    if (null != resultArray && !resultArray.isEmpty()) {
+                                        resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(
+                                                        KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT).get(0)
+                                                .getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants
+                                                        .DataSourceQueryJSONKeys.VALUES);
+                                        sdf.setTimeZone(TimeZone.getTimeZone(KruizeConstants.TimeUnitsExt.TimeZones.UTC));
+
+                                        // Iterate over fetched metrics
+                                        Timestamp sTime = new Timestamp(interval_start_time_epoc);
+                                        for (JsonElement element : resultArray) {
+                                            JsonArray valueArray = element.getAsJsonArray();
+                                            long epochTime = valueArray.get(0).getAsLong();
+                                            double value = valueArray.get(1).getAsDouble();
+                                            String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
+                                            Date date = sdf.parse(timestamp);
+                                            Timestamp eTime = new Timestamp(date.getTime());
+
+                                            // Prepare interval results
+                                            if (containerDataResults.containsKey(eTime)) {
+                                                intervalResults = containerDataResults.get(eTime);
+                                                resMap = intervalResults.getMetricResultsMap();
+                                            } else {
+                                                intervalResults = new IntervalResults();
+                                                resMap = new HashMap<>();
+                                            }
+                                            AnalyzerConstants.MetricName metricName = AnalyzerConstants.MetricName.valueOf(metricEntry.getName());
+                                            if (resMap.containsKey(metricName)) {
+                                                metricResults = resMap.get(metricName);
+                                                metricAggregationInfoResults = metricResults.getAggregationInfoResult();
+                                            } else {
+                                                metricResults = new MetricResults();
+                                                metricAggregationInfoResults = new MetricAggregationInfoResults();
+                                            }
+
+                                            Method method = MetricAggregationInfoResults.class.getDeclaredMethod(KruizeConstants.APIMessages.SET + aggregationFunctionsEntry.getKey().substring(0, 1).toUpperCase() + aggregationFunctionsEntry.getKey().substring(1), Double.class);
+                                            method.invoke(metricAggregationInfoResults, value);
+                                            metricAggregationInfoResults.setFormat(format);
+                                            metricResults.setAggregationInfoResult(metricAggregationInfoResults);
+                                            metricResults.setName(metricEntry.getName());
+                                            metricResults.setFormat(format);
+                                            resMap.put(metricName, metricResults);
+                                            intervalResults.setMetricResultsMap(resMap);
+                                            intervalResults.setIntervalStartTime(sTime);  // TODO: this will change
+                                            intervalResults.setIntervalEndTime(eTime);
+                                            intervalResults.setDurationInMinutes((double) ((eTime.getTime() - sTime.getTime())
+                                                    / ((long) KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE
+                                                    * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC)));
+                                            containerDataResults.put(eTime, intervalResults);
+                                            sTime = eTime;
+                                        }
+                                    }
+                                } catch (Exception e) {
+                                    throw new RuntimeException(e);
+                                }
+                            }
+                        }
+                    }
+
+                    containerData.setResults(containerDataResults);
+                    if (!containerDataResults.isEmpty())
+                        setInterval_end_time(Collections.max(containerDataResults.keySet()));  // TODO: Temp fix - an invalid date is set if an experiment has two containers with different last-seen dates
+
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+            throw new Exception(AnalyzerErrorConstants.APIErrors.UpdateRecommendationsAPI.METRIC_EXCEPTION + e.getMessage());
+        }
+    }
 }
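The least obvious step in the loop above is how an aggregation-function key such as "avg" or "max" becomes a setter call on the aggregation-info object via reflection. A stripped-down sketch of the same pattern, using a small stand-in class instead of Kruize's MetricAggregationInfoResults (all names in this sketch are illustrative, not taken from the codebase):

    import java.lang.reflect.Method;

    public class ReflectiveSetterDemo {
        // Stand-in for MetricAggregationInfoResults: one Double field per aggregation function.
        public static class AggregationInfo {
            private Double avg;
            private Double max;
            public void setAvg(Double avg) { this.avg = avg; }
            public void setMax(Double max) { this.max = max; }
            @Override public String toString() { return "avg=" + avg + ", max=" + max; }
        }

        public static void main(String[] args) throws Exception {
            AggregationInfo info = new AggregationInfo();
            String key = "avg";      // aggregation-function name from the profile
            double value = 0.42;     // sample value parsed from the Prometheus response

            // "avg" -> "setAvg", then invoke it with the parsed value,
            // the same idea as the engine's getDeclaredMethod(...) call.
            String setterName = "set" + key.substring(0, 1).toUpperCase() + key.substring(1);
            Method setter = AggregationInfo.class.getDeclaredMethod(setterName, Double.class);
            setter.invoke(info, value);

            System.out.println(info);  // prints: avg=0.42, max=null
        }
    }

Since the engine builds the setter name from KruizeConstants.APIMessages.SET plus the capitalised key (presumably the literal "set"), every aggregation function named in a profile needs a matching Double setter on MetricAggregationInfoResults.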
From 06ba1ffbe2a08b63b5599c07aa0afcb0a622e768 Mon Sep 17 00:00:00 2001
From: Shreya
Date: Thu, 22 Aug 2024 15:34:25 +0530
Subject: [PATCH 3/4] Add namespace and GPU metric name constants

---
 .../autotune/analyzer/utils/AnalyzerConstants.java | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
index dd63a11be..31fc6f628 100644
--- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
+++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
@@ -164,7 +164,19 @@ public enum MetricName {
         memoryLimit,
         memoryUsage,
         memoryRSS,
-        maxDate
+        maxDate,
+        namespaceCpuRequest,
+        namespaceCpuLimit,
+        namespaceCpuUsage,
+        namespaceCpuThrottle,
+        namespaceMemoryRequest,
+        namespaceMemoryLimit,
+        namespaceMemoryUsage,
+        namespaceMemoryRSS,
+        namespaceTotalPods,
+        namespaceRunningPods,
+        gpuCoreUsage,
+        gpuMemoryUsage
     }
 
     public enum K8S_OBJECT_TYPES {
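Patch 2 maps each profile metric name to this enum with AnalyzerConstants.MetricName.valueOf(...), so every function_variable name used in a metric profile has to match one of the constants above exactly. A small illustrative check of that mapping (the trimmed enum and the try/catch guard are examples for this sketch only, not code from the repository):

    public class MetricNameLookupDemo {
        // Trimmed copy of the enum for the sketch; the real one lives in AnalyzerConstants.MetricName.
        enum MetricName { cpuUsage, memoryRSS, maxDate, namespaceCpuUsage, gpuCoreUsage }

        public static void main(String[] args) {
            String[] profileNames = {"cpuUsage", "gpuCoreUsage", "cpuUsagePercent"};
            for (String name : profileNames) {
                try {
                    MetricName metricName = MetricName.valueOf(name);
                    System.out.println(name + " -> known metric " + metricName);
                } catch (IllegalArgumentException e) {
                    // An unknown name would make a bare valueOf call throw,
                    // so profiles must stay in sync with the enum.
                    System.out.println(name + " -> not a recognised MetricName");
                }
            }
        }
    }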
unit="core" ,namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})' # CPU Limit # Show cpu limits in bytes for a container in a deployment @@ -39,11 +44,17 @@ slo: aggregation_functions: - function: avg - query: 'avg(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="cpu", unit="core"})' + query: 'avg by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})' # Show sum of cpu limits in bytes for a container in a deployment - function: sum - query: 'sum(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE$", resource="cpu", unit="core"})' + query: 'sum by(container,namespace) (kube_pod_container_resource_limits{container!='',container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})' + + - function: 'max' + query: 'max by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})' + + - function: 'max' + query: 'min by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})' # CPU Usage @@ -65,45 +76,45 @@ slo: # For openshift versions <=4.8 aggregation_functions: - function: avg - query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))' + query: 'avg by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!='', container!="POD", pod!="",namespace="$NAMESPACE$",container="$CONTAINER_NAME$" }[$MEASUREMENT_DURATION_IN_MIN$m]))' versions: "<=4.8" # For openshift versions >=4.9 - function: avg - query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))' + query: 'avg by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!="POD", pod!="",namespace="$NAMESPACE$",container="$CONTAINER_NAME$" }[$MEASUREMENT_DURATION_IN_MIN$m]))' versions: ">4.9" # Approx minimum CPU per container in a deployment # For openshift versions <=4.8 - function: min - query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'min by(container, namespace)(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\" }[$MEASUREMENT_DURATION_IN_MIN$m]))' versions: "<=4.8" # For openshift versions >=4.9 - function: min - query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))' + query: 'min by(container, 
@@ -65,45 +76,45 @@ slo:
     # For openshift versions <=4.8
     aggregation_functions:
       - function: avg
-        query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'avg by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: "<=4.8"
 
     # For openshift versions >=4.9
       - function: avg
-        query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'avg by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: ">4.9"
 
     # Approx minimum CPU per container in a deployment
     # For openshift versions <=4.8
       - function: min
-        query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
+        query: 'min by(container, namespace)(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: "<=4.8"
 
    # For openshift versions >=4.9
       - function: min
-        query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
+        query: 'min by(container, namespace)(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: ">4.9"
 
     # Approx maximum CPU per container in a deployment
     # For openshift versions <=4.8
       - function: max
-        query: 'max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
+        query: 'max by(container, namespace)(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: "<=4.8"
 
     # For openshift versions >=4.9
       - function: max
-        query: 'max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
+        query: 'max by(container, namespace)(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: ">4.9"
 
     # Sum of CPU usage for a container in all pods of a deployment
     # For openshift versions <=4.8
       - function: sum
-        query: 'sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
+        query: 'sum by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: "<=4.8"
 
     # For openshift versions >=4.9
       - function: sum
-        query: 'sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
+        query: 'sum by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
         versions: ">4.9"
@@ -116,15 +127,19 @@ slo:
     aggregation_functions:
     # Average CPU throttling per container in a deployment
       - function: avg
-        query: 'avg(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'avg by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Maximum CPU throttling per container in a deployment
       - function: max
-        query: 'max(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'max by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
+
+    # Min of CPU throttling for a container in all pods of a deployment
+      - function: min
+        query: 'min by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Sum of CPU throttling for a container in all pods of a deployment
       - function: sum
-        query: 'sum(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'sum by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
@@ -139,11 +154,17 @@ slo:
     aggregation_functions:
       - function: avg
-        query: 'avg(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})'
+        query: 'avg by(container, namespace) (kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
 
     # Show sum of memory requests in bytes for a container in a deployment
       - function: sum
-        query: 'sum(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})'
+        query: 'sum by(container, namespace) (kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
+
+      - function: max
+        query: 'max by(container, namespace) (kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
+
+      - function: min
+        query: 'min by(container, namespace) (kube_pod_container_resource_requests{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
 
 # Memory Limit
@@ -155,12 +176,17 @@ slo:
     aggregation_functions:
       - function: avg
-        query: 'avg(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="memory", unit="byte"})'
+        query: 'avg by(container,namespace) (kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
 
     # Show sum of memory limits in bytes for a container in a deployment
       - function: sum
-        query: 'sum(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})'
+        query: 'sum by(container,namespace) (kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
+
+      - function: max
+        query: 'max by(container,namespace) (kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
+      - function: min
+        query: 'min by(container,namespace) (kube_pod_container_resource_limits{container!="", container!="POD", pod!="", resource="memory", unit="byte", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"})'
 
 # Memory Usage
 # Average memory per container in a deployment
@@ -171,19 +197,19 @@ slo:
     aggregation_functions:
       - function: avg
-        query: 'avg(avg_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'avg by(container, namespace) (avg_over_time(container_memory_working_set_bytes{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Approx minimum memory per container in a deployment
       - function: min
-        query: 'min(min_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
+        query: 'min by(container, namespace) (min_over_time(container_memory_working_set_bytes{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Approx maximum memory per container in a deployment
       - function: max
-        query: 'max(max_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
+        query: 'max by(container, namespace) (max_over_time(container_memory_working_set_bytes{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Sum of memory usage for a contianer in all pods of a deployment
       - function: sum
-        query: 'sum(avg_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
+        query: 'sum by(container, namespace) (avg_over_time(container_memory_working_set_bytes{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
 # 2.4 Memory RSS
@@ -195,17 +221,28 @@ slo:
     aggregation_functions:
     # Average memory RSS per container in a deployment
       - function: avg
-        query: 'avg(avg_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'avg by(container, namespace) (avg_over_time(container_memory_rss{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Approx minimum memory RSS per container in a deployment
       - function: min
-        query: 'min(min_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
+        query: 'min by(container, namespace) (min_over_time(container_memory_rss{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Approx maximum memory RSS per container in a deployment
       - function: max
-        query: 'max(max_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
+        query: 'max by(container, namespace) (max_over_time(container_memory_rss{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
 
     # Sum of memory RSS for a contianer in all pods of a deployment
       - function: sum
-        query: 'sum(avg_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))'
+        query: 'sum by(container, namespace) (avg_over_time(container_memory_rss{container!="", container!="POD", pod!="", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
+
+
+  # Container Last Active Timestamp
+  - name: maxDate
+    datasource: prometheus
+    value_type: "double"
+    kubernetes_object: "container"
+
+    aggregation_functions:
+      - function: max
+        query: 'max by(namespace,container) (last_over_time((timestamp(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container="$CONTAINER_NAME$"} > 0))[15d:]))'
\ No newline at end of file
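The maxDate query above returns a single epoch timestamp: the last time the container reported any CPU usage within the trailing 15 days. Patch 2 turns that value into the recommendation window by treating it as the interval end and stepping back by the term's maximum day count. A compact sketch of that arithmetic with made-up inputs (it is a simplified version of the interval derivation in fetchMetricsBasedOnProfileAndDatasource; the engine additionally applies a timezone-offset correction):

    import java.sql.Timestamp;
    import java.time.Instant;

    public class MaxDateWindowDemo {
        public static void main(String[] args) {
            long maxDateEpochSeconds = 1724310000L;   // illustrative value returned by the maxDate query
            int maxDay = 15;                          // what Terms.getMaxDays(...) supplies in the engine

            // Interval end = last-seen timestamp; interval start = end minus maxDay days.
            Timestamp intervalEnd = Timestamp.from(Instant.ofEpochSecond(maxDateEpochSeconds));
            Timestamp intervalStart = Timestamp.valueOf(intervalEnd.toLocalDateTime().minusDays(maxDay));

            long endEpoch = intervalEnd.getTime() / 1000L;
            long startEpoch = intervalStart.getTime() / 1000L;

            System.out.println("end   = " + intervalEnd + " (" + endEpoch + ")");
            System.out.println("start = " + intervalStart + " (" + startEpoch + ")");
            System.out.println("range = " + (endEpoch - startEpoch) / 86400 + " days");
        }
    }

Those two epoch values become the start and end parameters of the Prometheus range query, with the step derived from $MEASUREMENT_DURATION_IN_MIN$ converted to seconds.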