diff --git a/design/KruizePromQL.md b/design/KruizePromQL.md index 54ad31118..108da9852 100644 --- a/design/KruizePromQL.md +++ b/design/KruizePromQL.md @@ -108,6 +108,29 @@ KruizeMethod_count{application="Kruize",method="generatePlots",status="success", KruizeMethod_sum{application="Kruize",method="generatePlots",status="success",} 0.050705769 ``` +## Kruize Notifications Metrics + +The following are the available Kruize notifications that get generated after an updateRecommendations API call. + +Sample Output: + +``` +KruizeNotifications_total{api="updateRecommendations",application="Kruize",experiment_details="quarkus-resteasy-kruize-min-http-response-time-db_1_2|tfb-server-0|2023-01-02T05:30:00.000Z|timestamp|null|null|223002|error|Invalid Amount in CPU Section",} 1.0 + +KruizeNotifications_total{api="updateRecommendations",application="Kruize",experiment_details="quarkus-resteasy-kruize-min-http-response-time-db_1_2|tfb-server-0|2023-01-02T05:30:00.000Z|model|short_term|performance|221001|error|Number of pods cannot be zero",} 1.0 + +KruizeNotifications_total{api="updateRecommendations",application="Kruize",experiment_details="quarkus-resteasy-kruize-min-http-response-time-db_1_2|tfb-server-0|2023-01-02T05:30:00.000Z|model|short_term|cost|221001|error|Number of pods cannot be zero",} 1.0 +``` +A counter is created for each recommendation notification; its `experiment_details` tag is a `|`-separated string of the form `experiment_name|container_name|interval_end_time|level|term|model|code|type|message`, built from the given level, term, model, and list of recommendation notifications. + +level The level of the notification (e.g., container, timestamp, term, model). + +term The term associated with the notification. + +model The cost or performance model associated with the notification. + +recommendationNotificationList The list of recommendation notifications to create counters for. 
code , type and message + ## Time taken for KruizeDB metrics To monitor the performance of these methods, you can use the following metrics: diff --git a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java index 93080bcf4..ad06a3d69 100644 --- a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java +++ b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java @@ -28,6 +28,7 @@ import com.autotune.common.k8sObjects.K8sObject; import com.autotune.common.utils.CommonUtils; import com.autotune.database.service.ExperimentDBService; +import com.autotune.metrics.KruizeNotificationCollectionRegistry; import com.autotune.operator.KruizeDeploymentInfo; import com.autotune.utils.GenericRestApiClient; import com.autotune.utils.KruizeConstants; @@ -329,6 +330,10 @@ public void generateRecommendations(KruizeObject kruizeObject) { // generate recommendations based on each container generateRecommendationsBasedOnContainer(containerData, kruizeObject); // TODO: generate recommendations based on namespace, kubernetes_object name and type + // todo The process of data validation and notification generation is currently tightly coupled and needs to be separated. By doing so, we can avoid additional iterations at kruizeNotificationCollectionRegistry.logNotification. This should be included as part of the code refactor. 
+ KruizeNotificationCollectionRegistry kruizeNotificationCollectionRegistry = new KruizeNotificationCollectionRegistry(kruizeObject.getExperimentName(), getInterval_end_time(), containerData.getContainer_name()); + kruizeNotificationCollectionRegistry.logNotification(containerData); + } } } @@ -390,6 +395,8 @@ private void generateRecommendationsBasedOnContainer(ContainerData containerData containerRecommendations.setNotificationMap(recommendationLevelNM); // set the data object to map containerRecommendations.setData(timestampBasedRecommendationMap); + + // set the container recommendations in container object containerData.setContainerRecommendations(containerRecommendations); } @@ -1430,7 +1437,7 @@ private String getResults(Map mainKruizeExperimentMAP, Kru * @param interval_start_time The start time of the interval for fetching metrics. * @param dataSourceInfo The datasource object to fetch metrics from. * @throws Exception if an error occurs during the fetching process. - * TODO: Need to add right abstractions for this + * TODO: Need to add right abstractions for this */ public void fetchMetricsBasedOnDatasource(KruizeObject kruizeObject, Timestamp interval_end_time, Timestamp interval_start_time, DataSourceInfo dataSourceInfo) throws Exception { try { diff --git a/src/main/java/com/autotune/metrics/KruizeNotificationCollectionRegistry.java b/src/main/java/com/autotune/metrics/KruizeNotificationCollectionRegistry.java new file mode 100644 index 000000000..f03b6242a --- /dev/null +++ b/src/main/java/com/autotune/metrics/KruizeNotificationCollectionRegistry.java @@ -0,0 +1,90 @@ +package com.autotune.metrics; + +import com.autotune.analyzer.recommendations.RecommendationNotification; +import com.autotune.analyzer.recommendations.objects.MappedRecommendationForModel; +import com.autotune.analyzer.recommendations.objects.MappedRecommendationForTimestamp; +import com.autotune.analyzer.recommendations.objects.TermRecommendations; +import 
com.autotune.common.data.result.ContainerData; +import com.autotune.operator.KruizeDeploymentInfo; +import com.autotune.utils.KruizeConstants; +import com.autotune.utils.MetricsConfig; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.Tags; + +import java.sql.Timestamp; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +/** + * KruizeNotificationCollectionRegistry is responsible for logging and creating metrics for notifications + * related to Kruize recommendations. + */ +public class KruizeNotificationCollectionRegistry { + private String experiment_name; + private Timestamp interval_end_time; + private String container_name; + + /** + * Constructor to initialize KruizeNotificationCollectionRegistry with experiment name, interval end time, and container name. + * + * @param experiment_name Name of the experiment. + * @param interval_end_time End time of the interval. + * @param container_name Name of the container. + */ + public KruizeNotificationCollectionRegistry(String experiment_name, Timestamp interval_end_time, String container_name) { + this.experiment_name = experiment_name; + this.interval_end_time = interval_end_time; + this.container_name = container_name; + } + + /** + * Logs notifications from the given ContainerData by iterating through its recommendation structure and creating appropriate counters. + * + * @param containerData The container data from which to log notifications. 
+ */ + public void logNotification(ContainerData containerData) { + HashMap containerLevelNotifications = containerData.getContainerRecommendations().getNotificationMap(); + createCounterTag("container", null, null, containerLevelNotifications.values()); + for (MappedRecommendationForTimestamp mappedRecommendationForTimestamp : containerData.getContainerRecommendations().getData().values()) { + HashMap timeStampNotificationHashMap = mappedRecommendationForTimestamp.getHigherLevelNotificationMap(); + createCounterTag("timestamp", null, null, timeStampNotificationHashMap.values()); + for (Map.Entry entry : mappedRecommendationForTimestamp.getRecommendationForTermHashMap().entrySet()) { + String termName = entry.getKey(); + TermRecommendations termRecommendations = entry.getValue(); + HashMap termLevelNotificationHashMap = termRecommendations.getNotifications(); + createCounterTag("term", termName, null, termLevelNotificationHashMap.values()); + if (null != termRecommendations.getRecommendationForModelHashMap()) { + for (Map.Entry recommendationForModel : termRecommendations.getRecommendationForModelHashMap().entrySet()) { + String modelName = recommendationForModel.getKey(); + MappedRecommendationForModel mappedRecommendationForModel = recommendationForModel.getValue(); + HashMap modelNotificationHashMap = mappedRecommendationForModel.getNotificationHashMap(); + createCounterTag("model", termName, modelName, modelNotificationHashMap.values()); + } + } + } + } + } + + /** + * Creates a counter with tags for the given level, term, model, and list of recommendation notifications. + * + * @param level The level of the notification (e.g., container, timestamp, term, model). + * @param term The term associated with the notification. + * @param model The cost or performance model associated with the notification. + * @param recommendationNotificationList The list of recommendation notifications to create counters for. 
+ */ + public void createCounterTag(String level, String term, String model, Collection recommendationNotificationList) { + for (RecommendationNotification recommendationNotification : recommendationNotificationList) { + Tags additionalTags = Tags.empty(); + if (("|" + KruizeDeploymentInfo.log_recommendation_metrics_level + "|").contains("|" + recommendationNotification.getType() + "|") == true) { + additionalTags = additionalTags.and(KruizeConstants.KRUIZE_RECOMMENDATION_METRICS.TAG_NAME, String.format(KruizeConstants.KRUIZE_RECOMMENDATION_METRICS.notification_format, this.experiment_name, this.container_name, KruizeConstants.DateFormats.simpleDateFormatForUTC.format(this.interval_end_time), level, term, model, String.valueOf(recommendationNotification.getCode()), recommendationNotification.getType(), recommendationNotification.getMessage())); + Counter counterNotifications = MetricsConfig.meterRegistry().find(KruizeConstants.KRUIZE_RECOMMENDATION_METRICS.METRIC_NAME).tags(additionalTags).counter(); + if (counterNotifications == null) { + counterNotifications = MetricsConfig.timerBKruizeNotifications.tags(additionalTags).register(MetricsConfig.meterRegistry); + } + counterNotifications.increment(); + } + } + } +} diff --git a/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java b/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java index fed68b47c..4be00ff62 100644 --- a/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java +++ b/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java @@ -61,6 +61,7 @@ public class KruizeDeploymentInfo { public static String k8s_type; // ABC public static String auth_type; public static Boolean plots = true; + public static String log_recommendation_metrics_level = "error|critical"; public static String auth_token; public static String database_admin_username; public static String database_admin_password; diff --git a/src/main/java/com/autotune/utils/KruizeConstants.java 
b/src/main/java/com/autotune/utils/KruizeConstants.java index fe28a5a29..27c468210 100644 --- a/src/main/java/com/autotune/utils/KruizeConstants.java +++ b/src/main/java/com/autotune/utils/KruizeConstants.java @@ -14,8 +14,13 @@ * limitations under the License. *******************************************************************************/ + package com.autotune.utils; +import java.text.SimpleDateFormat; +import java.util.Locale; +import java.util.TimeZone; + /** * Constants for Autotune module */ @@ -29,6 +34,21 @@ public class KruizeConstants { private KruizeConstants() { } + public static enum KRUIZE_RECOMMENDATION_API_VERSION { + V1_0("1.0"), + LATEST("1.0"); + private final String versionNumber; + + KRUIZE_RECOMMENDATION_API_VERSION(String versionNumber) { + this.versionNumber = versionNumber; + } + + public String getVersionNumber() { + return versionNumber; + } + + } + public static class APIMessages { public static final String MAX_DAY = "maxDay : %s"; public static final String SUCCESS = "success"; @@ -46,23 +66,6 @@ public static class APIMessages { public static final String UPDATE_RECOMMENDATIONS_FAILURE_MSG = "UpdateRecommendations API failed for experiment_name: %s and intervalEndTimeStr : %s due to %s"; } - - public static enum KRUIZE_RECOMMENDATION_API_VERSION { - V1_0("1.0"), - LATEST("1.0"); - private final String versionNumber; - - KRUIZE_RECOMMENDATION_API_VERSION(String versionNumber) { - this.versionNumber = versionNumber; - } - - public String getVersionNumber() { - return versionNumber; - } - - } - - /** * Holds the constants of env vars and values to start Autotune in different Modes */ @@ -275,11 +278,11 @@ public static final class TimeUnitsExt { public static final String HOUR_SINGLE_LC = "h"; public static final String HOUR_SINGLE_UC = HOUR_SINGLE_LC.toUpperCase(); - public static final class TimeZones { - public static final String UTC = "UTC"; + private TimeUnitsExt() { } - private TimeUnitsExt() { + public static final class 
TimeZones { + public static final String UTC = "UTC"; } } @@ -374,22 +377,25 @@ public static class DataSourceConstants { public static final String PROMETHEUS_DEFAULT_SERVICE_PORT = "9090"; public static final String PROMETHEUS_REACHABILITY_QUERY = "up"; public static final String DATASOURCE_ENDPOINT_WITH_QUERY = "%s/api/v1/query_range?query=%s&start=%s&end=%s&step=%s"; - public static final String DATE_ENDPOINT_WITH_QUERY = "%s/api/v1/query?query=%s"; + public static final String DATE_ENDPOINT_WITH_QUERY = "%s/api/v1/query?query=%s"; - public static class DataSourceDetailsInfoConstants { - private DataSourceDetailsInfoConstants() { - } + private DataSourceConstants() { + } + public static class DataSourceDetailsInfoConstants { public static final String version = "v1.0"; public static final String CLUSTER_NAME = "default"; + private DataSourceDetailsInfoConstants() { + } } public static class DataSourceInfoMsgs { public static final String ADDING_DATASOURCE = "Trying to add the datasource to collection: "; public static final String VERIFYING_DATASOURCE_REACHABILITY = "Verifying datasource reachability status: "; public static final String CHECKING_AVAILABLE_DATASOURCE = "Checking available datasources:"; - public static final String CHECKING_AVAILABLE_DATASOURCE_FROM_DB = "Checking available datasources from database:"; - public static final String NO_DATASOURCE_FOUND_IN_DB = "No datasource found in database."; + public static final String CHECKING_AVAILABLE_DATASOURCE_FROM_DB = "Checking available datasources from database:"; + public static final String NO_DATASOURCE_FOUND_IN_DB = "No datasource found in database."; + private DataSourceInfoMsgs() { } } @@ -398,14 +404,12 @@ public static class DataSourceSuccessMsgs { public static final String DATASOURCE_ADDED = "Datasource added to the collection successfully."; public static final String DATASOURCE_FOUND = "Datasource found: "; public static final String DATASOURCE_SERVICEABLE = "Datasource is serviceable."; 
+ private DataSourceSuccessMsgs() { } } public static class DataSourceErrorMsgs { - private DataSourceErrorMsgs() { - } - public static final String MISSING_DATASOURCE_NAME = "Datasource name cannot be empty."; public static final String MISSING_DATASOURCE_PROVIDER = "Datasource provider cannot be empty."; public static final String MISSING_DATASOURCE_NAMESPACE = "Datasource namespace cannot be empty."; @@ -421,53 +425,47 @@ private DataSourceErrorMsgs() { public static final String SERVICE_NOT_FOUND = "Can not find service with specified name."; public static final String ENDPOINT_NOT_FOUND = "Service endpoint not found."; public static final String MISSING_DATASOURCE_INFO = "Datasource is missing, add a valid Datasource"; + private DataSourceErrorMsgs() { + } } public static class DataSourceQueryJSONKeys { - private DataSourceQueryJSONKeys() { - } - public static final String STATUS = "status"; public static final String DATA = "data"; public static final String RESULT = "result"; public static final String METRIC = "metric"; public static final String VALUE = "value"; public static final String VALUES = "values"; + private DataSourceQueryJSONKeys() { + } } public static class DataSourceQueryStatus { - private DataSourceQueryStatus() { - } - public static final String SUCCESS = "success"; public static final String ERROR = "error"; - } - private DataSourceConstants() { + private DataSourceQueryStatus() { + } } public static class DataSourceQueryMetricKeys { - private DataSourceQueryMetricKeys() { - } - public static final String NAMESPACE = "namespace"; public static final String WORKLOAD = "workload"; public static final String WORKLOAD_TYPE = "workload_type"; public static final String CONTAINER_NAME = "container"; public static final String CONTAINER_IMAGE_NAME = "image"; + private DataSourceQueryMetricKeys() { + } } public static class DataSourceMetadataInfoConstants { - private DataSourceMetadataInfoConstants() { - } - public static final String version = 
"v1.0"; public static final String CLUSTER_NAME = "default"; + private DataSourceMetadataInfoConstants() { + } } public static class DataSourceMetadataErrorMsgs { - private DataSourceMetadataErrorMsgs() { - } public static final String MISSING_DATASOURCE_METADATA_DATASOURCE_NAME = "DataSourceMetadata Datasource name cannot be empty"; public static final String MISSING_DATASOURCE_METADATA_WORKLOAD_MAP = "DataSourceMetadata Workload data cannot be empty or null"; public static final String MISSING_DATASOURCE_METADATA_CONTAINER_MAP = "DataSourceMetadata Container data cannot be empty or null"; @@ -499,11 +497,11 @@ private DataSourceMetadataErrorMsgs() { public static final String DATASOURCE_METADATA_VALIDATION_FAILURE_MSG = "Validation of imported metadata failed, mandatory fields missing: %s"; public static final String NAMESPACE_QUERY_VALIDATION_FAILED = "Validation failed for namespace data query."; public static final String DATASOURCE_OPERATOR_RETRIEVAL_FAILURE = "Failed to retrieve data source operator for provider: %s"; + private DataSourceMetadataErrorMsgs() { + } } public static class DataSourceMetadataInfoJSONKeys { - private DataSourceMetadataInfoJSONKeys() { - } public static final String DATASOURCES = "datasources"; public static final String DATASOURCE_NAME = "datasource_name"; public static final String CLUSTERS = "clusters"; @@ -516,10 +514,13 @@ private DataSourceMetadataInfoJSONKeys() { public static final String CONTAINERS = "containers"; public static final String CONTAINER_NAME = "container_name"; public static final String CONTAINER_IMAGE_NAME = "container_image_name"; + private DataSourceMetadataInfoJSONKeys() { + } } public static class DataSourceMetadataInfoSuccessMsgs { public static final String DATASOURCE_METADATA_DELETED = "Datasource metadata deleted successfully."; + private DataSourceMetadataInfoSuccessMsgs() { } } @@ -574,9 +575,10 @@ public static final class DateFormats { public static final String DB_EXTRACTION_FORMAT = "yyyy-MM-dd 
HH:mm:ss.SSS"; public static final long MILLI_SECONDS_FOR_DAY = 24 * 60 * 60 * 1000; public static final long MINUTES_FOR_DAY = 24 * 60; + public static SimpleDateFormat simpleDateFormatForUTC = new SimpleDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT, Locale.ROOT); private DateFormats() { - + simpleDateFormatForUTC.setTimeZone(TimeZone.getTimeZone(KruizeConstants.TimeUnitsExt.TimeZones.UTC)); } } @@ -627,6 +629,7 @@ public static final class KRUIZE_CONFIG_ENV_NAME { public static final String SETTINGS_HIBERNATE_SHOW_SQL = "hibernate_showsql"; public static final String SETTINGS_HIBERNATE_TIME_ZONE = "hibernate_timezone"; public static final String PLOTS = "plots"; + public static final String log_recommendation_metrics_level = "log_recommendation_metrics_level"; public static final String CLOUDWATCH_LOGS_ACCESS_KEY_ID = "logging_cloudwatch_accessKeyId"; public static final String CLOUDWATCH_LOGS_SECRET_ACCESS_KEY = "logging_cloudwatch_secretAccessKey"; public static final String CLOUDWATCH_LOGS_LOG_GROUP = "logging_cloudwatch_logGroup"; @@ -691,4 +694,11 @@ private RecommendationDurationRanges() { public static final class KRUIZE_CONFIG_DEFAULT_VALUE { public static final int DELETE_PARTITION_THRESHOLD_IN_DAYS = 16; } + + public static final class KRUIZE_RECOMMENDATION_METRICS { + public static final String METRIC_NAME = "KruizeRecommendationsNotification"; + public static final String TAG_NAME = "experiment_details"; + public static final String notification_format = "%s|%s|%s|%s|%s|%s|%s|%s|%s"; //experiment_name,container_name,endtime,level,termname,modelname,code,type,message + + } } diff --git a/src/main/java/com/autotune/utils/MetricsConfig.java b/src/main/java/com/autotune/utils/MetricsConfig.java index 55753e34b..002d1411a 100644 --- a/src/main/java/com/autotune/utils/MetricsConfig.java +++ b/src/main/java/com/autotune/utils/MetricsConfig.java @@ -1,5 +1,6 @@ package com.autotune.utils; +import io.micrometer.core.instrument.Counter; 
import io.micrometer.core.instrument.Timer; import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics; import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics; @@ -10,19 +11,21 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; public class MetricsConfig { - + public static Timer timerListRec, timerListExp, timerCreateExp, timerUpdateResults, timerUpdateRecomendations; public static Timer timerLoadRecExpName, timerLoadResultsExpName, timerLoadExpName, timerLoadRecExpNameDate, timerBoxPlots; public static Timer timerLoadAllRec, timerLoadAllExp, timerLoadAllResults; public static Timer timerAddRecDB, timerAddResultsDB, timerAddExpDB, timerAddBulkResultsDB; public static Timer timerAddPerfProfileDB, timerLoadPerfProfileName, timerLoadAllPerfProfiles; + public static Counter timerKruizeNotifications; public static Timer.Builder timerBListRec, timerBListExp, timerBCreateExp, timerBUpdateResults, timerBUpdateRecommendations; public static Timer.Builder timerBLoadRecExpName, timerBLoadResultsExpName, timerBLoadExpName, timerBLoadRecExpNameDate, timerBBoxPlots; public static Timer.Builder timerBLoadAllRec, timerBLoadAllExp, timerBLoadAllResults; public static Timer.Builder timerBAddRecDB, timerBAddResultsDB, timerBAddExpDB, timerBAddBulkResultsDB; public static Timer.Builder timerBAddPerfProfileDB, timerBLoadPerfProfileName, timerBLoadAllPerfProfiles; + public static Counter.Builder timerBKruizeNotifications; public static PrometheusMeterRegistry meterRegistry; - public static Timer timerListDS, timerImportDSMetadata,timerListDSMetadata; + public static Timer timerListDS, timerImportDSMetadata, timerListDSMetadata; public static Timer.Builder timerBListDS, timerBImportDSMetadata, timerBListDSMetadata; private static MetricsConfig INSTANCE; public String API_METRIC_DESC = "Time taken for Kruize APIs"; @@ -58,6 +61,7 @@ private MetricsConfig() { timerBListDS = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", 
"datasources").tag("method", "GET"); timerBImportDSMetadata = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "dsmetadata").tag("method", "POST"); timerBListDSMetadata = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "dsmetadata").tag("method", "GET"); + timerBKruizeNotifications = Counter.builder("KruizeNotifications").description("Kruize notifications").tag("api", "updateRecommendations"); new ClassLoaderMetrics().bindTo(meterRegistry); new ProcessorMetrics().bindTo(meterRegistry); new JvmGcMetrics().bindTo(meterRegistry); @@ -77,4 +81,4 @@ public static PrometheusMeterRegistry meterRegistry() { return meterRegistry; } -} \ No newline at end of file +}