diff --git a/.idea/workspace.xml b/.idea/workspace.xml
deleted file mode 100644
index d6f915dd..00000000
--- a/.idea/workspace.xml
+++ /dev/null
@@ -1,49 +0,0 @@
[49 deleted lines of IDE workspace XML omitted]
diff --git a/robusta_krr/core/abstract/strategies.py b/robusta_krr/core/abstract/strategies.py
index ab5ab6c4..388595fe 100644
--- a/robusta_krr/core/abstract/strategies.py
+++ b/robusta_krr/core/abstract/strategies.py
@@ -28,7 +28,7 @@ class ResourceRecommendation(pd.BaseModel):
     request: Optional[float]
     limit: Optional[float]
     info: Optional[str] = pd.Field(
-        None, description="Additional information about the recommendation. Currently used to explain undefined."
+        None, description="Additional information about the recommendation."
     )

     @classmethod
diff --git a/robusta_krr/core/integrations/prometheus/metrics/__init__.py b/robusta_krr/core/integrations/prometheus/metrics/__init__.py
index e9a0568b..aea497f6 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/__init__.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/__init__.py
@@ -1,3 +1,3 @@
 from .base import PrometheusMetric
 from .cpu import CPUAmountLoader, CPULoader, PercentileCPULoader
-from .memory import MaxMemoryLoader, MemoryAmountLoader, MemoryLoader
+from .memory import MaxMemoryLoader, MemoryAmountLoader, MemoryLoader, MaxOOMKilledMemoryLoader
diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py
index 34957ef6..807a11fd 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/base.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/base.py
@@ -59,6 +59,7 @@ class PrometheusMetric(BaseMetric):
     query_type: QueryType = QueryType.Query
     filtering: bool = True
     pods_batch_size: Optional[int] = 50
+    warning_on_no_data: bool = True

     def __init__(
         self,
@@ -127,7 +128,10 @@ def _query_prometheus_sync(self, data: PrometheusMetricData) -> list[PrometheusS
             return response["result"]
         else:
             # regular query, lighter on preformance
-            response = self.prometheus.safe_custom_query(query=data.query)
+            try:
+                response = self.prometheus.safe_custom_query(query=data.query)
+            except Exception as e:
+                raise ValueError(f"Failed to run query: {data.query}") from e
             results = response["result"]
             # format the results to return the same format as custom_query_range
             for result in results:
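The base.py hunk does two things: it adds a `warning_on_no_data` class flag so individual loaders can opt out of the "returned no metrics" warning (used by the new OOMKill loader below), and it wraps the plain-query path so a failing query is re-raised with the query text attached. A minimal sketch of that re-raise pattern, with stand-in `run_query`/`backend_call` names that are not part of the codebase:

```python
def backend_call(query: str) -> dict:
    # Stand-in for prometheus.safe_custom_query; always fails for the demo.
    raise ConnectionError("connection refused")


def run_query(query: str) -> dict:
    try:
        return backend_call(query)
    except Exception as e:
        # Same pattern as _query_prometheus_sync: the new error carries the
        # offending query text, and the original error stays chained as __cause__.
        raise ValueError(f"Failed to run query: {query}") from e


try:
    run_query('max_over_time(container_memory_working_set_bytes[1h])')
except ValueError as err:
    print(err)            # Failed to run query: max_over_time(...)
    print(err.__cause__)  # connection refused
```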
+ """ + + warning_on_no_data = False + + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + max_over_time( + max( + max( + kube_pod_container_resource_limits{{ + resource="memory", + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }} + ) by (pod, container, job) + * on(pod, container, job) group_left(reason) + max( + kube_pod_container_status_last_terminated_reason{{ + reason="OOMKilled", + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }} + ) by (pod, container, job, reason) + ) by (container, pod, job) + [{duration}:{step}] + ) + """ diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py index 98951699..f11a32d6 100644 --- a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py +++ b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py @@ -206,9 +206,10 @@ async def gather_data( elif "Memory" in LoaderClass.__name__: object.add_warning("NoPrometheusMemoryMetrics") - logger.warning( - f"{metric_loader.service_name} returned no {metric_loader.__class__.__name__} metrics for {object}" - ) + if LoaderClass.warning_on_no_data: + logger.warning( + f"{metric_loader.service_name} returned no {metric_loader.__class__.__name__} metrics for {object}" + ) return data diff --git a/robusta_krr/formatters/table.py b/robusta_krr/formatters/table.py index 3d20b8e4..b028a058 100644 --- a/robusta_krr/formatters/table.py +++ b/robusta_krr/formatters/table.py @@ -33,6 +33,12 @@ def __calc_diff(allocated, recommended, selector, multiplier=1) -> str: return f"{diff_sign}{_format(abs(diff_val) * multiplier)}" +DEFAULT_INFO_COLOR = "grey27" +INFO_COLORS: dict[str, str] = { + "OOMKill detected": "dark_red", +} + + def _format_request_str(item: ResourceScan, resource: ResourceType, selector: str) -> str: allocated = getattr(item.object.allocations, selector)[resource] info = item.recommended.info.get(resource) @@ -46,6 +52,12 @@ def _format_request_str(item: ResourceScan, resource: ResourceType, selector: st if diff != "": diff = f"({diff}) " + if info is None: + info_formatted = "" + else: + color = INFO_COLORS.get(info, DEFAULT_INFO_COLOR) + info_formatted = f"\n[{color}]({info})[/{color}]" + return ( diff + f"[{severity.color}]" @@ -53,7 +65,7 @@ def _format_request_str(item: ResourceScan, resource: ResourceType, selector: st + " -> " + _format(recommended.value) + f"[/{severity.color}]" - + (f" [grey27]({info})[/grey27]" if info else "") + + info_formatted ) diff --git a/robusta_krr/strategies/__init__.py b/robusta_krr/strategies/__init__.py index 05e029bb..3409ebd8 100644 --- a/robusta_krr/strategies/__init__.py +++ b/robusta_krr/strategies/__init__.py @@ -1 +1 @@ -from .simple import SimpleStrategy +from .simple import SimpleStrategy \ No newline at end of file diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index db15f6c9..498401fd 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -19,8 +19,10 @@ MemoryAmountLoader, PercentileCPULoader, PrometheusMetric, + MaxOOMKilledMemoryLoader, ) + class SimpleStrategySettings(StrategySettings): cpu_percentile: float = 
diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py
index 98951699..f11a32d6 100644
--- a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py
+++ b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py
@@ -206,9 +206,10 @@ async def gather_data(
             elif "Memory" in LoaderClass.__name__:
                 object.add_warning("NoPrometheusMemoryMetrics")

-            logger.warning(
-                f"{metric_loader.service_name} returned no {metric_loader.__class__.__name__} metrics for {object}"
-            )
+            if LoaderClass.warning_on_no_data:
+                logger.warning(
+                    f"{metric_loader.service_name} returned no {metric_loader.__class__.__name__} metrics for {object}"
+                )

     return data
diff --git a/robusta_krr/formatters/table.py b/robusta_krr/formatters/table.py
index 3d20b8e4..b028a058 100644
--- a/robusta_krr/formatters/table.py
+++ b/robusta_krr/formatters/table.py
@@ -33,6 +33,12 @@ def __calc_diff(allocated, recommended, selector, multiplier=1) -> str:
     return f"{diff_sign}{_format(abs(diff_val) * multiplier)}"


+DEFAULT_INFO_COLOR = "grey27"
+INFO_COLORS: dict[str, str] = {
+    "OOMKill detected": "dark_red",
+}
+
+
 def _format_request_str(item: ResourceScan, resource: ResourceType, selector: str) -> str:
     allocated = getattr(item.object.allocations, selector)[resource]
     info = item.recommended.info.get(resource)
@@ -46,6 +52,12 @@ def _format_request_str(item: ResourceScan, resource: ResourceType, selector: st
     if diff != "":
         diff = f"({diff}) "

+    if info is None:
+        info_formatted = ""
+    else:
+        color = INFO_COLORS.get(info, DEFAULT_INFO_COLOR)
+        info_formatted = f"\n[{color}]({info})[/{color}]"
+
     return (
         diff
         + f"[{severity.color}]"
@@ -53,7 +65,7 @@ def _format_request_str(item: ResourceScan, resource: ResourceType, selector: st
         + " -> "
         + _format(recommended.value)
         + f"[/{severity.color}]"
-        + (f" [grey27]({info})[/grey27]" if info else "")
+        + info_formatted
     )
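The formatter change in isolation: any `info` string still renders, but known strings get a dedicated color and the note moves onto its own line in the table cell. A quick standalone sketch of the lookup, reusing the constants introduced above:

```python
DEFAULT_INFO_COLOR = "grey27"
INFO_COLORS = {"OOMKill detected": "dark_red"}


def format_info(info):
    # Mirrors the new branch in _format_request_str (rich markup tags).
    if info is None:
        return ""
    color = INFO_COLORS.get(info, DEFAULT_INFO_COLOR)
    return f"\n[{color}]({info})[/{color}]"


print(repr(format_info("OOMKill detected")))  # '\n[dark_red](OOMKill detected)[/dark_red]'
print(repr(format_info("Not enough data")))   # '\n[grey27](Not enough data)[/grey27]'
print(repr(format_info(None)))                # ''
```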
diff --git a/robusta_krr/strategies/__init__.py b/robusta_krr/strategies/__init__.py
index 05e029bb..3409ebd8 100644
--- a/robusta_krr/strategies/__init__.py
+++ b/robusta_krr/strategies/__init__.py
@@ -1 +1 @@
-from .simple import SimpleStrategy
+from .simple import SimpleStrategy
\ No newline at end of file
diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py
index db15f6c9..498401fd 100644
--- a/robusta_krr/strategies/simple.py
+++ b/robusta_krr/strategies/simple.py
@@ -19,8 +19,10 @@
     MemoryAmountLoader,
     PercentileCPULoader,
     PrometheusMetric,
+    MaxOOMKilledMemoryLoader,
 )

+
 class SimpleStrategySettings(StrategySettings):
     cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.")
     memory_buffer_percentage: float = pd.Field(
@@ -29,14 +31,27 @@ class SimpleStrategySettings(StrategySettings):
     points_required: int = pd.Field(
         100, ge=1, description="The number of data points required to make a recommendation for a resource."
     )
-    allow_hpa: bool = pd.Field(False, description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.")
+    allow_hpa: bool = pd.Field(
+        False,
+        description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.",
+    )
+    use_oomkill_data: bool = pd.Field(
+        False,
+        description="Whether to bump the memory when OOMKills are detected (experimental).",
+    )
+    oom_memory_buffer_percentage: float = pd.Field(
+        25, gt=0, description="What percentage to increase the memory when there are OOMKill events."
+    )

-    def calculate_memory_proposal(self, data: PodsTimeData) -> float:
+    def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float:
         data_ = [np.max(values[:, 1]) for values in data.values()]
         if len(data_) == 0:
             return float("NaN")

-        return np.max(data_) * (1 + self.memory_buffer_percentage / 100)
+        return max(
+            np.max(data_) * (1 + self.memory_buffer_percentage / 100),
+            max_oomkill * (1 + self.oom_memory_buffer_percentage / 100),
+        )

     def calculate_cpu_proposal(self, data: PodsTimeData) -> float:
         if len(data) == 0:
@@ -75,7 +90,17 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):

     @property
     def metrics(self) -> list[type[PrometheusMetric]]:
-        return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader]
+        metrics = [
+            PercentileCPULoader(self.settings.cpu_percentile),
+            MaxMemoryLoader,
+            CPUAmountLoader,
+            MemoryAmountLoader,
+        ]
+
+        if self.settings.use_oomkill_data:
+            metrics.append(MaxOOMKilledMemoryLoader)
+
+        return metrics

     def __calculate_cpu_proposal(
         self, history_data: MetricsPodData, object_data: K8sWorkload
@@ -85,13 +110,20 @@ def __calculate_cpu_proposal(
         if len(data) == 0:
             return ResourceRecommendation.undefined(info="No data")

+        # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
+        # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
+        # So each pod is string with pod name, and values is numpy array of shape (N, 2)
         data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()}
         total_points_count = sum(data_count.values())

         if total_points_count < self.settings.points_required:
             return ResourceRecommendation.undefined(info="Not enough data")

-        if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None and not self.settings.allow_hpa:
+        if (
+            object_data.hpa is not None
+            and object_data.hpa.target_cpu_utilization_percentage is not None
+            and not self.settings.allow_hpa
+        ):
             return ResourceRecommendation.undefined(info="HPA detected")

         cpu_usage = self.settings.calculate_cpu_proposal(data)
@@ -102,20 +134,44 @@ def __calculate_memory_proposal(
     ) -> ResourceRecommendation:
         data = history_data["MaxMemoryLoader"]

+        oomkill_detected = False
+
+        if self.settings.use_oomkill_data:
+            max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"]
+            # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
+            # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value
+            # So each value is numpy array of shape (N, 2)
+            max_oomkill_value = (
+                np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0
+            )
+            if max_oomkill_value != 0:
+                oomkill_detected = True
+        else:
+            max_oomkill_value = 0
+
         if len(data) == 0:
             return ResourceRecommendation.undefined(info="No data")

+        # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
+        # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
+        # So each pod is string with pod name, and values is numpy array of shape (N, 2)
         data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()}
         total_points_count = sum(data_count.values())

         if total_points_count < self.settings.points_required:
             return ResourceRecommendation.undefined(info="Not enough data")

-        if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None and not self.settings.allow_hpa:
+        if (
+            object_data.hpa is not None
+            and object_data.hpa.target_memory_utilization_percentage is not None
+            and not self.settings.allow_hpa
+        ):
             return ResourceRecommendation.undefined(info="HPA detected")

-        memory_usage = self.settings.calculate_memory_proposal(data)
-        return ResourceRecommendation(request=memory_usage, limit=memory_usage)
+        memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value)
+        return ResourceRecommendation(
+            request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None
+        )

     def run(self, history_data: MetricsPodData, object_data: K8sWorkload) -> RunResult:
         return {
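To make the new sizing rule concrete: assuming a memory_buffer_percentage of 15 (the default is not shown in this diff) and the 25% oom_memory_buffer_percentage added here, a container that peaked at 100 MiB of usage but was OOMKilled at a 128 MiB limit gets recommended 160 MiB (128 * 1.25) instead of 115 MiB. A standalone check of that arithmetic, mirroring calculate_memory_proposal with toy numpy data (not wired into KRR):

```python
import numpy as np

MEMORY_BUFFER_PCT = 15      # assumed value for memory_buffer_percentage
OOM_MEMORY_BUFFER_PCT = 25  # default oom_memory_buffer_percentage from this PR


def calculate_memory_proposal(data: dict, max_oomkill: float = 0) -> float:
    """Illustrative copy of SimpleStrategySettings.calculate_memory_proposal."""
    data_ = [np.max(values[:, 1]) for values in data.values()]
    if len(data_) == 0:
        return float("NaN")
    return max(
        np.max(data_) * (1 + MEMORY_BUFFER_PCT / 100),
        max_oomkill * (1 + OOM_MEMORY_BUFFER_PCT / 100),
    )


MiB = 1024**2
# One pod whose usage peaked at 100 MiB; columns are [timestamp, value].
usage = {"app-1": np.array([[1700000000, 80 * MiB], [1700000060, 100 * MiB]])}

print(calculate_memory_proposal(usage) / MiB)             # 115.0 — usage-based buffer only
print(calculate_memory_proposal(usage, 128 * MiB) / MiB)  # 160.0 — OOMKilled 128 MiB limit wins
```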