diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory.py b/robusta_krr/core/integrations/prometheus/metrics/memory.py index 85dfba6b..961fe858 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/memory.py +++ b/robusta_krr/core/integrations/prometheus/metrics/memory.py @@ -107,3 +107,93 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: [{duration}:{step}] ) """ + +class JVMMemoryLoader(PrometheusMetric): + """ + A metric loader for loading JVM memory usage metrics. + This loader specifically looks for JVM heap memory usage. + """ + + query_type: QueryType = QueryType.QueryRange + + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + max( + jvm_memory_bytes_used{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}", + area="heap" + {cluster_label} + }} + ) by (container, pod, job) + """ + +class MaxJVMMemoryLoader(PrometheusMetric): + """ + A metric loader for loading max JVM memory usage metrics. + """ + + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + max_over_time( + max( + jvm_memory_bytes_used{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}", + area="heap" + {cluster_label} + }} + ) by (container, pod, job) + [{duration}:{step}] + ) + """ + +class JVMMemoryAmountLoader(PrometheusMetric): + """ + A metric loader for loading JVM memory points count. 
+ """ + + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + count_over_time( + max( + jvm_memory_bytes_used{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}", + area="heap" + {cluster_label} + }} + ) by (container, pod, job) + [{duration}:{step}] + ) + """ + +class JVMDetector(PrometheusMetric): + """ + A metric loader for detecting if a container is running a JVM application. + """ + + warning_on_no_data = False + + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + max( + jvm_memory_bytes_used{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }} + ) by (container, pod, job) + """ diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index fa9cd777..3fd90d7f 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -21,6 +21,10 @@ PercentileCPULoader, PrometheusMetric, MaxOOMKilledMemoryLoader, + JVMMemoryLoader, + MaxJVMMemoryLoader, + JVMMemoryAmountLoader, + JVMDetector, ) @@ -29,6 +33,9 @@ class SimpleStrategySettings(StrategySettings): memory_buffer_percentage: float = pd.Field( 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." ) + jvm_memory_buffer_percentage: float = pd.Field( + 30, gt=0, description="The percentage of added buffer to the peak JVM heap memory usage for memory recommendation." + ) points_required: int = pd.Field( 100, ge=1, description="The number of data points required to make a recommendation for a resource." 
) @@ -44,13 +51,14 @@ class SimpleStrategySettings(StrategySettings): 25, ge=0, description="What percentage to increase the memory when there are OOMKill events." ) - def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float: + def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0, is_jvm: bool = False) -> float: data_ = [np.max(values[:, 1]) for values in data.values()] if len(data_) == 0: return float("NaN") + buffer_percentage = self.jvm_memory_buffer_percentage if is_jvm else self.memory_buffer_percentage return max( - np.max(data_) * (1 + self.memory_buffer_percentage / 100), + np.max(data_) * (1 + buffer_percentage / 100), max_oomkill * (1 + self.oom_memory_buffer_percentage / 100), ) @@ -82,6 +90,9 @@ def metrics(self) -> list[type[PrometheusMetric]]: MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader, + JVMDetector, + MaxJVMMemoryLoader, + JVMMemoryAmountLoader, ] if self.settings.use_oomkill_data: @@ -140,15 +151,18 @@ def __calculate_cpu_proposal( def __calculate_memory_proposal( self, history_data: MetricsPodData, object_data: K8sObjectData ) -> ResourceRecommendation: - data = history_data["MaxMemoryLoader"] + # Check if this is a JVM application + jvm_data = history_data["JVMDetector"] + is_jvm = len(jvm_data) > 0 + + # Use appropriate memory loader based on whether it's a JVM application + data = history_data["MaxJVMMemoryLoader"] if is_jvm else history_data["MaxMemoryLoader"] + data_count = history_data["JVMMemoryAmountLoader"] if is_jvm else history_data["MemoryAmountLoader"] oomkill_detected = False if self.settings.use_oomkill_data: max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"] - # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] - # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value - # So each value is numpy array of shape (N, 2) max_oomkill_value = ( np.max([values[0, 1] for values in 
max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0 ) @@ -160,10 +174,7 @@ def __calculate_memory_proposal( if len(data) == 0: return ResourceRecommendation.undefined(info="No data") - # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] - # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value - # So each pod is string with pod name, and values is numpy array of shape (N, 2) - data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} + data_count = {pod: values[0, 1] for pod, values in data_count.items()} total_points_count = sum(data_count.values()) if total_points_count < self.settings.points_required: @@ -176,9 +187,17 @@ def __calculate_memory_proposal( ): return ResourceRecommendation.undefined(info="HPA detected") - memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value) + memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value, is_jvm) + info = [] + if oomkill_detected: + info.append("OOMKill detected") + if is_jvm: + info.append("JVM application detected") + return ResourceRecommendation( - request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None + request=memory_usage, + limit=memory_usage, + info=", ".join(info) if info else None ) def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: diff --git a/tests/test_jvm_metrics.py b/tests/test_jvm_metrics.py new file mode 100644 index 00000000..cf3f06a1 --- /dev/null +++ b/tests/test_jvm_metrics.py @@ -0,0 +1,160 @@ +import pytest +from datetime import datetime, timedelta +import numpy as np + +from robusta_krr.core.integrations.prometheus.metrics.memory import ( + JVMMemoryLoader, + MaxJVMMemoryLoader, + JVMMemoryAmountLoader, + JVMDetector, +) +from robusta_krr.core.models.objects import K8sObjectData, PodData + + +@pytest.fixture +def mock_pod_data(): + return 
K8sObjectData( + name="test-app", + namespace="default", + kind="Deployment", + container="app", + pods=[ + PodData(name="test-app-pod-1", namespace="default"), + PodData(name="test-app-pod-2", namespace="default"), + ], + ) + + +@pytest.fixture +def mock_prometheus_response(): + return { + "status": "success", + "data": { + "resultType": "matrix", + "result": [ + { + "metric": { + "container": "app", + "pod": "test-app-pod-1", + "job": "kubernetes-pods", + }, + "values": [ + [1625097600, "1000000"], # 1MB + [1625097900, "2000000"], # 2MB + [1625098200, "1500000"], # 1.5MB + ], + }, + { + "metric": { + "container": "app", + "pod": "test-app-pod-2", + "job": "kubernetes-pods", + }, + "values": [ + [1625097600, "1200000"], # 1.2MB + [1625097900, "1800000"], # 1.8MB + [1625098200, "1600000"], # 1.6MB + ], + }, + ], + }, + } + + +def test_jvm_memory_loader_query(mock_pod_data): + loader = JVMMemoryLoader() + query = loader.get_query(mock_pod_data, "1h", "1m") + + assert "jvm_memory_bytes_used" in query + assert "area=\"heap\"" in query + assert "test-app-pod-1|test-app-pod-2" in query + assert "namespace=\"default\"" in query + assert "container=\"app\"" in query + + +def test_max_jvm_memory_loader_query(mock_pod_data): + loader = MaxJVMMemoryLoader() + query = loader.get_query(mock_pod_data, "1h", "1m") + + assert "jvm_memory_bytes_used" in query + assert "area=\"heap\"" in query + assert "max_over_time" in query + assert "test-app-pod-1|test-app-pod-2" in query + + +def test_jvm_memory_amount_loader_query(mock_pod_data): + loader = JVMMemoryAmountLoader() + query = loader.get_query(mock_pod_data, "1h", "1m") + + assert "jvm_memory_bytes_used" in query + assert "area=\"heap\"" in query + assert "count_over_time" in query + assert "test-app-pod-1|test-app-pod-2" in query + + +def test_jvm_detector_query(mock_pod_data): + loader = JVMDetector() + query = loader.get_query(mock_pod_data, "1h", "1m") + + assert "jvm_memory_bytes_used" in query + assert 
"test-app-pod-1|test-app-pod-2" in query + + +def test_jvm_memory_loader_parse_response(mock_prometheus_response): + loader = JVMMemoryLoader() + result = loader.parse_response(mock_prometheus_response) + + assert len(result) == 2 + assert "test-app-pod-1" in result + assert "test-app-pod-2" in result + + # Check if values are properly converted to numpy arrays + pod1_values = result["test-app-pod-1"] + assert isinstance(pod1_values, np.ndarray) + assert pod1_values.shape == (3, 2) # 3 timestamps, 2 values each + assert np.max(pod1_values[:, 1]) == 2000000 # Max value should be 2MB + + +def test_max_jvm_memory_loader_parse_response(mock_prometheus_response): + loader = MaxJVMMemoryLoader() + result = loader.parse_response(mock_prometheus_response) + + assert len(result) == 2 + assert "test-app-pod-1" in result + assert "test-app-pod-2" in result + + # Check if values are properly converted to numpy arrays + pod1_values = result["test-app-pod-1"] + assert isinstance(pod1_values, np.ndarray) + assert pod1_values.shape == (3, 2) + assert np.max(pod1_values[:, 1]) == 2000000 # Max value should be 2MB + + +def test_jvm_memory_amount_loader_parse_response(mock_prometheus_response): + loader = JVMMemoryAmountLoader() + result = loader.parse_response(mock_prometheus_response) + + assert len(result) == 2 + assert "test-app-pod-1" in result + assert "test-app-pod-2" in result + + # Check if values are properly converted to numpy arrays + pod1_values = result["test-app-pod-1"] + assert isinstance(pod1_values, np.ndarray) + assert pod1_values.shape == (3, 2) + assert np.sum(pod1_values[:, 1]) == 4500000 # Mock reuses raw byte values (1M + 2M + 1.5M), not point counts + + +def test_jvm_detector_parse_response(mock_prometheus_response): + loader = JVMDetector() + result = loader.parse_response(mock_prometheus_response) + + assert len(result) == 2 + assert "test-app-pod-1" in result + assert "test-app-pod-2" in result + + # Check if values are properly converted to numpy arrays + pod1_values = result["test-app-pod-1"] + 
assert isinstance(pod1_values, np.ndarray) + assert pod1_values.shape == (3, 2) + assert np.max(pod1_values[:, 1]) == 2000000 # Max value should be 2MB \ No newline at end of file diff --git a/tests/test_jvm_strategy.py b/tests/test_jvm_strategy.py new file mode 100644 index 00000000..5cb341d1 --- /dev/null +++ b/tests/test_jvm_strategy.py @@ -0,0 +1,167 @@ +import pytest +import numpy as np +from datetime import datetime, timedelta + +from robusta_krr.core.abstract.strategies import MetricsPodData, K8sObjectData, PodData +from robusta_krr.core.integrations.prometheus.metrics.memory import ( + JVMMemoryLoader, + MaxJVMMemoryLoader, + JVMMemoryAmountLoader, + JVMDetector, +) +from robusta_krr.strategies.simple import SimpleStrategy, SimpleStrategySettings + + +@pytest.fixture +def mock_pod_data(): + return K8sObjectData( + name="test-app", + namespace="default", + kind="Deployment", + container="app", + pods=[ + PodData(name="test-app-pod-1", namespace="default"), + PodData(name="test-app-pod-2", namespace="default"), + ], + ) + + +@pytest.fixture +def mock_jvm_metrics_data(): + return { + "JVMDetector": { + "test-app-pod-1": np.array([ + [1625097600, 1000000], # 1MB + [1625097900, 2000000], # 2MB + [1625098200, 1500000], # 1.5MB + ]), + "test-app-pod-2": np.array([ + [1625097600, 1200000], # 1.2MB + [1625097900, 1800000], # 1.8MB + [1625098200, 1600000], # 1.6MB + ]), + }, + "MaxJVMMemoryLoader": { + "test-app-pod-1": np.array([ + [1625097600, 1000000], # 1MB + [1625097900, 2000000], # 2MB + [1625098200, 1500000], # 1.5MB + ]), + "test-app-pod-2": np.array([ + [1625097600, 1200000], # 1.2MB + [1625097900, 1800000], # 1.8MB + [1625098200, 1600000], # 1.6MB + ]), + }, + "JVMMemoryAmountLoader": { + "test-app-pod-1": np.array([[1625098200, 3]]), # 3 data points + "test-app-pod-2": np.array([[1625098200, 3]]), # 3 data points + }, + } + + +@pytest.fixture +def mock_non_jvm_metrics_data(): + return { + "JVMDetector": {}, # Empty JVM metrics + "MaxMemoryLoader": { + 
"test-app-pod-1": np.array([ + [1625097600, 1000000], # 1MB + [1625097900, 2000000], # 2MB + [1625098200, 1500000], # 1.5MB + ]), + "test-app-pod-2": np.array([ + [1625097600, 1200000], # 1.2MB + [1625097900, 1800000], # 1.8MB + [1625098200, 1600000], # 1.6MB + ]), + }, + "MemoryAmountLoader": { + "test-app-pod-1": np.array([[1625098200, 3]]), # 3 data points + "test-app-pod-2": np.array([[1625098200, 3]]), # 3 data points + }, + } + + +def test_jvm_detection(mock_pod_data, mock_jvm_metrics_data): + strategy = SimpleStrategy(SimpleStrategySettings()) + result = strategy.run(mock_jvm_metrics_data, mock_pod_data) + + # Check if JVM application is detected + assert result["Memory"].info == "JVM application detected" + + # Check if memory recommendation uses JVM buffer percentage + max_memory = 2000000 # 2MB (max from mock data) + expected_memory = max_memory * (1 + strategy.settings.jvm_memory_buffer_percentage / 100) + assert result["Memory"].request == expected_memory + + +def test_non_jvm_detection(mock_pod_data, mock_non_jvm_metrics_data): + strategy = SimpleStrategy(SimpleStrategySettings()) + result = strategy.run(mock_non_jvm_metrics_data, mock_pod_data) + + # Check if non-JVM application is detected + assert result["Memory"].info is None + + # Check if memory recommendation uses regular buffer percentage + max_memory = 2000000 # 2MB (max from mock data) + expected_memory = max_memory * (1 + strategy.settings.memory_buffer_percentage / 100) + assert result["Memory"].request == expected_memory + + +def test_jvm_with_oomkill(mock_pod_data, mock_jvm_metrics_data): + # Add OOMKill data + mock_jvm_metrics_data["MaxOOMKilledMemoryLoader"] = { + "test-app-pod-1": np.array([[1625098200, 2500000]]), # 2.5MB + } + + strategy = SimpleStrategy(SimpleStrategySettings(use_oomkill_data=True)) + result = strategy.run(mock_jvm_metrics_data, mock_pod_data) + + # Check if both JVM and OOMKill are detected + assert "JVM application detected" in result["Memory"].info + assert 
"OOMKill detected" in result["Memory"].info + + # Check if memory recommendation uses OOMKill value with buffer + oomkill_memory = 2500000 # 2.5MB (from OOMKill data) + expected_memory = oomkill_memory * (1 + strategy.settings.oom_memory_buffer_percentage / 100) + assert result["Memory"].request == expected_memory + + +def test_jvm_with_hpa(mock_pod_data, mock_jvm_metrics_data): + # Add HPA data + mock_pod_data.hpa = type("HPA", (), { + "target_memory_utilization_percentage": 80 + }) + + strategy = SimpleStrategy(SimpleStrategySettings()) + result = strategy.run(mock_jvm_metrics_data, mock_pod_data) + + # Check if HPA is detected + assert result["Memory"].info == "HPA detected" + assert result["Memory"].request is None + + +def test_jvm_with_hpa_override(mock_pod_data, mock_jvm_metrics_data): + # Add HPA data + mock_pod_data.hpa = type("HPA", (), { + "target_memory_utilization_percentage": 80 + }) + + strategy = SimpleStrategy(SimpleStrategySettings(allow_hpa=True)) + result = strategy.run(mock_jvm_metrics_data, mock_pod_data) + + # Check if JVM is detected and HPA is ignored + assert result["Memory"].info == "JVM application detected" + assert result["Memory"].request is not None + + +def test_jvm_custom_buffer_percentage(mock_pod_data, mock_jvm_metrics_data): + custom_buffer = 40 + strategy = SimpleStrategy(SimpleStrategySettings(jvm_memory_buffer_percentage=custom_buffer)) + result = strategy.run(mock_jvm_metrics_data, mock_pod_data) + + # Check if memory recommendation uses custom JVM buffer percentage + max_memory = 2000000 # 2MB (max from mock data) + expected_memory = max_memory * (1 + custom_buffer / 100) + assert result["Memory"].request == expected_memory \ No newline at end of file