Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions robusta_krr/core/integrations/prometheus/metrics/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,93 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""

class JVMMemoryLoader(PrometheusMetric):
    """
    Loads JVM heap usage (``jvm_memory_bytes_used{area="heap"}``) for every pod
    of the scanned workload as a range of data points, one series per pod.
    """

    query_type: QueryType = QueryType.QueryRange

    def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
        # A single regex alternation matches all of the workload's pods at once.
        pod_regex = "|".join([p.name for p in object.pods])
        cluster_selector = self.get_prometheus_cluster_label()
        query = f"""
            max(
                jvm_memory_bytes_used{{
                    namespace="{object.namespace}",
                    pod=~"{pod_regex}",
                    container="{object.container}",
                    area="heap"
                    {cluster_selector}
                }}
            ) by (container, pod, job)
        """
        return query

class MaxJVMMemoryLoader(PrometheusMetric):
    """
    Loads the peak JVM heap usage over the observation window: a
    ``max_over_time`` subquery evaluated across ``duration`` at ``step``
    resolution, on top of the per-pod heap maximum.
    """

    def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
        cluster_selector = self.get_prometheus_cluster_label()
        # A single regex alternation matches all of the workload's pods at once.
        pod_regex = "|".join([p.name for p in object.pods])
        return f"""
            max_over_time(
                max(
                    jvm_memory_bytes_used{{
                        namespace="{object.namespace}",
                        pod=~"{pod_regex}",
                        container="{object.container}",
                        area="heap"
                        {cluster_selector}
                    }}
                ) by (container, pod, job)
                [{duration}:{step}]
            )
        """

class JVMMemoryAmountLoader(PrometheusMetric):
    """
    Counts how many JVM heap data points exist in the observation window
    (``count_over_time`` subquery), so the strategy can check it has enough
    history to make a recommendation.
    """

    def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
        cluster_selector = self.get_prometheus_cluster_label()
        # A single regex alternation matches all of the workload's pods at once.
        pod_regex = "|".join([p.name for p in object.pods])
        return f"""
            count_over_time(
                max(
                    jvm_memory_bytes_used{{
                        namespace="{object.namespace}",
                        pod=~"{pod_regex}",
                        container="{object.container}",
                        area="heap"
                        {cluster_selector}
                    }}
                ) by (container, pod, job)
                [{duration}:{step}]
            )
        """

class JVMDetector(PrometheusMetric):
    """
    Probes for the presence of ``jvm_memory_bytes_used`` (any memory area) to
    decide whether a container runs on a JVM. Non-JVM workloads legitimately
    return no data, so no warning is emitted on an empty result.
    """

    # Empty results are the expected outcome for non-JVM containers.
    warning_on_no_data = False

    def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
        cluster_selector = self.get_prometheus_cluster_label()
        # A single regex alternation matches all of the workload's pods at once.
        pod_regex = "|".join([p.name for p in object.pods])
        return f"""
            max(
                jvm_memory_bytes_used{{
                    namespace="{object.namespace}",
                    pod=~"{pod_regex}",
                    container="{object.container}"
                    {cluster_selector}
                }}
            ) by (container, pod, job)
        """
43 changes: 31 additions & 12 deletions robusta_krr/strategies/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
PercentileCPULoader,
PrometheusMetric,
MaxOOMKilledMemoryLoader,
JVMMemoryLoader,
MaxJVMMemoryLoader,
JVMMemoryAmountLoader,
JVMDetector,
Comment on lines +24 to +27
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Unused import triggers Ruff F401 – drop or actually use JVMMemoryLoader.

JVMMemoryLoader is imported but never referenced. Ruff flags this as F401, which will fail lint-only CI steps.
Either add the loader to the metrics list or remove the import:

-    JVMMemoryLoader,
     MaxJVMMemoryLoader,

If a plain (non-max) JVM timeseries is not needed here, simply deleting the import keeps the strategy lean and CI green.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
JVMMemoryLoader,
MaxJVMMemoryLoader,
JVMMemoryAmountLoader,
JVMDetector,
MaxJVMMemoryLoader,
JVMMemoryAmountLoader,
JVMDetector,
🧰 Tools
🪛 Ruff (0.11.9)

24-24: robusta_krr.core.integrations.prometheus.metrics.JVMMemoryLoader imported but unused

Remove unused import: robusta_krr.core.integrations.prometheus.metrics.JVMMemoryLoader

(F401)

🤖 Prompt for AI Agents
In robusta_krr/strategies/simple.py around lines 24 to 27, the import
JVMMemoryLoader is unused and triggers a Ruff F401 lint error. To fix this,
either remove the JVMMemoryLoader import if it is not needed or add
JVMMemoryLoader to the metrics list or wherever appropriate in the code to
ensure it is used. This will resolve the lint error and keep the code clean.

)


Expand All @@ -29,6 +33,9 @@ class SimpleStrategySettings(StrategySettings):
memory_buffer_percentage: float = pd.Field(
15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation."
)
jvm_memory_buffer_percentage: float = pd.Field(
30, gt=0, description="The percentage of added buffer to the peak JVM heap memory usage for memory recommendation."
)
points_required: int = pd.Field(
100, ge=1, description="The number of data points required to make a recommendation for a resource."
)
Expand All @@ -44,13 +51,14 @@ class SimpleStrategySettings(StrategySettings):
25, ge=0, description="What percentage to increase the memory when there are OOMKill events."
)

def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float:
def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0, is_jvm: bool = False) -> float:
data_ = [np.max(values[:, 1]) for values in data.values()]
if len(data_) == 0:
return float("NaN")

buffer_percentage = self.jvm_memory_buffer_percentage if is_jvm else self.memory_buffer_percentage
return max(
np.max(data_) * (1 + self.memory_buffer_percentage / 100),
np.max(data_) * (1 + buffer_percentage / 100),
max_oomkill * (1 + self.oom_memory_buffer_percentage / 100),
)

Expand Down Expand Up @@ -82,6 +90,9 @@ def metrics(self) -> list[type[PrometheusMetric]]:
MaxMemoryLoader,
CPUAmountLoader,
MemoryAmountLoader,
JVMDetector,
MaxJVMMemoryLoader,
JVMMemoryAmountLoader,
]

if self.settings.use_oomkill_data:
Expand Down Expand Up @@ -140,15 +151,18 @@ def __calculate_cpu_proposal(
def __calculate_memory_proposal(
self, history_data: MetricsPodData, object_data: K8sObjectData
) -> ResourceRecommendation:
data = history_data["MaxMemoryLoader"]
# Check if this is a JVM application
jvm_data = history_data["JVMDetector"]
is_jvm = len(jvm_data) > 0

# Use appropriate memory loader based on whether it's a JVM application
data = history_data["MaxJVMMemoryLoader"] if is_jvm else history_data["MaxMemoryLoader"]
data_count = history_data["JVMMemoryAmountLoader"] if is_jvm else history_data["MemoryAmountLoader"]

oomkill_detected = False

if self.settings.use_oomkill_data:
max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"]
# NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
# As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value
# So each value is numpy array of shape (N, 2)
max_oomkill_value = (
np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0
)
Expand All @@ -160,10 +174,7 @@ def __calculate_memory_proposal(
if len(data) == 0:
return ResourceRecommendation.undefined(info="No data")

# NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
# As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
# So each pod is string with pod name, and values is numpy array of shape (N, 2)
data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()}
data_count = {pod: values[0, 1] for pod, values in data_count.items()}
total_points_count = sum(data_count.values())

if total_points_count < self.settings.points_required:
Expand All @@ -176,9 +187,17 @@ def __calculate_memory_proposal(
):
return ResourceRecommendation.undefined(info="HPA detected")

memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value)
memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value, is_jvm)
info = []
if oomkill_detected:
info.append("OOMKill detected")
if is_jvm:
info.append("JVM application detected")

return ResourceRecommendation(
request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None
request=memory_usage,
limit=memory_usage,
info=", ".join(info) if info else None
)

def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
Expand Down
160 changes: 160 additions & 0 deletions tests/test_jvm_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import pytest
from datetime import datetime, timedelta
import numpy as np

from robusta_krr.core.integrations.prometheus.metrics.memory import (
JVMMemoryLoader,
MaxJVMMemoryLoader,
JVMMemoryAmountLoader,
JVMDetector,
)
from robusta_krr.core.models.objects import K8sObjectData, PodData


@pytest.fixture
def mock_pod_data():
    """A two-pod Deployment workload used as the query target in the tests below."""
    return K8sObjectData(
        name="test-app",
        namespace="default",
        kind="Deployment",
        container="app",
        pods=[
            PodData(name="test-app-pod-1", namespace="default"),
            PodData(name="test-app-pod-2", namespace="default"),
        ],
    )


@pytest.fixture
def mock_prometheus_response():
    """A Prometheus range-query ("matrix") payload with one series per pod.

    Values are ``[timestamp, value-as-string]`` pairs, matching the HTTP API
    wire format; loaders are expected to convert them to numeric numpy arrays.
    """
    return {
        "status": "success",
        "data": {
            "resultType": "matrix",
            "result": [
                {
                    "metric": {
                        "container": "app",
                        "pod": "test-app-pod-1",
                        "job": "kubernetes-pods",
                    },
                    "values": [
                        [1625097600, "1000000"],  # 1MB
                        [1625097900, "2000000"],  # 2MB
                        [1625098200, "1500000"],  # 1.5MB
                    ],
                },
                {
                    "metric": {
                        "container": "app",
                        "pod": "test-app-pod-2",
                        "job": "kubernetes-pods",
                    },
                    "values": [
                        [1625097600, "1200000"],  # 1.2MB
                        [1625097900, "1800000"],  # 1.8MB
                        [1625098200, "1600000"],  # 1.6MB
                    ],
                },
            ],
        },
    }


def test_jvm_memory_loader_query(mock_pod_data):
    """The range query must target heap-area JVM memory for every pod of the workload."""
    query = JVMMemoryLoader().get_query(mock_pod_data, "1h", "1m")

    for fragment in (
        "jvm_memory_bytes_used",
        'area="heap"',
        "test-app-pod-1|test-app-pod-2",
        'namespace="default"',
        'container="app"',
    ):
        assert fragment in query


def test_max_jvm_memory_loader_query(mock_pod_data):
    """The peak query must wrap the heap metric in max_over_time and cover all pods."""
    query = MaxJVMMemoryLoader().get_query(mock_pod_data, "1h", "1m")

    for fragment in (
        "jvm_memory_bytes_used",
        'area="heap"',
        "max_over_time",
        "test-app-pod-1|test-app-pod-2",
    ):
        assert fragment in query


def test_jvm_memory_amount_loader_query(mock_pod_data):
    """The data-point counter must use count_over_time on the heap metric for all pods."""
    query = JVMMemoryAmountLoader().get_query(mock_pod_data, "1h", "1m")

    for fragment in (
        "jvm_memory_bytes_used",
        'area="heap"',
        "count_over_time",
        "test-app-pod-1|test-app-pod-2",
    ):
        assert fragment in query


def test_jvm_detector_query(mock_pod_data):
    """The detector probes for the JVM metric across all pods, without an area filter."""
    query = JVMDetector().get_query(mock_pod_data, "1h", "1m")

    for fragment in ("jvm_memory_bytes_used", "test-app-pod-1|test-app-pod-2"):
        assert fragment in query


def test_jvm_memory_loader_parse_response(mock_prometheus_response):
    """Parsed responses are keyed by pod name, with values as (N, 2) numpy arrays."""
    parsed = JVMMemoryLoader().parse_response(mock_prometheus_response)

    assert set(parsed) == {"test-app-pod-1", "test-app-pod-2"}

    series = parsed["test-app-pod-1"]
    assert isinstance(series, np.ndarray)
    assert series.shape == (3, 2)  # 3 samples of [timestamp, value]
    assert series[:, 1].max() == 2000000  # peak of the mocked samples is 2MB


def test_max_jvm_memory_loader_parse_response(mock_prometheus_response):
    """The max loader parses the same matrix payload into per-pod numpy arrays."""
    parsed = MaxJVMMemoryLoader().parse_response(mock_prometheus_response)

    assert set(parsed) == {"test-app-pod-1", "test-app-pod-2"}

    series = parsed["test-app-pod-1"]
    assert isinstance(series, np.ndarray)
    assert series.shape == (3, 2)  # 3 samples of [timestamp, value]
    assert series[:, 1].max() == 2000000  # peak of the mocked samples is 2MB


def test_jvm_memory_amount_loader_parse_response(mock_prometheus_response):
    """parse_response passes Prometheus sample values through unchanged.

    The actual data-point counting happens server-side via ``count_over_time``
    in the PromQL query; the client-side parser only reshapes the payload.
    With the shared mock (raw byte samples), the previous expectation of
    ``sum == 3`` could therefore never hold — the parsed values are the
    original samples (1MB + 2MB + 1.5MB = 4.5MB for pod 1).
    """
    loader = JVMMemoryAmountLoader()
    result = loader.parse_response(mock_prometheus_response)

    assert len(result) == 2
    assert "test-app-pod-1" in result
    assert "test-app-pod-2" in result

    # Check if values are properly converted to numpy arrays
    pod1_values = result["test-app-pod-1"]
    assert isinstance(pod1_values, np.ndarray)
    assert pod1_values.shape == (3, 2)
    assert np.sum(pod1_values[:, 1]) == 4500000  # raw samples pass through unchanged


def test_jvm_detector_parse_response(mock_prometheus_response):
    """The detector parses the matrix payload into per-pod numpy arrays."""
    parsed = JVMDetector().parse_response(mock_prometheus_response)

    assert set(parsed) == {"test-app-pod-1", "test-app-pod-2"}

    series = parsed["test-app-pod-1"]
    assert isinstance(series, np.ndarray)
    assert series.shape == (3, 2)  # 3 samples of [timestamp, value]
    assert series[:, 1].max() == 2000000  # peak of the mocked samples is 2MB
Loading