Skip to content

Commit

Permalink
holmes integration - WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
arikalon1 committed Jun 21, 2024
1 parent 7eb7548 commit 86fbd01
Show file tree
Hide file tree
Showing 17 changed files with 364 additions and 25 deletions.
6 changes: 5 additions & 1 deletion helm/robusta/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,8 @@ dependencies:
- name: kube-prometheus-stack
version: 55.7.0
condition: enablePrometheusStack
repository: "https://prometheus-community.github.io/helm-charts"
repository: "https://prometheus-community.github.io/helm-charts"
- name: holmes
version: 0.0.2
condition: enableHolmesGPT
repository: "https://robusta-charts.storage.googleapis.com"
4 changes: 4 additions & 0 deletions helm/robusta/templates/runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ spec:
- name: DISABLE_HELM_MONITORING
value: "True"
{{- end }}
{{- /* Expose the enableHolmesGPT chart value to the runner as HOLMES_ENABLED. */}}
{{- if .Values.enableHolmesGPT }}
- name: HOLMES_ENABLED
  value: "True"
{{- end }}
{{- if .Values.scaleAlertsProcessing }}
- name: ALERTS_WORKERS_POOL
value: "True"
Expand Down
10 changes: 8 additions & 2 deletions helm/robusta/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ global:

automountServiceAccountToken: true

enableHolmesGPT: false

# see https://docs.robusta.dev/master/user-guide/configuration.html#global-config and https://docs.robusta.dev/master/configuration/additional-settings.html#global-config
globalConfig:
check_prometheus_flags: true
Expand All @@ -36,6 +38,10 @@ globalConfig:
alertRelabel: []

# safe actions to enable authenticated users to run
lightActions:
- related_pods
- prometheus_enricher
Expand Down Expand Up @@ -73,7 +79,6 @@ lightActions:
- node_dmesg_enricher
- status_enricher
- popeye_scan
- krr_scan
- handle_alertmanager_event
- drain
- cordon
Expand Down Expand Up @@ -497,7 +502,8 @@ platformPlaybooks:
- "robusta_ui_sink"

# Any playbook name listed here will be disabled
disabledPlaybooks: []
disabledPlaybooks:
- WeeklyKRRScan

image:
registry: us-central1-docker.pkg.dev/genuine-flight-317411/devel
Expand Down
3 changes: 3 additions & 0 deletions src/robusta/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from robusta.core.discovery.resource_names import ResourceNameLister
from robusta.core.model.base_params import (
ActionParams,
AIInvestigateParams,
AlertResourceGraphEnricherParams,
BashParams,
ChartValuesFormat,
Expand Down Expand Up @@ -184,6 +185,7 @@
)
from robusta.core.reporting.custom_rendering import RendererType, charts_style, render_value
from robusta.core.reporting.finding_subjects import KubeObjFindingSubject, PodFindingSubject
from robusta.core.reporting.holmes import HolmesRequest, HolmesResult, HolmesResultsBlock
from robusta.core.schedule.model import (
DynamicDelayRepeat,
FixedDelayRepeat,
Expand Down Expand Up @@ -296,6 +298,7 @@
)
from robusta.integrations.prometheus.utils import (
AlertManagerDiscovery,
HolmesDiscovery,
PrometheusDiscovery,
ServiceDiscovery,
get_prometheus_connect,
Expand Down
60 changes: 55 additions & 5 deletions src/robusta/core/model/base_params.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from enum import Enum, auto
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, SecretStr, validator

Expand All @@ -21,6 +21,7 @@ class ChartValuesFormat(Enum):
def __str__(self):
return self.name


class ResourceChartItemType(Enum):
"""
Item selection for Alert resource enricher
Expand Down Expand Up @@ -70,6 +71,45 @@ def post_initialization(self):
pass


class ResourceInfo(BaseModel):
    """Describes the resource an AI investigation refers to."""

    # `name` and `kind` are mandatory; the remaining locators may be left out.
    name: str
    namespace: Optional[str] = None
    kind: str
    node: Optional[str] = None
    container: Optional[str] = None


class HolmesParams(ActionParams):
    """
    :var holmes_url: Optional url of the holmes service. When given without an http(s):// scheme, http:// is prepended.
    """

    holmes_url: Optional[str]

    @validator("holmes_url", allow_reuse=True)
    def validate_protocol(cls, v):
        # Test for the full scheme rather than the bare "http" prefix: a host such as
        # "httpbin.svc:80" starts with "http" but still has no protocol.
        if v and not v.startswith(("http://", "https://")):
            v = f"http://{v}"
            logging.info(f"Adding protocol to holmes_url: {v}")
        return v


class AIInvestigateParams(HolmesParams):
    """
    :var resource: The resource related to this investigation. A resource has a `name` and `kind`, and may have `namespace`, `node` and `container`
    :var investigation_type: The type of investigation: Issue/Service/Cluster/Custom
    :var runbooks: List of human readable recommended runbooks that holmes can use for the investigation.
    :var ask: Override question to ask holmes
    :var context: Additional information that can assist with the investigation
    :example ask: What are all the issues in my cluster right now?
    :example runbooks: ["Try to get the pod logs and find errors", "get the pod yaml and check if there are finalizers"]
    """

    # `investigation_type` is the only mandatory field; all others default to None.
    resource: Optional[ResourceInfo] = None
    investigation_type: str
    runbooks: Optional[List[str]] = None
    ask: Optional[str] = None
    context: Optional[Dict[str, Any]] = None


class PodRunningParams(ActionParams):
"""
:var custom_annotations: custom annotations to be used for the running pod/job
Expand Down Expand Up @@ -340,7 +380,17 @@ class OomKillParams(OOMGraphEnricherParams):
container_memory_graph: Optional[bool] = False
node_memory_graph: Optional[bool] = False

def __init__(self, attach_logs: Optional[bool] = False, container_memory_graph: Optional[bool] = False,
node_memory_graph: Optional[bool] = False, **kwargs):
super().__init__(attach_logs=attach_logs, container_memory_graph=container_memory_graph,
node_memory_graph=node_memory_graph, resource_type=ResourceChartResourceType.Memory.name, **kwargs)
def __init__(
self,
attach_logs: Optional[bool] = False,
container_memory_graph: Optional[bool] = False,
node_memory_graph: Optional[bool] = False,
**kwargs,
):
super().__init__(
attach_logs=attach_logs,
container_memory_graph=container_memory_graph,
node_memory_graph=node_memory_graph,
resource_type=ResourceChartResourceType.Memory.name,
**kwargs,
)
2 changes: 2 additions & 0 deletions src/robusta/core/model/env_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,5 @@ def load_bool(env_var, default: bool):

POD_WAIT_RETRIES = int(os.environ.get("POD_WAIT_RETRIES", 10))
POD_WAIT_RETRIES_SECONDS = int(os.environ.get("POD_WAIT_RETRIES_SECONDS", 5))

HOLMES_ENABLED = load_bool("HOLMES_ENABLED", False)
Empty file.
64 changes: 64 additions & 0 deletions src/robusta/core/playbooks/internal/ai_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import json
import logging

import requests

from robusta.core.model.base_params import AIInvestigateParams
from robusta.core.model.events import ExecutionBaseEvent
from robusta.core.playbooks.actions_registry import action
from robusta.core.reporting import Finding, FindingSubject
from robusta.core.reporting.base import EnrichmentType
from robusta.core.reporting.consts import FindingSubjectType, FindingType
from robusta.core.reporting.holmes import HolmesRequest, HolmesResult, HolmesResultsBlock
from robusta.integrations.prometheus.utils import HolmesDiscovery


@action
def ask_holmes(event: ExecutionBaseEvent, params: AIInvestigateParams):
    """
    Ask Holmes to investigate an issue and attach the analysis to the event as a finding.

    The request is POSTed to `<holmes_url>/api/investigate`. Failures while building the request,
    calling Holmes or creating the finding are logged and swallowed, so no finding is added.

    :param event: the event the resulting finding is added to
    :param params: investigation parameters; `resource` and `context` may both be omitted
    """
    holmes_url = HolmesDiscovery.find_holmes_url(params.holmes_url)
    if not holmes_url:
        logging.error("Holmes url not found")
        return

    # Both fields are Optional on AIInvestigateParams; normalise them once so the code
    # below never dereferences None.
    context = params.context or {}
    resource = params.resource

    try:
        issue_name = context.get("issue_type", "unknown health issue")
        holmes_req = HolmesRequest(
            source=context.get("source", "unknown source"),
            title=f"{issue_name}",
            description="",
            # HolmesRequest declares subject/context as required dicts, so send {} rather than None
            subject=resource.dict() if resource else {},
            context=context,
            include_tool_calls=True,
            include_tool_call_results=True,
        )
        result = requests.post(f"{holmes_url}/api/investigate", data=holmes_req.json())
        result.raise_for_status()

        holmes_result = HolmesResult(**json.loads(result.text))

        if resource:
            subject = FindingSubject(
                name=resource.name,
                namespace=resource.namespace,
                subject_type=FindingSubjectType.from_kind(resource.kind),
                node=resource.node,
                container=resource.container,
            )
            # "unresolved" is a placeholder name and adds nothing to the title
            has_real_name = bool(resource.name) and resource.name.lower() != "unresolved"
            title_suffix = f" on {resource.name}" if has_real_name else ""
        else:
            # NOTE(review): assumes FindingSubject can be built without arguments - confirm
            subject = FindingSubject()
            title_suffix = ""

        finding = Finding(
            title=f"AI Analysis of {issue_name}{title_suffix}",
            aggregation_key="HolmesInvestigationResult",
            subject=subject,
            finding_type=FindingType.AI_ANALYSIS,
            failure=False,
        )
        finding.add_enrichment(
            [HolmesResultsBlock(holmes_result=holmes_result)], enrichment_type=EnrichmentType.ai_analysis
        )

        event.add_finding(finding)

    except Exception:
        logging.exception("Failed to get holmes analysis")
1 change: 1 addition & 0 deletions src/robusta/core/reporting/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class VideoLink(BaseModel):

class EnrichmentType(Enum):
graph = "graph"
ai_analysis = "ai_analysis"
node_info = "node_info"
container_info = "container_info"
k8s_events = "k8s_events"
Expand Down
2 changes: 1 addition & 1 deletion src/robusta/core/reporting/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def create_for_func(

if not signing_key:
raise Exception("Cannot create callback request with no signing key. Configure signing_key in globalConfig")
action_params = {} if choice.action_params is None else choice.action_params.dict()
action_params = {} if choice.action_params is None else choice.action_params.dict(exclude_defaults=True)
if choice.kubernetes_object:
action_params["kind"] = choice.kubernetes_object.kind
action_params["name"] = choice.kubernetes_object.metadata.name
Expand Down
1 change: 1 addition & 0 deletions src/robusta/core/reporting/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class FindingType(Enum):
CONF_CHANGE = "configuration_change"
HEALTH_CHECK = "health_check"
REPORT = "report"
AI_ANALYSIS = "ai_analysis"

@classmethod
def from_type(cls, finding_type: str) -> "FindingType":
Expand Down
40 changes: 40 additions & 0 deletions src/robusta/core/reporting/holmes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from typing import Any, Dict, List, Optional

from pydantic import BaseModel

from robusta.core.reporting import BaseBlock


class HolmesRequest(BaseModel):
    """Payload serialized and sent to Holmes when requesting an investigation."""

    source: str  # "prometheus" etc
    title: str
    description: str
    subject: dict
    context: Dict[str, Any]
    # Tool-call details are left out of the request unless explicitly switched on.
    include_tool_calls: bool = False
    include_tool_call_results: bool = False


class ToolCallResult(BaseModel):
    """One tool call reported in a Holmes result: its name, description and result text."""

    tool_name: str
    description: str
    result: str


class HolmesResult(BaseModel):
    """Parsed Holmes response; both parts are optional and default to None."""

    tool_calls: Optional[List[ToolCallResult]] = None
    analysis: Optional[str] = None


class HolmesResultsBlock(BaseBlock):
    """Report block that wraps a Holmes investigation result."""

    holmes_result: Optional[HolmesResult]

    def __init__(self, holmes_result: Optional[HolmesResult] = None, **kwargs):
        # Forward everything to the base block as keyword arguments.
        super().__init__(holmes_result=holmes_result, **kwargs)
4 changes: 2 additions & 2 deletions src/robusta/core/sinks/mail/mail_sink_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ def _get_sink_type(cls):
def validate_mailto(cls, mailto):
    """
    Accept only `mailto://` and `mailtos://` addresses.

    :param mailto: the configured address
    :return: the address, unchanged
    :raises AttributeError: if the address uses any other scheme
    """
    # Make sure we only handle emails and exclude other schemes provided by apprise
    # (there is a lot of them).
    if not mailto.startswith(("mailto://", "mailtos://")):
        raise AttributeError(f"{mailto} is not a mailto(s) address")
    return mailto


Expand Down
32 changes: 28 additions & 4 deletions src/robusta/core/sinks/robusta/dal/model_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import uuid
from datetime import datetime
from typing import Any, Dict
from typing import Any, Dict, List

from robusta.core.model.env_vars import ENABLE_GRAPH_BLOCK
from robusta.core.reporting import (
Expand All @@ -21,8 +21,9 @@
PrometheusBlock,
TableBlock,
)
from robusta.core.reporting.blocks import GraphBlock, EmptyFileBlock
from robusta.core.reporting.blocks import EmptyFileBlock, GraphBlock
from robusta.core.reporting.callbacks import ExternalActionRequestBuilder
from robusta.core.reporting.holmes import HolmesResultsBlock
from robusta.core.sinks.transformer import Transformer
from robusta.utils.parsing import datetime_to_db_str

Expand Down Expand Up @@ -68,7 +69,7 @@ def to_finding_json(account_id: str, cluster_id: str, finding: Finding):
@staticmethod
def get_file_type(filename: str):
    """Return whatever follows the last "." in *filename*; raise ValueError when there is no dot."""
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        # Same error str.rindex produces for a missing separator.
        raise ValueError("substring not found")
    return extension

@staticmethod
def get_file_object(block: FileBlock):
Expand All @@ -85,6 +86,22 @@ def get_empty_file_object(block: EmptyFileBlock):
"data": "",
}

@staticmethod
def add_ai_analysis_data(structured_data: List[Dict], block: HolmesResultsBlock):
    """
    Append the evidence entries for a Holmes result to *structured_data*.

    Adds one markdown entry holding the analysis, followed by one zipped text-file entry per
    tool call. `holmes_result`, `analysis` and `tool_calls` are all optional on the models,
    so missing values are skipped instead of raising.

    :param structured_data: evidence list that is extended in place
    :param block: the block whose Holmes result is converted
    """
    holmes_result = block.holmes_result
    if not holmes_result:
        return

    structured_data.append(
        {
            "type": "markdown",
            "metadata": {"type": "ai_investigation_result"},
            "data": Transformer.to_github_markdown(holmes_result.analysis or ""),
        }
    )
    for tool_call in holmes_result.tool_calls or []:
        file_block = FileBlock(f"{tool_call.description}.txt", tool_call.result.encode())
        file_block.zip()
        data_obj = ModelConversion.get_file_object(file_block)
        data_obj["metadata"] = {"description": tool_call.description, "tool_name": tool_call.tool_name}
        structured_data.append(data_obj)

@staticmethod
def to_evidence_json(
account_id: str,
Expand All @@ -110,7 +127,12 @@ def to_evidence_json(
elif isinstance(block, GraphBlock):
if ENABLE_GRAPH_BLOCK:
structured_data.append(
{"type": "prometheus", "data": block.graph_data.dict(), "metadata": block.graph_data.metadata, "version": 1.0}
{
"type": "prometheus",
"data": block.graph_data.dict(),
"metadata": block.graph_data.metadata,
"version": 1.0,
}
)
else:
if block.is_text_file():
Expand All @@ -122,6 +144,8 @@ def to_evidence_json(
if block.is_text_file():
block.zip()
structured_data.append(ModelConversion.get_file_object(block))
elif isinstance(block, HolmesResultsBlock):
ModelConversion.add_ai_analysis_data(structured_data, block)
elif isinstance(block, HeaderBlock):
structured_data.append({"type": "header", "data": block.text})
elif isinstance(block, ListBlock):
Expand Down
2 changes: 2 additions & 0 deletions src/robusta/integrations/kubernetes/custom_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,8 @@ def run_simple_job_spec(
pod = job.get_single_pod()
return pod.get_logs() or ""
finally:
if job and not pod:
pod = job.get_single_pod()
if pod and finalizers:
try: # must use patch, since the pod revision changed at this point
body = {"metadata": {"$deleteFromPrimitiveList/finalizers": finalizers}}
Expand Down
Loading

0 comments on commit 86fbd01

Please sign in to comment.