Skip to content

Commit

Permalink
solve race condition between job and pod creation by adding retries (#…
Browse files Browse the repository at this point in the history
…1446)

* solve race condition between job and pod creation by adding retries

* pull default config from env vars
  • Loading branch information
arikalon1 authored May 31, 2024
1 parent e30de52 commit 98a9b1a
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
7 changes: 5 additions & 2 deletions src/robusta/core/model/env_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,10 @@ def load_bool(env_var, default: bool):
# lowered case k8s kinds in a json array string. "[\"configmap\", \"secret\"]"
RESOURCE_YAML_BLOCK_LIST = json.loads(os.environ.get("RESOURCE_YAML_BLOCK_LIST", "[]"))

NAMESPACE_DATA_TTL = int(os.environ.get("NAMESPACE_DATA_TTL", 30*60)) # in seconds
NAMESPACE_DATA_TTL = int(os.environ.get("NAMESPACE_DATA_TTL", 30 * 60)) # in seconds

PROCESSED_ALERTS_CACHE_TTL = int(os.environ.get("PROCESSED_ALERT_CACHE_TTL", 2*3600))
PROCESSED_ALERTS_CACHE_TTL = int(os.environ.get("PROCESSED_ALERT_CACHE_TTL", 2 * 3600))
PROCESSED_ALERTS_CACHE_MAX_SIZE = int(os.environ.get("PROCESSED_ALERTS_CACHE_MAX_SIZE", 100_000))

POD_WAIT_RETRIES = int(os.environ.get("POD_WAIT_RETRIES", 10))
POD_WAIT_RETRIES_SECONDS = int(os.environ.get("POD_WAIT_RETRIES_SECONDS", 5))
2 changes: 1 addition & 1 deletion src/robusta/core/sinks/robusta/dal/supabase_dal.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def publish_cluster_nodes(self, node_count: int, pod_count: int):
try:
self.client.rpc(UPDATE_CLUSTER_NODE_COUNT, data).execute()
except Exception as e:
logging.error(f"Failed to publish node count {data} error: {e}")
logging.exception(f"Failed to publish node count {data} error: {e}")

logging.debug(f"cluster nodes: {UPDATE_CLUSTER_NODE_COUNT} => {data}")

Expand Down
20 changes: 16 additions & 4 deletions src/robusta/integrations/kubernetes/custom_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@
from kubernetes.client import ApiException
from pydantic import BaseModel

from robusta.core.model.env_vars import IMAGE_REGISTRY, INSTALLATION_NAMESPACE, RUNNER_SERVICE_ACCOUNT
from robusta.core.model.env_vars import (
IMAGE_REGISTRY,
INSTALLATION_NAMESPACE,
POD_WAIT_RETRIES,
POD_WAIT_RETRIES_SECONDS,
RUNNER_SERVICE_ACCOUNT,
)
from robusta.integrations.kubernetes.api_client_utils import (
SUCCEEDED_STATE,
exec_shell_command,
Expand Down Expand Up @@ -451,13 +457,19 @@ def get_pods(self) -> List[RobustaPod]:
# we serialize and then deserialize to work around https://github.com/haxsaw/hikaru/issues/15
return [hikaru.from_dict(pod.to_dict(), cls=RobustaPod) for pod in pods.items]

def get_single_pod(self) -> RobustaPod:
def get_single_pod(self, retries: int = POD_WAIT_RETRIES, wait: int = POD_WAIT_RETRIES_SECONDS) -> RobustaPod:
"""
like get_pods() but verifies that only one pod is associated with the job and returns that pod
if no pods, retry X times with Y seconds wait
"""
pods = self.get_pods()
while retries > 0 and len(pods) == 0:
time.sleep(wait)
pods = self.get_pods()
retries -= 1

if len(pods) != 1:
raise Exception(f"got more pods than expected for job {self.metadata.name}: {pods}")
raise Exception(f"got {len(pods)} pods. expected 1 for job {self.metadata.name}: {pods}")
return pods[0]

def create_job_owned_secret(self, job_secret: JobSecret):
Expand All @@ -478,7 +490,7 @@ def create_job_owned_secret(self, job_secret: JobSecret):
metadata=ObjectMeta(name=job_secret.name, ownerReferences=[robusta_owner_reference]), data=job_secret.data
)
try:
return secret.createNamespacedSecret(job_pod.metadata.namespace).obj
secret.createNamespacedSecret(job_pod.metadata.namespace).obj
except Exception as e:
logging.error(f"Failed to create secret {job_secret.name}", exc_info=True)
raise e
Expand Down

0 comments on commit 98a9b1a

Please sign in to comment.