Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/aks-agent/HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ To release a new version, please select a new version number (usually plus 1 to
Pending
+++++++

1.0.0b22
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: HISTORY.rst says "Bump aks-agent to v0.7.1" but the PR description says "v0.7.0". The code in _consts.py matches the changelog (v0.7.1). Please update the PR description to be consistent.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

++++++++
* Bump aks-agent to v0.7.1
* Suppress litellm debug logs
* Feature: Separate Azure OpenAI provider into API Key and Microsoft Entra ID (keyless) providers
* Feature: Add --yes/-y flag to agent-cleanup command to skip confirmation prompt

1.0.0b21
++++++++
* Bump aks-agent to v0.6.0
Expand Down
2 changes: 1 addition & 1 deletion src/aks-agent/azext_aks_agent/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
AKS_MCP_LABEL_SELECTOR = "app.kubernetes.io/name=aks-mcp"

# AKS Agent Version (shared by helm chart and docker image)
AKS_AGENT_VERSION = "0.6.0"
AKS_AGENT_VERSION = "0.7.1"

# Helm Configuration
HELM_VERSION = "3.16.0"
6 changes: 6 additions & 0 deletions src/aks-agent/azext_aks_agent/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,9 @@ def load_arguments(self, _):
help="The mode decides how the agent is deployed.",
default="cluster",
)
c.argument(
"yes",
options_list=["--yes", "-y"],
action="store_true",
help="Do not prompt for confirmation.",
)
18 changes: 17 additions & 1 deletion src/aks-agent/azext_aks_agent/agent/k8s/aks_agent_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def _populate_api_keys_from_secret(self):
)

if not secret.data:
logger.warning("Secret '%s' exists but has no data", self.llm_secret_name)
logger.debug("Secret '%s' exists but has no data", self.llm_secret_name)
return

# Decode secret data (base64 encoded)
Expand Down Expand Up @@ -927,6 +927,22 @@ def _create_helm_values(self):
"create": False,
}

# Configure aks-agent pod to use the same service account as aks-mcp for workload identity
helm_values["workloadIdentity"] = {
"enabled": True,
}
helm_values["serviceAccount"] = {
"create": False,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

workload identity is enabled unconditionally for all deployments

These Helm values (workloadIdentity.enabled=true, serviceAccount.create=false) are set for every deployment, even when the user is purely using API key authentication and does not have workload identity configured on their cluster.

This could cause deployment failures or unexpected behavior on clusters without workload identity. Consider gating this behind the has_empty_api_key check below:

has_empty_api_key = any(
    not (model_config.get("api_key") or "").strip()
    for model_config in self.llm_config_manager.model_list.values()
)
if has_empty_api_key:
    helm_values["workloadIdentity"] = {"enabled": True}
    helm_values["serviceAccount"] = {
        "create": False,
        "name": self.aks_mcp_service_account_name,
    }
    helm_values["azureADTokenAuth"] = True

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's also by design. It's ok to not have workload identity.

"name": self.aks_mcp_service_account_name,
}

has_empty_api_key = any(
not model_config.get("api_key") or not model_config.get("api_key").strip()
for model_config in self.llm_config_manager.model_list.values()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: the empty api_key condition can be simplified. The or short-circuit already handles None/empty string, so the second .strip() check only adds value for whitespace-only strings. A simpler equivalent:

has_empty_api_key = any(
    not (model_config.get("api_key") or "").strip()
    for model_config in self.llm_config_manager.model_list.values()
)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each LLM provider will check the emptiness of the API key.

)
if has_empty_api_key:
helm_values["azureADTokenAuth"] = True

return helm_values

def save_llm_config(self, provider: LLMProvider, params: dict) -> None:
Expand Down
6 changes: 4 additions & 2 deletions src/aks-agent/azext_aks_agent/agent/llm_config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ def get_env_vars(self, secret_name: str) -> List[Dict[str, str]]:
"""
env_vars_list = []
for _, model_config in self.model_list.items():
env_var = LLMProvider.to_env_vars(secret_name, model_config)
env_vars_list.append(env_var)
api_key = model_config.get("api_key")
if api_key and api_key.strip():
env_var = LLMProvider.to_env_vars(secret_name, model_config)
env_vars_list.append(env_var)
return env_vars_list


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from .anthropic_provider import AnthropicProvider
from .azure_provider import AzureProvider
from .azure_entraid_provider import AzureEntraIDProvider
from .base import LLMProvider
from .gemini_provider import GeminiProvider
from .openai_compatible_provider import OpenAICompatibleProvider
Expand All @@ -19,11 +20,11 @@

_PROVIDER_CLASSES: List[LLMProvider] = [
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential bug: PROVIDER_REGISTRY key collision

Both AzureProvider and AzureEntraIDProvider have model_route = "azure", which means their name property (derived from model_route) is both "azure". If PROVIDER_REGISTRY is keyed by name, the second registration (AzureEntraIDProvider) will silently overwrite the first (AzureProvider).

This could cause issues when looking up a provider by name. Consider giving AzureEntraIDProvider a distinct name (e.g., by overriding the name property to return "azure_entraid").

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's by design. We want two kinds of Azure OpenAI providers.

AzureProvider,
AzureEntraIDProvider,
OpenAIProvider,
AnthropicProvider,
GeminiProvider,
OpenAICompatibleProvider,
# Add new providers here
]

PROVIDER_REGISTRY = {}
Expand All @@ -49,8 +50,9 @@ def _get_provider_by_index(idx: int) -> LLMProvider:
Raises ValueError if index is out of range.
"""
if 1 <= idx <= len(_PROVIDER_CLASSES):
console.print("You selected provider:", _PROVIDER_CLASSES[idx - 1]().readable_name, style=f"bold {HELP_COLOR}")
return _PROVIDER_CLASSES[idx - 1]()
provider = _PROVIDER_CLASSES[idx - 1]()
console.print("You selected provider:", provider.readable_name, style=f"bold {HELP_COLOR}")
return provider
raise ValueError(f"Invalid provider index: {idx}")


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------


from typing import Tuple

from .base import LLMProvider, is_valid_url, non_empty


def is_valid_api_base(v: str) -> bool:
    """Validate an Azure OpenAI endpoint URL.

    Azure OpenAI endpoints must be served over HTTPS; anything else is
    rejected before the general URL check runs.
    """
    return v.startswith("https://") and is_valid_url(v)


class AzureEntraIDProvider(LLMProvider):
@property
def readable_name(self) -> str:
    """Display label for this provider; distinguishes the keyless
    (Microsoft Entra ID) variant from the API-key Azure OpenAI provider."""
    return "Azure OpenAI (Microsoft Entra ID)"

@property
def model_route(self) -> str:
    """Model route prefix used in the model list.

    NOTE: intentionally shares the "azure" route with the API-key
    AzureProvider — both target Azure OpenAI and differ only in how
    credentials are supplied (this duplication is by design).
    """
    return "azure"

@property
def parameter_schema(self):
    """Schema of the parameters prompted for during CLI configuration.

    Each entry maps a parameter name to prompt metadata:
      - secret: presumably marks values to be masked / stored in a k8s
        secret — confirm against the LLMProvider base class
      - default: pre-filled value offered to the user (None = no default)
      - hint: extra guidance text shown alongside the prompt
      - validator: callable used to accept or reject the entered value
      - alias: presumably an alternate key name used when emitting the
        config — confirm against the base class

    Unlike the API-key Azure provider, there is no "api_key" entry:
    authentication is keyless (Microsoft Entra ID).
    """
    return {
        "model": {
            "secret": False,
            "default": None,
            "hint": "ensure your deployment name is the same as the model name, e.g., gpt-5",
            "validator": non_empty,
            "alias": "deployment_name"
        },
        "api_base": {
            "secret": False,
            "default": None,
            # Endpoint must be an https:// URL (see is_valid_api_base).
            "validator": is_valid_api_base
        },
        "api_version": {
            "secret": False,
            "default": "2025-04-01-preview",
            "hint": None,
            "validator": non_empty
        }
    }

def validate_connection(self, params: dict) -> Tuple[str, str]:
api_base = params.get("api_base")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unlike AzureProvider.validate_connection, this method only checks parameter presence and does not actually validate the connection to the Azure OpenAI endpoint. This is understandable since the Entra ID token is not available at CLI configuration time (it will be obtained by the pod via workload identity).

However, it would be helpful to add a comment here explaining why connection validation is skipped, so future maintainers don't mistake this for an incomplete implementation.

def validate_connection(self, params: dict) -> Tuple[str, str]:
    # Connection validation is skipped for Entra ID auth because the token
    # is obtained at runtime via workload identity, not available at CLI config time.
    ...

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot validate the connection for keyless Azure OpenAI because it relies on the Azure role of the current user.

api_version = params.get("api_version")
deployment_name = params.get("model")

if not all([api_base, api_version, deployment_name]):
return "Missing required Azure parameters.", "retry_input"

return None, "save"
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def is_valid_api_base(v: str) -> bool:
class AzureProvider(LLMProvider):
@property
def readable_name(self) -> str:
return "Azure OpenAI"
return "Azure OpenAI (API Key)"

@property
def model_route(self) -> str:
Expand Down
10 changes: 8 additions & 2 deletions src/aks-agent/azext_aks_agent/agent/llm_providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,8 @@ def to_k8s_secret_data(cls, params: dict):
"""
secret_key = cls.sanitize_k8s_secret_key(params)
secret_value = params.get("api_key")
if not secret_value or not secret_value.strip():
return {}
secret_data = {
secret_key: base64.b64encode(secret_value.encode("utf-8")).decode("utf-8"),
}
Expand Down Expand Up @@ -206,9 +208,13 @@ def to_secured_model_list_config(cls, params: dict) -> Dict[str, dict]:
"""Create a model config dictionary for the model list from the provider parameters.
Returns a copy of params with the api_key replaced by environment variable reference.
"""
secret_key = cls.sanitize_k8s_secret_key(params)
secured_params = params.copy()
secured_params.update({"api_key": f"{{{{ env.{secret_key} }}}}"})
api_key = params.get("api_key")
if api_key and api_key.strip():
secret_key = cls.sanitize_k8s_secret_key(params)
secured_params.update({"api_key": f"{{{{ env.{secret_key} }}}}"})
else:
secured_params.pop("api_key", None)
return secured_params

@classmethod
Expand Down
44 changes: 36 additions & 8 deletions src/aks-agent/azext_aks_agent/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,19 @@ def _setup_helm_deployment(console, aks_agent_manager: AKSAgentManager):
f"\n👤 Current service account in namespace '{aks_agent_manager.namespace}': {service_account_name}",
style="cyan")

# Check if using Azure Entra ID provider and show role assignment reminder
model_list = aks_agent_manager.get_llm_config()
if model_list and any("azure/" in model_name and not model_config.get("api_key") for model_name, model_config in model_list.items()):
console.print(
f"\n⚠️ IMPORTANT: If using keyless authentication with Azure OpenAI, ensure the 'Cognitive Services OpenAI User' or 'Azure AI Developer' role "
f"is assigned to the workload identity (service account: {service_account_name}).",
style=f"bold {INFO_COLOR}"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This Entra ID detection + warning block is duplicated almost identically in the helm_status == "not_found" branch below. Consider extracting it into a helper function like _warn_entraid_role_assignment(console, model_list, service_account_name) to reduce duplication.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This minor change can be included in a following PR.

)
console.print(
"Learn more: https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/managed-identity\n",
style=INFO_COLOR
)

elif helm_status == "not_found":
console.print(
f"Helm chart not deployed (status: {helm_status}). Setting up deployment...",
Expand All @@ -197,6 +210,19 @@ def _setup_helm_deployment(console, aks_agent_manager: AKSAgentManager):
"'azure.workload.identity/client-id: <managed-identity-client-id>'.",
style=WARNING_COLOR)

# Check if using Azure Entra ID provider and show role assignment note
model_list = aks_agent_manager.get_llm_config()
if model_list and any("azure/" in model_name and not model_config.get("api_key") for model_name, model_config in model_list.items()):
console.print(
"\n⚠️ NOTE: You are using keyless authentication with Azure OpenAI. "
"Ensure the 'Cognitive Services OpenAI User' or 'Azure AI Developer' role is assigned to the workload identity.",
style=f"bold {INFO_COLOR}"
)
console.print(
"Learn more: https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/managed-identity",
style=INFO_COLOR
)

# Prompt user for service account name (required)
while True:
user_input = console.input(
Expand Down Expand Up @@ -422,6 +448,7 @@ def aks_agent_cleanup(
cluster_name,
namespace,
mode=None,
yes=False,
):
"""Cleanup and uninstall the AKS agent."""
with CLITelemetryClient(event_type="cleanup") as telemetry_client:
Expand All @@ -442,16 +469,17 @@ def aks_agent_cleanup(
f"⚠️ Warning: --namespace '{namespace}' is specified but will be ignored in client mode.",
style=WARNING_COLOR)

console.print(
"\n⚠️ Warning: This will uninstall the AKS agent and delete all associated resources.",
style=WARNING_COLOR)
if not yes:
console.print(
"\n⚠️ Warning: This will uninstall the AKS agent and delete all associated resources.",
style=WARNING_COLOR)

user_confirmation = console.input(
f"\n[{WARNING_COLOR}]Are you sure you want to proceed with cleanup? (y/N): [/]").strip().lower()
user_confirmation = console.input(
f"\n[{WARNING_COLOR}]Are you sure you want to proceed with cleanup? (y/N): [/]").strip().lower()

if user_confirmation not in ['y', 'yes']:
console.print("❌ Cleanup cancelled.", style=INFO_COLOR)
return
if user_confirmation not in ['y', 'yes']:
console.print("❌ Cleanup cancelled.", style=INFO_COLOR)
return

console.print("\n🗑️ Starting cleanup (this typically takes a few seconds)...", style=INFO_COLOR)

Expand Down
2 changes: 1 addition & 1 deletion src/aks-agent/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from setuptools import find_packages, setup

VERSION = "1.0.0b21"
VERSION = "1.0.0b22"

CLASSIFIERS = [
"Development Status :: 4 - Beta",
Expand Down
Loading