diff --git a/api/v1/aiplatform_types.go b/api/v1/aiplatform_types.go index 344c6ce..3d5ba37 100644 --- a/api/v1/aiplatform_types.go +++ b/api/v1/aiplatform_types.go @@ -364,13 +364,13 @@ type SidecarSpec struct { // ObjectStorageSpec defines object storage configuration for AI artifacts, tasks, and models type ObjectStorageSpec struct { // Remote volume URI in the format s3://bucketname/, gs://bucketname/, - // azure://containername/, or minio://bucketname/ + // azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// // +kubebuilder:validation:Required - // +kubebuilder:validation:Pattern=`^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$` + // +kubebuilder:validation:Pattern=`^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$` Path string `json:"path"` - // Optional override endpoint (only needed for S3-compatible services like MinIO) - // Must be a valid HTTP/HTTPS URL + // Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + // Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) // +kubebuilder:validation:Optional // +kubebuilder:validation:Pattern=`^https?://.*$` Endpoint string `json:"endpoint,omitempty"` @@ -380,11 +380,17 @@ type ObjectStorageSpec struct { // +kubebuilder:validation:MinLength=1 Region string `json:"region"` - // Secret name containing storage credentials + // Secret name containing storage credentials (e.g. s3_access_key, s3_secret_key for S3-compatible backends) // +kubebuilder:validation:Optional // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=253 SecretRef string `json:"secretRef,omitempty"` + + // Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. 
+ // Values: aws, minio, seaweedfs, s3compat, gcs, azure + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Enum=aws;minio;seaweedfs;s3compat;gcs;azure + Provider string `json:"provider,omitempty"` } // IngressSpec defines Ingress configuration for external access to platform services diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index fe8f28d..d40e531 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -1,13 +1,19 @@ applications: - name: Entrypoint - import_path: splunkai_models_apps.custom.deployments.entrypoint.main:SERVE_APP + import_path: main:SERVE_APP route_prefix: / runtime_env: + working_dir: "file:///home/ray/ray/applications/entrypoint.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: entrypoint ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -22,7 +28,7 @@ applications: gpu_type_options_override: H100: ray_actor_options: - num_gpus: 0.025 + num_gpus: 0.0375 L40S: ray_actor_options: num_gpus: 0.05 @@ -37,7 +43,7 @@ applications: gpu_type_model_config_override: H100: engine_args: - gpu_memory_utilization: 0.025 + gpu_memory_utilization: 0.0375 L40S: engine_args: gpu_memory_utilization: 0.05 @@ -47,17 +53,23 @@ applications: tensor_parallel_size: 1 model_id: uae_large model_loader: - object_storage: - prefix: model_artifacts/uae-large + blob_storage: + blob_prefix: model_artifacts/uae-large name: UaeLarge - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /uae_large runtime_env: + 
working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: uae_large ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -91,17 +103,23 @@ applications: tensor_parallel_size: 1 model_id: all_minilm_l6_v2 model_loader: - object_storage: - prefix: model_artifacts/all-minilm-l6-v2 + blob_storage: + blob_prefix: model_artifacts/all-minilm-l6-v2 name: AllMinilmL6V2 - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /all_minilm_l6_v2 runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: all_minilm_l6_v2 ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -135,17 +153,23 @@ applications: tensor_parallel_size: 1 model_id: bi_encoder model_loader: - object_storage: - prefix: model_artifacts/bi-encoder + blob_storage: + blob_prefix: model_artifacts/bi-encoder name: BiEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /bi_encoder runtime_env: + working_dir: 
"file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: bi_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -172,20 +196,21 @@ applications: ray_actor_options: num_gpus: 0.2 deployment_type: custom_deployment - model_definition: - model_id: mbart_translator - model_loader: - object_storage: - prefix: model_artifacts/mbart-translator name: MbartTranslator - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /mbart_translator runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: mbart_translator ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -222,21 +247,26 @@ applications: model_config: engine_args: gpu_memory_utilization: 0.1 - task: classify tensor_parallel_size: 1 model_id: xlm_roberta_language_classifier model_loader: - object_storage: - prefix: model_artifacts/xlm-roberta-language-classifier + blob_storage: + blob_prefix: model_artifacts/xlm-roberta-language-classifier name: XlmRobertaLanguageClassifier - import_path: 
splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /xlm_roberta_language_classifier runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: xlm_roberta_language_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -249,14 +279,20 @@ applications: custom_deployment_import_path: prompt_injection_tfidf:PromptInjectionTfidfDeployment deployment_type: custom_deployment name: PromptInjectionTfidf - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_tfidf runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: APPLICATION_NAME: "PromptInjectionTfidf" API_VERSION: "v1" ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -290,18 +326,24 @@ applications: tensor_parallel_size: 1 model_id: cross_encoder model_loader: - object_storage: - prefix: model_artifacts/cross-encoder + blob_storage: + blob_prefix: model_artifacts/cross-encoder model_type: vllm_scoring_model name: CrossEncoder - 
import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /cross_encoder runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: cross_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -310,69 +352,70 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - args: - application_name: Llama31Instruct + application_name: GptOss20b deployment_configs: LLMDeployment: gpu_type_options_override: - A10G: - ray_actor_options: - num_gpus: 2 H100: ray_actor_options: num_gpus: 0.5 L40S: ray_actor_options: num_gpus: 1 - T4: - ray_actor_options: - num_gpus: 4 - runtime_env: - pip: - - triton==3.2.0 options: autoscaling_config: - max_replicas: {{.Replicas.Llama31Instruct}} - min_replicas: {{.Replicas.Llama31Instruct}} + max_replicas: {{.Replicas.GptOss20b}} + min_replicas: {{.Replicas.GptOss20b}} deployment_type: text_gen_model_deployment - gpu_types: '["L40S"]' + gpu_types: '["{{.AcceleratorType}}"]' model_definition: gpu_type_model_config_override: - A10G: - engine_args: - tensor_parallel_size: 2 H100: engine_args: - gpu_memory_utilization: 0.5 + gpu_memory_utilization: 0.90 tensor_parallel_size: 1 L40S: engine_args: + gpu_memory_utilization: 0.90 tensor_parallel_size: 1 - T4: - engine_args: - dtype: half - tensor_parallel_size: 4 - model_id: llama31_instruct + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + tool_parser: openai + responses: + enable_auto_tools: true + 
tool_parser: openai + model_id: gpt_oss_20b model_loader: - object_storage: - prefix: model_artifacts/llama31-8b-instruct + blob_storage: + blob_prefix: model_artifacts/gpt-oss-20b tokenizer_definition: - model_id: llama31_instruct + model_id: gpt_oss_20b model_loader: - object_storage: + blob_storage: artifacts_list: + - chat_template.jinja - config.json - tokenizer_config.json - tokenizer.json - prefix: model_artifacts/llama31-8b-instruct - name: Llama31Instruct - import_path: splunkai_models_apps.main:create_serve_app - route_prefix: /llama31_instruct + blob_prefix: model_artifacts/gpt-oss-20b + name: GptOss20b + import_path: main:create_serve_app + route_prefix: /gpt_oss_20b runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" - APPLICATION_NAME: llama31_instruct + APPLICATION_NAME: gpt_oss_20b + VLLM_ATTENTION_BACKEND: TRITON_ATTN ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -410,21 +453,26 @@ applications: model_config: engine_args: gpu_memory_utilization: 0.1 - task: classify tensor_parallel_size: 1 model_id: e5_language_classifier model_loader: - object_storage: - prefix: model_artifacts/e5-language-classifier + blob_storage: + blob_prefix: model_artifacts/e5-language-classifier name: E5LanguageClassifier - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /e5_language_classifier runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: 
e5_language_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -433,78 +481,70 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - args: - application_name: Llama3170bInstructAwq + application_name: GptOss120b deployment_configs: LLMDeployment: gpu_type_options_override: - A100: - ray_actor_options: - num_gpus: 4 - A10G: - ray_actor_options: - num_gpus: 4 H100: ray_actor_options: num_gpus: 1 L40S: ray_actor_options: num_gpus: 2 - T4: - ray_actor_options: - num_gpus: 8 - runtime_env: - pip: - - triton==3.2.0 options: autoscaling_config: - max_replicas: {{.Replicas.Llama3170bInstructAwq}} - min_replicas: {{.Replicas.Llama3170bInstructAwq}} - max_ongoing_requests: 4 + max_replicas: {{.Replicas.GptOss120b}} + min_replicas: {{.Replicas.GptOss120b}} deployment_type: text_gen_model_deployment - gpu_types: '["L40S"] ' + gpu_types: '["{{.AcceleratorType}}"]' model_definition: gpu_type_model_config_override: - A100: - engine_args: - tensor_parallel_size: 4 - A10G: - engine_args: - gpu_memory_utilization: 0.95 - tensor_parallel_size: 4 H100: engine_args: - gpu_memory_utilization: 0.95 + gpu_memory_utilization: 0.90 tensor_parallel_size: 1 L40S: engine_args: - gpu_memory_utilization: 0.95 + gpu_memory_utilization: 0.90 tensor_parallel_size: 2 - T4: - engine_args: - dtype: half - tensor_parallel_size: 8 - model_id: llama31_70b_instruct_awq + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + tool_parser: openai + responses: + enable_auto_tools: true + tool_parser: openai + model_id: gpt_oss_120b 
model_loader: - object_storage: - prefix: model_artifacts/llama31-70b-instruct-awq + blob_storage: + blob_prefix: model_artifacts/gpt-oss-120b tokenizer_definition: - model_id: llama31_70b_instruct_awq + model_id: gpt_oss_120b model_loader: - object_storage: + blob_storage: artifacts_list: + - chat_template.jinja - config.json - tokenizer_config.json - tokenizer.json - prefix: model_artifacts/llama31-70b-instruct-awq - name: Llama3170bInstructAwq - import_path: splunkai_models_apps.main:create_serve_app - route_prefix: /llama31_70b_instruct_awq + blob_prefix: model_artifacts/gpt-oss-120b + name: GptOss120b + import_path: main:create_serve_app + route_prefix: /gpt_oss_120b runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" - APPLICATION_NAME: llama31_70b_instruct_awq + APPLICATION_NAME: gpt_oss_120b + VLLM_ATTENTION_BACKEND: TRITON_ATTN ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -538,14 +578,20 @@ applications: local_path: /home/ray/local_model_artifacts/prompt-injection-cross-encoder-1114 model_type: sentence_transformer_cross_encoder name: PromptInjectionCrossEncoder - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_cross_encoder runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_cross_encoder ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + 
ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" @@ -564,14 +610,20 @@ applications: local_path: /home/ray/local_model_artifacts/prompt-injection-classifier-01052025 model_type: custom_model name: PromptInjectionClassifier - import_path: splunkai_models_apps.main:create_serve_app + import_path: main:create_serve_app route_prefix: /prompt_injection_classifier runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" APPLICATION_NAME: prompt_injection_classifier ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" ENABLE_AUTHN: "false" ENABLE_AUTHZ: "false" SERVICE_EXTERNAL_NAME: "ai-platform-models" diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index 0fec5fc..a9192da 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -4,8 +4,8 @@ applicationScale: CrossEncoder: 1 E5LanguageClassifier: 1 Entrypoint: 1 - Llama31Instruct: 1 - Llama3170bInstructAwq: 1 + GptOss20b: 1 + GptOss120b: 1 MbartTranslator: 1 PromptInjectionClassifier: 1 PromptInjectionCrossEncoder: 1 @@ -17,6 +17,9 @@ instanceScale: l40s-0-gpu: 1 l40s-1-gpu: 2 l40s-2-gpu: 1 + H100: + h100-0-gpu: 1 + h100-1-gpu: 2 H100_NVL: h100-nvl-0-gpu: 1 h100-nvl-1-gpu: 2 \ No newline at end of file diff --git 
a/config/configs/instance.yaml b/config/configs/instance.yaml index 46518de..e704fd7 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -18,8 +18,8 @@ L40S: cpu: "4" limits: cpu: "16" - memory: "16Gi" - ephemeral-storage: "50Gi" + memory: "64Gi" + ephemeral-storage: "200Gi" nvidia.com/gpu: "1" - tier: l40s-2-gpu gpusPerPod: 2 @@ -31,6 +31,30 @@ L40S: memory: "48Gi" ephemeral-storage: "100Gi" nvidia.com/gpu: "2" +H100: + - tier: h100-0-gpu + gpusPerPod: 0 + env: + NVIDIA_VISIBLE_DEVICES: void + resources: + limits: + cpu: "16" + memory: "32Gi" + ephemeral-storage: "10Gi" + nvidia.com/gpu: "0" + requests: + cpu: "4" + - tier: h100-1-gpu + gpusPerPod: 1 + # No NVIDIA_VISIBLE_DEVICES here - GPUs must be visible for vLLM + resources: + requests: + cpu: "4" + limits: + cpu: "16" + memory: "48Gi" + ephemeral-storage: "100Gi" + nvidia.com/gpu: "1" H100_NVL: - tier: h100-nvl-0-gpu gpusPerPod: 0 diff --git a/config/crd/bases/ai.splunk.com_aiplatforms.yaml b/config/crd/bases/ai.splunk.com_aiplatforms.yaml index 98675dc..67fc505 100644 --- a/config/crd/bases/ai.splunk.com_aiplatforms.yaml +++ b/config/crd/bases/ai.splunk.com_aiplatforms.yaml @@ -2227,15 +2227,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) 
pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -2243,7 +2255,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string diff --git a/config/crd/bases/ai.splunk.com_aiservices.yaml b/config/crd/bases/ai.splunk.com_aiservices.yaml index f9c3493..5bce496 100644 --- a/config/crd/bases/ai.splunk.com_aiservices.yaml +++ b/config/crd/bases/ai.splunk.com_aiservices.yaml @@ -1818,15 +1818,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) 
pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -1834,7 +1846,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string diff --git a/docs/configuration/object-storage.md b/docs/configuration/object-storage.md new file mode 100644 index 0000000..70a1f7a --- /dev/null +++ b/docs/configuration/object-storage.md @@ -0,0 +1,106 @@ +# Object Storage Selection + +This document describes how the Splunk AI Operator chooses the object storage backend and how to configure AWS S3, MinIO, SeaweedFS, or any S3-compatible storage. 
+ +## How the operator decides the backend + +The operator selects the storage backend **primarily by the path scheme** in `spec.objectStorage.path`: + +| Path scheme | Backend behavior | cloudProvider | artifactsProvider | +|-----------------|-------------------------------------|---------------|-------------------| +| `s3://` | **AWS S3** (region, IRSA, no custom endpoint) | `aws` | `s3` | +| `s3compat://` | **S3-compatible** (generic; requires endpoint + secretRef) | `s3compat` | `s3` | +| `minio://` | **MinIO** (alias for S3-compatible) | `s3compat` | `s3` | +| `seaweedfs://` | **SeaweedFS** (alias for S3-compatible) | `s3compat` | `s3` | +| `gs://` | **GCP Cloud Storage** | `gcp` | `gcs` | +| `azure://` | **Azure Blob Storage** | `azure` | `azure` | + +- **Path scheme** is the primary decision input (for `s3://`, setting `endpoint` additionally switches the backend to S3-compatible behavior); there is no separate "provider type" switch in the operator logic. +- For **S3-compatible** backends (MinIO, SeaweedFS, Ceph, or any custom S3 API), use **`s3compat://bucket/prefix`** with `endpoint` and `secretRef` set. You can also use `minio://` or `seaweedfs://` as aliases; all use the same implementation (AWS S3 SDK with custom endpoint and path-style). + +## cloudProvider vs artifactsProvider + +- **cloudProvider**: Identifies the *platform* (e.g. `aws` for native AWS S3, `s3compat` for MinIO/SeaweedFS/other S3-compatible). Used for telemetry and any logic that needs to distinguish "real AWS" from "custom S3-compatible". +- **artifactsProvider**: The *protocol* used to access artifacts. For all S3 API backends (AWS S3, MinIO, SeaweedFS) the protocol is the S3 API, so `artifactsProvider` is always `s3` for those. Only GCS and Azure use different protocols (`gcs`, `azure`). + +## Path schemes and required fields + +- **`s3://bucket/prefix`** + - Use for **AWS S3** only. + - Set `region`. Optionally use `secretRef` for static credentials; otherwise IRSA or default AWS credential chain is used. Do **not** set `endpoint` for native S3.
+ +- **`s3compat://bucket/prefix`** + - Use for **any S3-compatible** backend (MinIO, SeaweedFS, Ceph, etc.). + - **Required:** `endpoint` (e.g. `http://minio.namespace.svc:9000` or `http://seaweedfs-s3:8333`), `region` (any value), `secretRef` with `s3_access_key` and `s3_secret_key`. + +- **`minio://bucket/prefix`** + - Alias for S3-compatible; use for **MinIO** (in-cluster or external). Same requirements as `s3compat://`. + +- **`seaweedfs://bucket/prefix`** + - Alias for S3-compatible; use for **SeaweedFS** (bring your own). Same requirements as `s3compat://`. + +## Optional provider field + +`spec.objectStorage.provider` is an optional hint for documentation and tooling. Allowed values: `aws`, `minio`, `seaweedfs`, `s3compat`, `gcs`, `azure`. The operator **does not** use this field to select the backend; behavior is derived only from the path scheme (and for `s3://`, absence of endpoint). Use it for clarity in manifests or scripts. + +## YAML examples + +### AWS S3 + +```yaml +spec: + objectStorage: + path: s3://my-ai-bucket/artifacts + region: us-east-2 + # secretRef optional when using IRSA +``` + +### MinIO (in-cluster) + +```yaml +spec: + objectStorage: + path: minio://ai-platform-bucket/artifacts + endpoint: http://minio.minio.svc.cluster.local:9000 + region: us-east-1 + secretRef: minio-credentials +``` + +### MinIO (external, e.g. EC2) + +```yaml +spec: + objectStorage: + path: minio://ai-platform-bucket/artifacts + endpoint: http://10.0.1.50:9000 + region: us-east-1 + secretRef: minio-credentials +``` + +### SeaweedFS + +```yaml +spec: + objectStorage: + path: seaweedfs://my-bucket/artifacts + endpoint: http://seaweedfs-s3.my-namespace.svc:8333 + region: us-east-1 + secretRef: minio-credentials +``` + +### Generic S3-compatible (e.g. 
Ceph, custom endpoint) + +```yaml +spec: + objectStorage: + path: s3compat://my-bucket/artifacts + endpoint: http://s3-gateway.my-namespace.svc:8333 + region: us-east-1 + secretRef: minio-credentials +``` + +The same Kubernetes secret format is used for all S3-compatible backends: keys `s3_access_key` and `s3_secret_key`. Pods receive **`S3COMPAT_OBJECT_STORE_ENDPOINT_URL`** (when endpoint is set), **`S3COMPAT_OBJECT_STORE_ACCESS_KEY`**, and **`S3COMPAT_OBJECT_STORE_SECRET_KEY`** from the operator. + +## Adding new S3-compatible backends + +Any storage that exposes an S3-compatible API (e.g. Ceph, DigitalOcean Spaces) can be used by using **`s3compat://bucket`** with the appropriate `endpoint` and `secretRef`. No new client code or scheme is required; `minio://` and `seaweedfs://` remain as optional aliases for clarity. diff --git a/docs/configuration/storage-artifacts.md b/docs/configuration/storage-artifacts.md index 58ae8f9..4584e28 100644 --- a/docs/configuration/storage-artifacts.md +++ b/docs/configuration/storage-artifacts.md @@ -6,10 +6,17 @@ The Splunk AI team has provided global artifact storage in a publicly readable S ## Prerequisites Utilizing the AI Platform requires one of the following remote storage providers: - * An Amazon S3 or S3-API-compliant remote object storage location + * **AWS S3** – Native Amazon S3 (use path scheme `s3://`) + * **MinIO** – S3-compatible, in-cluster or external (use path scheme `s3compat://` or `minio://` with endpoint and credentials) + * **SeaweedFS** – S3-compatible (use path scheme `s3compat://` or `seaweedfs://` with endpoint and credentials) + * Any other **S3-API-compatible** storage (use `s3compat://` with endpoint and secretRef; `minio://` and `seaweedfs://` are optional aliases) * Azure blob storage * GCP Cloud Storage +### Object storage selection + +The operator chooses the backend **by the path scheme** in `spec.objectStorage.path`. 
Use `s3://` for AWS S3 only; use `s3compat://` (or `minio://` / `seaweedfs://` as aliases) with `endpoint` and `secretRef` for MinIO, SeaweedFS, or any S3-compatible backend. See [Object Storage Selection](object-storage.md) for the full decision table, path schemes, and YAML examples. + ### Prerequisites common to all remote storage providers * Read-write access to the path used to host the files. * Connections to the remote object storage endpoint need to be secured using a minimum version of TLS 1.2. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 871dbc7..57854d5 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -196,6 +196,167 @@ kubectl logs -l ray.io/node-type=worker -n | grep - `CUDA_VISIBLE_DEVICES is set to empty string` → GPU configuration issue - `RuntimeError: CUDA out of memory` → Increase GPU resources +#### "Invalid repository ID or local directory" (e.g. Llama31Instruct / VLLMTextGenModel) + +If you see a validation error like: + +```text +Invalid repository ID or local directory specified: '/home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct'. +Please verify the following requirements: +1. Provide a valid Hugging Face repository ID. +2. Specify a local directory that contains a recognized configuration file. + - For Hugging Face models: ensure the presence of a 'config.json'. +``` + +the model loader is trying to use a **local path** where the model should have been downloaded from object storage (S3/MinIO). That path is either missing or does not contain the required files (e.g. `config.json`). Common causes: + +1. **Model not in object storage** + The prefix `model_artifacts/llama31-8b-instruct` must exist in your bucket with a full Hugging Face–style layout (including `config.json` and weight files). 
+ - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh`
+ - Upload to MinIO/S3-compatible: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, and credentials as in the [artifacts README](../tools/artifacts_download_upload_scripts/README.md); `MINIO_*` env vars are also accepted).
+
+2. **Ray workers cannot reach MinIO/S3**
+ - For **external MinIO** (e.g. EC2): ensure the MinIO endpoint in `cluster-config.yaml` (`storage.minio.endpoint`) is reachable from EKS (security groups, VPC, and if using a public IP, that nodes can egress to it).
+ - From a Ray worker pod:
+ `kubectl exec -it <worker-pod> -n <namespace> -- env | grep -E 'OBJECT_STORE|ARTIFACTS|S3'`
+ then test connectivity (e.g. curl to the object store endpoint or use the same client the SDK uses).
+
+3. **Wrong or missing credentials**
+ AIPlatform must have `objectStorage.secretRef` pointing to a secret with `s3_access_key` and `s3_secret_key` (the operator passes these as `S3COMPAT_OBJECT_STORE_ACCESS_KEY` / `S3COMPAT_OBJECT_STORE_SECRET_KEY` to Ray). Verify the secret exists and matches the S3-compatible account that can read the bucket:
+ - `kubectl get secret minio-credentials -n <namespace> -o jsonpath='{.data}'`
+
+4. **Bucket/prefix mismatch**
+ The bucket name in AIPlatform `objectStorage.path` (e.g. `minio://<bucket>`) and the prefix in the application config (`model_artifacts/llama31-8b-instruct`) must match where you uploaded the model.
+
+**Quick checks:**
+
+- List objects in the object store for the model prefix (from a host with `mc` or AWS CLI configured):
+ - `mc ls myminio/<bucket>/model_artifacts/llama31-8b-instruct/`
+ You should see at least `config.json` and the model weight files. 
+- From a Ray worker pod, confirm env vars and that the path is writable: + - `kubectl exec -it -n -- ls -la /home/ray/.cache/s3/artifacts/model_artifacts/ 2>/dev/null || echo "path missing or empty"` + If the directory is missing or empty, the download from object storage failed (network, credentials, or missing objects). + +**Full reset when the deployment keeps failing (e.g. Llama31Instruct / LLMDeploymentL40S):** + +If the model is correct in object storage and credentials are in the serve config but the replica still fails with "Invalid repository ID or local directory", clear the artifact cache and restart Ray so replicas run a fresh download and load. + +1. **Clear the artifact cache on all workers** + Either remove only the failing model prefix or the entire `model_artifacts` tree (more thorough): + + ```bash + export AI_NS="${AI_NS:-ai-platform}" + + # Option A: clear only the failing model (e.g. llama31-8b-instruct) + for p in $(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[*].metadata.name}'); do + kubectl exec -n "$AI_NS" "$p" -c ray-worker -- rm -rf /home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct + done + + # Option B: clear entire model_artifacts (use if multiple models or unknown state) + for p in $(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[*].metadata.name}'); do + kubectl exec -n "$AI_NS" "$p" -c ray-worker -- rm -rf /home/ray/.cache/s3/artifacts/model_artifacts + done + ``` + +2. **Restart worker pods** so new replicas run and download from object storage: + + ```bash + kubectl delete pods -n "$AI_NS" -l ray.io/node-type=worker + ``` + +3. **Optional: restart the Ray head** to force a full Ray Serve redeploy (new replica placement and startup): + + ```bash + kubectl delete pod -n "$AI_NS" -l ray.io/node-type=head + ``` + +4. **Wait 10–15 minutes** for workers (and head) to be Running and for the deployment replica to download the model and start. 
The first download can be large (e.g. ~16 GB for Llama 3.1 8B); if the replica is restarted too soon (e.g. after a few quick failures), the download may never complete. + +5. **Verify** the deployment status and, if needed, that a worker has the model: + + ```bash + kubectl get rayservice -n "$AI_NS" -o yaml | grep -A 30 'Llama31Instruct:' + WORKER=$(kubectl get pods -n "$AI_NS" -l ray.io/node-type=worker -o jsonpath='{.items[0].metadata.name}') + kubectl exec -n "$AI_NS" "$WORKER" -c ray-worker -- sh -c 'ls /home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct/*.safetensors 2>/dev/null || echo "No safetensors"' + ``` + +### Object store credentials and serve config verification + +When using S3-compatible object storage (MinIO, SeaweedFS, etc.), the operator injects credentials from the object storage secret into the Ray Serve config so replicas can download model artifacts. Use these steps to verify the secret and that the updated serve config is applied. + +**1. Check that the AIPlatform object storage secret exists and has the required keys** + +Replace `` with your AIPlatform namespace (e.g. `ai-platform`) and `` with the value of `spec.objectStorage.secretRef` from your AIPlatform (e.g. `minio-credentials`). 
+
+```bash
+# Get AIPlatform namespace and secretRef (optional: discover from the CR)
+kubectl get aiplatform -A -o custom-columns=NAME:.metadata.name,NS:.metadata.namespace,SECRET:.spec.objectStorage.secretRef
+
+# Confirm the secret exists in the same namespace as the AIPlatform
+kubectl get secret <secret-name> -n <namespace>
+
+# List secret keys (names only; values are base64-encoded and must not be logged)
+kubectl get secret <secret-name> -n <namespace> -o jsonpath='{.data}' | jq -r 'keys[]'
+
+# Verify required keys are present (expect s3_access_key and s3_secret_key)
+kubectl get secret <secret-name> -n <namespace> -o jsonpath='{.data}' | jq -r 'keys[]' | grep -E 's3_access_key|s3_secret_key'
+```
+
+If either `s3_access_key` or `s3_secret_key` is missing, create or update the secret, for example:
+
+```bash
+kubectl -n <namespace> create secret generic <secret-name> \
+ --from-literal=s3_access_key="<access-key>" \
+ --from-literal=s3_secret_key="<secret-key>" \
+ --dry-run=client -o yaml | kubectl apply -f -
+```
+
+**2. Reconcile or restart the operator with the new image**
+
+After updating the operator image (with the change that injects object store credentials into the serve config), either trigger a reconcile or restart the operator so it rewrites `RayService.spec.serveConfigV2`.
+
+- **Option A – Restart the operator deployment** (simplest; causes one reconcile when the pod comes back):
+
+ ```bash
+ # Replace <operator-namespace> with the namespace where the operator runs (e.g. splunk-ai-operator-system)
+ kubectl rollout restart deployment splunk-ai-operator-controller-manager -n <operator-namespace>
+ kubectl rollout status deployment splunk-ai-operator-controller-manager -n <operator-namespace>
+ ```
+
+- **Option B – Trigger reconcile by touching the AIPlatform** (no operator restart):
+
+ ```bash
+ kubectl annotate aiplatform <aiplatform-name> -n <namespace> \
+ reconcile-$(date +%s)=triggered --overwrite
+ ```
+
+ The operator will reconcile and regenerate the RayService; ensure the operator is already running the new image before doing this.
+
+**3. 
Confirm RayService.spec.serveConfigV2 includes S3COMPAT_OBJECT_STORE_ACCESS_KEY and S3COMPAT_OBJECT_STORE_SECRET_KEY** + +The serve config is a JSON string in `RayService.spec.serveConfigV2`. Check that it contains the object store env vars for the apps (e.g. after the operator has reconciled). + +```bash +# Set your AIPlatform namespace and RayService name (often the same as AIPlatform name, e.g. splunk-ai-stack) +NAMESPACE="" +RAY_SERVICE_NAME="" + +# Count occurrences of S3COMPAT_OBJECT_STORE_ACCESS_KEY in the serve config (expect > 0 when using S3-compatible storage) +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | jq -Rs 'split("S3COMPAT_OBJECT_STORE_ACCESS_KEY") | length - 1' + +# Show a snippet to confirm the keys are present (values are redacted in output) +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"S3COMPAT_OBJECT_STORE_ACCESS_KEY"[^,]*' | head -1 +kubectl get rayservice "$RAY_SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.serveConfigV2}' | grep -o '"S3COMPAT_OBJECT_STORE_SECRET_KEY"[^,]*' | head -1 +``` + +If the count is 0, the operator may not be using the new image, or `objectStorage.secretRef` may be unset. Ensure: + +- The AIPlatform has `spec.objectStorage.path` with scheme `s3compat://`, `minio://`, or `seaweedfs://` and `spec.objectStorage.secretRef` set to the secret name. +- The secret exists in the AIPlatform namespace and contains `s3_access_key` and `s3_secret_key`. +- The operator deployment has been restarted (or reconciled) with the image that injects object store credentials into the applications template. + +After confirming, restart Ray workers if needed so they pick up the new env (e.g. scale down and up the Ray cluster or wait for rolling restart), then re-check replica logs and the cache path `/home/ray/.cache/s3/artifacts/model_artifacts/...`. 
+ ### Weaviate Errors ```bash diff --git a/go.mod b/go.mod index 8860ea8..e5daf45 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.9.0 github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.1 github.com/aws/aws-sdk-go v1.55.7 - github.com/cert-manager/cert-manager v1.18.0 + github.com/cert-manager/cert-manager v1.18.5 github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.22.2 @@ -31,7 +31,7 @@ require ( ) require ( - cel.dev/expr v0.24.0 // indirect + cel.dev/expr v0.25.1 // indirect cloud.google.com/go v0.121.1 // indirect cloud.google.com/go/auth v0.16.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect @@ -48,11 +48,11 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f // indirect + github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.12.1 // indirect - github.com/envoyproxy/go-control-plane/envoy v1.35.0 // indirect - github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect + github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect + github.com/envoyproxy/protoc-gen-validate v1.3.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.8.0 // indirect @@ -97,7 +97,7 @@ require ( github.com/stoewer/go-strcase v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/contrib/detectors/gcp v1.38.0 // indirect + go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect 
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect go.opentelemetry.io/otel v1.40.0 // indirect @@ -124,7 +124,7 @@ require ( google.golang.org/genproto v0.0.0-20250505200425-f936aa4a68b2 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 // indirect - google.golang.org/grpc v1.78.0 // indirect + google.golang.org/grpc v1.79.3 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index c6c9fa9..7021646 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= -cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= +cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= cloud.google.com/go v0.121.1 h1:S3kTQSydxmu1JfLRLpKtxRPA7rSrYPRPEUmL/PavVUw= cloud.google.com/go v0.121.1/go.mod h1:nRFlrHq39MNVWu+zESP2PosMWA0ryJw8KUBZ2iZpxbw= cloud.google.com/go/auth v0.16.1 h1:XrXauHMd30LhQYVRHLGvJiYeczweKQXZxsTbV9TiguU= @@ -54,12 +54,12 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= -github.com/cert-manager/cert-manager v1.18.0 h1:v7vxC1Mx5tkDz1oGOAktB88zA6TbGKcmpLM92+AIXRc= -github.com/cert-manager/cert-manager v1.18.0/go.mod h1:icDJx4kG9BCNpGjBvrmsFd99d+lXUvWdkkcrSSQdIiw= +github.com/cert-manager/cert-manager v1.18.5 h1:Gx4FSpSPYcSC4MQf43QjbxDfyTEbwZgfZQs5Lq9QlBs= +github.com/cert-manager/cert-manager v1.18.5/go.mod 
h1:HbPSO5MW/44wu19t84eY/K4c4/WwyPB4bA3uffOH92s= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f h1:Y8xYupdHxryycyPlc9Y+bSQAYZnetRJ70VMVKm5CKI0= -github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f/go.mod h1:HlzOvOjVBOfTGSRXRyY0OiCS/3J1akRGQQpRO/7zyF4= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 h1:6xNmx7iTtyBRev0+D/Tv1FZd4SCg8axKApyNyRsAt/w= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -69,14 +69,14 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/r github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.13.5-0.20251024222203-75eaa193e329 h1:K+fnvUM0VZ7ZFJf0n4L/BRlnsb9pL/GuDG6FqaH+PwM= -github.com/envoyproxy/go-control-plane v0.13.5-0.20251024222203-75eaa193e329/go.mod h1:Alz8LEClvR7xKsrq3qzoc4N0guvVNSS8KmSChGYr9hs= -github.com/envoyproxy/go-control-plane/envoy v1.35.0 h1:ixjkELDE+ru6idPxcHLj8LBVc2bFP7iBytj353BoHUo= -github.com/envoyproxy/go-control-plane/envoy v1.35.0/go.mod h1:09qwbGVuSWWAyN5t/b3iyVfz5+z8QWGrzkoqm/8SbEs= +github.com/envoyproxy/go-control-plane v0.14.0 h1:hbG2kr4RuFj222B6+7T83thSPqLjwBIfQawTkC++2HA= +github.com/envoyproxy/go-control-plane v0.14.0/go.mod 
h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= +github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g= +github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= -github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= -github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= +github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4= +github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA= github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -225,8 +225,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/contrib/detectors/gcp v1.38.0 h1:ZoYbqX7OaA/TAikspPl3ozPI6iY6LiIY9I8cUfm+pJs= -go.opentelemetry.io/contrib/detectors/gcp v1.38.0/go.mod h1:SU+iU7nu5ud4oCb3LQOhIZ3nRLj6FNVrKgtflbaf2ts= +go.opentelemetry.io/contrib/detectors/gcp v1.39.0 h1:kWRNZMsfBHZ+uHjiH4y7Etn2FK26LAGkNFw7RHv1DhE= +go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= 
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 h1:x7wzEgXfnzJcHDwStJT+mxOz4etr2EcexjqhBvmoakw= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= @@ -313,8 +313,8 @@ google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 h1: google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409/go.mod h1:fl8J1IvUjCilwZzQowmw2b7HQB2eAuYBabMXzWurF+I= google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 h1:H86B94AW+VfJWDqFeEbBPhEtHzJwJfTbgE2lZa54ZAQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= -google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc= -google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U= +google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= +google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/helm-chart/splunk-ai-operator/Chart.lock b/helm-chart/splunk-ai-operator/Chart.lock index 0136098..e80b049 100644 --- a/helm-chart/splunk-ai-operator/Chart.lock +++ b/helm-chart/splunk-ai-operator/Chart.lock @@ -13,6 +13,6 @@ dependencies: version: 72.4.0 - name: splunk-operator repository: https://splunk.github.io/splunk-operator - version: 3.0.0 -digest: sha256:41032e66994109109208bc66b07b6f10890c9c8dafe019aa480d73d4effe915a -generated: "2025-12-11T11:23:06.233099-08:00" + version: 
3.1.0 +digest: sha256:bc5e962d5c6b465b26a13a91660d7fa45687c394e124abe2beb96e4a2e3760df +generated: "2026-03-21T00:24:00.448397+05:30" diff --git a/helm-chart/splunk-ai-operator/Chart.yaml b/helm-chart/splunk-ai-operator/Chart.yaml index 782cb8e..6101ae9 100644 --- a/helm-chart/splunk-ai-operator/Chart.yaml +++ b/helm-chart/splunk-ai-operator/Chart.yaml @@ -86,6 +86,6 @@ dependencies: # Splunk Operator - Required for managing Splunk Enterprise instances - name: splunk-operator - version: "3.0.0" + version: "3.1.0" repository: "https://splunk.github.io/splunk-operator" condition: splunk-operator.enabled diff --git a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml index 98675dc..25bf11b 100644 --- a/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml +++ b/helm-chart/splunk-ai-operator/crds/ai.splunk.com_aiplatforms.yaml @@ -2222,20 +2222,33 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models - Supported providers: S3, GCS, Azure Blob Storage, MinIO + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. + Supported: AWS S3, MinIO, SeaweedFS, any S3-compatible (s3:// + endpoint), GCS, Azure Blob. + Backend is selected by path scheme; when endpoint is set with s3://, backend is S3-compatible. properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS). + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible. 
pattern: ^https?://.*$ type: string path: description: |- - Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + Remote volume URI: s3://bucket/prefix, gs://bucket/prefix, azure://container/prefix, + minio://bucket/prefix, or seaweedfs://bucket/prefix + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -2243,7 +2256,7 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. s3_access_key, s3_secret_key for S3-compatible) maxLength: 253 minLength: 1 type: string diff --git a/helm-chart/splunk-ai-operator/templates/deployment.yaml b/helm-chart/splunk-ai-operator/templates/deployment.yaml index 579e800..34ed56a 100644 --- a/helm-chart/splunk-ai-operator/templates/deployment.yaml +++ b/helm-chart/splunk-ai-operator/templates/deployment.yaml @@ -40,7 +40,7 @@ spec: {{- toYaml .Values.securityContext | nindent 8 }} containers: - name: manager - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion | default "latest" }}" + image: "{{ if .Values.image.digest }}{{ .Values.image.repository }}@{{ .Values.image.digest }}{{ else }}{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion | default "latest" }}{{ end }}" imagePullPolicy: {{ .Values.image.pullPolicy }} args: - --metrics-bind-address=:8443 diff --git a/internal/webhook/v1/aiplatform_webhook.go b/internal/webhook/v1/aiplatform_webhook.go index 
7e8ceb2..674471f 100644 --- a/internal/webhook/v1/aiplatform_webhook.go +++ b/internal/webhook/v1/aiplatform_webhook.go @@ -195,14 +195,7 @@ func (v *AIPlatformCustomValidator) ValidateUpdate(ctx context.Context, oldObj, warnings = append(warnings, createWarnings...) } - // Validate immutable fields - if oldPlatform.Spec.ObjectStorage.Path != aiplatform.Spec.ObjectStorage.Path { - allErrs = append(allErrs, field.Forbidden( - field.NewPath("spec").Child("objectStorage").Child("path"), - "objectStorage.path is immutable", - )) - } - + // Validate immutable fields (path is mutable to allow switching storage backends, e.g. MinIO to SeaweedFS) if oldPlatform.Spec.ObjectStorage.Region != aiplatform.Spec.ObjectStorage.Region { allErrs = append(allErrs, field.Forbidden( field.NewPath("spec").Child("objectStorage").Child("region"), @@ -237,8 +230,8 @@ func (v *AIPlatformCustomValidator) validateObjectStorage(objStorage *aiv1.Objec if objStorage.Path == "" { allErrs = append(allErrs, field.Required(fldPath.Child("path"), "objectStorage.path must be specified")) } else { - // Validate path format (s3://, gs://, azure://, minio://) - validPrefixes := []string{"s3://", "gs://", "azure://", "minio://"} + // Validate path format (s3://, gs://, azure://, s3compat://, minio://, seaweedfs://) + validPrefixes := []string{"s3://", "gs://", "azure://", "s3compat://", "minio://", "seaweedfs://"} hasValidPrefix := false for _, prefix := range validPrefixes { if strings.HasPrefix(objStorage.Path, prefix) { @@ -250,7 +243,7 @@ func (v *AIPlatformCustomValidator) validateObjectStorage(objStorage *aiv1.Objec allErrs = append(allErrs, field.Invalid( fldPath.Child("path"), objStorage.Path, - "path must start with s3://, gs://, azure://, or minio://", + "path must start with s3://, gs://, azure://, s3compat://, minio://, or seaweedfs://", )) } } diff --git a/internal/webhook/v1/aiservice_webhook.go b/internal/webhook/v1/aiservice_webhook.go index 69a0f46..7d81d77 100644 --- 
a/internal/webhook/v1/aiservice_webhook.go +++ b/internal/webhook/v1/aiservice_webhook.go @@ -275,7 +275,7 @@ func (v *AIServiceCustomValidator) validateTaskVolume(taskVolume *aiv1.ObjectSto } else { // Validate path format /* - validPrefixes := []string{"s3://", "gs://", "azure://", "minio://"} + validPrefixes := []string{"s3://", "gs://", "azure://", "s3compat://", "minio://", "seaweedfs://"} hasValidPrefix := false for _, prefix := range validPrefixes { if strings.HasPrefix(taskVolume.Path, prefix) { @@ -287,7 +287,7 @@ func (v *AIServiceCustomValidator) validateTaskVolume(taskVolume *aiv1.ObjectSto allErrs = append(allErrs, field.Invalid( fldPath.Child("path"), taskVolume.Path, - "path must start with s3://, gs://, azure://, or minio://", + "path must start with s3://, gs://, azure://, s3compat://, minio://, or seaweedfs://", )) } */ diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index b3106b1..80ae6fe 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -162,17 +162,17 @@ func (r *SaiaReconciler) validateAIService( return fmt.Errorf("VectorDbUrl must be set (either from AIPlatformRef or explicitly)") } - // Default resources + // Default resources — SAIA API needs headroom beyond 2Gi or the kubelet OOMKills during startup. 
if ai.Spec.Resources.Requests == nil { ai.Spec.Resources.Requests = corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1Gi"), + corev1.ResourceMemory: resource.MustParse("2Gi"), } } if ai.Spec.Resources.Limits == nil { ai.Spec.Resources.Limits = corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("1"), - corev1.ResourceMemory: resource.MustParse("2Gi"), + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), } } if ai.Spec.TaskVolume.Path == "" { @@ -619,16 +619,16 @@ func (r *SaiaReconciler) reconcileSAIADeployment( {Name: "S3_BUCKET", Value: extractBucketName(ai.Spec.TaskVolume.Path)}, } - // MinIO support: Add MinIO-specific environment variables if endpoint is configured - if strings.HasPrefix(ai.Spec.TaskVolume.Path, "minio") && ai.Spec.TaskVolume.Endpoint != "" { - env = append(env, corev1.EnvVar{Name: "MINIO_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}) + // S3-compatible object store: set S3COMPAT_OBJECT_STORE_ENDPOINT_URL for custom endpoint (MinIO, SeaweedFS, etc.). + if ai.Spec.TaskVolume.Endpoint != "" { + env = append(env, corev1.EnvVar{Name: "S3COMPAT_OBJECT_STORE_ENDPOINT_URL", Value: ai.Spec.TaskVolume.Endpoint}) } - // MinIO credentials: If secretRef is provided, add MINIO_ACCESS_KEY and MINIO_SECRET_KEY from secret + // S3-compatible object store credentials from secretRef (S3COMPAT_OBJECT_STORE_ACCESS_KEY, S3COMPAT_OBJECT_STORE_SECRET_KEY). 
if ai.Spec.TaskVolume.SecretRef != "" { env = append(env, corev1.EnvVar{ - Name: "MINIO_ACCESS_KEY", + Name: "S3COMPAT_OBJECT_STORE_ACCESS_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, @@ -637,7 +637,7 @@ func (r *SaiaReconciler) reconcileSAIADeployment( }, }, corev1.EnvVar{ - Name: "MINIO_SECRET_KEY", + Name: "S3COMPAT_OBJECT_STORE_SECRET_KEY", ValueFrom: &corev1.EnvVarSource{ SecretKeyRef: &corev1.SecretKeySelector{ LocalObjectReference: corev1.LocalObjectReference{Name: ai.Spec.TaskVolume.SecretRef}, @@ -932,14 +932,16 @@ func (r *SaiaReconciler) createOrUpdateConfigMap( } // extractBucketName extracts the bucket name from an object storage path. -// Supports s3://, minio://, gs://, and azure:// prefixes. +// Supports s3://, s3compat://, minio://, seaweedfs://, gs://, and azure:// prefixes. // Examples: // - "s3://my-bucket/path/to/dir" -> "my-bucket" +// - "s3compat://bucket-name" -> "bucket-name" // - "minio://bucket-name" -> "bucket-name" +// - "seaweedfs://my-bucket/prefix" -> "my-bucket" // - "gs://my-bucket" -> "my-bucket" func extractBucketName(path string) string { // Remove supported prefixes - prefixes := []string{"s3://", "minio://", "gs://", "azure://"} + prefixes := []string{"s3://", "s3compat://", "minio://", "seaweedfs://", "gs://", "azure://"} for _, prefix := range prefixes { if strings.HasPrefix(path, prefix) { path = strings.TrimPrefix(path, prefix) diff --git a/pkg/ai/raybuilder/builder.go b/pkg/ai/raybuilder/builder.go index e29a1a7..3f2ed32 100644 --- a/pkg/ai/raybuilder/builder.go +++ b/pkg/ai/raybuilder/builder.go @@ -44,9 +44,16 @@ type Builder struct { } type ApplicationParams struct { - ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` - CloudProvider string `yaml:"CLOUD_PROVIDER"` - Replicas map[string]int32 `yaml:"REPLICAS"` + ArtifactBucketName string `yaml:"ARTIFACTS_S3_BUCKET"` + ArtifactsProvider string 
`yaml:"ARTIFACTS_PROVIDER"` + CloudProvider string `yaml:"CLOUD_PROVIDER"` + S3CompatObjectStoreEndpointUrl string `yaml:"S3COMPAT_OBJECT_STORE_ENDPOINT_URL"` + S3CompatObjectStoreAccessKey string `yaml:"S3COMPAT_OBJECT_STORE_ACCESS_KEY"` + S3CompatObjectStoreSecretKey string `yaml:"S3COMPAT_OBJECT_STORE_SECRET_KEY"` + Replicas map[string]int32 `yaml:"REPLICAS"` + WorkingDirBase string `yaml:"WORKING_DIR_BASE"` + ModelVersion string `yaml:"MODEL_VERSION"` + AcceleratorType string `yaml:"ACCELERATOR_TYPE"` } type WorkerConfigs map[string][]InstanceDetail @@ -73,6 +80,34 @@ func New(ai *enterpriseApi.AIPlatform, client client.Client, scheme *runtime.Sch } } +// effectiveAcceleratorType returns spec.defaultAcceleratorType or L40S when unset, matching instance.yaml keys (L40S, H100_NVL). +func (b *Builder) effectiveAcceleratorType() string { + if s := strings.TrimSpace(b.ai.Spec.DefaultAcceleratorType); s != "" { + return s + } + return "L40S" +} + +// rayWorkingDirBase builds the base URI for runtime_env.working_dir application zips. +// +// Ray's Serve config rejects plain http:// for remote working_dir URIs; allowed schemes include +// s3 and https. We always use s3:// for S3 and S3-compatible backends (AWS, MinIO, SeaweedFS, etc.). +// Ray pods receive AWS_ENDPOINT_URL plus AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (when applicable) +// from rayS3DownloadEnv; modern boto3/botocore honor AWS_ENDPOINT_URL for the S3 client used to +// fetch runtime_env packages. +// +// For GCS we use gs:// (scheme may be gs or gcs in objectStorage.path). 
+func rayWorkingDirBase(scheme, bucket string) string { + switch strings.ToLower(scheme) { + case "s3", "s3compat", "minio", "seaweedfs": + return fmt.Sprintf("s3://%s/ray-services/ai-platform/applications", bucket) + case "gs", "gcs": + return fmt.Sprintf("gs://%s/ray-services/ai-platform/applications", bucket) + default: + return fmt.Sprintf("%s://%s/ray-services/ai-platform/applications", scheme, bucket) + } +} + // --- 7️⃣ ReconcileRayService: build & create/update the RayService CR --- func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPlatform) error { logger := log.FromContext(ctx) // Define logger @@ -89,15 +124,30 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl return err } - // Set CloudProvider based on URL scheme - var cloudProvider string + // Set CloudProvider and artifacts provider/bucket from URL scheme (for SDK model loaders). + // ARTIFACTS_PROVIDER matches storage client GetProvider(): s3/minio/seaweedfs/s3compat -> "s3", gs/gcs -> "gcs", azure -> "azure". + // S3 (AWS) uses cloudProvider "aws" when no custom endpoint; s3compat/minio/seaweedfs use "s3compat". 
+ var cloudProvider, artifactsProvider string switch u.Scheme { case "s3": - cloudProvider = "aws" - case "gs": + if p.Spec.ObjectStorage.Endpoint != "" { + cloudProvider = "s3compat" + } else { + cloudProvider = "aws" + } + artifactsProvider = "s3" + case "s3compat", "minio", "seaweedfs": + cloudProvider = "s3compat" + artifactsProvider = "s3" + case "gs", "gcs": cloudProvider = "gcp" + artifactsProvider = "gcs" + case "azure": + cloudProvider = "azure" + artifactsProvider = "azure" default: - cloudProvider = "azure" // TODO: FIX THIS, need to support minio + cloudProvider = "azure" + artifactsProvider = "azure" } // Initialize the replicas map by iterating through features @@ -135,10 +185,43 @@ func (b *Builder) ReconcileRayService(ctx context.Context, p *enterpriseApi.AIPl } } + // S3-compatible backends (s3compat, minio, seaweedfs) need custom endpoint and credentials. S3 (AWS) uses region/IRSA only. + s3CompatScheme := (u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs") + s3CompatObjectStoreEndpoint := "" + if s3CompatScheme && p.Spec.ObjectStorage.Endpoint != "" { + s3CompatObjectStoreEndpoint = p.Spec.ObjectStorage.Endpoint + } + + var s3CompatObjectStoreAccessKey, s3CompatObjectStoreSecretKey string + if p.Spec.ObjectStorage.SecretRef != "" && s3CompatScheme { + var secret corev1.Secret + secretRef := types.NamespacedName{Namespace: p.Namespace, Name: p.Spec.ObjectStorage.SecretRef} + if err := b.Get(ctx, secretRef, &secret); err != nil { + logger.Error(err, "Failed to get object storage secret for S3-compatible credentials", "secret", p.Spec.ObjectStorage.SecretRef) + return err + } + if raw, ok := secret.Data["s3_access_key"]; ok { + s3CompatObjectStoreAccessKey = string(raw) + } + if raw, ok := secret.Data["s3_secret_key"]; ok { + s3CompatObjectStoreSecretKey = string(raw) + } + } + + // Build working_dir base (s3:// or gs://; see rayWorkingDirBase). 
+ workingDirBase := rayWorkingDirBase(u.Scheme, u.Host) + param := ApplicationParams{ - ArtifactBucketName: u.Host, - CloudProvider: cloudProvider, - Replicas: replicasMap, + ArtifactBucketName: u.Host, + ArtifactsProvider: artifactsProvider, + CloudProvider: cloudProvider, + S3CompatObjectStoreEndpointUrl: s3CompatObjectStoreEndpoint, + S3CompatObjectStoreAccessKey: s3CompatObjectStoreAccessKey, + S3CompatObjectStoreSecretKey: s3CompatObjectStoreSecretKey, + Replicas: replicasMap, + WorkingDirBase: workingDirBase, + ModelVersion: os.Getenv("MODEL_VERSION"), + AcceleratorType: b.effectiveAcceleratorType(), } // Use embedded applications.yaml content @@ -578,6 +661,7 @@ func (b *Builder) Build(ctx context.Context) (*rayv1.RayService, error) { } func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec, error) { + acceleratorType := b.effectiveAcceleratorType() annotations, labels := buildHeadAnnotationsAndLabels(b.ai) head := rayv1.HeadGroupSpec{ RayStartParams: map[string]string{ @@ -628,7 +712,7 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec if err != nil { return nil, fmt.Errorf("failed to parse feature YAML file %s: %v", fileName, err) } - for k, val := range featureConfig.InstanceScale[b.ai.Spec.DefaultAcceleratorType] { + for k, val := range featureConfig.InstanceScale[acceleratorType] { old_val, ok := instanceScale[k] if ok { instanceScale[k] = old_val + val @@ -639,17 +723,23 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec } var workers []rayv1.WorkerGroupSpec - var gpuConfigs = instanceMap[b.ai.Spec.DefaultAcceleratorType] + gpuConfigs := instanceMap[acceleratorType] + if len(gpuConfigs) == 0 { + return nil, fmt.Errorf("instance.yaml has no worker tiers for defaultAcceleratorType %q; keys must match exactly (e.g. 
L40S, H100_NVL)", acceleratorType) + } for _, cfg := range gpuConfigs { annotations, labels := buildWorkerAnnotationsAndLabels(b.ai, cfg) cpuLimit := cfg.Resources.Limits[corev1.ResourceCPU] + replicas := instanceScale[cfg.Tier] wg := rayv1.WorkerGroupSpec{ - GroupName: cfg.Tier, - Replicas: int32Ptr(instanceScale[cfg.Tier]), + GroupName: cfg.Tier, + Replicas: int32Ptr(replicas), + MinReplicas: int32Ptr(replicas), + MaxReplicas: int32Ptr(replicas + 5), RayStartParams: map[string]string{ "num-cpus": cpuLimit.String(), - "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, b.ai.Spec.DefaultAcceleratorType, cfg.GPUsPerPod), + "resources": fmt.Sprintf(`"{\"accelerator_type:%s\":1,\"gpu_count:%d\":1}"`, acceleratorType, cfg.GPUsPerPod), }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ @@ -662,15 +752,100 @@ func (b *Builder) buildClusterConfig(ctx context.Context) (*rayv1.RayClusterSpec workers = append(workers, wg) } + idleTimeout := int32Ptr(600) return &rayv1.RayClusterSpec{ RayVersion: os.Getenv("RAY_VERSION"), EnableInTreeAutoscaling: boolPtr(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{IdleTimeoutSeconds: idleTimeout}, HeadGroupSpec: head, WorkerGroupSpecs: workers, }, nil } +// objectStorageSecretEnv returns env vars for S3COMPAT_OBJECT_STORE_ACCESS_KEY and S3COMPAT_OBJECT_STORE_SECRET_KEY from +// the objectStorage secret (s3_access_key/s3_secret_key) for S3-compatible object storage. 
+func (b *Builder) objectStorageSecretEnv() []corev1.EnvVar { + if b.ai.Spec.ObjectStorage.SecretRef == "" { + return nil + } + secretName := b.ai.Spec.ObjectStorage.SecretRef + return []corev1.EnvVar{ + { + Name: "S3COMPAT_OBJECT_STORE_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, + Key: "s3_access_key", + }, + }, + }, + { + Name: "S3COMPAT_OBJECT_STORE_SECRET_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, + Key: "s3_secret_key", + }, + }, + }, + } +} + +// rayS3DownloadEnv sets AWS_* variables so application code and Ray's runtime_env S3 fetch use the +// configured S3-compatible endpoint (via AWS_ENDPOINT_URL) and credentials when present. +func (b *Builder) rayS3DownloadEnv() []corev1.EnvVar { + u, err := url.Parse(b.ai.Spec.ObjectStorage.Path) + if err != nil { + return nil + } + endpoint := strings.TrimSpace(b.ai.Spec.ObjectStorage.Endpoint) + s3CompatScheme := u.Scheme == "s3compat" || u.Scheme == "minio" || u.Scheme == "seaweedfs" + s3WithCustomEndpoint := u.Scheme == "s3" && endpoint != "" + if (!s3CompatScheme && !s3WithCustomEndpoint) || endpoint == "" { + return nil + } + var out []corev1.EnvVar + out = append(out, corev1.EnvVar{Name: "AWS_ENDPOINT_URL", Value: endpoint}) + if r := strings.TrimSpace(b.ai.Spec.ObjectStorage.Region); r != "" { + out = append(out, + corev1.EnvVar{Name: "AWS_DEFAULT_REGION", Value: r}, + corev1.EnvVar{Name: "AWS_REGION", Value: r}, + ) + } + if b.ai.Spec.ObjectStorage.SecretRef == "" { + return out + } + sn := b.ai.Spec.ObjectStorage.SecretRef + out = append(out, + corev1.EnvVar{ + Name: "AWS_ACCESS_KEY_ID", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_access_key", + }, + }, + }, + corev1.EnvVar{ + 
Name: "AWS_SECRET_ACCESS_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: sn}, + Key: "s3_secret_key", + }, + }, + }, + ) + return out +} + func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { + headEnv := []corev1.EnvVar{ + {Name: "DEFAULT_GPU_TYPE", Value: b.effectiveAcceleratorType()}, + {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME + } + headEnv = append(headEnv, b.rayS3DownloadEnv()...) + headEnv = append(headEnv, b.objectStorageSecretEnv()...) spec := corev1.PodSpec{ Containers: []corev1.Container{{ Name: "ray-head", @@ -684,10 +859,7 @@ func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { "-lc", "--", }, - Env: []corev1.EnvVar{ - {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, - {Name: "CLUSTER_NAME", Value: "ai-platform-models"}, // FIXME - }, + Env: headEnv, Lifecycle: &corev1.Lifecycle{ PreStop: &corev1.LifecycleHandler{ Exec: &corev1.ExecAction{ @@ -756,13 +928,13 @@ func (b *Builder) makeHeadTemplate() corev1.PodTemplateSpec { func (b *Builder) makeWorkerTemplate(cfg InstanceDetail) corev1.PodTemplateSpec { defaultEnv := []corev1.EnvVar{ - {Name: "DEFAULT_GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, + {Name: "DEFAULT_GPU_TYPE", Value: b.effectiveAcceleratorType()}, {Name: "RAY_HEAD_SERVICE_HOST", Value: fmt.Sprintf("%s.%s.svc.%s", b.ai.Name+"-head-svc", b.ai.Namespace, os.Getenv("CLUSTER_DOMAIN"))}, {Name: "SERVICE_NAME", Value: b.ai.Name}, {Name: "SERVICE_INTERNAL_NAME", Value: b.ai.Name}, {Name: "USE_SYSTEM_PERMISSIONS", Value: "true"}, {Name: "GPG_PUBLICKEY_PATH", Value: "kv-splunk/al-platform.ray-worker-sa/gpgkey"}, // FIXME - {Name: "GPU_TYPE", Value: b.ai.Spec.DefaultAcceleratorType}, // FIXME + {Name: "GPU_TYPE", Value: b.effectiveAcceleratorType()}, // FIXME } // Combine defaultEnv with cfg.Env to create combinedEnv @@ -783,11 +955,14 @@ func (b *Builder) makeWorkerTemplate(cfg 
InstanceDetail) corev1.PodTemplateSpec combinedEnv = append(combinedEnv, corev1.EnvVar{Name: key, Value: value}) } } + // S3-compatible: boto3 for Ray runtime_env working_dir + app-level S3COMPAT_* keys + combinedEnv = append(combinedEnv, b.rayS3DownloadEnv()...) + combinedEnv = append(combinedEnv, b.objectStorageSecretEnv()...) rayCommand := fmt.Sprintf(`echo %s worker; ulimit -n 65536; export PATH="/home/ray/anaconda3/bin:$PATH"; KUBERAY_GEN_RAY_START_CMD=$(echo $KUBERAY_GEN_RAY_START_CMD | sed -e 's/"{/{/g' -e 's/}"/}/g' -e 's/\\\"/"/g'); - $KUBERAY_GEN_RAY_START_CMD;`, cfg.Tier) + $KUBERAY_GEN_RAY_START_CMD`, cfg.Tier) spec := corev1.PodSpec{ Affinity: b.ai.Spec.GPUSchedulingSpec.Affinity, Tolerations: b.ai.Spec.GPUSchedulingSpec.Tolerations, diff --git a/pkg/ai/raybuilder/builder_test.go b/pkg/ai/raybuilder/builder_test.go index 394d700..e5a1120 100644 --- a/pkg/ai/raybuilder/builder_test.go +++ b/pkg/ai/raybuilder/builder_test.go @@ -498,3 +498,4 @@ func TestSetImageRegistry(t *testing.T) { }) } } + diff --git a/pkg/ai/reconciler.go b/pkg/ai/reconciler.go index 3230af1..9e2a803 100644 --- a/pkg/ai/reconciler.go +++ b/pkg/ai/reconciler.go @@ -131,6 +131,9 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * svc.Namespace = platform.Namespace _, err := controllerutil.CreateOrUpdate(ctx, r.Client, &svc, func() error { + // After client Get, svc holds the live AIService (empty on first create). + preservedResources := svc.Spec.Resources + // Ensure ownership if err := controllerutil.SetControllerReference(platform, &svc, r.Scheme); err != nil { return err @@ -142,6 +145,12 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * // Copy desired spec svc.Spec = built.Spec + // buildAIService does not set Resources; without this, every AIPlatform reconcile + // wipes kubectl patches / user-set limits (e.g. SAIA memory) back to empty → 2Gi defaults. 
+ if resourceRequirementsNonEmpty(preservedResources) { + svc.Spec.Resources = preservedResources + } + // Merge labels if svc.Labels == nil { svc.Labels = map[string]string{} @@ -189,6 +198,10 @@ func (r *AIPlatformReconciler) ReconcileFeatures(ctx context.Context, platform * return nil } +func resourceRequirementsNonEmpty(r corev1.ResourceRequirements) bool { + return len(r.Requests) > 0 || len(r.Limits) > 0 +} + func (r *AIPlatformReconciler) buildAIService(ctx context.Context, platform *aiApi.AIPlatform, feature aiApi.FeatureSpec, name string) *aiApi.AIService { vectorDbUrl := platform.Status.VectorDbServiceName diff --git a/pkg/storage/azure.go b/pkg/storage/azure.go index fa5f0ba..abbde0c 100644 --- a/pkg/storage/azure.go +++ b/pkg/storage/azure.go @@ -31,6 +31,9 @@ func NewAzureClient( namespace, container, prefix string, vs ai.ObjectStorageSpec, ) (StorageClient, error) { + if container == "" { + return nil, fmt.Errorf("Azure Blob storage requires a container name; use path format azure://container-name/prefix (e.g. azure://my-container/model_artifacts). Without it, model deployments fail with 'Please specify a container name'") + } var cred azcore.TokenCredential var err error diff --git a/pkg/storage/minio.go b/pkg/storage/minio.go index f55a4ba..d8a2abd 100644 --- a/pkg/storage/minio.go +++ b/pkg/storage/minio.go @@ -3,44 +3,17 @@ package storage import ( "context" - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/credentials" - "github.com/aws/aws-sdk-go/aws/session" - "github.com/aws/aws-sdk-go/service/s3" ai "github.com/splunk/splunk-ai-operator/api/v1" - corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) +// NewMinioClient creates a StorageClient for MinIO (S3-compatible). It delegates to NewS3CompatibleClient. +// Deprecated: Prefer NewS3CompatibleClient for MinIO, SeaweedFS, or any S3-compatible backend. 
func NewMinioClient( ctx context.Context, k8sClient client.Client, namespace, bucket, prefix string, vs ai.ObjectStorageSpec, ) (StorageClient, error) { - awsCfg := &aws.Config{ - Endpoint: aws.String(vs.Endpoint), - Region: aws.String(vs.Region), - S3ForcePathStyle: aws.Bool(true), - } - if vs.SecretRef != "" { - secret := &corev1.Secret{} - if err := k8sClient.Get(ctx, - client.ObjectKey{Namespace: namespace, Name: vs.SecretRef}, - secret, - ); err != nil { - return nil, err - } - awsCfg.Credentials = credentials.NewStaticCredentials( - string(secret.Data["s3_access_key"]), - string(secret.Data["s3_secret_key"]), - "", - ) - } - // no SecretRef → AWS SDK default chain (IRSA, env, etc) - sess, err := session.NewSession(awsCfg) - if err != nil { - return nil, err - } - return &s3Client{cli: s3.New(sess), bucket: bucket, prefix: prefix}, nil + return NewS3CompatibleClient(ctx, k8sClient, namespace, bucket, prefix, vs) } diff --git a/pkg/storage/s3compat.go b/pkg/storage/s3compat.go new file mode 100644 index 0000000..b50a735 --- /dev/null +++ b/pkg/storage/s3compat.go @@ -0,0 +1,47 @@ +package storage + +import ( + "context" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + ai "github.com/splunk/splunk-ai-operator/api/v1" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// NewS3CompatibleClient creates a StorageClient for any S3-compatible backend (MinIO, SeaweedFS, etc.). +// Endpoint must be set on vs; credentials come from vs.SecretRef (s3_access_key, s3_secret_key) if set. 
+func NewS3CompatibleClient( + ctx context.Context, + k8sClient client.Client, + namespace, bucket, prefix string, + vs ai.ObjectStorageSpec, +) (StorageClient, error) { + awsCfg := &aws.Config{ + Endpoint: aws.String(vs.Endpoint), + Region: aws.String(vs.Region), + S3ForcePathStyle: aws.Bool(true), + } + if vs.SecretRef != "" { + secret := &corev1.Secret{} + if err := k8sClient.Get(ctx, + client.ObjectKey{Namespace: namespace, Name: vs.SecretRef}, + secret, + ); err != nil { + return nil, err + } + awsCfg.Credentials = credentials.NewStaticCredentials( + string(secret.Data["s3_access_key"]), + string(secret.Data["s3_secret_key"]), + "", + ) + } + sess, err := session.NewSession(awsCfg) + if err != nil { + return nil, err + } + return &s3Client{cli: s3.New(sess), bucket: bucket, prefix: prefix}, nil +} diff --git a/pkg/storage/storageclient.go b/pkg/storage/storageclient.go index 7dbea32..9f73b95 100644 --- a/pkg/storage/storageclient.go +++ b/pkg/storage/storageclient.go @@ -43,15 +43,35 @@ func NewStorageClient( switch u.Scheme { case "s3": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: S3 path must include bucket name (e.g. s3://bucket-name/prefix)", vs.Path) + } return NewS3Client(ctx, k8sClient, namespace, u.Host, prefix, vs) case "gs", "gcs": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: GCS path must include bucket name (e.g. gs://bucket-name/prefix)", vs.Path) + } return NewGCSClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "azure": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: Azure path must include container name (e.g. azure://container-name/prefix). Without it, model deployments fail with 'Please specify a container name'", vs.Path) + } return NewAzureClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + case "s3compat": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: S3-compatible path must include bucket name (e.g. 
s3compat://bucket-name/prefix)", vs.Path) + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "minio": - // everything after "//" is host (bucket) and path. We treat u.Host as bucket, - // vs.Endpoint *must* be set to our MinIO URL for this case. - return NewMinioClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: MinIO path must include bucket name (e.g. minio://bucket-name/prefix)", vs.Path) + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) + case "seaweedfs": + if u.Host == "" { + return nil, fmt.Errorf("invalid volume URI %q: SeaweedFS path must include bucket name (e.g. seaweedfs://bucket-name/prefix)", vs.Path) + } + return NewS3CompatibleClient(ctx, k8sClient, namespace, u.Host, prefix, vs) case "fixture": // fixture:// is a special scheme for testing purposes, using a fake client. // It does not require any credentials or endpoint. diff --git a/pkg/storage/storageclient_test.go b/pkg/storage/storageclient_test.go index c97dcc2..87742d2 100644 --- a/pkg/storage/storageclient_test.go +++ b/pkg/storage/storageclient_test.go @@ -75,12 +75,52 @@ func TestNewStorageClient(t *testing.T) { }, }, { - name: "MinIO storage", + name: "MinIO storage (S3-compatible)", volumeSpec: ai.ObjectStorageSpec{ Path: "minio://my-bucket/prefix", Endpoint: "http://minio.default.svc:9000", + Region: "us-east-1", }, - wantType: "minio", + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3-compatible storage (generic s3compat scheme)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3compat://my-bucket/prefix", + Endpoint: "http://s3compat.default.svc:9000", + Region: "us-east-1", + }, + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "SeaweedFS storage 
(S3-compatible)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "seaweedfs://my-bucket/prefix", + Endpoint: "http://seaweedfs.default.svc:8333", + Region: "us-east-1", + }, + wantType: "s3", + wantErr: false, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3 with custom endpoint (S3-compatible)", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3://my-bucket/prefix", + Endpoint: "http://custom-s3.example.com:9000", + Region: "us-east-1", + }, + wantType: "s3", wantErr: false, setupClient: func() *fake.ClientBuilder { return fake.NewClientBuilder().WithScheme(s) @@ -91,7 +131,7 @@ func TestNewStorageClient(t *testing.T) { volumeSpec: ai.ObjectStorageSpec{ Path: "fixture://test-bucket/prefix", }, - wantType: "fixture", + wantType: "s3", // fixtureClient.GetProvider() returns "s3" for artifact compatibility wantErr: false, setupClient: func() *fake.ClientBuilder { return fake.NewClientBuilder().WithScheme(s) @@ -117,6 +157,52 @@ func TestNewStorageClient(t *testing.T) { return fake.NewClientBuilder().WithScheme(s) }, }, + { + name: "Azure path without container name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "azure:///model_artifacts", + Region: "eastus", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3 path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3:///prefix", + Region: "us-west-2", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "S3-compatible path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + Path: "s3compat:///prefix", + Endpoint: "http://s3compat:9000", + Region: "us-east-1", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, + { + name: "SeaweedFS path without bucket name", + volumeSpec: ai.ObjectStorageSpec{ + 
Path: "seaweedfs:///prefix", + Endpoint: "http://seaweedfs:8333", + Region: "us-east-1", + }, + wantErr: true, + setupClient: func() *fake.ClientBuilder { + return fake.NewClientBuilder().WithScheme(s) + }, + }, } for _, tt := range tests { @@ -134,7 +220,7 @@ func TestNewStorageClient(t *testing.T) { // Verify provider matches expected type provider := client.GetProvider() - assert.NotEmpty(t, provider) + assert.Equal(t, tt.wantType, provider, "GetProvider() should match wantType") // Verify bucket/container is extracted bucket := client.GetBucket() diff --git a/tools/artifacts_download_upload_scripts/README.md b/tools/artifacts_download_upload_scripts/README.md index 98a5ce6..3f47a7f 100755 --- a/tools/artifacts_download_upload_scripts/README.md +++ b/tools/artifacts_download_upload_scripts/README.md @@ -70,13 +70,14 @@ sudo ./download_from_huggingface.sh - Script returns non-zero exit code on failure (suitable for CI/CD pipelines) ### 2. `upload_to_minio.sh` -Uploads downloaded artifacts to MinIO storage. +Uploads downloaded artifacts to MinIO or any S3-compatible storage (e.g. SeaweedFS). **Features:** - Automatically uploads **all artifacts** from `./model_artifacts/` directory - No config file needed - just uploads everything found - **Auto-creates bucket** if it doesn't exist - Uses native MinIO Client (mc) for optimal performance +- Works with **MinIO, SeaweedFS, or any S3-compatible** backend; set endpoint and credentials to match your store. - Comprehensive dependency installation: - MinIO Client via **Homebrew on macOS** or **direct download on Linux** - Supports macOS (Intel & Apple Silicon) and Linux (amd64 & arm64) @@ -92,16 +93,109 @@ Or with sudo if dependency installation fails: sudo ./upload_to_minio.sh ``` +**Environment variables (S3-compatible target):** +Preferred generic names; `MINIO_*` are accepted for backward compatibility. 
+ +| Preferred (generic) | Fallback | Description | +|---------------------|----------|-------------| +| `S3COMPAT_OBJECT_STORE_ENDPOINT` | `MINIO_ENDPOINT` | S3 API endpoint URL (e.g. http://host:9000 for MinIO, http://host:8333 for SeaweedFS) | +| `S3COMPAT_OBJECT_STORE_BUCKET` | `MINIO_BUCKET` | Bucket name | +| `S3COMPAT_OBJECT_STORE_ACCESS_KEY` | `MINIO_ROOT_USER` or `MINIO_ACCESS_KEY` | Access key | +| `S3COMPAT_OBJECT_STORE_SECRET_KEY` | `MINIO_ROOT_PASSWORD` or `MINIO_SECRET_KEY` | Secret key | + +Example for SeaweedFS: `S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs:8333 S3COMPAT_OBJECT_STORE_BUCKET=my-bucket ./upload_to_minio.sh` + **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts - May require sudo for installing MinIO Client (mc) -- Configure MinIO settings in the script or use environment variables: - - `MINIO_ENDPOINT` (default: http://127.0.0.1:9000) - - `MINIO_BUCKET` (default: personal) - - `MINIO_ROOT_USER` (default: minioadmin) - - `MINIO_ROOT_PASSWORD` (default: minioadmin) +- Set endpoint, bucket, and credentials via the env vars above (defaults point to a local MinIO). + +### 3. `upload_to_seaweedfs.sh` +Uploads downloaded artifacts to SeaweedFS (S3-compatible). If SeaweedFS is not running at the endpoint, the script can **install and start it** (downloads the `weed` binary from GitHub releases, no Docker). If you run SeaweedFS via **systemd** (see **§4 `install_seaweedfs_systemd.sh`** below), ensure the service is up (`sudo systemctl start seaweedfs`) before running the upload script so the script doesn’t start a second instance. + +**Features:** +- **Auto-install SeaweedFS** when not reachable: downloads latest `weed` for Linux/macOS (amd64/arm64), installs to `/usr/local/bin` or `~/.local/bin`, and starts `weed server -s3` in the background (S3 gateway on port 8333). +- Auto-install only runs when the endpoint is local (`127.0.0.1` or `localhost`). For remote endpoints, SeaweedFS must already be running. 
+- Creates configured buckets (from `SEAWEEDFS_BUCKETS` or primary bucket), then uploads all of `./model_artifacts/` to the primary bucket. +- Uses MinIO Client (mc); installs mc if missing. + +**Usage:** +```bash +./upload_to_seaweedfs.sh +``` + +With a remote SeaweedFS: +```bash +S3COMPAT_OBJECT_STORE_ENDPOINT=http://seaweedfs-host:8333 S3COMPAT_OBJECT_STORE_BUCKET=my-bucket ./upload_to_seaweedfs.sh +``` + +To skip auto-install and only fail if unreachable: +```bash +SEAWEEDFS_SKIP_INSTALL=1 ./upload_to_seaweedfs.sh +``` + +**Volume limit:** When the script starts SeaweedFS it uses `-volume.max=100` (set `SEAWEEDFS_VOLUME_MAX`; use `0` for auto). The default (~7) can cause "0 node candidates" once the volume server is "full." + +**Environment variables:** `S3COMPAT_OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:8333), `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY`, `SEAWEEDFS_BUCKETS`, `SEAWEEDFS_SKIP_INSTALL`, `SEAWEEDFS_UPLOAD_RETRIES`, `SEAWEEDFS_UPLOAD_RETRY_DELAY`, `SEAWEEDFS_PARALLEL_JOBS`, `SEAWEEDFS_ERROR_LOG`, `SEAWEEDFS_SKIP_EXISTING`, `SEAWEEDFS_WAIT_VOLUME_SERVER`, `SEAWEEDFS_MASTER`, `SEAWEEDFS_VOLUME_MAX` (default 100). -### 3. `upload_to_minio_aws.sh` +**SeaweedFS credentials:** SeaweedFS S3 has no built-in users (unlike MinIO’s default `minioadmin`). If you start SeaweedFS yourself, it must be configured to accept the same access key/secret the script uses (defaults: `minioadmin`/`minioadmin`). Options: (1) Start with env vars: `AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin weed server -s3`; (2) Use a JSON config file with `weed s3 -config=/path/to/s3.json` (see [SeaweedFS S3 Credentials](https://github.com/seaweedfs/seaweedfs/wiki/S3-Credentials)). 
If you see *"The access key ID you provided does not exist in our records"*, restart SeaweedFS with the same credentials as `S3COMPAT_OBJECT_STORE_ACCESS_KEY`/`S3COMPAT_OBJECT_STORE_SECRET_KEY` (or set those env vars to match your SeaweedFS config). + +**Volume server readiness:** After SeaweedFS has just started (or restarted), the master may not see a volume server yet, so uploads can fail with "Not enough data nodes found". The script can **wait for a volume server** (when endpoint is local and `weed` is available): it polls `weed shell -master=... cluster.ps` for up to `SEAWEEDFS_WAIT_VOLUME_SERVER` seconds (default 60) before starting uploads. Set `SEAWEEDFS_WAIT_VOLUME_SERVER=0` to skip. + +**Parallel uploads and error log:** Uploads run in parallel (up to `SEAWEEDFS_PARALLEL_JOBS` at a time, default 3). Directory artifacts are uploaded **file-by-file** with per-file retries, so one failed file (e.g. a single `.safetensors` shard) only retries that file, not the whole artifact. Failed files/artifacts are appended to `SEAWEEDFS_ERROR_LOG` (default `./seaweedfs_upload_errors.log`) with artifact id and relative path; at the end the script prints that file and exits with code 1 if any failed. + +**Large artifacts (e.g. LLaMA 70B):** Uploads of very large files (multi-GB `.safetensors` shards) can fail with *"We encountered an internal error, please try again"*. The script retries each artifact up to `SEAWEEDFS_UPLOAD_RETRIES` (default 3) with `SEAWEEDFS_UPLOAD_RETRY_DELAY` seconds between attempts. If failures persist, check SeaweedFS host memory and disk (`/tmp/seaweedfs.log` or volume server logs), ensure enough free space for the full object, and consider increasing retries: `SEAWEEDFS_UPLOAD_RETRIES=5 SEAWEEDFS_UPLOAD_RETRY_DELAY=30 ./upload_to_seaweedfs.sh`. + +**"0 node candidates" / "Not enough data nodes":** Usually the volume server hit its max volume count (default ~7), disk is near full (read-only), heartbeat timeouts, or OOM. 
The script and systemd unit use `-volume.max=100` by default. When the error happens: `curl -s http://localhost:9333/cluster/status | jq` (master view); `curl -s http://127.0.0.1:8080/status | jq` (volume server; if Max==Count, increase `SEAWEEDFS_VOLUME_MAX`). See `tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md` for full troubleshooting. + +**Prerequisites:** +- Run `download_from_huggingface.sh` first to download artifacts +- For auto-install: curl, tar; optional sudo for `/usr/local/bin` +- No Docker required + +**Create standard folders:** To create the platform folders (`apps/`, `artifacts/`, `config/`, `job_groups/`, `model_artifacts/`, `tasks/`) in SeaweedFS, run `./create_seaweedfs_folders.sh` after SeaweedFS is up. It uses the same endpoint and credentials as `upload_to_seaweedfs.sh`. + +**Upload Splunk AI Assistant app:** To upload `Splunk_AI_Assistant_Cloud.tgz` to `bucket/apps/`, run `./upload_splunk_app_to_seaweedfs.sh`. Put the .tgz in the current directory or set `SPLUNK_APP_LOCAL_PATH=/path/to/Splunk_AI_Assistant_Cloud.tgz`. Same endpoint/credentials as above. + +### 4. `install_seaweedfs_systemd.sh` +Installs SeaweedFS as a **systemd service** so it starts on boot and restarts on failure. Run this on the host where SeaweedFS should run (e.g. EC2), after the `weed` binary is installed. + +**Features:** +- Copies `seaweedfs.service` from this directory into `/etc/systemd/system/` +- Enables and starts the `seaweedfs` service (master, volume, filer, S3 gateway) +- Service runs as `ec2-user` (configurable in the unit file); data directory is `/home/ec2-user/data` by default +- Handles SELinux: on Enforcing systems, labels `/usr/local/bin/weed` so the service can execute it +- Requires the `weed` binary at `/usr/local/bin/weed` (install it first via `upload_to_seaweedfs.sh` or manually from [SeaweedFS releases](https://github.com/seaweedfs/seaweedfs/releases)) + +**Usage:** +```bash +# 1. Install weed first (e.g. 
run upload_to_seaweedfs.sh once, or download weed and put it in /usr/local/bin) +# 2. Then install the systemd service (requires sudo) +sudo ./install_seaweedfs_systemd.sh +``` + +**Prerequisites:** +- `weed` at `/usr/local/bin/weed` (run `./upload_to_seaweedfs.sh` once to auto-install it, or download and extract from GitHub releases) +- Run the script as root: `sudo ./install_seaweedfs_systemd.sh` +- The `seaweedfs.service` unit file must be in the same directory as the script + +**After install:** +- **Status:** `sudo systemctl status seaweedfs` +- **Logs:** `journalctl -u seaweedfs -f` +- **Stop:** `sudo systemctl stop seaweedfs` +- **Restart:** `sudo systemctl restart seaweedfs` +- **S3 endpoint:** http://127.0.0.1:8333 (default credentials: minioadmin/minioadmin) +- **Data directory:** `/home/ec2-user/data` (edit the unit file or use a drop-in to change) + +**Unit file details (`seaweedfs.service`):** +- `ExecStart`: `/usr/local/bin/weed server -s3 -ip.bind=0.0.0.0 -dir=/home/ec2-user/data -volume.max=100` +- `Restart=on-failure`, `RestartSec=5` +- S3 credentials are set via `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` in the unit (default minioadmin/minioadmin); override with `/etc/default/seaweedfs` or a systemd drop-in if needed +- To use a different user or data dir, copy the unit to a drop-in or edit `/etc/systemd/system/seaweedfs.service` after install + +**Troubleshooting:** If the service fails to start, check `sudo systemctl status seaweedfs` and `journalctl -u seaweedfs -n 50`. Ensure `/home/ec2-user/data` exists and is writable by `ec2-user`, and that `/usr/local/bin/weed` is executable. On SELinux systems, the script runs `chcon -t bin_t /usr/local/bin/weed` to allow execution. + +### 5. `upload_to_minio_aws.sh` Uploads downloaded artifacts to MinIO using AWS CLI (S3-compatible API). 
**Features:** @@ -128,18 +222,18 @@ sudo ./upload_to_minio_aws.sh **Prerequisites:** - Run `download_from_huggingface.sh` first to download artifacts - May require sudo for installing AWS CLI -- Configure MinIO settings in the script: - - `MINIO_ENDPOINT` (default: http://127.0.0.1:9000) - - `MINIO_BUCKET` (default: ml-platform-artifacts) - - `MINIO_ACCESS_KEY` (default: minioadmin) - - `MINIO_SECRET_KEY` (default: minioadmin) +- Use generic env vars (MINIO_* accepted for backward compatibility): + - `S3COMPAT_OBJECT_STORE_ENDPOINT` (default: http://127.0.0.1:9000) + - `S3COMPAT_OBJECT_STORE_BUCKET` (default: ai-platform-artifacts-bucket) + - `S3COMPAT_OBJECT_STORE_ACCESS_KEY` (default: minioadmin) + - `S3COMPAT_OBJECT_STORE_SECRET_KEY` (default: minioadmin) **When to use this vs `upload_to_minio.sh`:** - Use this if you prefer AWS CLI over MinIO Client (mc) - Use this if you already have AWS CLI installed - Use `upload_to_minio.sh` for better MinIO native support -### 4. `upload_to_s3.sh` +### 6. `upload_to_s3.sh` Uploads downloaded artifacts to AWS S3 storage. **Features:** @@ -181,12 +275,12 @@ sudo S3_BUCKET=your-bucket-name ./upload_to_s3.sh - Set `S3_BUCKET` environment variable - Optional: Set `S3_REGION` (default: us-east-1) and `S3_PREFIX` (default: model_artifacts) -### 5. `test_minio_connection.sh` -Diagnostic script to test MinIO connectivity and troubleshoot issues. +### 7. `test_minio_connection.sh` +Diagnostic script to test S3-compatible object store connectivity (MinIO, SeaweedFS, etc.) and troubleshoot issues. **Features:** - Tests MinIO Client (mc) installation -- Verifies MinIO endpoint connectivity +- Verifies endpoint connectivity - Tests authentication with credentials - Lists all existing buckets - Tests bucket creation permissions @@ -197,9 +291,9 @@ Diagnostic script to test MinIO connectivity and troubleshoot issues. 
./test_minio_connection.sh ``` -Or with custom settings: +Or with custom settings (use generic names; MINIO_* also accepted): ```bash -MINIO_ENDPOINT=http://localhost:9000 MINIO_BUCKET=nexus ./test_minio_connection.sh +S3COMPAT_OBJECT_STORE_ENDPOINT=http://localhost:9000 S3COMPAT_OBJECT_STORE_BUCKET=nexus ./test_minio_connection.sh ``` Or with sudo if dependency installation fails: @@ -213,7 +307,7 @@ sudo ./test_minio_connection.sh **When to use:** - Before running upload scripts for the first time - When bucket creation fails -- To diagnose MinIO connectivity issues +- To diagnose object store connectivity issues - To verify credentials and permissions ## Configuration @@ -333,19 +427,18 @@ All artifacts in the list will be downloaded and uploaded automatically. ### For Download Script: - No additional environment variables needed (reads from `model_artifacts_configs.yaml`) -### For MinIO Upload Script (using mc): +### For MinIO / S3-compatible Upload Script (using mc, `upload_to_minio.sh`): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` -- `MINIO_ENDPOINT`: MinIO server endpoint (default: http://127.0.0.1:9000) -- `MINIO_BUCKET`: Target bucket name (default: personal) -- `MINIO_ROOT_USER`: MinIO access key (default: minioadmin) -- `MINIO_ROOT_PASSWORD`: MinIO secret key (default: minioadmin) +- Works with MinIO, SeaweedFS, or any S3-compatible backend. 
+- **Preferred (generic):** `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` +- **Backward compatibility:** `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ROOT_USER`, `MINIO_ROOT_PASSWORD` (or `MINIO_ACCESS_KEY`/`MINIO_SECRET_KEY`) +- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-bucket, minioadmin/minioadmin -### For MinIO Upload Script (using AWS CLI): +### For S3-compatible Upload Script (using AWS CLI, `upload_to_minio_aws.sh`): - No config file needed - automatically uploads all artifacts from `./model_artifacts/` -- `MINIO_ENDPOINT`: MinIO server endpoint (default: http://127.0.0.1:9000) -- `MINIO_BUCKET`: Target bucket name (default: ml-platform-artifacts) -- `MINIO_ACCESS_KEY`: MinIO access key (default: minioadmin) -- `MINIO_SECRET_KEY`: MinIO secret key (default: minioadmin) +- **Preferred (generic):** `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, `S3COMPAT_OBJECT_STORE_ACCESS_KEY`, `S3COMPAT_OBJECT_STORE_SECRET_KEY` +- **Backward compatibility:** `MINIO_ENDPOINT`, `MINIO_BUCKET`, `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY` (or `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD`) +- Defaults: endpoint http://127.0.0.1:9000, bucket ai-platform-artifacts-bucket, minioadmin/minioadmin ### For S3 Upload Script: - No config file needed - automatically uploads all artifacts from `./model_artifacts/` diff --git a/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md new file mode 100644 index 0000000..a4b9caa --- /dev/null +++ b/tools/artifacts_download_upload_scripts/SEAWEEDFS_SYSTEMD.md @@ -0,0 +1,134 @@ +# SeaweedFS as a systemd service + +Run SeaweedFS as a systemd service so it **restarts on failure** and **starts on boot**. + +## Prerequisites + +- **weed** binary at `/usr/local/bin/weed`. 
If missing, run the upload script once from the artifacts directory (it installs weed), or [download a release](https://github.com/seaweedfs/seaweedfs/releases) and copy `weed` to `/usr/local/bin/`. +- **Root/sudo** on the host to install the service. + +## Quick install (EC2 or single host) + +On the host where SeaweedFS should run: + +```bash +cd /path/to/splunk-ai-operator/tools/cluster_setup +sudo ./install_seaweedfs_systemd.sh +``` + +This copies `seaweedfs.service` to `/etc/systemd/system/`, enables and starts the service. + +## Manual install + +1. Copy the unit file: + ```bash + sudo cp tools/cluster_setup/seaweedfs.service /etc/systemd/system/ + sudo systemctl daemon-reload + ``` + +2. Optionally override credentials or data dir via a drop-in or env file: + ```bash + sudo mkdir -p /etc/systemd/system/seaweedfs.service.d + echo -e '[Service]\nEnvironment="AWS_ACCESS_KEY_ID=mykey"\nEnvironment="AWS_SECRET_ACCESS_KEY=mysecret"' | sudo tee /etc/systemd/system/seaweedfs.service.d/override.conf + sudo systemctl daemon-reload + ``` + +3. Enable and start: + ```bash + sudo systemctl enable seaweedfs + sudo systemctl start seaweedfs + ``` + +## Service details + +- **User:** `ec2-user` (change in the unit if needed). +- **Data dir:** `/home/ec2-user/data` (hardcoded in `ExecStart`; override via a systemd drop-in that replaces `ExecStart` if needed). +- **Volume max:** `100` in `ExecStart` (override via drop-in if needed). +- **S3 credentials:** `minioadmin` / `minioadmin` by default; override with `Environment=` or `EnvironmentFile=-/etc/default/seaweedfs` in a drop-in. +- **Restart:** `on-failure` with 5s delay. 
+
+- **Logs:** `journalctl -u seaweedfs -f`
+
+## Useful commands
+
+| Command | Description |
+|--------|-------------|
+| `sudo systemctl status seaweedfs` | Show status |
+| `journalctl -u seaweedfs -f` | Follow logs |
+| `sudo systemctl restart seaweedfs` | Restart |
+| `sudo systemctl stop seaweedfs` | Stop |
+| `sudo systemctl disable seaweedfs` | Disable start on boot |
+
+## After install
+
+- S3 endpoint: **http://127.0.0.1:8333** (or the host’s IP if accessing remotely).
+- Use the same credentials in the upload script or set `OBJECT_STORE_ACCESS_KEY` / `OBJECT_STORE_SECRET_KEY` to match the service.
+
+## Troubleshooting: "0 node candidates" / "Not enough data nodes found"
+
+When the Master has no writable volume servers, uploads fail with those errors. Common causes and fixes:
+
+| Cause | Fix |
+|-------|-----|
+| **1. Max volumes reached** | Volume server default `-max` is often 7–8. The unit runs with `-volume.max=100` hardcoded in `ExecStart`. To increase: create a drop-in that overrides `ExecStart` with a higher `-volume.max` (note: `ExecStart` does not read the `SEAWEEDFS_VOLUME_MAX` environment variable), then restart. |
+| **2. Disk space** | At ~95% usage the volume server reports read-only. Check `df -h` on the host; free space or add storage. |
+| **3. Heartbeat / gRPC timeouts** | Under heavy load the volume server may miss heartbeats and be marked dead. Check `journalctl -u seaweedfs` for "heartbeat" or "connection refused" around the failure time. |
+| **4. OOM** | On small instances the process may be killed. Run `dmesg -T \| grep -i oom` on the host. |
+
+**When the error is happening, run:**
+
+```bash
+# Master's view of nodes (look for empty Nodes or IsReadOnly: true)
+curl -s http://localhost:9333/cluster/status | jq
+
+# Volume server status (check if Max and Count are equal = full)
+curl -s http://127.0.0.1:8080/status | jq
+```
+
+If `Max == Count` on the volume server, increase `-volume.max` (override `ExecStart` in a drop-in) and restart the service.
+
+### "Permission denied" when starting the service (status=203/EXEC)
+
+The service runs as `ec2-user`. 
Common causes: + +1. **File permissions** – Ensure the binary is executable by all: + ```bash + sudo chmod 755 /usr/local/bin/weed + ``` + +2. **SELinux (Enforcing)** – On RHEL/Amazon Linux, SELinux can block execution. Fix by labeling the binary: + ```bash + sudo chcon -t bin_t /usr/local/bin/weed + sudo systemctl restart seaweedfs + ``` + To confirm SELinux is the cause: `sudo setenforce 0`, restart the service; if it then runs, re-enable with `sudo setenforce 1` and apply the `chcon` above. + +The install script runs `chmod 755` and, when SELinux is Enforcing, `chcon -t bin_t` automatically. + +### Connect timeout from EKS / Ray pods (Connection to <host> timed out) + +Ray workers (and other pods) in the cluster need to reach the SeaweedFS S3 endpoint to download model artifacts. If you see: + +- `Connect timeout on endpoint URL: "http://:8333/..."` +- `Connection to timed out. (connect timeout=60)"` + +then **pods cannot reach the SeaweedFS host** on port 8333. + +**Fix:** + +1. **Security group on the SeaweedFS EC2** + Allow **inbound TCP port 8333** from the EKS cluster: + - **Option A:** From the **EKS worker node security group** (so any pod on those nodes can reach SeaweedFS). + - **Option B:** From the **VPC CIDR** (e.g. `10.0.0.0/16` or `192.168.0.0/16`) so all pods in the VPC can reach SeaweedFS. + + In AWS Console: EC2 → Security Groups → select the security group attached to the SeaweedFS instance → Edit inbound rules → Add rule: Type = Custom TCP, Port = 8333, Source = node SG or VPC CIDR. + +2. **Prefer private IP when in the same VPC** + If SeaweedFS and EKS are in the same VPC, set `storage.objectStore.endpoint` in `cluster-config.yaml` to the **private IP** and port (e.g. `http://172.31.23.74:8333`). Then: + - Traffic stays inside the VPC (no internet path). + - The security group still must allow 8333 from the node SG or VPC CIDR as above. + +3. 
**Verify from a pod** (optional): + ```bash + kubectl run -it --rm curl --image=curlimages/curl --restart=Never -- curl -s -o /dev/null -w "%{http_code}" http://:8333 + ``` + Use the same IP (public or private) and port as in your config. A 200/403/400 means the pod can reach SeaweedFS. diff --git a/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh b/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh new file mode 100755 index 0000000..823c7eb --- /dev/null +++ b/tools/artifacts_download_upload_scripts/create_seaweedfs_folders.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Create standard folder prefixes in SeaweedFS (S3-compatible). Uses the same +# OBJECT_STORE_* / SEAWEEDFS_* env vars as upload_to_seaweedfs.sh. Run after +# SeaweedFS is up (e.g. systemd service or upload script has started it). + +set -e + +# Same endpoint/credentials as upload_to_seaweedfs.sh +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" + +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +# Standard folders expected by the platform (create by uploading .keep) +FOLDERS=(apps artifacts config job_groups model_artifacts tasks) + +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] +} + +if ! seaweedfs_ok; then + echo "SeaweedFS not reachable at ${OBJECT_STORE_ENDPOINT}. Start SeaweedFS first (e.g. sudo systemctl start seaweedfs)." + exit 1 +fi + +# Install mc if needed +if ! command -v mc &>/dev/null; then + echo "Installing MinIO Client (mc)..." 
+ OS="$(uname -s)" + ARCH="$(uname -m)" + if [[ "$OS" == "Darwin" ]]; then + if command -v brew &>/dev/null; then + brew install minio/stable/mc + else + if [[ "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/darwin-arm64/mc"; else MC_URL="https://dl.min.io/client/mc/release/darwin-amd64/mc"; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc && sudo mv /tmp/mc /usr/local/bin/mc + fi + elif [[ "$OS" == "Linux" ]]; then + if [[ "$ARCH" == "x86_64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-amd64/mc"; elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-arm64/mc"; else echo "Unsupported arch: $ARCH"; exit 1; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc + sudo mv /tmp/mc /usr/local/bin/mc 2>/dev/null || { mkdir -p ~/.local/bin; mv /tmp/mc ~/.local/bin/mc; export PATH="$PATH:$HOME/.local/bin"; } + else + echo "Unsupported OS: $OS"; exit 1 + fi +fi + +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true + +echo "Creating folders in ${OBJECT_STORE_BUCKET}: ${FOLDERS[*]}" +for dir in "${FOLDERS[@]}"; do + echo "placeholder" | mc pipe "${MC_ALIAS}/${OBJECT_STORE_BUCKET}/${dir}/.keep" 2>/dev/null || true + echo " ${dir}/" +done +echo "Done. 
Folders: apps/, artifacts/, config/, job_groups/, model_artifacts/, tasks/" diff --git a/tools/artifacts_download_upload_scripts/install_minio_ec2.sh b/tools/artifacts_download_upload_scripts/install_minio_ec2.sh new file mode 100755 index 0000000..fcf93a1 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/install_minio_ec2.sh @@ -0,0 +1,335 @@ +#!/usr/bin/env bash +# ----------------------------------------------------------------------------- +# MinIO on EC2 for Splunk AI Platform (EKS) +# +# Mode 1 - Install on this machine (run ON the EC2 instance after SSH, as root): +# sudo ./install_minio_ec2.sh [--bucket NAME] [--user USER] [--password PASSWORD] +# +# Mode 2 - Launch EC2 in same VPC as EKS, then install MinIO (run from laptop): +# CONFIG_FILE=./cluster-config.yaml ./install_minio_ec2.sh --launch-ec2 +# Then SSH to the instance and run: ./install_minio_ec2.sh (with same bucket/user/password) +# +# Prerequisites: aws CLI, same VPC as EKS (or provide VPC/subnet). For --launch-ec2: jq, yq (optional). 
+# ----------------------------------------------------------------------------- +set -euo pipefail + +MINIO_BUCKET="${MINIO_BUCKET:-ai-platform}" +MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" +MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-}" +MINIO_DATA_DIR="${MINIO_DATA_DIR:-/data/minio}" +MINIO_PORT="${MINIO_PORT:-9000}" + +# Launch-EC2 options (when --launch-ec2) +MINIO_EC2_INSTANCE_TYPE="${MINIO_EC2_INSTANCE_TYPE:-t3.xlarge}" +MINIO_EC2_AMI_QUERY="${MINIO_EC2_AMI_QUERY:-Amazon Linux 2023}" +MINIO_EC2_KEY_NAME="${MINIO_EC2_KEY_NAME:-}" +MINIO_EC2_VOLUME_SIZE="${MINIO_EC2_VOLUME_SIZE:-150}" + +log() { echo "[minio-ec2] $*"; } +err() { echo "[minio-ec2] ERROR: $*" >&2; } + +# ---------- Parse args ---------- +LAUNCH_EC2=false +while [[ $# -gt 0 ]]; do + case "$1" in + --launch-ec2) LAUNCH_EC2=true; shift ;; + --bucket) MINIO_BUCKET="$2"; shift 2 ;; + --user) MINIO_ROOT_USER="$2"; shift 2 ;; + --password) MINIO_ROOT_PASSWORD="$2"; shift 2 ;; + --data-dir) MINIO_DATA_DIR="$2"; shift 2 ;; + --port) MINIO_PORT="$2"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +# ---------- Mode 2: Launch EC2 in EKS VPC ---------- +launch_ec2_in_eks_vpc() { + need_file "${CONFIG_FILE:-}" + local cfg="${CONFIG_FILE}" + local cluster_name region vpc_id subnet_id sg_id instance_id private_ip + + if command -v yq &>/dev/null; then + cluster_name="$(yq eval '.cluster.name' "$cfg")" + region="$(yq eval '.cluster.region' "$cfg")" + else + cluster_name="$(grep -A1 'cluster:' "$cfg" | grep 'name:' | head -1 | sed 's/.*name: *"\(.*\)".*/\1/')" + region="$(grep 'region:' "$cfg" | head -1 | sed 's/.*region: *"\(.*\)".*/\1/')" + fi + [[ -z "$cluster_name" || -z "$region" ]] && { err "Could not read cluster.name and cluster.region from $cfg"; exit 1; } + + log "Cluster: $cluster_name, Region: $region" + if ! aws eks describe-cluster --name "$cluster_name" --region "$region" &>/dev/null; then + err "EKS cluster '$cluster_name' not found. 
Create the cluster first or provide VPC/subnet via MINIO_EC2_VPC_ID and MINIO_EC2_SUBNET_ID." + exit 1 + fi + + vpc_id="$(aws eks describe-cluster --name "$cluster_name" --region "$region" --query 'cluster.resourcesVpcConfig.vpcId' --output text)" + # Prefer private subnet for MinIO + subnet_id="$(aws eks describe-cluster --name "$cluster_name" --region "$region" --query 'cluster.resourcesVpcConfig.subnetIds[0]' --output text)" + [[ -z "$vpc_id" || "$vpc_id" == "None" ]] && { err "No VPC from cluster"; exit 1; } + [[ -z "$subnet_id" || "$subnet_id" == "None" ]] && { err "No subnet from cluster"; exit 1; } + + local vpc_cidr + vpc_cidr="$(aws ec2 describe-vpcs --vpc-ids "$vpc_id" --region "$region" --query 'Vpcs[0].CidrBlock' --output text 2>/dev/null || echo "10.0.0.0/8")" + + log "VPC: $vpc_id, Subnet: $subnet_id, CIDR: $vpc_cidr" + + # Security group: SSH (22) from anywhere; MinIO (9000) from VPC (reuse if exists) + local sg_name="minio-ec2-${cluster_name}" + sg_id="$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$sg_name" "Name=vpc-id,Values=$vpc_id" --region "$region" --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null)" + if [[ -z "$sg_id" || "$sg_id" == "None" ]]; then + sg_id="$(aws ec2 create-security-group --group-name "$sg_name" --description "MinIO EC2 for EKS" --vpc-id "$vpc_id" --region "$region" --query 'GroupId' --output text)" + fi + aws ec2 authorize-security-group-ingress --group-id "$sg_id" --protocol tcp --port 22 --cidr 0.0.0.0/0 --region "$region" 2>/dev/null || true + aws ec2 authorize-security-group-ingress --group-id "$sg_id" --protocol tcp --port "$MINIO_PORT" --cidr "$vpc_cidr" --region "$region" 2>/dev/null || true + log "Security group: $sg_id (22 from 0.0.0.0/0, ${MINIO_PORT} from $vpc_cidr)" + + # Key pair: use existing or create (idempotent: reuse same key name per cluster) + local key_name="$MINIO_EC2_KEY_NAME" + local key_file="" + if [[ -z "$key_name" ]]; then + key_name="minio-ec2-${cluster_name}" + 
key_file="/tmp/minio-ec2-${cluster_name}.pem" + if aws ec2 describe-key-pairs --key-names "$key_name" --region "$region" &>/dev/null; then + log "Using existing key pair: $key_name (if you lost the .pem, set MINIO_EC2_KEY_NAME to another key)" + elif aws ec2 create-key-pair --key-name "$key_name" --query 'KeyMaterial' --output text --region "$region" > "$key_file" 2>/dev/null; then + chmod 600 "$key_file" + log "Key pair created: $key_name (saved to $key_file)" + else + err "Create key pair failed. Set MINIO_EC2_KEY_NAME to an existing key name in this region." + exit 1 + fi + fi + + # AMI: Amazon Linux 2023 + local ami_id + ami_id="$(aws ec2 describe-images --owners amazon --filters "Name=name,Values=al2023-ami-*-x86_64" "Name=state,Values=available" --query 'sort_by(Images,&CreationDate)[-1].ImageId' --output text --region "$region")" + [[ -z "$ami_id" || "$ami_id" == "None" ]] && ami_id="$(aws ec2 describe-images --owners amazon --filters "Name=name,Values=amzn2-ami-hvm-*-x86_64-gp2" "Name=state,Values=available" --query 'sort_by(Images,&CreationDate)[-1].ImageId' --output text --region "$region")" + + instance_id="$(aws ec2 run-instances \ + --image-id "$ami_id" \ + --instance-type "$MINIO_EC2_INSTANCE_TYPE" \ + --subnet-id "$subnet_id" \ + --security-group-ids "$sg_id" \ + --key-name "$key_name" \ + --block-device-mappings "[{\"DeviceName\":\"/dev/xvda\",\"Ebs\":{\"VolumeSize\":${MINIO_EC2_VOLUME_SIZE},\"VolumeType\":\"gp3\"}}]" \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=minio-ec2-${cluster_name}},{Key=Cluster,Value=${cluster_name}}]" \ + --region "$region" \ + --query 'Instances[0].InstanceId' --output text)" + log "Launched instance: $instance_id (key: $key_name)" + + log "Waiting for instance to get private IP..." 
+ aws ec2 wait instance-running --instance-ids "$instance_id" --region "$region" + private_ip="$(aws ec2 describe-instances --instance-ids "$instance_id" --region "$region" --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text)" + [[ -z "$private_ip" || "$private_ip" == "None" ]] && private_ip="(check console)" + + echo "" + log "=== MinIO EC2 instance ready ===" + echo " Instance ID: $instance_id" + echo " Private IP: $private_ip" + echo " Region: $region" + echo " Key name: $key_name" + [[ -n "$key_file" && -f "$key_file" ]] && echo " Key file: $key_file" + echo "" + echo "Next steps:" + echo " 1. SSH to the instance: ssh -i ${key_file:-/path/to/$key_name.pem} ec2-user@${private_ip}" + echo " 2. On the instance, copy and run this script (install-only mode, requires sudo):" + echo " sudo ./install_minio_ec2.sh --bucket ${MINIO_BUCKET} --user ${MINIO_ROOT_USER} --password ''" + echo " 3. Add to cluster-config.yaml (storage.minio):" + echo " enabled: true" + echo " external: true" + echo " endpoint: \"http://${private_ip}:${MINIO_PORT}\"" + echo " bucket: \"${MINIO_BUCKET}\"" + echo " auth: { rootUser: \"${MINIO_ROOT_USER}\", rootPassword: \"\" }" + echo "" +} + +need_file() { [[ -n "${1:-}" && -f "${1}" ]] || { err "File required: $1"; exit 1; }; } + +# ---------- Entry ---------- +if [[ "$LAUNCH_EC2" == "true" ]]; then + launch_ec2_in_eks_vpc + exit 0 +fi + +# ---------- Mode 1: Install MinIO on this machine ---------- +# Require root (for /usr/local/bin, /etc/default/minio, systemd) +if [[ "$(id -u)" -ne 0 ]]; then + err "This script must be run as root (or with sudo)." 
+ err "Run: sudo $0 ${*:-}" + exit 1 +fi + +# Generate password if not set +if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + MINIO_ROOT_PASSWORD="$(openssl rand -base64 24 2>/dev/null || head -c 32 /dev/urandom | base64)" + log "Generated MINIO_ROOT_PASSWORD (save it for cluster-config.yaml)" +fi + +# Install MinIO binary (use stable "latest" URL; archive URLs can 404 and return HTML) +install_minio_binary() { + local arch + arch="$(uname -m)" + case "$arch" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) err "Unsupported arch: $arch"; exit 1 ;; + esac + local url="https://dl.min.io/server/minio/release/linux-${arch}/minio" + local tmp="/tmp/minio.$$" + log "Downloading MinIO (linux-${arch})..." + if ! curl -sSL -o "$tmp" "$url"; then + err "Download failed. Check network or try: curl -sSL -o /tmp/minio '$url'" + rm -f "$tmp" + exit 1 + fi + # Reject HTML/error pages (e.g. 404); binary should not start with < or "Not" + if head -c 4 "$tmp" | grep -q '^<\|^Not'; then + err "Download returned HTML/error instead of binary. URL may be wrong or blocked." + head -1 "$tmp" + rm -f "$tmp" + exit 1 + fi + chmod +x "$tmp" + mv "$tmp" /usr/local/bin/minio + minio --version +} + +install_mc() { + local arch + arch="$(uname -m)" + case "$arch" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) arch=amd64 ;; + esac + local tmp="/tmp/mc.$$" + log "Downloading MinIO Client (mc)..." + if ! curl -sSL -o "$tmp" "https://dl.min.io/client/mc/release/linux-${arch}/mc"; then + err "Download failed for mc." + rm -f "$tmp" + exit 1 + fi + if head -c 4 "$tmp" | grep -q '^<\|^Not'; then + err "mc download returned HTML/error instead of binary." + rm -f "$tmp" + exit 1 + fi + chmod +x "$tmp" + mv "$tmp" /usr/local/bin/mc + mc --version +} + +# Stop MinIO so we can replace the binary without restart loop (e.g. after wrong-arch fix). +systemctl stop minio 2>/dev/null || true +# Always (re)install MinIO binary so we get the correct architecture for this host. 
+# A wrong-arch binary (e.g. amd64 on arm64 EC2) causes "Exec format error" and crash-loop. +install_minio_binary +if ! command -v mc &>/dev/null; then + install_mc +else + log "mc already present: $(mc --version 2>/dev/null || true)" +fi + +mkdir -p "$MINIO_DATA_DIR" +chmod 755 "$MINIO_DATA_DIR" +ENV_FILE="/etc/default/minio" +cat > "$ENV_FILE" < /etc/systemd/system/minio.service </dev/null | grep -q 200; then + minio_ok=true + break + fi + sleep 2 +done +if [[ "$minio_ok" != "true" ]]; then + err "MinIO did not respond on port ${MINIO_PORT} within 60s. Service may be failing or crash-looping." + echo "" >&2 + systemctl status minio --no-pager 2>&1 || true + echo "" >&2 + journalctl -u minio -n 30 --no-pager 2>&1 || true + exit 1 +fi +# Verify port is actually listening +if ! ( ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null ) | grep -qE "[.:]${MINIO_PORT}([^0-9]|$)"; then + err "MinIO health passed but port ${MINIO_PORT} is not listening. Showing service status:" + systemctl status minio --no-pager 2>&1 || true + exit 1 +fi +sleep 2 + +export MC_HOST_local="http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@127.0.0.1:${MINIO_PORT}" +mc mb "local/${MINIO_BUCKET}" --ignore-existing 2>/dev/null || true +for prefix in apps artifacts config job_groups model_artifacts tasks; do + echo -n | mc pipe "local/${MINIO_BUCKET}/${prefix}/.keep" 2>/dev/null || true +done +log "Bucket '${MINIO_BUCKET}' and prefixes apps/, artifacts/, config/, job_groups/, model_artifacts/, tasks/ ready" + +if command -v firewall-cmd &>/dev/null && systemctl is-active --quiet firewalld 2>/dev/null; then + firewall-cmd --permanent --add-port="${MINIO_PORT}/tcp" 2>/dev/null || true + firewall-cmd --reload 2>/dev/null || true +elif command -v ufw &>/dev/null && ufw status 2>/dev/null | grep -q "Status: active"; then + ufw allow "${MINIO_PORT}/tcp" 2>/dev/null || true + ufw reload 2>/dev/null || true +fi + +PRIVATE_IP="" +if command -v hostname &>/dev/null; then + PRIVATE_IP="$(hostname -I 
2>/dev/null | awk '{print $1}')" +fi +[[ -z "$PRIVATE_IP" ]] && PRIVATE_IP="$(curl -s --connect-timeout 2 http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || echo 'MINIO_EC2_PRIVATE_IP')" +ENDPOINT="http://${PRIVATE_IP}:${MINIO_PORT}" + +echo "" +log "=== MinIO on EC2 is ready ===" +echo " Endpoint: ${ENDPOINT}" +echo " Bucket: ${MINIO_BUCKET}" +echo " Root user: ${MINIO_ROOT_USER}" +echo " Root pass: ${MINIO_ROOT_PASSWORD}" +echo "" +echo "Add to cluster-config.yaml (storage.minio):" +echo " minio:" +echo " enabled: true" +echo " external: true" +echo " endpoint: \"${ENDPOINT}\"" +echo " bucket: \"${MINIO_BUCKET}\"" +echo " auth:" +echo " rootUser: \"${MINIO_ROOT_USER}\"" +echo " rootPassword: \"${MINIO_ROOT_PASSWORD}\"" +echo "" +echo "Ensure EC2 security group allows inbound TCP ${MINIO_PORT} from your EKS node security group or VPC CIDR." +echo "" +echo "If MinIO is not reachable, check: systemctl status minio && ss -tlnp | grep ${MINIO_PORT}" +echo "" diff --git a/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh b/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh new file mode 100755 index 0000000..2f21090 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/install_seaweedfs_systemd.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Install SeaweedFS as a systemd service (restart on failure, start on boot). +# Run with sudo on the host where SeaweedFS should run (e.g. EC2). +# Prereqs: weed binary at /usr/local/bin/weed (run upload_to_seaweedfs.sh once to install, or install manually). + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SERVICE_NAME="seaweedfs" +UNIT_FILE="${SCRIPT_DIR}/seaweedfs.service" + +if [[ "$(id -u)" -ne 0 ]]; then + echo "Run with sudo to install the systemd service." + exit 1 +fi + +if [[ ! -f /usr/local/bin/weed ]]; then + echo "weed not found at /usr/local/bin/weed. 
Install it first, e.g.:" + echo " Run ./upload_to_seaweedfs.sh once (it will install weed), or" + echo " download from https://github.com/seaweedfs/seaweedfs/releases and extract weed to /usr/local/bin/" + exit 1 +fi + +# Service runs as ec2-user; ensure the binary is executable by that user (fixes "Permission denied" on EXEC). +chmod 755 /usr/local/bin/weed +# On SELinux systems (e.g. RHEL, Amazon Linux), label the binary so the service can execute it. +if command -v getenforce &>/dev/null && [[ "$(getenforce 2>/dev/null)" == "Enforcing" ]]; then + if command -v chcon &>/dev/null; then + chcon -t bin_t /usr/local/bin/weed 2>/dev/null || true + fi +fi + +echo "Installing ${SERVICE_NAME}.service..." +cp "$UNIT_FILE" /etc/systemd/system/"${SERVICE_NAME}.service" +chmod 644 /etc/systemd/system/"${SERVICE_NAME}.service" +systemctl daemon-reload + +echo "Enabling ${SERVICE_NAME} to start on boot..." +systemctl enable "${SERVICE_NAME}" + +echo "Starting ${SERVICE_NAME} now..." +systemctl start "${SERVICE_NAME}" + +sleep 2 +if ! systemctl is-active --quiet "${SERVICE_NAME}"; then + echo "Warning: ${SERVICE_NAME} did not stay running. Check: sudo systemctl status ${SERVICE_NAME} && journalctl -u ${SERVICE_NAME} -n 30" + exit 1 +fi + +echo "" +echo "SeaweedFS is running as a systemd service." 
+echo " status: sudo systemctl status ${SERVICE_NAME}" +echo " logs: journalctl -u ${SERVICE_NAME} -f" +echo " stop: sudo systemctl stop ${SERVICE_NAME}" +echo " restart: sudo systemctl restart ${SERVICE_NAME}" +echo "" +echo "S3 endpoint: http://127.0.0.1:8333 (default credentials minioadmin/minioadmin)" +echo "Data dir: /home/ec2-user/data (edit SEAWEEDFS_DIR in the unit to change)" diff --git a/tools/artifacts_download_upload_scripts/seaweedfs.service b/tools/artifacts_download_upload_scripts/seaweedfs.service new file mode 100644 index 0000000..1dc4079 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/seaweedfs.service @@ -0,0 +1,39 @@ +# SeaweedFS all-in-one server (master, volume, filer, S3). +# Install: see tools/cluster_setup/SEAWEEDFS_SYSTEMD.md or run install_seaweedfs_systemd.sh +# Credentials: set in /etc/default/seaweedfs or use the drop-in below. + +[Unit] +Description=SeaweedFS server (master, volume, filer, S3) +Documentation=https://github.com/seaweedfs/seaweedfs +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=ec2-user +Group=ec2-user +# Data directory (must exist and be writable by User). Override via drop-in if needed. +Environment="SEAWEEDFS_DIR=/home/ec2-user/data" +# Max volumes per volume server. Override via drop-in if needed. +Environment="SEAWEEDFS_VOLUME_MAX=100" +# S3 credentials (must match upload script / mc alias) +Environment="AWS_ACCESS_KEY_ID=minioadmin" +Environment="AWS_SECRET_ACCESS_KEY=minioadmin" +# Override with /etc/default/seaweedfs or systemd drop-in if needed: +# EnvironmentFile=-/etc/default/seaweedfs + +# Use explicit paths so ExecStart works even if env expansion is not applied (e.g. after copy from Windows). 
+ExecStart=/usr/local/bin/weed server -s3 -ip.bind=0.0.0.0 -dir=/home/ec2-user/data -volume.max=100
+WorkingDirectory=/home/ec2-user
+Restart=on-failure
+RestartSec=5
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=seaweedfs
+
+# Security: no new privileges, restrict to usual caps
+NoNewPrivileges=true
+PrivateTmp=true
+
+[Install]
+WantedBy=multi-user.target diff --git a/tools/artifacts_download_upload_scripts/test_minio_connection.sh b/tools/artifacts_download_upload_scripts/test_minio_connection.sh index 6d90525..9f1baf2 100755 --- a/tools/artifacts_download_upload_scripts/test_minio_connection.sh +++ b/tools/artifacts_download_upload_scripts/test_minio_connection.sh @@ -1,10 +1,10 @@ #!/bin/bash # Test script to diagnose MinIO connectivity and bucket creation issues -MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:9000}" +MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:9000}" MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" -MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minioadmin}" +MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minioadmin}" -MINIO_BUCKET="${MINIO_BUCKET:-personal}" +MINIO_BUCKET="${MINIO_BUCKET:-ai-platform-bucket-us-east-2}" echo "==========================================" echo "MinIO Connection Test" diff --git a/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh b/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh new file mode 100755 index 0000000..eafda79 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/upload_splunk_app_to_seaweedfs.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Upload Splunk_AI_Assistant_Cloud.tgz to SeaweedFS at bucket/apps/Splunk_AI_Assistant_Cloud.tgz. +# Uses the same OBJECT_STORE_* / SEAWEEDFS_* env vars as upload_to_seaweedfs.sh and create_seaweedfs_folders.sh. 
+ +set -e + +APP_FILENAME="${SPLUNK_APP_FILENAME:-Splunk_AI_Assistant_Cloud.tgz}" +LOCAL_PATH="${SPLUNK_APP_LOCAL_PATH:-./${APP_FILENAME}}" + +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket-minio-us-east-2}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" + +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] +} + +if [[ ! -f "$LOCAL_PATH" ]]; then + echo "Error: App file not found: $LOCAL_PATH" + echo "Set SPLUNK_APP_LOCAL_PATH to the path of Splunk_AI_Assistant_Cloud.tgz, or put the file in the current directory." + exit 1 +fi + +if ! seaweedfs_ok; then + echo "SeaweedFS not reachable at ${OBJECT_STORE_ENDPOINT}. Start SeaweedFS first (e.g. sudo systemctl start seaweedfs)." + exit 1 +fi + +if ! command -v mc &>/dev/null; then + echo "MinIO Client (mc) is required. Install it or run create_seaweedfs_folders.sh first (it installs mc)." + exit 1 +fi + +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true + +DEST="${MC_ALIAS}/${OBJECT_STORE_BUCKET}/apps/${APP_FILENAME}" +echo "Uploading ${LOCAL_PATH} to ${DEST}..." +mc cp "$LOCAL_PATH" "$DEST" +echo "Done. 
App is at ${OBJECT_STORE_BUCKET}/apps/${APP_FILENAME}" diff --git a/tools/artifacts_download_upload_scripts/upload_to_minio.sh b/tools/artifacts_download_upload_scripts/upload_to_minio.sh index 826e275..3b314ac 100755 --- a/tools/artifacts_download_upload_scripts/upload_to_minio.sh +++ b/tools/artifacts_download_upload_scripts/upload_to_minio.sh @@ -1,12 +1,18 @@ #!/bin/bash -# Script to upload model artifacts to MinIO +# Script to upload model artifacts to MinIO or any S3-compatible storage (e.g. SeaweedFS). +# Prefer generic env vars; MINIO_* are accepted for backward compatibility. SOURCE_DIR="./model_artifacts" -MINIO_ENDPOINT="http://127.0.0.1:9000" -# Change the bucket name to the one you want to use. It will be created if it doesn't exist. -MINIO_BUCKET="ai-platform-artifacts-bucket" -MINIO_ROOT_USER="minioadmin" -MINIO_ROOT_PASSWORD="minioadmin" +# Generic names (preferred); fallback to MINIO_* for backward compatibility +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${MINIO_ENDPOINT:-http://127.0.0.1:9000}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${MINIO_BUCKET:-ai-platform-bucket-minio-us-east-2}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${MINIO_ROOT_USER:-${MINIO_ACCESS_KEY:-minioadmin}}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${MINIO_ROOT_PASSWORD:-${MINIO_SECRET_KEY:-minioadmin}}}" +# Internal use (script uses one set) +MINIO_ENDPOINT="${OBJECT_STORE_ENDPOINT}" +MINIO_BUCKET="${OBJECT_STORE_BUCKET}" +MINIO_ROOT_USER="${OBJECT_STORE_ACCESS_KEY}" +MINIO_ROOT_PASSWORD="${OBJECT_STORE_SECRET_KEY}" # Convert bucket name to lowercase (S3/MinIO requirement) ORIGINAL_BUCKET="$MINIO_BUCKET" @@ -176,7 +182,7 @@ if [ $CONNECTION_STATUS -ne 0 ]; then echo "" # Check for specific error types - if echo "$CONNECTION_TEST" | grep -q "Access Denied\|InvalidAccessKeyId\|SignatureDoesNotMatch"; then + if echo "$CONNECTION_TEST" | grep -qi "Access Denied\|InvalidAccessKeyId\|SignatureDoesNotMatch\|signature.*does not match"; then echo 
"Error: Authentication failed - Invalid credentials" echo "" echo "Current configuration:" @@ -189,7 +195,8 @@ if [ $CONNECTION_STATUS -ne 0 ]; then echo " 3. Default MinIO credentials are usually:" echo " - Username: minioadmin" echo " - Password: minioadmin" - echo " 4. If you changed MinIO credentials, update them in this script" + echo " 4. If you installed MinIO with a custom password (e.g. install_minio_ec2.sh --password 'xxx'), run:" + echo " MINIO_ROOT_PASSWORD='your-password' ./upload_to_minio.sh" elif echo "$CONNECTION_TEST" | grep -q "dial tcp\|connection refused\|no such host"; then echo "Error: Cannot reach MinIO endpoint" echo "" @@ -252,10 +259,10 @@ for artifact_path in "$SOURCE_DIR"/*; do echo "Processing: $id" if [[ -d "$artifact_path" ]]; then - # It's a directory - upload recursively + # It's a directory - upload recursively (trailing slash on source = copy contents, not directory as single object) echo "Uploading directory to MinIO: $MINIO_ENDPOINT/$MINIO_BUCKET/model_artifacts/$id/" - mc cp --recursive "$artifact_path" "$MINIO_ALIAS/$MINIO_BUCKET/model_artifacts/$id/" + mc cp --recursive "$artifact_path/" "$MINIO_ALIAS/$MINIO_BUCKET/model_artifacts/$id/" else # It's a file - upload directly echo "Uploading file to MinIO: $MINIO_ENDPOINT/$MINIO_BUCKET/model_artifacts/$id" diff --git a/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh b/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh new file mode 100644 index 0000000..8f4bf08 --- /dev/null +++ b/tools/artifacts_download_upload_scripts/upload_to_seaweedfs.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# Upload model artifacts to SeaweedFS (S3-compatible). If SeaweedFS is not running, +# the script can install and start it (weed binary, no Docker). Creates configured +# buckets and uploads from ./model_artifacts. Use OBJECT_STORE_* or SEAWEEDFS_* env vars. 
+ +set -e + +SOURCE_DIR="./model_artifacts" +SEAWEEDFS_PORT="${SEAWEEDFS_PORT:-8333}" + +# Endpoint and credentials (prefer generic OBJECT_STORE_*, then SEAWEEDFS_*). +# SeaweedFS S3 has no built-in users: if the server is started with credentials (env or -config), +# they must match these values. This script sets them when it auto-starts SeaweedFS. +OBJECT_STORE_ENDPOINT="${OBJECT_STORE_ENDPOINT:-${SEAWEEDFS_ENDPOINT:-http://127.0.0.1:8333}}" +OBJECT_STORE_BUCKET="${OBJECT_STORE_BUCKET:-${SEAWEEDFS_BUCKET:-ai-platform-bucket}}" +OBJECT_STORE_ACCESS_KEY="${OBJECT_STORE_ACCESS_KEY:-${SEAWEEDFS_ACCESS_KEY:-minioadmin}}" +OBJECT_STORE_SECRET_KEY="${OBJECT_STORE_SECRET_KEY:-${SEAWEEDFS_SECRET_KEY:-minioadmin}}" +# Bucket list to create (comma-separated). If unset, only primary bucket is created. +SEAWEEDFS_BUCKETS="${SEAWEEDFS_BUCKETS:-$OBJECT_STORE_BUCKET}" +# Set to 1 to skip auto-install and only fail if SeaweedFS is not reachable. +SEAWEEDFS_SKIP_INSTALL="${SEAWEEDFS_SKIP_INSTALL:-0}" +# Retries for each artifact upload (large files can trigger transient "internal error"). +SEAWEEDFS_UPLOAD_RETRIES="${SEAWEEDFS_UPLOAD_RETRIES:-3}" +SEAWEEDFS_UPLOAD_RETRY_DELAY="${SEAWEEDFS_UPLOAD_RETRY_DELAY:-15}" +# Max concurrent uploads (1 = sequential). +SEAWEEDFS_PARALLEL_JOBS="${SEAWEEDFS_PARALLEL_JOBS:-1}" +# Path to log failed artifact ids and messages (appended to on failure). +SEAWEEDFS_ERROR_LOG="${SEAWEEDFS_ERROR_LOG:-./seaweedfs_upload_errors.log}" +# Set to 1 to skip uploading a file if it already exists at destination (avoids re-uploading on script re-runs). +SEAWEEDFS_SKIP_EXISTING="${SEAWEEDFS_SKIP_EXISTING:-0}" +# Wait up to this many seconds for a volume server to appear in the cluster before uploading (avoids "0 node candidates"). +# Set to 0 to skip. Only used when endpoint is local and weed is available. +SEAWEEDFS_WAIT_VOLUME_SERVER="${SEAWEEDFS_WAIT_VOLUME_SERVER:-60}" +# Master address for cluster.ps (default: host from endpoint with port 9333). 
+SEAWEEDFS_MASTER="${SEAWEEDFS_MASTER:-}" +# Max volumes per volume server (default 100; 0 = auto from disk). Avoids "0 node candidates" when default (e.g. 7) is reached. +SEAWEEDFS_VOLUME_MAX="${SEAWEEDFS_VOLUME_MAX:-100}" + +# Normalize primary bucket to lowercase +OBJECT_STORE_BUCKET=$(echo "$OBJECT_STORE_BUCKET" | tr '[:upper:]' '[:lower:]') + +# ---- Check SeaweedFS is reachable ---- +seaweedfs_ok() { + local code + code=$(curl -s -o /dev/null -w "%{http_code}" "${OBJECT_STORE_ENDPOINT}" 2>/dev/null || echo "000") + [[ "$code" == "200" || "$code" == "403" || "$code" == "400" ]] && return 0 + return 1 +} + +# ---- Install and start SeaweedFS (weed binary from GitHub releases) ---- +install_and_start_seaweedfs() { + local os arch tag asset url tmpdir bindir + os="$(uname -s)" + arch="$(uname -m)" + case "$os" in + Linux) case "$arch" in x86_64|amd64) asset="linux_amd64.tar.gz";; aarch64|arm64) asset="linux_arm64.tar.gz";; *) echo "Unsupported arch: $arch"; return 1;; esac ;; + Darwin) case "$arch" in x86_64|amd64) asset="darwin_amd64.tar.gz";; arm64) asset="darwin_arm64.tar.gz";; *) echo "Unsupported arch: $arch"; return 1;; esac ;; + *) echo "Unsupported OS: $os"; return 1 ;; + esac + echo "Installing SeaweedFS (weed) for $os $arch..." + tag=$(curl -sL https://api.github.com/repos/seaweedfs/seaweedfs/releases/latest | grep '"tag_name":' | sed -E 's/.*"tag_name":\s*"([^"]+)".*/\1/') + [[ -z "$tag" ]] && { echo "Could not get latest SeaweedFS release tag."; return 1; } + url="https://github.com/seaweedfs/seaweedfs/releases/download/${tag}/${asset}" + tmpdir="$(mktemp -d)" + if ! curl -sSL -o "$tmpdir/weed.tar.gz" "$url"; then + echo "Download failed: $url"; rm -rf "$tmpdir"; return 1 + fi + tar -xzf "$tmpdir/weed.tar.gz" -C "$tmpdir" + [[ ! 
-f "$tmpdir/weed" ]] && { echo "weed binary not found in archive."; rm -rf "$tmpdir"; return 1; } + chmod +x "$tmpdir/weed" + if [[ "$(id -u)" -eq 0 ]] && [[ -d /usr/local/bin ]]; then + mv "$tmpdir/weed" /usr/local/bin/weed + bindir="/usr/local/bin" + elif command -v sudo &>/dev/null && [[ -d /usr/local/bin ]]; then + sudo mv "$tmpdir/weed" /usr/local/bin/weed + bindir="/usr/local/bin" + else + mkdir -p ~/.local/bin + mv "$tmpdir/weed" ~/.local/bin/weed + bindir="$HOME/.local/bin" + export PATH="$PATH:$bindir" + echo "Note: weed installed to $bindir (ensure it is in your PATH)" + fi + rm -rf "$tmpdir" + echo "Installed: $bindir/weed" + "$bindir/weed" version 2>/dev/null || true + echo "Starting SeaweedFS (master, volume, filer, S3 on port ${SEAWEEDFS_PORT}, volume.max=${SEAWEEDFS_VOLUME_MAX})..." + # SeaweedFS S3 validates credentials when provided; use script defaults so mc alias works. + export AWS_ACCESS_KEY_ID="${OBJECT_STORE_ACCESS_KEY:-minioadmin}" + export AWS_SECRET_ACCESS_KEY="${OBJECT_STORE_SECRET_KEY:-minioadmin}" + nohup env AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" "$bindir/weed" server -s3 -ip.bind=0.0.0.0 -volume.max="$SEAWEEDFS_VOLUME_MAX" > /tmp/seaweedfs.log 2>&1 & + echo $! > /tmp/seaweedfs.pid + local i + for i in {1..30}; do + sleep 2 + if seaweedfs_ok; then echo "SeaweedFS is up."; return 0; fi + done + echo "Timeout waiting for SeaweedFS. Check /tmp/seaweedfs.log" + return 1 +} + +if ! seaweedfs_ok; then + if [[ "$SEAWEEDFS_SKIP_INSTALL" == "1" ]]; then + echo "Error: SeaweedFS S3 gateway is not reachable at $OBJECT_STORE_ENDPOINT" + echo "Set OBJECT_STORE_ENDPOINT or start SeaweedFS manually (weed server -s3)." 
+    exit 1
+  fi
+  # Only auto-install when endpoint is local (otherwise we'd start local server while user meant a remote one)
+  if [[ "$OBJECT_STORE_ENDPOINT" != *"127.0.0.1"* ]] && [[ "$OBJECT_STORE_ENDPOINT" != *"localhost"* ]]; then
+    echo "Error: SeaweedFS is not reachable at $OBJECT_STORE_ENDPOINT"
+    echo "For a remote endpoint, start SeaweedFS on that host or set OBJECT_STORE_ENDPOINT=http://127.0.0.1:8333 and run again to install locally."
+    exit 1
+  fi
+  echo "SeaweedFS not reachable at $OBJECT_STORE_ENDPOINT. Attempting to install and start..."
+  if ! install_and_start_seaweedfs; then
+    echo ""
+    echo "Install failed or SeaweedFS did not start. You can:"
+    echo " 1. Install manually: https://github.com/seaweedfs/seaweedfs/releases"
+    echo " 2. Run: weed server -s3"
+    echo " 3. Or set OBJECT_STORE_ENDPOINT=http://<host>:8333 if SeaweedFS runs elsewhere"
+    exit 1
+  fi
+fi
+echo "SeaweedFS reachable at $OBJECT_STORE_ENDPOINT"
+
+# ---- Wait for volume server (avoids "Not enough data nodes found" right after restart) ----
+if [[ "$SEAWEEDFS_WAIT_VOLUME_SERVER" -gt 0 ]] && command -v weed &>/dev/null; then
+  if [[ "$OBJECT_STORE_ENDPOINT" == *"127.0.0.1"* ]] || [[ "$OBJECT_STORE_ENDPOINT" == *"localhost"* ]]; then
+    master="${SEAWEEDFS_MASTER}"
+    [[ -z "$master" ]] && master="127.0.0.1:9333"
+    echo "Waiting up to ${SEAWEEDFS_WAIT_VOLUME_SERVER}s for a volume server in the cluster..."
+    waited=0
+    while [[ $waited -lt "$SEAWEEDFS_WAIT_VOLUME_SERVER" ]]; do
+      out=$(echo -e "cluster.ps\nexit" | weed shell -master="$master" 2>/dev/null) || true
+      if echo "$out" | grep -q "volume servers" && echo "$out" | grep -q ":8080"; then
+        echo "Volume server is ready."
+        break
+      fi
+      sleep 2
+      waited=$((waited + 2))
+    done
+    if [[ $waited -ge "$SEAWEEDFS_WAIT_VOLUME_SERVER" ]]; then
+      echo "Warning: no volume server seen after ${SEAWEEDFS_WAIT_VOLUME_SERVER}s. Upload may fail with 'Not enough data nodes'. Wait longer and re-run, or set SEAWEEDFS_WAIT_VOLUME_SERVER=0 to skip." 
+ fi + fi +fi +echo "" + +# ---- Install mc if needed (same pattern as upload_to_minio.sh) ---- +OS="$(uname -s)" +ARCH="$(uname -m)" +if ! command -v mc &>/dev/null; then + echo "Installing MinIO Client (mc)..." + if [[ "$OS" == "Darwin" ]]; then + if command -v brew &>/dev/null; then + brew install minio/stable/mc + else + if [[ "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/darwin-arm64/mc"; else MC_URL="https://dl.min.io/client/mc/release/darwin-amd64/mc"; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc && sudo mv /tmp/mc /usr/local/bin/mc + fi + elif [[ "$OS" == "Linux" ]]; then + if [[ "$ARCH" == "x86_64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-amd64/mc"; elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then MC_URL="https://dl.min.io/client/mc/release/linux-arm64/mc"; else echo "Unsupported arch: $ARCH"; exit 1; fi + curl -o /tmp/mc "$MC_URL" && chmod +x /tmp/mc + sudo mv /tmp/mc /usr/local/bin/mc 2>/dev/null || { mkdir -p ~/.local/bin; mv /tmp/mc ~/.local/bin/mc; export PATH="$PATH:$HOME/.local/bin"; } + else + echo "Unsupported OS: $OS"; exit 1 + fi +fi +mc --version +echo "" + +# ---- Source dir and count ---- +[[ ! -d "$SOURCE_DIR" ]] && { echo "Error: $SOURCE_DIR not found. Run ./download_from_huggingface.sh first."; exit 1; } +artifact_count=$(find "$SOURCE_DIR" -mindepth 1 -maxdepth 1 | wc -l | tr -d ' ') +[[ "$artifact_count" -eq 0 ]] && { echo "No artifacts in $SOURCE_DIR."; exit 1; } +echo "Found $artifact_count artifacts to upload." 
+echo "" + +# ---- Configure mc alias ---- +MC_ALIAS="seaweedfs" +mc alias set "$MC_ALIAS" "$OBJECT_STORE_ENDPOINT" "$OBJECT_STORE_ACCESS_KEY" "$OBJECT_STORE_SECRET_KEY" --api S3v4 + +# ---- Create buckets (from list + primary) ---- +for b in $(echo "$SEAWEEDFS_BUCKETS" | tr ',' '\n'); do + b=$(echo "$b" | tr '[:upper:]' '[:lower:]' | tr -d ' ') + [[ -z "$b" ]] && continue + mc mb "${MC_ALIAS}/${b}" --ignore-existing 2>/dev/null || true +done +mc mb "${MC_ALIAS}/${OBJECT_STORE_BUCKET}" --ignore-existing 2>/dev/null || true +echo "" + +# ---- Upload with retries (single file; large files can trigger "internal error") ---- +do_upload_file() { + local src="$1" dest="$2" attempt=1 + if [[ "$SEAWEEDFS_SKIP_EXISTING" == "1" ]]; then + mc stat "$dest" &>/dev/null && return 0 + fi + while [[ $attempt -le "$SEAWEEDFS_UPLOAD_RETRIES" ]]; do + mc cp "$src" "$dest" && return 0 + echo "Attempt $attempt/$SEAWEEDFS_UPLOAD_RETRIES failed. Retrying in ${SEAWEEDFS_UPLOAD_RETRY_DELAY}s..." + attempt=$((attempt + 1)) + [[ $attempt -le "$SEAWEEDFS_UPLOAD_RETRIES" ]] && sleep "$SEAWEEDFS_UPLOAD_RETRY_DELAY" + done + return 1 +} + +# Upload a directory artifact file-by-file (per-file retries; one failed file doesn't re-upload the rest). +upload_artifact_dir() { + local artifact_path="$1" dest_base="$2" id="$3" failed=0 f rel + while IFS= read -r -d '' f; do + rel="${f#${artifact_path}/}" + if ! 
do_upload_file "$f" "${dest_base}/${rel}"; then + echo "$(date -Iseconds 2>/dev/null || date) FAILED: $id $rel" >> "$SEAWEEDFS_ERROR_LOG" + failed=1 + fi + done < <(find "$artifact_path" -type f -print0) + return $failed +} + +# Clear error log from previous runs +: > "$SEAWEEDFS_ERROR_LOG" + +# Build list of artifacts for parallel upload +artifact_paths=() +for artifact_path in "$SOURCE_DIR"/*; do + [[ -e "$artifact_path" ]] || continue + artifact_paths+=("$artifact_path") +done + +parallel_jobs="$SEAWEEDFS_PARALLEL_JOBS" +[[ "$parallel_jobs" -lt 1 ]] && parallel_jobs=1 +idx=0 +total=${#artifact_paths[@]} +echo "Uploading $total artifacts (per-file) with up to $parallel_jobs parallel job(s). Errors logged to: $SEAWEEDFS_ERROR_LOG" +[[ "$SEAWEEDFS_SKIP_EXISTING" == "1" ]] && echo "Skip-existing is ON: files already present at destination will be skipped." +echo "" + +while [[ $idx -lt $total ]]; do + batch=0 + while [[ $batch -lt $parallel_jobs && $idx -lt $total ]]; do + artifact_path="${artifact_paths[$idx]}" + id=$(basename "$artifact_path") + dest_base="${MC_ALIAS}/${OBJECT_STORE_BUCKET}/model_artifacts/$id" + ( + if [[ -d "$artifact_path" ]]; then + upload_artifact_dir "$artifact_path" "$dest_base" "$id" || exit 1 + else + do_upload_file "$artifact_path" "$dest_base" || { echo "$(date -Iseconds 2>/dev/null || date) FAILED: $id" >> "$SEAWEEDFS_ERROR_LOG"; exit 1; } + fi + echo "Completed: $id" + ) & + batch=$((batch + 1)) + idx=$((idx + 1)) + done + wait || true +done + +if [[ -s "$SEAWEEDFS_ERROR_LOG" ]]; then + echo "" + echo "One or more artifacts failed. See $SEAWEEDFS_ERROR_LOG:" + cat "$SEAWEEDFS_ERROR_LOG" + exit 1 +fi +echo "Upload complete. 
Uploaded $artifact_count artifacts to ${OBJECT_STORE_ENDPOINT}/${OBJECT_STORE_BUCKET}/model_artifacts/" \ No newline at end of file diff --git a/tools/cluster_setup/EKS_README.md b/tools/cluster_setup/EKS_README.md index f160c1a..c65c94e 100644 --- a/tools/cluster_setup/EKS_README.md +++ b/tools/cluster_setup/EKS_README.md @@ -53,13 +53,14 @@ The script installs everything needed for the AI Platform: 4. **EBS CSI Driver** - Persistent volumes backed by AWS EBS 5. **Cluster Autoscaler** - Automatic node scaling based on demand 6. **Cert-Manager** - Automated certificate management -7. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana -8. **OpenTelemetry Operator** - Distributed tracing and telemetry -9. **NVIDIA Device Plugin** - GPU support for AI workloads -10. **KubeRay Operator** - Ray cluster management for distributed AI -11. **Splunk Operator** - Splunk Enterprise management -12. **Splunk AI Platform Operator** - AI platform orchestration -13. **AI Platform CR** - Complete AI deployment with features +7. **Object storage** - AWS S3 or external S3-compatible only (MinIO, SeaweedFS, etc.; no in-cluster MinIO install) +8. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana +9. **OpenTelemetry Operator** - Distributed tracing and telemetry +10. **NVIDIA Device Plugin** - GPU support for AI workloads +11. **KubeRay Operator** - Ray cluster management for distributed AI +12. **Splunk Operator** - Splunk Enterprise management +13. **Splunk AI Platform Operator** - AI platform orchestration +14. **AI Platform CR** - Complete AI deployment with features ### AWS Integration Features @@ -539,10 +540,53 @@ storage: # (3-63 chars, lowercase, numbers, hyphens) ``` +**Generic object store (`storage.objectStore.type`)** +Only **AWS S3** or **external S3-compatible** storage is supported (no in-cluster MinIO install). Set `storage.objectStore.type` to `aws`, `s3compat`, `minio`, or `seaweedfs` (default is `aws` when unset). 
The script sets the AIPlatform `objectStorage.path` and creates a credentials secret for s3compat/minio/seaweedfs; you must provide `endpoint` and credentials. See [Object Storage Selection](../../docs/configuration/object-storage.md).
+
+**External S3-compatible (MinIO, SeaweedFS, etc.)**
+Set `storage.objectStore.type` to `minio`, `s3compat`, or `seaweedfs`, and set `storage.objectStore.endpoint` (e.g. `http://<minio-host>:9000` for MinIO) and credentials. You can run MinIO or SeaweedFS on EC2 or elsewhere; use `install_minio_ec2.sh` to install MinIO on an EC2 in the same VPC if desired. Pre-populate artifacts before cluster setup. The Splunk app (when using `splunkStandalone.localAppPath`) is not uploaded to external object storage automatically; upload it to your bucket at `apps/` via console or `mc`/`aws s3 --endpoint-url`.
+
+**S3-compatible / SeaweedFS (bring your own)**
+- **Generic (`s3compat`):** Set `storage.objectStore.type: s3compat`, `storage.objectStore.endpoint`, `storage.objectStore.bucket`, and credentials. The script creates the credentials secret and sets the path to `s3compat://bucket`; it does not install any storage. Use for any S3-compatible backend (Ceph, custom gateway, etc.).
+- **SeaweedFS:** Set `storage.objectStore.type: seaweedfs`, `storage.objectStore.endpoint` (e.g. `http://seaweedfs-s3:8333`), `storage.objectStore.bucket`, and credentials (env `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD` or `objectStore.auth`). The script does not install SeaweedFS; it only creates the credentials secret and sets the AIPlatform path to `seaweedfs://bucket`. Ensure your SeaweedFS S3 gateway is reachable from the cluster.
+
+**Ensuring SeaweedFS is used (not MinIO)**
+To force the stack to use SeaweedFS instead of MinIO:
+
+1. **Config:** In `cluster-config.yaml` set `storage.objectStore.type: "seaweedfs"` and `storage.objectStore.endpoint` to your SeaweedFS S3 URL with **port 8333** (e.g. `http://<seaweedfs-host>:8333`). 
MinIO uses port 9000; using 8333 avoids pointing at MinIO by mistake. +2. **Preflight:** When you run the install script, preflight prints `Object storage: external S3-compatible (seaweedfs)` and `SeaweedFS endpoint: ...`. If the endpoint shows `:9000`, the script warns you to use `:8333` for SeaweedFS. +3. **After install:** Confirm the AIPlatform CR uses SeaweedFS: + ```bash + kubectl -n ai-platform get aiplatform -o yaml | grep -A6 objectStorage + ``` + You should see `path: seaweedfs://` and `endpoint: "http://...:8333"`. The secret name remains `minio-credentials` (used for any S3-compatible store). + +**Secure MinIO credentials (recommended)** +The script reads MinIO credentials in this order: **environment variables first**, then config file. Prefer not storing passwords in `cluster-config.yaml` (e.g. to avoid committing secrets to Git). + +| Approach | How | When to use | +|----------|-----|-------------| +| **Environment variables** | Export before running the script: `export MINIO_ROOT_USER=minioadmin` and `export MINIO_ROOT_PASSWORD=''`. You can leave `storage.objectStore.auth.rootUser` / `rootPassword` empty or omit them in config; env takes precedence. | Local runs, CI/CD (set secrets in pipeline), one-off setups. | +| **Config file only** | Set `storage.objectStore.auth.rootUser` and `storage.objectStore.auth.rootPassword` in `cluster-config.yaml`. | Quick testing only; avoid if the file is in version control. | +| **Pre-created Kubernetes Secret** | Create the secret yourself (e.g. from Vault or AWS Secrets Manager) in the AI platform namespace as `minio-credentials` with keys `s3_access_key` and `s3_secret_key`. The script can still create the secret from env/config; for stricter control, use a separate flow that only references the existing secret. | GitOps, when you already have a secrets pipeline. | +| **External secret manager** | Store credentials in AWS Secrets Manager, HashiCorp Vault, or similar. 
Before running the script, fetch the secret and set `MINIO_ROOT_USER` and `MINIO_ROOT_PASSWORD` (e.g. via a wrapper or CI step). Do not put the password in config. | Production; keeps secrets out of config and Git. | + +Example (MinIO credentials from environment only; no secrets in config): + +```bash +export MINIO_ROOT_USER=minioadmin +export MINIO_ROOT_PASSWORD='your-secure-password' +CONFIG_FILE=./cluster-config.yaml ./eks_cluster_with_stack.sh install +``` + +**Idempotency and existing VPC** +- The install is **idempotent**: if the EKS cluster already exists, the script skips cluster creation and only runs reconcile (addons, operators, AIPlatform). Set `cluster.useExisting: true` to require an existing cluster (script fails if the cluster is not found). +- **Use an existing VPC:** Provide `cluster.subnets` (private and public subnet IDs and AZs). eksctl will use that VPC and will not create a new one. + **Important Notes:** - **Cluster Name**: Must be DNS-1123 compliant (lowercase letters, numbers, hyphens; start/end with alphanumeric) -- **S3 Bucket**: Must be globally unique across all AWS accounts -- **Subnets**: If provided, script validates NAT Gateway, Internet Gateway, and route tables exist +- **S3 Bucket**: Must be globally unique across all AWS accounts (ignored when MinIO is enabled) +- **Subnets**: If provided, script validates NAT Gateway, Internet Gateway, and route tables exist; cluster uses this existing VPC - **Subnets**: Leave empty or comment out to let eksctl create a new VPC automatically **What each section configures:** @@ -551,8 +595,10 @@ storage: |---------|--------------|------------------| | `cluster.name` | EKS cluster name | ✅ **REQUIRED:** Change to your cluster name | | `cluster.region` | AWS region | ✅ **REQUIRED:** Change to your region | -| `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs | -| `storage.s3Bucket` | S3 bucket for AI artifacts | ✅ **REQUIRED:** 
Choose unique name | +| `cluster.useExisting` | Use existing cluster only (do not create) | ⚙️ Set `true` to skip cluster creation; script fails if cluster not found | +| `cluster.subnets` | VPC subnets for nodes | ⚙️ **OPTIONAL:** Leave empty for new VPC or provide existing subnet IDs to use existing VPC | +| `storage.s3Bucket` | S3 bucket for AI artifacts (used when `objectStore.type` is aws) | ✅ **REQUIRED** if not using MinIO/SeaweedFS | +| `storage.objectStore` | Object store: `type` (aws \| s3compat \| minio \| seaweedfs), `bucket`, `endpoint`, `auth`. Default type is `aws` when unset. External only (no in-cluster install). | ⚙️ Required for s3compat/minio/seaweedfs: set `endpoint` and credentials. See [Object Storage Selection](../../docs/configuration/object-storage.md). | | `images.registry` | Container registry URL | ✅ **REQUIRED:** Your ECR/Docker registry | | `images.*` | All container images | ✅ **REQUIRED:** Configure all image paths | | `nodeGroups.cpu` | CPU node group settings | ⚙️ Optional: adjust size/type | @@ -723,18 +769,93 @@ CONFIG_FILE=./my-cluster-config.yaml ./eks_cluster_with_stack.sh install ### 4. Verify Installation +After running `eks_cluster_with_stack.sh install` (or upgrade) with the latest operator image, use the commands below to verify the setup. Default namespace and AIPlatform name come from `cluster-config.yaml` (`aiPlatform.namespace` and `aiPlatform.name`); if you use a custom config, set `AI_NS` and `AI_PLATFORM_NAME` accordingly. 
+ ```bash # Set kubeconfig (done automatically by script) export KUBECONFIG=~/.kube/config -# Check cluster +# ----- Optional: load namespace/name from your config ----- +# CONFIG_FILE="${CONFIG_FILE:-./cluster-config.yaml}" +# AI_NS="$(yq eval '.aiPlatform.namespace' "$CONFIG_FILE")" +# AI_PLATFORM_NAME="$(yq eval '.aiPlatform.name' "$CONFIG_FILE")" +# Or use defaults: +export AI_NS="${AI_NS:-ai-platform}" +export AI_PLATFORM_NAME="${AI_PLATFORM_NAME:-splunk-ai-stack}" +export SPLUNK_AI_NS="${SPLUNK_AI_NS:-splunk-ai-operator-system}" +``` + +**1. Cluster and nodes** + +```bash kubectl get nodes +kubectl get nodes -o wide +``` -# Check AI Platform -kubectl get aiplatform -n ai-platform +**2. Splunk AI Operator (confirm it is running the image you deployed)** -# Check all pods -kubectl get pods --all-namespaces +```bash +kubectl get deploy -n "$SPLUNK_AI_NS" -l app.kubernetes.io/name=splunk-ai-operator -o wide +kubectl get pods -n "$SPLUNK_AI_NS" -l app.kubernetes.io/name=splunk-ai-operator +# Show operator image (replace deployment name if different) +kubectl get deploy -n "$SPLUNK_AI_NS" -o jsonpath='{.items[0].spec.template.spec.containers[0].image}'; echo +``` + +**3. AIPlatform CR and status** + +```bash +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.status.conditions[*].type}{"\n"}{.status.conditions[*].status}'; echo +# Detailed readiness (expect Ready=True when healthy) +kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")]}' | jq . +``` + +**4. 
Object storage secret (MinIO/S3 credentials for serve config)** + +```bash +# Secret name comes from AIPlatform spec.objectStorage.secretRef +SECRET_NAME="$(kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.objectStorage.secretRef}')" +echo "SecretRef: ${SECRET_NAME:-}" +kubectl get secret "${SECRET_NAME:-minio-credentials}" -n "$AI_NS" 2>/dev/null && echo "✓ Secret exists" || echo "✗ Secret missing" +kubectl get secret "${SECRET_NAME:-minio-credentials}" -n "$AI_NS" -o jsonpath='{.data}' 2>/dev/null | jq -r 'keys[]' | grep -E 's3_access_key|s3_secret_key' && echo "✓ Required keys present" || echo "✗ Check s3_access_key / s3_secret_key" +``` + +**5. RayService and serve config (object store credentials in apps)** + +```bash +kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" +# Count S3COMPAT_OBJECT_STORE_ACCESS_KEY in serve config (expect > 0 when using S3-compatible storage) +kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" -o jsonpath='{.spec.serveConfigV2}' | grep -o 'S3COMPAT_OBJECT_STORE_ACCESS_KEY' | wc -l +``` + +**6. Ray and application pods** + +```bash +kubectl get pods -n "$AI_NS" -l ray.io/cluster="$AI_PLATFORM_NAME" +kubectl get pods -n "$AI_NS" -l ai.splunk.com/platform="$AI_PLATFORM_NAME" +``` + +**7. Services (Ray Serve, Weaviate)** + +```bash +kubectl get svc -n "$AI_NS" -l ray.io/cluster="$AI_PLATFORM_NAME" +kubectl get svc -n "$AI_NS" | grep -E "ray|weaviate" +``` + +**8. 
Events (recent issues)** + +```bash +kubectl get events -n "$AI_NS" --sort-by='.lastTimestamp' | tail -30 +kubectl describe aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" | tail -40 +``` + +**Quick one-liner summary** + +```bash +echo "--- Operator ---"; kubectl get deploy -n "$SPLUNK_AI_NS" -o 'custom-columns=NAME:.metadata.name,READY:.status.readyReplicas,IMAGE:.spec.template.spec.containers[0].image' +echo "--- AIPlatform ---"; kubectl get aiplatform "$AI_PLATFORM_NAME" -n "$AI_NS" -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[0].status' +echo "--- RayService ---"; kubectl get rayservice "$AI_PLATFORM_NAME" -n "$AI_NS" +echo "--- Pods ---"; kubectl get pods -n "$AI_NS" --no-headers | wc -l; kubectl get pods -n "$AI_NS" | head -20 ``` --- @@ -2136,6 +2257,40 @@ EOF ## Troubleshooting +### Ray / AI model deployment: "Invalid repository ID or local directory" + +If a Ray Serve replica (e.g. `Llama31Instruct:LLMDeploymentL40S`) fails with: + +```text +Invalid repository ID or local directory specified: '/home/ray/.cache/s3/artifacts/model_artifacts/llama31-8b-instruct'. +Please verify the following requirements: +1. Provide a valid Hugging Face repository ID. +2. Specify a local directory that contains a recognized configuration file (e.g. config.json). +``` + +the model is loaded from object storage (S3/MinIO) into that path inside the pod. The path is missing or incomplete because the download from object storage failed or the model was never uploaded. + +**Checklist:** + +1. 
**Model is in MinIO/S3**
+   Upload the model so the bucket has the prefix `model_artifacts/llama31-8b-instruct/` with at least `config.json` and the model weights (see [artifacts README](../artifacts_download_upload_scripts/README.md)):
+   - Download: `./tools/artifacts_download_upload_scripts/download_from_huggingface.sh`
+   - Upload: `./tools/artifacts_download_upload_scripts/upload_to_minio.sh` (set `S3COMPAT_OBJECT_STORE_ENDPOINT`, `S3COMPAT_OBJECT_STORE_BUCKET`, and credentials to match your `cluster-config.yaml`; `MINIO_*` env vars are also accepted).
+
+2. **External MinIO reachable from EKS**
+   If using external MinIO (e.g. EC2), ensure:
+   - `storage.objectStore.endpoint` in `cluster-config.yaml` is correct (e.g. `http://<minio-host>:9000`).
+   - The EC2 security group allows **inbound TCP 9000** from your EKS node security group or VPC CIDR (see `install_minio_ec2.sh` output).
+   - From a Ray worker pod:
+     `kubectl exec -it <ray-worker-pod> -n <namespace> -- curl -s -o /dev/null -w "%{http_code}" http://<minio-host>:9000/minio/health/live`
+
+3. **Credentials secret**
+   AIPlatform must have `objectStorage.secretRef` set (e.g. `minio-credentials`). The secret must contain `s3_access_key` and `s3_secret_key` matching the MinIO user that can read the bucket:
+   - `kubectl get secret minio-credentials -n <namespace> -o jsonpath='{.data}'`
+
+4. **Full troubleshooting steps**
+   See [Troubleshooting: Invalid repository ID or local directory](../../docs/troubleshooting.md) in the main docs for verification commands and details.
+
 ### Script Execution Issues
 
 #### Issue: Script Exits Silently Without Error Message
diff --git a/tools/cluster_setup/K0S_README.md b/tools/cluster_setup/K0S_README.md
index 18668d5..bb1adfc 100644
--- a/tools/cluster_setup/K0S_README.md
+++ b/tools/cluster_setup/K0S_README.md
@@ -390,7 +390,7 @@ The script installs everything needed for the AI Platform:
 
 1. **k0s Kubernetes Cluster** (v1.30+) - CNCF certified Kubernetes
 2. **Calico CNI** - High-performance networking with VXLAN
-3. 
**MinIO** - S3-compatible object storage (replaces AWS S3) +3. **MinIO** - S3-compatible object storage (replaces AWS S3). The AI Platform also supports SeaweedFS and other S3-compatible stores via `s3compat://`, `minio://`, or `seaweedfs://`; see [Object storage](../../docs/configuration/object-storage.md) for path schemes and configuration. 4. **Cert-Manager** - Automated certificate management 5. **Kube-Prometheus Stack** - Monitoring with Prometheus + Grafana 6. **OpenTelemetry Operator** - Distributed tracing and telemetry diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 39d6c5d..c44a36e 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -2237,8 +2237,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -4873,8 +4873,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. 
Required for diff --git a/tools/cluster_setup/cluster-config.yaml b/tools/cluster_setup/cluster-config.yaml index d3738db..513b425 100644 --- a/tools/cluster_setup/cluster-config.yaml +++ b/tools/cluster_setup/cluster-config.yaml @@ -13,29 +13,56 @@ # ---------- Cluster Configuration ---------- cluster: - useExisting: false - name: "my-ai-cluster" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) - region: "us-west-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) + useExisting: false # true = do not create cluster; use existing one (script fails if cluster not found) + name: "ai-tier-sok-test-east2" # CHANGE THIS: Your EKS cluster name (DNS-1123 compliant: lowercase, numbers, hyphens) + region: "us-east-2" # CHANGE THIS: Your AWS region (e.g., us-east-1, us-west-2, eu-west-1) k8sVersion: "1.31" # Kubernetes version (1.29, 1.30, 1.31 supported) + # When true: require subnets (existing VPC). On 'delete', only EKS and related resources are removed; VPC is preserved so you can redeploy (e.g. with MinIO on EC2 in same VPC). + preserveVpcOnDelete: false # Set true to keep VPC on delete and redeploy without recreating VPC - # If you donot provide any subnet information, eksctl will create a new VPC with public and private subnets automatically. + # To use an EXISTING VPC: provide subnets below; eksctl will not create a new VPC. Idempotent: cluster is only created if it does not exist. + # If you do not provide subnets, eksctl creates a new VPC and subnets automatically. 
# VPC Subnets - CHANGE ALL OF THESE to your actual subnet IDs # Find your subnets: aws ec2 describe-subnets --filters "Name=vpc-id,Values=vpc-xxxxx" --region us-west-2 - #subnets: - # private: # Private subnets (at least 2 in different AZs) - # - id: "subnet-1a2b3c4d5e6f7g8h" # CHANGE THIS: Your private subnet 1 - # az: "us-west-2a" # CHANGE THIS: Availability zone for subnet 1 - # - id: "subnet-9h8g7f6e5d4c3b2a" # CHANGE THIS: Your private subnet 2 - # az: "us-west-2b" # CHANGE THIS: Availability zone for subnet 2 - # public: # Public subnets (at least 2 in different AZs) - # - id: "subnet-a1b2c3d4e5f6g7h8" # CHANGE THIS: Your public subnet 1 - # az: "us-west-2a" # CHANGE THIS: Availability zone for subnet 1 - # - id: "subnet-h8g7f6e5d4c3b2a1" # CHANGE THIS: Your public subnet 2 - # az: "us-west-2b" # CHANGE THIS: Availability zone for subnet 2 - # - id: "subnet-1h2g3f4e5d6c7b8a" # OPTIONAL: Additional public subnet for HA - # az: "us-west-2c" # OPTIONAL: Third availability zone +# subnets: +# private: # Private subnets (at least 2 in different AZs) +# - id: "subnet-02734905b10e7ad5a" # CHANGE THIS: Your private subnet 1 +# az: "us-east-2b" # CHANGE THIS: Availability zone for subnet 1 +# - id: "subnet-0c1d7dc49788d11dc" # CHANGE THIS: Your private subnet 2 +# az: "us-east-2c" # CHANGE THIS: Availability zone for subnet 2 +# - id: "subnet-0f8f94998d65dfcd2" # CHANGE THIS: Your private subnet 2 +# az: "us-east-2a" +# public: # Public subnets (at least 2 in different AZs) +# - id: "subnet-0f0ea3b190a618540" # CHANGE THIS: Your public subnet 1 +# az: "us-east-2c" # CHANGE THIS: Availability zone for subnet 1 +# - id: "subnet-02b736130e7c2a787" # CHANGE THIS: Your public subnet 2 +# az: "us-east-2a" # CHANGE THIS: Availability zone for subnet 2 +# - id: "subnet-02c35a8cd0b5d90a5" # OPTIONAL: Additional public subnet for HA +# az: "us-east-2b" # OPTIONAL: Third availability zone # ---------- Node Groups ---------- +# +# GPU TYPE QUICK REFERENCE — set instanceType and 
defaultAcceleratorType (under aiPlatform) together: +# +# L40S (default): +# instanceType: g6e.12xlarge (4x L40S GPUs, 48 GB VRAM each) +# defaultAcceleratorType: L40S +# capacityReservation: not required +# availabilityZones: not required +# +# H100: +# instanceType: p5.4xlarge (8x H100 GPUs, 80 GB VRAM each; capacity reservation required) +# defaultAcceleratorType: H100 +# capacityReservation: required — uncomment block below and set id + az +# availabilityZones: required — must match capacityReservation.az +# maxSize: must equal desiredCapacity (capacity reservations are fixed-size) +# +# H100_NVL: +# instanceType: p4de.24xlarge (8x H100 NVL GPUs, 94 GB VRAM each) +# defaultAcceleratorType: H100_NVL +# capacityReservation: not required +# availabilityZones: not required +# nodeGroups: cpu: enabled: true # Set to false to skip CPU node group @@ -48,19 +75,46 @@ nodeGroups: gpu: enabled: true # Set to false to skip GPU nodes (saves cost) - instanceType: "g6e.12xlarge" # GPU instance type (g6e.12xlarge=4xL40S GPUs, g5.xlarge=1xA10G) + instanceType: "g6e.12xlarge" # CHANGE THIS: see GPU TYPE QUICK REFERENCE above desiredCapacity: 2 # Initial number of GPU nodes minSize: 2 # Minimum GPU nodes - maxSize: 4 # Maximum GPU nodes + maxSize: 4 # Maximum GPU nodes (set equal to desiredCapacity for H100) volumeSize: 1000 # EBS volume size per GPU node (GB) - larger for model storage volumeType: "gp3" # EBS volume type + # ── H100 ONLY ────────────────────────────────────────────────────────────── + # Capacity Reservation: required for P5/H100 instances (scarce capacity). + # Uncomment and fill in when defaultAcceleratorType is H100. + # capacityReservation: + # id: "cr-xxxxxxxxxxxxxxxxx" # CHANGE THIS: your capacity reservation ID + # az: "us-east-2c" # CHANGE THIS: AZ where the reservation exists + + # Availability Zones: lock GPU nodes to the AZ matching the capacity reservation. + # Uncomment and fill in when defaultAcceleratorType is H100. 
+ # availabilityZones: + # - "us-east-2c" # CHANGE THIS: must match capacityReservation.az + # ─────────────────────────────────────────────────────────────────────────── + # ---------- Storage Configuration ---------- +# Object storage: only AWS S3 or external S3-compatible (no in-cluster MinIO install). +# Use objectStore.type: aws (S3) or s3compat | minio | seaweedfs (external; endpoint + credentials required). storage: - s3Bucket: "my-company-ai-platform-bucket" # CHANGE THIS: Globally unique S3 bucket name + s3Bucket: "ai-platform-bucket-minio-us-east-2" # Used when objectStore.type is aws storageClass: "gp3" # Storage class for Kubernetes PVCs (gp3, gp2, io1, io2) vectorDbSize: "50Gi" # VectorDB persistent volume size + # Object store: aws (S3) or external S3-compatible (s3compat, minio, seaweedfs). No in-cluster install. + # - s3compat: generic S3 API (MinIO :9000, SeaweedFS S3 :8333, etc.) — AIPlatform path uses s3compat://bucket + # - minio: same wiring as s3compat but path uses minio:// (use if an older operator webhook rejects s3compat://) + # - seaweedfs: path uses seaweedfs:// (requires operator webhook that allows that scheme) + objectStore: + type: "minio" # aws | s3compat | minio | seaweedfs (external only for non-aws) + bucket: "ai-platform-bucket-minio-us-east-2" + endpoint: "http://13.59.216.105:9000" # MinIO API (9000) or SeaweedFS S3 gateway (8333) + auth: + rootUser: "minioadmin" + rootPassword: "minioadmin" # Must match SeaweedFS env (AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY) or MinIO root + # ---------- Container Images Configuration ---------- images: # ================================================================================== @@ -82,7 +136,7 @@ images: # # REQUIRED: Specify your private registry URL for custom images # Leave empty to use Docker Hub defaults for all images - registry: "1234567890.dkr.ecr.us-west-2.amazonaws.com" # CHANGE THIS: Your ECR/Docker/Harbor registry + registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" 
# CHANGE THIS: Your ECR/Docker/Harbor registry # ================================================================================== # CONTAINER IMAGES - Specify paths (registry prefix auto-applied if needed) @@ -97,18 +151,17 @@ images: # Option 2: Full path (ignores registry prefix) # image: "docker.io/myorg/splunk-ai-operator:v1.0.0" # Result: "docker.io/myorg/splunk-ai-operator:v1.0.0" - image: "docker.io/splunk/splunk-ai-operator:0.1.0" + # Bump tag after building fixed operator (SAIA 8Gi default, SchemaJobId persist, feature config) + #image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/splunk-ai-operator:v0.1.8" + image: "docker.io/kpratyush775/splunk-ai-operator:v0.1.31" # Splunk Enterprise Images splunk: - # Option 1: Relative path (uses registry prefix) - # image: "splunk/splunk:10.2.0" - # Result: "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:10.2.0" - # - # Option 2: Full path (ignores registry prefix) - # image: "docker.io/myorg/splunk:10.2.0" - # Result: "docker.io/myorg/splunk:10.2.0" - image: "splunk/splunk:10-2-ai-custom" + # Splunk Enterprise image + # Default behavior: If no registry in path, uses Docker Hub + # "splunk/splunk:10.2.0" → Docker Hub + # "123456789012.dkr.ecr.us-west-2.amazonaws.com/splunk/splunk:10.2.0" → ECR + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" # Splunk Operator image (optional - has default) # Default: "docker.io/splunk/splunk-operator:3.0.0" @@ -123,8 +176,8 @@ images: # Option 2: Full path with different registry # headImage: "docker.io/rayproject/ray:2.44.0" # Result: "docker.io/rayproject/ray:2.44.0" - headImage: "ml-platform/ray/ray-head:build-17" - workerImage: "ml-platform/ray/ray-worker-gpu:build-17" + headImage: "ml-platform/ray/ray-head:build-008" + workerImage: "ml-platform/ray/ray-worker-gpu:build-008" # Weaviate Vector Database weaviate: @@ -136,8 +189,8 @@ images: # SAIA (Splunk AI Assistant) Images saia: # Relative paths - registry prefix 
auto-applied - apiImage: "ml-platform/saia/saia-api:build-1" - dataLoaderImage: "ml-platform/saia/saia-data-loader:build-1" + apiImage: "ml-platform/saia/saia-api:build-005" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-003" # Supporting Images fluentBit: @@ -146,6 +199,11 @@ images: # image: "fluent-bit:1.9.6" → uses registry prefix image: "docker.io/fluent/fluent-bit:1.9.6" + # OpenTelemetry Collector (use full URL so it is not rewritten to ECR) + otelCollector: + # Public image - full path so registry prefix is NOT applied; validation checks this URL + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + # ---------- Operator Versions ---------- operators: ray: @@ -170,8 +228,12 @@ aiPlatform: rayWorker: "ray-worker-sa" # no change saiaService: "saia-service-sa" # no change - # Default accelerator type - defaultAcceleratorType: "L40S" + # Default accelerator type — must match a top-level key in instance.yaml. + # Must be changed in sync with nodeGroups.gpu.instanceType (see GPU TYPE QUICK REFERENCE above). 
+ # L40S → instanceType: g6e.12xlarge + # H100 → instanceType: p5.4xlarge (also uncomment capacityReservation + availabilityZones) + # H100_NVL → instanceType: p4de.24xlarge + defaultAcceleratorType: "L40S" # Features to enable features: # no change diff --git a/tools/cluster_setup/eks_cluster_with_stack.sh b/tools/cluster_setup/eks_cluster_with_stack.sh index 62e64ee..7426ae1 100755 --- a/tools/cluster_setup/eks_cluster_with_stack.sh +++ b/tools/cluster_setup/eks_cluster_with_stack.sh @@ -29,6 +29,7 @@ load_config() { CLUSTER_NAME="$(yq eval '.cluster.name' "$cfg")" REGION="$(yq eval '.cluster.region' "$cfg")" K8S_VERSION="$(yq eval '.cluster.k8sVersion' "$cfg")" + USE_EXISTING_CLUSTER="$(yq eval '.cluster.useExisting // false' "$cfg")" # Node groups ENABLE_CPU="$(yq eval '.nodeGroups.cpu.enabled' "$cfg")" @@ -47,10 +48,40 @@ load_config() { GPU_VOLUME_SIZE="$(yq eval '.nodeGroups.gpu.volumeSize' "$cfg")" GPU_VOLUME_TYPE="$(yq eval '.nodeGroups.gpu.volumeType' "$cfg")" + # GPU Availability Zones (optional - for capacity-constrained instance types like P5/H100) + GPU_AVAILABILITY_ZONES=() + while IFS= read -r az; do + [[ -n "$az" ]] && GPU_AVAILABILITY_ZONES+=("$az") + done < <(yq eval '.nodeGroups.gpu.availabilityZones[]' "$cfg" 2>/dev/null) + + # Capacity Reservation (optional - for H100/P5 instances) + GPU_CAPACITY_RESERVATION_ID="$(yq eval '.nodeGroups.gpu.capacityReservation.id' "$cfg" 2>/dev/null)" + GPU_CAPACITY_RESERVATION_AZ="$(yq eval '.nodeGroups.gpu.capacityReservation.az' "$cfg" 2>/dev/null)" + [[ "$GPU_CAPACITY_RESERVATION_ID" == "null" ]] && GPU_CAPACITY_RESERVATION_ID="" + [[ "$GPU_CAPACITY_RESERVATION_AZ" == "null" ]] && GPU_CAPACITY_RESERVATION_AZ="" + + # Cluster options + PRESERVE_VPC_ON_DELETE="$(yq eval '.cluster.preserveVpcOnDelete // false' "$cfg")" + # Storage S3_BUCKET="$(yq eval '.storage.s3Bucket' "$cfg")" STORAGE_CLASS="$(yq eval '.storage.storageClass' "$cfg")" VECTORDB_SIZE="$(yq eval '.storage.vectorDbSize' "$cfg")" + # Object 
storage: objectStore.type (aws | s3compat | minio | seaweedfs); default aws when unset + OBJ_STORE_TYPE="$(yq eval '.storage.objectStore.type // "aws"' "$cfg")" + OBJ_STORE_BUCKET="$(yq eval '.storage.objectStore.bucket // .storage.s3Bucket // "ai-platform"' "$cfg")" + OBJ_STORE_ENDPOINT="$(yq eval '.storage.objectStore.endpoint // ""' "$cfg")" + OBJ_STORE_NS="$(yq eval '.storage.objectStore.namespace // "minio"' "$cfg")" + _obj_user="$(yq eval '.storage.objectStore.auth.rootUser // "minioadmin"' "$cfg")" + _obj_pw="$(yq eval '.storage.objectStore.auth.rootPassword // ""' "$cfg")" + # External S3-compatible only (no in-cluster MinIO install). True when type is s3compat, minio, or seaweedfs. + USE_EXTERNAL_OBJ_STORE="false" + case "${OBJ_STORE_TYPE}" in s3compat|minio|seaweedfs) USE_EXTERNAL_OBJ_STORE="true"; esac + MINIO_ENDPOINT="${OBJ_STORE_ENDPOINT}" + MINIO_NS="${OBJ_STORE_NS}" + MINIO_BUCKET="${OBJ_STORE_BUCKET}" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-$_obj_user}" + MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-$_obj_pw}" # AI Platform AI_NS="$(yq eval '.aiPlatform.namespace' "$cfg")" @@ -93,32 +124,44 @@ load_config() { FLUENT_BIT_IMAGE="$(yq eval '.images.fluentBit.image' "$cfg")" OTEL_COLLECTOR_IMAGE="$(yq eval '.images.otelCollector.image' "$cfg")" - # Subnets - read as arrays (Bash 3.2 compatible) + # Subnets - read as arrays (support both cluster.subnets and top-level subnets) PRIVATE_SUBNETS=() while IFS= read -r subnet; do [[ -n "$subnet" ]] && PRIVATE_SUBNETS+=("$subnet") - done < <(yq eval '.cluster.subnets.private[].id' "$cfg") + done < <(yq eval '.cluster.subnets.private[].id // .subnets.private[].id' "$cfg") PRIVATE_SUBNETS_AZ=() while IFS= read -r az; do [[ -n "$az" ]] && PRIVATE_SUBNETS_AZ+=("$az") - done < <(yq eval '.cluster.subnets.private[].az' "$cfg") + done < <(yq eval '.cluster.subnets.private[].az // .subnets.private[].az' "$cfg") PUBLIC_SUBNETS=() while IFS= read -r subnet; do [[ -n "$subnet" ]] && PUBLIC_SUBNETS+=("$subnet") - done < <(yq 
eval '.cluster.subnets.public[].id' "$cfg") + done < <(yq eval '.cluster.subnets.public[].id // .subnets.public[].id' "$cfg") PUBLIC_SUBNETS_AZ=() while IFS= read -r az; do [[ -n "$az" ]] && PUBLIC_SUBNETS_AZ+=("$az") - done < <(yq eval '.cluster.subnets.public[].az' "$cfg") + done < <(yq eval '.cluster.subnets.public[].az // .subnets.public[].az' "$cfg") else # Fallback: simple grep-based parsing (less robust but works without yq) CLUSTER_NAME="$(grep 'name:' "$cfg" | head -1 | sed 's/.*name: *"\(.*\)".*/\1/')" REGION="$(grep 'region:' "$cfg" | head -1 | sed 's/.*region: *"\(.*\)".*/\1/')" K8S_VERSION="$(grep 'k8sVersion:' "$cfg" | sed 's/.*k8sVersion: *"\(.*\)".*/\1/')" + USE_EXISTING_CLUSTER="false" + PRESERVE_VPC_ON_DELETE="false" S3_BUCKET="$(grep 's3Bucket:' "$cfg" | sed 's/.*s3Bucket: *"\(.*\)".*/\1/')" + OBJ_STORE_TYPE="" + OBJ_STORE_BUCKET="${S3_BUCKET}" + OBJ_STORE_ENDPOINT="" + OBJ_STORE_NS="minio" + USE_EXTERNAL_OBJ_STORE="false" + MINIO_ENDPOINT="" + MINIO_NS="minio" + MINIO_BUCKET="ai-platform" + MINIO_ROOT_USER="${MINIO_ROOT_USER:-minioadmin}" + MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-}" AI_NS="$(grep 'namespace:' "$cfg" | grep -A2 'aiPlatform:' | tail -1 | sed 's/.*namespace: *"\(.*\)".*/\1/')" AI_PLATFORM_NAME="splunk-ai-stack" AI_STANDALONE_NAME="splunk-standalone" @@ -152,6 +195,9 @@ load_config() { GPU_MAX=4 GPU_VOLUME_SIZE=1000 GPU_VOLUME_TYPE="gp3" + GPU_AVAILABILITY_ZONES=() + GPU_CAPACITY_RESERVATION_ID="" + GPU_CAPACITY_RESERVATION_AZ="" SPLUNK_APP_LOCAL_PATH="" # Hardcoded subnets for fallback @@ -163,6 +209,7 @@ load_config() { ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" S3_PREFIXES=("artifacts/" "apps/" "tasks/") AI_BUCKET_POLICY_NAME="S3Access-${CLUSTER_NAME}-ai-platform" + AI_ECR_ONLY_POLICY_NAME="ECRAccess-${CLUSTER_NAME}-ai-platform" # IRSA for EBS CSI EBS_IRSA_ROLE_NAME="EBSCSIDriverRole-${CLUSTER_NAME}" @@ -762,7 +809,11 @@ generate_node_groups() { k8s.io/cluster-autoscaler/enabled: \"true\" 
k8s.io/cluster-autoscaler/${CLUSTER_NAME}: owned" fi - if [[ "$ENABLE_GPU" == "true" ]]; then + # H100 with capacity reservation: node group created separately via CloudFormation + # All other GPU types (L40S, H100_NVL): standard eksctl managed node group + if [[ "$ENABLE_GPU" == "true" && "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + log "GPU nodes will be created separately with capacity reservation ${GPU_CAPACITY_RESERVATION_ID}" + elif [[ "$ENABLE_GPU" == "true" ]]; then nodes+=" - name: gpu-nodes instanceType: ${GPU_INSTANCE_TYPE} @@ -770,7 +821,17 @@ generate_node_groups() { minSize: ${GPU_MIN} maxSize: ${GPU_MAX} volumeSize: ${GPU_VOLUME_SIZE} - volumeType: ${GPU_VOLUME_TYPE} + volumeType: ${GPU_VOLUME_TYPE}" + # Lock to specific AZ when availabilityZones are specified (e.g. for H100_NVL) + if [[ ${#GPU_AVAILABILITY_ZONES[@]} -gt 0 ]]; then + nodes+=" + availabilityZones:" + for az in "${GPU_AVAILABILITY_ZONES[@]}"; do + nodes+=" + - ${az}" + done + fi + nodes+=" tags: Name: ${CLUSTER_NAME}-gpu Environment: prod @@ -853,6 +914,174 @@ EOF create_cluster() { log "Creating EKS cluster..."; eksctl create cluster -f eks-cluster-config.yaml; ensure_kubeconfig; } +# Create GPU node group with Capacity Block using CloudFormation. +# Only called when DEFAULT_ACCELERATOR=H100 and GPU_CAPACITY_RESERVATION_ID is set. +create_gpu_nodegroup_with_capacity_block() { + if [[ "$DEFAULT_ACCELERATOR" != "H100" || -z "$GPU_CAPACITY_RESERVATION_ID" ]]; then + return 0 + fi + + log "Creating GPU node group with Capacity Block (H100)..." 
+ log " Reservation: ${GPU_CAPACITY_RESERVATION_ID} in ${GPU_CAPACITY_RESERVATION_AZ}" + + local stack_name="${CLUSTER_NAME}-gpu-capacity-block" + local cfn_template_file="/tmp/${stack_name}-template.yaml" + + # Get cluster info + local cluster_info vpc_id cluster_sg + cluster_info=$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" --query 'cluster') + vpc_id=$(echo "$cluster_info" | jq -r '.resourcesVpcConfig.vpcId') + cluster_sg=$(echo "$cluster_info" | jq -r '.resourcesVpcConfig.clusterSecurityGroupId') + log " VPC: ${vpc_id}, Security Group: ${cluster_sg}" + + # Get EKS GPU AMI + local ami_id + ami_id=$(aws ssm get-parameter \ + --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id" \ + --region "${REGION}" --query 'Parameter.Value' --output text) + log " AMI: ${ami_id}" + + # Get node IAM role created by eksctl for the CPU node group + local node_role_arn + node_role_arn=$(aws iam list-roles \ + --query "Roles[?contains(RoleName, '${CLUSTER_NAME}') && contains(RoleName, 'NodeInstanceRole')].Arn" \ + --output text | head -1) + log " Node Role: ${node_role_arn}" + + if [[ -z "$node_role_arn" || "$node_role_arn" == "None" ]]; then + err "Node role not found — ensure CPU node group was created first." 
+ fi + + # Find subnet in the capacity reservation AZ + local subnet_id + subnet_id=$(aws ec2 describe-subnets --region "${REGION}" \ + --filters "Name=availability-zone,Values=${GPU_CAPACITY_RESERVATION_AZ}" \ + "Name=vpc-id,Values=${vpc_id}" \ + "Name=tag:Name,Values=*eksctl-${CLUSTER_NAME}*Private*" \ + --query 'Subnets[0].SubnetId' --output text) + if [[ -z "$subnet_id" || "$subnet_id" == "None" ]]; then + subnet_id=$(aws ec2 describe-subnets --region "${REGION}" \ + --filters "Name=availability-zone,Values=${GPU_CAPACITY_RESERVATION_AZ}" \ + "Name=vpc-id,Values=${vpc_id}" \ + --query 'Subnets[0].SubnetId' --output text) + fi + if [[ -z "$subnet_id" || "$subnet_id" == "None" ]]; then + err "Subnet not found in ${GPU_CAPACITY_RESERVATION_AZ} for VPC ${vpc_id}" + fi + log " Subnet: ${subnet_id}" + + # Generate CloudFormation template + cat > "${cfn_template_file}" </dev/null || echo "NOT_EXISTS") + + if [[ "$stack_status" == "CREATE_COMPLETE" || "$stack_status" == "UPDATE_COMPLETE" ]]; then + log "GPU node group already exists and is healthy — skipping." + rm -f "${cfn_template_file}"; return 0 + elif [[ "$stack_status" != "NOT_EXISTS" ]]; then + log "Deleting ${stack_status} stack before retry..." 
+ aws cloudformation delete-stack --stack-name "${stack_name}" --region "${REGION}" + aws cloudformation wait stack-delete-complete --stack-name "${stack_name}" --region "${REGION}" || true + fi + + aws cloudformation deploy \ + --template-file "${cfn_template_file}" \ + --stack-name "${stack_name}" \ + --region "${REGION}" \ + --parameter-overrides \ + ClusterName="${CLUSTER_NAME}" \ + ReservationId="${GPU_CAPACITY_RESERVATION_ID}" \ + SubnetId="${subnet_id}" \ + NodeRoleArn="${node_role_arn}" \ + SecurityGroupId="${cluster_sg}" \ + AmiId="${ami_id}" \ + InstanceType="${GPU_INSTANCE_TYPE}" \ + VolumeSize="${GPU_VOLUME_SIZE}" \ + DesiredCapacity="${GPU_DESIRED}" \ + --capabilities CAPABILITY_IAM \ + --no-fail-on-empty-changeset + + rm -f "${cfn_template_file}" + + local final_status + final_status=$(aws cloudformation describe-stacks --stack-name "${stack_name}" --region "${REGION}" \ + --query 'Stacks[0].StackStatus' --output text) + if [[ "$final_status" != "CREATE_COMPLETE" && "$final_status" != "UPDATE_COMPLETE" ]]; then + err "CloudFormation stack failed: ${final_status}. Check: aws cloudformation describe-stack-events --stack-name ${stack_name} --region ${REGION}" + fi + + log "GPU node group with Capacity Block created successfully." + log "Waiting for nodes to join cluster..." + sleep 30 + kubectl get nodes -l nvidia.com/gpu=true 2>/dev/null || log "(Nodes may still be joining...)" +} + ensure_oidc() { log "Ensuring IAM OIDC provider is associated..." @@ -1002,6 +1231,7 @@ ensure_ebs_irsa_role() { # Create IRSA for EBS CSI using eksctl (handles role creation, trust policy, and SA annotation) eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --namespace "${EBS_NS}" \ --name "${EBS_SA}" \ --role-name "${EBS_IRSA_ROLE_NAME}" \ @@ -1086,6 +1316,7 @@ install_cluster_autoscaler() { log "Installing Cluster Autoscaler with IRSA..." 
eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --name "${AUTOSCALER_SA}" \ --namespace "${AUTOSCALER_NS}" \ --role-name "${AUTOSCALER_ROLE_NAME}" \ @@ -1134,6 +1365,35 @@ install_cert_manager() { check_ready cert-manager "app.kubernetes.io/instance=cert-manager,app.kubernetes.io/component=controller" } +# ---------- External S3-compatible object storage (credentials only; no in-cluster install) ---------- +ensure_s3compat_credentials() { + # Only create credentials secret when using external S3-compatible storage (s3compat, minio, seaweedfs). + if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then + return 0 + fi + + log "Object store type is ${OBJ_STORE_TYPE}; creating credentials secret for external S3-compatible storage." + if [[ -z "${OBJ_STORE_ENDPOINT}" && -z "${MINIO_ENDPOINT}" ]]; then + err "storage.objectStore.type=${OBJ_STORE_TYPE} requires storage.objectStore.endpoint" + return 1 + fi + if [[ -z "${MINIO_ROOT_PASSWORD}" ]]; then + err "External S3-compatible storage requires credentials (objectStore.auth.rootPassword or MINIO_ROOT_PASSWORD)" + return 1 + fi + ensure_namespace "${AI_NS}" + local secret_name="minio-credentials" + kubectl -n "${AI_NS}" create secret generic "${secret_name}" \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | kubectl -n "${AI_NS}" apply -f - + log "✓ External S3-compatible credentials secret ${AI_NS}/${secret_name} ready" +} + # ---------- OTEL Operator + contrib collector (idempotent) ---------- install_otel_operator_and_contrib_collector() { log "Installing OpenTelemetry Operator (Helm)..." 
@@ -1328,6 +1588,62 @@ EOF printf "%s" "$arn" } +# ECR-only policy for IRSA when using MinIO (no S3) - allows pulling images from ECR +ensure_ecr_only_policy() { + local name="${AI_ECR_ONLY_POLICY_NAME}" + local expected_arn="arn:aws:iam::${ACCOUNT_ID}:policy/${name}" + if aws iam get-policy --policy-arn "$expected_arn" >/dev/null 2>&1; then + printf "%s" "$expected_arn" + return 0 + fi + local arn + arn="$(get_policy_arn_by_name "$name")" + if [[ -z "$arn" ]]; then + log "Creating IAM policy ${name} (ECR read-only, for MinIO-only mode)" + local pd; pd="$(mktemp)"; TMP_FILES+=("$pd") + cat > "$pd" <<'ECRPOL' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "ECRAuth", + "Effect": "Allow", + "Action": "ecr:GetAuthorizationToken", + "Resource": "*" + }, + { + "Sid": "ECRPull", + "Effect": "Allow", + "Action": [ + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage" + ], + "Resource": "arn:aws:ecr:*:*:repository/*" + } + ] +} +ECRPOL + local create_out rc + set +e + create_out="$(aws iam create-policy --policy-name "${name}" --policy-document "file://${pd}" --query 'Policy.Arn' --output text 2>&1)" + rc=$? + set -e + if (( rc == 0 )); then + arn="$(normalize_arn "$create_out")" + else + if grep -qi 'EntityAlreadyExists' <<<"$create_out"; then + arn="$(get_policy_arn_by_name "$name")" + else + err "Failed to create IAM policy ${name}: $create_out" + fi + fi + fi + arn="$(normalize_arn "$arn")" + [[ -z "$arn" ]] && err "Failed to resolve ARN for policy ${name}" + printf "%s" "$arn" +} + # ------- IRSA helpers: ensure & validate ------- generate_irsa_trust_policy() { local ns="$1" sa="$2" @@ -1386,6 +1702,18 @@ ensure_irsa_for_sa() { local sa="$1" ns="$2" policy_arn_raw="${3:-}" local role="IRSA-${CLUSTER_NAME}-${sa}" + # Fail fast if kubectl cannot reach the cluster (e.g. 
wrong KUBECONFIG or context) + local kerr + kerr="$(kubectl get ns "${ns}" 2>&1)" || true + if echo "${kerr}" | grep -q "connection refused\|localhost:8080\|dial tcp.*8080"; then + err "kubectl cannot reach the cluster (API server connection refused). \ +Fix: run 'aws eks update-kubeconfig --name ${CLUSTER_NAME} --region ${REGION}' and ensure KUBECONFIG (if set) points to that file. \ +Then re-run this script." + fi + if ! kubectl get ns "${ns}" >/dev/null 2>&1; then + err "Cannot access namespace ${ns} (kubectl get ns failed). Ensure the cluster is reachable and the namespace exists." + fi + # Resolve/repair policy ARN if invalid local policy_arn; policy_arn="$(normalize_arn "$policy_arn_raw")" if [[ -z "$policy_arn" || $policy_arn != arn:aws:iam::* ]]; then @@ -1401,6 +1729,7 @@ ensure_irsa_for_sa() { log "Ensuring IRSA (role ${role}) for ${ns}/${sa} with policy ${policy_arn}" eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ --namespace "${ns}" \ --name "${sa}" \ --role-name "${role}" \ @@ -1454,28 +1783,34 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 - # Create IRSA for Splunk Standalone (recommended approach) + # IRSA for Splunk Standalone: S3 bucket policy when using AWS S3, ECR-only when using external S3-compatible log "Setting up IRSA for Splunk Standalone service account..." 
- local policy_arn; policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + local policy_arn + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + policy_arn="$(ensure_ecr_only_policy)" + else + policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + fi ensure_irsa_for_sa "${STANDALONE_SA}" "${AI_NS}" "${policy_arn}" - # DEPRECATED: Create s3-secret using AWS credentials - # This is legacy approach - IRSA above is preferred, but Splunk Operator may still require the secret - log "Creating s3-secret for Splunk Standalone (fallback if IRSA not fully supported)..." - if resolve_aws_creds_for_secret 2>/dev/null; then - local ak="${AWS_ACCESS_KEY_ID:-}"; local sk="${AWS_SECRET_ACCESS_KEY:-}"; local st="${AWS_SESSION_TOKEN:-}" - if [[ -n "$ak" && -n "$sk" ]]; then - kubectl -n "${AI_NS}" create secret generic s3-secret \ - --from-literal=s3_access_key="${ak}" \ - --from-literal=s3_secret_key="${sk}" \ - $( [[ -n "$st" ]] && printf -- "--from-literal=s3_session_token=%s" "$st" ) \ - --dry-run=client -o yaml | kubectl apply -f - - log "✓ Created s3-secret with explicit credentials" + if [[ "${USE_EXTERNAL_OBJ_STORE}" != "true" ]]; then + # Create s3-secret for Standalone when using S3 (fallback if IRSA not fully supported) + log "Creating s3-secret for Splunk Standalone (S3 mode)..." + if resolve_aws_creds_for_secret 2>/dev/null; then + local ak="${AWS_ACCESS_KEY_ID:-}"; local sk="${AWS_SECRET_ACCESS_KEY:-}"; local st="${AWS_SESSION_TOKEN:-}" + if [[ -n "$ak" && -n "$sk" ]]; then + kubectl -n "${AI_NS}" create secret generic s3-secret \ + --from-literal=s3_access_key="${ak}" \ + --from-literal=s3_secret_key="${sk}" \ + $( [[ -n "$st" ]] && printf -- "--from-literal=s3_session_token=%s" "$st" ) \ + --dry-run=client -o yaml | kubectl apply -f - + log "✓ Created s3-secret with explicit credentials" + else + warn "No AWS credentials available - s3-secret not created. Splunk Standalone will use IRSA." 
+ fi else - warn "No AWS credentials available - s3-secret not created. Splunk Standalone will use IRSA." + warn "AWS credentials not available - s3-secret not created. Splunk Standalone will use IRSA via ${STANDALONE_SA}." fi - else - warn "AWS credentials not available - s3-secret not created. Splunk Standalone will use IRSA via ${STANDALONE_SA}." fi cat <<'YAML' | kubectl -n "${AI_NS}" apply -f - @@ -1497,7 +1832,47 @@ data: sslPassword: password YAML - cat </dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi done @@ -2445,6 +2870,13 @@ preflight_env() { pf_header "Subnets exist" # Check if subnets are provided (arrays may be empty) local subnet_count=$((${#PRIVATE_SUBNETS[@]} + ${#PUBLIC_SUBNETS[@]})) + if [[ "${PRESERVE_VPC_ON_DELETE}" == "true" ]]; then + if [[ ${#PRIVATE_SUBNETS[@]} -lt 2 ]]; then + pf_fail "cluster.preserveVpcOnDelete is true: you must specify at least 2 private subnets under cluster.subnets.private so the cluster uses an existing VPC (VPC will not be deleted on 'delete')." + else + pf_ok "Preserve VPC on delete: using existing VPC (subnets specified); VPC will not be deleted when you run delete." + fi + fi if [[ $subnet_count -eq 0 ]]; then pf_ok "No subnets specified - eksctl will create new VPC and subnets automatically" else @@ -2668,11 +3100,20 @@ add_ecr_permissions_to_role() { # ---------- Orchestrator for AI Platform setup ---------- install_ai_platform_stack() { log "=== Setting up Splunk AI Platform stack ===" - ensure_s3_bucket_and_prefixes - ensure_s3_upload_splunk_app + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + log "Using external S3-compatible object storage (${OBJ_STORE_TYPE}); skipping S3 bucket creation; using ECR-only policy for IRSA." 
+ else + ensure_s3_bucket_and_prefixes + ensure_s3_upload_splunk_app + fi ensure_namespace "${AI_NS}" - local policy_arn; policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + local policy_arn + if [[ "${USE_EXTERNAL_OBJ_STORE}" == "true" ]]; then + policy_arn="$(ensure_ecr_only_policy)" + else + policy_arn="$(ensure_bucket_policy "${AI_BUCKET_POLICY_NAME}" "${S3_BUCKET}")" + fi ensure_irsa_for_sa "${RAY_HEAD_SA}" "${AI_NS}" "${policy_arn}" ensure_irsa_for_sa "${RAY_WORKER_SA}" "${AI_NS}" "${policy_arn}" @@ -2698,7 +3139,14 @@ install_ai_platform_stack() { } # ---------- CREATE / RECONCILE / DELETE FLOWS ---------- -create_cluster_flow() { create_cluster_config; create_cluster; } +create_cluster_flow() { + create_cluster_config + create_cluster + # H100 with capacity reservation: eksctl cannot manage these nodes — create via CloudFormation + if [[ "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + create_gpu_nodegroup_with_capacity_block + fi +} reconcile_flow() { ensure_oidc @@ -2709,8 +3157,19 @@ reconcile_flow() { install_cluster_autoscaler install_nvidia_device_plugin uncordon_ready_nodes + # H100 with capacity reservation: create GPU node group if not already present + if [[ "$DEFAULT_ACCELERATOR" == "H100" && -n "$GPU_CAPACITY_RESERVATION_ID" ]]; then + local gpu_node_count + gpu_node_count=$(kubectl get nodes -l nvidia.com/gpu=true --no-headers 2>/dev/null | wc -l | tr -d ' ') + if [[ "$gpu_node_count" -lt 1 ]]; then + create_gpu_nodegroup_with_capacity_block + else + log "Found ${gpu_node_count} H100 GPU node(s) — skipping capacity block creation." 
+ fi + fi install_kube_prometheus install_cert_manager + ensure_s3compat_credentials install_otel_operator_and_contrib_collector install_ray_operator install_splunk_operator @@ -2722,11 +3181,16 @@ reconcile_flow() { # ---------- MAIN ---------- main_install() { - for t in aws eksctl kubectl helm git jq; do need "$t"; done + for t in aws eksctl kubectl helm git jq yq; do need "$t"; done # Load configuration from YAML file load_config + # Force region for all AWS CLI and eksctl commands + export AWS_DEFAULT_REGION="${REGION}" + export AWS_REGION="${REGION}" + log "Using AWS Region: ${REGION}" + # Validate and configure container images validate_image_config configure_images @@ -2750,9 +3214,16 @@ main_install() { pf_summary fi + # Idempotent: create cluster only if it does not exist. When cluster.useExisting is true, fail if cluster is missing. if ! cluster_exists; then + if [[ "${USE_EXISTING_CLUSTER}" == "true" ]]; then + err "cluster.useExisting is true but cluster '${CLUSTER_NAME}' was not found in ${REGION}. Create the cluster first or set useExisting: false." + exit 1 + fi create_cluster_flow ensure_kubeconfig + else + log "Cluster ${CLUSTER_NAME} already exists; skipping cluster creation (idempotent)." fi preflight_api_connectivity diff --git a/tools/cluster_setup/k0s-cluster-config.yaml b/tools/cluster_setup/k0s-cluster-config.yaml new file mode 100644 index 0000000..258f43a --- /dev/null +++ b/tools/cluster_setup/k0s-cluster-config.yaml @@ -0,0 +1,119 @@ +# =================================================================== +# k0s Cluster Configuration Template for Splunk AI Platform +# =================================================================== +# IMPORTANT: This is a template file with placeholder values. +# Copy this file and replace ALL placeholder values with your actual resources. +# +# Quick Start: +# 1. Copy: cp k0s-cluster-config.yaml my-cluster-config.yaml +# 2. Edit: vi my-cluster-config.yaml +# 3. 
Replace all values marked with "CHANGE THIS" +# 4. Run: CONFIG_FILE=./my-cluster-config.yaml ./k0s_cluster_with_stack.sh install +# =================================================================== + +# ---------- Cluster Configuration ---------- +cluster: + name: "my-ai-cluster" # CHANGE THIS: Your cluster name + useExisting: "auto" # auto | force | never + region: "us-east-2" # CHANGE THIS: AWS region (used when creating EC2 instances) + sshUser: "ubuntu" # SSH username for nodes + sshKeyPath: "~/.ssh/id_rsa" # CHANGE THIS: Path to SSH private key + +# ---------- Node Configuration ---------- +# +# GPU TYPE QUICK REFERENCE — set gpuWorker instanceType and defaultAcceleratorType together: +# +# L40S (default): +# gpuWorker instanceType: g6e.12xlarge (4x L40S GPUs, 48 GB VRAM each) +# defaultAcceleratorType: L40S +# +# H100: +# gpuWorker instanceType: p5.4xlarge (8x H100 GPUs, 80 GB VRAM each) +# defaultAcceleratorType: H100 +# +# H100_NVL: +# gpuWorker instanceType: p4de.24xlarge (8x H100 NVL GPUs, 94 GB VRAM each) +# defaultAcceleratorType: H100_NVL +# +# On-premises (existing hardware): +# Set existingIPs below — instanceTypes are ignored when IPs are provided. +# The defaultAcceleratorType must still match the physical GPU in your nodes. +# +nodes: + controllers: 1 # 1 (single) or 3 (HA) + cpuWorkers: 2 # Number of CPU worker nodes (EC2 mode only) + gpuWorkers: 1 # Number of GPU worker nodes (EC2 mode only) + + # On-premises / existing nodes: provide IPs to skip EC2 instance creation. + # Leave lists empty to create new EC2 instances automatically. + existingIPs: + controllers: [] # e.g. ["10.0.0.1"] or ["10.0.0.1", "10.0.0.2", "10.0.0.3"] for HA + workers: [] # e.g. 
["10.0.1.1", "10.0.1.2", "10.0.2.1"] + +# ---------- EC2 Instance Types (ignored when existingIPs are set) ---------- +instanceTypes: + controller: "t3.xlarge" # Controller node (4 vCPU, 16 GB RAM) + cpuWorker: "m5.4xlarge" # CPU worker (16 vCPU, 64 GB RAM) + gpuWorker: "g6e.12xlarge" # CHANGE THIS: see GPU TYPE QUICK REFERENCE above + +# ---------- EC2 Network (required when creating EC2 instances) ---------- +ec2: + vpcId: "" # CHANGE THIS: your VPC ID (e.g. vpc-xxxxxxxxxxxxxxxxx) + subnetId: "" # CHANGE THIS: your subnet ID (e.g. subnet-xxxxxxxxxxxxxxxxx) + keyName: "" # CHANGE THIS: your EC2 key pair name + +# ---------- MinIO Object Storage ---------- +minio: + accessKey: "minioadmin" # CHANGE THIS: MinIO admin username + secretKey: "minioadmin" # CHANGE THIS: MinIO admin password + bucket: "ai-platform-data" # MinIO bucket name + +# ---------- Kubernetes ---------- +kubernetes: + namespace: "ai-platform" # no change + +# ---------- Splunk ---------- +splunk: + standaloneName: "splunk-standalone" # no change + +# ---------- ECR (for private AWS image repositories) ---------- +ecr: + account: "" # CHANGE THIS: your AWS account ID (e.g. 
"123456789012") + # Leave empty to auto-detect from AWS CLI + +# ---------- Image Pull Secrets ---------- +imagePullSecrets: + autoCreateECR: false # Set true to auto-create ECR pull secret + dockerHub: + enabled: false # Set true if images are on Docker Hub (private) + username: "" + password: "" + gcr: + enabled: false + acr: + enabled: false + custom: + enabled: false + +# ---------- File Paths ---------- +files: + splunkOperator: "./splunk-operator-cluster.yaml" + aiPlatform: "./artifacts.yaml" + +# ---------- AI Platform Configuration ---------- +aiPlatform: + namespace: "ai-platform" # no change + name: "splunk-ai-stack" # no change + + # Service Accounts + serviceAccounts: + rayHead: "ray-head-sa" # no change + rayWorker: "ray-worker-sa" # no change + saiaService: "saia-service-sa" # no change + + # Default accelerator type — must match a top-level key in instance.yaml. + # Must be changed in sync with instanceTypes.gpuWorker (see GPU TYPE QUICK REFERENCE above). + # L40S → gpuWorker: g6e.12xlarge + # H100 → gpuWorker: p5.4xlarge + # H100_NVL → gpuWorker: p4de.24xlarge + defaultAcceleratorType: "L40S" diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 1e65fd1..4ac7787 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -120,9 +120,11 @@ load_config() { CPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.cpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "m5.4xlarge") GPU_WORKER_INSTANCE_TYPE=$(yq eval '.instanceTypes.gpuWorker' "${CONFIG_FILE}" 2>/dev/null || echo "g5.2xlarge") - # MinIO configuration - MINIO_ACCESS_KEY=$(yq eval '.minio.accessKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin") - MINIO_SECRET_KEY=$(yq eval '.minio.secretKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin123") + # MinIO configuration: prefer environment variables (secure); fall back to config + _minio_ak=$(yq eval '.minio.accessKey' "${CONFIG_FILE}" 
2>/dev/null || echo "minioadmin") + _minio_sk=$(yq eval '.minio.secretKey' "${CONFIG_FILE}" 2>/dev/null || echo "minioadmin123") + MINIO_ACCESS_KEY="${MINIO_ACCESS_KEY:-$_minio_ak}" + MINIO_SECRET_KEY="${MINIO_SECRET_KEY:-$_minio_sk}" MINIO_BUCKET=$(yq eval '.minio.bucket' "${CONFIG_FILE}" 2>/dev/null || echo "ai-platform-data") # Kubernetes namespace @@ -155,7 +157,11 @@ load_config() { SPLUNK_OPERATOR_FILE=$(yq eval '.files.splunkOperator' "${CONFIG_FILE}" 2>/dev/null || echo "./splunk-operator-cluster.yaml") SPLUNK_AI_FILE=$(yq eval '.files.aiPlatform' "${CONFIG_FILE}" 2>/dev/null || echo "./artifacts.yaml") - log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}" + # Default accelerator type (must match a key in instance.yaml: L40S | H100 | H100_NVL) + DEFAULT_ACCELERATOR=$(yq eval '.aiPlatform.defaultAcceleratorType' "${CONFIG_FILE}" 2>/dev/null || echo "") + [[ "$DEFAULT_ACCELERATOR" == "null" || -z "$DEFAULT_ACCELERATOR" ]] && DEFAULT_ACCELERATOR="L40S" + + log "Configuration loaded: cluster=${CLUSTER_NAME}, namespace=${AI_NS}, accelerator=${DEFAULT_ACCELERATOR}" if [[ -n "${ECR_ACCOUNT}" ]]; then log "ECR Account: ${ECR_ACCOUNT}" fi @@ -176,7 +182,7 @@ load_config() { # ====== PREFLIGHT CHECKS ====== preflight_checks() { pf_header "Required tools" - for tool in ssh kubectl helm git jq; do + for tool in ssh kubectl helm git jq yq; do if command -v "$tool" >/dev/null 2>&1; then pf_ok "$tool found" else @@ -184,13 +190,6 @@ preflight_checks() { fi done - # Check for yq - if command -v yq >/dev/null 2>&1; then - pf_ok "yq found" - else - pf_warn "yq not found - using fallback parsing (install yq for better results)" - fi - pf_header "Configuration" [[ -n "${CLUSTER_NAME}" ]] && pf_ok "Cluster name: ${CLUSTER_NAME}" || pf_fail "Cluster name not set" [[ -f "${SPLUNK_OPERATOR_FILE}" ]] && pf_ok "Splunk operator file: ${SPLUNK_OPERATOR_FILE}" || pf_warn "Splunk operator file not found: ${SPLUNK_OPERATOR_FILE}"