From 8c6c0fae85a36c46a64cd861cb3d9a7a2e70047f Mon Sep 17 00:00:00 2001 From: Ramesh Maddegoda <94033485+ramesh-maddegoda@users.noreply.github.com> Date: Tue, 11 Feb 2025 21:16:13 -0800 Subject: [PATCH] UPDATE to have a single URL as pds_nucleus_opensearch_url for all the nodes, have Node specific OpenSearch registry names and used a data source for S3 to read existing staging bucket in MCP Prod. --- terraform/README.md | 22 ++------ terraform/main.tf | 4 +- .../pds-nucleus-product-completion-checker.py | 6 +- .../product-copy-completion-checker.tf | 55 +++++++++++-------- .../variables.tf | 18 +++++- terraform/variables.tf | 13 ++++- terraform/variables/terraform.tfvars.dev | 3 +- terraform/variables/terraform.tfvars.test | 3 +- 8 files changed, 74 insertions(+), 50 deletions(-) diff --git a/terraform/README.md b/terraform/README.md index a2b597c..f3bff75 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -81,6 +81,9 @@ Note: Examples of `terraform.tfvars` files are available at `terraform/variable - pds_node_names = List of PDS Node names to be supported (E.g.: ["PDS_SBN", "PDS_IMG", "PDS_EN"]).The following node name format should be used. - (PDS_ATM, PDS_ENG, PDS_GEO, PDS_IMG, PDS_NAIF, PDS_RMS, PDS_SBN, PSA, JAXA, ROSCOSMOS) - Please check https://nasa-pds.github.io/registry/user/harvest_job_configuration.html for PDS Node name descriptions. 
+ + - pds_nucleus_opensearch_url : OpenSearch URL to be used with Harvest tool + - pds_nucleus_opensearch_registry_names : List of Node specific OpenSearch registry names (E.g.: ["pds-nucleus-sbn-registry", "pds-nucleus-img-registry"]) - pds_nucleus_opensearch_urls : List of Node specific OpenSearch URLs (E.g.: ["https://abcdef.us-west-2.aoss.amazonaws.com", "https://opqrst.us-west-2.aoss.amazonaws.com"]) - pds_nucleus_opensearch_credential_relative_url : Opensearch Credential URL (E.g.: "http:///AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") - pds_nucleus_harvest_replace_prefix_with_list : List of harvest replace with strings (E.g.: ["s3://pds-sbn-nucleus-staging","s3://pds-img-nucleus-staging"]) @@ -121,7 +124,8 @@ aws_secretmanager_key_arn = "arn:aws:kms:us-west-2:12345678:key/12345-12 # Please check https://nasa-pds.github.io/registry/user/harvest_job_configuration.html for PDS Node name descriptions. pds_node_names = ["PDS_SBN", "PDS_IMG"] -pds_nucleus_opensearch_urls = ["https://abcdef.us-west-2.aoss.amazonaws.com", "https://opqrst.us-west-2.aoss.amazonaws.com"] +pds_nucleus_opensearch_url = "https://abcdef.us-west-2.aoss.amazonaws.com" +pds_nucleus_opensearch_registry_names = ["pds-nucleus-sbn-registry", "pds-nucleus-img-registry"] pds_nucleus_opensearch_credential_relative_url = "http:///AWS_CONTAINER_CREDENTIALS_RELATIVE_URI" pds_nucleus_harvest_replace_prefix_with_list = ["s3://pds-sbn-nucleus-staging", "s3://pds-img-nucleus-staging"] @@ -183,21 +187,7 @@ terraform apply 13. The DAGs can be added to the Airflow by uploading Airflow DAG files to the DAG folder of S3 bucket configured as `mwaa_dag_s3_bucket_name` in the `terraform.tfvars` file. -14. Go to the AWS Secret manager (https://us-west-2.console.aws.amazon.com/secretsmanager/listsecrets?region=us-west-2) and locate the secrets in the following format. 
- - pds/nucleus/opensearch/creds//user - - pds/nucleus/opensearch/creds//password - - E.g.: - - pds/nucleus/opensearch/creds/PDS_IMG/user - - pds/nucleus/opensearch/creds/PDS_SBN/user - - pds/nucleus/opensearch/creds/PDS_IMG/password - - pds/nucleus/opensearch/creds/PDS_SBN/password - -15. Obtain the Opensearch username and password for each PDS Node and update the above secrets with relevant usernames and passwords. - - To update a secret, click on a secret -> Retrieve secret value -> Edit -> Save - - -15. Use the PDS Data Upload Manager (DUM) tool to upload files to pds_nucleus_staging_bucket. +16. Use the PDS Data Upload Manager (DUM) tool to upload files to pds_nucleus_staging_bucket. ## Steps to Access Nucleus Airflow UI With Cognito Credentials diff --git a/terraform/main.tf b/terraform/main.tf index 7ea683c..4303486 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -102,7 +102,8 @@ module "product-copy-completion-checker" { pds_nucleus_cold_archive_bucket_name_postfix = var.pds_nucleus_cold_archive_bucket_name_postfix pds_node_names = var.pds_node_names - pds_nucleus_opensearch_urls = var.pds_nucleus_opensearch_urls + pds_nucleus_opensearch_url = var.pds_nucleus_opensearch_url + pds_nucleus_opensearch_registry_names = var.pds_nucleus_opensearch_registry_names pds_nucleus_opensearch_credential_relative_url = var.pds_nucleus_opensearch_credential_relative_url pds_nucleus_harvest_replace_prefix_with_list = var.pds_nucleus_harvest_replace_prefix_with_list @@ -141,4 +142,3 @@ module "cognito-auth" { cognito_user_pool_id = var.cognito_user_pool_id aws_elb_account_id_for_the_region = var.aws_elb_account_id_for_the_region } - diff --git a/terraform/terraform-modules/product-copy-completion-checker/lambda/pds-nucleus-product-completion-checker.py b/terraform/terraform-modules/product-copy-completion-checker/lambda/pds-nucleus-product-completion-checker.py index 5606551..027431f 100644 --- 
a/terraform/terraform-modules/product-copy-completion-checker/lambda/pds-nucleus-product-completion-checker.py +++ b/terraform/terraform-modules/product-copy-completion-checker/lambda/pds-nucleus-product-completion-checker.py @@ -33,6 +33,7 @@ dag_name = os.environ.get('AIRFLOW_DAG_NAME') pds_node_name = os.environ.get('PDS_NODE_NAME') opensearch_endpoint = os.environ.get('OPENSEARCH_ENDPOINT') +opensearch_registry_name = os.environ.get('OPENSEARCH_REGISTRY_NAME') pds_nucleus_opensearch_credential_relative_url = os.environ.get('OPENSEARCH_CREDENTIAL_RELATIVE_URL') replace_prefix_with = os.environ.get('REPLACE_PREFIX_WITH') efs_mount_path = os.environ.get('EFS_MOUNT_PATH') @@ -45,6 +46,7 @@ pds_hot_archive_bucket_name = os.environ.get('PDS_HOT_ARCHIVE_S3_BUCKET_NAME') pds_cold_archive_bucket_name = os.environ.get('PDS_COLD_ARCHIVE_S3_BUCKET_NAME') pds_staging_bucket_name = os.environ.get('PDS_STAGING_S3_BUCKET_NAME') +product_batch_size = os.environ.get('PRODUCT_BATCH_SIZE') replace_prefix = efs_mount_path @@ -98,7 +100,7 @@ def process_completed_products(): logger.debug(f"Number of completed product labels : {str(response['records'])}") logger.debug(f"Number of completed product labels : {str(len(response['records']))}") - n = 10 + n = int(product_batch_size) count = 0 list_of_product_labels_to_process = [] @@ -222,7 +224,7 @@ def create_harvest_configs_and_trigger_nucleus(list_of_product_labels_to_process logger.info(f"Created harvest config XML file: {harvest_config_file_path}") connection_xml_content = f""" - + {pds_nucleus_opensearch_credential_relative_url} """ diff --git a/terraform/terraform-modules/product-copy-completion-checker/product-copy-completion-checker.tf b/terraform/terraform-modules/product-copy-completion-checker/product-copy-completion-checker.tf index e3a94b6..34a41e9 100644 --- a/terraform/terraform-modules/product-copy-completion-checker/product-copy-completion-checker.tf +++ 
b/terraform/terraform-modules/product-copy-completion-checker/product-copy-completion-checker.tf @@ -235,13 +235,36 @@ resource "aws_s3_bucket" "pds_nucleus_s3_config_bucket" { force_destroy = true } -# Create a staging S3 Bucket for each PDS Node -resource "aws_s3_bucket" "pds_nucleus_s3_staging_bucket" { - count = length(var.pds_node_names) - # convert PDS node name to S3 bucket name compatible format +# This data source is added to access existing S3 buckets, because an S3 staging bucket is already available in MCP Prod environment. +data "aws_s3_bucket" "pds_nucleus_s3_staging_bucket" { + count = length(var.pds_node_names) bucket = "${lower(replace(var.pds_node_names[count.index], "_", "-"))}-${var.pds_nucleus_staging_bucket_name_postfix}" } +# Commented out the following S3 bucket resources, because an S3 staging bucket is already available in MCP Prod environment. +# However, this resource is useful when deploying in a fresh environment. + +# # Create a staging S3 Bucket for each PDS Node +# resource "aws_s3_bucket" "pds_nucleus_s3_staging_bucket" { +# count = length(var.pds_node_names) +# # convert PDS node name to S3 bucket name compatible format +# bucket = "${lower(replace(var.pds_node_names[count.index], "_", "-"))}-${var.pds_nucleus_staging_bucket_name_postfix}" +# } + +# # Create an aws_s3_bucket_notification for each s3 bucket of each Node +# resource "aws_s3_bucket_notification" "pds_nucleus_s3_staging_bucket_notification" { +# +# count = length(var.pds_node_names) +# # convert PDS node name to S3 bucket name compatible format +# bucket = "${lower(replace(var.pds_node_names[count.index], "_", "-"))}-${var.pds_nucleus_staging_bucket_name_postfix}" +# +# queue { +# events = ["s3:ObjectCreated:*"] +# queue_arn = aws_sqs_queue.pds_nucleus_files_to_save_in_database_sqs_queue[count.index].arn +# } +# } + + # Create pds_nucleus_s3_file_file_event_processor_function for each PDS Node resource "aws_lambda_function" 
"pds_nucleus_s3_file_file_event_processor_function" { count = length(var.pds_node_names) @@ -292,7 +315,8 @@ resource "aws_lambda_function" "pds_nucleus_product_completion_checker_function" DB_SECRET_ARN = aws_secretsmanager_secret.pds_nucleus_rds_credentials.arn EFS_MOUNT_PATH = "/mnt/data" ES_AUTH_CONFIG_FILE_PATH = "/etc/es-auth.cfg" - OPENSEARCH_ENDPOINT = var.pds_nucleus_opensearch_urls[count.index] + OPENSEARCH_ENDPOINT = var.pds_nucleus_opensearch_url + OPENSEARCH_REGISTRY_NAME = var.pds_nucleus_opensearch_registry_names[count.index] OPENSEARCH_CREDENTIAL_RELATIVE_URL = var.pds_nucleus_opensearch_credential_relative_url PDS_NODE_NAME = var.pds_node_names[count.index] PDS_NUCLEUS_CONFIG_BUCKET_NAME = var.pds_nucleus_config_bucket_name @@ -300,7 +324,8 @@ resource "aws_lambda_function" "pds_nucleus_product_completion_checker_function" PDS_MWAA_ENV_NAME = var.airflow_env_name PDS_HOT_ARCHIVE_S3_BUCKET_NAME = "${lower(replace(var.pds_node_names[count.index], "_", "-"))}-${var.pds_nucleus_hot_archive_bucket_name_postfix}" PDS_COLD_ARCHIVE_S3_BUCKET_NAME = "${lower(replace(var.pds_node_names[count.index], "_", "-"))}-${var.pds_nucleus_cold_archive_bucket_name_postfix}" - PDS_STAGING_S3_BUCKET_NAME = aws_s3_bucket.pds_nucleus_s3_staging_bucket[count.index].id + PDS_STAGING_S3_BUCKET_NAME = data.aws_s3_bucket.pds_nucleus_s3_staging_bucket[count.index].id + PRODUCT_BATCH_SIZE = var.product_batch_size } } } @@ -342,7 +367,7 @@ resource "aws_lambda_permission" "s3-lambda-permission" { action = "lambda:InvokeFunction" function_name = aws_lambda_function.pds_nucleus_s3_file_file_event_processor_function[count.index].function_name principal = "s3.amazonaws.com" - source_arn = aws_s3_bucket.pds_nucleus_s3_staging_bucket[count.index].arn + source_arn = data.aws_s3_bucket.pds_nucleus_s3_staging_bucket[count.index].arn } # Create an SQS queue to receive S3 bucket notifications for each s3 bucket of each Node @@ -374,7 +399,7 @@ data "aws_iam_policy_document" 
"pds_nucleus_files_to_save_in_database_sqs_queue_ condition { test = "StringEquals" variable = "aws:SourceArn" - values = [aws_s3_bucket.pds_nucleus_s3_staging_bucket[count.index].arn] + values = [data.aws_s3_bucket.pds_nucleus_s3_staging_bucket[count.index].arn] } } } @@ -386,20 +411,6 @@ resource "aws_sqs_queue_policy" "pds_nucleus_files_to_save_in_database_sqs_queue policy = data.aws_iam_policy_document.pds_nucleus_files_to_save_in_database_sqs_queue_policy_document[count.index].json } -# Create an aws_s3_bucket_notification for each s3 bucket of each Node -resource "aws_s3_bucket_notification" "pds_nucleus_s3_staging_bucket_notification" { - - count = length(var.pds_node_names) - # convert PDS node name to S3 bucket name compatible format - bucket = "${lower(replace(var.pds_node_names[count.index], "_", "-"))}-${var.pds_nucleus_staging_bucket_name_postfix}" - - queue { - events = ["s3:ObjectCreated:*"] - queue_arn = aws_sqs_queue.pds_nucleus_files_to_save_in_database_sqs_queue[count.index].arn - } -} - - resource "time_sleep" "wait_for_database" { create_duration = "2m" diff --git a/terraform/terraform-modules/product-copy-completion-checker/variables.tf b/terraform/terraform-modules/product-copy-completion-checker/variables.tf index d181364..b13d173 100644 --- a/terraform/terraform-modules/product-copy-completion-checker/variables.tf +++ b/terraform/terraform-modules/product-copy-completion-checker/variables.tf @@ -75,9 +75,15 @@ variable "pds_node_names" { sensitive = true } -variable "pds_nucleus_opensearch_urls" { - description = "List of PDS Nucleus OpenSearch Config file paths" - type = list(string) +variable "pds_nucleus_opensearch_url" { + description = "List of PDS Nucleus OpenSearch URL" + type = string + sensitive = true +} + +variable "pds_nucleus_opensearch_registry_names" { + description = "List of PDS Nucleus OpenSearch Registry Names" + type = list(string) sensitive = true } @@ -119,6 +125,12 @@ variable "airflow_env_name" { type = string } 
+variable "product_batch_size" { + description = "Size of the product batch to send to Nucleus DAG to process per given DAG invocation" + default = 10 + type = number +} + variable "region" { description = "AWS Region" type = string diff --git a/terraform/variables.tf b/terraform/variables.tf index bf0741e..ae5b51c 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -101,15 +101,22 @@ variable "pds_nucleus_default_airflow_dag_id" { variable "pds_node_names" { description = "List of PDS Node Names" type = list(string) - default = ["pds-sbn", "pds-img"] + sensitive = true +} + +variable "pds_nucleus_opensearch_url" { + description = "PDS Nucleus OpenSearch URL" + type = string + sensitive = true } -variable "pds_nucleus_opensearch_urls" { - description = "List of PDS Nucleus OpenSearch Config file paths" +variable "pds_nucleus_opensearch_registry_names" { + description = "List of PDS Nucleus OpenSearch Registry Names" + type = list(string) sensitive = true } + variable "pds_nucleus_opensearch_credential_relative_url" { description = "List of PDS Nucleus OpenSearch Credential Relative URL" type = string diff --git a/terraform/variables/terraform.tfvars.dev b/terraform/variables/terraform.tfvars.dev index 735aeb2..be2b474 100644 --- a/terraform/variables/terraform.tfvars.dev +++ b/terraform/variables/terraform.tfvars.dev @@ -13,7 +13,8 @@ aws_secretmanager_key_arn = "arn:aws:kms:us-west-2:12345678:key/abcdef-a # (PDS_ATM, PDS_ENG, PDS_GEO, PDS_IMG, PDS_NAIF, PDS_RMS, PDS_SBN, PSA, JAXA, ROSCOSMOS) pds_node_names = ["PDS_SBN", "PDS_IMG"] -pds_nucleus_opensearch_urls = ["https://abcdef.us-west-2.aoss.amazonaws.com", "https://pqrst.us-west-2.aoss.amazonaws.com"] +pds_nucleus_opensearch_url = "https://abcdef.us-west-2.aoss.amazonaws.com" +pds_nucleus_opensearch_registry_names = ["pds-nucleus-sbn-registry", "pds-nucleus-img-registry"] pds_nucleus_opensearch_credential_relative_url = "http:///AWS_CONTAINER_CREDENTIALS_RELATIVE_URI" 
pds_nucleus_harvest_replace_prefix_with_list = ["s3://pds-sbn-nucleus-staging", "s3://pds-img-nucleus-staging"] diff --git a/terraform/variables/terraform.tfvars.test b/terraform/variables/terraform.tfvars.test index 44ef7d3..724e62a 100644 --- a/terraform/variables/terraform.tfvars.test +++ b/terraform/variables/terraform.tfvars.test @@ -13,7 +13,8 @@ aws_secretmanager_key_arn = "arn:aws:kms:us-west-2:12345678:key/abcdef-a # (PDS_ATM, PDS_ENG, PDS_GEO, PDS_IMG, PDS_NAIF, PDS_RMS, PDS_SBN, PSA, JAXA, ROSCOSMOS) pds_node_names = ["PDS_SBN", "PDS_IMG"] -pds_nucleus_opensearch_urls = ["https://abcdef.us-west-2.aoss.amazonaws.com", "https://pqrst.us-west-2.aoss.amazonaws.com"] +pds_nucleus_opensearch_url = "https://abcdef.us-west-2.aoss.amazonaws.com" +pds_nucleus_opensearch_registry_names = ["pds-nucleus-sbn-registry", "pds-nucleus-img-registry"] pds_nucleus_opensearch_credential_relative_url = "http:///AWS_CONTAINER_CREDENTIALS_RELATIVE_URI" pds_nucleus_harvest_replace_prefix_with_list = ["s3://pds-sbn-nucleus-staging", "s3://pds-img-nucleus-staging"]