Skip to content

Commit

Permalink
Merge pull request #132 from NASA-PDS/new-harvest
Browse files Browse the repository at this point in the history
Integrate the latest version 4.0.1 of harvest in Nucleus workflow (New harvest)
  • Loading branch information
ramesh-maddegoda authored Nov 8, 2024
2 parents 636389c + 8c99489 commit 66fb5b8
Show file tree
Hide file tree
Showing 17 changed files with 319 additions and 82 deletions.
92 changes: 85 additions & 7 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -157,31 +157,41 @@
"filename": "terraform/README.md",
"hashed_secret": "f2d4e04179e44fa7386b985ac3c7ee4d95dfd65d",
"is_verified": false,
"line_number": 97,
"line_number": 102,
"is_secret": false
},
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/README.md",
"hashed_secret": "659a4d010b74afeddbcb9c4e8eae01f4390eeacc",
"is_verified": false,
"line_number": 98,
"line_number": 103,
"is_secret": false
},
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/README.md",
"hashed_secret": "bd3b85b91cb8cf6cfc6a4adc7a2505714939505b",
"is_verified": false,
"line_number": 98,
"line_number": 103,
"is_secret": false
},
{
"type": "Secret Keyword",
"filename": "terraform/README.md",
"hashed_secret": "a356cb3f3d1c9797cf59daf5b22fc0c7434d8dc7",
"is_verified": false,
"line_number": 101,
"line_number": 107,
"is_secret": false
}
],
"terraform/terraform-modules/ecs-ecr/docker/deploy-ecr-images.sh": [
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/terraform-modules/ecs-ecr/docker/deploy-ecr-images.sh",
"hashed_secret": "9ad897024d8c36c541d7fe84084c4e9f4df00b2a",
"is_verified": false,
"line_number": 4,
"is_secret": false
}
],
Expand Down Expand Up @@ -209,15 +219,15 @@
"filename": "terraform/terraform-modules/ecs-ecr/ecs_ecr.tf",
"hashed_secret": "957580e87fca1bd3e2acdfbae2a6c6e24a1d4ade",
"is_verified": false,
"line_number": 197,
"line_number": 185,
"is_secret": false
},
{
"type": "Secret Keyword",
"filename": "terraform/terraform-modules/ecs-ecr/ecs_ecr.tf",
"hashed_secret": "227f2d989bdd935539c4e9bd92b8c4a5965505ac",
"is_verified": false,
"line_number": 211,
"line_number": 199,
"is_secret": false
}
],
Expand Down Expand Up @@ -320,7 +330,75 @@
"line_number": 11,
"is_secret": false
}
],
"terraform/variables/terraform.tfvars.dev": [
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/variables/terraform.tfvars.dev",
"hashed_secret": "f2d4e04179e44fa7386b985ac3c7ee4d95dfd65d",
"is_verified": false,
"line_number": 3,
"is_secret": false
},
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/variables/terraform.tfvars.dev",
"hashed_secret": "226201cd08f00a589068e569d01716d0ad488ae4",
"is_verified": false,
"line_number": 4,
"is_secret": false
},
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/variables/terraform.tfvars.dev",
"hashed_secret": "4592cff3a9944664c9c182333782a5d551ec2516",
"is_verified": false,
"line_number": 4,
"is_secret": false
},
{
"type": "Secret Keyword",
"filename": "terraform/variables/terraform.tfvars.dev",
"hashed_secret": "b293afb11f1f9b32461ab510aacb65a27ccb6111",
"is_verified": false,
"line_number": 9,
"is_secret": false
}
],
"terraform/variables/terraform.tfvars.test": [
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/variables/terraform.tfvars.test",
"hashed_secret": "f2d4e04179e44fa7386b985ac3c7ee4d95dfd65d",
"is_verified": false,
"line_number": 3,
"is_secret": false
},
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/variables/terraform.tfvars.test",
"hashed_secret": "226201cd08f00a589068e569d01716d0ad488ae4",
"is_verified": false,
"line_number": 4,
"is_secret": false
},
{
"type": "AWS Sensitive Information (Experimental Plugin)",
"filename": "terraform/variables/terraform.tfvars.test",
"hashed_secret": "4592cff3a9944664c9c182333782a5d551ec2516",
"is_verified": false,
"line_number": 4,
"is_secret": false
},
{
"type": "Secret Keyword",
"filename": "terraform/variables/terraform.tfvars.test",
"hashed_secret": "b293afb11f1f9b32461ab510aacb65a27ccb6111",
"is_verified": false,
"line_number": 9,
"is_secret": false
}
]
},
"generated_at": "2024-10-03T02:25:24Z"
"generated_at": "2024-11-08T07:13:21Z"
}
67 changes: 51 additions & 16 deletions terraform/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,24 @@ cd nucleus/terraform

4. Create a `terraform.tfvars` file locally under `./terraform/terraform.tfvars` and enter values for the variables specified in the `variables.tf` file at `nucleus/terraform/terraform-modules/mwaa-env/variables.tf`. Ensure these values match your AWS setup and the expected variable types (e.g., string `" "`, number `1`, list(string) `[" "]`). Most of the values below can be obtained from the system admin team of your AWS account.

Note: Example `terraform.tfvars` files are available in the `terraform/variables` directory for your reference.

- env : Name of the Cloud environment to deploy PDS Nucleus (E.g: "mcp-dev", "mcp-test")
- region : AWS Region
- vpc_id : VPC ID of your AWS VPC
- subnet_ids : List of Private Subnet IDs to be used for the MWAA
- vpc_cidr : VPC CIDR for MWAA (E.g.: "10.1.0.0/16")
- permission_boundary_for_iam_roles : The permission boundary for IAM roles can be obtained from the MCP System Admins or PDS Engineering Node team
- database_availability_zones : RDS database availability zones (E.g.: ["us-west-2a"])
- aws_secretmanager_key_arn : The ARN of aws/secretmanager key obtained from KMS -> AWS managed keys (E.g.: "arn:aws:kms:us-west-2:12345678:key/12345-1234-1234-1234-12345abcd")

- Set node-specific values for the following lists in the correct order
- pds_node_names = List of PDS Node names to be supported (E.g.: ["PDS_SBN", "PDS_IMG", "PDS_ENG"]). The following node name format should be used.
- (PDS_ATM, PDS_ENG, PDS_GEO, PDS_IMG, PDS_NAIF, PDS_RMS, PDS_SBN, PSA, JAXA, ROSCOSMOS)
- Please check https://nasa-pds.github.io/registry/user/harvest_job_configuration.html for PDS Node name descriptions.
- pds_nucleus_opensearch_urls = List of Node specific OpenSearch URLs (E.g.:["https://search-node2-dev-abcdefghijklmnop.us-west-2.es.amazonaws.com:443","https://search-node2-dev-abcdefghijklmnop.us-west-2.es.amazonaws.com:443"])
- pds_nucleus_harvest_replace_prefix_with_list = List of harvest replace with strings (E.g.: ["s3://pds-sbn-nucleus-staging","s3://pds-img-nucleus-staging"])
- pds_nucleus_opensearch_urls : List of Node specific OpenSearch URLs (E.g.: ["https://abcdef.us-west-2.aoss.amazonaws.com", "https://opqrst.us-west-2.aoss.amazonaws.com"])
- pds_nucleus_opensearch_credential_relative_url : OpenSearch Credential URL (E.g.: "http://<IP ADDRESS>/AWS_CONTAINER_CREDENTIALS_RELATIVE_URI")
- pds_nucleus_harvest_replace_prefix_with_list : List of harvest replace with strings (E.g.: ["s3://pds-sbn-nucleus-staging","s3://pds-img-nucleus-staging"])
- pds_nucleus_harvest_replace_prefix_with : Prefix to replace in PDS Harvest tool
- airflow_env_name: Name of the Nucleus Airflow environment (E.g.: "pds-nucleus-airflow-env")
Expand All @@ -85,38 +89,45 @@ cd nucleus/terraform
- pds_nucleus_cold_archive_bucket_name_postfix : Postfix of the S3 Bucket name to keep PDS cold archive data files (E.g.: archive-cold-mcp-dev)
- pds_nucleus_config_bucket_name : S3 Bucket name to keep temporary configurations (E.g.: pds-nucleus-config-mcp-test)
- pds_nucleus_default_airflow_dag_id : The default example DAG to be included for testing (E.g.: pds-basic-registry-load-use-case)
- pds_registry_loader_harvest_task_role_arn: An IAM role which is associated with a Cognito user group


> Note: `terraform.tfvars` is only used to test your configuration with the actual values in your AWS account. This file will not be uploaded to GitHub as it's ignored by Git. Once testing has completed successfully, work with your admin to get the values of these tested variables updated via GitHub secrets, which are dynamically passed in at runtime.
```
# Example terraform.tfvars
env = "mcp-test"
region = "us-west-2"
vpc_id = "vpc-12345678"
subnet_ids = ["subnet-123456789", "subnet-987654321"]
vpc_cidr = "10.2.0.0/16"
permission_boundary_for_iam_roles = "mcp-example-role"database_availability_zones = ["us-west-2a"]
env = "mcp-test"
region = "us-west-2"
vpc_id = "vpc-12345678"
subnet_ids = ["subnet-123456789", "subnet-987654321"]
vpc_cidr = "10.2.0.0/16"
permission_boundary_for_iam_roles = "permission_boundary_role_name"
database_availability_zones = ["us-west-2a"]
aws_secretmanager_key_arn = "arn:aws:kms:us-west-2:12345678:key/12345-1234-1234-1234-12345abcd"
# Set node-specific values for the following lists in the correct order. For the list of node names
# the following node name format should be used.
# (PDS_ATM, PDS_ENG, PDS_GEO, PDS_IMG, PDS_NAIF, PDS_RMS, PDS_SBN, PSA, JAXA, ROSCOSMOS)
# Please check https://nasa-pds.github.io/registry/user/harvest_job_configuration.html for PDS Node name descriptions.
pds_node_names = ["PDS_SBN", "PDS_IMG"]
pds_nucleus_opensearch_urls = ["https://search-node2-dev-abcdefghijklmnop.us-west-2.es.amazonaws.com:443","https://search-node2-dev-abcdefghijklmnop.us-west-2.es.amazonaws.com:443"]
pds_nucleus_harvest_replace_prefix_with_list = ["s3://pds-sbn-nucleus-staging","s3://pds-img-nucleus-staging"]
pds_node_names = ["PDS_SBN", "PDS_IMG"]
pds_nucleus_opensearch_urls = ["https://abcdef.us-west-2.aoss.amazonaws.com", "https://opqrst.us-west-2.aoss.amazonaws.com"]
pds_nucleus_opensearch_credential_relative_url = "http://<IP ADDRESS>/AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"
pds_nucleus_harvest_replace_prefix_with_list = ["s3://pds-sbn-nucleus-staging", "s3://pds-img-nucleus-staging"]
airflow_env_name = "pds-nucleus-airflow-env"
mwaa_dag_s3_bucket_name = "pds-nucleus-airflow-dags-bucket-mcp-dev"
pds_nucleus_staging_bucket_name_postfix = "staging-mcp-dev"
pds_nucleus_hot_archive_bucket_name_postfix = "archive-hot-mcp-dev"
airflow_env_name = "pds-nucleus-airflow-env"
mwaa_dag_s3_bucket_name = "pds-nucleus-airflow-dags-bucket-mcp-dev"
pds_nucleus_staging_bucket_name_postfix = "staging-mcp-dev"
pds_nucleus_hot_archive_bucket_name_postfix = "archive-hot-mcp-dev"
pds_nucleus_cold_archive_bucket_name_postfix = "archive-cold-mcp-dev"
pds_nucleus_config_bucket_name = "pds-nucleus-config-mcp-dev"
pds_nucleus_config_bucket_name = "pds-nucleus-config-mcp-dev"
pds_nucleus_default_airflow_dag_id = "pds-basic-registry-load-use-case"
pds_registry_loader_harvest_task_role_arn = "arn:aws:iam::12345678:role/harvest-task-role"
```


Expand Down Expand Up @@ -211,3 +222,27 @@ python get-airflow-ui-webtoken.py

7. Copy the generated Nucleus Airflow UI web token and paste it into a web browser address bar to access the Airflow UI.


## Steps to Uninstall the PDS Nucleus Baseline System

1. Open a terminal and change current working directory to the `nucleus/terraform` directory.

```shell
cd nucleus/terraform
```

2. Uninstall Nucleus baseline system using Terraform destroy.

```shell
terraform destroy
```

3. The above command will fail to remove any non-empty S3 buckets (expected behaviour). Note the names of the S3 buckets that failed to delete in
the output of the above `terraform destroy` command, and empty those S3 buckets manually as explained in
https://docs.aws.amazon.com/AmazonS3/latest/userguide/empty-bucket.html.

4. Execute the following command again to remove the remaining S3 buckets.

```shell
terraform destroy
```
8 changes: 5 additions & 3 deletions terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ module "ecs_ecr" {

pds_registry_loader_harvest_cloudwatch_logs_group = var.pds_registry_loader_harvest_cloudwatch_logs_group
pds_registry_loader_harvest_cloudwatch_logs_region = var.region
pds_registry_loader_harvest_task_role_arn = var.pds_registry_loader_harvest_task_role_arn

pds_validate_cloudwatch_logs_group = var.pds_validate_cloudwatch_logs_group
pds_validate_cloudwatch_logs_region = var.region
Expand Down Expand Up @@ -100,9 +101,10 @@ module "product-copy-completion-checker" {
pds_nucleus_hot_archive_bucket_name_postfix = var.pds_nucleus_hot_archive_bucket_name_postfix
pds_nucleus_cold_archive_bucket_name_postfix = var.pds_nucleus_cold_archive_bucket_name_postfix

pds_node_names = var.pds_node_names
pds_nucleus_opensearch_urls = var.pds_nucleus_opensearch_urls
pds_nucleus_harvest_replace_prefix_with_list = var.pds_nucleus_harvest_replace_prefix_with_list
pds_node_names = var.pds_node_names
pds_nucleus_opensearch_urls = var.pds_nucleus_opensearch_urls
pds_nucleus_opensearch_credential_relative_url = var.pds_nucleus_opensearch_credential_relative_url
pds_nucleus_harvest_replace_prefix_with_list = var.pds_nucleus_harvest_replace_prefix_with_list

database_availability_zones = var.database_availability_zones
airflow_env_name = var.airflow_env_name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"entryPoint": [
"/bin/sh",
"-c",
"echo 'trust.self-signed = true' >> /etc/es-auth.cfg && echo 'user = $OPENSEARCH_USER' >> /etc/es-auth.cfg && echo 'password = $OPENSEARCH_PASSWORD' >> /etc/es-auth.cfg && harvest"
"echo 'user = $OPENSEARCH_USER' >> /etc/es-auth.cfg && echo 'password = $OPENSEARCH_PASSWORD' >> /etc/es-auth.cfg && harvest -v DEBUG -c $HARVEST_CFG"
],
"command": [],
"environment": [],
Expand Down
26 changes: 26 additions & 0 deletions terraform/terraform-modules/ecs-ecr/docker/deploy-ecr-images.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
#
# Builds (or pulls) the PDS Nucleus container images and pushes them to the
# ECR registry of the target AWS account in us-west-2:
#   - pds-nucleus-config-init       (built from ./config-init)
#   - pds-nucleus-s3-to-efs-copy    (built from ./s3-to-efs-copy)
#   - pds-registry-loader-harvest   (re-tagged from nasapds/registry-loader)
#   - pds-validate                  (re-tagged from nasapds/validate)
#
# Run this from the directory that contains `terraform-modules`, because the
# `cd` commands below use paths relative to it.

# Stop at the first failing command (including a failed `cd` or a failed stage
# of the login pipeline) so that a later step never builds in the wrong
# directory or pushes a stale image after an earlier build/pull failed.
set -eo pipefail

# Login to ECR
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com

# Deploy pds-nucleus-config-init ECR image
cd ./terraform-modules/ecs-ecr/docker/config-init
docker build -t pds-nucleus-config-init .
docker tag pds-nucleus-config-init:latest "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-nucleus-config-init:latest
docker push "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-nucleus-config-init:latest

# Deploy pds-nucleus-s3-to-efs-copy ECR image
# (sibling directory of config-init, hence the relative path)
cd ../s3-to-efs-copy
docker build -t pds-nucleus-s3-to-efs-copy .
docker tag pds-nucleus-s3-to-efs-copy:latest "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-nucleus-s3-to-efs-copy:latest
docker push "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-nucleus-s3-to-efs-copy:latest

# Deploy pds-registry-loader-harvest ECR image
# (no local build: the public image is pulled and re-tagged for ECR)
docker image pull nasapds/registry-loader
docker tag nasapds/registry-loader:latest "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-registry-loader-harvest:latest
docker push "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-registry-loader-harvest:latest

# Deploy pds-validate ECR image
# (no local build: the public image is pulled and re-tagged for ECR)
docker image pull nasapds/validate
docker tag nasapds/validate:latest "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-validate:latest
docker push "${pds_nucleus_aws_account_id}".dkr.ecr.us-west-2.amazonaws.com/pds-validate:latest
14 changes: 1 addition & 13 deletions terraform/terraform-modules/ecs-ecr/ecs_ecr.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@ resource "local_file" "ecs_task_role_iam_policy_file" {
filename = "terraform-modules/ecs-ecr/ecs_task_role_iam_policy.json"

depends_on = [data.template_file.ecs_task_role_iam_policy_template]

lifecycle {
prevent_destroy = true
}
}

data "template_file" "ecs_task_execution_role_iam_policy_template" {
Expand All @@ -47,10 +43,6 @@ resource "local_file" "ecs_task_execution_role_iam_policy_file" {
filename = "terraform-modules/ecs-ecr/ecs_task_execution_role_iam_policy.json"

depends_on = [data.template_file.ecs_task_execution_role_iam_policy_template]

lifecycle {
prevent_destroy = true
}
}

data "template_file" "deploy_ecr_images_script_template" {
Expand All @@ -66,10 +58,6 @@ resource "local_file" "deploy_ecr_images_script_file" {
filename = "terraform-modules/ecs-ecr/docker/deploy-ecr-images.sh"

depends_on = [data.template_file.ecs_task_execution_role_iam_policy_template]

lifecycle {
prevent_destroy = true
}
}

#-------------------------------------
Expand Down Expand Up @@ -253,7 +241,7 @@ resource "aws_ecs_task_definition" "pds-registry-loader-harvest" {


container_definitions = data.template_file.pds-registry-loader-harvest-containers-json-template[count.index].rendered
task_role_arn = aws_iam_role.pds_nucleus_ecs_task_role.arn
task_role_arn = var.pds_registry_loader_harvest_task_role_arn
execution_role_arn = aws_iam_role.pds_nucleus_ecs_task_execution_role.arn

depends_on = [data.template_file.pds-validate-containers-json-template]
Expand Down
8 changes: 7 additions & 1 deletion terraform/terraform-modules/ecs-ecr/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,13 @@ variable "pds_registry_loader_harvest_cloudwatch_logs_group" {

variable "pds_registry_loader_harvest_cloudwatch_logs_region" {
type = string
description = "PDS Validate CloudWatch Logs Region"
description = "PDS Registry Loader Harvest CloudWatch Logs Region"
sensitive = true
}

# ARN of the IAM role that is assigned as the `task_role_arn` of the
# pds-registry-loader-harvest ECS task definition.
variable "pds_registry_loader_harvest_task_role_arn" {
type = string
description = "PDS Registry Loader Harvest Task Role ARN"
sensitive = true
}

Expand Down
4 changes: 0 additions & 4 deletions terraform/terraform-modules/mwaa-env/mwaa_env.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ resource "local_file" "mwaa_inline_policy_file" {
filename = "terraform-modules/mwaa-env/mwaa_execution_role_iam_policy.json"

depends_on = [data.template_file.mwaa_inline_policy_template]

lifecycle {
prevent_destroy = true
}
}

# IAM Policy Document for Inline Policy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
{
"Effect": "Allow",
"Action": "iam:PassRole",
"Resource": "arn:aws:iam::${pds_nucleus_aws_account_id}:role/pds-nucleus*"
"Resource": "arn:aws:iam::${pds_nucleus_aws_account_id}:role/pds-*"
},
{
"Effect": "Allow",
Expand Down
Loading

0 comments on commit 66fb5b8

Please sign in to comment.