Ingest GCP folders, projects, and instances (#71)

* Add GCP folders (no cleanup yet)

* Add GCP projects

* Add GCP instances

* Add indices

* Add cleanup jobs for GCP folders, projects, and instances. Separate the compute module into its own Python file.

* Fix flake8 problems

* Update schema docs with GCP Projects, Folders, and Instances

* Update docs and remove an extra space

* Refactor so that we only get projects once and provide that data to all dependent modules

* Add docs on gcp/__init__.py

* Improve googleapi error handling

* Add docs to compute.py functions

* Replace :PARENT relationship with :RESOURCE to be consistent with AWS. Separate GCP schema docs from AWS schema docs.

* Fix linter errors

* Fix exception handling with getting project zones

* Space

* Add docs for compute.py

* Add generic :Instance label to EC2Instances and GCPInstances

* Explicitly handle being unable to enumerate projects, folders, and organizations. MERGE organizations rather than MATCH them when ingesting projects to avoid a crash when a project's parent is an organization that we do not know about (illustrated in the sketch below).

* Handle projects that do not have parents. Update schema docs to the correct links.

* Avoid link rot

* Reraise exceptions unless we can actually handle them

* Fix exception handling, add debug and info messages before syncs, add generic sync function for compute.py

* Catch a specific exception instead of the base one
achantavy authored May 23, 2019
1 parent 8f3f4b7 commit c9633f4
Showing 14 changed files with 722 additions and 80 deletions.
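
The MERGE-over-MATCH change called out in the commit list above deserves a concrete illustration. The sketch below is not part of this commit's diff -- the real logic lives in cartography/intel/gcp/crm.py, which is not shown here -- and the query text, function name, and organization id format are assumptions for illustration only:

INGEST_PROJECT_WITH_ORG_PARENT = """
MERGE (org:GCPOrganization{id: {OrgId}})
ON CREATE SET org.firstseen = timestamp()
SET org.lastupdated = {gcp_update_tag}
MERGE (project:GCPProject{id: {ProjectId}})
ON CREATE SET project.firstseen = timestamp()
SET project.lastupdated = {gcp_update_tag}
MERGE (org)-[r:RESOURCE]->(project)
ON CREATE SET r.firstseen = timestamp()
SET r.lastupdated = {gcp_update_tag}
"""


def load_projects_with_org_parents(neo4j_session, projects, gcp_update_tag):
    # Using MERGE (rather than MATCH) on the organization means a project whose
    # parent organization has not been ingested -- for example, one the caller has
    # no permission to enumerate -- still loads cleanly: a placeholder organization
    # node is created now and enriched later if the organization sync can see it.
    for project in projects:
        parent = project.get('parent', {})
        if parent.get('type') != 'organization':
            continue  # folder-parented and parentless projects are handled separately
        neo4j_session.run(
            INGEST_PROJECT_WITH_ORG_PARENT,
            OrgId='organizations/' + parent['id'],
            ProjectId=project['projectId'],
            gcp_update_tag=gcp_update_tag,
        )
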
2 changes: 1 addition & 1 deletion README.md
@@ -189,7 +189,7 @@ Our only method of accepting code changes is through Github pull requests.
## Reference

### Schema
Detailed view of [our schema and all data types](docs/schema.md) 😁.
Detailed view of [our schema and all data types](docs/schema/index.md) 😁.


### Sample queries
5 changes: 4 additions & 1 deletion cartography/data/indexes.cypher
@@ -45,4 +45,7 @@ CREATE INDEX ON :AWSVpc(id);
CREATE INDEX ON :AWSCidrBlock(id);
CREATE INDEX ON :AWSIpv4CidrBlock(id);
CREATE INDEX ON :AWSIpv6CidrBlock(id);
-CREATE INDEX ON :GCPOrganization(id);
+CREATE INDEX ON :GCPOrganization(id);
+CREATE INDEX ON :GCPFolder(id);
+CREATE INDEX ON :GCPProject(id);
+CREATE INDEX ON :GCPInstance(id);
17 changes: 17 additions & 0 deletions cartography/data/jobs/cleanup/gcp_compute_instance_cleanup.json
@@ -0,0 +1,17 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPInstance) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Delete GCP Instances that no longer exist and detach them from all previously connected nodes."
    },
    {
      "query": "MATCH (:GCPInstance)<-[r:RESOURCE]-(:GCPProject) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Instance-to-Project relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Instances"
}
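
For context on how a file like this gets used: cartography's cleanup jobs are data-driven, and the runner substitutes {UPDATE_TAG} and {LIMIT_SIZE} into each statement, repeating the iterative ones in batches. A minimal sketch of invoking this job -- the run_cleanup_job helper lives in cartography.util, outside this diff, so treat the exact name and signature as an assumption:

from cartography.util import run_cleanup_job


def cleanup_gcp_instances(neo4j_session, common_job_parameters):
    # common_job_parameters must carry UPDATE_TAG; any :GCPInstance node or
    # :RESOURCE relationship whose lastupdated differs from it is treated as
    # stale and deleted in batches of `iterationsize` until none remain.
    run_cleanup_job('gcp_compute_instance_cleanup.json', neo4j_session, common_job_parameters)

The three cleanup files that follow use the same pattern for folders, organizations, and projects.
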
23 changes: 23 additions & 0 deletions cartography/data/jobs/cleanup/gcp_crm_folder_cleanup.json
@@ -0,0 +1,23 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPFolder) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Delete GCPFolders that no longer exist and detach them from all previously connected nodes"
    },
    {
      "query": "MATCH (:GCPFolder)-[r:RESOURCE]-(:GCPFolder) WHERE r.lastupdated <> {UPDATE_TAG} WITH distinct r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Folder-to-Folder relationships that are out of date."
    },
    {
      "query": "MATCH (:GCPFolder)<-[r:RESOURCE]-(:GCPOrganization) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Folder-to-Organization relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Folders"
}
17 changes: 17 additions & 0 deletions cartography/data/jobs/cleanup/gcp_crm_organization_cleanup.json
@@ -0,0 +1,17 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPOrganization) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP organizations that are out of date."
    },
    {
      "query": "MATCH (:GCPOrganization)-[r:RESOURCE]->(:GCPProject) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Organization relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Organizations"
}
23 changes: 23 additions & 0 deletions cartography/data/jobs/cleanup/gcp_crm_project_cleanup.json
@@ -0,0 +1,23 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPProject) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Delete GCP Projects that no longer exist and detach them from all previously connected nodes."
    },
    {
      "query": "MATCH (:GCPProject)<-[r:RESOURCE]-(:GCPFolder) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Project-to-Folder relationships that are out of date."
    },
    {
      "query": "MATCH (:GCPProject)<-[r:RESOURCE]-(:GCPOrganization) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Project-to-Organization relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Projects"
}
9 changes: 0 additions & 9 deletions cartography/data/jobs/cleanup/gcp_organization_cleanup.json

This file was deleted.

4 changes: 2 additions & 2 deletions cartography/intel/aws/ec2.py
@@ -85,7 +85,7 @@ def load_ec2_instances(session, data, region, current_aws_account_id, aws_update
"""

ingest_instance = """
MERGE (instance:EC2Instance{instanceid: {InstanceId}})
MERGE (instance:Instance:EC2Instance{instanceid: {InstanceId}})
ON CREATE SET instance.firstseen = timestamp()
SET instance.publicdnsname = {PublicDnsName}, instance.privateipaddress = {PrivateIpAddress},
instance.imageid = {ImageId}, instance.instancetype = {InstanceType}, instance.monitoringstate = {MonitoringState},
@@ -374,7 +374,7 @@ def load_ec2_auto_scaling_groups(session, data, region, current_aws_account_id,
"""

ingest_instance = """
MERGE (instance:EC2Instance{instanceid: {InstanceId}})
MERGE (instance:Instance:EC2Instance{instanceid: {InstanceId}})
ON CREATE SET instance.firstseen = timestamp()
SET instance.lastupdated = {aws_update_tag}, instance.region={Region}
WITH instance
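
Because both EC2 and GCP instances now carry the generic :Instance label, cross-provider questions become a single MATCH. The query below is an illustration enabled by this change, not code added by the commit:

CROSS_PROVIDER_INSTANCE_COUNT = """
MATCH (i:Instance)
RETURN labels(i) AS labels, count(i) AS instances
ORDER BY instances DESC
"""


def count_instances_by_label(neo4j_session):
    # Returns one row per label combination, e.g. [Instance, EC2Instance] and
    # [Instance, GCPInstance], so one query covers both providers.
    return list(neo4j_session.run(CROSS_PROVIDER_INSTANCE_COUNT))
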
104 changes: 102 additions & 2 deletions cartography/intel/gcp/__init__.py
@@ -1,12 +1,104 @@
from oauth2client.client import GoogleCredentials, ApplicationDefaultCredentialsError
import googleapiclient.discovery
import logging
from collections import namedtuple

from cartography.intel.gcp import crm
from cartography.intel.gcp import crm, compute

logger = logging.getLogger(__name__)
Resources = namedtuple('Resources', 'crm_v1 crm_v2 compute')


def _get_crm_resource_v1(credentials):
    """
    Instantiates a Google Compute Resource Manager v1 resource object to call the Resource Manager API.
    See https://cloud.google.com/resource-manager/reference/rest/.
    :param credentials: The GoogleCredentials object
    :return: A CRM v1 resource object
    """
    # cache_discovery=False to suppress extra warnings.
    # See https://github.com/googleapis/google-api-python-client/issues/299#issuecomment-268915510 and related issues
    return googleapiclient.discovery.build('cloudresourcemanager', 'v1', credentials=credentials, cache_discovery=False)


def _get_crm_resource_v2(credentials):
    """
    Instantiates a Google Compute Resource Manager v2 resource object to call the Resource Manager API.
    We need a v2 resource object to query for GCP folders.
    :param credentials: The GoogleCredentials object
    :return: A CRM v2 resource object
    """
    return googleapiclient.discovery.build('cloudresourcemanager', 'v2', credentials=credentials, cache_discovery=False)


def _get_compute_resource(credentials):
    """
    Instantiates a Google Compute resource object to call the Compute API. This is used to pull zone, instance, and
    networking data. See https://cloud.google.com/compute/docs/reference/rest/v1/.
    :param credentials: The GoogleCredentials object
    :return: A Compute resource object
    """
    return googleapiclient.discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False)


def _initialize_resources(credentials):
    """
    Create namedtuple of all resource objects necessary for GCP data gathering.
    :param credentials: The GoogleCredentials object
    :return: namedtuple of all resource objects
    """
    return Resources(
        crm_v1=_get_crm_resource_v1(credentials),
        crm_v2=_get_crm_resource_v2(credentials),
        compute=_get_compute_resource(credentials)
    )


def _sync_single_project(session, resources, project_id, gcp_update_tag, common_job_parameters):
    """
    Handles graph sync for a single GCP project.
    :param session: The Neo4j session
    :param resources: namedtuple of the GCP resource objects
    :param project_id: The project ID number to sync. See the `projectId` field in
        https://cloud.google.com/resource-manager/reference/rest/v1/projects
    :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
    :param common_job_parameters: Other parameters sent to Neo4j
    :return: Nothing
    """
    compute.sync(session, resources.compute, project_id, gcp_update_tag, common_job_parameters)


def _sync_multiple_projects(session, resources, projects, gcp_update_tag, common_job_parameters):
    """
    Handles graph sync for multiple GCP projects.
    :param session: The Neo4j session
    :param resources: namedtuple of the GCP resource objects
    :param: projects: A list of projects. At minimum, this list should contain a list of dicts with the key "projectId"
        defined; so it would look like this: [{"projectId": "my-project-id-12345"}].
        This is the returned data from `crm.get_gcp_projects()`.
        See https://cloud.google.com/resource-manager/reference/rest/v1/projects.
    :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
    :param common_job_parameters: Other parameters sent to Neo4j
    :return: Nothing
    """
    logger.debug("Syncing %d GCP projects.", len(projects))
    crm.sync_gcp_projects(session, projects, gcp_update_tag, common_job_parameters)

    for project in projects:
        project_id = project['projectId']
        logger.info("Syncing GCP project %s.", project_id)
        _sync_single_project(session, resources, project_id, gcp_update_tag, common_job_parameters)


def start_gcp_ingestion(session, config):
    """
    Starts the GCP ingestion process by initializing Google Application Default Credentials, creating the necessary
    resource objects, listing all GCP organizations and projects available to the GCP identity, and supplying that
    context to all intel modules.
    :param session: The Neo4j session
    :param config: A `cartography.config` object
    :return: Nothing
    """
    common_job_parameters = {
        "UPDATE_TAG": config.update_tag,
    }
@@ -27,4 +27,12 @@ def start_gcp_ingestion(session, config):
            e
        )
        return
    crm.sync_gcp_organizations(session, credentials, config.update_tag, common_job_parameters)
    resources = _initialize_resources(credentials)

    # If we don't have perms to pull Orgs or Folders from GCP, we will skip safely
    crm.sync_gcp_organizations(session, resources.crm_v1, config.update_tag, common_job_parameters)
    crm.sync_gcp_folders(session, resources.crm_v2, config.update_tag, common_job_parameters)

    projects = crm.get_gcp_projects(resources.crm_v1)

    _sync_multiple_projects(session, resources, projects, config.update_tag, common_job_parameters)
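
A hedged sketch of how the reworked entry point is driven end to end; the Config namedtuple and connection details below are stand-ins for cartography's real config object and sync machinery, which sit outside this diff:

import time
from collections import namedtuple

from neo4j.v1 import GraphDatabase  # neo4j 1.x driver import path; newer drivers use `from neo4j import GraphDatabase`

import cartography.intel.gcp

Config = namedtuple('Config', 'update_tag')  # stand-in for cartography.config

driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'neo4j'))
with driver.session() as session:
    # update_tag is stamped onto every node and relationship this run touches;
    # the cleanup jobs above later delete anything whose tag is older.
    cartography.intel.gcp.start_gcp_ingestion(session, Config(update_tag=int(time.time())))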