Ingest GCP folders, projects, and instances (#71)

* Add GCP folders (no cleanup yet)

* Add GCP projects

* Add GCP instances

* Add indices

* Add cleanup jobs for GCP folders, projects, and instances. Separate the compute module into its own Python file.

* Fix flake8 problems

* Update schema docs with GCP Projects, Folders, and Instances

* Update docs and remove an extra space

* Refactor so that we only get projects once and provide that data to all dependent modules

* Add docs on gcp/__init__.py

* Improve googleapi error handling

* Add docs to compute.py functions

* Replace :PARENT relationship with :RESOURCE to be consistent with AWS. Separate GCP schema docs from AWS schema docs.

* Fix linter errors

* Fix exception handling with getting project zones

* Space

* Add docs for compute.py

* Add generic :Instance label to EC2Instances and GCPInstances

* Explicitly handle being unable to enumerate projects, folders, and organizations. MERGE organizations rather than MATCH them when ingesting projects to avoid a crash when a project's parent is an organization that we do not know about (illustrated in the sketch below).

* Handle projects that do not have parents. Update schema docs to the correct links.

* Avoid link rot

* Reraise exceptions unless we can actually handle them

* Fix exception handling, add debug and info messages before syncs, add generic sync function for compute.py

* Catch a specific exception instead of the base one
achantavy authored May 23, 2019
1 parent 8f3f4b7 commit c9633f4
Showing 14 changed files with 722 additions and 80 deletions.
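
The MERGE-over-MATCH change called out in the commit list above deserves a concrete illustration. The sketch below is not part of this commit's diff -- the real logic lives in cartography/intel/gcp/crm.py, which is not shown here -- and the query text, function name, and organization id format are assumptions for illustration only:

INGEST_PROJECT_WITH_ORG_PARENT = """
MERGE (org:GCPOrganization{id: {OrgId}})
ON CREATE SET org.firstseen = timestamp()
SET org.lastupdated = {gcp_update_tag}
MERGE (project:GCPProject{id: {ProjectId}})
ON CREATE SET project.firstseen = timestamp()
SET project.lastupdated = {gcp_update_tag}
MERGE (org)-[r:RESOURCE]->(project)
ON CREATE SET r.firstseen = timestamp()
SET r.lastupdated = {gcp_update_tag}
"""


def load_projects_with_org_parents(neo4j_session, projects, gcp_update_tag):
    # Using MERGE (rather than MATCH) on the organization means a project whose
    # parent organization has not been ingested -- for example, one the caller has
    # no permission to enumerate -- still loads cleanly: a placeholder organization
    # node is created now and enriched later if the organization sync can see it.
    for project in projects:
        parent = project.get('parent', {})
        if parent.get('type') != 'organization':
            continue  # folder-parented and parentless projects are handled separately
        neo4j_session.run(
            INGEST_PROJECT_WITH_ORG_PARENT,
            OrgId='organizations/' + parent['id'],
            ProjectId=project['projectId'],
            gcp_update_tag=gcp_update_tag,
        )
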
2 changes: 1 addition & 1 deletion README.md
@@ -189,7 +189,7 @@ Our only method of accepting code changes is through Github pull requests.
## Reference

### Schema
Detailed view of [our schema and all data types](docs/schema.md) 😁.
Detailed view of [our schema and all data types](docs/schema/index.md) 😁.


### Sample queries
5 changes: 4 additions & 1 deletion cartography/data/indexes.cypher
@@ -45,4 +45,7 @@ CREATE INDEX ON :AWSVpc(id);
CREATE INDEX ON :AWSCidrBlock(id);
CREATE INDEX ON :AWSIpv4CidrBlock(id);
CREATE INDEX ON :AWSIpv6CidrBlock(id);
-CREATE INDEX ON :GCPOrganization(id);
+CREATE INDEX ON :GCPOrganization(id);
+CREATE INDEX ON :GCPFolder(id);
+CREATE INDEX ON :GCPProject(id);
+CREATE INDEX ON :GCPInstance(id);
17 changes: 17 additions & 0 deletions cartography/data/jobs/cleanup/gcp_compute_instance_cleanup.json
@@ -0,0 +1,17 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPInstance) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Delete GCP Instances that no longer exist and detach them from all previously connected nodes."
    },
    {
      "query": "MATCH (:GCPInstance)<-[r:RESOURCE]-(:GCPProject) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Instance-to-Project relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Instances"
}
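
For context on how a file like this gets used: cartography's cleanup jobs are data-driven, and the runner substitutes {UPDATE_TAG} and {LIMIT_SIZE} into each statement, repeating the iterative ones in batches. A minimal sketch of invoking this job -- the run_cleanup_job helper lives in cartography.util, outside this diff, so treat the exact name and signature as an assumption:

from cartography.util import run_cleanup_job


def cleanup_gcp_instances(neo4j_session, common_job_parameters):
    # common_job_parameters must carry UPDATE_TAG; any :GCPInstance node or
    # :RESOURCE relationship whose lastupdated differs from it is treated as
    # stale and deleted in batches of `iterationsize` until none remain.
    run_cleanup_job('gcp_compute_instance_cleanup.json', neo4j_session, common_job_parameters)

The three cleanup files that follow use the same pattern for folders, organizations, and projects.
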
23 changes: 23 additions & 0 deletions cartography/data/jobs/cleanup/gcp_crm_folder_cleanup.json
@@ -0,0 +1,23 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPFolder) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Delete GCPFolders that no longer exist and detach them from all previously connected nodes"
    },
    {
      "query": "MATCH (:GCPFolder)-[r:RESOURCE]-(:GCPFolder) WHERE r.lastupdated <> {UPDATE_TAG} WITH distinct r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Folder-to-Folder relationships that are out of date."
    },
    {
      "query": "MATCH (:GCPFolder)<-[r:RESOURCE]-(:GCPOrganization) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Folder-to-Organization relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Folders"
}
17 changes: 17 additions & 0 deletions cartography/data/jobs/cleanup/gcp_crm_organization_cleanup.json
@@ -0,0 +1,17 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPOrganization) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP organizations that are out of date."
    },
    {
      "query": "MATCH (:GCPOrganization)-[r:RESOURCE]->(:GCPProject) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Organization relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Organizations"
}
23 changes: 23 additions & 0 deletions cartography/data/jobs/cleanup/gcp_crm_project_cleanup.json
@@ -0,0 +1,23 @@
{
  "statements": [
    {
      "query": "MATCH (n:GCPProject) WHERE n.lastupdated <> {UPDATE_TAG} WITH n LIMIT {LIMIT_SIZE} DETACH DELETE (n) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Delete GCP Projects that no longer exist and detach them from all previously connected nodes."
    },
    {
      "query": "MATCH (:GCPProject)<-[r:RESOURCE]-(:GCPFolder) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Project-to-Folder relationships that are out of date."
    },
    {
      "query": "MATCH (:GCPProject)<-[r:RESOURCE]-(:GCPOrganization) WHERE r.lastupdated <> {UPDATE_TAG} WITH r LIMIT {LIMIT_SIZE} DELETE (r) return COUNT(*) as TotalCompleted",
      "iterative": true,
      "iterationsize": 100,
      "__comment__": "Remove GCP Project-to-Organization relationships that are out of date."
    }
  ],
  "name": "cleanup GCP Projects"
}
9 changes: 0 additions & 9 deletions cartography/data/jobs/cleanup/gcp_organization_cleanup.json

This file was deleted.

4 changes: 2 additions & 2 deletions cartography/intel/aws/ec2.py
@@ -85,7 +85,7 @@ def load_ec2_instances(session, data, region, current_aws_account_id, aws_update
"""

ingest_instance = """
MERGE (instance:EC2Instance{instanceid: {InstanceId}})
MERGE (instance:Instance:EC2Instance{instanceid: {InstanceId}})
ON CREATE SET instance.firstseen = timestamp()
SET instance.publicdnsname = {PublicDnsName}, instance.privateipaddress = {PrivateIpAddress},
instance.imageid = {ImageId}, instance.instancetype = {InstanceType}, instance.monitoringstate = {MonitoringState},
@@ -374,7 +374,7 @@ def load_ec2_auto_scaling_groups(session, data, region, current_aws_account_id,
"""

ingest_instance = """
MERGE (instance:EC2Instance{instanceid: {InstanceId}})
MERGE (instance:Instance:EC2Instance{instanceid: {InstanceId}})
ON CREATE SET instance.firstseen = timestamp()
SET instance.lastupdated = {aws_update_tag}, instance.region={Region}
WITH instance
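
Because both EC2 and GCP instances now carry the generic :Instance label, cross-provider questions become a single MATCH. The query below is an illustration enabled by this change, not code added by the commit:

CROSS_PROVIDER_INSTANCE_COUNT = """
MATCH (i:Instance)
RETURN labels(i) AS labels, count(i) AS instances
ORDER BY instances DESC
"""


def count_instances_by_label(neo4j_session):
    # Returns one row per label combination, e.g. [Instance, EC2Instance] and
    # [Instance, GCPInstance], so one query covers both providers.
    return list(neo4j_session.run(CROSS_PROVIDER_INSTANCE_COUNT))
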
104 changes: 102 additions & 2 deletions cartography/intel/gcp/__init__.py
@@ -1,12 +1,104 @@
from oauth2client.client import GoogleCredentials, ApplicationDefaultCredentialsError
import googleapiclient.discovery
import logging
from collections import namedtuple

from cartography.intel.gcp import crm
from cartography.intel.gcp import crm, compute

logger = logging.getLogger(__name__)
Resources = namedtuple('Resources', 'crm_v1 crm_v2 compute')


def _get_crm_resource_v1(credentials):
    """
    Instantiates a Google Compute Resource Manager v1 resource object to call the Resource Manager API.
    See https://cloud.google.com/resource-manager/reference/rest/.
    :param credentials: The GoogleCredentials object
    :return: A CRM v1 resource object
    """
    # cache_discovery=False to suppress extra warnings.
    # See https://github.com/googleapis/google-api-python-client/issues/299#issuecomment-268915510 and related issues
    return googleapiclient.discovery.build('cloudresourcemanager', 'v1', credentials=credentials, cache_discovery=False)


def _get_crm_resource_v2(credentials):
    """
    Instantiates a Google Compute Resource Manager v2 resource object to call the Resource Manager API.
    We need a v2 resource object to query for GCP folders.
    :param credentials: The GoogleCredentials object
    :return: A CRM v2 resource object
    """
    return googleapiclient.discovery.build('cloudresourcemanager', 'v2', credentials=credentials, cache_discovery=False)


def _get_compute_resource(credentials):
    """
    Instantiates a Google Compute resource object to call the Compute API. This is used to pull zone, instance, and
    networking data. See https://cloud.google.com/compute/docs/reference/rest/v1/.
    :param credentials: The GoogleCredentials object
    :return: A Compute resource object
    """
    return googleapiclient.discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False)


def _initialize_resources(credentials):
    """
    Create namedtuple of all resource objects necessary for GCP data gathering.
    :param credentials: The GoogleCredentials object
    :return: namedtuple of all resource objects
    """
    return Resources(
        crm_v1=_get_crm_resource_v1(credentials),
        crm_v2=_get_crm_resource_v2(credentials),
        compute=_get_compute_resource(credentials)
    )


def _sync_single_project(session, resources, project_id, gcp_update_tag, common_job_parameters):
    """
    Handles graph sync for a single GCP project.
    :param session: The Neo4j session
    :param resources: namedtuple of the GCP resource objects
    :param project_id: The project ID number to sync. See the `projectId` field in
        https://cloud.google.com/resource-manager/reference/rest/v1/projects
    :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
    :param common_job_parameters: Other parameters sent to Neo4j
    :return: Nothing
    """
    compute.sync(session, resources.compute, project_id, gcp_update_tag, common_job_parameters)


def _sync_multiple_projects(session, resources, projects, gcp_update_tag, common_job_parameters):
    """
    Handles graph sync for multiple GCP projects.
    :param session: The Neo4j session
    :param resources: namedtuple of the GCP resource objects
    :param: projects: A list of projects. At minimum, this list should contain a list of dicts with the key "projectId"
        defined; so it would look like this: [{"projectId": "my-project-id-12345"}].
        This is the returned data from `crm.get_gcp_projects()`.
        See https://cloud.google.com/resource-manager/reference/rest/v1/projects.
    :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
    :param common_job_parameters: Other parameters sent to Neo4j
    :return: Nothing
    """
    logger.debug("Syncing %d GCP projects.", len(projects))
    crm.sync_gcp_projects(session, projects, gcp_update_tag, common_job_parameters)

    for project in projects:
        project_id = project['projectId']
        logger.info("Syncing GCP project %s.", project_id)
        _sync_single_project(session, resources, project_id, gcp_update_tag, common_job_parameters)


def start_gcp_ingestion(session, config):
    """
    Starts the GCP ingestion process by initializing Google Application Default Credentials, creating the necessary
    resource objects, listing all GCP organizations and projects available to the GCP identity, and supplying that
    context to all intel modules.
    :param session: The Neo4j session
    :param config: A `cartography.config` object
    :return: Nothing
    """
    common_job_parameters = {
        "UPDATE_TAG": config.update_tag,
    }
@@ -27,4 +27,12 @@ def start_gcp_ingestion(session, config):
            e
        )
        return
    crm.sync_gcp_organizations(session, credentials, config.update_tag, common_job_parameters)
    resources = _initialize_resources(credentials)

    # If we don't have perms to pull Orgs or Folders from GCP, we will skip safely
    crm.sync_gcp_organizations(session, resources.crm_v1, config.update_tag, common_job_parameters)
    crm.sync_gcp_folders(session, resources.crm_v2, config.update_tag, common_job_parameters)

    projects = crm.get_gcp_projects(resources.crm_v1)

    _sync_multiple_projects(session, resources, projects, config.update_tag, common_job_parameters)
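
A hedged sketch of how the reworked entry point is driven end to end; the Config namedtuple and connection details below are stand-ins for cartography's real config object and sync machinery, which sit outside this diff:

import time
from collections import namedtuple

from neo4j.v1 import GraphDatabase  # neo4j 1.x driver import path; newer drivers use `from neo4j import GraphDatabase`

import cartography.intel.gcp

Config = namedtuple('Config', 'update_tag')  # stand-in for cartography.config

driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'neo4j'))
with driver.session() as session:
    # update_tag is stamped onto every node and relationship this run touches;
    # the cleanup jobs above later delete anything whose tag is older.
    cartography.intel.gcp.start_gcp_ingestion(session, Config(update_tag=int(time.time())))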