diff --git a/README.md b/README.md
index 1c5062c..2aa2924 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,11 @@ Simple Azure example:
 ```
 python3 cloudgrep.py -an some_account -cn some_container -q my_search
 ```
 
+Simple Google example:
+```
+python3 cloudgrep.py -gb do-not-delete-api-tests-bucket -q my_search
+```
+
 More complicated example:
 ```
@@ -86,9 +91,11 @@ You can run this from your local laptop, or from a virtual machine in your cloud
 
 ### Running in your Cloud and Authentication ###
 #### AWS ####
+Your system will need access to the S3 bucket. For example, if you are running on your laptop, you will need to [configure the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
+If you are running on an EC2, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice.
+
 If you run on an EC2 instance in the same region as the S3 bucket with a [VPC endpoint for S3](https://aws.amazon.com/blogs/architecture/overview-of-data-transfer-costs-for-common-architectures/) you can [avoid egress charges](https://awsmadeeasy.com/blog/aws-s3-vpc-endpoint-transfer-cost-reduction/).
 You can authenticate in a [number of ways](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
-If you are running on an EC2, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice.
 
 #### Azure ####
 The simplest way to authenticate with Azure is to first run:
@@ -97,6 +104,12 @@
 az login
 ```
 This will open a browser window and prompt you to login to Azure.
 
+#### GCP ####
+You will need to create a service account and download the credentials file, then set the path to it with:
+```
+export GOOGLE_APPLICATION_CREDENTIALS="/Users/creds.json"
+```
+
 ### Contributions ###
 We welcome any contributions to this project! Please add via a Pull Request.
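The GCP authentication flow added to the README can be sanity-checked before running a search. A minimal sketch, assuming `google-cloud-storage` is installed and `GOOGLE_APPLICATION_CREDENTIALS` points at a valid service-account key (`my-bucket` is a placeholder):

```
from google.cloud import storage

# The client picks up GOOGLE_APPLICATION_CREDENTIALS automatically.
client = storage.Client()

# List a few objects to confirm the service account can read the bucket.
for blob in client.list_blobs("my-bucket", max_results=5):
    print(blob.name, blob.updated)
```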
diff --git a/cloudgrep.py b/cloudgrep.py
index 9852ec3..fe3c1f5 100644
--- a/cloudgrep.py
+++ b/cloudgrep.py
@@ -9,6 +9,7 @@
     parser.add_argument("-b", "--bucket", help="AWS S3 Bucket to search. E.g. my-bucket", required=False)
     parser.add_argument("-an", "--account-name", help="Azure Account Name to Search", required=False)
     parser.add_argument("-cn", "--container-name", help="Azure Container Name to Search", required=False)
+    parser.add_argument("-gb", "--google-bucket", help="Google Cloud Bucket to Search", required=False)
     parser.add_argument(
         "-q", "--query", help="Text to search for. Will be parsed as a Regex. E.g. example.com", required=True
     )
@@ -56,6 +57,7 @@
         args["bucket"],
         args["account_name"],
         args["container_name"],
+        args["google_bucket"],
         args["query"],
         args["file_size"],
         args["prefix"],
diff --git a/core/cloudgrep.py b/core/cloudgrep.py
index 18e82b3..2d35fbe 100644
--- a/core/cloudgrep.py
+++ b/core/cloudgrep.py
@@ -2,6 +2,7 @@
 from azure.storage.blob import BlobServiceClient, BlobProperties
 from azure.identity import DefaultAzureCredential
 from azure.core.exceptions import ResourceNotFoundError
+from google.cloud import storage  # type: ignore
 from datetime import timezone, datetime
 from dateutil.parser import parse
 import botocore
@@ -120,6 +121,33 @@
 
         return matched_count
 
+    def download_from_google(self, bucket: str, files: List[str], query: str, hide_filenames: bool) -> int:
+        """Download and search every given file in the GCP bucket.
+        Returns the number of matched files."""
+
+        matched_count = 0
+        client = storage.Client()
+        bucket_gcp = client.get_bucket(bucket)
+
+        def download_file(key: str) -> None:
+            nonlocal matched_count
+            with tempfile.NamedTemporaryFile() as tmp:
+                logging.info(f"Downloading {bucket} {key} to {tmp.name}")
+                blob = bucket_gcp.get_blob(key)
+                if not blob:
+                    logging.error(f"Could not find {key} in {bucket}")
+                    return
+                blob.download_to_filename(tmp.name)
+                matched = self.search_file(tmp.name, key, query, hide_filenames)
+                if matched:
+                    matched_count += 1
+
+        # Use a ThreadPoolExecutor to download and search the files in parallel
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            executor.map(download_file, files)
+
+        return matched_count
+
     def filter_object(
         self,
         obj: dict,
@@ -158,6 +186,22 @@
             return False  # Object does not contain the key_contains string
         return True
 
+    def filter_object_google(
+        self,
+        obj: storage.blob.Blob,
+        key_contains: Optional[str],
+        from_date: Optional[datetime],
+        to_date: Optional[datetime],
+    ) -> bool:
+        last_modified = obj.updated
+        if last_modified and from_date and from_date > last_modified:
+            return False  # Object was modified before the from_date
+        if last_modified and to_date and last_modified > to_date:
+            return False  # Object was modified after the to_date
+        if key_contains and key_contains not in obj.name:
+            return False  # Object does not contain the key_contains string
+        return True
+
     def get_objects(
         self,
         bucket: str,
@@ -205,11 +249,33 @@
         ):
             yield blob.name
 
+    def get_google_objects(
+        self,
+        bucket: str,
+        prefix: Optional[str],
+        key_contains: Optional[str],
+        from_date: Optional[datetime],
+        end_date: Optional[datetime],
+    ) -> Iterator[str]:
+        """Get all objects in a GCP bucket with a given prefix"""
+        client = storage.Client()
+        bucket_gcp = client.get_bucket(bucket)
+        blobs = bucket_gcp.list_blobs(prefix=prefix)
+        for blob in blobs:
+            if self.filter_object_google(
+                blob,
+                key_contains,
+                from_date,
+                end_date,
+            ):
+                yield blob.name
+
     def search(
         self,
         bucket: Optional[str],
         account_name: Optional[str],
         container_name: Optional[str],
+        google_bucket: Optional[str],
         query: str,
         file_size: int,
         prefix: Optional[str] = None,
@@ -246,3 +312,12 @@
             )
             print(f"Searching {len(matching_keys)} files in {account_name}/{container_name} for {query}...")
             self.download_from_azure(account_name, container_name, matching_keys, query, hide_filenames)
+
+        if google_bucket:
+            matching_keys = list(
+                self.get_google_objects(google_bucket, prefix, key_contains, parsed_from_date, parsed_end_date)
+            )
+
+            print(f"Searching {len(matching_keys)} files in {google_bucket} for {query}...")
+
+            self.download_from_google(google_bucket, matching_keys, query, hide_filenames)
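To show how the new `google_bucket` parameter threads through `search`, here is a minimal sketch of driving it from Python instead of the CLI. The bucket name and `file_size` value are placeholders; the positional order follows the updated signature above:

```
from core.cloudgrep import CloudGrep

# Positional order per the new signature:
# bucket (S3), account_name, container_name, google_bucket, query, file_size.
CloudGrep().search(
    None,           # no S3 bucket
    None,           # no Azure account
    None,           # no Azure container
    "my-bucket",    # placeholder GCS bucket
    "my_search",    # query, parsed as a regex
    100000000,      # file_size filter (placeholder value)
)
```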
diff --git a/requirements.txt b/requirements.txt
index 785612d..e7f34ed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,4 @@ pip-audit==2.6.1
 azure-storage-blob==12.18.3
 azure-core==1.29.4
 azure-identity==1.14.1
-pytest-mock==3.11.1
\ No newline at end of file
+google-cloud-storage==2.12.0
\ No newline at end of file
diff --git a/tests/tests_unit.py b/tests/tests_unit.py
index c1e6159..48146a9 100644
--- a/tests/tests_unit.py
+++ b/tests/tests_unit.py
@@ -5,6 +5,7 @@
 import unittest
 import os
 import boto3
+from google.cloud import storage  # type: ignore
 import timeout_decorator
 from moto import mock_s3
 from datetime import datetime
@@ -83,3 +84,15 @@ def test_object_not_empty_and_size_greater_than_file_size(self) -> None:
         result = cloud_grep.filter_object_azure(obj, key_contains, from_date, to_date, file_size)  # type: ignore
 
         assert result == True
+
+    # Returns True if all conditions are met
+    def test_returns_true_if_all_conditions_are_met(self) -> None:
+        obj = storage.blob.Blob(name="example_file.txt", bucket="example_bucket")
+        key_contains = "example"
+        from_date = datetime(2021, 1, 1)
+        to_date = datetime(2023, 1, 1)
+
+        cloud_grep = CloudGrep()
+        result = cloud_grep.filter_object_google(obj, key_contains, from_date, to_date)
+
+        self.assertTrue(result)
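The new unit test covers only the all-conditions-pass path. A companion sketch for the date-bounds rejection path, using a stub object so no GCS client or credentials are needed (`filter_object_google` only reads `.updated` and `.name`; this is illustrative, not part of the diff):

```
from datetime import datetime, timezone
from types import SimpleNamespace

from core.cloudgrep import CloudGrep

# Stub standing in for storage.blob.Blob; only .updated and .name are read.
old_blob = SimpleNamespace(
    name="example_file.txt",
    updated=datetime(2020, 6, 1, tzinfo=timezone.utc),
)

# Last modified before from_date, so the blob should be filtered out.
assert not CloudGrep().filter_object_google(
    old_blob,
    "example",
    datetime(2021, 1, 1, tzinfo=timezone.utc),
    datetime(2023, 1, 1, tzinfo=timezone.utc),
)
```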