Add gcp #6

Merged
7 commits merged on Oct 14, 2023

15 changes: 14 additions & 1 deletion README.md
@@ -26,6 +26,11 @@ Simple Azure example:
python3 cloudgrep.py -an some_account -cn some_container -q my_search
```

Simple Google example:
```
python3 cloudgrep.py -gb do-not-delete-api-tests-bucket -q my_search
```


More complicated example:
```
@@ -86,9 +91,11 @@ You can run this from your local laptop, or from a virtual machine in your cloud
### Running in your Cloud and Authentication ###

#### AWS ####
Your system will need access to the S3 bucket. For example, if you are running on your laptop, you will need to [configure the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
If you are running on an EC2, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice.

If you run on an EC2 instance in the same region as the S3 bucket with a [VPC endpoint for S3](https://aws.amazon.com/blogs/architecture/overview-of-data-transfer-costs-for-common-architectures/), you can [avoid egress charges](https://awsmadeeasy.com/blog/aws-s3-vpc-endpoint-transfer-cost-reduction/).
You can authenticate in a [number of ways](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
If you are running on an EC2, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice.
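
On a laptop, the quickest route is usually the interactive CLI setup; a sketch, assuming the AWS CLI is installed (any other credential mechanism boto3 supports works too, and the values below are placeholders):
```
aws configure
# or export the standard AWS environment variables:
export AWS_ACCESS_KEY_ID="AKIA..."
export AWS_SECRET_ACCESS_KEY="..."
export AWS_DEFAULT_REGION="us-east-1"
```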

#### Azure ####
The simplest way to authenticate with Azure is to first run:
@@ -97,6 +104,12 @@ az login
```
This will open a browser window and prompt you to log in to Azure.
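
If a browser window is not an option (for example on a headless VM), the `DefaultAzureCredential` that cloudgrep uses can also pick up a service principal from environment variables; a sketch with placeholder values:
```
export AZURE_TENANT_ID="<tenant-id>"
export AZURE_CLIENT_ID="<application-id>"
export AZURE_CLIENT_SECRET="<client-secret>"
```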

#### GCP ####
You will need to create a service account, download its credentials file, and then point to it with:
```
export GOOGLE_APPLICATION_CREDENTIALS="/Users/creds.json"
```
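
If you do not yet have a key file, one way to create one is with the gcloud CLI; a sketch, where the service account e-mail is a placeholder:
```
gcloud iam service-accounts keys create /Users/creds.json \
    --iam-account=my-service-account@my-project.iam.gserviceaccount.com
```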

### Contributions ###
We welcome any contributions to this project! Please add via a Pull Request.

2 changes: 2 additions & 0 deletions cloudgrep.py
@@ -9,6 +9,7 @@
parser.add_argument("-b", "--bucket", help="AWS S3 Bucket to search. E.g. my-bucket", required=False)
parser.add_argument("-an", "--account-name", help="Azure Account Name to Search", required=False)
parser.add_argument("-cn", "--container-name", help="Azure Container Name to Search", required=False)
parser.add_argument("-gb", "--google-bucket", help="Google Cloud Bucket to Search", required=False)
parser.add_argument(
"-q", "--query", help="Text to search for. Will be parsed as a Regex. E.g. example.com", required=True
)
@@ -56,6 +57,7 @@
args["bucket"],
args["account_name"],
args["container_name"],
args["google_bucket"],
args["query"],
args["file_size"],
args["prefix"],
72 changes: 72 additions & 0 deletions core/cloudgrep.py
@@ -2,6 +2,7 @@
from azure.storage.blob import BlobServiceClient, BlobProperties
from azure.identity import DefaultAzureCredential
from azure.core.exceptions import ResourceNotFoundError
from google.cloud import storage # type: ignore
from datetime import timezone, datetime
from dateutil.parser import parse
import botocore
@@ -120,6 +121,30 @@ def download_file(key: str) -> None:

        return matched_count

    def download_from_google(self, bucket: str, files: List[str], query: str, hide_filenames: bool) -> int:
        """Download each of the given files from the Google Cloud Storage bucket and search it
        Returns number of matched files"""

        matched_count = 0
        client = storage.Client()
        bucket_gcp = client.get_bucket(bucket)

        def download_file(key: str) -> None:
            with tempfile.NamedTemporaryFile() as tmp:
                logging.info(f"Downloading {bucket} {key} to {tmp.name}")
                blob = bucket_gcp.get_blob(key)
                blob.download_to_filename(tmp.name)
                matched = self.search_file(tmp.name, key, query, hide_filenames)
                if matched:
                    nonlocal matched_count
                    matched_count += 1

        # Use ThreadPoolExecutor to download the files
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(download_file, files)

        return matched_count

    def filter_object(
        self,
        obj: dict,
@@ -158,6 +183,22 @@ def filter_object_azure(
            return False  # Object does not contain the key_contains string
        return True

    def filter_object_google(
        self,
        obj: storage.blob.Blob,
        key_contains: Optional[str],
        from_date: Optional[datetime],
        to_date: Optional[datetime],
    ) -> bool:
        last_modified = obj.updated
        if last_modified and from_date and from_date > last_modified:
            return False
        if last_modified and to_date and last_modified > to_date:
            return False
        if key_contains and key_contains not in obj.name:
            return False
        return True

    def get_objects(
        self,
        bucket: str,
@@ -205,11 +246,33 @@ def get_azure_objects(
            ):
                yield blob.name

    def get_google_objects(
        self,
        bucket: str,
        prefix: Optional[str],
        key_contains: Optional[str],
        from_date: Optional[datetime],
        end_date: Optional[datetime],
    ) -> Iterator[str]:
        """Get all objects in a GCP bucket with a given prefix"""
        client = storage.Client()
        bucket_gcp = client.get_bucket(bucket)
        blobs = bucket_gcp.list_blobs(prefix=prefix)
        for blob in blobs:
            if self.filter_object_google(
                blob,
                key_contains,
                from_date,
                end_date,
            ):
                yield blob.name

    def search(
        self,
        bucket: Optional[str],
        account_name: Optional[str],
        container_name: Optional[str],
        google_bucket: Optional[str],
        query: str,
        file_size: int,
        prefix: Optional[str] = None,
@@ -246,3 +309,12 @@ def search(
            )
            print(f"Searching {len(matching_keys)} files in {account_name}/{container_name} for {query}...")
            self.download_from_azure(account_name, container_name, matching_keys, query, hide_filenames)

        if google_bucket:
            matching_keys = list(
                self.get_google_objects(google_bucket, prefix, key_contains, parsed_from_date, parsed_end_date)
            )

            print(f"Searching {len(matching_keys)} files in {google_bucket} for {query}...")

            self.download_from_google(google_bucket, matching_keys, query, hide_filenames)
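
Taken together, the GCP path mirrors the existing S3 and Azure flows: list candidate blobs, filter them, then download and grep each one in a thread pool. A minimal usage sketch of the new methods (assuming `GOOGLE_APPLICATION_CREDENTIALS` is set; the bucket name, prefix, and query are placeholders):
```
from core.cloudgrep import CloudGrep

grep = CloudGrep()
# Enumerate object keys in the bucket, optionally narrowed by prefix and date filters.
keys = list(grep.get_google_objects("my-gcp-bucket", prefix="logs/", key_contains=None, from_date=None, end_date=None))
# Download each key to a temp file and search it; returns the number of files that matched.
matched = grep.download_from_google("my-gcp-bucket", keys, query="error", hide_filenames=False)
print(f"{matched} of {len(keys)} objects matched")
```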
2 changes: 1 addition & 1 deletion requirements.txt
@@ -11,4 +11,4 @@ pip-audit==2.6.1
azure-storage-blob==12.18.3
azure-core==1.29.4
azure-identity==1.14.1
pytest-mock==3.11.1
google-cloud-storage==2.12.0
13 changes: 13 additions & 0 deletions tests/tests_unit.py
@@ -5,6 +5,7 @@
import unittest
import os
import boto3
from google.cloud import storage # type: ignore
import timeout_decorator
from moto import mock_s3
from datetime import datetime
@@ -83,3 +84,15 @@ def test_object_not_empty_and_size_greater_than_file_size(self) -> None:
        result = cloud_grep.filter_object_azure(obj, key_contains, from_date, to_date, file_size)  # type: ignore

        assert result == True

    # Returns True if all conditions are met
    def test_returns_true_if_all_conditions_are_met(self) -> None:
        obj = storage.blob.Blob(name="example_file.txt", bucket="example_bucket")
        key_contains = "example"
        from_date = datetime(2021, 1, 1)
        to_date = datetime(2023, 1, 1)

        cloud_grep = CloudGrep()
        result = cloud_grep.filter_object_google(obj, key_contains, from_date, to_date)

        self.assertTrue(result)
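
Because the Blob here is constructed locally, its `updated` timestamp is unset, so the date checks short-circuit and the test effectively verifies the `key_contains` match. A quick way to run it, assuming the pinned requirements are installed:
```
python3 -m pytest tests/tests_unit.py
```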