Add gcp #6

Merged
7 commits merged on Oct 14, 2023

15 changes: 14 additions & 1 deletion README.md
@@ -26,6 +26,11 @@ Simple Azure example:
python3 cloudgrep.py -an some_account -cn some_container -q my_search
```

Simple Google example:
```
python3 cloudgrep.py -gb do-not-delete-api-tests-bucket -q my_search
```


More complicated example:
```
@@ -86,9 +91,11 @@ You can run this from your local laptop, or from a virtual machine in your cloud
### Running in your Cloud and Authentication ###

#### AWS ####
Your system will need access to the S3 bucket. For example, if you are running on your laptop, you will need to [configure the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
If you are running on an EC2, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice.

If you run on an EC2 instance in the same region as the S3 bucket with a [VPC endpoint for S3](https://aws.amazon.com/blogs/architecture/overview-of-data-transfer-costs-for-common-architectures/), you can [avoid egress charges](https://awsmadeeasy.com/blog/aws-s3-vpc-endpoint-transfer-cost-reduction/).
You can authenticate in a [number of ways](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
If you are running on an EC2, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice.
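
On a laptop, the quickest route is usually the interactive CLI setup; a sketch, assuming the AWS CLI is installed (any other credential mechanism boto3 supports works too, and the values below are placeholders):
```
aws configure
# or export the standard AWS environment variables:
export AWS_ACCESS_KEY_ID="AKIA..."
export AWS_SECRET_ACCESS_KEY="..."
export AWS_DEFAULT_REGION="us-east-1"
```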

#### Azure ####
The simplest way to authenticate with Azure is to first run:
@@ -97,6 +104,12 @@ az login
```
This will open a browser window and prompt you to log in to Azure.
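
If a browser window is not an option (for example on a headless VM), the `DefaultAzureCredential` that cloudgrep uses can also pick up a service principal from environment variables; a sketch with placeholder values:
```
export AZURE_TENANT_ID="<tenant-id>"
export AZURE_CLIENT_ID="<application-id>"
export AZURE_CLIENT_SECRET="<client-secret>"
```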

#### GCP ####
You will need to create a service account, download its credentials file, and then point to it with:
```
export GOOGLE_APPLICATION_CREDENTIALS="/Users/creds.json"
```
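
If you do not yet have a key file, one way to create one is with the gcloud CLI; a sketch, where the service account e-mail is a placeholder:
```
gcloud iam service-accounts keys create /Users/creds.json \
    --iam-account=my-service-account@my-project.iam.gserviceaccount.com
```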

### Contributions ###
We welcome any contributions to this project! Please add via a Pull Request.

2 changes: 2 additions & 0 deletions cloudgrep.py
@@ -9,6 +9,7 @@
parser.add_argument("-b", "--bucket", help="AWS S3 Bucket to search. E.g. my-bucket", required=False)
parser.add_argument("-an", "--account-name", help="Azure Account Name to Search", required=False)
parser.add_argument("-cn", "--container-name", help="Azure Container Name to Search", required=False)
parser.add_argument("-gb", "--google-bucket", help="Google Cloud Bucket to Search", required=False)
parser.add_argument(
"-q", "--query", help="Text to search for. Will be parsed as a Regex. E.g. example.com", required=True
)
@@ -56,6 +57,7 @@
args["bucket"],
args["account_name"],
args["container_name"],
args["google_bucket"],
args["query"],
args["file_size"],
args["prefix"],
72 changes: 72 additions & 0 deletions core/cloudgrep.py
@@ -2,6 +2,7 @@
from azure.storage.blob import BlobServiceClient, BlobProperties
from azure.identity import DefaultAzureCredential
from azure.core.exceptions import ResourceNotFoundError
from google.cloud import storage # type: ignore
from datetime import timezone, datetime
from dateutil.parser import parse
import botocore
@@ -120,6 +121,30 @@ def download_file(key: str) -> None:

        return matched_count

    def download_from_google(self, bucket: str, files: List[str], query: str, hide_filenames: bool) -> int:
        """Download each of the given files from the Google Cloud Storage bucket and search it
        Returns number of matched files"""

        matched_count = 0
        client = storage.Client()
        bucket_gcp = client.get_bucket(bucket)

        def download_file(key: str) -> None:
            with tempfile.NamedTemporaryFile() as tmp:
                logging.info(f"Downloading {bucket} {key} to {tmp.name}")
                blob = bucket_gcp.get_blob(key)
                blob.download_to_filename(tmp.name)
                matched = self.search_file(tmp.name, key, query, hide_filenames)
                if matched:
                    nonlocal matched_count
                    matched_count += 1

        # Use ThreadPoolExecutor to download the files
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(download_file, files)

        return matched_count

    def filter_object(
        self,
        obj: dict,
@@ -158,6 +183,22 @@ def filter_object_azure(
            return False  # Object does not contain the key_contains string
        return True

    def filter_object_google(
        self,
        obj: storage.blob.Blob,
        key_contains: Optional[str],
        from_date: Optional[datetime],
        to_date: Optional[datetime],
    ) -> bool:
        last_modified = obj.updated
        if last_modified and from_date and from_date > last_modified:
            return False
        if last_modified and to_date and last_modified > to_date:
            return False
        if key_contains and key_contains not in obj.name:
            return False
        return True

    def get_objects(
        self,
        bucket: str,
@@ -205,11 +246,33 @@ def get_azure_objects(
            ):
                yield blob.name

    def get_google_objects(
        self,
        bucket: str,
        prefix: Optional[str],
        key_contains: Optional[str],
        from_date: Optional[datetime],
        end_date: Optional[datetime],
    ) -> Iterator[str]:
        """Get all objects in a GCP bucket with a given prefix"""
        client = storage.Client()
        bucket_gcp = client.get_bucket(bucket)
        blobs = bucket_gcp.list_blobs(prefix=prefix)
        for blob in blobs:
            if self.filter_object_google(
                blob,
                key_contains,
                from_date,
                end_date,
            ):
                yield blob.name

    def search(
        self,
        bucket: Optional[str],
        account_name: Optional[str],
        container_name: Optional[str],
        google_bucket: Optional[str],
        query: str,
        file_size: int,
        prefix: Optional[str] = None,
@@ -246,3 +309,12 @@ def search(
            )
            print(f"Searching {len(matching_keys)} files in {account_name}/{container_name} for {query}...")
            self.download_from_azure(account_name, container_name, matching_keys, query, hide_filenames)

        if google_bucket:
            matching_keys = list(
                self.get_google_objects(google_bucket, prefix, key_contains, parsed_from_date, parsed_end_date)
            )

            print(f"Searching {len(matching_keys)} files in {google_bucket} for {query}...")

            self.download_from_google(google_bucket, matching_keys, query, hide_filenames)
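
Taken together, the GCP path mirrors the existing S3 and Azure flows: list candidate blobs, filter them, then download and grep each one in a thread pool. A minimal usage sketch of the new methods (assuming `GOOGLE_APPLICATION_CREDENTIALS` is set; the bucket name, prefix, and query are placeholders):
```
from core.cloudgrep import CloudGrep

grep = CloudGrep()
# Enumerate object keys in the bucket, optionally narrowed by prefix and date filters.
keys = list(grep.get_google_objects("my-gcp-bucket", prefix="logs/", key_contains=None, from_date=None, end_date=None))
# Download each key to a temp file and search it; returns the number of files that matched.
matched = grep.download_from_google("my-gcp-bucket", keys, query="error", hide_filenames=False)
print(f"{matched} of {len(keys)} objects matched")
```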
2 changes: 1 addition & 1 deletion requirements.txt
@@ -11,4 +11,4 @@ pip-audit==2.6.1
azure-storage-blob==12.18.3
azure-core==1.29.4
azure-identity==1.14.1
pytest-mock==3.11.1
google-cloud-storage==2.12.0
13 changes: 13 additions & 0 deletions tests/tests_unit.py
@@ -5,6 +5,7 @@
import unittest
import os
import boto3
from google.cloud import storage # type: ignore
import timeout_decorator
from moto import mock_s3
from datetime import datetime
@@ -83,3 +84,15 @@ def test_object_not_empty_and_size_greater_than_file_size(self) -> None:
        result = cloud_grep.filter_object_azure(obj, key_contains, from_date, to_date, file_size)  # type: ignore

        assert result == True

    # Returns True if all conditions are met
    def test_returns_true_if_all_conditions_are_met(self) -> None:
        obj = storage.blob.Blob(name="example_file.txt", bucket="example_bucket")
        key_contains = "example"
        from_date = datetime(2021, 1, 1)
        to_date = datetime(2023, 1, 1)

        cloud_grep = CloudGrep()
        result = cloud_grep.filter_object_google(obj, key_contains, from_date, to_date)

        self.assertTrue(result)
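
Because the Blob here is constructed locally, its `updated` timestamp is unset, so the date checks short-circuit and the test effectively verifies the `key_contains` match. A quick way to run it, assuming the pinned requirements are installed:
```
python3 -m pytest tests/tests_unit.py
```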