Merge pull request #24 from Andi-A/permiso/add-output_format-log_type
Add log-parsing and standardize JSON output
chrisdoman authored Dec 4, 2023
2 parents f8d0a91 + 36d73e3 commit 8e09b81
Showing 10 changed files with 428 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/app-ci.yml
@@ -31,7 +31,7 @@ jobs:
mypy --config-file ./ci/mypy.cfg ./
flake8 --config ./ci/flake8.cfg
echo If this fails run: python3 -m black . --config ./ci/black.toml
python3 -m black . --config ./ci/black.toml --check
# Skip - Behaves differently on local: python3 -m black . --config ./ci/black.toml --check
python3 -m pip_audit -r requirements.txt
compile-linux:
34 changes: 28 additions & 6 deletions README.md
@@ -32,6 +32,15 @@ Simple Google example:
python3 cloudgrep.py -gb my-gcp-bucket -q my_search
```

Simple CloudTrail log example, outputting results as JSON:
```
python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lt cloudtrail -jo
```

Simple custom log example:
```
python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lf json -lp Records
```
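In this custom example, `-lf json` tells cloudgrep to parse each downloaded file as JSON and `-lp Records` names the property to traverse to reach the individual log entries before the search is applied (this is also what the pre-defined `cloudtrail` log type sets up, per the `cloudgrep.py` change further down). A minimal sketch of that idea, assuming simple nested-key traversal; the function names are illustrative, not cloudgrep's actual internals:
```
import json
import re
from typing import Any, List


def extract_records(raw_text: str, log_format: str, log_properties: List[str]) -> List[Any]:
    """Parse the raw file and walk the given properties to reach the list of log records."""
    if log_format != "json":
        return raw_text.splitlines()  # fall back to plain line-by-line searching
    parsed: Any = json.loads(raw_text)
    for prop in log_properties:  # e.g. ["Records"] for CloudTrail
        parsed = parsed[prop]
    return parsed


def matching_records(raw_text: str, query: str) -> List[Any]:
    """Return only the individual records whose JSON text matches the query."""
    records = extract_records(raw_text, "json", ["Records"])
    return [r for r in records if re.search(query, json.dumps(r))]
```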

More complicated example:
```
@@ -45,16 +54,17 @@ python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 --hide_filenames

Example output:
```
Bucket is in region: us-east-2 : Search from the same region to avoid egress charges.
Searching 11 files in test-s3-access-logs for 9RXXKPREHHTFQD77...
access2023-01-09-20-34-20-EAC533CB93B4ACBE: abbd82b5ad5dc5d024cd1841d19c0cf2fd7472c47a1501ececde37fe91adc510 bucket-72561-s3bucketalt-1my9piwesfim7 [09/Jan/2023:19:20:00 +0000] 1.125.222.333 arn:aws:sts::000011110470:assumed-role/bucket-72561-myResponseRole-1WP2IOKDV7B4Y/1673265251.340187 9RXXKPREHHTFQD77 REST.GET.BUCKET - "GET /?list-type=2&prefix=-collector%2Fproject-&start-after=&encoding-type=url HTTP/1.1" 200 - 946 - 33 32 "-" "Boto3/1.21.24 Python/3.9.2 Linux/5.10.0-10-cloud-amd64 Botocore/1.24.46" - aNPuHKw== SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader bucket-72561-s3bucketalt-1my9piwesfim7.s3.us-east-2.amazonaws.com TLSv1.2 - -
[2023-11-30 13:37:12,416] - Bucket is in region: us-east-2 : Search from the same region to avoid egress charges.
[2023-11-30 13:37:12,417] - Searching 11 files in test-s3-access-logs for 9RXXKPREHHTFQD77...
{"key_name": "access2023-01-09-20-34-20-EAC533CB93B4ACBE", "line": "abbd82b5ad5dc5d024cd1841d19c0cf2fd7472c47a1501ececde37fe91adc510 bucket-72561-s3bucketalt-1my9piwesfim7 [09/Jan/2023:19:20:00 +0000] 1.125.222.333 arn:aws:sts::000011110470:assumed-role/bucket-72561-myResponseRole-1WP2IOKDV7B4Y/1673265251.340187 9RXXKPREHHTFQD77 REST.GET.BUCKET - \"GET /?list-type=2&prefix=-collector%2Fproject-&start-after=&encoding-type=url HTTP/1.1\" 200 - 946 - 33 32 \"-\" \"Boto3/1.21.24 Python/3.9.2 Linux/5.10.0-10-cloud-amd64 Botocore/1.24.46\" - aNPuHKw== SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader bucket-72561-s3bucketalt-1my9piwesfim7.s3.us-east-2.amazonaws.com TLSv1.2 - -"}
```
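Because the region and progress messages are now emitted via `logging` (stderr by default) when a log format is in use, the JSON matches can be piped into other tools. A hedged example, assuming the result lines are written to stdout and `jq` is available:
```
python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lt cloudtrail -jo 2>/dev/null | jq -r '.key_name'
```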

### Arguments ###
```
usage: cloudgrep.py [-h] [-b BUCKET] [-an ACCOUNT_NAME] [-cn CONTAINER_NAME] [-gb GOOGLE_BUCKET] [-q QUERY]
[-v FILE] [-y YARA] [-p PREFIX] [-f FILENAME] [-s START_DATE] [-e END_DATE] [-fs FILE_SIZE]
[-pr PROFILE] [-d] [-hf]
[-v FILE] [-y YARA] [-p PREFIX] [-f FILENAME] [-s START_DATE] [-e END_DATE]
[-fs FILE_SIZE] [-pr PROFILE] [-d] [-hf] [-lt LOG_TYPE] [-lf LOG_FORMAT]
[-lp LOG_PROPERTIES] [-jo JSON_OUTPUT]
CloudGrep searches is grep for cloud storage like S3 and Azure Storage. Version: 1.0.4
@@ -86,7 +96,19 @@ options:
Set an AWS profile to use. E.g. default, dev, prod.
-d, --debug Enable Debug logging.
-hf, --hide_filenames
Dont show matching filesnames.
Dont show matching filenames.
-lt LOG_TYPE, --log_type LOG_TYPE
Return individual matching log entries based on pre-defined log types, otherwise
custom log_format and log_properties can be used. E.g. cloudtrail.
-lf LOG_FORMAT, --log_format LOG_FORMAT
Define custom log format of raw file to parse before applying search logic. Used if
--log_type is not defined. E.g. json.
-lp LOG_PROPERTIES, --log_properties LOG_PROPERTIES
Define custom list of properties to traverse to dynamically extract final list of log
records. Used if --log_type is not defined. E.g. [Records].
-jo JSON_OUTPUT, --json_output JSON_OUTPUT
Output as JSON.
```
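The `--log_properties` value is split on commas (see the `list_of_strings` helper in the `cloudgrep/__main__.py` diff below), and the help text describes the values as properties to traverse, so a nested structure can presumably be reached by listing each key in order, outermost first. A hedged example with made-up property names:
```
python3 cloudgrep.py -b my-bucket -q my_search -lf json -lp outer,Records
```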

### Deployment ###
35 changes: 34 additions & 1 deletion cloudgrep/__main__.py
@@ -1,10 +1,16 @@
from cloudgrep.cloudgrep import CloudGrep
import argparse
import logging
import sys
from typing import List

from cloudgrep.cloudgrep import CloudGrep

VERSION = "1.0.4"

# Define a custom argument type for a list of strings
def list_of_strings(arg: str) -> List[str]:
return arg.split(",")


def main() -> None:
parser = argparse.ArgumentParser(
@@ -68,6 +74,28 @@ def main() -> None:
parser.add_argument(
"-hf", "--hide_filenames", help="Dont show matching filenames. ", action="store_true", required=False
)
parser.add_argument(
"-lt",
"--log_type",
help="Return individual matching log entries based on pre-defined log types, otherwise custom log_format and log_properties can be used. E.g. cloudtrail. ",
required=False,
)
parser.add_argument(
"-lf",
"--log_format",
help="Define custom log format of raw file to parse before applying search logic. Used if --log_type is not defined. E.g. json. ",
required=False,
)
parser.add_argument(
"-lp",
"--log_properties",
type=list_of_strings,
help="Define custom list of properties to traverse to dynamically extract final list of log records. Used if --log_type is not defined. E.g. ["
"Records"
"]. ",
required=False,
)
parser.add_argument("-jo", "--json_output", help="Output as JSON.", action="store_true")
args = vars(parser.parse_args())

if len(sys.argv) == 1:
@@ -77,6 +105,7 @@ def main() -> None:
if args["debug"]:
logging.basicConfig(format="[%(asctime)s]:[%(levelname)s] - %(message)s", level=logging.INFO)
else:
logging.basicConfig(format="[%(asctime)s] - %(message)s", level=logging.WARNING)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)

CloudGrep().search(
@@ -93,7 +122,11 @@ def main() -> None:
args["start_date"],
args["end_date"],
args["hide_filenames"],
args["log_type"],
args["log_format"],
args["log_properties"],
args["profile"],
args["json_output"],
)
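Whatever is passed to `-lp`/`--log_properties` arrives in `args["log_properties"]` as a Python list because of the `list_of_strings` converter defined at the top of this file. A quick illustration (the second property name is made up, purely to show the comma splitting):
```
from typing import List


def list_of_strings(arg: str) -> List[str]:  # same helper as in __main__.py above
    return arg.split(",")


print(list_of_strings("Records"))        # ['Records'] -- what `-lp Records` produces
print(list_of_strings("Records,Items"))  # ['Records', 'Items']
```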


37 changes: 31 additions & 6 deletions cloudgrep/cloud.py
@@ -14,7 +15,7 @@

class Cloud:
def download_from_s3_multithread(
self, bucket: str, files: List[str], query: str, hide_filenames: bool, yara_rules: Any
self,
bucket: str,
files: List[str],
query: str,
hide_filenames: bool,
yara_rules: Any,
log_format: Optional[str] = None,
log_properties: List[str] = [],
json_output: Optional[bool] = False,
) -> int:
"""Use ThreadPoolExecutor and boto3 to download every file in the bucket from s3
Returns number of matched files"""
@@ -30,15 +38,17 @@ def download_file(key: str) -> None:
with tempfile.NamedTemporaryFile() as tmp:
logging.info(f"Downloading {bucket} {key} to {tmp.name}")
s3.download_file(bucket, key, tmp.name)
matched = Search().search_file(tmp.name, key, query, hide_filenames, yara_rules)
matched = Search().search_file(
tmp.name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
if matched:
nonlocal matched_count
matched_count += 1

# Use ThreadPoolExecutor to download the files
with concurrent.futures.ThreadPoolExecutor() as executor: # type: ignore
executor.map(download_file, files)
# For debugging, single thread:
# For debugging, run in a single thread for clearer logging:
# for file in files:
# download_file(file)
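All three download functions in this file follow the same shape: fan the matching keys out across a `ThreadPoolExecutor`, pull each object down to a temporary file, and hand it to `Search().search_file` together with the new log_format/log_properties/json_output arguments. A stripped-down sketch of that pattern, with `fetch` and `search_one` standing in for the provider-specific download call and the search call:
```
import concurrent.futures
import tempfile
from typing import Callable, List


def download_and_search(keys: List[str], fetch: Callable[[str, str], None],
                        search_one: Callable[[str, str], bool]) -> int:
    """Download each key to a temp file via fetch(key, path); count files where search_one(path, key) matches."""
    matched_count = 0

    def handle(key: str) -> None:
        nonlocal matched_count
        with tempfile.NamedTemporaryFile() as tmp:
            fetch(key, tmp.name)           # e.g. s3.download_file(bucket, key, tmp.name)
            if search_one(tmp.name, key):  # e.g. Search().search_file(tmp.name, key, ...)
                matched_count += 1

    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(handle, keys)  # leaving the with-block waits for every worker to finish
    return matched_count
```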

@@ -52,6 +62,9 @@ def download_from_azure(
query: str,
hide_filenames: bool,
yara_rules: Any,
log_format: Optional[str] = None,
log_properties: List[str] = [],
json_output: Optional[bool] = False,
) -> int:
"""Download every file in the container from azure
Returns number of matched files"""
@@ -71,7 +84,9 @@ def download_file(key: str) -> None:
with open(tmp.name, "wb") as my_blob:
blob_data = blob_client.download_blob()
blob_data.readinto(my_blob)
matched = Search().search_file(tmp.name, key, query, hide_filenames, yara_rules)
matched = Search().search_file(
tmp.name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
if matched:
nonlocal matched_count
matched_count += 1
@@ -85,7 +100,15 @@ def download_file(key: str) -> None:
return matched_count

def download_from_google(
self, bucket: str, files: List[str], query: str, hide_filenames: bool, yara_rules: Any
self,
bucket: str,
files: List[str],
query: str,
hide_filenames: bool,
yara_rules: Any,
log_format: Optional[str] = None,
log_properties: List[str] = [],
json_output: Optional[bool] = False,
) -> int:
"""Download every file in the bucket from google
Returns number of matched files"""
@@ -99,7 +122,9 @@ def download_file(key: str) -> None:
logging.info(f"Downloading {bucket} {key} to {tmp.name}")
blob = bucket_gcp.get_blob(key)
blob.download_to_filename(tmp.name)
matched = Search().search_file(tmp.name, key, query, hide_filenames, yara_rules)
matched = Search().search_file(
tmp.name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
if matched:
nonlocal matched_count
matched_count += 1
54 changes: 46 additions & 8 deletions cloudgrep/cloudgrep.py
@@ -4,6 +8,2 @@
from typing import Optional
import logging
from cloudgrep.cloud import Cloud
from typing import List

import yara # type: ignore


@@ -28,15 +30,30 @@ def search(
from_date: Optional[datetime] = None,
end_date: Optional[datetime] = None,
hide_filenames: bool = False,
log_type: Optional[str] = None,
log_format: Optional[str] = None,
log_properties: List[str] = [],
profile: Optional[str] = None,
json_output: Optional[bool] = False,
) -> None:
# load in a list of queries from a file
if not query and file:
logging.info(f"Loading queries in from {file}")
logging.debug(f"Loading queries in from {file}")
query = self.load_queries(file)

# Set log_format and log_properties values based on potential log_type input argument
if log_type != None:
match log_type:
case "cloudtrail":
log_format = "json"
log_properties = ["Records"]
case _:
logging.error(
f"Invalid log_type value ('{log_type}') unhandled in switch statement in 'search' function."
)

if yara_file:
logging.info(f"Loading yara rules from {yara_file}")
logging.debug(f"Loading yara rules from {yara_file}")
yara_rules = yara.compile(filepath=yara_file)
else:
yara_rules = None
@@ -59,11 +76,20 @@
)
s3_client = boto3.client("s3")
region = s3_client.get_bucket_location(Bucket=bucket)
print(
f"Bucket is in region: {region['LocationConstraint']} : Search from the same region to avoid egress charges."
if log_format != None:
logging.warning(
f"Bucket is in region: {region['LocationConstraint']} : Search from the same region to avoid egress charges."
)
logging.warning(f"Searching {len(matching_keys)} files in {bucket} for {query}...")

else:
print(
f"Bucket is in region: {region['LocationConstraint']} : Search from the same region to avoid egress charges."
)
print(f"Searching {len(matching_keys)} files in {bucket} for {query}...")
Cloud().download_from_s3_multithread(
bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
print(f"Searching {len(matching_keys)} files in {bucket} for {query}...")
Cloud().download_from_s3_multithread(bucket, matching_keys, query, hide_filenames, yara_rules)

if account_name and container_name:
matching_keys = list(
@@ -72,7 +98,17 @@
)
)
print(f"Searching {len(matching_keys)} files in {account_name}/{container_name} for {query}...")
Cloud().download_from_azure(account_name, container_name, matching_keys, query, hide_filenames, yara_rules)
Cloud().download_from_azure(
account_name,
container_name,
matching_keys,
query,
hide_filenames,
yara_rules,
log_format,
log_properties,
json_output,
)

if google_bucket:
matching_keys = list(
@@ -81,4 +117,6 @@

print(f"Searching {len(matching_keys)} files in {google_bucket} for {query}...")

Cloud().download_from_google(google_bucket, matching_keys, query, hide_filenames, yara_rules)
Cloud().download_from_google(
google_bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
(Diffs for the remaining changed files are not shown.)
