Merge pull request #24 from Andi-A/permiso/add-output_format-log_type
Add log-parsing and standardize JSON output
chrisdoman authored Dec 4, 2023
2 parents f8d0a91 + 36d73e3 commit 8e09b81
Showing 10 changed files with 428 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/app-ci.yml
@@ -31,7 +31,7 @@ jobs:
mypy --config-file ./ci/mypy.cfg ./
flake8 --config ./ci/flake8.cfg
echo If this fails run: python3 -m black . --config ./ci/black.toml
python3 -m black . --config ./ci/black.toml --check
# Skip - Behaves differently on local: python3 -m black . --config ./ci/black.toml --check
python3 -m pip_audit -r requirements.txt
compile-linux:
34 changes: 28 additions & 6 deletions README.md
@@ -32,6 +32,15 @@ Simple Google example:
python3 cloudgrep.py -gb my-gcp-bucket -q my_search
```

Simple CloudTrail log example, outputting results as JSON:
```
python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lt cloudtrail -jo
```

Simple custom log example:
```
python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lf json -lp Records
```
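In this custom example, `-lf json` tells cloudgrep to parse each downloaded file as JSON and `-lp Records` names the property to traverse to reach the individual log entries before the search is applied (this is also what the pre-defined `cloudtrail` log type sets up, per the `cloudgrep.py` change further down). A minimal sketch of that idea, assuming simple nested-key traversal; the function names are illustrative, not cloudgrep's actual internals:
```
import json
import re
from typing import Any, List


def extract_records(raw_text: str, log_format: str, log_properties: List[str]) -> List[Any]:
    """Parse the raw file and walk the given properties to reach the list of log records."""
    if log_format != "json":
        return raw_text.splitlines()  # fall back to plain line-by-line searching
    parsed: Any = json.loads(raw_text)
    for prop in log_properties:  # e.g. ["Records"] for CloudTrail
        parsed = parsed[prop]
    return parsed


def matching_records(raw_text: str, query: str) -> List[Any]:
    """Return only the individual records whose JSON text matches the query."""
    records = extract_records(raw_text, "json", ["Records"])
    return [r for r in records if re.search(query, json.dumps(r))]
```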

More complicated example:
```
@@ -45,16 +54,17 @@ python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 --hide_filenames

Example output:
```
Bucket is in region: us-east-2 : Search from the same region to avoid egress charges.
Searching 11 files in test-s3-access-logs for 9RXXKPREHHTFQD77...
access2023-01-09-20-34-20-EAC533CB93B4ACBE: abbd82b5ad5dc5d024cd1841d19c0cf2fd7472c47a1501ececde37fe91adc510 bucket-72561-s3bucketalt-1my9piwesfim7 [09/Jan/2023:19:20:00 +0000] 1.125.222.333 arn:aws:sts::000011110470:assumed-role/bucket-72561-myResponseRole-1WP2IOKDV7B4Y/1673265251.340187 9RXXKPREHHTFQD77 REST.GET.BUCKET - "GET /?list-type=2&prefix=-collector%2Fproject-&start-after=&encoding-type=url HTTP/1.1" 200 - 946 - 33 32 "-" "Boto3/1.21.24 Python/3.9.2 Linux/5.10.0-10-cloud-amd64 Botocore/1.24.46" - aNPuHKw== SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader bucket-72561-s3bucketalt-1my9piwesfim7.s3.us-east-2.amazonaws.com TLSv1.2 - -
[2023-11-30 13:37:12,416] - Bucket is in region: us-east-2 : Search from the same region to avoid egress charges.
[2023-11-30 13:37:12,417] - Searching 11 files in test-s3-access-logs for 9RXXKPREHHTFQD77...
{"key_name": "access2023-01-09-20-34-20-EAC533CB93B4ACBE", "line": "abbd82b5ad5dc5d024cd1841d19c0cf2fd7472c47a1501ececde37fe91adc510 bucket-72561-s3bucketalt-1my9piwesfim7 [09/Jan/2023:19:20:00 +0000] 1.125.222.333 arn:aws:sts::000011110470:assumed-role/bucket-72561-myResponseRole-1WP2IOKDV7B4Y/1673265251.340187 9RXXKPREHHTFQD77 REST.GET.BUCKET - \"GET /?list-type=2&prefix=-collector%2Fproject-&start-after=&encoding-type=url HTTP/1.1\" 200 - 946 - 33 32 \"-\" \"Boto3/1.21.24 Python/3.9.2 Linux/5.10.0-10-cloud-amd64 Botocore/1.24.46\" - aNPuHKw== SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader bucket-72561-s3bucketalt-1my9piwesfim7.s3.us-east-2.amazonaws.com TLSv1.2 - -"}
```
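Because the region and progress messages are now emitted via `logging` (stderr by default) when a log format is in use, the JSON matches can be piped into other tools. A hedged example, assuming the result lines are written to stdout and `jq` is available:
```
python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lt cloudtrail -jo 2>/dev/null | jq -r '.key_name'
```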

### Arguments ###
```
usage: cloudgrep.py [-h] [-b BUCKET] [-an ACCOUNT_NAME] [-cn CONTAINER_NAME] [-gb GOOGLE_BUCKET] [-q QUERY]
[-v FILE] [-y YARA] [-p PREFIX] [-f FILENAME] [-s START_DATE] [-e END_DATE] [-fs FILE_SIZE]
[-pr PROFILE] [-d] [-hf]
[-v FILE] [-y YARA] [-p PREFIX] [-f FILENAME] [-s START_DATE] [-e END_DATE]
[-fs FILE_SIZE] [-pr PROFILE] [-d] [-hf] [-lt LOG_TYPE] [-lf LOG_FORMAT]
[-lp LOG_PROPERTIES] [-jo JSON_OUTPUT]
CloudGrep searches is grep for cloud storage like S3 and Azure Storage. Version: 1.0.4
@@ -86,7 +96,19 @@ options:
Set an AWS profile to use. E.g. default, dev, prod.
-d, --debug Enable Debug logging.
-hf, --hide_filenames
Dont show matching filesnames.
Dont show matching filenames.
-lt LOG_TYPE, --log_type LOG_TYPE
Return individual matching log entries based on pre-defined log types, otherwise
custom log_format and log_properties can be used. E.g. cloudtrail.
-lf LOG_FORMAT, --log_format LOG_FORMAT
Define custom log format of raw file to parse before applying search logic. Used if
--log_type is not defined. E.g. json.
-lp LOG_PROPERTIES, --log_properties LOG_PROPERTIES
Define custom list of properties to traverse to dynamically extract final list of log
records. Used if --log_type is not defined. E.g. [Records].
-jo JSON_OUTPUT, --json_output JSON_OUTPUT
Output as JSON.
```
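The `--log_properties` value is split on commas (see the `list_of_strings` helper in the `cloudgrep/__main__.py` diff below), and the help text describes the values as properties to traverse, so a nested structure can presumably be reached by listing each key in order, outermost first. A hedged example with made-up property names:
```
python3 cloudgrep.py -b my-bucket -q my_search -lf json -lp outer,Records
```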

### Deployment ###
35 changes: 34 additions & 1 deletion cloudgrep/__main__.py
@@ -1,10 +1,16 @@
from cloudgrep.cloudgrep import CloudGrep
import argparse
import logging
import sys
from typing import List

from cloudgrep.cloudgrep import CloudGrep

VERSION = "1.0.4"

# Define a custom argument type for a list of strings
def list_of_strings(arg: str) -> List[str]:
return arg.split(",")


def main() -> None:
parser = argparse.ArgumentParser(
@@ -68,6 +74,28 @@ def main() -> None:
parser.add_argument(
"-hf", "--hide_filenames", help="Dont show matching filenames. ", action="store_true", required=False
)
parser.add_argument(
"-lt",
"--log_type",
help="Return individual matching log entries based on pre-defined log types, otherwise custom log_format and log_properties can be used. E.g. cloudtrail. ",
required=False,
)
parser.add_argument(
"-lf",
"--log_format",
help="Define custom log format of raw file to parse before applying search logic. Used if --log_type is not defined. E.g. json. ",
required=False,
)
parser.add_argument(
"-lp",
"--log_properties",
type=list_of_strings,
help="Define custom list of properties to traverse to dynamically extract final list of log records. Used if --log_type is not defined. E.g. ["
"Records"
"]. ",
required=False,
)
parser.add_argument("-jo", "--json_output", help="Output as JSON.", action="store_true")
args = vars(parser.parse_args())

if len(sys.argv) == 1:
@@ -77,6 +105,7 @@ def main() -> None:
if args["debug"]:
logging.basicConfig(format="[%(asctime)s]:[%(levelname)s] - %(message)s", level=logging.INFO)
else:
logging.basicConfig(format="[%(asctime)s] - %(message)s", level=logging.WARNING)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)

CloudGrep().search(
@@ -93,7 +122,11 @@ def main() -> None:
args["start_date"],
args["end_date"],
args["hide_filenames"],
args["log_type"],
args["log_format"],
args["log_properties"],
args["profile"],
args["json_output"],
)
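Whatever is passed to `-lp`/`--log_properties` arrives in `args["log_properties"]` as a Python list because of the `list_of_strings` converter defined at the top of this file. A quick illustration (the second property name is made up, purely to show the comma splitting):
```
from typing import List


def list_of_strings(arg: str) -> List[str]:  # same helper as in __main__.py above
    return arg.split(",")


print(list_of_strings("Records"))        # ['Records'] -- what `-lp Records` produces
print(list_of_strings("Records,Items"))  # ['Records', 'Items']
```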


37 changes: 31 additions & 6 deletions cloudgrep/cloud.py
@@ -14,7 +15,7 @@

class Cloud:
def download_from_s3_multithread(
self, bucket: str, files: List[str], query: str, hide_filenames: bool, yara_rules: Any
self,
bucket: str,
files: List[str],
query: str,
hide_filenames: bool,
yara_rules: Any,
log_format: Optional[str] = None,
log_properties: List[str] = [],
json_output: Optional[bool] = False,
) -> int:
"""Use ThreadPoolExecutor and boto3 to download every file in the bucket from s3
Returns number of matched files"""
@@ -30,15 +38,17 @@ def download_file(key: str) -> None:
with tempfile.NamedTemporaryFile() as tmp:
logging.info(f"Downloading {bucket} {key} to {tmp.name}")
s3.download_file(bucket, key, tmp.name)
matched = Search().search_file(tmp.name, key, query, hide_filenames, yara_rules)
matched = Search().search_file(
tmp.name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
if matched:
nonlocal matched_count
matched_count += 1

# Use ThreadPoolExecutor to download the files
with concurrent.futures.ThreadPoolExecutor() as executor: # type: ignore
executor.map(download_file, files)
# For debugging, single thread:
# For debugging, run in a single thread for clearer logging:
# for file in files:
# download_file(file)
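All three download functions in this file follow the same shape: fan the matching keys out across a `ThreadPoolExecutor`, pull each object down to a temporary file, and hand it to `Search().search_file` together with the new log_format/log_properties/json_output arguments. A stripped-down sketch of that pattern, with `fetch` and `search_one` standing in for the provider-specific download call and the search call:
```
import concurrent.futures
import tempfile
from typing import Callable, List


def download_and_search(keys: List[str], fetch: Callable[[str, str], None],
                        search_one: Callable[[str, str], bool]) -> int:
    """Download each key to a temp file via fetch(key, path); count files where search_one(path, key) matches."""
    matched_count = 0

    def handle(key: str) -> None:
        nonlocal matched_count
        with tempfile.NamedTemporaryFile() as tmp:
            fetch(key, tmp.name)           # e.g. s3.download_file(bucket, key, tmp.name)
            if search_one(tmp.name, key):  # e.g. Search().search_file(tmp.name, key, ...)
                matched_count += 1

    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(handle, keys)  # leaving the with-block waits for every worker to finish
    return matched_count
```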

@@ -52,6 +62,9 @@ def download_from_azure(
query: str,
hide_filenames: bool,
yara_rules: Any,
log_format: Optional[str] = None,
log_properties: List[str] = [],
json_output: Optional[bool] = False,
) -> int:
"""Download every file in the container from azure
Returns number of matched files"""
@@ -71,7 +84,9 @@ def download_file(key: str) -> None:
with open(tmp.name, "wb") as my_blob:
blob_data = blob_client.download_blob()
blob_data.readinto(my_blob)
matched = Search().search_file(tmp.name, key, query, hide_filenames, yara_rules)
matched = Search().search_file(
tmp.name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
if matched:
nonlocal matched_count
matched_count += 1
@@ -85,7 +100,15 @@ def download_file(key: str) -> None:
return matched_count

def download_from_google(
self, bucket: str, files: List[str], query: str, hide_filenames: bool, yara_rules: Any
self,
bucket: str,
files: List[str],
query: str,
hide_filenames: bool,
yara_rules: Any,
log_format: Optional[str] = None,
log_properties: List[str] = [],
json_output: Optional[bool] = False,
) -> int:
"""Download every file in the bucket from google
Returns number of matched files"""
@@ -99,7 +122,9 @@ def download_file(key: str) -> None:
logging.info(f"Downloading {bucket} {key} to {tmp.name}")
blob = bucket_gcp.get_blob(key)
blob.download_to_filename(tmp.name)
matched = Search().search_file(tmp.name, key, query, hide_filenames, yara_rules)
matched = Search().search_file(
tmp.name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
if matched:
nonlocal matched_count
matched_count += 1
54 changes: 46 additions & 8 deletions cloudgrep/cloudgrep.py
@@ -4,6 +8,2 @@
from typing import Optional
import logging
from cloudgrep.cloud import Cloud
from typing import List

import yara # type: ignore


@@ -28,15 +30,30 @@ def search(
from_date: Optional[datetime] = None,
end_date: Optional[datetime] = None,
hide_filenames: bool = False,
log_type: Optional[str] = None,
log_format: Optional[str] = None,
log_properties: List[str] = [],
profile: Optional[str] = None,
json_output: Optional[bool] = False,
) -> None:
# load in a list of queries from a file
if not query and file:
logging.info(f"Loading queries in from {file}")
logging.debug(f"Loading queries in from {file}")
query = self.load_queries(file)

# Set log_format and log_properties values based on potential log_type input argument
if log_type != None:
match log_type:
case "cloudtrail":
log_format = "json"
log_properties = ["Records"]
case _:
logging.error(
f"Invalid log_type value ('{log_type}') unhandled in switch statement in 'search' function."
)

if yara_file:
logging.info(f"Loading yara rules from {yara_file}")
logging.debug(f"Loading yara rules from {yara_file}")
yara_rules = yara.compile(filepath=yara_file)
else:
yara_rules = None
@@ -59,11 +76,20 @@
)
s3_client = boto3.client("s3")
region = s3_client.get_bucket_location(Bucket=bucket)
print(
f"Bucket is in region: {region['LocationConstraint']} : Search from the same region to avoid egress charges."
if log_format != None:
logging.warning(
f"Bucket is in region: {region['LocationConstraint']} : Search from the same region to avoid egress charges."
)
logging.warning(f"Searching {len(matching_keys)} files in {bucket} for {query}...")

else:
print(
f"Bucket is in region: {region['LocationConstraint']} : Search from the same region to avoid egress charges."
)
print(f"Searching {len(matching_keys)} files in {bucket} for {query}...")
Cloud().download_from_s3_multithread(
bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
print(f"Searching {len(matching_keys)} files in {bucket} for {query}...")
Cloud().download_from_s3_multithread(bucket, matching_keys, query, hide_filenames, yara_rules)

if account_name and container_name:
matching_keys = list(
@@ -72,7 +98,17 @@
)
)
print(f"Searching {len(matching_keys)} files in {account_name}/{container_name} for {query}...")
Cloud().download_from_azure(account_name, container_name, matching_keys, query, hide_filenames, yara_rules)
Cloud().download_from_azure(
account_name,
container_name,
matching_keys,
query,
hide_filenames,
yara_rules,
log_format,
log_properties,
json_output,
)

if google_bucket:
matching_keys = list(
@@ -81,4 +117,6 @@

print(f"Searching {len(matching_keys)} files in {google_bucket} for {query}...")

Cloud().download_from_google(google_bucket, matching_keys, query, hide_filenames, yara_rules)
Cloud().download_from_google(
google_bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output
)
(Diffs for the remaining changed files are not shown.)
