From 31ce61d9940918bf5539ca05fb14fd8882fb6c43 Mon Sep 17 00:00:00 2001
From: eduardocerqueira
Date: Tue, 17 Sep 2024 17:12:22 +0000
Subject: [PATCH] 2024-09-17 17:12:22.213604 new snippets

---
 seeker/report.txt                     |  20 +++
 seeker/snippet/clone-github.sh        |  50 ++++++
 seeker/snippet/data-exploration.py    |  15 ++
 seeker/snippet/data-preprocessing.py  |  36 +++++
 seeker/snippet/dataset-preparation.py |  23 +++
 seeker/snippet/find-lambda.sh         | 165 ++++++++++++++++++++++
 seeker/snippet/fix-atime.py           |  24 ++++
 seeker/snippet/install.sh             |  25 ++++
 seeker/snippet/main.py                | 182 +++++++++++++++++++++++
 seeker/snippet/read-dataset.py        |  16 ++
 10 files changed, 556 insertions(+)
 create mode 100644 seeker/snippet/clone-github.sh
 create mode 100644 seeker/snippet/data-exploration.py
 create mode 100644 seeker/snippet/data-preprocessing.py
 create mode 100644 seeker/snippet/dataset-preparation.py
 create mode 100644 seeker/snippet/find-lambda.sh
 create mode 100644 seeker/snippet/fix-atime.py
 create mode 100644 seeker/snippet/install.sh
 create mode 100644 seeker/snippet/main.py
 create mode 100644 seeker/snippet/read-dataset.py

diff --git a/seeker/report.txt b/seeker/report.txt
index ce3258a7..a5528eb5 100644
--- a/seeker/report.txt
+++ b/seeker/report.txt
@@ -1,3 +1,23 @@
+--------------------------------------------------------------------------------
+ 2024-09-17 17:12:22.213604
+--------------------------------------------------------------------------------
+ On branch main
+Your branch is up to date with 'origin/main'.
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+        snippet/clone-github.sh
+        snippet/data-exploration.py
+        snippet/data-preprocessing.py
+        snippet/dataset-preparation.py
+        snippet/find-lambda.sh
+        snippet/fix-atime.py
+        snippet/install.sh
+        snippet/main.py
+        snippet/read-dataset.py
+
+nothing added to commit but untracked files present (use "git add" to track)
+
 --------------------------------------------------------------------------------
  2024-09-16 17:12:34.795202
 --------------------------------------------------------------------------------
diff --git a/seeker/snippet/clone-github.sh b/seeker/snippet/clone-github.sh
new file mode 100644
index 00000000..2ba7e1c9
--- /dev/null
+++ b/seeker/snippet/clone-github.sh
@@ -0,0 +1,50 @@
+#date: 2024-09-17T16:55:11Z
+#url: https://api.github.com/gists/ecec01f35abce4af34a2409af65dc85a
+#owner: https://api.github.com/users/dleslie
+
+#!/bin/bash
+
+username="${username:-dleslie}"
+
+function git_clone_or_update {
+  local url=$1
+  local name="${url##*/}"
+  name="${name%.git}"
+  if [ -d "$name" ]; then
+    pushd "$name"
+    git pull
+    popd
+  else
+    git clone --recursive "$url" "$name"
+  fi
+}
+
+found_urls=""
+function get_type {
+  local type=$1
+  local page=$2
+  found_urls=$(curl -s "https://api.github.com/users/$username/$type?per_page=100&page=$page" | jq -c '.[] | .ssh_url')
+}
+
+function fetch_type {
+  local type=$1
+  local root=$2
+  local index=1
+
+  mkdir -p "$root"
+  pushd "$root"
+  get_type "$type" "$index"
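+  # Page through the results until the API returns an empty page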
-z "$found_urls" ]; do + for url in $found_urls; do + url=`echo $url | tr -d \"`; + git_clone_or_update $url; + done + index=$((index+1)); + get_type $type $index + done + popd +} + +fetch_type repos $username +fetch_type starred starred \ No newline at end of file diff --git a/seeker/snippet/data-exploration.py b/seeker/snippet/data-exploration.py new file mode 100644 index 00000000..3f1e271b --- /dev/null +++ b/seeker/snippet/data-exploration.py @@ -0,0 +1,13 @@ +#date: 2024-09-17T17:09:06Z +#url: https://api.github.com/gists/262f97965307559af70c6cdf976ea6a0 +#owner: https://api.github.com/users/docsallover + +# How many articles per subject? +print(data.groupby(['subject'])['text'].count()) +data.groupby(['subject'])['text'].count().plot(kind="bar") +plt.show() + +# How many fake and real articles? +print(data.groupby(['target'])['text'].count()) +data.groupby(['target'])['text'].count().plot(kind="bar") +plt.show() \ No newline at end of file diff --git a/seeker/snippet/data-preprocessing.py b/seeker/snippet/data-preprocessing.py new file mode 100644 index 00000000..fc07d730 --- /dev/null +++ b/seeker/snippet/data-preprocessing.py @@ -0,0 +1,35 @@ +#date: 2024-09-17T16:59:16Z +#url: https://api.github.com/gists/af16acf0a46155283562cb074498248c +#owner: https://api.github.com/users/docsallover + +from nltk.corpus import stopwords +nltk.download('stopwords') # Download stopwords if not already downloaded + +# Remove the date column (assuming it's not relevant for analysis) +data.drop(["date"], axis=1, inplace=True) +print(data.head()) + +# Remove the title column (focusing on text content) +data.drop(["title"], axis=1, inplace=True) +print(data.head()) + +# Convert text to lowercase for consistency +data['text'] = data['text'].apply(lambda x: x.lower()) +print(data.head()) + +import string + +# Remove punctuation for cleaner analysis +def punctuation_removal(text): + all_list = [char for char in text if char not in string.punctuation] + clean_str = ''.join(all_list) + return clean_str + +data['text'] = data['text'].apply(punctuation_removal) +print(data.head()) + +# Remove stopwords for better word representation +stop = stopwords.words('english') + +data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) +print(data.head()) \ No newline at end of file diff --git a/seeker/snippet/dataset-preparation.py b/seeker/snippet/dataset-preparation.py new file mode 100644 index 00000000..232bf1c2 --- /dev/null +++ b/seeker/snippet/dataset-preparation.py @@ -0,0 +1,20 @@ +#date: 2024-09-17T16:52:09Z +#url: https://api.github.com/gists/68d5b273d0473a793c5e192956037efd +#owner: https://api.github.com/users/docsallover + +# Add a flag to track fake and real news +fake['target'] = 'fake' +true['target'] = 'true' + +# Concatenate the dataframes +data = pd.concat([fake, true]).reset_index(drop=True) + +# Check the shape of the combined dataset +print(data.shape) + +# Shuffle the data +data = shuffle(data) +data = data.reset_index(drop=True) + +# Check the first few rows of the shuffled data +print(data.head()) \ No newline at end of file diff --git a/seeker/snippet/find-lambda.sh b/seeker/snippet/find-lambda.sh new file mode 100644 index 00000000..4b2071e3 --- /dev/null +++ b/seeker/snippet/find-lambda.sh @@ -0,0 +1,161 @@ +#date: 2024-09-17T17:00:19Z +#url: https://api.github.com/gists/3d35243c3b968b9d1d883680ec0ea2fe +#owner: https://api.github.com/users/darksinge + +#!/usr/bin/env bash + +set -e + +NO_CACHE=0 +CLEAN=0 + +AWS_PROFILE="" 
+AWS_REGION="us-east-1" + +help() { + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +This script helps you find and open an AWS Lambda function in the AWS Console. + +Options: + -p, --profile PROFILE Specify the AWS profile to use + -r, --region REGION Specify the AWS region to use + --no-cache Disable caching of AWS resources + --clean Clear out cached results + -h, --help Display this help message and exit + +The script will prompt you to: +1. Choose or enter a stage (dev/stage/prod/other) +2. Select a CloudFormation stack +3. Choose a Lambda function from the selected stack + +It will then open the chosen Lambda function in your default web browser. + +Note: This script requires the following tools to be installed: +- AWS CLI +- jq +- gum (for interactive prompts) +EOF +} + +if ! gum -v >/dev/null 2>&1; then + echo "The 'gum' command was not found." + echo "Visit https://github.com/charmbracelet/gum for installation instructions." + exit 1 +fi + +while [[ $# -gt 0 ]]; do + case $1 in + -p|--profile) + export AWS_PROFILE="$2" + shift 2 + ;; + --no-cache) + export NO_CACHE=1 + shift + ;; + --region) + AWS_REGION="$2" + shift 2 + ;; + --clean) + CLEAN=1 + ;; + -h|--help) + help + exit 0 + ;; + *) + help + exit 1 + ;; + esac +done + +CACHE_DIR="$HOME/.cache/find-fn" +if [ ! -d "$CACHE_DIR" ]; then + mkdir -p "$CACHE_DIR" +fi + +if [ $CLEAN -eq 1 ]; then + rm -rf "$CACHE_DIR" >/dev/null 2>&1 + mkdir -p "$CACHE_DIR" + exit 0 +fi + +STAGE=$(gum choose "dev" "stage" "prod" "other") +if [ "$STAGE" == "other" ]; then + STAGE=$(gum input --placeholder "stage name?") +fi + +STACKS_LIST_CACHE="$CACHE_DIR/$STAGE-stacks" + +function _make_temp() { + type="$1" + fcache="$CACHE_DIR/$STAGE-$type" + + if [ $NO_CACHE -eq 1 ]; then + echo "$(mktemp)" + return 0 + fi + + local tmp="" + if [ -f "$fcache" ]; then + tmp=$(cat "$fcache") + fi + + if [ ! 
-f "$tmp" ]; then + tmp=$(mktemp) + echo "$tmp" > "$fcache" + else + tmp=$(cat "$fcache") + fi + + echo "$tmp" +} + +function make_temp() { + set +e + echo $(_make_temp "$1") + set -e +} + +stack_list_cache=$(make_temp "stacks") +if [ -f "$stack_list_cache" ]; then + STACKS=$(cat "$stack_list_cache") +fi + +if [ -z "$STACKS" ]; then + STACKS=$(gum spin --spinner dot --title 'Fetching stacks' --show-output -- \ + aws cloudformation list-stacks \ + --query "StackSummaries[?starts_with(StackName, '$STAGE-certifications-service-')].StackName" \ + --output json) + + echo "$STACKS" > "$stack_list_cache" +fi + +PREFIX="$STAGE-certifications-service-" +STACK_NAME=$(gum choose $(echo "$STACKS" | jq -r '.[]' | sed "s/$PREFIX//")) +STACK_NAME="$PREFIX$STACK_NAME" + +resource_cache=$(make_temp "$STACK_NAME-resources") +if [ -f "$resource_cache" ]; then + RESOURCES=$(cat "$resource_cache") +fi + +if [ -z "$RESOURCES" ]; then + RESOURCES=$(gum spin --spinner dot --title 'Fetching resources' --show-output -- \ + aws cloudformation list-stack-resources --stack-name "$STACK_NAME" \ + --output json) + echo "$RESOURCES" > "$resource_cache" +fi + +RESOURCES=$(cat "$resource_cache" | jq '.StackResourceSummaries') + +LOGICAL_ID=$(echo "$RESOURCES" | jq -r '.[] | select(.ResourceType == "AWS::Lambda::Function") | .LogicalResourceId' | gum filter) +PHYSICAL_ID=$(echo "$RESOURCES" | jq -r ".[] | select(.LogicalResourceId == \"$LOGICAL_ID\") | .PhysicalResourceId") + +if [ -n "$PHYSICAL_ID" ]; then + open "https://$AWS_REGION.console.aws.amazon.com/lambda/home?region=$AWS_REGION#/functions/$PHYSICAL_ID?tab=monitor" +fi diff --git a/seeker/snippet/fix-atime.py b/seeker/snippet/fix-atime.py new file mode 100644 index 00000000..5883a438 --- /dev/null +++ b/seeker/snippet/fix-atime.py @@ -0,0 +1,24 @@ +#date: 2024-09-17T17:09:25Z +#url: https://api.github.com/gists/10ab9650a38e8556c74a1dd4876cd60c +#owner: https://api.github.com/users/meeb + +#!/usr/bin/python3 + +import os +import time +from pathlib import Path + +this_dir = os.path.dirname(os.path.realpath(__file__)) +silly_time = time.mktime(time.strptime('2038-01-01', '%Y-%m-%d')) + +for root, dirs, files in os.walk(this_dir): + rootpath = Path(root) + for f in files: + filepath = rootpath / f + if not os.path.isfile(filepath): + continue + access_time = os.path.getatime(filepath) + # if the file access time is > 2038-01-01 then touch it + if access_time > silly_time: + print(f'Fixing future access time file: {filepath} ({access_time} > {silly_time})') + filepath.touch() \ No newline at end of file diff --git a/seeker/snippet/install.sh b/seeker/snippet/install.sh new file mode 100644 index 00000000..d9b72aa1 --- /dev/null +++ b/seeker/snippet/install.sh @@ -0,0 +1,25 @@ +#date: 2024-09-17T16:55:05Z +#url: https://api.github.com/gists/18888e23fee5a7788305f1aa35a1df3b +#owner: https://api.github.com/users/zitterbewegung + +#!/bin/bash + +# Make the script executable +chmod +x install.sh + +# Install Python dependencies +echo "Installing Python dependencies from requirements.txt..." +pip install -r requirements.txt + +# Check and install Ghidra if needed +if ! command -v ghidra &> /dev/null; then + echo "Ghidra not found. Please install Ghidra manually from https://ghidra-sre.org/" +fi + +# Check and install DTrace if needed +if ! command -v dtrace &> /dev/null; then + echo "DTrace not found. Please install DTrace using your system's package manager." +fi + +# Add any other specific dependency checks as needed +echo "All dependencies installed. 
+# Retrieve paths and tokens from environment variables
+GDB_PATH = os.getenv('GDB_PATH')
+GHIDRA_PATH = os.getenv('GHIDRA_PATH')
+DTRACE_PATH = os.getenv('DTRACE_PATH')
+LITELLM_API_KEY = os.getenv('LITELLM_API_KEY')
+ADVISORY_DB_API_TOKEN = "**********"
+ADVISORY_DB_URL = "https://api.github.com/repos/github/advisory-database/contents/advisories"
+
+# Initialize LiteLLM client with the API key from the .env file
+llm_client = LiteLLMClient(api_key=LITELLM_API_KEY)
+
+class LiteLLMAgent:
+    """Agent to interact with LiteLLM for generating code explanations and other prompts."""
+
+    def __init__(self, model="gpt-3.5"):
+        self.model = model
+        self.client = llm_client
+
+    def generate_response(self, prompt):
+        """Generate response using LiteLLM."""
+        response = self.client.complete(
+            prompt=prompt,
+            model=self.model,
+            max_tokens="**********"  # masked value
+        )
+        return response.text
+
+class CodeExplanationTool(Tool):
+    """A tool that uses LiteLLM to explain the functionality of C/C++ source code."""
+
+    def __init__(self):
+        super().__init__(name="code_explanation", description="Generates explanations for source code using LiteLLM.")
+        self.llm_agent = LiteLLMAgent()
+
+    def explain_code(self, code):
+        """Generate a description of what the code does using LiteLLM."""
+        explanation_prompt = f"Analyze the following C/C++ code and provide a detailed explanation of what it does:\n\n{code}\n\nExplanation:"
+        response = self.llm_agent.generate_response(explanation_prompt)
+        return response
+
+class AdvisoryScanTool(Tool):
+    """A tool that scans a GitHub repository for known vulnerabilities based on the GitHub Advisory Database."""
+
+    def __init__(self):
+        super().__init__(name="advisory_scan", description="Scans dependencies for known vulnerabilities.")
+        self.advisory_data = self.load_advisory_database()
+
+    def load_advisory_database(self):
+        """Loads advisories from the GitHub Advisory Database."""
+        headers = {'Authorization': "**********"}  # masked token header
+        response = requests.get(ADVISORY_DB_URL, headers=headers)
+        if response.status_code == 200:
+            advisories = response.json()
+            print("Advisory Database Loaded Successfully")
+            return advisories
+        else:
+            print("Failed to load Advisory Database")
+            return []
+
+    def parse_advisory_entries(self, advisory):
+        """Parse advisories to extract relevant information."""
+        details = {
+            "package_name": advisory.get("package_name", ""),
+            "vulnerable_versions": advisory.get("vulnerable_versions", ""),
+            "description": advisory.get("description", ""),
+            "severity": advisory.get("severity", ""),
+            "identifiers": advisory.get("identifiers", []),
+        }
+        return details
+
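+    # Assumed flow, reading the methods below: walk the repo, collect
+    # dependency names, then match them against the advisories loaded above.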
"""Scans the given GitHub repository path for dependencies and checks against known advisories.""" + repo = git.Repo(repo_path) + dependencies = self.extract_dependencies(repo) + vulnerabilities = self.match_advisories(dependencies) + return vulnerabilities + + def extract_dependencies(self, repo): + """Extracts dependencies from the repository (example for C/C++ projects).""" + dependencies = [] + files = repo.git.ls_files('*.txt', '*.json', '*.yaml', '*.lock').splitlines() + for file_path in files: + with open(os.path.join(repo.working_dir, file_path), 'r') as file: + content = file.read() + dependencies.extend(self.parse_dependencies_from_file(content)) + return dependencies + + def parse_dependencies_from_file(self, content): + """Parses dependencies from a given file content.""" + dependencies = [] + for line in content.splitlines(): + if "==" in line: + package = line.split("==")[0].strip() + dependencies.append(package) + return dependencies + + def match_advisories(self, dependencies): + """Matches dependencies against advisories from the GitHub Advisory Database.""" + matched_vulnerabilities = [] + for advisory in self.advisory_data: + advisory_details = self.parse_advisory_entries(advisory) + for dependency in dependencies: + if advisory_details["package_name"].lower() == dependency.lower(): + matched_vulnerabilities.append(advisory_details) + print(f"Vulnerability found for {dependency}: {advisory_details}") + return matched_vulnerabilities + +# Main LangGraph application setup +class CodeAnalysisLangGraph: + def __init__(self): + self.langgraph = LangGraph() + self.vulnerability_tool = VulnerabilityDetectionTool() + self.advisory_scan_tool = AdvisoryScanTool() + self.code_explanation_tool = CodeExplanationTool() + + def analyze_repository(self, repo_path): + """Main function to analyze the repository and program behavior.""" + # Initialize GitPython repository + repo = git.Repo(repo_path) + + # Scan and explain each C/C++ source file + source_files = repo.git.ls_files('*.c', '*.cpp').splitlines() + for file_path in source_files: + full_path = os.path.join(repo_path, file_path) + with open(full_path, 'r') as file: + source_code = file.read() + + # Vulnerability Detection + vulnerabilities = self.vulnerability_tool.scan_code(source_code) + if vulnerabilities: + print(f"Vulnerabilities in {file_path}:") + for vulnerability in vulnerabilities: + print("-", vulnerability) + + # Code Explanation + explanation = self.code_explanation_tool.explain_code(source_code) + print(f"Explanation for {file_path}:\n{explanation}\n") + + # Advisory Scan Tool + vulnerabilities = self.advisory_scan_tool.scan_repository(repo_path) + if vulnerabilities: + print("Dependency Vulnerabilities Detected:") + for vulnerability in vulnerabilities: + print(json.dumps(vulnerability, indent=4)) + +def main(): + parser = argparse.ArgumentParser(description="Analyze a GitHub repository for vulnerabilities and code functionality.") + parser.add_argument('--repo', type=str, help='Path to the GitHub repository', default=os.getenv('REPO_PATH')) + args = parser.parse_args() + + # Run the analysis with the provided or default repository path + analyzer = CodeAnalysisLangGraph() + analyzer.analyze_repository(args.repo) + +if __name__ == "__main__": + main() + + +if __name__ == "__main__": + main() diff --git a/seeker/snippet/read-dataset.py b/seeker/snippet/read-dataset.py new file mode 100644 index 00000000..d143c8e6 --- /dev/null +++ b/seeker/snippet/read-dataset.py @@ -0,0 +1,13 @@ +#date: 2024-09-17T16:45:08Z +#url: 
+import pandas as pd
+
+# Load the fake news dataset
+fake = pd.read_csv("data/Fake.csv")
+
+# Load the true news dataset
+true = pd.read_csv("data/True.csv")
+
+# Print the shape of each dataset
+print("Fake news dataset shape:", fake.shape)
+print("True news dataset shape:", true.shape)
\ No newline at end of file