From 31ce61d9940918bf5539ca05fb14fd8882fb6c43 Mon Sep 17 00:00:00 2001
From: eduardocerqueira
Date: Tue, 17 Sep 2024 17:12:22 +0000
Subject: [PATCH] 2024-09-17 17:12:22.213604 new snippets

---
 seeker/report.txt                     |  20 +++
 seeker/snippet/clone-github.sh        |  50 ++++++
 seeker/snippet/data-exploration.py    |  15 ++
 seeker/snippet/data-preprocessing.py  |  36 +++++
 seeker/snippet/dataset-preparation.py |  23 +++
 seeker/snippet/find-lambda.sh         | 165 ++++++++++++++++++++++
 seeker/snippet/fix-atime.py           |  24 ++++
 seeker/snippet/install.sh             |  25 ++++
 seeker/snippet/main.py                | 182 +++++++++++++++++++++++
 seeker/snippet/read-dataset.py        |  16 ++
 10 files changed, 556 insertions(+)
 create mode 100644 seeker/snippet/clone-github.sh
 create mode 100644 seeker/snippet/data-exploration.py
 create mode 100644 seeker/snippet/data-preprocessing.py
 create mode 100644 seeker/snippet/dataset-preparation.py
 create mode 100644 seeker/snippet/find-lambda.sh
 create mode 100644 seeker/snippet/fix-atime.py
 create mode 100644 seeker/snippet/install.sh
 create mode 100644 seeker/snippet/main.py
 create mode 100644 seeker/snippet/read-dataset.py

diff --git a/seeker/report.txt b/seeker/report.txt
index ce3258a7..a5528eb5 100644
--- a/seeker/report.txt
+++ b/seeker/report.txt
@@ -1,3 +1,23 @@
+--------------------------------------------------------------------------------
+ 2024-09-17 17:12:22.213604
+--------------------------------------------------------------------------------
+ On branch main
+Your branch is up to date with 'origin/main'.
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+        snippet/clone-github.sh
+        snippet/data-exploration.py
+        snippet/data-preprocessing.py
+        snippet/dataset-preparation.py
+        snippet/find-lambda.sh
+        snippet/fix-atime.py
+        snippet/install.sh
+        snippet/main.py
+        snippet/read-dataset.py
+
+nothing added to commit but untracked files present (use "git add" to track)
+
 --------------------------------------------------------------------------------
  2024-09-16 17:12:34.795202
 --------------------------------------------------------------------------------
diff --git a/seeker/snippet/clone-github.sh b/seeker/snippet/clone-github.sh
new file mode 100644
index 00000000..2ba7e1c9
--- /dev/null
+++ b/seeker/snippet/clone-github.sh
@@ -0,0 +1,50 @@
+#date: 2024-09-17T16:55:11Z
+#url: https://api.github.com/gists/ecec01f35abce4af34a2409af65dc85a
+#owner: https://api.github.com/users/dleslie
+
+#!/bin/bash
+
+username="${username:-dleslie}"
+
+function git_clone_or_update {
+  local url=$1
+  local name="${url##*/}"
+  name="${name%.git}"
+  if [ -d "$name" ]; then
+    pushd "$name"
+    git pull
+    popd
+  else
+    git clone --recursive "$url" "$name"
+  fi
+}
+
+found_urls=""
+function get_type {
+  local type=$1
+  local page=$2
+  found_urls=$(curl -s "https://api.github.com/users/$username/$type?per_page=100&page=$page" | jq -c '.[] | .ssh_url')
+}
+
+function fetch_type {
+  local type=$1
+  local root=$2
+  local index=1
+
+  mkdir -p "$root"
+  pushd "$root"
+  get_type "$type" "$index"
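+  # Page through the results until the API returns an empty page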
-z "$found_urls" ]; do + for url in $found_urls; do + url=`echo $url | tr -d \"`; + git_clone_or_update $url; + done + index=$((index+1)); + get_type $type $index + done + popd +} + +fetch_type repos $username +fetch_type starred starred \ No newline at end of file diff --git a/seeker/snippet/data-exploration.py b/seeker/snippet/data-exploration.py new file mode 100644 index 00000000..3f1e271b --- /dev/null +++ b/seeker/snippet/data-exploration.py @@ -0,0 +1,13 @@ +#date: 2024-09-17T17:09:06Z +#url: https://api.github.com/gists/262f97965307559af70c6cdf976ea6a0 +#owner: https://api.github.com/users/docsallover + +# How many articles per subject? +print(data.groupby(['subject'])['text'].count()) +data.groupby(['subject'])['text'].count().plot(kind="bar") +plt.show() + +# How many fake and real articles? +print(data.groupby(['target'])['text'].count()) +data.groupby(['target'])['text'].count().plot(kind="bar") +plt.show() \ No newline at end of file diff --git a/seeker/snippet/data-preprocessing.py b/seeker/snippet/data-preprocessing.py new file mode 100644 index 00000000..fc07d730 --- /dev/null +++ b/seeker/snippet/data-preprocessing.py @@ -0,0 +1,35 @@ +#date: 2024-09-17T16:59:16Z +#url: https://api.github.com/gists/af16acf0a46155283562cb074498248c +#owner: https://api.github.com/users/docsallover + +from nltk.corpus import stopwords +nltk.download('stopwords') # Download stopwords if not already downloaded + +# Remove the date column (assuming it's not relevant for analysis) +data.drop(["date"], axis=1, inplace=True) +print(data.head()) + +# Remove the title column (focusing on text content) +data.drop(["title"], axis=1, inplace=True) +print(data.head()) + +# Convert text to lowercase for consistency +data['text'] = data['text'].apply(lambda x: x.lower()) +print(data.head()) + +import string + +# Remove punctuation for cleaner analysis +def punctuation_removal(text): + all_list = [char for char in text if char not in string.punctuation] + clean_str = ''.join(all_list) + return clean_str + +data['text'] = data['text'].apply(punctuation_removal) +print(data.head()) + +# Remove stopwords for better word representation +stop = stopwords.words('english') + +data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) +print(data.head()) \ No newline at end of file diff --git a/seeker/snippet/dataset-preparation.py b/seeker/snippet/dataset-preparation.py new file mode 100644 index 00000000..232bf1c2 --- /dev/null +++ b/seeker/snippet/dataset-preparation.py @@ -0,0 +1,20 @@ +#date: 2024-09-17T16:52:09Z +#url: https://api.github.com/gists/68d5b273d0473a793c5e192956037efd +#owner: https://api.github.com/users/docsallover + +# Add a flag to track fake and real news +fake['target'] = 'fake' +true['target'] = 'true' + +# Concatenate the dataframes +data = pd.concat([fake, true]).reset_index(drop=True) + +# Check the shape of the combined dataset +print(data.shape) + +# Shuffle the data +data = shuffle(data) +data = data.reset_index(drop=True) + +# Check the first few rows of the shuffled data +print(data.head()) \ No newline at end of file diff --git a/seeker/snippet/find-lambda.sh b/seeker/snippet/find-lambda.sh new file mode 100644 index 00000000..4b2071e3 --- /dev/null +++ b/seeker/snippet/find-lambda.sh @@ -0,0 +1,161 @@ +#date: 2024-09-17T17:00:19Z +#url: https://api.github.com/gists/3d35243c3b968b9d1d883680ec0ea2fe +#owner: https://api.github.com/users/darksinge + +#!/usr/bin/env bash + +set -e + +NO_CACHE=0 +CLEAN=0 + +AWS_PROFILE="" 
+AWS_REGION="us-east-1" + +help() { + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +This script helps you find and open an AWS Lambda function in the AWS Console. + +Options: + -p, --profile PROFILE Specify the AWS profile to use + -r, --region REGION Specify the AWS region to use + --no-cache Disable caching of AWS resources + --clean Clear out cached results + -h, --help Display this help message and exit + +The script will prompt you to: +1. Choose or enter a stage (dev/stage/prod/other) +2. Select a CloudFormation stack +3. Choose a Lambda function from the selected stack + +It will then open the chosen Lambda function in your default web browser. + +Note: This script requires the following tools to be installed: +- AWS CLI +- jq +- gum (for interactive prompts) +EOF +} + +if ! gum -v >/dev/null 2>&1; then + echo "The 'gum' command was not found." + echo "Visit https://github.com/charmbracelet/gum for installation instructions." + exit 1 +fi + +while [[ $# -gt 0 ]]; do + case $1 in + -p|--profile) + export AWS_PROFILE="$2" + shift 2 + ;; + --no-cache) + export NO_CACHE=1 + shift + ;; + --region) + AWS_REGION="$2" + shift 2 + ;; + --clean) + CLEAN=1 + ;; + -h|--help) + help + exit 0 + ;; + *) + help + exit 1 + ;; + esac +done + +CACHE_DIR="$HOME/.cache/find-fn" +if [ ! -d "$CACHE_DIR" ]; then + mkdir -p "$CACHE_DIR" +fi + +if [ $CLEAN -eq 1 ]; then + rm -rf "$CACHE_DIR" >/dev/null 2>&1 + mkdir -p "$CACHE_DIR" + exit 0 +fi + +STAGE=$(gum choose "dev" "stage" "prod" "other") +if [ "$STAGE" == "other" ]; then + STAGE=$(gum input --placeholder "stage name?") +fi + +STACKS_LIST_CACHE="$CACHE_DIR/$STAGE-stacks" + +function _make_temp() { + type="$1" + fcache="$CACHE_DIR/$STAGE-$type" + + if [ $NO_CACHE -eq 1 ]; then + echo "$(mktemp)" + return 0 + fi + + local tmp="" + if [ -f "$fcache" ]; then + tmp=$(cat "$fcache") + fi + + if [ ! 
-f "$tmp" ]; then + tmp=$(mktemp) + echo "$tmp" > "$fcache" + else + tmp=$(cat "$fcache") + fi + + echo "$tmp" +} + +function make_temp() { + set +e + echo $(_make_temp "$1") + set -e +} + +stack_list_cache=$(make_temp "stacks") +if [ -f "$stack_list_cache" ]; then + STACKS=$(cat "$stack_list_cache") +fi + +if [ -z "$STACKS" ]; then + STACKS=$(gum spin --spinner dot --title 'Fetching stacks' --show-output -- \ + aws cloudformation list-stacks \ + --query "StackSummaries[?starts_with(StackName, '$STAGE-certifications-service-')].StackName" \ + --output json) + + echo "$STACKS" > "$stack_list_cache" +fi + +PREFIX="$STAGE-certifications-service-" +STACK_NAME=$(gum choose $(echo "$STACKS" | jq -r '.[]' | sed "s/$PREFIX//")) +STACK_NAME="$PREFIX$STACK_NAME" + +resource_cache=$(make_temp "$STACK_NAME-resources") +if [ -f "$resource_cache" ]; then + RESOURCES=$(cat "$resource_cache") +fi + +if [ -z "$RESOURCES" ]; then + RESOURCES=$(gum spin --spinner dot --title 'Fetching resources' --show-output -- \ + aws cloudformation list-stack-resources --stack-name "$STACK_NAME" \ + --output json) + echo "$RESOURCES" > "$resource_cache" +fi + +RESOURCES=$(cat "$resource_cache" | jq '.StackResourceSummaries') + +LOGICAL_ID=$(echo "$RESOURCES" | jq -r '.[] | select(.ResourceType == "AWS::Lambda::Function") | .LogicalResourceId' | gum filter) +PHYSICAL_ID=$(echo "$RESOURCES" | jq -r ".[] | select(.LogicalResourceId == \"$LOGICAL_ID\") | .PhysicalResourceId") + +if [ -n "$PHYSICAL_ID" ]; then + open "https://$AWS_REGION.console.aws.amazon.com/lambda/home?region=$AWS_REGION#/functions/$PHYSICAL_ID?tab=monitor" +fi diff --git a/seeker/snippet/fix-atime.py b/seeker/snippet/fix-atime.py new file mode 100644 index 00000000..5883a438 --- /dev/null +++ b/seeker/snippet/fix-atime.py @@ -0,0 +1,24 @@ +#date: 2024-09-17T17:09:25Z +#url: https://api.github.com/gists/10ab9650a38e8556c74a1dd4876cd60c +#owner: https://api.github.com/users/meeb + +#!/usr/bin/python3 + +import os +import time +from pathlib import Path + +this_dir = os.path.dirname(os.path.realpath(__file__)) +silly_time = time.mktime(time.strptime('2038-01-01', '%Y-%m-%d')) + +for root, dirs, files in os.walk(this_dir): + rootpath = Path(root) + for f in files: + filepath = rootpath / f + if not os.path.isfile(filepath): + continue + access_time = os.path.getatime(filepath) + # if the file access time is > 2038-01-01 then touch it + if access_time > silly_time: + print(f'Fixing future access time file: {filepath} ({access_time} > {silly_time})') + filepath.touch() \ No newline at end of file diff --git a/seeker/snippet/install.sh b/seeker/snippet/install.sh new file mode 100644 index 00000000..d9b72aa1 --- /dev/null +++ b/seeker/snippet/install.sh @@ -0,0 +1,25 @@ +#date: 2024-09-17T16:55:05Z +#url: https://api.github.com/gists/18888e23fee5a7788305f1aa35a1df3b +#owner: https://api.github.com/users/zitterbewegung + +#!/bin/bash + +# Make the script executable +chmod +x install.sh + +# Install Python dependencies +echo "Installing Python dependencies from requirements.txt..." +pip install -r requirements.txt + +# Check and install Ghidra if needed +if ! command -v ghidra &> /dev/null; then + echo "Ghidra not found. Please install Ghidra manually from https://ghidra-sre.org/" +fi + +# Check and install DTrace if needed +if ! command -v dtrace &> /dev/null; then + echo "DTrace not found. Please install DTrace using your system's package manager." +fi + +# Add any other specific dependency checks as needed +echo "All dependencies installed. 
+# Retrieve paths and tokens from environment variables
+GDB_PATH = os.getenv('GDB_PATH')
+GHIDRA_PATH = os.getenv('GHIDRA_PATH')
+DTRACE_PATH = os.getenv('DTRACE_PATH')
+LITELLM_API_KEY = os.getenv('LITELLM_API_KEY')
+ADVISORY_DB_API_TOKEN = "**********"
+ADVISORY_DB_URL = "https://api.github.com/repos/github/advisory-database/contents/advisories"
+
+# Initialize LiteLLM client with the API key from the .env file
+llm_client = LiteLLMClient(api_key=LITELLM_API_KEY)
+
+class LiteLLMAgent:
+    """Agent to interact with LiteLLM for generating code explanations and other prompts."""
+
+    def __init__(self, model="gpt-3.5"):
+        self.model = model
+        self.client = llm_client
+
+    def generate_response(self, prompt):
+        """Generate response using LiteLLM."""
+        response = self.client.complete(
+            prompt=prompt,
+            model=self.model,
+            max_tokens="**********"  # masked value
+        )
+        return response.text
+
+class CodeExplanationTool(Tool):
+    """A tool that uses LiteLLM to explain the functionality of C/C++ source code."""
+
+    def __init__(self):
+        super().__init__(name="code_explanation", description="Generates explanations for source code using LiteLLM.")
+        self.llm_agent = LiteLLMAgent()
+
+    def explain_code(self, code):
+        """Generate a description of what the code does using LiteLLM."""
+        explanation_prompt = f"Analyze the following C/C++ code and provide a detailed explanation of what it does:\n\n{code}\n\nExplanation:"
+        response = self.llm_agent.generate_response(explanation_prompt)
+        return response
+
+class AdvisoryScanTool(Tool):
+    """A tool that scans a GitHub repository for known vulnerabilities based on the GitHub Advisory Database."""
+
+    def __init__(self):
+        super().__init__(name="advisory_scan", description="Scans dependencies for known vulnerabilities.")
+        self.advisory_data = self.load_advisory_database()
+
+    def load_advisory_database(self):
+        """Loads advisories from the GitHub Advisory Database."""
+        headers = {'Authorization': "**********"}  # masked token header
+        response = requests.get(ADVISORY_DB_URL, headers=headers)
+        if response.status_code == 200:
+            advisories = response.json()
+            print("Advisory Database Loaded Successfully")
+            return advisories
+        else:
+            print("Failed to load Advisory Database")
+            return []
+
+    def parse_advisory_entries(self, advisory):
+        """Parse advisories to extract relevant information."""
+        details = {
+            "package_name": advisory.get("package_name", ""),
+            "vulnerable_versions": advisory.get("vulnerable_versions", ""),
+            "description": advisory.get("description", ""),
+            "severity": advisory.get("severity", ""),
+            "identifiers": advisory.get("identifiers", []),
+        }
+        return details
+
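+    # Assumed flow, reading the methods below: walk the repo, collect
+    # dependency names, then match them against the advisories loaded above.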
"""Scans the given GitHub repository path for dependencies and checks against known advisories.""" + repo = git.Repo(repo_path) + dependencies = self.extract_dependencies(repo) + vulnerabilities = self.match_advisories(dependencies) + return vulnerabilities + + def extract_dependencies(self, repo): + """Extracts dependencies from the repository (example for C/C++ projects).""" + dependencies = [] + files = repo.git.ls_files('*.txt', '*.json', '*.yaml', '*.lock').splitlines() + for file_path in files: + with open(os.path.join(repo.working_dir, file_path), 'r') as file: + content = file.read() + dependencies.extend(self.parse_dependencies_from_file(content)) + return dependencies + + def parse_dependencies_from_file(self, content): + """Parses dependencies from a given file content.""" + dependencies = [] + for line in content.splitlines(): + if "==" in line: + package = line.split("==")[0].strip() + dependencies.append(package) + return dependencies + + def match_advisories(self, dependencies): + """Matches dependencies against advisories from the GitHub Advisory Database.""" + matched_vulnerabilities = [] + for advisory in self.advisory_data: + advisory_details = self.parse_advisory_entries(advisory) + for dependency in dependencies: + if advisory_details["package_name"].lower() == dependency.lower(): + matched_vulnerabilities.append(advisory_details) + print(f"Vulnerability found for {dependency}: {advisory_details}") + return matched_vulnerabilities + +# Main LangGraph application setup +class CodeAnalysisLangGraph: + def __init__(self): + self.langgraph = LangGraph() + self.vulnerability_tool = VulnerabilityDetectionTool() + self.advisory_scan_tool = AdvisoryScanTool() + self.code_explanation_tool = CodeExplanationTool() + + def analyze_repository(self, repo_path): + """Main function to analyze the repository and program behavior.""" + # Initialize GitPython repository + repo = git.Repo(repo_path) + + # Scan and explain each C/C++ source file + source_files = repo.git.ls_files('*.c', '*.cpp').splitlines() + for file_path in source_files: + full_path = os.path.join(repo_path, file_path) + with open(full_path, 'r') as file: + source_code = file.read() + + # Vulnerability Detection + vulnerabilities = self.vulnerability_tool.scan_code(source_code) + if vulnerabilities: + print(f"Vulnerabilities in {file_path}:") + for vulnerability in vulnerabilities: + print("-", vulnerability) + + # Code Explanation + explanation = self.code_explanation_tool.explain_code(source_code) + print(f"Explanation for {file_path}:\n{explanation}\n") + + # Advisory Scan Tool + vulnerabilities = self.advisory_scan_tool.scan_repository(repo_path) + if vulnerabilities: + print("Dependency Vulnerabilities Detected:") + for vulnerability in vulnerabilities: + print(json.dumps(vulnerability, indent=4)) + +def main(): + parser = argparse.ArgumentParser(description="Analyze a GitHub repository for vulnerabilities and code functionality.") + parser.add_argument('--repo', type=str, help='Path to the GitHub repository', default=os.getenv('REPO_PATH')) + args = parser.parse_args() + + # Run the analysis with the provided or default repository path + analyzer = CodeAnalysisLangGraph() + analyzer.analyze_repository(args.repo) + +if __name__ == "__main__": + main() + + +if __name__ == "__main__": + main() diff --git a/seeker/snippet/read-dataset.py b/seeker/snippet/read-dataset.py new file mode 100644 index 00000000..d143c8e6 --- /dev/null +++ b/seeker/snippet/read-dataset.py @@ -0,0 +1,13 @@ +#date: 2024-09-17T16:45:08Z +#url: 
+import pandas as pd
+
+# Load the fake news dataset
+fake = pd.read_csv("data/Fake.csv")
+
+# Load the true news dataset
+true = pd.read_csv("data/True.csv")
+
+# Print the shape of each dataset
+print("Fake news dataset shape:", fake.shape)
+print("True news dataset shape:", true.shape)
\ No newline at end of file