diff --git a/seeker/snippet/3sum.java b/seeker/snippet/3sum.java
deleted file mode 100644
index 584c2681..00000000
--- a/seeker/snippet/3sum.java
+++ /dev/null
@@ -1,39 +0,0 @@
-//date: 2024-08-14T18:07:51Z
-//url: https://api.github.com/gists/62caae13cf49004b4d972db686ff6308
-//owner: https://api.github.com/users/kymotz
-
-class Solution {
-    public List<List<Integer>> threeSum(int[] nums) {
-
-        List<List<Integer>> res = new ArrayList<>();
-        Arrays.sort(nums);
-
-        for (int i = 0; i < nums.length; i++) {
-            if (nums[i] > 0) return res;
-            // skip duplicate values of i
-            if (i > 0 && nums[i] == nums[i - 1]) {
-                continue;
-            }
-            int left = i + 1, right = nums.length - 1;
-            while (left < right) {
-                int sum = nums[i] + nums[left] + nums[right];
-                if (sum > 0) {
-                    right--;
-                } else if (sum < 0) {
-                    left++;
-                } else {
-                    res.add(Arrays.asList(nums[i], nums[left], nums[right]));
-                    left++;
-                    right--;
-                    while (left < right && nums[left] == nums[left - 1]) {
-                        left++;
-                    }
-                    while (left < right && nums[right] == nums[right + 1]) {
-                        right--;
-                    }
-                }
-            }
-        }
-        return res;
-    }
-}
\ No newline at end of file
diff --git a/seeker/snippet/Dockerfile b/seeker/snippet/Dockerfile
deleted file mode 100644
index d05ab528..00000000
--- a/seeker/snippet/Dockerfile
+++ /dev/null
@@ -1,19 +0,0 @@
-#date: 2024-08-14T18:12:56Z
-#url: https://api.github.com/gists/80c8d32519a4bcb1d59608d0651a8981
-#owner: https://api.github.com/users/tuffacton
-
-# Add kubectl to the Docker container
-FROM alpine:latest AS kubectl
-
-# Specify the kubectl version to download
-ARG KUBECTL_VERSION=v1.28.2
-
-# Install kubectl
-RUN apk add --no-cache curl && \
-    curl -LO "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/$(uname -m)/kubectl" && \
-    chmod +x kubectl && \
-    mv kubectl /usr/local/bin/
-
-# Copy kubectl into the main image
-FROM moby/buildkit:master-rootless AS final-image
-COPY --from=kubectl /usr/local/bin/kubectl /usr/local/bin/kubectl
\ No newline at end of file
diff --git a/seeker/snippet/acp.sh b/seeker/snippet/acp.sh
deleted file mode 100644
index c49cfc16..00000000
--- a/seeker/snippet/acp.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#date: 2024-08-14T18:25:13Z
-#url: https://api.github.com/gists/65d429bf9eec149e030ad7d53317f5a1
-#owner: https://api.github.com/users/z8leo
-
-#!/bin/bash
-
-# Automate git add, commit, and push with a single command using a simple git alias
-
-git config --global alias.acp '!f() { git add -A && git commit -m "${1:-changes}" && git push; }; f'
-
-# Example usage:
-# git acp "custom commit message"
-# git acp                          # Applies "changes" as the default commit message
-
-# Alias for add and commit only
-# In some cases, I do not want to push. The alias for this case is:
-git config --global alias.ac '!f() { git add -A && git commit -m "${1:-changes}"; }; f'
-
-# Example usage:
-# git ac "custom commit message"
-# git ac                           # Applies "changes" as the default commit message
\ No newline at end of file
diff --git a/seeker/snippet/aws_s3_bucket_object_setup_and_retrieval.sh b/seeker/snippet/aws_s3_bucket_object_setup_and_retrieval.sh
deleted file mode 100644
index b7f8d9a1..00000000
--- a/seeker/snippet/aws_s3_bucket_object_setup_and_retrieval.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-#date: 2024-08-14T18:13:30Z
-#url: https://api.github.com/gists/e078984ee9953dca2618e39f4e01d9df
-#owner: https://api.github.com/users/terrancedejesus
-
-#!/bin/bash
-
-# Disable AWS CLI pager
-export AWS_PAGER=""
-
-# Step 1: Define variables
-bucket_name="test-vulnerable-access-bucket-$(date +%s)"
-region="us-east-1" # Update if in a different region
-object_prefix="sensitive-file"
-num_objects=5
-num_access_attempts=10
-
-# Step 2: Create the S3 bucket
-echo "[+] Creating a new S3 bucket: $bucket_name"
-if [ "$region" == "us-east-1" ]; then
-  aws s3api create-bucket --bucket $bucket_name --region $region
-else
-  aws s3api create-bucket --bucket $bucket_name --region $region --create-bucket-configuration LocationConstraint=$region
-fi
-
-if [ $? -ne 0 ]; then
-  echo "[-] Error: Could not create S3 bucket."
-  exit 1
-fi
-
-echo "[+] S3 bucket '$bucket_name' created."
-
-# Step 2b: Disable Block Public Access settings (if enabled)
-echo "[+] Disabling S3 Block Public Access settings for the bucket."
-aws s3api put-public-access-block --bucket $bucket_name --public-access-block-configuration BlockPublicAcls=false,IgnorePublicAcls=false,BlockPublicPolicy=false,RestrictPublicBuckets=false
-
-if [ $? -ne 0 ]; then
-  echo "[-] Error: Could not disable Block Public Access settings."
-  exit 1
-fi
-echo "[+] Block Public Access settings disabled."
-
-# Step 3: Configure bucket policy to allow public access
-echo "[+] Configuring bucket policy to allow public access."
-aws s3api put-bucket-policy --bucket $bucket_name --policy '{
-  "Version": "2012-10-17",
-  "Statement": [
-    {
-      "Effect": "Allow",
-      "Principal": "*",
-      "Action": "s3:GetObject",
-      "Resource": "arn:aws:s3:::'"$bucket_name"'/*"
-    }
-  ]
-}'
-
-if [ $? -ne 0 ]; then
-  echo "[-] Error: Could not set bucket policy."
-  exit 1
-fi
-
-echo "[+] Bucket policy configured for public access."
-
-# Step 4: Upload fake sensitive objects to the bucket
-echo "[+] Uploading fake sensitive objects."
-for i in $(seq 1 $num_objects); do
-  echo "This is sensitive content $i" > "${object_prefix}-${i}.txt"
-  aws s3 cp "${object_prefix}-${i}.txt" "s3://$bucket_name/"
-  if [ $? -ne 0 ]; then
-    echo "[-] Error: Could not upload object ${object_prefix}-${i}.txt."
-    exit 1
-  fi
-  echo "[+] Uploaded ${object_prefix}-${i}.txt"
-done
-
-# Step 5: Access the objects publicly in rapid succession
-echo "[+] Publicly accessing objects in rapid succession."
-for i in $(seq 1 $num_access_attempts); do
-  for j in $(seq 1 $num_objects); do
-    curl -s "https://${bucket_name}.s3.${region}.amazonaws.com/${object_prefix}-${j}.txt" > /dev/null
-    echo "[+] Accessed ${object_prefix}-${j}.txt (Attempt $i)"
-  done
-done
-
-# Step 6: Cleanup - Remove objects and the S3 bucket
-echo "[+] Cleaning up - Removing objects and the S3 bucket."
-
-# Remove the public bucket policy before deletion
-aws s3api delete-bucket-policy --bucket $bucket_name
-
-# Delete the objects
-for i in $(seq 1 $num_objects); do
-  aws s3 rm "s3://$bucket_name/${object_prefix}-${i}.txt"
-  if [ $? -ne 0 ]; then
-    echo "[-] Error: Could not delete object ${object_prefix}-${i}.txt."
-    exit 1
-  fi
-  rm "${object_prefix}-${i}.txt"
-  echo "[+] Deleted ${object_prefix}-${i}.txt"
-done
-
-# Delete the bucket
-aws s3api delete-bucket --bucket $bucket_name --region $region
-
-if [ $? -ne 0 ]; then
-  echo "[-] Error: Could not delete S3 bucket."
-  exit 1
-fi
-
-echo "[+] S3 bucket '$bucket_name' and objects deleted. Cleanup complete."
diff --git a/seeker/snippet/bybit_get_historical_kline.py b/seeker/snippet/bybit_get_historical_kline.py
deleted file mode 100644
index bd697eac..00000000
--- a/seeker/snippet/bybit_get_historical_kline.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#date: 2024-08-14T18:11:03Z
-#url: https://api.github.com/gists/33f40cc8ea790877d36bb7e546ded798
-#owner: https://api.github.com/users/abhijiths101
-
-# this code is based on get_historical_data() from the python-binance module
-# https://github.com/sammchardy/python-binance
-# it also requires pybybit.py available from this page
-# https://note.mu/mtkn1/n/n9ef3460e4085
-# (where pandas & websocket-client are needed)
-
-import time
-import dateparser
-import pytz
-import json
-import csv
-import pandas as pd
-from datetime import datetime
-
-
-def get_historical_klines(symbol, interval, start_str, end_str=None):
-    """Get Historical Klines from Bybit
-
-    See dateparse docs for valid start and end string formats http://dateparser.readthedocs.io/en/latest/
-
-    If using offset strings for dates add "UTC" to date string e.g. "now UTC", "11 hours ago UTC"
-
-    :param symbol: Name of symbol pair -- BTCUSD, ETCUSD, EOSUSD, XRPUSD
-    :type symbol: str
-    :param interval: Bybit Kline interval -- 1 3 5 15 30 60 120 240 360 720 "D" "M" "W" "Y"
-    :type interval: str
-    :param start_str: Start date string in UTC format
-    :type start_str: str
-    :param end_str: optional - end date string in UTC format
-    :type end_str: str
-
-    :return: list of OHLCV values
-
-    """
-
-    # set parameters for kline()
-    timeframe = str(interval)
-    limit = 200
-    start_ts = int(date_to_milliseconds(start_str)/1000)
-    end_ts = None
-    if end_str:
-        end_ts = int(date_to_milliseconds(end_str)/1000)
-    else:
-        end_ts = int(date_to_milliseconds('now')/1000)
-
-
-    # init our list
-    output_data = []
-
-    # loop counter
-    idx = 0
-    # it can be difficult to know when a symbol was listed on Bybit, so allow the start time to be before the listing date
-    symbol_existed = False
-    while True:
-        # fetch the klines from start_ts up to max 200 entries
-        temp_dict = bybit.kline(symbol=symbol, interval=timeframe, _from=start_ts, limit=limit)
-
-        # handle the case where our start date is before the symbol pair was listed
-        if not symbol_existed and len(temp_dict):
-            symbol_existed = True
-
-        if symbol_existed:
-            # extract data and convert to list
-            temp_data = [list(i.values())[2:] for i in temp_dict['result']]
-            # append this loop's data to our output data
-            output_data += temp_data
-
-            # update our start timestamp using the last value in the array and add the interval timeframe
-            # NOTE: current implementation ignores interval of D/W/M/Y for now
-            start_ts = temp_data[len(temp_data) - 1][0] + interval*60
-
-        else:
-            # it wasn't listed yet, increment our start date by one interval
-            start_ts += interval*60
-
-        idx += 1
-        # check if we received less than the required limit and exit the loop
-        if len(temp_data) < limit:
-            # exit the while loop
-            break
-
-        # sleep after every 3rd call to be kind to the API
-        if idx % 3 == 0:
-            time.sleep(0.2)
-
-    return output_data
-
-def get_historical_klines_pd(symbol, interval, start_str, end_str=None):
-    """Get Historical Klines from Bybit
-
-    See dateparse docs for valid start and end string formats
-    http://dateparser.readthedocs.io/en/latest/
-
-    If using offset strings for dates add "UTC" to date string
-    e.g. "now UTC", "11 hours ago UTC"
-
-    :param symbol: Name of symbol pair -- BTCUSD, ETCUSD, EOSUSD, XRPUSD
-    :type symbol: str
-    :param interval: Bybit Kline interval -- 1 3 5 15 30 60 120 240 360 720 "D" "M" "W" "Y"
-    :type interval: str
-    :param start_str: Start date string in UTC format
-    :type start_str: str
-    :param end_str: optional - end date string in UTC format
-    :type end_str: str
-
-    :return: pandas DataFrame of OHLCV values
-
-    """
-
-    # set parameters for kline()
-    timeframe = str(interval)
-    limit = 200
-    start_ts = int(date_to_milliseconds(start_str)/1000)
-    end_ts = None
-    if end_str:
-        end_ts = int(date_to_milliseconds(end_str)/1000)
-    else:
-        end_ts = int(date_to_milliseconds('now')/1000)
-
-
-    # init our list
-    output_data = []
-
-    # loop counter
-    idx = 0
-    # it can be difficult to know when a symbol was listed on Bybit, so allow the start time to be before the listing date
-    symbol_existed = False
-    while True:
-        # fetch the klines from start_ts up to max 200 entries
-        temp_dict = bybit.kline(symbol=symbol, interval=timeframe, _from=start_ts, limit=limit)
-
-        # handle the case where our start date is before the symbol pair was listed
-        if not symbol_existed and len(temp_dict):
-            symbol_existed = True
-
-        if symbol_existed:
-            # extract data and convert to list
-            temp_data = [list(i.values())[2:] for i in temp_dict['result']]
-            # append this loop's data to our output data
-            output_data += temp_data
-
-            # update our start timestamp using the last value in the array and add the interval timeframe
-            # NOTE: current implementation does not support interval of D/W/M/Y
-            start_ts = temp_data[len(temp_data) - 1][0] + interval*60
-
-        else:
-            # it wasn't listed yet, increment our start date by one interval
-            start_ts += interval*60
-
-        idx += 1
-        # check if we received less than the required limit and exit the loop
-        if len(temp_data) < limit:
-            # exit the while loop
-            break
-
-        # sleep after every 3rd call to be kind to the API
-        if idx % 3 == 0:
-            time.sleep(0.2)
-
-    # convert to data frame
-    df = pd.DataFrame(output_data, columns=['TimeStamp', 'Open', 'High', 'Low', 'Close', 'Volume', 'TurnOver'])
-    df['Date'] = [datetime.fromtimestamp(i).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] for i in df['TimeStamp']]
-
-    return df
-
diff --git a/seeker/snippet/cached_method.py b/seeker/snippet/cached_method.py
deleted file mode 100644
index e55043f3..00000000
--- a/seeker/snippet/cached_method.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#date: 2024-08-14T18:14:50Z
-#url: https://api.github.com/gists/99daf49b001c0c98eb3058f5f2efdcd0
-#owner: https://api.github.com/users/ktbarrett
-
-from functools import update_wrapper, wraps
-
-class cached_method:
-
-    def __init__(self, method):
-        self._method = method
-        update_wrapper(self, method)
-
-    def __get__(self, instance, objtype=None):
-        if instance is None:
-            return self
-
-        cache = {}
-
-        @wraps(self._method)
-        def lookup(*args, **kwargs):
-            key = (args, tuple(kwargs.items()))
-            try:
-                return cache[key]
-            except KeyError:
-                res = self._method(instance, *args, **kwargs)
-                cache[key] = res
-                return res
-
-        setattr(instance, self._method.__name__, lookup)
-        return lookup
\ No newline
at end of file diff --git a/seeker/snippet/dump_git.py b/seeker/snippet/dump_git.py deleted file mode 100644 index a1de27b2..00000000 --- a/seeker/snippet/dump_git.py +++ /dev/null @@ -1,715 +0,0 @@ -#date: 2024-08-14T18:11:17Z -#url: https://api.github.com/gists/4f6365cbf223b208845bd236f808a8b4 -#owner: https://api.github.com/users/s3rgeym - -#!/usr/bin/env python -# https://git-scm.com/book/ru/v2/Git-изнутри-Протоколы-передачи-данных -from __future__ import annotations - -import argparse -import asyncio -import contextlib -import logging -import pathlib -import re -import shlex -import shutil -import subprocess -import sys -import tempfile -import zlib -from collections import Counter -from concurrent.futures import ProcessPoolExecutor -from io import BytesIO -from typing import ( - AsyncIterator, - Sequence, - TextIO, -) - -import aiohttp -import yarl - -sys.path.insert(0, str(pathlib.Path(__file__).parent)) - -from git_index import GitIndex - -__author__ = "Sergey M" -__version__ = "1.0.0" - -IMAGE_EXTENSIONS = ( - ".bmp", - ".gif", - ".heic", - ".ico", - ".jpeg", - ".jpg", - ".png", - ".svg", - ".tif", - ".tiff", - ".webp", -) - -VIDEO_EXTENSIONS = ( - ".3gp", - ".avi", - ".flv", - ".m4v", - ".mkv", - ".mov", - ".mp4", - ".mpeg", - ".mpg", - ".webm", - ".wmv", -) - -AUDIO_EXTENSIONS = ( - ".aac", - ".aiff", - ".alac", - ".flac", - ".m4a", - ".mp3", - ".ogg", - ".wav", - ".wma", -) - -DOCUMENT_EXTENSIONS = ( - # ".doc", - # ".docx", - # ".md", - ".odp", - ".ods", - ".odt", - ".pdf", - ".pot", - ".ppt", - ".pptx", - ".psd", - ".rtf", - ".ai", - ".sketch", - # в них пароли хранятся - # ".txt", - # ".xls", - # ".xlsx", -) - -FONT_EXTENSIONS = (".ttf", ".otf", ".woff", ".woff2", ".eot") - -WEB_EXTENSIONS = ( - ".htm", - ".html", - ".css", - ".less", - ".scss", - ".sass", - ".pug", - # иногда серверный код находит - ".js", - ".jsx", - ".ts", - ".tsx", - ".vue", - ".map", - ".tpl", - ".webmanifest", - ".swf", -) - -TRANSLATION_EXTENSIONS = (".po", ".mo") - -EXECUTABLE_EXTENSIONS = ( - ".exe", - ".dll", - ".msi", - ".apk", - ".bin", - ".crx", -) - -DYNAMIC_CONTENT_EXTENSIONS = ( - ".php", - ".jsp", - ".aspx", -) - -ALL_EXTENSIONS = ( - IMAGE_EXTENSIONS - + VIDEO_EXTENSIONS - + AUDIO_EXTENSIONS - + DOCUMENT_EXTENSIONS - + FONT_EXTENSIONS - + WEB_EXTENSIONS - + TRANSLATION_EXTENSIONS - + EXECUTABLE_EXTENSIONS - + DYNAMIC_CONTENT_EXTENSIONS -) - -# FORCE_DOWNLOAD = ("robots.txt",) - -COMMON_FILES = [ - "COMMIT_EDITMSG", - "description", - "FETCH_HEAD", - "HEAD", - "index", - "info/exclude", - "info/refs", - "logs/HEAD", - "objects/info/packs", - "ORIG_HEAD", - "packed-refs", - "refs/remotes/origin/HEAD", - # "hooks/applypatch-msg", - # "hooks/commit-msg", - # "hooks/fsmonitor-watchman", - # "hooks/post-update", - # "hooks/pre-applypatch", - # "hooks/pre-commit", - # "hooks/pre-merge-commit", - # "hooks/pre-push", - # "hooks/pre-rebase", - # "hooks/pre-receive", - # "hooks/prepare-commit-msg", - # "hooks/push-to-checkout", - # "hooks/sendemail-validate", - # "hooks/update", -] - -HTML_RE = re.compile(rb"\s*<[!a-zA-Z]") -LINK_RE = re.compile(b' str: - message = self._fmt.format(record) - return f"{self._level_colors[record.levelname]}{message}{ANSI.RESET}" - - -logger = logging.getLogger(__name__) - - -class NameSpace(argparse.Namespace): - input: TextIO - output: pathlib.Path - git_folder: str - workers: int - user_agent: str - timeout: float - download_all: bool - force_download: bool - host_error: int - probe_size: int - verbosity: int - - -def parse_args( - argv: Sequence[str] | None = None, 
-) -> tuple[argparse.ArgumentParser, NameSpace]: - parser = argparse.ArgumentParser( - description="Dump exposed .git repositories.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "-i", - "--input", - type=argparse.FileType(), - default="-", - help="File with URLs to process (default: standard input).", - ) - parser.add_argument( - "-o", - "--output", - type=pathlib.Path, - default=pathlib.Path.cwd() / "dumps", - help="Directory to save downloaded files.", - ) - parser.add_argument( - "--git-folder", - default=GIT_FOLDER, - help="git folder", - ) - parser.add_argument( - "-w", - "--workers", - type=int, - default=10, - help="Number of asynchronous worker tasks.", - ) - parser.add_argument( - "-u", - "--user-agent", - default=DEFAULT_USER_AGENT, - help="User-Agent for HTTP requests.", - ) - parser.add_argument( - "-t", - "--timeout", - type=float, - default=60.0, - help="Timeout.", - ) - parser.add_argument( - "-a", - "--download-all", - "--all", - action="store_true", - default=False, - help="Download all files, including those usually skipped.", - ) - parser.add_argument( - "-f", - "--force-download", - action="store_true", - default=False, - help="Force download and overwrite existinbg files.", - ) - parser.add_argument( - "-e", - "--host-error", - type=int, - default=-1, - help="Number of maximum errors per host.", - ) - parser.add_argument( - "-s", - "--probe-size", - "--probe", - type=int, - default=PROBE_SIZE, - help="Probe size limit.", - ) - parser.add_argument( - "-v", - "--verbosity", - help="Be more verbosity.", - action="count", - default=0, - ) - parser.add_argument( - "--version", action="version", version=f"%(prog)s {__version__}" - ) - return parser, parser.parse_args(argv) - - -async def main(argv: Sequence[str] | None = None) -> None: - parser, args = parse_args(argv) - - if not re.fullmatch(r"[^\\/:\*\?\|<>]+", args.git_folder): - parser.error("invalid git folder name") - - logger.setLevel( - max(logging.DEBUG, logging.WARNING - logging.DEBUG * args.verbosity) - ) - - logger.addHandler(ColorHandler()) - - urls = set(map(normalize_url, filter(None, map(str.strip, args.input)))) - - queue = asyncio.Queue() - seen = set() - host_error = Counter() - executor = ProcessPoolExecutor() - - async with get_session(args) as session: - await asyncio.gather( - process_queue(queue, urls, args), - *( - worker(session, queue, seen, host_error, executor, args) - for _ in range(args.workers) - ), - ) - - for git_path, result in executor.map( - restore_repo, - (url2path(url.joinpath(args.git_folder), args.output) for url in urls), - ): - if result: - logger.info(f"Git repo restored: {git_path}") - else: - logger.error(f"Can't restore git repo: {git_path}") - - logger.info("finished!") - - -def restore_repo(git_path: pathlib.Path) -> tuple[pathlib.Path, bool]: - work_tree = git_path.parent - temp_dir = tempfile.mkdtemp(dir=work_tree.parent) - - try: - # скаченные файлы переместим во временный каталог - for item in work_tree.iterdir(): - # Не трогаем .git - if git_path.name != item.name: - shutil.move(item, temp_dir) - - # выполним команду git checkout, которая восстановит файлы из - # репозитория (и удалила бы все которых там нет) - - # можно ли использовать git stash для того чтобы спрятать скаченные - # файлы, а потом восстановить? - cmd = ( - f"git --git-dir={shlex.quote(str(git_path))}" - f" --work-tree={shlex.quote(str(work_tree))} checkout ." 
- ) - - subprocess.check_call(cmd, shell=True) - return git_path, True - except subprocess.CalledProcessError: - return git_path, False - finally: - # перемещаем скаченные файлы обратно - sync_directories(temp_dir, work_tree) - - # удалим временный каталог - shutil.rmtree(temp_dir) - - -def sync_directories( - src_dir: str | pathlib.Path, - dest_dir: str | pathlib.Path, -) -> None: - src_dir = pathlib.Path(src_dir) - dest_dir = pathlib.Path(dest_dir) - - if not dest_dir.exists(): - dest_dir.mkdir(parents=True, exist_ok=True) - - for src_path in src_dir.rglob("*"): - relative_path = src_path.relative_to(src_dir) - dest_path = dest_dir / relative_path - - if src_path.is_dir(): - dest_path.mkdir(parents=True, exist_ok=True) - else: - if dest_path.exists(): - dest_path.unlink() - shutil.copy2(src_path, dest_path) - - -@contextlib.asynccontextmanager -async def get_session(args: NameSpace) -> AsyncIterator[aiohttp.ClientSession]: - tmt = aiohttp.ClientTimeout(total=args.timeout) - con = aiohttp.TCPConnector(ssl=False, limit=None) - async with aiohttp.ClientSession(connector=con, timeout=tmt) as session: - session.headers.update( - { - "User-Agent": args.user_agent, - "Accept": "*/*", - "Accept-Language": "en-US,en", - } - ) - yield session - - -async def process_queue( - queue: asyncio.Queue[QueueItem], - urls: set[yarl.URL], - args: NameSpace, -) -> None: - for url in urls: - # Проверим сначала на листинг директорий - # Без слеша в конце перенаправит с 301 на адрес со слешем - await queue.put((url, args.git_folder + "/")) - - await queue.put((url, ".gitignore")) - - for item in COMMON_FILES: - await queue.put((url, args.git_folder + "/" + item)) - - await queue.join() - - for _ in range(args.workers): - queue.put_nowait((None, None)) - - -QueueItem = tuple[yarl.URL | None, str | None, bool | None] - - -async def worker( - session: aiohttp.ClientSession, - queue: asyncio.Queue[QueueItem], - seen: set[yarl.URL], - host_error: Counter, - executor: ProcessPoolExecutor, - args: NameSpace, -) -> None: - task_name = asyncio.current_task().get_name() - logger.debug(f"task started: {task_name}") - - while True: - try: - base_url, path = await queue.get() - - if base_url is None: - break - - if ( - args.host_error > 0 - and host_error[base_url.host] >= args.host_error - ): - logger.warning( - f"maximum host connection errors exceeded: {base_url.host}" - ) - continue - - target_url = base_url.joinpath(path) - - if target_url in seen: - logger.warning(f"already seen: {target_url}") - continue - - logger.debug(f"get: {target_url}") - response = await session.get(target_url, allow_redirects=False) - - seen.add(target_url) - - log_message = f"{response.status} - {response.url}" - if response.status != 200: - logger.warning(log_message) - continue - - logger.info(log_message) - - contents = await response.content.read(args.probe_size) - - if await check_html(contents, response, base_url, path, queue): - logger.debug(f"html in response: {response.url}") - continue - - if path == ".gitignore": - await handle_gitignore(contents, base_url, queue) - elif path.startswith(args.git_folder): - await handle_git( - contents, response, base_url, path, queue, executor, args - ) - - await save_file(contents, response, args) - except (aiohttp.ClientError, asyncio.TimeoutError): - logger.warning(f"connection error: {base_url.host}") - host_error[base_url.host] += 1 - except Exception as ex: - logger.error(ex) - finally: - queue.task_done() - - logger.debug(f"task finished: {task_name}") - - -async def check_html( - contents: 
bytes, - response: aiohttp.ClientResponse, - base_url: yarl.URL, - path: str, - queue: asyncio.Queue[QueueItem], -) -> bool: - # тут обрабатываются два случая: - # 1. - проверка листинга - # 2. - перехват 404-ой - if not HTML_RE.match(contents): - return False - - if b"Index of /" in contents: - logger.debug(f"directory listing detected: {response.url}") - for link in parse_links(contents): - # <a href="?C=N;O=D">Name</a> - # <a href="?C=M;O=A">Last modified</a> - # ... - if "?" not in link: - logger.debug(f"add link: {link}") - await queue.put( - ( - base_url, - path.rstrip("/") + "/" + link.lstrip("/"), - ) - ) - - return True - - -async def handle_gitignore( - contents: bytes, - base_url: str, - queue: asyncio.Queue, -) -> None: - # https://git-scm.com/docs/gitignore/en - lines = contents.decode(errors="ignore").splitlines() - - for item in lines: - # символы, которые используются в шаблонах - # https://www.php.net/manual/en/function.fnmatch.php - if not re.fullmatch(r"[^?\[\]*]+", item): - continue - - item = item.lstrip("/") - - if not item.lower().endswith(DYNAMIC_CONTENT_EXTENSIONS): - await queue.put((base_url, item)) - - -async def handle_git( - contents: bytes, - response: aiohttp.ClientResponse, - base_url: str, - path: str, - queue: asyncio.Queue[QueueItem], - executor: ProcessPoolExecutor, - args: NameSpace, -) -> None: - if path.endswith("/index"): - for entry in GitIndex.parse(BytesIO(contents)): - logger.debug( - f"found entry in {response.url}: {entry.sha1} => {entry.filename}" - ) - await queue.put( - ( - base_url, - args.git_folder + "/" + hash2path(entry.sha1), - ) - ) - - lower_filename = entry.filename.lower() - - if ( - args.download_all - and lower_filename.endswith(DYNAMIC_CONTENT_EXTENSIONS) - ) or ( - not args.download_all - and lower_filename.endswith(ALL_EXTENSIONS) - ): - continue - - # пробуем скачать файл - await queue.put((base_url, entry.filename.lstrip("/"))) - - elif OBJECTS_PATH_RE.search(path): - decompressed = await asyncio.get_event_loop().run_in_executor( - executor, - decompress, - contents, - ) - - if decompressed.startswith((b"commit", b"tree")): - for hash in parse_hashes(decompressed): - logger.debug(f"found hash in {response.url}: {hash}") - await queue.put( - ( - base_url, - args.git_folder + "/" + hash2path(hash), - ) - ) - else: - # Мне лень разбираться в куче форматов файлов - for ref in parse_refs(contents): - await queue.put((base_url, args.git_folder + "/" + ref)) - - for hash in parse_hashes(contents): - await queue.put((base_url, args.git_folder + "/" + hash2path(hash))) - - for pack in parse_packs(contents): - for ext in ("pack", "idx"): - await queue.put( - ( - base_url, - f"{args.git_folder}/objects/pack/{pack}.{ext}", - ) - ) - - -async def save_file( - contents: bytes, - response: aiohttp.ClientResponse, - args: NameSpace, -) -> None: - save_path = url2path(response.url, args.output) - - if save_path.exists() and not args.force_download: - logger.warning(f"skip existing file: {save_path}") - return - - save_path.parent.mkdir(parents=True, exist_ok=True) - - with save_path.open("wb") as fp: - fp.write(contents) - async for chunk in response.content.iter_chunked(1 << 16): - fp.write(chunk) - - logger.info(f"saved: {save_path}") - - -def decompress(data: bytes) -> bytes: - # zlib.decompress не поддерживает частичную декомпрессию - return zlib.decompressobj().decompress(data) - - -def parse_links(contents: bytes) -> list[str]: - return list(map(bytes.decode, LINK_RE.findall(contents))) - - -def parse_refs(contents: bytes) -> 
list[str]: - return list(map(bytes.decode, REFS_PATH_RE.findall(contents))) - - -def parse_hashes(contents: bytes) -> list[str]: - return list(map(bytes.decode, HASH_RE.findall(contents))) - - -def parse_packs(contents: bytes) -> list[str]: - return list(map(bytes.decode, PACK_RE.findall(contents))) - - -def hash2path(hash: str) -> str: - return f"objects/{hash[:2]}/{hash[2:]}" - - -def url2path(url: yarl.URL, base_path: pathlib.Path) -> pathlib.Path: - return base_path / url.host / url.path[1:] - - -def normalize_url(url: str) -> yarl.URL: - return yarl.URL(("https://", "")["://" in url] + url) - - -if __name__ == "__main__": - with contextlib.suppress(KeyboardInterrupt): - asyncio.run(main()) diff --git a/seeker/snippet/helpful-docker.sh b/seeker/snippet/helpful-docker.sh deleted file mode 100644 index ee70cba2..00000000 --- a/seeker/snippet/helpful-docker.sh +++ /dev/null @@ -1,19 +0,0 @@ -#date: 2024-08-14T18:28:09Z -#url: https://api.github.com/gists/fb89ef02d1b318f672eb731d7fd143f5 -#owner: https://api.github.com/users/nmarsceau - -# Bash Version - -# Shortcut for opening a new Bash shell in a container, if Bash is available. Otherwise, use /bin/sh. -docker-shell () { - container="$1" - shell='/bin/sh' - [ -n $(docker container exec $container which bash) ] && shell='/bin/bash' - docker container exec -it $container $shell -} -export -f docker-shell - -# Shortcuts for more readable `docker container ls` and `docker image ls` output. -alias dc-ls="docker container ls --format \"table {{.Names}}\t{{.ID}}\t{{.Image}}\t{{.Status}}\"" -alias dc-ls-a="docker container ls --all --format \"table {{.Names}}\t{{.ID}}\t{{.Image}}\t{{.Status}}\"" -alias di-ls="docker image ls --format \"table {{.Repository}}:{{.Tag}}\t{{.ID}}\t{{.Size}}\t{{.CreatedSince}}\"" diff --git a/seeker/snippet/number_of_words_in_git_repo.sh b/seeker/snippet/number_of_words_in_git_repo.sh deleted file mode 100644 index e6aa4e7c..00000000 --- a/seeker/snippet/number_of_words_in_git_repo.sh +++ /dev/null @@ -1,5 +0,0 @@ -#date: 2024-08-14T18:24:28Z -#url: https://api.github.com/gists/4528722d4ad3b910d25adca208934323 -#owner: https://api.github.com/users/wteuber - -git ls-files | xargs wc -w 2> /dev/null | ruby -e "puts ARGF.map{_1.scan(/^\s*(\d+)/)[0][0].to_i}.inject(&:+)" \ No newline at end of file diff --git a/seeker/snippet/s3_client.py b/seeker/snippet/s3_client.py deleted file mode 100644 index 4f90ab79..00000000 --- a/seeker/snippet/s3_client.py +++ /dev/null @@ -1,85 +0,0 @@ -#date: 2024-08-14T18:26:48Z -#url: https://api.github.com/gists/ae93222a1e9087df10f92eafe86bc182 -#owner: https://api.github.com/users/feudor - -import asyncio -from contextlib import asynccontextmanager - -from aiobotocore.session import get_session -from botocore.exceptions import ClientError - - -class S3Client: - def __init__( - self, - access_key: "**********" - secret_key: "**********" - endpoint_url: str, - bucket_name: str, - ): - self.config = { - "aws_access_key_id": "**********" - "aws_secret_access_key": "**********" - "endpoint_url": endpoint_url, - } - self.bucket_name = bucket_name - self.session = get_session() - - @asynccontextmanager - async def get_client(self): - async with self.session.create_client("s3", **self.config) as client: - yield client - - async def upload_file( - self, - file_path: str, - ): - object_name = file_path.split("/")[-1] # /users/artem/cat.jpg - try: - async with self.get_client() as client: - with open(file_path, "rb") as file: - await client.put_object( - Bucket=self.bucket_name, - Key=object_name, 
- Body=file, - ) - print(f"File {object_name} uploaded to {self.bucket_name}") - except ClientError as e: - print(f"Error uploading file: {e}") - - async def delete_file(self, object_name: str): - try: - async with self.get_client() as client: - await client.delete_object(Bucket=self.bucket_name, Key=object_name) - print(f"File {object_name} deleted from {self.bucket_name}") - except ClientError as e: - print(f"Error deleting file: {e}") - - async def get_file(self, object_name: str, destination_path: str): - try: - async with self.get_client() as client: - response = await client.get_object(Bucket=self.bucket_name, Key=object_name) - data = await response["Body"].read() - with open(destination_path, "wb") as file: - file.write(data) - print(f"File {object_name} downloaded to {destination_path}") - except ClientError as e: - print(f"Error downloading file: {e}") - - -async def main(): - s3_client = S3Client( - access_key= "**********" - secret_key= "**********" - endpoint_url="", # для Selectel используйте https://s3.storage.selcloud.ru - bucket_name="", - ) - - # Проверка, что мы можем загрузить, скачать и удалить файл - await s3_client.upload_file("test.txt") - await s3_client.get_file("test.txt", "text_local_file.txt") - await s3_client.delete_file("test.txt") - - -if __name__ == "__main__": - asyncio.run(main())