From 5322356ced317c8d5515427c1d8916c0cd6eeb2b Mon Sep 17 00:00:00 2001 From: hdoerner Date: Thu, 15 Oct 2020 07:15:21 +0200 Subject: [PATCH] Add torrent search script --- Python/torrent_search/LICENSE | 21 ++ Python/torrent_search/README.md | 47 ++++ Python/torrent_search/requirements.txt | 2 + Python/torrent_search/torrent_search.py | 306 ++++++++++++++++++++++++ 4 files changed, 376 insertions(+) create mode 100644 Python/torrent_search/LICENSE create mode 100644 Python/torrent_search/README.md create mode 100644 Python/torrent_search/requirements.txt create mode 100755 Python/torrent_search/torrent_search.py diff --git a/Python/torrent_search/LICENSE b/Python/torrent_search/LICENSE new file mode 100644 index 0000000..18c645a --- /dev/null +++ b/Python/torrent_search/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Holger Dörner + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/Python/torrent_search/README.md b/Python/torrent_search/README.md new file mode 100644 index 0000000..d46a073 --- /dev/null +++ b/Python/torrent_search/README.md @@ -0,0 +1,47 @@ +# torrent_search.py +A torrent-meta-searcher + +This script searches (scrapes) on multiple Torrent sites for supplied keywords and returns the results grouped by categories as `JSON`. + +Currently, the following sites are queried: +- thepiratebay10.org +- 1337x.to +- torlock.com + +## Requirements and Dependencies +Recommendet Python-Version +- `3.5+` + +The only external dependency are +- `BeautifulSoap` +- `aiohttp` + +## Installation +Install the required external dependencies with + +```shell +$ pip3 install -r requirements.txt +``` + +Also, a `chmod +x torrent_search.py` *can* be needed to make the script executable, otherwise You have to call it with `python3` (see [Usage Examples](#usage-examples)) + +## Usage examples +Getting help: +```shell +$ (python3) torrent_search.py -h +``` + +Perform a simple search: +```shell +$ (python3) torrent_search.py doom +``` + +Filter results with at least 20 seeders for `big bang theory`: +```shell +$ (python3) torrent_search.py "big bang theory" -s 20 +``` + +Filter by categories: +```shell +$ (python3) torrent_search.py doom -c Movies Games +``` diff --git a/Python/torrent_search/requirements.txt b/Python/torrent_search/requirements.txt new file mode 100644 index 0000000..fc9b498 --- /dev/null +++ b/Python/torrent_search/requirements.txt @@ -0,0 +1,2 @@ +BeautifulSoup4 +aiohttp \ No newline at end of file diff --git a/Python/torrent_search/torrent_search.py b/Python/torrent_search/torrent_search.py new file mode 100755 index 0000000..29e56eb --- /dev/null +++ b/Python/torrent_search/torrent_search.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 + +import asyncio +import json +import argparse +from aiohttp import ClientSession, ClientError +from aiohttp.http_exceptions import HttpProcessingError +from urllib.parse import quote +from collections import defaultdict +from re import sub +from bs4 import BeautifulSoup + +results = defaultdict(list) + +urlList = { + 'thepiratebay': 'https://thepiratebay10.org/search/{query}/{page}/99/0', + '1337x': 'https://1337x.to/search/{query}/{page}/', + 'torlock': 'https://www.torlock.com/all/torrents/{query}/{page}.html' +} + +category_list = { + 'anime': 'Anime', + 'applications': 'Applications', + 'documentaries': 'Documentaries', + 'games': 'Games', + 'movies': 'Movies', + 'music': 'Music', + 'other': 'Other', + 'television': 'Television', + 'xxx': 'XXX', + 'unknown': 'Unknown' +} + + +def to_object( + title: str = '', + url: str = '', + magnet_link: str = '', + seeder: int = None, + leecher: int = None +): + return { + 'title': title, + 'url': url, + 'magett-link': magnet_link, + 'seeder': seeder, + 'leecher': leecher + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description='Meta-Search-Script for torrents.' + ) + parser.add_argument('query', metavar='QUERY', type=str, + help='the search term') + parser.add_argument('-s', '--seeder-limit', dest='seeder_limit', type=int, + action='store', default=0, + help='minimum number of seeders') + parser.add_argument('-c', '--categories', dest='categories', + nargs='*', default=[], + help='filter by categories') + + args = parser.parse_args() + + for i in range(len(args.categories)): + args.categories[i] = args.categories[i].casefold() + + return args + + +async def send_query( + url: str, + session: ClientSession, + headers: dict = None +) -> str: + response = await session.request(method='GET', url=url, headers=headers) + response.raise_for_status() + return await response.text(errors='ignore') + + +async def parse_thepiratebay( + args: argparse.Namespace, + session: ClientSession +) -> None: + page = 1 + + while (True): + try: + html = await send_query(urlList['thepiratebay'].format_map({ + 'query': quote(args.query), + 'page': page}), + session + ) + except (ClientError, HttpProcessingError): + break + + soup = BeautifulSoup(html, 'html.parser') + + try: + rows = soup.find('div', id='main-content').find_all('tr')[1:-1] + except AttributeError: + break + + if len(rows) == 0: + break + + for row in rows: + columns = row.find_all('td') + category = sub(r'\W+', ' ', columns[0].find('a').text).strip() + category = category_list.get( + category.lower(), + category_list.get('unknown') + ) + title = sub( + '[\n\t]+', + ' ', + columns[1].find('div', class_='detName').text + ).strip() + url = columns[1].find('a', href=True)['href'] + magnet_link = columns[1].find_all('a')[1]['href'] + seeder = int(columns[2].text.strip()) + leecher = int(columns[3].text.strip()) + + if len(args.categories) > 0: + if category.casefold() not in args.categories: + continue + + if seeder < args.seeder_limit: + continue + + results[category].append(to_object( + title=title, + url=url, + magnet_link=magnet_link, + seeder=seeder, + leecher=leecher + )) + + page += 1 + + +async def parse_1337x( + args: argparse.Namespace, + session: ClientSession +) -> None: + category_mappings = { + '6': category_list.get('television'), + '9': category_list.get('documentaries'), + '10': category_list.get('games'), + '18': category_list.get('applications'), + '19': category_list.get('applications'), + '22': category_list.get('music'), + '23': category_list.get('music'), + '28': category_list.get('anime'), + '36': category_list.get('other'), + '41': category_list.get('television'), + '42': category_list.get('movies'), + '48': category_list.get('xxx'), + '50': category_list.get('other'), + '51': category_list.get('xxx'), + '52': category_list.get('other'), + '53': category_list.get('music'), + '56': category_list.get('applications'), + '59': category_list.get('music'), + '71': category_list.get('television'), + '75': category_list.get('television'), + '78': category_list.get('anime'), + '80': category_list.get('anime'), + } + + page = 1 + + while (True): + try: + html = await send_query(urlList['1337x'].format_map({ + 'query': quote(args.query), + 'page': page}), + session + ) + except (ClientError, HttpProcessingError): + break + + soup = BeautifulSoup(html, 'html.parser') + + try: + rows = soup.find('table', class_='table-list').find_all('tr')[1:] + except AttributeError: + break + + if len(rows) == 0: + break + + for row in rows: + columns = row.find_all('td') + category = columns[0].find_all('a')[0]['href'].split('/')[2] + category = category_mappings.get( + category, + category_list.get('unknown') + ) + title = columns[0].find_all('a')[1].text.strip() + url = 'https://1337x.to' + columns[0].find_all('a')[1]['href'] + seeder = int(columns[1].text.strip()) + leecher = int(columns[2].text.strip()) + + if len(args.categories) > 0: + if category.casefold() not in args.categories: + continue + + if seeder < args.seeder_limit: + continue + + results[category].append(to_object( + title=title, + url=url, + seeder=seeder, + leecher=leecher + )) + + page += 1 + + +async def parse_torlock(args, session: ClientSession) -> None: + category_mappings = { + 'tv0': category_list.get('other'), + 'tv1': category_list.get('movie'), + 'tv2': category_list.get('music'), + 'tv3': category_list.get('television'), + 'tv4': category_list.get('games'), + 'tv5': category_list.get('application'), + 'tv6': category_list.get('anime'), + 'tv7': category_list.get('xxx'), + 'tv8': category_list.get('other'), + 'tv9': category_list.get('other'), + 'tv10': category_list.get('television'), + 'tv12': category_list.get('other') + } + + page = 1 + + while (True): + try: + html = await send_query(urlList['torlock'].format_map({ + 'query': quote(args.query), + 'page': page}), + session + ) + except (ClientError, HttpProcessingError): + break + + soup = BeautifulSoup(html, 'html.parser') + + try: + rows = soup.find_all('table')[4].find_all('tr')[1:] + except AttributeError: + break + + if len(rows) == 0: + break + + for row in rows: + columns = row.find_all('td') + category = columns[0].find('span')['class'][0] + category = category_mappings.get( + category, + category_list.get('unknown') + ) + title = columns[0].find('a').text.strip() + url = 'https://torlock.to' + columns[0].find('a')['href'] + seeder = int(columns[3].text.strip()) + leecher = int(columns[4].text.strip()) + + if len(args.categories) > 0: + if category.casefold() not in args.categories: + continue + + if seeder < args.seeder_limit: + continue + + results[category].append(to_object( + title=title, + url=url, + seeder=seeder, + leecher=leecher + )) + + page += 1 + + +async def run(args): + task_list = [ + parse_thepiratebay, + parse_1337x, + parse_torlock + ] + + async with ClientSession() as session: + for i in range(len(task_list)): + task_list[i] = task_list[i](args, session) + await asyncio.gather(*task_list) + + +if __name__ == "__main__": + args = parse_args() + asyncio.run(run(args)) + print(json.dumps(results, indent=4))