From f9da41f8a4780c938959b57d609e7850c4db4304 Mon Sep 17 00:00:00 2001 From: Kirk Roerig Date: Mon, 12 Aug 2024 08:31:22 -0600 Subject: [PATCH 1/6] Parsing the index page and downloading latest taxonomy based on page contents --- multitax/silvatx.py | 4 ++-- multitax/utils.py | 22 +++++++++++++++++++++- pyproject.toml | 1 + 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/multitax/silvatx.py b/multitax/silvatx.py index 180187e..49fac36 100644 --- a/multitax/silvatx.py +++ b/multitax/silvatx.py @@ -1,10 +1,10 @@ from .multitax import MultiTax +from .utils import fuzzy_find_download_links import warnings class SilvaTx(MultiTax): - _default_urls = [ - "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"] + _default_urls = fuzzy_find_download_links("https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/", ".*tax_slv_ssu_.*.txt.gz$") def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/multitax/utils.py b/multitax/utils.py index 5771c56..8800788 100644 --- a/multitax/utils.py +++ b/multitax/utils.py @@ -3,10 +3,13 @@ import os import tarfile import urllib.request +import urllib.parse import zlib import warnings +import re from collections import OrderedDict from urllib.error import HTTPError +from bs4 import BeautifulSoup def check_dir(prefix: str): @@ -97,7 +100,7 @@ def join_check(elements, sep: str): def load_url_mem(url: str): - """ + """import Parameters: * **url** *[str]*: URL to load into memory @@ -171,4 +174,21 @@ def warning_on_one_line(message, category, filename, lineno, file=None, line=Non return '%s:%s: %s: %s\n' % (filename, lineno, category.__name__, message) +def fuzzy_find_download_links(url: str, regex_pattern: str): + """ + Parameters: + * **url** *[str]*: URL to load into memory + * **pattern** *[str]*: Link pattern to search for in the page + """ + page = urllib.request.urlopen(url) + o = urllib.parse.urlparse(url) + soup = BeautifulSoup(page, 'html.parser') + domain = url.split('/') + return [f'{o.scheme}://{o.netloc}/{a.attrs['href']}' for a in soup.find_all('a', attrs={'href' : re.compile(regex_pattern)})] + warnings.formatwarning = warning_on_one_line + + +if __name__ == "__main__": + links = fuzzy_find_download_links("https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/", ".*tax_slv_ssu_.*.txt.gz$") + print(links) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index b5a3c46..44e1a91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [build-system] requires = [ "setuptools>=42", + "beautifulsoup4==4.12.3", "wheel" ] build-backend = "setuptools.build_meta" \ No newline at end of file From e1ce6e6595670d0aacc2300c43f1d2f6730788bd Mon Sep 17 00:00:00 2001 From: Kirk Roerig Date: Mon, 12 Aug 2024 18:24:14 -0600 Subject: [PATCH 2/6] utils unit test --- tests/multitax/unit/test_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tests/multitax/unit/test_utils.py diff --git a/tests/multitax/unit/test_utils.py b/tests/multitax/unit/test_utils.py new file mode 100644 index 0000000..d5bffde --- /dev/null +++ b/tests/multitax/unit/test_utils.py @@ -0,0 +1,9 @@ +import unittest +from multitax.utils import fuzzy_find_download_link + + +class TestUtils(unittest.TestCase): + def fuzzy_find_download_link(self): + links = fuzzy_find_download_link("https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy") + print(links) + pass From e1b0d82e0607405ed4b8a4ecd52cbd4a31807e4b Mon Sep 17 00:00:00 2001 From: Kirk Roerig Date: Mon, 12 Aug 2024 19:16:51 -0600 Subject: [PATCH 3/6] Fuzzy link finder test --- multitax/utils.py | 6 ++++-- pyproject.toml | 1 - requirements.txt | 1 + setup.py | 6 ++++++ tests/multitax/unit/test_utils.py | 10 +++++----- 5 files changed, 16 insertions(+), 8 deletions(-) create mode 100644 requirements.txt diff --git a/multitax/utils.py b/multitax/utils.py index 8800788..18fe5b3 100644 --- a/multitax/utils.py +++ b/multitax/utils.py @@ -174,13 +174,15 @@ def warning_on_one_line(message, category, filename, lineno, file=None, line=Non return '%s:%s: %s: %s\n' % (filename, lineno, category.__name__, message) -def fuzzy_find_download_links(url: str, regex_pattern: str): +def fuzzy_find_download_links(url: str, regex_pattern: str, page=None): """ Parameters: * **url** *[str]*: URL to load into memory * **pattern** *[str]*: Link pattern to search for in the page + * **page** *[str]*: Optional page content to parse, primarily for unit testing """ - page = urllib.request.urlopen(url) + if page is None: + page = urllib.request.urlopen(url) o = urllib.parse.urlparse(url) soup = BeautifulSoup(page, 'html.parser') domain = url.split('/') diff --git a/pyproject.toml b/pyproject.toml index 44e1a91..b5a3c46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,6 @@ [build-system] requires = [ "setuptools>=42", - "beautifulsoup4==4.12.3", "wheel" ] build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5baed2b --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +beautifulsoup4==4.12.3 \ No newline at end of file diff --git a/setup.py b/setup.py index 025f351..ce59fa2 100755 --- a/setup.py +++ b/setup.py @@ -7,6 +7,11 @@ with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() +lib_folder = os.path.dirname(os.path.realpath(__file__)) +install_requires = [] +with open(f"{lib_folder}/requirements.txt") as f: + install_requires = f.read().splitlines() + setup( name="multitax", version="1.3.1", @@ -17,6 +22,7 @@ long_description=long_description, long_description_content_type="text/markdown", packages=["multitax"], + install_requires=install_requires, python_requires=">=3.4", classifiers=[ 'License :: OSI Approved :: MIT License', diff --git a/tests/multitax/unit/test_utils.py b/tests/multitax/unit/test_utils.py index d5bffde..1eadea4 100644 --- a/tests/multitax/unit/test_utils.py +++ b/tests/multitax/unit/test_utils.py @@ -1,9 +1,9 @@ import unittest -from multitax.utils import fuzzy_find_download_link +from multitax.utils import fuzzy_find_download_links class TestUtils(unittest.TestCase): - def fuzzy_find_download_link(self): - links = fuzzy_find_download_link("https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy") - print(links) - pass + def test_fuzzy_find_download_links(self): + registery_url = "https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/" + links = fuzzy_find_download_links(registery_url, ".*tax_slv_ssu_.*.txt.gz$") + self.assertTrue('https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_' in links[0]) From 984736d4fdb1c13a49f277d91f75d67e71fa4b3e Mon Sep 17 00:00:00 2001 From: Kirk Roerig Date: Mon, 12 Aug 2024 19:36:36 -0600 Subject: [PATCH 4/6] Compatibility for older pythons --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ce59fa2..598525c 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ lib_folder = os.path.dirname(os.path.realpath(__file__)) install_requires = [] -with open(f"{lib_folder}/requirements.txt") as f: +with open("{0}/requirements.txt".format(lib_folder)) as f: install_requires = f.read().splitlines() setup( From 77ec213c6052426ec4ad617ce3311794c188cd36 Mon Sep 17 00:00:00 2001 From: Kirk Roerig Date: Mon, 12 Aug 2024 19:54:38 -0600 Subject: [PATCH 5/6] Replace more f-strings --- multitax/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multitax/utils.py b/multitax/utils.py index 18fe5b3..f09ee92 100644 --- a/multitax/utils.py +++ b/multitax/utils.py @@ -186,7 +186,7 @@ def fuzzy_find_download_links(url: str, regex_pattern: str, page=None): o = urllib.parse.urlparse(url) soup = BeautifulSoup(page, 'html.parser') domain = url.split('/') - return [f'{o.scheme}://{o.netloc}/{a.attrs['href']}' for a in soup.find_all('a', attrs={'href' : re.compile(regex_pattern)})] + return ['{0}://{1}/{2}'.format(o.scheme, o.netloc, a.attrs['href']) for a in soup.find_all('a', attrs={'href' : re.compile(regex_pattern)})] warnings.formatwarning = warning_on_one_line From c996fd224d251b6447e1a025e9a68c2193b3d570 Mon Sep 17 00:00:00 2001 From: Kirk Roerig Date: Mon, 12 Aug 2024 19:58:46 -0600 Subject: [PATCH 6/6] Remove strict version requirement --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5baed2b..041f722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -beautifulsoup4==4.12.3 \ No newline at end of file +beautifulsoup4 \ No newline at end of file