diff --git a/multitax/silvatx.py b/multitax/silvatx.py
index 180187e..49fac36 100644
--- a/multitax/silvatx.py
+++ b/multitax/silvatx.py
@@ -1,10 +1,10 @@
 from .multitax import MultiTax
+from .utils import fuzzy_find_download_links
 import warnings
 
 
 class SilvaTx(MultiTax):
-    _default_urls = [
-        "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"]
+    _default_urls = fuzzy_find_download_links("https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/", ".*tax_slv_ssu_.*.txt.gz$")
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
diff --git a/multitax/utils.py b/multitax/utils.py
index 5771c56..f09ee92 100644
--- a/multitax/utils.py
+++ b/multitax/utils.py
@@ -3,10 +3,13 @@
 import os
 import tarfile
 import urllib.request
+import urllib.parse
 import zlib
 import warnings
+import re
 from collections import OrderedDict
 from urllib.error import HTTPError
+from bs4 import BeautifulSoup
 
 
 def check_dir(prefix: str):
@@ -171,4 +174,25 @@ def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
     return '%s:%s: %s: %s\n' % (filename, lineno, category.__name__, message)
 
 
+def fuzzy_find_download_links(url: str, regex_pattern: str, page=None):
+    """
+    Parameters:
+
+    * **url** *[str]*: URL of the page to scan for download links
+    * **regex_pattern** *[str]*: Regular expression matched against the href of each link on the page
+    * **page** *[str]*: Optional page content to parse instead of fetching the URL, primarily for unit testing
+    """
+    if page is None:
+        page = urllib.request.urlopen(url)
+    o = urllib.parse.urlparse(url)
+    soup = BeautifulSoup(page, 'html.parser')
+    # Keep every <a> whose href matches the pattern and rebuild an absolute URL from scheme and host
+    return ['{0}://{1}/{2}'.format(o.scheme, o.netloc, a.attrs['href']) for a in soup.find_all('a', attrs={'href': re.compile(regex_pattern)})]
+
+
 warnings.formatwarning = warning_on_one_line
+
+
+if __name__ == "__main__":
+    links = fuzzy_find_download_links("https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/", ".*tax_slv_ssu_.*.txt.gz$")
+    print(links)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..041f722
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+beautifulsoup4
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 025f351..598525c 100755
--- a/setup.py
+++ b/setup.py
@@ -7,6 +7,11 @@
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 
+lib_folder = os.path.dirname(os.path.realpath(__file__))
+install_requires = []
+with open("{0}/requirements.txt".format(lib_folder)) as f:
+    install_requires = f.read().splitlines()
+
 setup(
     name="multitax",
     version="1.3.1",
@@ -17,6 +22,7 @@
     long_description=long_description,
     long_description_content_type="text/markdown",
     packages=["multitax"],
+    install_requires=install_requires,
     python_requires=">=3.4",
     classifiers=[
         'License :: OSI Approved :: MIT License',
diff --git a/tests/multitax/unit/test_utils.py b/tests/multitax/unit/test_utils.py
new file mode 100644
index 0000000..1eadea4
--- /dev/null
+++ b/tests/multitax/unit/test_utils.py
@@ -0,0 +1,9 @@
+import unittest
+from multitax.utils import fuzzy_find_download_links
+
+
+class TestUtils(unittest.TestCase):
+    def test_fuzzy_find_download_links(self):
+        registry_url = "https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/"
+        links = fuzzy_find_download_links(registry_url, ".*tax_slv_ssu_.*.txt.gz$")
+        self.assertIn('https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_', links[0])