Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions multitax/silvatx.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from .multitax import MultiTax
from .utils import fuzzy_find_download_links
import warnings


class SilvaTx(MultiTax):
_default_urls = [
"https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"]
_default_urls = fuzzy_find_download_links("https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/", ".*tax_slv_ssu_.*.txt.gz$")

def __init__(self, **kwargs):
super().__init__(**kwargs)
Expand Down
24 changes: 23 additions & 1 deletion multitax/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
import os
import tarfile
import urllib.request
import urllib.parse
import zlib
import warnings
import re
from collections import OrderedDict
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def check_dir(prefix: str):
Expand Down Expand Up @@ -97,7 +100,7 @@ def join_check(elements, sep: str):


def load_url_mem(url: str):
"""
"""import
Parameters:
* **url** *[str]*: URL to load into memory

Expand Down Expand Up @@ -171,4 +174,23 @@ def warning_on_one_line(message, category, filename, lineno, file=None, line=Non
return '%s:%s: %s: %s\n' % (filename, lineno, category.__name__, message)


def fuzzy_find_download_links(url: str, regex_pattern: str, page=None):
    """
    Scrape an HTML page for anchor tags whose href matches a regex and
    return them as absolute download URLs.

    Parameters:
    * **url** *[str]*: URL of the page to scrape; its scheme and host are
      also used to resolve the matched hrefs into absolute links
    * **regex_pattern** *[str]*: Regular expression the link href must match
    * **page** *[str]*: Optional page content to parse, primarily for unit testing

    Returns:
    * *[list of str]*: Absolute URLs of all matching links, in page order
    """
    if page is None:
        # Context manager ensures the HTTP connection is released after reading
        with urllib.request.urlopen(url) as response:
            page = response.read()
    o = urllib.parse.urlparse(url)
    base = "{0}://{1}/".format(o.scheme, o.netloc)
    soup = BeautifulSoup(page, 'html.parser')
    # urljoin handles both relative hrefs and hrefs starting with "/"
    # (the previous manual "{scheme}://{netloc}/{href}" join produced a
    # double slash for absolute hrefs)
    return [urllib.parse.urljoin(base, a.attrs['href'])
            for a in soup.find_all('a', attrs={'href': re.compile(regex_pattern)})]

# Render all warnings emitted by this module on a single compact line
warnings.formatwarning = warning_on_one_line


if __name__ == "__main__":
    # Ad-hoc smoke test: scrape the live SILVA taxonomy export listing
    # (requires network access; not part of the unit-test suite)
    links = fuzzy_find_download_links("https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/", ".*tax_slv_ssu_.*.txt.gz$")
    print(links)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
beautifulsoup4
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()

# Read runtime dependencies from requirements.txt so setup.py and pip
# share a single dependency list; resolve relative to this file so the
# build works regardless of the current working directory.
lib_folder = os.path.dirname(os.path.realpath(__file__))
install_requires = []
with open("{0}/requirements.txt".format(lib_folder)) as f:
    install_requires = f.read().splitlines()

setup(
name="multitax",
version="1.3.1",
Expand All @@ -17,6 +22,7 @@
long_description=long_description,
long_description_content_type="text/markdown",
packages=["multitax"],
install_requires=install_requires,
python_requires=">=3.4",
classifiers=[
'License :: OSI Approved :: MIT License',
Expand Down
9 changes: 9 additions & 0 deletions tests/multitax/unit/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import unittest
from multitax.utils import fuzzy_find_download_links


class TestUtils(unittest.TestCase):
    def test_fuzzy_find_download_links(self):
        """Matching hrefs are extracted and resolved to absolute URLs."""
        registry_url = "https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/"
        # Use the function's `page` parameter (provided for unit testing)
        # with a static snippet so the test is deterministic and offline,
        # instead of depending on the live SILVA website.
        page = ('<html><body>'
                '<a href="fileadmin/silva_databases/current/Exports/taxonomy/'
                'tax_slv_ssu_138.1.txt.gz">tax_slv_ssu_138.1.txt.gz</a>'
                '<a href="unrelated.txt">unrelated.txt</a>'
                '</body></html>')
        links = fuzzy_find_download_links(registry_url, ".*tax_slv_ssu_.*.txt.gz$", page=page)
        # Only the href matching the pattern is returned
        self.assertEqual(len(links), 1)
        self.assertTrue('https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_' in links[0])