diff --git a/.github/actions/setup-deps/action.yaml b/.github/actions/setup-deps/action.yaml index 7523214b8fa..7f40ec146ad 100644 --- a/.github/actions/setup-deps/action.yaml +++ b/.github/actions/setup-deps/action.yaml @@ -74,6 +74,8 @@ inputs: default: 'networkx' openmm: default: 'openmm' + pooch: + default: 'pooch' pytng: default: 'pytng>=0.2.3' rdkit: @@ -145,6 +147,7 @@ runs: ${{ inputs.netcdf4 }} ${{ inputs.networkx }} ${{ inputs.openmm }} + ${{ inputs.pooch }} ${{ inputs.pytng }} ${{ inputs.rdkit }} ${{ inputs.scikit-learn }} diff --git a/package/CHANGELOG b/package/CHANGELOG index e009d480303..d0df1abf78d 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -14,13 +14,16 @@ The rules for this file: ------------------------------------------------------------------------------- -??/??/?? IAlibay +??/??/?? IAlibay, jauy123, BradyAJohnston * 2.11.0 Fixes Enhancements +* Added function `topology.PDBParser.fetch_pdb` (accessible as + `MDAnalysis.fetch_pdb()`) to download structure files from wwPDB using + `pooch` as optional dependency (Issue #4907, PR #4943) Changes diff --git a/package/MDAnalysis/__init__.py b/package/MDAnalysis/__init__.py index 6843c3738ab..5d301fbe1a3 100644 --- a/package/MDAnalysis/__init__.py +++ b/package/MDAnalysis/__init__.py @@ -221,6 +221,8 @@ from .due import due, Doi, BibTeX +from .topology.PDBParser import fetch_pdb + due.cite( Doi("10.25080/majora-629e541a-00e"), description="Molecular simulation analysis library", diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index b168b0a67ae..0b9becb9d3a 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -57,13 +57,17 @@ * :class:`MDAnalysis.core.universe.Universe` -Classes -------- +Classes and Functions +--------------------- .. autoclass:: PDBParser :members: :inherited-members: +.. autofunction:: fetch_pdb + +.. 
# pooch is an optional dependency: record whether it is importable so that
# fetch_pdb() can fail with a clear message instead of at import time.
try:
    import pooch
except ImportError:
    HAS_POOCH = False
else:
    HAS_POOCH = True

#: Name of the :mod:`pooch` cache directory ``pooch.os_cache(DEFAULT_CACHE_NAME_DOWNLOADER)``;
#: see :func:`pooch.os_cache` for further details.
#:
#: .. versionadded:: 2.11.0
DEFAULT_CACHE_NAME_DOWNLOADER = "MDAnalysis_pdbs"

# These file formats are here (https://www.rcsb.org/docs/programmatic-access/file-download-services) under "PDB entry files"
SUPPORTED_FILE_FORMATS_DOWNLOADER = (
    "cif",
    "cif.gz",
    "bcif",
    "bcif.gz",
    "xml",
    "xml.gz",
    "pdb",
    "pdb.gz",
    "pdb1",
    "pdb1.gz",
)


def fetch_pdb(
    pdb_ids=None,
    cache_path=None,
    progressbar=False,
    file_format="pdb.gz",
):
    """
    Download one or more structure files from the wwPDB and cache them
    locally.

    Given one or multiple PDB IDs, downloads the corresponding structure
    files in the requested format and stores them in a local cache directory.
    If a file is already cached on disk, *fetch_pdb* will skip the download
    and use the cached version instead.

    Returns the path(s) as a string to the downloaded file(s).

    Parameters
    ----------
    pdb_ids : str or sequence of str
        A single PDB ID as a string, or a sequence of PDB IDs to fetch.
        This argument is required even though its default is ``None``.
    cache_path : str or pathlib.Path, optional
        Directory where downloaded file(s) will be cached.
        The default ``None`` argument uses the :mod:`pooch` default cache with
        project name :data:`DEFAULT_CACHE_NAME_DOWNLOADER`.
    progressbar : bool, optional
        If True, display a progress bar during file downloads. Default is False.
    file_format : str, optional
        The file extension/format to download (e.g., "cif", "pdb").
        Default is ``"pdb.gz"``. See the Notes section below for a list of
        all supported file formats.

    Returns
    -------
    str or list of str
        The path(s) to the downloaded file(s). Returns a single string if
        one PDB ID is given as a string, or a list of strings (one entry per
        requested ID, in input order) if a sequence of PDB IDs is provided.

    Raises
    ------
    ModuleNotFoundError
        If the optional dependency :mod:`pooch` is not installed.

    ValueError
        For an invalid file format. Supported file formats are under Notes.

    TypeError
        If `pdb_ids` is left as ``None``.

    :class:`requests.exceptions.HTTPError`
        If an invalid PDB code is specified. Note that this is :mod:`requests`, not the
        standard library :mod:`urllib.request`.

    Notes
    -----
    This function uses the `RCSB File Download Services`_ for directly downloading
    structure files via https.

    .. _`RCSB File Download Services`:
       https://www.rcsb.org/docs/programmatic-access/file-download-services

    The RCSB currently provides data in ``'cif'`` , ``'cif.gz'`` , ``'bcif'`` ,
    ``'bcif.gz'`` , ``'xml'`` , ``'xml.gz'`` , ``'pdb'`` , ``'pdb.gz'``,
    ``'pdb1'``, ``'pdb1.gz'`` file formats and can therefore be downloaded.
    Not all of these formats can be currently read with MDAnalysis.

    Caching, controlled by the `cache_path` parameter, is handled internally by
    :mod:`pooch`. The default cache name is taken from
    :data:`DEFAULT_CACHE_NAME_DOWNLOADER`. To clear cache (and subsequently force
    re-fetching), it is required to delete the cache folder as specified by
    `cache_path`.

    Examples
    --------
    Download a single PDB file:

    >>> mda.fetch_pdb("1AKE", file_format="cif")
    './MDAnalysis_pdbs/1AKE.cif'

    Download multiple PDB files with a progress bar:

    >>> mda.fetch_pdb(["1AKE", "4BWZ"], progressbar=True)
    ['./MDAnalysis_pdbs/1AKE.pdb.gz', './MDAnalysis_pdbs/4BWZ.pdb.gz']

    Download a single PDB file and convert it to a universe:

    >>> mda.Universe(mda.fetch_pdb("1AKE", file_format="pdb.gz"))
    <Universe with 3816 atoms>

    Download multiple PDB files and convert each of them into a universe:

    >>> [mda.Universe(pdb) for pdb in mda.fetch_pdb(["1AKE", "4BWZ"], progressbar=True)]
    [<Universe with 3816 atoms>, <Universe with 2824 atoms>]


    .. versionadded:: 2.11.0
    """

    if not HAS_POOCH:
        raise ModuleNotFoundError(
            "pooch is needed as a dependency for fetch_pdb()"
        )
    if file_format not in SUPPORTED_FILE_FORMATS_DOWNLOADER:
        raise ValueError(
            "Invalid file format. Supported file formats "
            f"are {SUPPORTED_FILE_FORMATS_DOWNLOADER}"
        )
    if pdb_ids is None:
        # Iterating over None would raise an opaque TypeError further down;
        # keep the exception type but say what is actually wrong.
        raise TypeError(
            "pdb_ids must be a PDB ID string or a sequence of PDB ID "
            "strings, not None"
        )

    # Remember the input kind so the return type mirrors it (str -> str,
    # sequence -> list).
    single_id = isinstance(pdb_ids, str)
    requested_ids = (pdb_ids,) if single_id else pdb_ids

    if cache_path is None:
        cache_path = pooch.os_cache(DEFAULT_CACHE_NAME_DOWNLOADER)

    # Keep the file names in an ordered list: the registry dict below
    # collapses duplicate IDs, but the result must contain one path per
    # requested ID.
    file_names = [f"{pdb_id}.{file_format}" for pdb_id in requested_ids]

    # Have to do this dictionary approach instead of using pooch.retrieve in
    # order to prevent the hardcoded known_hash warning from showing up.
    registry_dictionary = dict.fromkeys(file_names)

    downloader = pooch.create(
        path=cache_path,
        base_url="https://files.wwpdb.org/download/",
        registry=registry_dictionary,
    )

    paths = [
        downloader.fetch(fname=file_name, progressbar=progressbar)
        for file_name in file_names
    ]

    return paths[0] if single_id else paths
# 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the Lesser GNU Public Licence, v2.1 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

import pytest

import MDAnalysis as mda
from MDAnalysis.topology.PDBParser import (
    DEFAULT_CACHE_NAME_DOWNLOADER,
    HAS_POOCH,
    SUPPORTED_FILE_FORMATS_DOWNLOADER,
)

import re
from urllib import request
from shutil import rmtree
from pathlib import Path

if HAS_POOCH:
    from requests.exceptions import HTTPError
    import pooch

# Probe the download server once at import time so network-dependent tests
# can be skipped. URLError is a subclass of OSError; catching OSError also
# covers a bare timeout raised while reading the response, which would
# otherwise abort test collection. The ``with`` block closes the response.
try:
    with request.urlopen("https://files.wwpdb.org/", timeout=2):
        pass
except OSError:
    HAS_ACCESS_TO_WWPDB = False
else:
    HAS_ACCESS_TO_WWPDB = True


def true_basename(path):
    """Return the file name of `path` with *all* extensions removed.

    This is needed because ``pathlib.Path("foo.pdb.gz").stem`` is
    ``"foo.pdb"``, i.e. only the last suffix is stripped.
    """
    return Path(path).stem.split(".")[0]


@pytest.mark.skipif(
    HAS_POOCH,
    reason="Pooch is installed.",
)
def test_pooch_installation(tmp_path):
    """Without pooch, fetch_pdb must raise a ModuleNotFoundError."""
    with pytest.raises(
        ModuleNotFoundError,
        # ``match`` is a regular expression: escape the literal "()".
        match=re.escape("pooch is needed as a dependency for fetch_pdb()"),
    ):
        mda.fetch_pdb("1AKE", cache_path=tmp_path, file_format="cif")


@pytest.mark.skipif(not HAS_POOCH, reason="Pooch is not installed.")
@pytest.mark.skipif(
    not HAS_ACCESS_TO_WWPDB,
    reason="Can not connect to https://files.wwpdb.org/",
)
class TestDocstringExamples:
    """This class tests all the examples found in fetch_pdb's docstring"""

    @pytest.mark.parametrize("pdb_id", ["1AKE", "4BWZ"])
    def test_one_file_download(self, tmp_path, pdb_id):
        path = mda.fetch_pdb(pdb_id, cache_path=tmp_path, file_format="cif")
        assert isinstance(path, str)
        assert true_basename(path) == pdb_id

    def test_multiple_files_download(self, tmp_path):
        expected_ids = ["1AKE", "4BWZ"]
        list_of_path_strings = mda.fetch_pdb(
            expected_ids, cache_path=tmp_path, progressbar=True
        )
        assert all(isinstance(path, str) for path in list_of_path_strings)
        # strict=True also checks that one path came back per requested ID
        assert all(
            true_basename(path) == name
            for path, name in zip(
                list_of_path_strings, expected_ids, strict=True
            )
        )

    @pytest.mark.parametrize(
        "pdb_id, n_atoms", [("1AKE", 3816), ("4BWZ", 2824)]
    )
    def test_files_to_universe(self, tmp_path, pdb_id, n_atoms):
        u = mda.Universe(
            mda.fetch_pdb(
                pdb_id,
                file_format="pdb.gz",
                cache_path=tmp_path,
                progressbar=True,
            )
        )
        # separate asserts so a failure shows which condition broke
        assert isinstance(u, mda.Universe)
        assert len(u.atoms) == n_atoms


@pytest.fixture()
def clean_up_default_cache():
    """Remove the default pooch cache directory before and after a test.

    NOTE: this deletes the real per-user cache directory returned by
    ``pooch.os_cache(DEFAULT_CACHE_NAME_DOWNLOADER)``.
    """
    rmtree(pooch.os_cache(DEFAULT_CACHE_NAME_DOWNLOADER), ignore_errors=True)
    yield
    # ignore_errors: the directory may not exist if the test failed before
    # anything was downloaded; do not turn that into a teardown error.
    rmtree(pooch.os_cache(DEFAULT_CACHE_NAME_DOWNLOADER), ignore_errors=True)


@pytest.mark.skipif(not HAS_POOCH, reason="Pooch is not installed.")
@pytest.mark.skipif(
    not HAS_ACCESS_TO_WWPDB,
    reason="Can not connect to https://files.wwpdb.org/",
)
class TestExpectedBehaviors:
    """Return-type contract and default-cache behaviour of fetch_pdb."""

    def test_no_cache_path(self, clean_up_default_cache):
        assert isinstance(mda.fetch_pdb("1AKE", cache_path=None), str)

    def test_str_input_gives_str_output(self, tmp_path):
        assert isinstance(
            mda.fetch_pdb(
                pdb_ids="1AKE", cache_path=tmp_path, file_format="cif"
            ),
            str,
        )

    def test_list_input_gives_list_output(self, tmp_path):
        assert isinstance(
            mda.fetch_pdb(pdb_ids=["1AKE"], cache_path=tmp_path), list
        )


@pytest.mark.skipif(not HAS_POOCH, reason="Pooch is not installed.")
@pytest.mark.skipif(
    not HAS_ACCESS_TO_WWPDB,
    reason="Can not connect to https://files.wwpdb.org/",
)
class TestExpectedErrors:
    """Errors raised by fetch_pdb for bad input."""

    def test_invalid_pdb(self, tmp_path):
        with pytest.raises(HTTPError):
            mda.fetch_pdb(pdb_ids="foobar", cache_path=tmp_path)

    def test_invalid_file_format(self, tmp_path):
        with pytest.raises(
            ValueError,
            match=re.escape(
                "Invalid file format. Supported file formats "
                f"are {SUPPORTED_FILE_FORMATS_DOWNLOADER}"
            ),
        ):
            mda.fetch_pdb(
                pdb_ids="1AKE", cache_path=tmp_path, file_format="barfoo"
            )