Skip to content
Open
Show file tree
Hide file tree
Changes from 97 commits
Commits
Show all changes
107 commits
Select commit Hold shift + click to select a range
7899f3d
Added requests as a dependency
jauy123 Feb 25, 2025
44393be
Inital download code
jauy123 Mar 3, 2025
b1f6002
fixed typo
jauy123 Mar 3, 2025
9c6e87a
cleaner convert_to_universe()
jauy123 Mar 3, 2025
aecefc9
Added abc module and allowed closing of file stream for downloaded te…
jauy123 Mar 4, 2025
9510cc6
Fixed __all__ -- should fixed pull request test on github
jauy123 Mar 4, 2025
8c1a196
refactored cache logic
jauy123 Mar 5, 2025
f0e30ed
Initial tests
jauy123 Mar 5, 2025
1c7d909
Added __init.py to make tests work
jauy123 Mar 5, 2025
eb23ed1
typos fixed
jauy123 Mar 6, 2025
b0c7f5a
Refactored Tests -- put them in classes!
jauy123 Mar 7, 2025
f2ec203
PdbDownloader().download() now downloads in binary rather than text (…
jauy123 Mar 7, 2025
a21fd94
Updated Tests to comply with pdb.gz
jauy123 Mar 7, 2025
d58bed9
Added Progress bar to PdbDownloader().download()
jauy123 Mar 8, 2025
1147b6d
Added a few clarifications to _requests_progress_bar
jauy123 Mar 8, 2025
91feb16
Added filename attribute() to BaseDownloader()
jauy123 Mar 8, 2025
ddcef9e
made _requests_progress_bar a private method of PdbDownloader
jauy123 Mar 9, 2025
bf3e07f
minor comments
jauy123 Mar 9, 2025
09cc409
Added Buffer as default option for PdbDownloader.download()
jauy123 Mar 9, 2025
d78a954
Renamed PdbDownloader to PDBDownloader to match PDBReader()
jauy123 Mar 9, 2025
560e1c2
better __str__ method for BaseDownloader()
jauy123 Mar 9, 2025
c43c10d
Enhanced tests
jauy123 Mar 9, 2025
e6a0f05
Added TODO list for future me
jauy123 Mar 10, 2025
ada1b38
Added requests as optional dep to pyproject.toml
jauy123 Mar 18, 2025
043c006
update todo list
jauy123 Jun 26, 2025
ea5c5b7
minor cleanup
jauy123 Jun 26, 2025
5d6d3e8
Ran black on package/
jauy123 Jun 26, 2025
6590c42
Ran black on tests
jauy123 Jun 26, 2025
6e9b9f3
updated TODO
jauy123 Jun 26, 2025
252b23c
attempt to fix mypy issue
jauy123 Jun 26, 2025
440e3b8
inital working pooch-based implementation of fetch_pdb()
jauy123 Aug 20, 2025
c3f74f9
merge from working pooch branch
jauy123 Aug 20, 2025
10f66be
removed all of old non-pooch based fetch_pdb() implementation and tests
jauy123 Aug 20, 2025
cda3559
cleaned up __init__.py of old non-pooched based fetch_pdb
jauy123 Aug 20, 2025
cecd570
fetch_pdb() now returns paths instead of universes
jauy123 Aug 22, 2025
03638c8
Cleaned up return logic with syntactic sugar
jauy123 Aug 22, 2025
544de38
package/MDAnalysis/coordinates/fetch_pdb.py
jauy123 Aug 22, 2025
fdaacf1
ok, this is proper version of the cleaned up fetch_pdb()
jauy123 Aug 22, 2025
215ee43
cleaned up return logic with Syntactic sugar
jauy123 Aug 22, 2025
5990939
Made explicit that in the one pdb case that the return type is a string
jauy123 Aug 22, 2025
7f7387f
remove redundant comments
jauy123 Aug 22, 2025
f3456a5
Moved fetch_pdb() to PDBParser
jauy123 Aug 22, 2025
8b8492f
Added fetch_pdb() docstring
jauy123 Aug 22, 2025
3fea571
Added default cache folder to fetch_pdb()
jauy123 Aug 22, 2025
f5d6a9f
Added Unit Test for fetch_pdb()'s docstring
jauy123 Aug 22, 2025
64ac4e5
Finalized tests and docstring
jauy123 Aug 22, 2025
0f54e8e
Spagetti fingered fetch_pdb() docstring
jauy123 Aug 22, 2025
867614a
Added pooch to requirements.txt in order to get github's test to work…
jauy123 Aug 22, 2025
c85fd75
Added pooch to pyproject.toml to get github's online test
jauy123 Aug 22, 2025
96dbf05
Added pooch to pyproject.toml to get github's online test to work
jauy123 Aug 22, 2025
b15d148
Merge branch 'downloads' of github.com:jauy123/mdanalysis into downloads
jauy123 Aug 22, 2025
2d10ad3
Modified the action.yaml to HOPEFULLY get github's online test to work
jauy123 Aug 22, 2025
ab7bc8a
i have fat fingers
jauy123 Aug 22, 2025
9289792
action.yaml attempt number 2
jauy123 Aug 22, 2025
96d7341
An attempt to make pooch optional
jauy123 Aug 22, 2025
c74a46e
Cleaned up fetch_pdb(), added Universe PDB assertions -- still need t…
jauy123 Aug 23, 2025
6b20e86
Pre-black tests
jauy123 Aug 23, 2025
0d793e9
Moved requests' HTTPExeception inside has_pooch() since requests is a…
jauy123 Aug 23, 2025
d964bc5
Ran Black on test_fetch_pdb.py
jauy123 Aug 23, 2025
124d06a
Added test to catch missing pooch dependency
jauy123 Aug 23, 2025
b8f7a81
Renamed pooch dependency test
jauy123 Aug 23, 2025
d78bae6
removed requests from pyproject.toml and requirements.txt
jauy123 Aug 23, 2025
577ac9d
PDBParser.py pre-black
jauy123 Aug 23, 2025
608d991
post black PDBParser.py
jauy123 Aug 23, 2025
07d124c
Removed err assignment from has_internet()
jauy123 Aug 23, 2025
8a9ac84
Merge remote-tracking branch 'upstream/develop' into downloads
jauy123 Aug 23, 2025
939d5f0
modified in pytest fixtures
jauy123 Aug 25, 2025
7107aa4
Made pooch import global
jauy123 Aug 26, 2025
c869bbc
added pytest fixtures to test_fetch_pdb
jauy123 Aug 26, 2025
557b1e9
Modified test_pooch_installation to be like coordinates/test_gcd.py/t…
jauy123 Aug 26, 2025
f3a4d7b
Restored previous backup
jauy123 Aug 26, 2025
2a97d9b
Made test_fetch_pdb() more topology/ test_gsd()
jauy123 Aug 26, 2025
9d0f53a
Rewrote test skip if condition
jauy123 Aug 27, 2025
595423a
renamed HAS_INTERNET() to HAS_ACCESS_TO_WWPDB
jauy123 Aug 27, 2025
eed80ed
Changed wwPDB download url to be a module level variable
jauy123 Aug 27, 2025
802183f
moved url backed into fetch_pdb()
jauy123 Sep 3, 2025
bf9292c
remove comment in inital __init__.py
jauy123 Sep 3, 2025
b595f09
minor cleanup in PDBParser.py
jauy123 Sep 3, 2025
e93c73a
Updated CHANGELOG
jauy123 Sep 3, 2025
5f407ba
oops, put the text in the wrong section in CHANGELOG
jauy123 Sep 3, 2025
f09115a
Added no cache test
jauy123 Oct 21, 2025
e2141a8
merged with upstream
jauy123 Oct 21, 2025
bf81128
Revert "Added no cache test"
jauy123 Oct 21, 2025
934eda3
Copy and Pasted from old changelog
jauy123 Oct 21, 2025
9b8da31
uunrevert commit bf81128
jauy123 Oct 21, 2025
0b80840
Split pytest fixtures into two
jauy123 Oct 22, 2025
a7519af
renamed pdb_cache to MDAnalysis_pdbs
jauy123 Oct 22, 2025
72c24e0
Added supported file_formats to code.
jauy123 Oct 22, 2025
ffcc270
Added documentation and update test for file formats
jauy123 Oct 22, 2025
b07a16d
post darker pre flake8
jauy123 Oct 22, 2025
a2aff4c
Updated fetch_pdb return types
jauy123 Oct 22, 2025
98fa75b
Manual flake 8 on fetch_pdb
jauy123 Oct 22, 2025
1e635c4
Addressing obeckset's fetch_pdb's docstring changes
jauy123 Oct 22, 2025
447de56
Changed 'Multiple PDBs example in docstring
jauy123 Oct 22, 2025
c28110f
wrote tests for str/tuple arguement -- need to refactor however
jauy123 Oct 23, 2025
02d81a3
Updated PDBParser.py and applied flake8
jauy123 Oct 23, 2025
adbfabb
Modified Tests per orbeckst's requests
jauy123 Oct 23, 2025
31a8f7b
Modified fetch_pdb()'s logic and added cache docs
jauy123 Oct 23, 2025
e2a28ec
Changed check on fetch_pdb() return
jauy123 Oct 24, 2025
546538a
Modified fetch_pdb() and its test per github feedback
jauy123 Oct 24, 2025
735586f
Added autodocumenation, so fetch_pdb is visible to sphinx
jauy123 Oct 27, 2025
b5da9be
Change spacing under "Classes and Functions"
jauy123 Oct 28, 2025
b0c808a
added autodata for DEFAULT_CACHE_NAME_DOWNLOADER
jauy123 Oct 28, 2025
ab0f635
Modified docs per suggestions and did a general pass with adding sphi…
jauy123 Oct 28, 2025
d2f7857
doc grammar
jauy123 Oct 28, 2025
09e7ef5
Got sphinx markup to work
jauy123 Oct 28, 2025
6beffd8
Added markup to file formats
jauy123 Oct 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/actions/setup-deps/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ inputs:
default: 'networkx'
openmm:
default: 'openmm'
pooch:
default: 'pooch'
pytng:
default: 'pytng>=0.2.3'
rdkit:
Expand Down Expand Up @@ -145,6 +147,7 @@ runs:
${{ inputs.netcdf4 }}
${{ inputs.networkx }}
${{ inputs.openmm }}
${{ inputs.pooch }}
${{ inputs.pytng }}
${{ inputs.rdkit }}
${{ inputs.scikit-learn }}
Expand Down
5 changes: 4 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,16 @@ The rules for this file:


-------------------------------------------------------------------------------
??/??/?? IAlibay
??/??/?? IAlibay, jauy123, BradyAJohnston

* 2.11.0

Fixes

Enhancements
* Added function `topology.PDBParser.fetch_pdb` (accessible as
`MDAnalysis.fetch_pdb()`) to download structure files from wwPDB using
`pooch` as optional dependency (Issue #4907, PR #4943)

Changes

Expand Down
2 changes: 2 additions & 0 deletions package/MDAnalysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@

from .due import due, Doi, BibTeX

from .topology.PDBParser import fetch_pdb
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@IAlibay @BradyAJohnston are we sure that we want the import at the top level?

If we do more fetch_xxx() in the future then we may have to deprecate it again, e.g. in favor of a mda.fetch.pdb(...) or Universe.from_fetched.

I think it's ok to leave it here for now because we don't have anything else. If we get more before 3.0, we still have time to deprecate and remove in 3.0.

If it is left in then does it need to be documented at the top level, too?


due.cite(
Doi("10.25080/majora-629e541a-00e"),
description="Molecular simulation analysis library",
Expand Down
140 changes: 139 additions & 1 deletion package/MDAnalysis/topology/PDBParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
.. autoclass:: PDBParser
:members:
:inherited-members:

"""
import numpy as np
import warnings
Expand Down Expand Up @@ -95,6 +95,27 @@
# Set up a logger for the PDBParser
logger = logging.getLogger("MDAnalysis.topology.PDBParser")

try:
import pooch
except ImportError:
HAS_POOCH = False
else:
HAS_POOCH = True

# File formats accepted by fetch_pdb() for its ``file_format`` argument; any
# other value is rejected with a ValueError. The list follows the formats
# given under "PDB entry files" at
# https://www.rcsb.org/docs/programmatic-access/file-download-services
SUPPORTED_FILE_FORMATS_DOWNLOADER = (
    "cif",
    "cif.gz",
    "bcif",
    "bcif.gz",
    "xml",
    "xml.gz",
    "pdb",
    "pdb.gz",
    "pdb1",
    "pdb1.gz",
)


def float_or_default(val, default):
try:
Expand Down Expand Up @@ -515,3 +536,120 @@ def _parse_conect(conect):
bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in
range(n_bond_atoms))
return atom_id, bond_atoms


def fetch_pdb(
    pdb_ids=None,
    cache_path=None,
    progressbar=False,
    file_format="pdb.gz",
):
    """
    Download one or more structure files from the wwPDB and cache them
    locally.

    Given one or multiple PDB IDs, downloads the corresponding structure
    files in the requested file format and stores them in a local cache
    directory. If a file is already cached on disk, :func:`fetch_pdb` skips
    the download and uses the cached version instead.

    Returns the path(s) to the downloaded file(s) as string(s).

    Parameters
    ----------
    pdb_ids : str or sequence of str
        A single PDB ID as a string, or a sequence of PDB IDs to fetch.
    cache_path : str or pathlib.Path, optional
        Directory where downloaded file(s) will be cached. If ``None``
        (the default), ``pooch.os_cache("MDAnalysis_pdbs")`` is used.
    progressbar : bool, optional
        If True, display a progress bar during file downloads. Default is
        False.
    file_format : str, optional
        The file extension/format to download (e.g., "cif", "pdb"). Default
        is "pdb.gz". See the Notes section below for a list of all supported
        file formats.

    Returns
    -------
    str or list of str
        The path(s) to the downloaded file(s). Returns a single string if
        `pdb_ids` is a string, or a list with one path per requested PDB ID
        (in input order) if a sequence of PDB IDs is provided.

    Raises
    ------
    ModuleNotFoundError
        If the optional dependency :mod:`pooch` is not installed.

    ValueError
        For an invalid file format. Supported file formats are under Notes.

    TypeError
        If no `pdb_ids` are given.

    requests.exceptions.HTTPError
        If the server cannot provide the requested file, e.g. for an invalid
        PDB code.

    Notes
    -----
    This function uses the `RCSB File Download Services`_ for directly
    downloading structure files via https.

    .. _`RCSB File Download Services`:
       https://www.rcsb.org/docs/programmatic-access/file-download-services

    The RCSB currently provides data in the 'cif', 'cif.gz', 'bcif',
    'bcif.gz', 'xml', 'xml.gz', 'pdb', 'pdb.gz', 'pdb1', and 'pdb1.gz' file
    formats, all of which can be downloaded with this function. Not all of
    these formats can currently be read with MDAnalysis.

    Examples
    --------
    The paths shown below are illustrative; the actual location depends on
    `cache_path`.

    Download a single PDB file:

    >>> mda.fetch_pdb("1AKE", file_format="cif")
    './MDAnalysis_pdbs/1AKE.cif'

    Download multiple PDB files with a progress bar:

    >>> mda.fetch_pdb(["1AKE", "4BWZ"], progressbar=True)
    ['./MDAnalysis_pdbs/1AKE.pdb.gz', './MDAnalysis_pdbs/4BWZ.pdb.gz']

    Download a single PDB file and convert it to a universe:

    >>> mda.Universe(mda.fetch_pdb("1AKE", file_format="pdb.gz"))
    <Universe with 3816 atoms>

    Download multiple PDB files and convert each of them into a universe:

    >>> [mda.Universe(pdb) for pdb in mda.fetch_pdb(["1AKE", "4BWZ"], progressbar=True)]
    [<Universe with 3816 atoms>, <Universe with 2824 atoms>]


    .. versionadded:: 2.11.0
    """

    if not HAS_POOCH:
        raise ModuleNotFoundError(
            "pooch is needed as a dependency for fetch_pdb()"
        )
    if file_format not in SUPPORTED_FILE_FORMATS_DOWNLOADER:
        raise ValueError(
            "Invalid file format. Supported file formats "
            f"are {SUPPORTED_FILE_FORMATS_DOWNLOADER}"
        )
    if pdb_ids is None:
        # Fail with a readable message instead of "NoneType is not iterable"
        raise TypeError(
            "fetch_pdb() needs a PDB ID or a sequence of PDB IDs, "
            "got pdb_ids=None"
        )

    # Decide once whether the caller asked for a single ID so that the input
    # normalisation and the shape of the return value cannot disagree.
    single_id = isinstance(pdb_ids, str)
    _pdb_ids = (pdb_ids,) if single_id else pdb_ids

    if cache_path is None:
        cache_path = pooch.os_cache("MDAnalysis_pdbs")

    # Keep the file names in a list: the registry dict below drops duplicate
    # IDs, but the caller should get one path per requested ID.
    file_names = [f"{pdb_id}.{file_format}" for pdb_id in _pdb_ids]

    # Have to do this dictionary approach instead of using pooch.retrieve in
    # order to prevent the hardcoded known_hash warning from showing up.
    registry_dictionary = dict.fromkeys(file_names)

    downloader = pooch.create(
        path=cache_path,
        base_url="https://files.wwpdb.org/download/",
        registry=registry_dictionary,
    )

    paths = [
        downloader.fetch(fname=file_name, progressbar=progressbar)
        for file_name in file_names
    ]

    return paths[0] if single_id else paths
1 change: 1 addition & 0 deletions package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ extra_formats = [
"h5py>=2.10",
"chemfiles>=0.10",
"parmed",
"pooch",
"pyedr>=0.7.0",
"pytng>=0.2.3",
"gsd>3.0.0",
Expand Down
1 change: 1 addition & 0 deletions package/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ networkx
numpy>=1.23.2
packaging
parmed
pooch
pytest
scikit-learn
scipy
Expand Down
149 changes: 149 additions & 0 deletions testsuite/MDAnalysisTests/topology/test_fetch_pdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fileencoding=utf-8
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the Lesser GNU Public Licence, v2.1 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

import pytest

import MDAnalysis as mda
from MDAnalysis.topology.PDBParser import HAS_POOCH

from urllib import request
from shutil import rmtree
from pathlib import Path

if HAS_POOCH:
from requests.exceptions import HTTPError
from pooch import os_cache

# Probe the wwPDB file server once at import time so that the tests that need
# network access can be skipped when the server is unreachable.
try:
    # Only reachability matters; the context manager closes the response so
    # that no open connection (and ResourceWarning) is left behind.
    with request.urlopen("https://files.wwpdb.org/", timeout=2):
        HAS_ACCESS_TO_WWPDB = True
except OSError:
    # URLError (and HTTPError) subclass OSError. Timeouts raised while the
    # response is being read are not wrapped in URLError, so catch the common
    # base class to keep test collection from failing on a flaky network.
    HAS_ACCESS_TO_WWPDB = False


def true_basename(path):
    """Return the file name of *path* with all of its extensions removed.

    ``pathlib.Path("foo.pdb.gz").stem`` only strips the last suffix and gives
    ``"foo.pdb"``; this helper returns ``"foo"``.
    """
    base, _, _ = Path(path).name.partition(".")
    return base


@pytest.mark.skipif(
    HAS_POOCH,
    reason="Pooch is installed.",
)
def test_pooch_installation(tmp_path):
    """fetch_pdb() must raise ModuleNotFoundError when pooch is missing.

    The test is skipped whenever pooch is installed.
    """
    # ``match`` is applied as a regular expression, so the parentheses have
    # to be escaped; unescaped they form an empty group and are not checked.
    with pytest.raises(
        ModuleNotFoundError,
        match=r"pooch is needed as a dependency for fetch_pdb\(\)",
    ):
        mda.fetch_pdb("1AKE", cache_path=tmp_path, file_format="cif")


@pytest.mark.skipif(not HAS_POOCH, reason="Pooch is not installed.")
@pytest.mark.skipif(
    not HAS_ACCESS_TO_WWPDB,
    reason="Can not connect to https://files.wwpdb.org/",
)
class TestDocstringExamples:
    """Exercise the usage examples given in the fetch_pdb() docstring."""

    @pytest.mark.parametrize("pdb_id", ["1AKE", "4BWZ"])
    def test_one_file_download(self, tmp_path, pdb_id):
        # A single ID must come back as a single path string
        downloaded = mda.fetch_pdb(
            pdb_id, cache_path=tmp_path, file_format="cif"
        )
        assert isinstance(downloaded, str)
        assert true_basename(downloaded) == pdb_id

    def test_multiple_files_download(self, tmp_path):
        requested = ["1AKE", "4BWZ"]
        downloaded = mda.fetch_pdb(
            requested, cache_path=tmp_path, progressbar=True
        )
        for path in downloaded:
            assert isinstance(path, str)
        # strict=True also fails the test if the number of paths is wrong
        for path, name in zip(downloaded, requested, strict=True):
            assert true_basename(path) == name

    @pytest.mark.parametrize(
        "pdb_id, n_atoms", [("1AKE", 3816), ("4BWZ", 2824)]
    )
    def test_files_to_universe(self, tmp_path, pdb_id, n_atoms):
        structure_file = mda.fetch_pdb(
            pdb_id,
            file_format="pdb.gz",
            cache_path=tmp_path,
            progressbar=True,
        )
        u = mda.Universe(structure_file)
        assert isinstance(u, mda.Universe)
        assert len(u.atoms) == n_atoms


@pytest.fixture()
def clean_up_default_cache():
    """Remove whatever a test added to the default download cache.

    The default cache lives in the user's OS cache directory, so only the
    entries created while the test ran are deleted. Downloads that were
    already present before the test are left untouched.
    """
    cache_dir = Path(os_cache("MDAnalysis_pdbs"))
    # Snapshot the state before the test runs
    existed_before = cache_dir.is_dir()
    entries_before = set(cache_dir.iterdir()) if existed_before else set()

    yield

    if not cache_dir.is_dir():
        # The test never created the cache (e.g. it failed before the
        # download); there is nothing to clean up.
        return
    if not existed_before:
        rmtree(cache_dir)
        return
    for entry in set(cache_dir.iterdir()) - entries_before:
        if entry.is_dir():
            rmtree(entry)
        else:
            entry.unlink()


@pytest.mark.skipif(not HAS_POOCH, reason="Pooch is not installed.")
@pytest.mark.skipif(
    not HAS_ACCESS_TO_WWPDB,
    reason="Can not connect to https://files.wwpdb.org/",
)
class TestExpectedBehaviors:
    """Check the return types of fetch_pdb() for the supported call styles."""

    def test_no_cache_path(self, clean_up_default_cache):
        # cache_path=None makes fetch_pdb() fall back to its default cache
        result = mda.fetch_pdb("1AKE", cache_path=None)
        assert isinstance(result, str)

    def test_str_input_gives_str_output(self, tmp_path):
        result = mda.fetch_pdb(
            pdb_ids="1AKE", cache_path=tmp_path, file_format="cif"
        )
        assert isinstance(result, str)

    def test_list_input_gives_list_output(self, tmp_path):
        # Even a one-element list must come back as a list
        result = mda.fetch_pdb(pdb_ids=["1AKE"], cache_path=tmp_path)
        assert isinstance(result, list)


@pytest.mark.skipif(not HAS_POOCH, reason="Pooch is not installed.")
@pytest.mark.skipif(
    not HAS_ACCESS_TO_WWPDB,
    reason="Can not connect to https://files.wwpdb.org/",
)
class TestExpectedErrors:
    """Check the exceptions raised by fetch_pdb() for invalid input."""

    def test_invalid_pdb(self, tmp_path):
        # "foobar" is not a PDB ID, so the download is expected to fail
        bad_request = dict(pdb_ids="foobar", cache_path=tmp_path)
        with pytest.raises(HTTPError):
            mda.fetch_pdb(**bad_request)

    def test_invalid_file_format(self, tmp_path):
        # Unsupported formats are rejected with a ValueError
        bad_request = dict(
            pdb_ids="1AKE", cache_path=tmp_path, file_format="barfoo"
        )
        with pytest.raises(ValueError):
            mda.fetch_pdb(**bad_request)
Loading