Skip to content

Commit 4a525ee

Browse files
orbeckstmarinegor
andauthored
fix inclusion of PRO in secondary structure (#5065)
* fix inclusion of PRO in secondary structure - fix #4913 - Fixes incorrect assignment of secondary structure to proline residues in DSSP by porting upstream PyDSSP 0.9.1 fix - ported fix from PyDSSP 0.9.1 by @ShintaroMinami to analysis.dssp.DSSP (see also ShintaroMinami/PyDSSP#2) - updated dssp test files to match PyDSSP output starting 0.9.1 - refactored `get_hbond_map` to correctly reflect residues and not atoms - updated docs - minimal regression tests - updated CHANGELOG --------- Co-authored-by: Egor Marin <[email protected]>
1 parent 5c7c480 commit 4a525ee

File tree

15 files changed

+91
-48
lines changed

15 files changed

+91
-48
lines changed

package/CHANGELOG

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@ The rules for this file:
1414

1515

1616
-------------------------------------------------------------------------------
17-
??/??/?? IAlibay
17+
??/??/?? IAlibay, orbeckst, marinegor
1818

1919
* 2.11.0
2020

2121
Fixes
22-
23-
Enhancements
22+
* Fixes incorrect assignment of secondary structure to proline residues in
23+
DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913)
2424

2525
Changes
2626

package/MDAnalysis/analysis/dssp/dssp.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -112,27 +112,27 @@
112112
:inherited-members:
113113
114114
.. attribute:: results.dssp
115-
116-
Contains the time series of the DSSP assignment as a
115+
116+
Contains the time series of the DSSP assignment as a
117117
:class:`numpy.ndarray` array of shape ``(n_frames, n_residues)`` where each row
118-
contains the assigned secondary structure character for each residue (whose
118+
contains the assigned secondary structure character for each residue (whose
119119
corresponding resid is stored in :attr:`results.resids`). The three characters
120120
are ['H', 'E', '-'] and representi alpha-helix, sheet and loop, respectively.
121121
122122
.. attribute:: results.dssp_ndarray
123-
123+
124124
Contains the one-hot encoding of the time series of the DSSP assignment
125-
as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)``
125+
as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)``
126126
where for each residue the encoding is stored as ``(3,)`` shape
127-
:class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop
128-
('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2
127+
:class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop
128+
('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2
129129
represents sheet 'E'.
130130
131131
.. SeeAlso:: :func:`translate`
132-
132+
133133
134134
.. attribute:: results.resids
135-
135+
136136
A :class:`numpy.ndarray` of length ``n_residues`` that contains the residue IDs
137137
(resids) for the protein residues that were assigned a secondary structure.
138138
@@ -144,12 +144,14 @@
144144
.. autofunction:: translate
145145
"""
146146

147-
from typing import Union
147+
from typing import Optional, Union
148+
148149
import numpy as np
149-
from MDAnalysis import Universe, AtomGroup
150150

151+
from MDAnalysis import AtomGroup, Universe
152+
153+
from ...due import Doi, due
151154
from ..base import AnalysisBase, ResultsGroup
152-
from ...due import due, Doi
153155

154156
due.cite(
155157
Doi("10.1002/bip.360221211"),
@@ -163,17 +165,17 @@
163165

164166
try: # pragma: no cover
165167
from pydssp.pydssp_numpy import (
166-
assign,
167168
_get_hydrogen_atom_position,
169+
assign,
168170
)
169171

170172
HAS_PYDSSP = True
171173

172174
except ModuleNotFoundError:
173175
HAS_PYDSSP = False
174176
from .pydssp_numpy import (
175-
assign,
176177
_get_hydrogen_atom_position,
178+
assign,
177179
)
178180

179181

@@ -280,6 +282,11 @@ class DSSP(AnalysisBase):
280282
Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
281283
backends; use the new method :meth:`get_supported_backends` to see all
282284
supported backends.
285+
286+
.. versionchanged:: 2.10.0
287+
Change treatment of proline and follow pydssp 0.9.1 (prolines are now explicitly
288+
forbidden to participate in the hydrogen bond network). Previous version could yield
289+
wrong assignment of prolines.
283290
"""
284291

285292
_analysis_algorithm_is_parallelizable = True
@@ -315,6 +322,7 @@ def __init__(
315322
]
316323
for t in heavyatom_names
317324
}
325+
self._donor_mask: Optional[np.ndarray] = ag.residues.resnames != "PRO"
318326
self._hydrogens: list["AtomGroup"] = [
319327
res.atoms.select_atoms(f"name {hydrogen_name}")
320328
for res in ag.residues
@@ -391,7 +399,7 @@ def _get_coords(self) -> np.ndarray:
391399

392400
def _single_frame(self):
393401
coords = self._get_coords()
394-
dssp = assign(coords)
402+
dssp = assign(coords, donor_mask=self._donor_mask)
395403
self.results.dssp_ndarray.append(dssp)
396404

397405
def _conclude(self):

package/MDAnalysis/analysis/dssp/pydssp_numpy.py

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -74,20 +74,20 @@ def _unfold(a: np.ndarray, window: int, axis: int):
7474

7575

7676
def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
77-
"""Fills in hydrogen atoms positions if they are abscent, under the
77+
"""Fills in hydrogen atoms positions if they are absent, under the
7878
assumption that C-N-H and H-N-CA angles are perfect 120 degrees,
7979
and N-H bond length is 1.01 A.
8080
8181
Parameters
8282
----------
8383
coord : np.ndarray
84-
input coordinates in Angstrom, shape (n_atoms, 4, 3),
84+
input coordinates in Angstrom, shape (n_residues, 4, 3),
8585
where second axes corresponds to (N, CA, C, O) atom coordinates
8686
8787
Returns
8888
-------
8989
np.ndarray
90-
coordinates of additional hydrogens, shape (n_atoms-1, 3)
90+
coordinates of additional hydrogens, shape (n_residues-1, 3)
9191
9292
.. versionadded:: 2.8.0
9393
"""
@@ -118,6 +118,7 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
118118

119119
def get_hbond_map(
120120
coord: np.ndarray,
121+
donor_mask: np.ndarray = None,
121122
cutoff: float = DEFAULT_CUTOFF,
122123
margin: float = DEFAULT_MARGIN,
123124
return_e: bool = False,
@@ -128,8 +129,15 @@ def get_hbond_map(
128129
----------
129130
coord : np.ndarray
130131
input coordinates in either (n, 4, 3) or (n, 5, 3) shape
131-
(without or with hydrogens). If hydrogens are not present, then
132-
ideal positions (see :func:_get_hydrogen_atom_positions) are used.
132+
(without or with hydrogens respectively), where ``n`` is number of residues.
133+
If hydrogens are not present, then ideal positions (see :func:_get_hydrogen_atom_positions)
134+
are used.
135+
donor_mask : np.array
136+
Mask out any hydrogens that should not be considered (in particular HN
137+
in PRO). If ``None`` then all H will be used (behavior up to 2.10.0).
138+
139+
.. versionadded:: 2.10.0
140+
133141
cutoff : float, optional
134142
cutoff, by default DEFAULT_CUTOFF
135143
margin : float, optional
@@ -144,8 +152,12 @@ def get_hbond_map(
144152
145153
146154
.. versionadded:: 2.8.0
155+
156+
.. versionchanged:: 2.10.0
157+
Support masking of hydrogen donors via `donor_mask` (especially needed
158+
for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1.
147159
"""
148-
n_atoms, n_atom_types, _ = coord.shape
160+
n_residues, n_atom_types, _xyz = coord.shape
149161
assert n_atom_types in (
150162
4,
151163
5,
@@ -161,13 +173,13 @@ def get_hbond_map(
161173
"Number of atoms should be 4 (N,CA,C,O) or 5 (N,CA,C,O,H)"
162174
)
163175
# after this:
164-
# h.shape == (n_atoms, 3)
165-
# coord.shape == (n_atoms, 4, 3)
176+
# h.shape == (n_residues, 3)
177+
# coord.shape == (n_residues, 4, 3)
166178

167179
# distance matrix
168180
n_1, c_0, o_0 = coord[1:, 0], coord[0:-1, 2], coord[0:-1, 3]
169181

170-
n = n_atoms - 1
182+
n = n_residues - 1
171183
cmap = np.tile(c_0, (n, 1, 1))
172184
omap = np.tile(o_0, (n, 1, 1))
173185
nmap = np.tile(n_1, (1, 1, n)).reshape(n, n, 3)
@@ -191,18 +203,32 @@ def get_hbond_map(
191203
return e
192204

193205
# mask for local pairs (i,i), (i,i+1), (i,i+2)
194-
local_mask = ~np.eye(n_atoms, dtype=bool)
195-
local_mask *= ~np.diag(np.ones(n_atoms - 1, dtype=bool), k=-1)
196-
local_mask *= ~np.diag(np.ones(n_atoms - 2, dtype=bool), k=-2)
206+
local_mask = ~np.eye(n_residues, dtype=bool)
207+
local_mask *= ~np.diag(np.ones(n_residues - 1, dtype=bool), k=-1)
208+
local_mask *= ~np.diag(np.ones(n_residues - 2, dtype=bool), k=-2)
209+
# mask for donor H absence (Proline)
210+
donor_mask = (
211+
np.array(donor_mask).astype(float)
212+
if donor_mask is not None
213+
else np.ones(n_residues, dtype=float)
214+
)
215+
donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_residues))
197216
# hydrogen bond map (continuous value extension of original definition)
198217
hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin)
199218
hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2
200-
hbond_map = hbond_map * local_mask
219+
220+
assert hbond_map.shape == local_mask.shape == donor_mask.shape
221+
222+
hbond_map *= local_mask
223+
hbond_map *= donor_mask
201224

202225
return hbond_map
203226

204227

205-
def assign(coord: np.ndarray) -> np.ndarray:
228+
def assign(
229+
coord: np.ndarray,
230+
donor_mask: np.ndarray = None,
231+
) -> np.ndarray:
206232
"""Assigns secondary structure for a given coordinate array,
207233
either with or without assigned hydrogens
208234
@@ -214,6 +240,12 @@ def assign(coord: np.ndarray) -> np.ndarray:
214240
(N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates
215241
(when k=5).
216242
243+
donor_mask : np.array
244+
Mask out any hydrogens that should not be considered (in particular HN
245+
in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).
246+
247+
.. versionadded:: 2.10.0
248+
217249
Returns
218250
-------
219251
np.ndarray
@@ -222,9 +254,13 @@ def assign(coord: np.ndarray) -> np.ndarray:
222254
223255
224256
.. versionadded:: 2.8.0
257+
258+
.. versionchanged:: 2.10.0
259+
Support masking of donors.
260+
225261
"""
226262
# get hydrogen bond map
227-
hbmap = get_hbond_map(coord)
263+
hbmap = get_hbond_map(coord, donor_mask=donor_mask)
228264
hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form
229265

230266
# identify turn 3, 4, 5

testsuite/MDAnalysisTests/analysis/test_dssp.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
import pytest
21
import glob
3-
import MDAnalysis as mda
42

3+
import MDAnalysis as mda
4+
import pytest
55
from MDAnalysis.analysis.dssp import DSSP, translate
6+
67
from MDAnalysisTests.datafiles import DSSP as DSSP_FOLDER
78
from MDAnalysisTests.datafiles import TPR, XTC
89

@@ -32,8 +33,6 @@ def test_trajectory(client_DSSP):
3233
assert (
3334
first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"
3435
)
35-
protein = mda.Universe(TPR, XTC).select_atoms("protein")
36-
run = DSSP(protein).run(**client_DSSP, stop=10)
3736

3837

3938
def test_atomgroup(client_DSSP):
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
--------------HHHHHHHHH-------EEEEE--E------HHHHHHHHHHHHHHHHHH---HHHHHHHHHHHHHHHHHHH-----------EEEEEHHHHHHHHHHHHHHHH--------HHH----E-- 1eteA.pdb
1+
--------------HHHHHHHHH-------EEEEE--E------HHHHHHHHHHHHHHHHHH---HHHHHHHHHHHHHHHHHHH-----------EEEEEHHHHHHHHHHHHHHH---------HHH----E-- 1eteA.pdb
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
----E-----EEEEEEEEE------EEEEEE-------EEE-EEE------HHHHHHHHHHHHH-EEEEE--E-EEEEE----EEEEEEEEEE-EE----HHHHHH---EEEEEHHHHHHHH----HHHHH---- 2fvvA.pdb
1+
----E-----EEEEEEEEE------EEEEEE-------EE--EEE------HHHHHHHHHHHHH-EEEEE--E-EEEEE----EEEEEEEEEE-EE----HHHHHH---EEEEEHHHHHHHH----HHHHH---- 2fvvA.pdb
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
---HHHHHHHHHHHHH---HHHHHHHHHHHHHHHHHHHHHHHHH-HHHHHHHHHHHHHHHHHHHHHHHH-------HHHHHH-HHHHHHH---EEEEE-HHHHHHHHHHHH--HHHHHHHHHHHHHHHEEEEE- 2j49A.pdb
1+
---HHHHHHHHHHHHH------HHHHHHHHHHHHHHHHHHHHHH-HHHHHHHHHHHHHHHHHHHHHHHH-------HHHHHH-HHHHHHH---EEEEE-HHHHHHHHHHHH--HHHHHHHHHHHHHHHEEEEE- 2j49A.pdb
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
-------EEEEEE------EEEEEE----EHHHHHHHHHHHH-------EEEE--EE-----EHHHH------EEEEE- 3a4rA.pdb
1+
-------EEEEEE------EEEEEE-----HHHHHHHHHHHH-------EEEE--EE------HHHH------EEEEE- 3a4rA.pdb
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
-EE-----EEEE--------E-EEEEEE----E-EEEEEE---E---EE----EEEEEE------E-EEEEEE-----EEEEEE----EEEEE----EEEEEE-----EEE-EEEEEEE--EE-EEEEEEEE- 3aqgA.pdb
1+
-EE-----EEEE--------E-EEEEEE----E-EEEEEE---E---EE----EEEEEE------E-EEEEEE-----EEEEEE----EEEEE----EEEEE-------EE-EEEEEEE--EE-EEEEEEE-- 3aqgA.pdb
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
-----EEEE-----------EE-----EE-----HHHHHHHHHHH---EEEEE-----HHHHHHHHH----EEE-----HHHHHHHHHHHH---HHHEEEE---HHHHHHH----EEE------HHHH--------------HHHHHHHHH----HHHHHHH-- 3e8mA.pdb
1+
-----EEEE-----------EE-----EE-----HHHHHHHHHHH----EEEE-----HHHHHHHHH----EEE-----HHHHHHHHHHHH---HHHEEEE---HHHHHHH----EEE------HHHH--------------HHHHHHHHH----HHHHHHH-- 3e8mA.pdb

0 commit comments

Comments
 (0)