Skip to content

Commit

Permalink
Add SMARTS selection (#2883)
Browse files Browse the repository at this point in the history
Towards #2468 

## Work done in this PR
Adds a new SMARTS based selection which uses the RDKit converter.
  • Loading branch information
Cédric Bouysset authored Aug 29, 2020
1 parent a8227f0 commit b2b7bcb
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 2 deletions.
6 changes: 6 additions & 0 deletions package/MDAnalysis/core/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,6 +2737,10 @@ def select_atoms(self, sel, *othersel, **selgroups):
record_type *record_type*
for selecting either ATOM or HETATM from PDB-like files.
e.g. ``select_atoms('name CA and not record_type HETATM')``
smarts *SMARTS-query*
select atoms using Daylight's SMARTS queries, e.g. ``smarts
[#7;R]`` to find nitrogen atoms in rings. Requires RDKit.
All matches (max 1000) are combined as a unique match
**Boolean**
Expand Down Expand Up @@ -2883,6 +2887,8 @@ def select_atoms(self, sel, *othersel, **selgroups):
equivalent ``global group`` selection.
Removed flags affecting default behaviour for periodic selections;
periodic are now on by default (as with default flags)
.. versionchanged:: 2.0.0
Added the *smarts* selection.
"""

if not sel:
Expand Down
54 changes: 54 additions & 0 deletions package/MDAnalysis/core/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,60 @@ def apply(self, group):
return group[group.aromaticities].unique


class SmartsSelection(Selection):
"""Select atoms based on SMARTS queries.
Uses RDKit to run the query and converts the result to MDAnalysis.
Supports chirality.
"""
token = 'smarts'

def __init__(self, parser, tokens):
# The parser will add spaces around parentheses and then split the
# selection based on spaces to create the tokens
# If the input SMARTS query contained parentheses, the query will be
# split because of that and we need to reconstruct it
# We also need to keep the parentheses that are not part of the smarts
# query intact
pattern = []
counter = {"(": 0, ")": 0}
# loop until keyword but ignore parentheses as a keyword
while tokens[0] in ["(", ")"] or not is_keyword(tokens[0]):
# keep track of the number of open and closed parentheses
if tokens[0] in ["(", ")"]:
counter[tokens[0]] += 1
# if the char is a closing ")" but there's no corresponding
# open "(" then we've reached then end of the smarts query and
# the current token ")" is part of a grouping parenthesis
if tokens[0] == ")" and counter["("] < (counter[")"]):
break
# add the token to the pattern and remove it from the tokens
val = tokens.popleft()
pattern.append(val)
self.pattern = "".join(pattern)

def apply(self, group):
try:
from rdkit import Chem
except ImportError:
raise ImportError("RDKit is required for SMARTS-based atom "
"selection but it's not installed. Try "
"installing it with \n"
"conda install -c conda-forge rdkit")
pattern = Chem.MolFromSmarts(self.pattern)
if not pattern:
raise ValueError(f"{self.pattern!r} is not a valid SMARTS query")
mol = group.convert_to("RDKIT")
matches = mol.GetSubstructMatches(pattern, useChirality=True)
# convert rdkit indices to mdanalysis'
indices = [
mol.GetAtomWithIdx(idx).GetIntProp("_MDAnalysis_index")
for match in matches for idx in match]
# create boolean mask for atoms based on index
mask = np.in1d(range(group.n_atoms), np.unique(indices))
return group[mask]


class ResidSelection(Selection):
"""Select atoms based on numerical fields
Expand Down
6 changes: 6 additions & 0 deletions package/doc/sphinx/source/documentation_pages/selections.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ moltype *molecule-type*
select by molecule type, e.g. ``moltype Protein_A``. At the moment, only
the TPR format defines the molecule type.

smarts *SMARTS-query*
select atoms using Daylight's SMARTS queries, e.g. ``smarts [#7;R]`` to
find nitrogen atoms in rings. Requires RDKit. All matches (max 1000) are
combined as a unique match.


Pattern matching
----------------

Expand Down
31 changes: 29 additions & 2 deletions testsuite/MDAnalysisTests/core/test_atomselections.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
TRZ_psf, TRZ,
PDB_icodes,
PDB_HOLE,
PDB_helix,
)
from MDAnalysisTests import make_Universe

Expand Down Expand Up @@ -526,17 +527,43 @@ def u(self):
u = MDAnalysis.Universe.from_smiles(smi, addHs=False,
generate_coordinates=False)
return u


@pytest.fixture
def u2(self):
u = MDAnalysis.Universe.from_smiles("Nc1cc(C[C@H]([O-])C=O)c[nH]1")
return u

@pytest.mark.parametrize("sel_str, n_atoms", [
("aromatic", 5),
("not aromatic", 1),
("type N and aromatic", 1),
("type C and aromatic", 4),
])
def test_selection(self, u, sel_str, n_atoms):
def test_aromatic_selection(self, u, sel_str, n_atoms):
sel = u.select_atoms(sel_str)
assert sel.n_atoms == n_atoms

@pytest.mark.parametrize("sel_str, indices", [
("smarts n", [10]),
("smarts [#7]", [0, 10]),
("smarts a", [1, 2, 3, 9, 10]),
("smarts c", [1, 2, 3, 9]),
("smarts [*-]", [6]),
("smarts [$([!#1]);$([!R][R])]", [0, 4]),
("smarts [$([C@H](-[CH2])(-[O-])-C=O)]", [5]),
("smarts [$([C@@H](-[CH2])(-[O-])-C=O)]", []),
("smarts a and type C", [1, 2, 3, 9]),
("(smarts a) and (type C)", [1, 2, 3, 9]),
("smarts a and type N", [10]),
])
def test_smarts_selection(self, u2, sel_str, indices):
sel = u2.select_atoms(sel_str)
assert_equal(sel.indices, indices)

def test_invalid_smarts_sel_raises_error(self, u2):
with pytest.raises(ValueError, match="not a valid SMARTS"):
u2.select_atoms("smarts foo")


class TestSelectionsNucleicAcids(object):
@pytest.fixture(scope='class')
Expand Down

0 comments on commit b2b7bcb

Please sign in to comment.