Add SMARTS selection (#2883)

Towards #2468 ## Work done in this PR Adds a new SMARTS based selection which uses the RDKit converter.
MDAnalysis · Aug 29, 2020 · b2b7bcb · b2b7bcb
1 parent a8227f0
commit b2b7bcb
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 2 deletions.
diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py
@@ -2737,6 +2737,10 @@ def select_atoms(self, sel, *othersel, **selgroups):
             record_type *record_type*
                 for selecting either ATOM or HETATM from PDB-like files.
                 e.g. ``select_atoms('name CA and not record_type HETATM')``
+            smarts *SMARTS-query*
+                select atoms using Daylight's SMARTS queries, e.g. ``smarts
+                [#7;R]`` to find nitrogen atoms in rings. Requires RDKit.
+                All matches (max 1000) are combined as a unique match
 
         **Boolean**
 
@@ -2883,6 +2887,8 @@ def select_atoms(self, sel, *othersel, **selgroups):
            equivalent ``global group`` selection.
            Removed flags affecting default behaviour for periodic selections;
            periodic are now on by default (as with default flags)
+        .. versionchanged:: 2.0.0
+            Added the *smarts* selection.
         """
 
         if not sel:

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
@@ -617,6 +617,60 @@ def apply(self, group):
         return group[group.aromaticities].unique
 
 
+class SmartsSelection(Selection):
+    """Select atoms based on SMARTS queries.
+
+    Uses RDKit to run the query and converts the result to MDAnalysis.
+    Supports chirality.
+    """
+    token = 'smarts'
+
+    def __init__(self, parser, tokens):
+        # The parser will add spaces around parentheses and then split the
+        # selection based on spaces to create the tokens
+        # If the input SMARTS query contained parentheses, the query will be
+        # split because of that and we need to reconstruct it
+        # We also need to keep the parentheses that are not part of the smarts
+        # query intact
+        pattern = []
+        counter = {"(": 0, ")": 0}
+        # loop until keyword but ignore parentheses as a keyword
+        while tokens[0] in ["(", ")"] or not is_keyword(tokens[0]):
+            # keep track of the number of open and closed parentheses
+            if tokens[0] in ["(", ")"]:
+                counter[tokens[0]] += 1
+                # if the char is a closing ")" but there's no corresponding
+                # open "(" then we've reached then end of the smarts query and
+                # the current token ")" is part of a grouping parenthesis
+                if tokens[0] == ")" and counter["("] < (counter[")"]):
+                    break
+            # add the token to the pattern and remove it from the tokens
+            val = tokens.popleft()
+            pattern.append(val)
+        self.pattern = "".join(pattern)
+
+    def apply(self, group):
+        try:
+            from rdkit import Chem
+        except ImportError:
+            raise ImportError("RDKit is required for SMARTS-based atom "
+                              "selection but it's not installed. Try "
+                              "installing it with \n"
+                              "conda install -c conda-forge rdkit")
+        pattern = Chem.MolFromSmarts(self.pattern)
+        if not pattern:
+            raise ValueError(f"{self.pattern!r} is not a valid SMARTS query")
+        mol = group.convert_to("RDKIT")
+        matches = mol.GetSubstructMatches(pattern, useChirality=True)
+        # convert rdkit indices to mdanalysis'
+        indices = [
+            mol.GetAtomWithIdx(idx).GetIntProp("_MDAnalysis_index")
+            for match in matches for idx in match]
+        # create boolean mask for atoms based on index
+        mask = np.in1d(range(group.n_atoms), np.unique(indices))
+        return group[mask]
+
+
 class ResidSelection(Selection):
     """Select atoms based on numerical fields
 

diff --git a/package/doc/sphinx/source/documentation_pages/selections.rst b/package/doc/sphinx/source/documentation_pages/selections.rst
@@ -99,6 +99,12 @@ moltype *molecule-type*
     select by molecule type, e.g. ``moltype Protein_A``. At the moment, only
     the TPR format defines the molecule type.
 
+smarts *SMARTS-query*
+    select atoms using Daylight's SMARTS queries, e.g. ``smarts [#7;R]`` to
+    find nitrogen atoms in rings. Requires RDKit. All matches (max 1000) are
+    combined as a unique match.
+
+
 Pattern matching
 ----------------
 

diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py
@@ -42,6 +42,7 @@
     TRZ_psf, TRZ,
     PDB_icodes,
     PDB_HOLE,
+    PDB_helix,
 )
 from MDAnalysisTests import make_Universe
 
@@ -526,17 +527,43 @@ def u(self):
         u = MDAnalysis.Universe.from_smiles(smi, addHs=False,
                                             generate_coordinates=False)
         return u
-
+
+    @pytest.fixture
+    def u2(self):
+        u = MDAnalysis.Universe.from_smiles("Nc1cc(C[C@H]([O-])C=O)c[nH]1")
+        return u
+
     @pytest.mark.parametrize("sel_str, n_atoms", [
         ("aromatic", 5),
         ("not aromatic", 1),
         ("type N and aromatic", 1),
         ("type C and aromatic", 4),
     ])
-    def test_selection(self, u, sel_str, n_atoms):
+    def test_aromatic_selection(self, u, sel_str, n_atoms):
         sel = u.select_atoms(sel_str)
         assert sel.n_atoms == n_atoms
 
+    @pytest.mark.parametrize("sel_str, indices", [
+        ("smarts n", [10]),
+        ("smarts [#7]", [0, 10]),
+        ("smarts a", [1, 2, 3, 9, 10]),
+        ("smarts c", [1, 2, 3, 9]),
+        ("smarts [*-]", [6]),
+        ("smarts [$([!#1]);$([!R][R])]", [0, 4]),
+        ("smarts [$([C@H](-[CH2])(-[O-])-C=O)]", [5]),
+        ("smarts [$([C@@H](-[CH2])(-[O-])-C=O)]", []),
+        ("smarts a and type C", [1, 2, 3, 9]),
+        ("(smarts a) and (type C)", [1, 2, 3, 9]),
+        ("smarts a and type N", [10]),
+    ])
+    def test_smarts_selection(self, u2, sel_str, indices):
+        sel = u2.select_atoms(sel_str)
+        assert_equal(sel.indices, indices)
+
+    def test_invalid_smarts_sel_raises_error(self, u2):
+        with pytest.raises(ValueError, match="not a valid SMARTS"):
+            u2.select_atoms("smarts foo")
+
 
 class TestSelectionsNucleicAcids(object):
     @pytest.fixture(scope='class')