From 8d82688413d7a844aaac0da16c6b0b0b3d5fd9d4 Mon Sep 17 00:00:00 2001
From: Oliver Beckstein <orbeckst@gmail.com>
Date: Mon, 11 Jul 2016 01:18:17 -0700
Subject: [PATCH] test and doc updates for align.sequence_alignment

- added unit test
- turned **kwargs into kwargs with defaults
- updated docs
---
 package/MDAnalysis/analysis/align.py          | 51 +++++++++++--------
 .../MDAnalysisTests/analysis/test_align.py    | 40 ++++++++++-----
 2 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/package/MDAnalysis/analysis/align.py b/package/MDAnalysis/analysis/align.py
index 4b1c2c755a0..8cd58a4ae55 100644
--- a/package/MDAnalysis/analysis/align.py
+++ b/package/MDAnalysis/analysis/align.py
@@ -773,10 +773,12 @@ def rms_fit_trj(
     return filename
 
 
-def sequence_alignment(mobile, reference, **kwargs):
-    """Generate a global sequence alignment between residues in *reference* and *mobile*.
+def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1,
+                       gap_penalty=-2, gapextension_penalty=-0.1):
+    """Generate a global sequence alignment between two residue groups.
 
-    The global alignment uses the Needleman-Wunsch algorith as
+    The residues in `reference` and `mobile` will be globally aligned.
+    The global alignment uses the Needleman-Wunsch algorithm as
     implemented in :mod:`Bio.pairwise2`. The parameters of the dynamic
     programming algorithm can be tuned with the keywords. The defaults
     should be suitable for two similar sequences. For sequences with
@@ -789,39 +791,40 @@ def sequence_alignment(mobile, reference, **kwargs):
         Atom group to be aligned
     reference : AtomGroup
         Atom group to be aligned against
-    ** kwargs
-      *match_score*
-         score for matching residues, default :2
-      *mismatch_penalty*
+    match_score : float, optional, default 2
+         score for matching residues, default 2
+    mismatch_penalty : float, optional, default -1
          penalty for residues that do not match , default : -1
-      *gap_penalty*
+    gap_penalty : float, optional, default -2
          penalty for opening a gap; the high default value creates compact
          alignments for highly identical sequences but might not be suitable
          for sequences with low identity, default : -2
-      *gapextension_penalty*
+    gapextension_penalty : float, optional, default -0.1
          penalty for extending a gap, default: -0.1
 
     Returns
     -------
     aln[0]
-        Tuple of top sequence matching output ('Sequence A', 'Sequence B', score,
-        begin, end ,prev_pos, nex_pos)
+        Tuple of top sequence matching output `('Sequence A', 'Sequence B', score,
+        begin, end)`
 
-    BioPython documentation:
-        http://biopython.org/DIST/docs/api/Bio.pairwise2-module.html
+    See Also
+    --------
+    BioPython documentation for `pairwise2`_. Alternatively, use
+    :func:`fasta2select` with :program:`clustalw2` and the option
+    ``is_aligned=False``.
+
+    .. _`pairwise2`: http://biopython.org/DIST/docs/api/Bio.pairwise2-module.html
 
     .. versionadded:: 0.10.0
+
     """
     import Bio.pairwise2
-    kwargs.setdefault('match_score', 2)
-    kwargs.setdefault('mismatch_penalty', -1)
-    kwargs.setdefault('gap_penalty', -2)
-    kwargs.setdefault('gapextension_penalty', -0.1)
 
     aln = Bio.pairwise2.align.globalms(
-        reference.sequence(format="string"), mobile.sequence(format="string"),
-        kwargs['match_score'], kwargs['mismatch_penalty'],
-        kwargs['gap_penalty'], kwargs['gapextension_penalty'])
+        reference.residues.sequence(format="string"),
+        mobile.residues.sequence(format="string"),
+        match_score, mismatch_penalty, gap_penalty, gapextension_penalty)
     # choose top alignment
     return aln[0]
 
@@ -892,10 +895,16 @@ def fasta2select(fastafilename, is_aligned=False,
 
     Returns
     -------
-    select_dict
+    select_dict : dict
         dictionary with 'reference' and 'mobile' selection string
         that can be used immediately in :func:`rms_fit_trj` as
         ``select=select_dict``.
+
+    See Also
+    --------
+    :func:`sequence_alignment`, which does not require external
+    programs.
+
     """
     import Bio.SeqIO
     import Bio.AlignIO
diff --git a/testsuite/MDAnalysisTests/analysis/test_align.py b/testsuite/MDAnalysisTests/analysis/test_align.py
index dddccae33ab..303f1c86404 100644
--- a/testsuite/MDAnalysisTests/analysis/test_align.py
+++ b/testsuite/MDAnalysisTests/analysis/test_align.py
@@ -22,7 +22,7 @@
 
 from numpy.testing import (TestCase, dec,
                            assert_almost_equal, assert_raises, assert_equal,
-                           assert_)
+                           assert_array_equal, assert_)
 import numpy as np
 from nose.plugins.attrib import attr
 
@@ -190,14 +190,14 @@ def test_alignto_checks_selections(self):
         def different_size():
             a = u.atoms[10:100]
             b = u.atoms[10:101]
-            return MDAnalysis.analysis.align.alignto(a, b)
+            return align.alignto(a, b)
 
         assert_raises(SelectionError, different_size)
 
         def different_atoms():
             a = u.atoms[10:20]
             b = u.atoms[10:17] + u.atoms[18:21]
-            return MDAnalysis.analysis.align.alignto(a, b)
+            return align.alignto(a, b)
 
         assert_raises(SelectionError, different_atoms)
 
@@ -216,9 +216,7 @@ def tearDown(self):
     @attr('issue')
     def test_fasta2select_aligned(self):
         """test align.fasta2select() on aligned FASTA (Issue 112)"""
-        from MDAnalysis.analysis.align import fasta2select
-
-        sel = fasta2select(self.seq, is_aligned=True)
+        sel = align.fasta2select(self.seq, is_aligned=True)
         # length of the output strings, not residues or anything real...
         assert_equal(len(sel['reference']), 30623,
                      err_msg="selection string has unexpected length")
@@ -229,14 +227,10 @@ def test_fasta2select_aligned(self):
     @dec.skipif(executable_not_found("clustalw2"),
                 msg="Test skipped because clustalw2 executable not found")
     def test_fasta2select_ClustalW(self):
-        """MDAnalysis.analysis.align: test fasta2select() with calling
-        ClustalW (Issue 113)"""
-        # note: will not be run if clustalw is not installed
-        from MDAnalysis.analysis.align import fasta2select
-
-        sel = fasta2select(self.seq, is_aligned=False,
-                           alnfilename=self.alnfile,
-                           treefilename=self.treefile)
+        """MDAnalysis.analysis.align: test fasta2select() with ClustalW (Issue 113)"""
+        sel = align.fasta2select(self.seq, is_aligned=False,
+                                 alnfilename=self.alnfile,
+                                 treefilename=self.treefile)
         # numbers computed from alignment with clustalw 2.1 on Mac OS X
         # [orbeckst] length of the output strings, not residues or anything
         # real...
@@ -244,3 +238,21 @@ def test_fasta2select_ClustalW(self):
                      err_msg="selection string has unexpected length")
         assert_equal(len(sel['mobile']), 23090,
                      err_msg="selection string has unexpected length")
+
+def test_sequence_alignment():
+    u = MDAnalysis.Universe(PSF)
+    reference = u.atoms
+    mobile = u.select_atoms("resid 122-159")
+    aln = align.sequence_alignment(mobile, reference)
+
+    assert_equal(len(aln), 5, err_msg="return value has wrong tuple size")
+
+    seqA, seqB, score, begin, end = aln
+    assert_equal(seqA, reference.residues.sequence(format="string"),
+                 err_msg="reference sequence mismatch")
+    assert_(mobile.residues.sequence(format="string") in seqB,
+            "mobile sequence mismatch")
+    assert_almost_equal(score, 54.6)
+    assert_array_equal([begin, end], [0, reference.n_residues])
+
+