Author: bugman
Date: Wed Jan 21 15:36:43 2015
New Revision: 27257
URL: http://svn.gna.org/viewcvs/relax?rev=27257&view=rev
Log:
Modification of the Needleman-Wunsch sequence alignment algorithm
implementation.
This is in the lib.sequence_alignment.needleman_wunsch functions. Scoring
matrices are now
supported, as well as a user supplied non-integer gap penalty. The algorithm
for walking through
the traceback matrix has been fixed for a bug under certain conditions.
Modified:
trunk/lib/sequence_alignment/needleman_wunsch.py
Modified: trunk/lib/sequence_alignment/needleman_wunsch.py
URL:
http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/needleman_wunsch.py?rev=27257&r1=27256&r2=27257&view=diff
==============================================================================
--- trunk/lib/sequence_alignment/needleman_wunsch.py (original)
+++ trunk/lib/sequence_alignment/needleman_wunsch.py Wed Jan 21 15:36:43 2015
@@ -23,7 +23,7 @@
"""Functions for implementing the Needleman-Wunsch sequence alignment
algorithm."""
# Python module imports.
-from numpy import int16, zeros
+from numpy import float32, int16, zeros
# Default scores.
SCORE_MATCH = 1
@@ -37,18 +37,24 @@
TRACEBACK_LEFT = 2
-def needleman_wunsch_align(sequence1, sequence2):
+def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None,
sub_seq=None, gap_penalty=SCORE_GAP_PENALTY):
"""Align two sequences using the Needleman-Wunsch algorithm.
This is implemented as described in the U{Wikipedia article on the
Needleman-Wunsch algorithm
<https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}.
- @param sequence1: The first sequence.
- @type sequence1: str
- @param sequence2: The second sequence.
- @type sequence2: str
- @return: The two alignment strings and the gap matrix.
- @rtype: str, str, numpy rank-2 int array
+ @param sequence1: The first sequence.
+ @type sequence1: str
+ @param sequence2: The second sequence.
+ @type sequence2: str
+ @keyword sub_matrix: The substitution matrix to use to determine
the penalties.
+ @type sub_matrix: numpy rank-2 int array
+ @keyword sub_seq: The one letter code sequence corresponding to
the substitution matrix indices.
+ @type sub_seq: str
+ @keyword gap_penalty: The penalty for introducing gaps, as a
positive number.
+ @type gap_penalty: float
+ @return: The two alignment strings and the gap matrix.
+ @rtype: str, str, numpy rank-2 int array
"""
# The sequence lengths.
@@ -56,14 +62,18 @@
N = len(sequence2)
# Calculate the scoring and traceback matrices.
- matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2)
+ matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2,
sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty)
# Generate the alignment.
i = M - 1
j = N - 1
alignment1 = ""
alignment2 = ""
- while i >= 0 or j >= 0:
+ while 1:
+ # Termination.
+ if i < 0 or j < 0:
+ break
+
# Diagonal.
if traceback_matrix[i, j] == TRACEBACK_DIAG:
alignment1 += sequence1[i]
@@ -99,23 +109,29 @@
return align1, align2, gaps
-def needleman_wunsch_matrix(sequence1, sequence2):
+def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None,
sub_seq=None, gap_penalty=SCORE_GAP_PENALTY):
"""Construct the Needleman-Wunsch matrix for the given two sequences.
- @param sequence1: The first sequence.
- @type sequence1: str
- @param sequence2: The second sequence.
- @type sequence2: str
- @return: The Needleman-Wunsch matrix and traceback matrix.
- @rtype: numpy rank-2 int array, numpy rank-2 int array
+ @param sequence1: The first sequence.
+ @type sequence1: str
+ @param sequence2: The second sequence.
+ @type sequence2: str
+ @keyword sub_matrix: The substitution matrix to use to determine
the penalties.
+ @type sub_matrix: numpy rank-2 int16 array
+ @keyword sub_seq: The one letter code sequence corresponding to
the substitution matrix indices.
+ @type sub_seq: str
+ @keyword gap_penalty: The penalty for introducing gaps, as a
positive number.
+ @type gap_penalty: float
+ @return: The Needleman-Wunsch matrix and traceback
matrix.
+ @rtype: numpy rank-2 float32 array, numpy rank-2 int16
array
"""
# Initial scoring matrix.
- matrix = zeros((len(sequence1)+1, len(sequence2)+1), int16)
- for i in range(len(matrix)):
- matrix[i, 0] = -i
- for j in range(len(matrix[0])):
- matrix[0, j] = -j
+ matrix = zeros((len(sequence1)+1, len(sequence2)+1), float32)
+ for i in range(1, len(matrix)):
+ matrix[i, 0] = -gap_penalty*i
+ for j in range(1, len(matrix[0])):
+ matrix[0, j] = -gap_penalty*j
# Initial traceback matrix.
traceback_matrix = zeros((len(sequence1), len(sequence2)), int16)
@@ -123,19 +139,24 @@
# Calculate the scores.
for i in range(1, len(matrix)):
for j in range(1, len(matrix[0])):
- # Substitution scores.
- sub_score = SCORE_MISMATCH
- if sequence1[i-1] == sequence2[j-1]:
- sub_score = SCORE_MATCH
+ # Substitution scores when no matrix is provided.
+ if sub_matrix == None:
+ sub_score = SCORE_MISMATCH
+ if sequence1[i-1] == sequence2[j-1]:
+ sub_score = SCORE_MATCH
+
+ # Substitution scores from the matrix.
+ else:
+ sub_score = sub_matrix[sub_seq.index(sequence1[i-1]),
sub_seq.index(sequence2[j-1])]
# The diagonal score.
SCORES[0] = matrix[i-1, j-1] + sub_score
# The top score.
- SCORES[1] = matrix[i-1, j] + SCORE_GAP_PENALTY
+ SCORES[1] = matrix[i-1, j] - gap_penalty
# The left score.
- SCORES[2] = matrix[i, j-1] + SCORE_GAP_PENALTY
+ SCORES[2] = matrix[i, j-1] - gap_penalty
# Store the best score.
matrix[i, j] = SCORES.max()
_______________________________________________
relax (http://www.nmr-relax.com)
This is the relax-commits mailing list
[email protected]
To unsubscribe from this list, get a password
reminder, or change your subscription options,
visit the list information page at
https://mail.gna.org/listinfo/relax-commits