Hello,I just implemented the Needleman-Wunsch-Algorithm and an object for handling substitution matrices like BLOSSUM, PAM, GONNET and so on. It parses a matrix file and provides a method to get the costs for changing Symbol A to Symbol B. This is realized by two hashes and a private int[][] matrix. If NeedlemanWunsch gets equal weights for gap opening and gap extension it won't consider affine gap penalties, whereas otherwise it will, which needs three times more memory.
Andreas Dräger
/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ /* * Created on 01.08.2005 * */ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.StringTokenizer; import org.biojava.bio.BioException; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.IllegalAlphabetException; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.bio.symbol.Symbol; /** This object is able to read a substitution matrix file and constructs an int matrix to * store the matrix. Every single element of the matrix can be accessed by the method * <code>getValueAt</code> with the parameters being tow biojava symbols. This it why it is * not necessary to access the matrix directly. If there is no value for the two specified * <code>Symbol</code>s an <code>Exception</code> is thrown. * * @author Andreas Dräger * */ public class SubstitutionMatrix { protected Map rowSymbols, colSymbols; protected int[][] matrix; protected int min, max; protected FiniteAlphabet alphabet; protected String description, name; /** This constructs a <code>SubstitutionMatrix</code>-object that contains two * <code>Map</code> data structures having biojava symbols as key and the value * being the index of the matrix containing the substitution score. * * @param alpha the alphabet of the matrix (DNA, RNA or PROTEIN, or PROTEIN-TERM) * @param matrixFile the file containing the substitution matrix. Lines starting with * '#' are comments. The line starting with a white space, is the table head. * Every line has to start with the one letter representation of the Symbol and then * the values for the exchange. * * @throws IOException * @throws BioException */ public SubstitutionMatrix(FiniteAlphabet alpha, File matrixFile) throws IOException, BioException { this.alphabet = alpha; this.description = ""; this.name = matrixFile.getName(); this.rowSymbols = new HashMap(); this.colSymbols = new HashMap(); String matString = ""; BufferedReader br = new BufferedReader(new FileReader(matrixFile)); while(br.ready()) matString += br.readLine() + "\n"; this.matrix = this.parseMatrix(matString); //this.printMatrix(); } /** With this constructor it is possible to construct a SubstitutionMatrix object from a * substitution matrix file. The given String contains a number of lines separated by * <code>\n</code>. Everything else is the same than for the constructor above. * * @param alpha Das FiniteAlphabet, das benutzt werden soll. * @param matrixString * @param name der Matrix. * @throws BioException */ public SubstitutionMatrix(FiniteAlphabet alpha, String matrixString, String name) throws BioException { this.alphabet = alpha; this.description = ""; this.name = name; this.rowSymbols = new HashMap(); this.colSymbols = new HashMap(); this.matrix = this.parseMatrix(matrixString); //this.printMatrix(); } /** Constructs a SubstitutionMatrix with every Match and every Replace having the same * expenses given by the parameters. Ambigious symbols are not considered because there * might be to many of them (for proteins). * * @param alpha * @param match * @param replace */ public SubstitutionMatrix(FiniteAlphabet alpha, int match, int replace) { int i=0, j=0; this.alphabet = alpha; this.description = "Identity matrix. All replaces and all matches are treated equally."; this.name = "IDENTITY_"+match+"_"+replace; this.rowSymbols = new HashMap(); this.colSymbols = new HashMap(); this.matrix = new int[alpha.size()][alpha.size()]; Symbol[] sym = new Symbol[alpha.size()]; Iterator iter = alpha.iterator(); for (i=0; iter.hasNext(); i++) { sym[i] = (Symbol) iter.next(); rowSymbols.put(sym[i], Integer.valueOf(i)); colSymbols.put(sym[i], Integer.valueOf(i)); } for(i=0; i<alphabet.size(); i++) for(j=0; j<alphabet.size(); j++) if (sym[i].getMatches().contains(sym[j])) matrix[i][j] = match; else matrix[i][j] = replace; //this.printMatrix(); } /** Reads a String representing the contents of a substitution matrix file. * * @param matString * @return matrix * @throws BioException */ protected int[][] parseMatrix(String matString) throws BioException { int i = 0, j = 0, rows = 0, cols = 0; StringTokenizer br, st; SymbolTokenization symtok = alphabet.getTokenization("token"); this.min = Integer.MAX_VALUE; this.max = Integer.MIN_VALUE; /* First: count how many elements are in the matrix * fill lines and rows */ br = new StringTokenizer(matString, "\n"); while (br.hasMoreElements()) { String line = br.nextElement().toString(); if (line.startsWith("#")) { description += line.substring(1); continue; } else if (line.startsWith(" ")) { st = new StringTokenizer(line, " "); for (j=0; st.hasMoreElements(); j++) { colSymbols.put(symtok.parseToken(st.nextElement().toString()), Integer.valueOf(j)); } cols = j; } else if (!line.startsWith("\n")) { // the matrix. st = new StringTokenizer(line, " "); if (st.hasMoreElements()) rowSymbols.put(symtok.parseToken(st.nextElement().toString()), Integer.valueOf(rows++)); } } int[][] matrix = new int[rows][cols]; rows = 0; br = new StringTokenizer(matString, "\n"); /* Second reading. Fill the matrix. */ while (br.hasMoreElements()) { String line = br.nextElement().toString(); if (line.startsWith("#")) continue; else if (line.startsWith(" ")) continue; else if (!line.startsWith("\n")) { // lines: st = new StringTokenizer(line, " "); if (st.hasMoreElements()) st.nextElement(); // throw away Symbol at beginning. for (j=0; st.hasMoreElements(); j++) {// cols: matrix[rows][j] = Integer.parseInt(st.nextElement().toString()); if (matrix[rows][j] > max) max = matrix[rows][j]; // Maximum. if (matrix[rows][j] < min) min = matrix[rows][j]; // Minimum. } rows++; } } return matrix; } /** There are some substitution matrices containing more columns than lines. This * has to do with the ambigious symbols. Lines are always good, columns might not * contain the whole information. The matrix is supposed to be symmetric anyway, so * you can always set the ambigious symbol to be the first argument. * @param row Symbol of the line * @param col Symbol of the column * @return expenses for the exchange of symbol row and symbol col. * @throws BioException */ public int getValueAt(Symbol row, Symbol col) throws BioException { if ((!rowSymbols.containsKey(row)) || (!colSymbols.containsKey(col))) throw new BioException("No entry for the sybols "+row.getName()+" and "+col.getName()); return matrix[((Integer) rowSymbols.get(row)).intValue()][((Integer) colSymbols.get(col)).intValue()]; } /** * @return the comment of the matrix */ public String getDescription() { return description; } /** * @return the name of the matrix. */ public String getName() { return name; } /** * @return minimum of the matrix. */ public int getMin() { return min; } /** * * @return maximum of the matrix. */ public int getMax() { return max; } /** Sets the description to the given value. * * @param desc a description. This doesn't have to start with '#'. */ public void setDescription(String desc) { this.description = desc; } /** * @return the alphabet of this matrix. */ public FiniteAlphabet getAlphabet() { return alphabet; } /** * @return a string representation of this matrix without the description. */ public String stringnifyMatrix() { int i = 0; String matrixString = ""; Symbol[] colSyms = new Symbol[this.colSymbols.keySet().size()]; try { SymbolTokenization symtok = alphabet.getTokenization("default"); matrixString += " "; Iterator colKeys = colSymbols.keySet().iterator(); while (colKeys.hasNext()) { colSyms[i] = (Symbol) colKeys.next(); matrixString += symtok.tokenizeSymbol(colSyms[i++]).toUpperCase() + " "; } matrixString += "\n"; Iterator rowKeys = rowSymbols.keySet().iterator(); while (rowKeys.hasNext()) { Symbol rowSym = (Symbol) rowKeys.next(); matrixString += symtok.tokenizeSymbol(rowSym).toUpperCase() + " "; for (i=0; i<colSyms.length; i++) { matrixString += getValueAt(rowSym, colSyms[i]) + " "; } matrixString += "\n"; } } catch (BioException exc) { exc.printStackTrace(); } return matrixString; } /** * @return Gives a description with approximately 60 letters on every line * separated by <code>\n</code>. Every line starts with <code>#</code>. */ public String stringnifyDescription() { String desc = "", line = "# "; StringTokenizer st = new StringTokenizer(description, " "); while(st.hasMoreElements()) { line += st.nextElement().toString()+" "; if (line.length() >= 60) { desc += line + "\n"; if (st.hasMoreElements()) line = "# "; } else if (!st.hasMoreElements()) desc += line + "\n"; } return desc; } /** Overides the inherited method. * @return Gives a string representation of the SubstitutionMatrix. * This is a valid input for the constructor which needs a matrix string. This String * also contains the description of the matrix if there is one. */ public String toString() { String desc = "", line = "# "; StringTokenizer st = new StringTokenizer(description, " "); while(st.hasMoreElements()) { line += st.nextElement().toString()+" "; if (line.length() >= 60) { desc += line + "\n"; if (st.hasMoreElements()) line = "# "; } else if (!st.hasMoreElements()) desc += line + "\n"; } return desc + stringnifyMatrix(); } /** Just to perform some test. It prints the matrix on the screen. */ public void printMatrix() { // Testausgabe: Iterator rowKeys = rowSymbols.keySet().iterator(); while(rowKeys.hasNext()) { Iterator colKeys = colSymbols.keySet().iterator(); Symbol rowSym = (Symbol) rowKeys.next(); System.out.print(rowSym.getName()+"\t"); while(colKeys.hasNext()) { Symbol colSym = (Symbol) colKeys.next(); int x = ((Integer) rowSymbols.get(rowSym)).intValue(); int y = ((Integer) colSymbols.get(colSym)).intValue(); System.out.print(colSym.getName()+" "+" "+x+" "+y+" "+matrix[x][y]+"\t"); } System.out.println("\n"); } System.out.println(toString()); } /** Just for tests. * @param args */ public static void main(String args[]) { try { new SubstitutionMatrix((FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN-TERM"), new File(args[0])); } catch (IllegalSymbolException exc) { exc.printStackTrace(); } catch (IOException exc) { exc.printStackTrace(); } catch (IllegalAlphabetException exc) { exc.printStackTrace(); } catch (BioException exc) { exc.printStackTrace(); } } }
/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import org.biojava.bio.BioException; import org.biojava.bio.seq.DNATools; import org.biojava.bio.seq.Sequence; import org.biojava.bio.seq.SequenceIterator; import org.biojava.bio.seq.db.SequenceDB; import org.biojava.bio.seq.impl.SimpleGappedSequence; import org.biojava.bio.seq.impl.SimpleSequence; import org.biojava.bio.seq.io.SeqIOTools; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.Alignment; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.SimpleAlignment; import org.biojava.bio.symbol.SimpleSymbolList; /* * Created on 23.06.2005 */ /** * @author Andreas Dräger */ public class NeedlemanWunsch { protected double[][] CostMatrix; protected FiniteAlphabet alpha; protected SubstitutionMatrix subMatrix; private Alignment pairalign; private double insert; private double delete; private double gapExt; private String alignment; private double match; private double replace; /** Just for some tests. * @param args */ public static void main (String args[]) { if (args.length < 3) throw new Error("Usage: NeedlemanWunsch sourceSeqFile targetSeqFile substitutionMatrixFile"); try { FiniteAlphabet alphabet = (FiniteAlphabet) AlphabetManager.alphabetForName("DNA"); SubstitutionMatrix matrix = new SubstitutionMatrix(alphabet, new File(args[2])); SequenceAlignment aligner = new NeedlemanWunsch( alphabet, 2, // insert 2, // delete 1, // gapExtend 0, // match 3, // replace matrix // SubstitutionMatrix ); aligner.alignAll( // sources SeqIOTools.readFastaDNA(new BufferedReader(new FileReader(args[0]))), // sequenceDB SeqIOTools.readFasta(new FileInputStream(new File(args[1])), DNATools.getDNA())); } catch (NoSuchElementException exc) { exc.printStackTrace(); } catch (FileNotFoundException exc) { exc.printStackTrace(); } catch (BioException exc) { exc.printStackTrace(); } catch (IOException exc) { exc.printStackTrace(); } catch (Exception exc) { exc.printStackTrace(); } } /** Constructs a new Object with the given parameters based on the Needleman-Wunsch algorithm * @param alpha * @param insert * @param delete * @param gapExtend * @param subMat */ public NeedlemanWunsch(FiniteAlphabet alpha, double insert, double delete, double gapExtend, double match, double replace, SubstitutionMatrix subMat) { this.alpha = alpha; this.subMatrix = subMat; this.alpha = subMatrix.getAlphabet(); this.insert = insert; this.delete = delete; this.gapExt = gapExtend; this.match = match; this.replace = replace; this.alignment = ""; } /** Prints a String representation of the CostMatrix for the given Alignment on the screen. * @param queryChar * @param targetChar * @return */ public static String printCostMatrix (double[][] CostMatrix, char[] queryChar, char[] targetChar) { int line, col; String output = "\t"; for (col=0; col<=targetChar.length; col++) if (col==0) output += "["+col+"]\t"; else output += "["+targetChar[col-1]+"]\t"; for (line=0; line<=queryChar.length; line++) { if (line==0) output += "\n["+line+"]\t"; else output += "\n["+queryChar[line-1]+"]\t"; for (col=0; col<=targetChar.length; col++) output += CostMatrix[line][col]+"\t"; } output += "\ndelta[Edit] = "+CostMatrix[line-1][col-1]+"\n"; return output; } /** Computes the optimal alignment of the two sequences and constructs a String * representing the alignment as well as a new Alignment object, which can be * used in different ways by BioJava. * * @param query * @param target * @return alignment string representation */ private String optimalAlignment(Sequence query, Sequence target) throws BioException { /*StringBuffer[] align = new StringBuffer[] {new StringBuffer(CostMatrix.length), new StringBuffer(CostMatrix[0].length)}; StringBuffer path = new StringBuffer("");//*/ String[] align = new String[] {"", ""}; String path = "";//*/ String output = ""; int j = this.CostMatrix[CostMatrix.length - 1].length -1; SymbolTokenization st = alpha.getTokenization("default"); for (int i = this.CostMatrix.length - 1; i>0; ) { do { // From now only Insert possible. if (i == 0) { align[0] = '~' + align[0]; // st.tokenizeSymbol(alpha.getGapSymbol()) + align[0]; // align[1] = st.tokenizeSymbol(target.symbolAt(j--)) + align[1]; path = ' ' + path;//*/ // From now only Delete possible. } else if (j == 0) { align[0] = st.tokenizeSymbol(query.symbolAt(i--)) + align[0]; align[1] = '~' + align[1]; // st.tokenizeSymbol(alpha.getGapSymbol()) + align[1]; // path = ' ' + path;//*/ // Match/Replace } else if (CostMatrix[i-1][j-1] == min(CostMatrix[i][j-1], CostMatrix[i-1][j-1], CostMatrix[i-1][j])) { align[0] = st.tokenizeSymbol(query.symbolAt(i--)) + align[0]; align[1] = st.tokenizeSymbol(target.symbolAt(j--)) + align[1];//*/ if (query.symbolAt(i+1) == target.symbolAt(j+1)) path = '|' + path;//*/ else path = ' ' + path;//*/ // Insert } else if ((CostMatrix[i][j-1] < CostMatrix[i-1][j-1]) && (CostMatrix[i][j-1] < CostMatrix[i-1][j])) { align[0] = '-' + align[0]; // st.tokenizeSymbol(alpha.getGapSymbol()) + align[0]; // align[1] = st.tokenizeSymbol(target.symbolAt(j--)) + align[1]; path = ' ' + path;//*/ // Delete } else { align[0] = st.tokenizeSymbol(query.symbolAt(i--)) + align[0]; align[1] = '-' + align[1]; // st.tokenizeSymbol(alpha.getGapSymbol()) + align[1]; // path = ' ' + path;//*/ } } while (j>0); } // construct the biojava Alignment object: query = new SimpleGappedSequence( new SimpleSequence( new SimpleSymbolList(query.getAlphabet().getTokenization("token"), align[0]), query.getURN(), query.getName(), query.getAnnotation())); target = new SimpleGappedSequence( new SimpleSequence( new SimpleSymbolList(target.getAlphabet().getTokenization("token"), align[1]), target.getURN(), target.getName(), target.getAnnotation())); Map m = new HashMap(); m.put(query.getName(), query); m.put(target.getName(), target); pairalign = new SimpleAlignment(m); output += "Length:\t"+align[0].length()+"\n"; output += "Score:\t\t"+(-1)*getEditDistance()+"\n\n"; int currline = Math.min(60, align[0].length()); // counts the absoulute position in the String. output += "\nQuery:\t" + align[0].substring(0, currline); output += "\n\t" + path.substring(0, currline); output += "\nTarget:\t" + align[1].substring(0, currline)+"\n"; for (; currline+60 < align[0].length(); currline+=60) { output += "\nQuery:\t" + align[0].substring(currline, currline+60); output += "\n\t" + path.substring(currline, currline+60); output += "\nTarget:\t" + align[1].substring(currline, currline+60)+"\n"; } if (currline+1 < align[0].length()) { output += "\nQuery:\t" + align[0].substring(currline, align[0].length()); output += "\n\t" + path.substring(currline, path.length()); output += "\nTarget:\t" + align[1].substring(currline, align[1].length())+"\n"; } output += "\n\n"; return output; } /** prints the alignment String on the screen. * @param align */ public static void printAlignment(String align) { System.out.print(align); } /** * @return Alignment object containing the two gapped sequences constructed from query and target. * @throws Exception */ public Alignment getAlignment(Sequence query, Sequence target) throws Exception { pairwiseAlignment(query, target); return pairalign; } /** * @return returns the edit_distance computed with the given parameters. */ public double getEditDistance() { return CostMatrix[CostMatrix.length-1][CostMatrix[CostMatrix.length-1].length - 1]; } /** * @param x * @param y * @param z * @return Gives the minimum of three doubles */ protected static double min (double x, double y, double z) { if ((x < y) && (x < z)) return x; if (y < z) return y; return z; } /* (non-Javadoc) * @see toolbox.align.SequenceAlignment#getAlignment() */ public String getAlignmentString() throws BioException { return alignment; } /* (non-Javadoc) * @see toolbox.align.SequenceAlignment#alignAll(org.biojava.bio.seq.SequenceIterator, org.biojava.bio.seq.db.SequenceDB) */ public List alignAll(SequenceIterator source, SequenceDB subjectDB) throws NoSuchElementException, BioException { List l = new LinkedList(); while (source.hasNext()) { Sequence query = source.nextSequence(); // compare all the sequences of both sets. SequenceIterator target = subjectDB.sequenceIterator(); while (target.hasNext()) try { l.add(getAlignment(query, target.nextSequence())); //pairwiseAlignment(query, target.nextSequence()); } catch (Exception exc) { exc.printStackTrace(); } } return l; } /** Global pairwise sequence alginment of two BioJava-Sequence objects according to the * Needleman-Wunsch-algorithm. * * @see toolbox.align.SequenceAlignment#pairAlign(org.biojava.bio.seq.Sequence, org.biojava.bio.seq.Sequence) */ public double pairwiseAlignment(Sequence query, Sequence subject) { long time1 = System.currentTimeMillis(); int i, j; double matchReplace; this.CostMatrix = new double[query.length()+1][subject.length()+1]; // Matrix CostMatrix // construct the matrix: CostMatrix[0][0] = 0; /* If we want to have affine gap penalties, we have to initialise additional matrices: * If this is not necessary, we won't do that (because it's expensive). */ if ((gapExt != delete) || (gapExt != insert)) { double[][] E = new double[query.length()+1][subject.length()+1]; // Inserts double[][] F = new double[query.length()+1][subject.length()+1]; // Deletes double[][] G = new double[query.length()+1][subject.length()+1]; // Match/Replace G[0][0] = 0; E[0][0] = F[0][0] = Double.MAX_VALUE; for (i=1; i<=query.length(); i++) { CostMatrix[i][0] = CostMatrix[i-1][0] + delete; G[i][0] = E[i][0] = Double.MAX_VALUE; F[i][0] = delete; } for (j=1; j<=subject.length(); j++) { CostMatrix[0][j] = CostMatrix[0][j-1] + insert; G[0][j] = F[0][j] = Double.MAX_VALUE; E[0][j] = insert; } for (i=1; i<=query.length(); i++) for (j=1; j<=subject.length(); j++) { try { matchReplace = subMatrix.getValueAt(query.symbolAt(i), subject.symbolAt(j)); } catch (IndexOutOfBoundsException exc) { exc.printStackTrace(); } catch (BioException exc) { System.err.println(exc.getMessage()); } finally { if (query.symbolAt(i).getMatches().contains(subject.symbolAt(j)) || subject.symbolAt(j).getMatches().contains(query.symbolAt(i))) matchReplace = match; //0; else matchReplace = replace; //delete/2 + insert/2; } G[i][j] = CostMatrix[i-1][j-1] - matchReplace; E[i][j] = Math.min(E[i][j-1], CostMatrix[i][j-1] + insert) + gapExt; F[i][j] = Math.min(F[i-1][j], CostMatrix[i-1][j] + delete) + gapExt; CostMatrix[i][j] = min (E[i][j], F[i][j], G[i][j]); } } else { /* * No affine gap penalties, constant gap penalties, which is much faster and needs less memory. */ for (i=1; i<=query.length(); i++) CostMatrix[i][0] = CostMatrix[i-1][0] + delete; for (j=1; j<=subject.length(); j++) CostMatrix[0][j] = CostMatrix[0][j-1] + insert; for (i=1; i<=query.length(); i++) for (j=1; j<=subject.length(); j++) { try { matchReplace = subMatrix.getValueAt(query.symbolAt(i), subject.symbolAt(j)); } catch (IndexOutOfBoundsException exc) { exc.printStackTrace(); } catch (BioException exc) { System.err.println(exc.getMessage()); } finally { if (query.symbolAt(i).getMatches().contains(subject.symbolAt(j)) || subject.symbolAt(j).getMatches().contains(query.symbolAt(i))) matchReplace = match; //0; else matchReplace = replace; //delete/2 + insert/2; } CostMatrix[i][j] = min ( CostMatrix[i-1][j] + delete, CostMatrix[i][j-1] + insert, CostMatrix[i-1][j-1] - matchReplace); } } // Everything is the same from here: try { //this.printCostMatrix(query.seqString().toCharArray(), target.seqString().toCharArray()); // generate "String" with optimal Alignment. this.alignment += this.optimalAlignment(query, subject) + "\n"; // print the Alignment on the screen this.printAlignment(alignment); } catch (BioException exc) { exc.printStackTrace(); } time1 = System.currentTimeMillis() - time1; System.out.println("Zeit:\t"+time1); return getEditDistance(); } }
_______________________________________________ Biojava-l mailing list - Biojava-l@biojava.org http://biojava.org/mailman/listinfo/biojava-l