Hi,

Sorry for the late response!
As for point 2) (is there a wrapper for SequenceIO.fileToBiojava(..)?): for one of my projects I've written a factory class which returns a Sequence object from either a URI or a string containing the entry itself. The formats taken into account are EMBL, GenBank and SwissProt. This project is still ongoing and the code is not fully tested, but so far it works with my sequences.
If it can help someone...

Franck

P.S. You can find the Java file attached.
package uk.ac.ebi.ftv;

import java.io.*;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.regex.Pattern;

import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.seq.io.SequenceBuilder;
import org.biojava.bio.BioException;

/**
 * Project FTV : Feature Table Viewer
 * F. Valentin - Jul 2005
 * Copyright (c) European Bioinformatics Institute 2005
 * <p/>
 * $Header$
 * Version : $Name$
 * <p/>
 * <p/>
 * $Log$
 */
public abstract class SequenceFactory {

        /* ----------------------- Class variables    
--------------------------- */

        // According to the documentation the first line of EMBL and SwissProt 
files are
        // defined as following :
        // EMBL := ID \s+ <entryname> \s+ <dataclass>; \s+ [circular] \s+ 
<molecule>; \s+
        //                <division>; \s+ <seqlength> \s+ BP.
        // <entryname> := \p{Alpha> \w+
        // <dataclass> := standard
        // <molecule>  := .+  (should be the same as the value in the mol_type 
qualifier).
        // < division> := (PHG)|(CON)|... (see EMBL documentation)
        // <seqlength> := \d+
        // 
------------------------------------------------------------------------------
        // SwissProt := ID \s+ <entryname> \s+ <dataclass>; \s+ <type>; 
<length> \s+ AA.
        // <entryname> := \w{1,12}
        // <dataclass  := (STANDARD) | (PRELIMINARY)
        // <type>      := PRT
        // <length>    := \d+
        // 
------------------------------------------------------------------------------
        // GenBank := LOCUS \s{7} <locusname> \s <length> \s bp \s 
<strandtype><molecule>
        //            \s{2} <type_adn> \s <division> \s <date>
        // <locusname>  := \w ( (\w(?<=\w)) | (\s(?=\s)) ){11}
        // <lentgth>    := \s ( (\s(?<=\s)) | (\d (?=\d) ){4} \d
        // <strandtype> := \s{3} ([sdm]s-)
        // <molecule>   := (NA\s) | ( (DNA) | (tRNA) | (rRNA) | (mRNA) | (uRNA) 
| (snRNA) | (snoRNA)
        // <type_adn>   := (circular) | (linear \s \s)
        // <division>   := \w{3}
        // <date>       := // date format dd-MMM-yyyy
        // 
------------------------------------------------------------------------------
        // DDBJ := the format seems to be the same as Genbank.
        // TODO need to be confirmed.
        //
        // We don't strictly follow these definitions. The important point here 
is to
        // be able to distinguish the different formats. However, if new 
formats are
        // added it's important to adapt the tests to keep the choice 
deterministic !

        private static Pattern EMBL_PATTERN      = 
Pattern.compile("\\AID.+BP\\.\\s*$",     Pattern.MULTILINE);
        private static Pattern GENBANK_PATTERN   = 
Pattern.compile("\\ALOCUS.+\\d{4}\\s*$", Pattern.MULTILINE);
        private static Pattern SWISSPROT_PATTERN = 
Pattern.compile("\\AID.+AA\\.\\s*$",     Pattern.MULTILINE);

        /* ------------------------- Class methods    
--------------------------- */

        /**
         * Create the biojava object Sequence according to the first line of 
the string.
         * @param st A string representing the sequence.
         * @return the sequence object.
         */
        private static Sequence createSequenceFromString(String st) throws 
FtvUserException {
                SequenceIterator iterator;
                BufferedReader   br = new BufferedReader(new StringReader(st));
                Sequence         sequence;

                // If EMBL format
                if (EMBL_PATTERN.matcher(st).find()) {
                        iterator = SeqIOTools.readEmbl(br);
                }
                // Genbank/DDBJ format
                else if (GENBANK_PATTERN.matcher(st).find()) {
                        iterator = SeqIOTools.readGenbank(br);
                }
                // SwissProt format
                else if (SWISSPROT_PATTERN.matcher(st).find()) {
                        iterator = SeqIOTools.readSwissprot(br);
                }
                else {
                        throw new 
FtvUserException(FtvUtil.MSG_SEQ_FORMAT_UNKNOWN);
                }

                // We read only the first sequence from the iterator (we use an 
iterator here because
                // it's simpler than creating the Sequence object directly, see 
StreamReader's
                // implementation to see what's have to be done).
                try {
                        return sequence = iterator.nextSequence();
                } catch (BioException e) {
                        System.out.println("-------------------------");
                        e.getStackTrace();
                        System.out.println("-------------------------");
                        throw new FtvUserException("BioException : " + 
e.getMessage());

                }
        }

        /**
         * Create a Sequence object according to the sort of string given as a 
parameter :<br>
         * The string can be :<br>
         *    - the sequence itself.<br>
         *    - an URI to the sequence.<br>
         *        eg. 
http://www.ebi.ac.uk/cgibin/dbfetch?db=EMBL&id=j00021&forma=embl&style=raw<br>
         *            ftp://www.asite.fr/sequence.embl
         * @param st string that represents a sequence.
         * @return the sequence object.
         */
        public static Sequence createSequence(String st) throws 
FtvUserException, IOException {
                StringBuffer   sb_sequence = new StringBuffer();
                String         st_sequence;
                BufferedReader in       = null;
                URL            url      = null;
                String         seq_line = null ;

                // If the URL has no protocol defined, this is the sequence 
itself.
                // (See http://www.ietf.org/rfc/rfc2396.txt chap 3.1)
                if (! st.matches("\\A\\w*(\\w|\\d|\\+|-|\\.):.+$")) {
                        st_sequence = new String(st);
                }
                else {
                        try {
                                url = new URL(st);
                                in  = new BufferedReader(new 
InputStreamReader(url.openStream()));

                                while ((seq_line = in.readLine()) != null) {
                                        
sb_sequence.append(seq_line).append("\n");
                                }
                                in.close();
                                st_sequence = new String(sb_sequence);

                        } catch (MalformedURLException e) {
                                throw new 
FtvUserException(FtvUtil.MSG_PROTOCOL_UNKNOWN);
                        } catch (FileNotFoundException e)  {
                                throw new 
FtvUserException(FtvUtil.MSG_FILE_NOT_FOUND);
                        } catch (IOException e) {
                                throw e;  //To change body of catch statement 
use File | Settings | File Templates.
                        }
                }
                return createSequenceFromString(st_sequence);
        }
}
_______________________________________________
Biojava-l mailing list  -  Biojava-l@biojava.org
http://biojava.org/mailman/listinfo/biojava-l

Reply via email to