Hi,
Sorry for the late response!
As for point 2) (is there a wrapper for SequenceIO.fileToBiojava(..)?):
for one of my projects I've written a factory class which returns a
Sequence object from either a URI or a string. The formats taken into
account are EMBL, GenBank and SwissProt.
This project is still in progress and not fully tested, but so far this
code works with my sequences.
If it can help someone...
Franck
P.S. You can find the Java file attached.
package uk.ac.ebi.ftv;
import java.io.*;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.regex.Pattern;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.seq.io.SequenceBuilder;
import org.biojava.bio.BioException;
/**
* Project FTV : Feature Table Viewer
* F. Valentin - Jul 2005
* Copyright (c) European Bioinformatics Institute 2005
* <p/>
* $Header$
* Version : $Name$
* <p/>
* <p/>
* $Log$
*/
public abstract class SequenceFactory {
/* ----------------------- Class variables
--------------------------- */
// According to the documentation the first line of EMBL and SwissProt
files are
// defined as following :
// EMBL := ID \s+ <entryname> \s+ <dataclass>; \s+ [circular] \s+
<molecule>; \s+
// <division>; \s+ <seqlength> \s+ BP.
// <entryname> := \p{Alpha> \w+
// <dataclass> := standard
// <molecule> := .+ (should be the same as the value in the mol_type
qualifier).
// < division> := (PHG)|(CON)|... (see EMBL documentation)
// <seqlength> := \d+
//
------------------------------------------------------------------------------
// SwissProt := ID \s+ <entryname> \s+ <dataclass>; \s+ <type>;
<length> \s+ AA.
// <entryname> := \w{1,12}
// <dataclass := (STANDARD) | (PRELIMINARY)
// <type> := PRT
// <length> := \d+
//
------------------------------------------------------------------------------
// GenBank := LOCUS \s{7} <locusname> \s <length> \s bp \s
<strandtype><molecule>
// \s{2} <type_adn> \s <division> \s <date>
// <locusname> := \w ( (\w(?<=\w)) | (\s(?=\s)) ){11}
// <lentgth> := \s ( (\s(?<=\s)) | (\d (?=\d) ){4} \d
// <strandtype> := \s{3} ([sdm]s-)
// <molecule> := (NA\s) | ( (DNA) | (tRNA) | (rRNA) | (mRNA) | (uRNA)
| (snRNA) | (snoRNA)
// <type_adn> := (circular) | (linear \s \s)
// <division> := \w{3}
// <date> := // date format dd-MMM-yyyy
//
------------------------------------------------------------------------------
// DDBJ := the format seems to be the same as Genbank.
// TODO need to be confirmed.
//
// We don't strictly follow these definitions. The important point here
is to
// be able to distinguish the different formats. However, if new
formats are
// added it's important to adapt the tests to keep the choice
deterministic !
private static Pattern EMBL_PATTERN =
Pattern.compile("\\AID.+BP\\.\\s*$", Pattern.MULTILINE);
private static Pattern GENBANK_PATTERN =
Pattern.compile("\\ALOCUS.+\\d{4}\\s*$", Pattern.MULTILINE);
private static Pattern SWISSPROT_PATTERN =
Pattern.compile("\\AID.+AA\\.\\s*$", Pattern.MULTILINE);
/* ------------------------- Class methods
--------------------------- */
/**
* Create the biojava object Sequence according to the first line of
the string.
* @param st A string representing the sequence.
* @return the sequence object.
*/
private static Sequence createSequenceFromString(String st) throws
FtvUserException {
SequenceIterator iterator;
BufferedReader br = new BufferedReader(new StringReader(st));
Sequence sequence;
// If EMBL format
if (EMBL_PATTERN.matcher(st).find()) {
iterator = SeqIOTools.readEmbl(br);
}
// Genbank/DDBJ format
else if (GENBANK_PATTERN.matcher(st).find()) {
iterator = SeqIOTools.readGenbank(br);
}
// SwissProt format
else if (SWISSPROT_PATTERN.matcher(st).find()) {
iterator = SeqIOTools.readSwissprot(br);
}
else {
throw new
FtvUserException(FtvUtil.MSG_SEQ_FORMAT_UNKNOWN);
}
// We read only the first sequence from the iterator (we use an
iterator here because
// it's simpler than creating the Sequence object directly, see
StreamReader's
// implementation to see what's have to be done).
try {
return sequence = iterator.nextSequence();
} catch (BioException e) {
System.out.println("-------------------------");
e.getStackTrace();
System.out.println("-------------------------");
throw new FtvUserException("BioException : " +
e.getMessage());
}
}
/**
* Create a Sequence object according to the sort of string given as a
parameter :<br>
* The string can be :<br>
* - the sequence itself.<br>
* - an URI to the sequence.<br>
* eg.
http://www.ebi.ac.uk/cgibin/dbfetch?db=EMBL&id=j00021&forma=embl&style=raw<br>
* ftp://www.asite.fr/sequence.embl
* @param st string that represents a sequence.
* @return the sequence object.
*/
public static Sequence createSequence(String st) throws
FtvUserException, IOException {
StringBuffer sb_sequence = new StringBuffer();
String st_sequence;
BufferedReader in = null;
URL url = null;
String seq_line = null ;
// If the URL has no protocol defined, this is the sequence
itself.
// (See http://www.ietf.org/rfc/rfc2396.txt chap 3.1)
if (! st.matches("\\A\\w*(\\w|\\d|\\+|-|\\.):.+$")) {
st_sequence = new String(st);
}
else {
try {
url = new URL(st);
in = new BufferedReader(new
InputStreamReader(url.openStream()));
while ((seq_line = in.readLine()) != null) {
sb_sequence.append(seq_line).append("\n");
}
in.close();
st_sequence = new String(sb_sequence);
} catch (MalformedURLException e) {
throw new
FtvUserException(FtvUtil.MSG_PROTOCOL_UNKNOWN);
} catch (FileNotFoundException e) {
throw new
FtvUserException(FtvUtil.MSG_FILE_NOT_FOUND);
} catch (IOException e) {
throw e; //To change body of catch statement
use File | Settings | File Templates.
}
}
return createSequenceFromString(st_sequence);
}
}
_______________________________________________
Biojava-l mailing list - Biojava-l@biojava.org
http://biojava.org/mailman/listinfo/biojava-l