Simon, Thanks for trying out the new code. We'll check it in to the BioJava CVS repository.
Scott Simon Foote wrote: > > No objections here. > > I made the change, and ran my script that generates a blast database of > all the bacterial proteins in GenBank using the release 127 gbbct files > and it worked fine. Also, worked fine on all the daily update files > up to today. > > Only had to make a slight modification to my script to catch a few > records that, for some unexplained reason, didn't have a type (i.e. > DNA, RNA, etc.) in the LOCUS line, thus causing an incorrect number of > tokens exception. > > Aside from that it parsed all the files flawlessly. > > Cheers, > Simon Foote > > -- > Bioinformatics Specialist > Institute for Biological Sciences > National Research Council of Canada > [T] 613-991-4342 [F] 613-952-9092 > > Scott Markel wrote: > > >NCBI has changed the formatting of the GenBank LOCUS line for release > >127, released on 15 December. The change allows for larger sequences > >and longer locus names. It also allows a tokenization-based parsing > >rather than a column-based parsing. See section 1.4.1 ("LOCUS line > >format change : to accomodate longer names and sequences") in > >ftp://ftp.ncbi.nih.gov/genbank/release.notes/gb126.release.notes for > >details. > > > >We thought about changing the parsing to handle only the new format. It > >would be nice to be able to just tokenize on white space and get rid of > >the old format, but that would have caused us, and presumably others, > >some maintenance headaches since the old files still exist. The > >approach we took was to support both formats. > > > >To handle this issue for our customers we've made the following change > >to the GenbankContext class in biojava/bio/seq/io/GenbankFormat.java. > > > >Any objections to us checking this change in so that others can use it > >too? > > > >Scott > > > >PS We haven't yet made the corresponding change to the writer, but there > >we would just support the new format, just like NCBI does. 
> > > >----------------------------------------------------------------------- > > > >current BioJava code - > > > > private void processHeaderLine(String line) > > throws ParseException > > { > > if(line.startsWith(GenbankFormat.LOCUS_TAG)) > > { > > // the LOCUS line is a special case because it contains the > > // locus, size, molecule type, GenBank division, and the date > > // of last modification. > > this.saveSeqAnno(); > > StringTokenizer lineTokens = new StringTokenizer(line); > > headerTag = lineTokens.nextToken(); > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > > > this.saveSeqAnno(); > > headerTag = GenbankFormat.SIZE_TAG; > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > // read past 'bp' > > lineTokens.nextToken(); > > > > // At this point there are three optional fields, strand number, > > // type, and circularity. > > if(line.charAt(34) != ' ') > > { > > this.saveSeqAnno(); > > headerTag = GenbankFormat.STRAND_NUMBER_TAG; > > if(line.charAt(37) == ' ') > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > else // Both STRAND and TYPE fields are in the token > > { > > String fields = lineTokens.nextToken(); > > headerTagText = new StringBuffer(fields.substring(0,3)); > > > > this.saveSeqAnno(); > > headerTag = GenbankFormat.TYPE_TAG; > > headerTagText = new StringBuffer(fields.substring(3)); > > } > > } > > else > > if(line.charAt(37) != ' ') > > { > > this.saveSeqAnno(); > > headerTag = GenbankFormat.TYPE_TAG;// Check this; may be under PROP > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > } > > > > if(line.charAt(43) != ' ') > > { > > this.saveSeqAnno(); > > headerTag = GenbankFormat.CIRCULAR_TAG; > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > } > > > > this.saveSeqAnno(); > > headerTag = GenbankFormat.DIVISION_TAG; // May be under PROP > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > > > this.saveSeqAnno(); > > headerTag = GenbankFormat.DATE_TAG; > > 
headerTagText = new StringBuffer(lineTokens.nextToken()); > > } > > else if(line.startsWith(GenbankFormat.VERSION_TAG)) > > { > > // VERSION line is a special case because it contains both > > // the VERSION field and the GI number > > this.saveSeqAnno(); > > StringTokenizer lineTokens = new StringTokenizer(line); > > headerTag = lineTokens.nextToken(); > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > > > String nextToken = lineTokens.nextToken(); > > if(nextToken.startsWith(GenbankFormat.GI_TAG)) > > { > > this.saveSeqAnno(); > > headerTag = GenbankFormat.GI_TAG; // Possibly should be UID? > > headerTagText = > > new StringBuffer(nextToken.substring(3)); > > } > > } > > else if (hasHeaderTag(line)) > > { // line has a header tag > > this.saveSeqAnno(); > > headerTag = line.substring(0, TAG_LENGTH).trim(); > > headerTagText = new StringBuffer(line.substring(TAG_LENGTH)); > > } > > else > > { // keep appending tag text value > > headerTagText.append(" " + line.substring(TAG_LENGTH)); > > } > > } > > > >----------------------------------------------------------------------- > > > >modified code - > > > > private void processHeaderLine(String line) > > throws ParseException > > { > > if(line.startsWith(GenbankFormat.LOCUS_TAG)) > > { > > // Genbank changed the format of the Locus line for release 127. > > // The new format is incompatible with the old. 
> > if(this.isLocusLinePre127(line)) > > { > > this.parseLocusLinePre127(line); > > } > > else > > { > > this.parseLocusLinePost127(line); > > } > > } > > else if (line.startsWith(GenbankFormat.VERSION_TAG)) > > { > > // VERSION line is a special case because it contains both > > // the VERSION field and the GI number > > this.saveSeqAnno(); > > StringTokenizer lineTokens = new StringTokenizer(line); > > headerTag = lineTokens.nextToken(); > > headerTagText = new StringBuffer(lineTokens.nextToken()); > > > > if (lineTokens.hasMoreTokens()) { > > String nextToken = lineTokens.nextToken(); > > if(nextToken.startsWith(GenbankFormat.GI_TAG)) > > { > > this.saveSeqAnno(); > > headerTag = GenbankFormat.GI_TAG; // Possibly should be UID? > > headerTagText = > > new StringBuffer(nextToken.substring(3)); > > } > > } > > } > > else if (hasHeaderTag(line)) > > { // line has a header tag > > this.saveSeqAnno(); > > headerTag = line.substring(0, TAG_LENGTH).trim(); > > headerTagText = new StringBuffer(line.substring(TAG_LENGTH)); > > } > > // gbpri1.seq (Release 125.0) has a line which is not > > // TAG_LENGTH long. Patch offered by Ron Kuhn ([EMAIL PROTECTED]) > > else if (line.length() >= TAG_LENGTH) > > { // keep appending tag text value > > headerTagText.append(" " + line.substring(TAG_LENGTH)); > > } > > } > > > > /** > > * Checks which version of the locus line format is used. The algorithm > > * switches on the size of the line; <75 means pre-127, otherwise it's 127. > > * > > * @param theLine the line to check the format of. > > * @return TRUE if the line is in Genbank release 126 or earlier format. > > * FALSE otherwise > > */ > > private boolean isLocusLinePre127(String theLine) > > { > > return (theLine.length() < 75); > > } > > > > /** > > * Parses the locus line assuming it is in pre release 127 format. > > * > > * @param theLine Locus line to parse. > > * @throws ParseException If the line is too short. 
> > */ > > private void parseLocusLinePre127(String theLine) > > throws ParseException > > { > > if (theLine.length() < 73) > > { > > throw new ParseException("LOCUS line too short [" + theLine > >+ "]"); > > } > > > > saveSeqAnno2(GenbankFormat.LOCUS_TAG, theLine.substring(12, 22)); > > saveSeqAnno2(GenbankFormat.SIZE_TAG, theLine.substring(22, 29)); > > saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, theLine.substring(33, > >35)); > > saveSeqAnno2(GenbankFormat.TYPE_TAG, theLine.substring(36, 41)); > > saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, theLine.substring(42, > >52)); > > saveSeqAnno2(GenbankFormat.DIVISION_TAG, theLine.substring(52, > >55)); > > saveSeqAnno2(GenbankFormat.DATE_TAG, theLine.substring(62, 73)); > > } > > > > /** > > * Parses the locus line assuming it is in post release 127 format. > > * > > * @param theLine Locus line to parse. > > * @throws ParseException If the line is too short. > > */ > > private void parseLocusLinePost127(String theLine) > > throws ParseException > > { > > if (theLine.length() < 79) > > { > > throw new ParseException("LOCUS line too short [" + theLine > >+ "]"); > > } > > > > StringTokenizer locusTokens = new StringTokenizer(theLine); > > if(locusTokens.countTokens() != 8) > > { > > throw new ParseException("LOCUS line incorrectly tokenized > >[" + theLine + "]"); > > } > > // LOCUS tag; not stored > > locusTokens.nextToken(); > > // Locus name > > saveSeqAnno2(GenbankFormat.LOCUS_TAG, locusTokens.nextToken()); > > // Sequence length > > saveSeqAnno2(GenbankFormat.SIZE_TAG, locusTokens.nextToken()); > > // "bp"; not stored > > locusTokens.nextToken(); > > // Strand information > > // Both the strand and type are in the same token. 
The strand > > // information is an optional part, so this is a bit hairy > > String strandString = locusTokens.nextToken(); > > StringTokenizer strandTokens = new StringTokenizer(strandString, > >"-"); > > if(strandTokens.countTokens() > 1) > > { > > saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, > >strandTokens.nextToken()); > > } > > saveSeqAnno2(GenbankFormat.TYPE_TAG, strandTokens.nextToken()); > > // Circularity > > saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, locusTokens.nextToken()); > > // Division code > > saveSeqAnno2(GenbankFormat.DIVISION_TAG, locusTokens.nextToken()); > > // Date in dd-MMM-yyyy format > > saveSeqAnno2(GenbankFormat.DATE_TAG, locusTokens.nextToken()); > > } > > > > /** > > * Passes the tag and the text to the listener. > > * > > * @throws ParseException Thrown when an error occurs parsing the file > > */ > > private void saveSeqAnno() > > throws ParseException > > { > > if (!headerTag.equals("")) > > { // save tag and its text > > listener.addSequenceProperty(headerTag, headerTagText.toString()); > > headerTag = ""; > > headerTagText = new StringBuffer(""); > > } > > } > > > > /** > > * Private method to process a header tag and associated value. > > * > > * @param tag The tag to add > > * @param value The value of the associated tag > > * @throws ParseException Thrown when an error occurs parsing the file > > */ > > private void saveSeqAnno2(String tag, String value) > > throws ParseException > > { > > value = value.trim(); // strip whitespace > > if (value.length() > 0) { > > this.saveSeqAnno(); > > headerTag = tag; > > headerTagText = new StringBuffer(value); > > } > > } > > > >----------------------------------------------------------------------- > > -- Scott Markel, Ph.D. NetGenics, Inc. [EMAIL PROTECTED] 4350 Executive Drive Tel: 858 455 5223 Suite 260 FAX: 858 455 1388 San Diego, CA 92121 _______________________________________________ Biojava-l mailing list - [EMAIL PROTECTED] http://biojava.org/mailman/listinfo/biojava-l