NCBI has changed the formatting of the GenBank LOCUS line for release 127, released on 15 December. The change allows for larger sequences and longer locus names. It also allows tokenization-based parsing rather than column-based parsing. See section 1.4.1 ("LOCUS line format change : to accomodate longer names and sequences") in ftp://ftp.ncbi.nih.gov/genbank/release.notes/gb126.release.notes for details.
We thought about changing the parsing to handle only the new format. It would be nice to be able to just tokenize on white space and get rid of the old format, but that would have caused us, and presumably others, some maintenance headaches since the old files still exist. The approach we took was to support both formats. To handle this issue for our customers we've made the following change to the GenbankContext class in biojava/bio/seq/io/GenbankFormat.java. Any objections to us checking this change in so that others can use it too? Scott PS We haven't yet made the corresponding change to the writer, but there we would just support the new format, just like NCBI does. ----------------------------------------------------------------------- current BioJava code - private void processHeaderLine(String line) throws ParseException { if(line.startsWith(GenbankFormat.LOCUS_TAG)) { // the LOCUS line is a special case because it contains the // locus, size, molecule type, GenBank division, and the date // of last modification. this.saveSeqAnno(); StringTokenizer lineTokens = new StringTokenizer(line); headerTag = lineTokens.nextToken(); headerTagText = new StringBuffer(lineTokens.nextToken()); this.saveSeqAnno(); headerTag = GenbankFormat.SIZE_TAG; headerTagText = new StringBuffer(lineTokens.nextToken()); // read past 'bp' lineTokens.nextToken(); // At this point there are three optional fields, strand number, // type, and circularity. 
if(line.charAt(34) != ' ') { this.saveSeqAnno(); headerTag = GenbankFormat.STRAND_NUMBER_TAG; if(line.charAt(37) == ' ') headerTagText = new StringBuffer(lineTokens.nextToken()); else // Both STRAND and TYPE fields are in the token { String fields = lineTokens.nextToken(); headerTagText = new StringBuffer(fields.substring(0,3)); this.saveSeqAnno(); headerTag = GenbankFormat.TYPE_TAG; headerTagText = new StringBuffer(fields.substring(3)); } } else if(line.charAt(37) != ' ') { this.saveSeqAnno(); headerTag = GenbankFormat.TYPE_TAG;// Check this; may be under PROP headerTagText = new StringBuffer(lineTokens.nextToken()); } if(line.charAt(43) != ' ') { this.saveSeqAnno(); headerTag = GenbankFormat.CIRCULAR_TAG; headerTagText = new StringBuffer(lineTokens.nextToken()); } this.saveSeqAnno(); headerTag = GenbankFormat.DIVISION_TAG; // May be under PROP headerTagText = new StringBuffer(lineTokens.nextToken()); this.saveSeqAnno(); headerTag = GenbankFormat.DATE_TAG; headerTagText = new StringBuffer(lineTokens.nextToken()); } else if(line.startsWith(GenbankFormat.VERSION_TAG)) { // VERSION line is a special case because it contains both // the VERSION field and the GI number this.saveSeqAnno(); StringTokenizer lineTokens = new StringTokenizer(line); headerTag = lineTokens.nextToken(); headerTagText = new StringBuffer(lineTokens.nextToken()); String nextToken = lineTokens.nextToken(); if(nextToken.startsWith(GenbankFormat.GI_TAG)) { this.saveSeqAnno(); headerTag = GenbankFormat.GI_TAG; // Possibly should be UID? 
headerTagText = new StringBuffer(nextToken.substring(3)); } } else if (hasHeaderTag(line)) { // line has a header tag this.saveSeqAnno(); headerTag = line.substring(0, TAG_LENGTH).trim(); headerTagText = new StringBuffer(line.substring(TAG_LENGTH)); } else { // keep appending tag text value headerTagText.append(" " + line.substring(TAG_LENGTH)); } } ----------------------------------------------------------------------- modified code - private void processHeaderLine(String line) throws ParseException { if(line.startsWith(GenbankFormat.LOCUS_TAG)) { // Genbank changed the format of the Locus line for release 127. // The new format is incompatible with the old. if(this.isLocusLinePre127(line)) { this.parseLocusLinePre127(line); } else { this.parseLocusLinePost127(line); } } else if (line.startsWith(GenbankFormat.VERSION_TAG)) { // VERSION line is a special case because it contains both // the VERSION field and the GI number this.saveSeqAnno(); StringTokenizer lineTokens = new StringTokenizer(line); headerTag = lineTokens.nextToken(); headerTagText = new StringBuffer(lineTokens.nextToken()); if (lineTokens.hasMoreTokens()) { String nextToken = lineTokens.nextToken(); if(nextToken.startsWith(GenbankFormat.GI_TAG)) { this.saveSeqAnno(); headerTag = GenbankFormat.GI_TAG; // Possibly should be UID? headerTagText = new StringBuffer(nextToken.substring(3)); } } } else if (hasHeaderTag(line)) { // line has a header tag this.saveSeqAnno(); headerTag = line.substring(0, TAG_LENGTH).trim(); headerTagText = new StringBuffer(line.substring(TAG_LENGTH)); } // gbpri1.seq (Release 125.0) has a line which is not // TAG_LENGTH long. Patch offered by Ron Kuhn ([EMAIL PROTECTED]) else if (line.length() >= TAG_LENGTH) { // keep appending tag text value headerTagText.append(" " + line.substring(TAG_LENGTH)); } } /** * Checks which version of the locus line format is used. The algorithm * switches on the size of the line; <75 means pre-127, otherwise it's 127. 
* * @param theLine the line to check the format of. * @return TRUE if the line is in Genbank release 126 or earlier format. * FALSE otherwise */ private boolean isLocusLinePre127(String theLine) { return (theLine.length() < 75); } /** * Parses the locus line assuming it is in pre release 127 format. * * @param theLine Locus line to parse. * @throws ParseException If the line is too short. */ private void parseLocusLinePre127(String theLine) throws ParseException { if (theLine.length() < 73) { throw new ParseException("LOCUS line too short [" + theLine + "]"); } saveSeqAnno2(GenbankFormat.LOCUS_TAG, theLine.substring(12, 22)); saveSeqAnno2(GenbankFormat.SIZE_TAG, theLine.substring(22, 29)); saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, theLine.substring(33, 35)); saveSeqAnno2(GenbankFormat.TYPE_TAG, theLine.substring(36, 41)); saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, theLine.substring(42, 52)); saveSeqAnno2(GenbankFormat.DIVISION_TAG, theLine.substring(52, 55)); saveSeqAnno2(GenbankFormat.DATE_TAG, theLine.substring(62, 73)); } /** * Parses the locus line assuming it is in post release 127 format. * * @param theLine Locus line to parse. * @throws ParseException If the line is too short. */ private void parseLocusLinePost127(String theLine) throws ParseException { if (theLine.length() < 79) { throw new ParseException("LOCUS line too short [" + theLine + "]"); } StringTokenizer locusTokens = new StringTokenizer(theLine); if(locusTokens.countTokens() != 8) { throw new ParseException("LOCUS line incorrectly tokenized [" + theLine + "]"); } // LOCUS tag; not stored locusTokens.nextToken(); // Locus name saveSeqAnno2(GenbankFormat.LOCUS_TAG, locusTokens.nextToken()); // Sequence length saveSeqAnno2(GenbankFormat.SIZE_TAG, locusTokens.nextToken()); // "bp"; not stored locusTokens.nextToken(); // Strand information // Both the strand and type are in the same token. 
The strand // information is an optional part, so this is a bit hairy String strandString = locusTokens.nextToken(); StringTokenizer strandTokens = new StringTokenizer(strandString, "-"); if(strandTokens.countTokens() > 1) { saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, strandTokens.nextToken()); } saveSeqAnno2(GenbankFormat.TYPE_TAG, strandTokens.nextToken()); // Circularity saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, locusTokens.nextToken()); // Division code saveSeqAnno2(GenbankFormat.DIVISION_TAG, locusTokens.nextToken()); // Date in dd-MMM-yyyy format saveSeqAnno2(GenbankFormat.DATE_TAG, locusTokens.nextToken()); } /** * Passes the tag and the text to the listener. * * @throws ParseException Thrown when an error occurs parsing the file */ private void saveSeqAnno() throws ParseException { if (!headerTag.equals("")) { // save tag and its text listener.addSequenceProperty(headerTag, headerTagText.toString()); headerTag = ""; headerTagText = new StringBuffer(""); } } /** * Private method to process a header tag and associated value. * * @param tag The tag to add * @param value The value of the associated tag * @throws ParseException Thrown when an error occurs parsing the file */ private void saveSeqAnno2(String tag, String value) throws ParseException { value = value.trim(); // strip whitespace if (value.length() > 0) { this.saveSeqAnno(); headerTag = tag; headerTagText = new StringBuffer(value); } } ----------------------------------------------------------------------- -- Scott Markel, Ph.D. NetGenics, Inc. [EMAIL PROTECTED] 4350 Executive Drive Tel: 858 455 5223 Suite 260 FAX: 858 455 1388 San Diego, CA 92121 _______________________________________________ Biojava-l mailing list - [EMAIL PROTECTED] http://biojava.org/mailman/listinfo/biojava-l