This is an automated email from the git hooks/post-receive script. malex-guest pushed a commit to branch master in repository bowtie2.
commit b224a1e79a213864732cf644ee347207713a78cc Author: Alexandre Mestiashvili <[email protected]> Date: Tue Mar 7 10:37:50 2017 +0100 New upstream version 2.3.1 --- MANUAL | 19 ++++ MANUAL.markdown | 40 +++++++ Makefile | 5 +- NEWS | 28 ++++- VERSION | 2 +- bowtie2 | 8 +- bowtie_main.cpp | 241 +++++++++++++++++++++++++++++++++++++++++-- bt2_idx.h | 6 +- bt2_search.cpp | 7 +- doc/manual.html | 24 +++++ doc/website/manual.ssi | 23 ++++- doc/website/old_news.ssi | 40 +++++++ doc/website/recent_news.ssi | 54 ++-------- doc/website/rhsidebar.ssi | 13 +-- formats.h | 1 + opts.h | 3 +- pat.cpp | 145 +++++++++++++++++--------- pat.h | 182 +++++++++++++++++++------------- pthreadGC2.dll | Bin 60273 -> 0 bytes read_qseq.cpp | 8 +- scoring.h | 4 +- scripts/test/simple_tests.pl | 27 +++-- scripts/test/simple_tests.sh | 3 +- 23 files changed, 676 insertions(+), 207 deletions(-) diff --git a/MANUAL b/MANUAL index 359bb68..b1395ac 100644 --- a/MANUAL +++ b/MANUAL @@ -863,6 +863,25 @@ Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTQ files. FASTQ files usually have extension `.fq` or `.fastq`. FASTQ is the default format. See also: `--solexa-quals` and `--int-quals`. + --interleaved + +Reads interleaved FASTQ files where the first two records (8 lines) +represent a mate pair. + + --tab5 + +Each read or pair is on a single line. An unpaired read line is +[name]\t[seq]\t[qual]\n. A paired-end read line is +[name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. An input file can be a +mix of unpaired and paired-end reads and Bowtie 2 recognizes each +according to the number of fields, handling each as it should. + + --tab6 + +Similar to `--tab5` except, for paired-end reads, the second end can have a +different name from the first: +[name1]\t[seq1]\t[qual1]\t[name2]\t[seq2]\t[qual2]\n + --qseq Reads (specified with `<m1>`, `<m2>`, `<s>`) are QSEQ files. QSEQ files usually diff --git a/MANUAL.markdown b/MANUAL.markdown index 2ec2828..7378b1a 100644 --- a/MANUAL.markdown +++ b/MANUAL.markdown @@ -921,6 +921,46 @@ usually have extension `.fq` or `.fastq`. FASTQ is the default format. See also: [`--solexa-quals`] and [`--int-quals`]. </td></tr> +<tr><td id="bowtie2-options-interleaved"> + +[`--interleaved`]: #bowtie2-options-interleaved + + --interleaved + +</td><td> + +Reads interleaved FASTQ files where the first two records (8 lines) +represent a mate pair. + +</td></tr> +<tr><td id="bowtie2-options-tab5"> + +[`--tab5`]: #bowtie2-options-tab5 + + --tab5 + +</td><td> + +Each read or pair is on a single line. An unpaired read line is +[name]\t[seq]\t[qual]\n. A paired-end read line is +[name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. An input file can be a +mix of unpaired and paired-end reads and Bowtie 2 recognizes each +according to the number of fields, handling each as it should. + +</td></tr> +<tr><td id="bowtie2-options-tab6"> + +[`--tab6`]: #bowtie2-options-tab6 + + --tab6 + +</td><td> + +Similar to [`--tab5`] except, for paired-end reads, the second end can have a +different name from the first: +[name1]\t[seq1]\t[qual1]\t[name2]\t[seq2]\t[qual2]\n + +</td></tr> <tr><td id="bowtie2-options-qseq"> [`--qseq`]: #bowtie2-options-qseq diff --git a/Makefile b/Makefile index f2aa557..cca55fb 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,7 @@ prefix = /usr/local bindir = $(prefix)/bin INC = +LIBS = -lreadline -ltermcap -lz GCC_PREFIX = $(shell dirname `which gcc`) GCC_SUFFIX = CC ?= $(GCC_PREFIX)/gcc$(GCC_SUFFIX) @@ -94,10 +95,10 @@ endif #default is to use Intel TBB ifneq (1,$(NO_TBB)) - LIBS = $(PTHREAD_LIB) -ltbb -ltbbmalloc_proxy + LIBS += $(PTHREAD_LIB) -ltbb -ltbbmalloc_proxy override EXTRA_FLAGS += -DWITH_TBB else - LIBS = $(PTHREAD_LIB) + LIBS += $(PTHREAD_LIB) endif SEARCH_LIBS = BUILD_LIBS = diff --git a/NEWS b/NEWS index c52d521..de7e1a6 100644 --- a/NEWS +++ b/NEWS @@ -3,7 +3,7 @@ Bowtie 2 NEWS Bowtie 2 is now available for download from the project website, http://bowtie-bio.sf.net/bowtie2. 2.0.0-beta1 is the first version released to -the public and 2.3.0 is the latest version. Bowtie 2 is licensed under +the public and 2.3.1 is the latest version. Bowtie 2 is licensed under the GPLv3 license. See `LICENSE' file for details. Reporting Issues @@ -16,6 +16,32 @@ Please report any issues using the Sourceforge bug tracker: Version Release History ======================= +Version 2.3.1 - Mar 04, 2017 +Please note that as of this release Bowtie 2 now has dependencies on zlib and readline libraries. +Make sure that all dependencies are met before attempting to build from source. + + * Added native support for gzipped read files. The wrapper + script is no longer responsible for decompression. This + simplifies the wrapper and improves speed and thread scalability + for gzipped inputs. + * Fixed a bug that caused `bowtie2-build` to crash when the + first FASTA sequence contains all Ns. + * Add support for interleaved FASTQ format (`-—interleaved`) + * Empty FASTQ inputs would yield an error in Bowtie 2 2.3.0, + whereas previous versions would simply align 0 reads and report + the SAM header as usual. This version returns to the pre-2.3.0 + behavior, resolving a compatibility issue between TopHat2 and + Bowtie 2 2.3.0. + * Fixed a bug whereby combining `-—un-conc*` with `-k` or `-a` + would cause `bowtie2` to print duplicate reads in one or both + of the `--un-conc*` output files, causing the ends to be + misaligned. + * The default `--score-min` for `--local` mode is now `G,20,8`. + That was the stated default in the documentation for a while, + but the actual default was `G,0,10` for many versions. Now the + default matches the documentation and, we find, yields more + accurate alignments than `G,0,10` + Version 2.3.0 - Dec 13, 2016 * Code related to read parsing was completely rewritten to improve scalability to many threads. In short, the critical section is diff --git a/VERSION b/VERSION index 276cbf9..2bf1c1c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.3.0 +2.3.1 diff --git a/bowtie2 b/bowtie2 index 12e3567..16a0175 100755 --- a/bowtie2 +++ b/bowtie2 @@ -217,6 +217,10 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) { last; } } + if ($arg =~ /(bz2|lz4)$/) { + push @bt2w_args, ("-U", $arg); + $bt2_args[$i] = undef; + } } # If the user asked us to redirect some reads to files, or to suppress # unaligned reads, then we need to capture the output from Bowtie 2 and pass it @@ -312,7 +316,7 @@ sub cat_file($$) { sub wrapInput($$$) { my ($unps, $mate1s, $mate2s) = @_; for my $fn (@$unps, @$mate1s, @$mate2s) { - return 1 if $fn =~ /\.gz$/ || $fn =~ /\.bz2$/ || $fn =~ /\.lz4$/; + return 1 if $fn =~ /\.bz2$/ || $fn =~ /\.lz4$/; } return 0; } @@ -546,7 +550,7 @@ if(defined($cap_out)) { my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i); my $unal = ($fl & 4) != 0; $filt = 1 if $no_unal && $unal; - if($passthru) { + if($passthru && ($fl & 256) == 0) { if(scalar(keys %read_fhs) == 0) { # Next line is read with some whitespace escaped my $l = <BT>; diff --git a/bowtie_main.cpp b/bowtie_main.cpp index 844c8ca..3e39acf 100644 --- a/bowtie_main.cpp +++ b/bowtie_main.cpp @@ -19,10 +19,23 @@ #include <iostream> #include <fstream> +#include <sstream> +#include <iomanip> #include <string.h> #include <stdlib.h> +#include <signal.h> +#include <unistd.h> #include "tokenize.h" -#include "ds.h" +#include <sys/stat.h> +#include <sys/types.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <vector> +#include <iterator> +#include <algorithm> +#include <readline/readline.h> +#include <readline/history.h> using namespace std; @@ -39,23 +52,237 @@ extern "C" { * per line, and will dispatch each batch of arguments one at a time to * bowtie. */ + +static volatile sig_atomic_t done = false; + +static const char *options[] = { +"--al", "--al-conc", "--dpad", "--end-to-end", +"--fast", "--fast-local", "--fr", "--gbar", +"--ignore-quals", "--int-quals", "--local", "--ma", +"--met", "--met-file", "--met-stderr", "--mm", +"--mp", "--n-ceil", "--no-1mm-upfront", "--no-contain", +"--no-discordant", "--no-dovetail", "--no-head", "--no-mixed", +"--no-overlap", "--no-sq", "--no-unal", "--nofw", +"--non-deterministic", "--norc", "--np", "--omit-sec-seq", +"--phred33", "--phred64", "--qc-filter", "--qseq", +"--quiet", "--rdg", "--reorder", "--rfg", +"--rg", "--rg-id", "--score-min", "--seed", +"--sensitive", "--sensitive-local", "--un", "--un-conc", +"--un-gz", "--version", "--very-fast", "--very-fast-local", +"--very-sensitive", "--very-sensitive-local", "-3", "-5", +"-D", "-I", "-L", "-N", +"-R", "-X", "-a", "-c", +"-f", "-h", "-i", "-k", +"-p", "-q", "-r", "-s", +"-t", "-u", "-1", "-2", +"-S", "-U", "--all", "--ff", +"--help", "--maxins", "--minins", "--rf", +"--skip", "--threads", "--time", "--trim3", +"--trim5", "--upto", NULL +}; + + +static bool isdirectory(const char *path) { + struct stat statbuf; + if(stat(path, &statbuf) != 0) { + perror("stat"); + return true; + } + return S_ISDIR(statbuf.st_mode); +} + +static char *optgen(const char *text, int state) { + static int list_index, len; + const char *name = NULL; + + if (!state) { + list_index = 0; + len = strlen(text); + } + + name = rl_filename_completion_function(text, state); + if (name != NULL) { + rl_completion_append_character = isdirectory(name) ? '/': ' '; + return strdup(name); + } + + if (text[0] == '-') { + while ((name = options[list_index++])) { + if (strncmp(name, text, len) == 0) { + return strdup(name); + } + } + } + return NULL; +} + +static char **optcomplete(const char *text, int start, int end) { + rl_attempted_completion_over = 1; + return rl_completion_matches(text, optgen); +} + +static void rlinit() { + rl_attempted_completion_function = optcomplete; +} + +static void handler(int sig) { + done = true; +} + +static int _getline(istream *in, char *buf, size_t len) { + if (in == &cin) { + char *input = readline("bowtie2> "); + if (!input) { + buf[0] = '\0'; + } + else { + strncpy(buf, input, len-1); + add_history(buf); + free(input); + } + } + else { + in->getline(buf, len-1); + if (!in->good()) + buf[0] = '\0'; + } + buf[len-1] = '\0'; + return strlen(buf); +} + +static int createfifo(vector<char *>& v) { + const char *pattern = "/tmp/btfifo.XX"; + char *fn = (char *)malloc(strlen(pattern)+1); + memset(fn, 0, strlen(pattern)+1); + strcpy(fn, pattern); + mktemp(fn); + + if (mkfifo(fn, S_IRUSR | S_IWUSR) == -1) { + perror("mkfifo"); + return 1; + } + v.push_back(fn); + return 0; +} + +void printargs(const char **args, size_t size) { + copy(args, args+size, ostream_iterator<string>(cout, " ")); + cout << endl; +} + +bool called_from_wrapper(int argc, const char **argv) { + if (argc > 2 && strcmp(argv[1], "--wrapper") == 0 + && strcmp(argv[2], "basic-0") == 0) + return true; + return false; +} + int main(int argc, const char **argv) { - if(argc > 2 && strcmp(argv[1], "-A") == 0) { - const char *file = argv[2]; + int offset = called_from_wrapper(argc, argv) ? 3 : 1; + + if(argc > offset + 1 && strcmp(argv[offset], "-A") == 0) { + const char *file = argv[offset+1]; ifstream in; - in.open(file); + istream *inptr = ∈ + if (strcmp(file, "-") == 0) { + inptr = &cin; + } + else { + in.open(file); + } char buf[4096]; int lastret = -1; - while(in.getline(buf, 4095)) { - EList<string> args; + + rlinit(); + while(_getline(inptr, buf, 4096)) { + done = false; + vector<string> args; args.push_back(string(argv[0])); + + if (offset > 1) { + args.push_back(string(argv[1])); + args.push_back(string(argv[2])); + } + tokenize(buf, " \t", args); const char **myargs = (const char**)malloc(sizeof(char*)*args.size()); + vector<char *> fifonames; + int sam_outfile_pos = -1; + for(size_t i = 0; i < args.size(); i++) { + if (args[i] == "_") { + if (i > 0 && args[i-1] == "-S") { + sam_outfile_pos = i; + } + else { + createfifo(fifonames); + args[i] = fifonames.back(); + } + } myargs[i] = args[i].c_str(); } if(args.size() == 1) continue; - lastret = bowtie((int)args.size(), myargs); + + if (fifonames.size() > 0) { + struct sigaction sa; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sa.sa_handler = handler; + + if (sigaction(SIGINT, &sa, NULL) == -1 + || signal(SIGPIPE, SIG_IGN) == SIG_ERR) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + struct pollfd *pollfds = (struct pollfd *)calloc(fifonames.size(), sizeof(struct pollfd)); + if (pollfds == NULL) { + perror("calloc"); + exit(EXIT_FAILURE); + } + + for (size_t i = 0; i < fifonames.size(); i++) { + pollfds[i].fd = open(fifonames[i], O_NONBLOCK | O_EXCL); + pollfds[i].events = POLLIN; + } + + printargs(myargs, args.size()); + + for (int count = 0;; count++) { + size_t r = poll(pollfds, fifonames.size(), -1); + // wait until all fifos are ready + if (r != fifonames.size()) { + if (done) + break; + continue; + } + if (sam_outfile_pos >= 0) { + ostringstream os; + os << "out" << getpid() << "_" << setfill('0') << setw(3) << count << ".sam"; + myargs[sam_outfile_pos] = os.str().c_str(); + printargs(myargs, args.size()); + } + + lastret = bowtie((int)args.size(), myargs); + + // replace the args shuffled by getopt + for (size_t i = 0; i < args.size(); i++) { + myargs[i] = args[i].c_str(); + } + } + + for (size_t i = 0; i < fifonames.size(); i++) { + if (close(pollfds[i].fd)) + perror("close"); + if (remove(fifonames[i])) + perror("remove"); + free(fifonames[i]); + } + free(pollfds); + } + else { + lastret = bowtie((int)args.size(), myargs); + } free(myargs); } if(lastret == -1) { diff --git a/bt2_idx.h b/bt2_idx.h index 0b63148..e2d5083 100644 --- a/bt2_idx.h +++ b/bt2_idx.h @@ -2595,9 +2595,11 @@ void Ebwt::joinToDisk( if(npat >= 0) { writeU<TIndexOffU>(out1, this->plen()[npat], this->toBe()); } - npat++; - this->plen()[npat] = (szs[i].len + szs[i].off); + this->plen()[++npat] = (szs[i].len + szs[i].off); } else { + // edge case, but we could get here with npat == -1 + // e.g. when building from a reference of all Ns + if (npat < 0) npat = 0; this->plen()[npat] += (szs[i].len + szs[i].off); } } diff --git a/bt2_search.cpp b/bt2_search.cpp index 783f544..e5e02c6 100644 --- a/bt2_search.cpp +++ b/bt2_search.cpp @@ -477,6 +477,7 @@ static struct option long_options[] = { {(char*)"12", required_argument, 0, ARG_ONETWO}, {(char*)"tab5", required_argument, 0, ARG_TAB5}, {(char*)"tab6", required_argument, 0, ARG_TAB6}, + {(char*)"interleaved", required_argument, 0, ARG_INTERLEAVED_FASTQ}, {(char*)"phred33-quals", no_argument, 0, ARG_PHRED33}, {(char*)"phred64-quals", no_argument, 0, ARG_PHRED64}, {(char*)"phred33", no_argument, 0, ARG_PHRED33}, @@ -685,6 +686,9 @@ static void printUsage(ostream& out) { << endl << " Input:" << endl << " -q query input files are FASTQ .fq/.fastq (default)" << endl + << " --interleaved query input files are interleaved paired-end FASTQ reads" << endl + << " --tab5 query input files are TAB5 .tab5" << endl + << " --tab6 query input files are TAB6 .tab6" << endl << " --qseq query input files are in Illumina's qseq format" << endl << " -f query input files are (multi-)FASTA .fa/.mfa" << endl << " -r query input files are raw one-sequence-per-line" << endl @@ -753,7 +757,7 @@ static void printUsage(ostream& out) { << " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)" << endl << " --no-mixed suppress unpaired alignments for paired reads" << endl << " --no-discordant suppress discordant alignments for paired reads" << endl - << " --no-dovetail not concordant when mates extend past each other" << endl + << " --dovetail concordant when mates extend past each other" << endl << " --no-contain not concordant when one mate alignment contains other" << endl << " --no-overlap not concordant when mates overlap at all" << endl << endl @@ -935,6 +939,7 @@ static void parseOption(int next_option, const char *arg) { case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break; case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break; case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break; + case ARG_INTERLEAVED_FASTQ: tokenize(arg, ",", mates12); format = INTERLEAVED; break; case 'f': format = FASTA; break; case 'F': { format = FASTA_CONT; diff --git a/doc/manual.html b/doc/manual.html index 0b2ba00..fade3a3 100644 --- a/doc/manual.html +++ b/doc/manual.html @@ -358,6 +358,30 @@ Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC</code></pr </td> </tr> <tr> +<td id="bowtie2-options-interleaved"> +<pre><code>--interleaved</code></pre> +</td> +<td> +<p>Reads interleaved FASTQ files where the first two records (8 lines) represent a mate pair.</p> +</td> +</tr> +<tr> +<td id="bowtie2-options-tab5"> +<pre><code>--tab5</code></pre> +</td> +<td> +<p>Each read or pair is on a single line. An unpaired read line is [name]. A paired-end read line is [name]. An input file can be a mix of unpaired and paired-end reads and Bowtie 2 recognizes each according to the number of fields, handling each as it should.</p> +</td> +</tr> +<tr> +<td id="bowtie2-options-tab6"> +<pre><code>--tab6</code></pre> +</td> +<td> +<p>Similar to <a href="#bowtie2-options-tab5"><code>--tab5</code></a> except, for paired-end reads, the second end can have a different name from the first: [name1]</p> +</td> +</tr> +<tr> <td id="bowtie2-options-qseq"> <pre><code>--qseq</code></pre> </td> diff --git a/doc/website/manual.ssi b/doc/website/manual.ssi index 038aa6a..7c50b2a 100644 --- a/doc/website/manual.ssi +++ b/doc/website/manual.ssi @@ -1,5 +1,5 @@ <h1>Table of Contents</h1> -<p> Version <b>2.3.0</b></p> +<p> Version <b>2.3.1</b></p> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> @@ -347,6 +347,27 @@ Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC</code></pr <p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTQ files. FASTQ files usually have extension <code>.fq</code> or <code>.fastq</code>. FASTQ is the default format. See also: <a href="#bowtie2-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#bowtie2-options-int-quals"><code>--int-quals</code></a>.</p> </td></tr> +<tr><td id="bowtie2-options-interleaved"> + +<pre><code>--interleaved</code></pre> +</td><td> + +<p>Reads interleaved FASTQ files where the first two records (8 lines) represent a mate pair.</p> +</td></tr> +<tr><td id="bowtie2-options-tab5"> + +<pre><code>--tab5</code></pre> +</td><td> + +<p>Each read or pair is on a single line. An unpaired read line is [name]. A paired-end read line is [name]. An input file can be a mix of unpaired and paired-end reads and Bowtie 2 recognizes each according to the number of fields, handling each as it should.</p> +</td></tr> +<tr><td id="bowtie2-options-tab6"> + +<pre><code>--tab6</code></pre> +</td><td> + +<p>Similar to <a href="#bowtie2-options-tab5"><code>--tab5</code></a> except, for paired-end reads, the second end can have a different name from the first: [name1]</p> +</td></tr> <tr><td id="bowtie2-options-qseq"> <pre><code>--qseq</code></pre> diff --git a/doc/website/old_news.ssi b/doc/website/old_news.ssi index dc0983b..7df0a31 100644 --- a/doc/website/old_news.ssi +++ b/doc/website/old_news.ssi @@ -1,3 +1,43 @@ +<h2>Version 2.2.4 - Oct 22, 2014</h2> +<ul> + <li>Fixed a Mavericks OSX specific bug caused by some linkage ambiguities.</li> + <li>Added lz4 compression option for the wrapper.</li> + <li>Fixed the vanishing <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> help line.</li> + <li>Added the static linkage for MinGW builds.</li> + <li>Added extra seed-hit output.</li> + <li>Fixed missing 0-length read in fastq <tt>--passthrough</tt> output.</li> + <li>Fixed an issue that would cause different output in <tt>-a</tt> mode depending on random seed.</li> +</ul> + +<h2>Version 2.2.3 - May 30, 2014</h2> +<ul> + <li>Fixed a bug that made loading an index into memory crash sometimes.</li> + <li>Fixed a silent failure to warn the user in case the <tt><a href="manual.shtml#bowtie2-options-x">-x</a></tt> option is missing.</li> + <li>Updated <tt><a href="manual.shtml#bowtie2-options-al">--al</a></tt>, <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>, <tt><a href="manual.shtml#bowtie2-options-al-conc">al-conc</a></tt> and <tt><a href="manual.shtml#bowtie2-options-un-conc">un-conc</a></tt> options to avoid confusion in cases where the user does not provide a base file name.</li> + <li>Fixed a spurious assert that made bowtie2-inspect debug fail.</li> +</ul> + +<h2>Version 2.2.2 - April 10, 2014</h2> +<ul> + <li>Improved performance for cases where the reference contains ambiguous + or masked nucleobases represented by Ns. </li> +</ul> + +<h2>Version 2.2.1 - February 28, 2014</h2> +<ul> + <li>Improved way in which index files are loaded for alignment. Should fix + efficiency problems on some filesystems.</li> + <li>Fixed a bug that made older systems unable to correctly deal with bowtie + relative symbolic links.</li> + <li>Fixed a bug that, for very big indexes, could determine to determine file + offsets correctly.</li> + <li>Fixed a bug where using <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> option incorrectly suppressed + <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>/<tt><a href="manual.shtml#bowtie2-options-un-conc">--un-conc</a></tt> output.</li> + <li>Dropped a perl dependency that could cause problems on old systems.</li> + <li>Added <tt><a href="manual.shtml#bowtie2-options-no-1mm-upfront">--no-1mm-upfront</a></tt> option and clarified documentation for parameters + governing the multiseed heuristic.</li> +</ul> + <h2>Bowtie 2 on GitHub - February 4, 2014</h2> <ul> <li>Bowtie 2 source now lives in a <a href="https://github.com/BenLangmead/bowtie2">public GitHub repository</a>.</li> diff --git a/doc/website/recent_news.ssi b/doc/website/recent_news.ssi index 958a2fa..07103af 100644 --- a/doc/website/recent_news.ssi +++ b/doc/website/recent_news.ssi @@ -1,3 +1,13 @@ +<h2>Version 2.3.1 - Mar 03, 2017</h2> +<p>Please note that as of this release Bowtie 2 now has dependencies on zlib and readline libraries. Make sure that all dependencies are met before attempting to build from source.</p> +<ul> + <li>Added native support for gzipped read files. The wrapper script is no longer responsible for decompression. This simplifies the wrapper and improves speed and thread scalability for gzipped inputs.</li> + <li>Fixed a bug that caused <tt>'bowtie2-build'</tt> to crash when the first FASTA sequence contains all Ns.</li> + <li>Add support for interleaved FASTQ format <tt><a href="manual.shtml#bowtie2-options-interleaved">-—interleaved</a></tt>.</li> + <li>Empty FASTQ inputs would yield an error in Bowtie 2 2.3.0, whereas previous versions would simply align 0 reads and report the SAM header as usual. This version returns to the pre-2.3.0 behavior, resolving a compatibility issue between TopHat2 and Bowtie 2 2.3.0.</li> + <li>Fixed a bug whereby combining <tt>'-—un-conc*</tt> with <tt>'-k'</tt> or <tt>'-a'</tt> would cause <tt>'bowtie2'</tt> to print duplicate reads in one or both of the <tt>'--un-conc*'</tt> output files, causing the ends to be misaligned.</li> + <li>The default <tt>'--score-min'</tt> for <tt>'--local'</tt> mode is now <tt>'G,20,8'</tt>. That was the stated default in the documentation for a while, but the actual default was <tt>'G,0,10'</tt> for many versions. Now the default matches the documentation and, we find, yields more accurate alignments than <tt>'G,0,10'</tt></li> +</ul> <h2>Version 2.3.0 - Dec 13, 2016</h2> <p>This is a major release with some larger and many smaller changes. These notes emphasize the large changes. See commit history for details.</p> <ul> @@ -58,47 +68,3 @@ <ul> <li>Lighter is an extremely fast and memory-efficient program for correcting sequencing errors in DNA sequencing data. For details on how error correction can help improve the speed and accuracy of downstream analysis tools, see the <a href="http://genomebiology.com/2014/15/11/509">paper in Genome Biology</a>. Source and software <a href="https://github.com/mourisl/Lighter">available at GitHub</a>. </ul> - - -<h2>Version 2.2.4 - Oct 22, 2014</h2> -<ul> - <li>Fixed a Mavericks OSX specific bug caused by some linkage ambiguities.</li> - <li>Added lz4 compression option for the wrapper.</li> - <li>Fixed the vanishing <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> help line.</li> - <li>Added the static linkage for MinGW builds.</li> - <li>Added extra seed-hit output.</li> - <li>Fixed missing 0-length read in fastq <tt>--passthrough</tt> output.</li> - <li>Fixed an issue that would cause different output in <tt>-a</tt> mode depending on random seed.</li> -</ul> - -<h2>Version 2.2.3 - May 30, 2014</h2> -<ul> - <li>Fixed a bug that made loading an index into memory crash sometimes.</li> - <li>Fixed a silent failure to warn the user in case the <tt><a href="manual.shtml#bowtie2-options-x">-x</a></tt> option is missing.</li> - <li>Updated <tt><a href="manual.shtml#bowtie2-options-al">--al</a></tt>, <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>, <tt><a href="manual.shtml#bowtie2-options-al-conc">al-conc</a></tt> and <tt><a href="manual.shtml#bowtie2-options-un-conc">un-conc</a></tt> options to avoid confusion in cases where the user does not provide a base file name.</li> - <li>Fixed a spurious assert that made bowtie2-inspect debug fail.</li> -</ul> - -<h2>Version 2.2.2 - April 10, 2014</h2> -<ul> - <li>Improved performance for cases where the reference contains ambiguous - or masked nucleobases represented by Ns. </li> -</ul> - -<h2>Version 2.2.1 - February 28, 2014</h2> -<ul> - <li>Improved way in which index files are loaded for alignment. Should fix - efficiency problems on some filesystems.</li> - <li>Fixed a bug that made older systems unable to correctly deal with bowtie - relative symbolic links.</li> - <li>Fixed a bug that, for very big indexes, could determine to determine file - offsets correctly.</li> - <li>Fixed a bug where using <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> option incorrectly suppressed - <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>/<tt><a href="manual.shtml#bowtie2-options-un-conc">--un-conc</a></tt> output.</li> - <li>Dropped a perl dependency that could cause problems on old systems.</li> - <li>Added <tt><a href="manual.shtml#bowtie2-options-no-1mm-upfront">--no-1mm-upfront</a></tt> option and clarified documentation for parameters - governing the multiseed heuristic.</li> -</ul> - - - diff --git a/doc/website/rhsidebar.ssi b/doc/website/rhsidebar.ssi index ba11d60..8f976dc 100644 --- a/doc/website/rhsidebar.ssi +++ b/doc/website/rhsidebar.ssi @@ -18,10 +18,10 @@ </tr> <tr> <td> - <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.0">Bowtie2 2.3.0</a> + <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.1">Bowtie2 2.3.1</a> </td> <td align="right"> - 12/13/16 + 03/04/17 </td> </tr> <tr> @@ -36,7 +36,7 @@ <ul> <li><a href="https://github.com/BenLangmead/bowtie2">Bowtie GitHub repository</a></li> <li><a href="https://github.com/BenLangmead/bowtie2/issues">Report an issue</a></li> - <li><a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce">Bowtie Mailing list</a></li> + <!--<li><a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce">Bowtie Mailing list</a></li>--> </ul> </div> <h2>Related Tools</h2> @@ -173,24 +173,25 @@ <ul> <li><a href="http://www.cs.jhu.edu/~langmea/index.shtml">Ben Langmead</a></li> <li><a href="http://www.ccb.jhu.edu/people/infphilo">Daehwan Kim</a></li> - <li>Valentin Antonescu</li> + <li>Rone Charles</li> <li>Chris Wilks</li> + <li>Valentin Antonescu</li> </ul> </div> +<!-- <h2>Other Documentation</h2> <div class="box"> <ul> <li>Bio. of Genomes poster, 5/11 (<a href="http://www.cbcb.umd.edu/~langmead/bog2011_poster.ppt">.ppt</a>, <a href="http://www.cbcb.umd.edu/~langmead/bog2011_poster.pdf">.pdf</a>)</li> </ul> </div> +--> <h2>Related links</h2> <div class="box"> <ul> <li><a href="http://scholar.google.com/scholar?oi=bibs&hl=en&cites=4636016874467197548">Papers citing Bowtie 2</a></li> <li><a href="http://www.jhu.edu/">Johns Hopkins University</a></li> <li><a href="http://www.cs.jhu.edu/">JHU Computer Science</a></li> - <li><a href="http://www.biostat.jhsph.edu/">JHSPH Biostatistics</a></li> - <li><a href="http://seqanswers.com/">SEQanswers</a></li> </ul> </div> </div> <!-- End of "rightside" --> diff --git a/formats.h b/formats.h index 7627827..5067939 100644 --- a/formats.h +++ b/formats.h @@ -30,6 +30,7 @@ enum file_format { FASTA = 1, FASTA_CONT, FASTQ, + INTERLEAVED, TAB_MATE5, TAB_MATE6, RAW, diff --git a/opts.h b/opts.h index a90669b..e57aa26 100644 --- a/opts.h +++ b/opts.h @@ -151,7 +151,8 @@ enum { ARG_DESC_PRIORITIZE, // --desc-prioritize ARG_DESC_FMOPS, // --desc-fmops ARG_LOG_DP, // --log-dp - ARG_LOG_DP_OPP // --log-dp-opp + ARG_LOG_DP_OPP, // --log-dp-opp + ARG_INTERLEAVED_FASTQ // --interleaved }; #endif diff --git a/pat.cpp b/pat.cpp index 782130f..e5f387b 100644 --- a/pat.cpp +++ b/pat.cpp @@ -89,6 +89,7 @@ PatternSource* PatternSource::patsrcFromStrings( case FASTA_CONT: return new FastaContinuousPatternSource(qs, p); case RAW: return new RawPatternSource(qs, p); case FASTQ: return new FastqPatternSource(qs, p); + case INTERLEAVED: return new FastqPatternSource(qs, p, true /* interleaved */); case TAB_MATE5: return new TabbedPatternSource(qs, p, false); case TAB_MATE6: return new TabbedPatternSource(qs, p, true); case CMDLINE: return new VectorPatternSource(qs, p); @@ -166,7 +167,7 @@ pair<bool, bool> PatternSourcePerThread::nextReadPair() { } else { finalize(buf_.read_a()); } - bool this_is_last = buf_.cur_buf_ == last_batch_size_-1; + bool this_is_last = buf_.cur_buf_ == static_cast<unsigned int>(last_batch_size_-1); return make_pair(true, this_is_last ? last_batch_ : false); } @@ -373,14 +374,14 @@ PatternComposer* PatternComposer::setupPatternComposer( } void PatternComposer::free_EList_pmembers( const EList<PatternSource*> &elist) { - for (size_t i = 0; i < elist.size(); i++) - if (elist[i] != NULL) - delete elist[i]; + for (size_t i = 0; i < elist.size(); i++) + if (elist[i] != NULL) + delete elist[i]; } /** * Fill Read with the sequence, quality and name for the next - * read in the list of read files. This function gets called by + * read in the list of read files. This function gets called by * all the search threads, so we must handle synchronization. * * Returns pair<bool, int> where bool indicates whether we're @@ -425,24 +426,44 @@ pair<bool, int> CFilePatternSource::nextBatch( void CFilePatternSource::open() { if(is_open_) { is_open_ = false; - fclose(fp_); - fp_ = NULL; + if (compressed_) { + gzclose(zfp_); + zfp_ = NULL; + } + else { + fclose(fp_); + fp_ = NULL; + } } while(filecur_ < infiles_.size()) { if(infiles_[filecur_] == "-") { - fp_ = stdin; - } else if((fp_ = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) { - if(!errs_[filecur_]) { - cerr << "Warning: Could not open read file \"" - << infiles_[filecur_].c_str() - << "\" for reading; skipping..." << endl; - errs_[filecur_] = true; + // always assume that data from stdin is compressed + compressed_ = true; + int fn = dup(fileno(stdin)); + zfp_ = gzdopen(fn, "rb"); + } + else { + compressed_ = false; + if (is_gzipped_file(infiles_[filecur_])) { + compressed_ = true; + zfp_ = gzopen(infiles_[filecur_].c_str(), "rb"); + } + else { + fp_ = fopen(infiles_[filecur_].c_str(), "rb"); + } + if((compressed_ && zfp_ == NULL) || (!compressed_ && fp_ == NULL)) { + if(!errs_[filecur_]) { + cerr << "Warning: Could not open read file \"" + << infiles_[filecur_].c_str() + << "\" for reading; skipping..." << endl; + errs_[filecur_] = true; + } + filecur_++; + continue; } - filecur_++; - continue; } is_open_ = true; - setvbuf(fp_, buf_, _IOFBF, 64*1024); + compressed_ ? gzbuffer(zfp_, 64*1024) : setvbuf(fp_, buf_, _IOFBF, 64*1024); return; } cerr << "Error: No input read files were valid" << endl; @@ -499,7 +520,7 @@ VectorPatternSource::VectorPatternSource( /** * Read next batch. However, batch concept is not very applicable for this * PatternSource where all the info has already been parsed into the fields - * in the contsructor. This essentially modifies the pt as though we read + * in the contsructor. This essentially modifies the pt as though we read * in some number of patterns. */ pair<bool, int> VectorPatternSource::nextBatch( @@ -622,9 +643,12 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile( int c; EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_; if(first_) { - c = getc_unlocked(fp_); + c = getc_wrapper(); + if (c == EOF) { + return make_pair(true, 0); + } while(c == '\r' || c == '\n') { - c = getc_unlocked(fp_); + c = getc_wrapper(); } if(c != '>') { cerr << "Error: reads file does not look like a FASTA file" << endl; @@ -634,16 +658,13 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile( } bool done = false; size_t readi = 0; - if (feof(fp_)) { - done = true; - } // Read until we run out of input or until we've filled the buffer for(; readi < pt.max_buf_ && !done; readi++) { Read::TBuf& buf = readbuf[readi].readOrigBuf; buf.clear(); buf.append('>'); while(true) { - c = getc_unlocked(fp_); + c = getc_wrapper(); if(c < 0 || c == '>') { done = c < 0; break; @@ -659,7 +680,7 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile( */ bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const { // We assume the light parser has put the raw data for the separate ends - // into separate Read objects. That doesn't have to be the case, but + // into separate Read objects. That doesn't have to be the case, but // that's how we've chosen to do it for FastqPatternSource assert(!r.readOrigBuf.empty()); assert(r.empty()); @@ -729,7 +750,7 @@ bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const { * 1. Reads are substrings of a longer FASTA input string * 2. Reads may overlap w/r/t the longer FASTA string * 3. Read names depend on the most recently observed FASTA - * record name + * record name */ pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile( PerThreadReadBuf& pt, @@ -739,13 +760,13 @@ pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile( EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_; size_t readi = 0; while(readi < pt.max_buf_) { - c = getc_unlocked(fp_); + c = getc_wrapper(); if(c < 0) { break; } if(c == '>') { resetForNextFile(); - c = getc_unlocked(fp_); + c = getc_wrapper(); bool sawSpace = false; while(c != '\n' && c != '\r') { if(!sawSpace) { @@ -754,10 +775,10 @@ pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile( if(!sawSpace) { name_prefix_buf_.append(c); } - c = getc_unlocked(fp_); + c = getc_wrapper(); } while(c == '\n' || c == '\r') { - c = getc_unlocked(fp_); + c = getc_wrapper(); } if(c < 0) { break; @@ -869,7 +890,7 @@ bool FastaContinuousPatternSource::parse( /** - * "Light" parser. This is inside the critical section, so the key is to do + * "Light" parser. This is inside the critical section, so the key is to do * just enough parsing so that another function downstream (finalize()) can do * the rest of the parsing. Really this function's only job is to stick every * for lines worth of the input file into a buffer (r.readOrigBuf). finalize() @@ -880,28 +901,32 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile( bool batch_a) { int c; - EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_; + EList<Read>* readbuf = batch_a ? &pt.bufa_ : &pt.bufb_; if(first_) { - c = getc_unlocked(fp_); + c = getc_wrapper(); + if (c == EOF) { + return make_pair(true, 0); + } while(c == '\r' || c == '\n') { - c = getc_unlocked(fp_); + c = getc_wrapper(); } if(c != '@') { cerr << "Error: reads file does not look like a FASTQ file" << endl; throw 1; } first_ = false; - readbuf[0].readOrigBuf.append('@'); + (*readbuf)[0].readOrigBuf.append('@'); } + bool done = false, aborted = false; size_t readi = 0; // Read until we run out of input or until we've filled the buffer - for(; readi < pt.max_buf_ && !done; readi++) { - Read::TBuf& buf = readbuf[readi].readOrigBuf; + while (readi < pt.max_buf_ && !done) { + Read::TBuf& buf = (*readbuf)[readi].readOrigBuf; assert(readi == 0 || buf.empty()); int newlines = 4; while(newlines) { - c = getc_unlocked(fp_); + c = getc_wrapper(); done = c < 0; if(c == '\n' || (done && newlines == 1)) { // Saw newline, or EOF that we're @@ -909,11 +934,29 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile( newlines--; c = '\n'; } else if(done) { - aborted = true; // Unexpected EOF + // account for newline at the end of the file + if (newlines == 4) { + newlines = 0; + } + else { + aborted = true; // Unexpected EOF + } break; } buf.append(c); } + if (c > 0) { + if (interleaved_) { + // alternate between read buffers + batch_a = !batch_a; + readbuf = batch_a ? &pt.bufa_ : &pt.bufb_; + // increment read counter after each pair gets read + readi = batch_a ? readi+1 : readi; + } + else { + readi++; + } + } } if(aborted) { readi--; @@ -926,7 +969,7 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile( */ bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const { // We assume the light parser has put the raw data for the separate ends - // into separate Read objects. That doesn't have to be the case, but + // into separate Read objects. That doesn't have to be the case, but // that's how we've chosen to do it for FastqPatternSource assert(!r.readOrigBuf.empty()); assert(r.empty()); @@ -1043,9 +1086,9 @@ pair<bool, int> TabbedPatternSource::nextBatchFromFile( PerThreadReadBuf& pt, bool batch_a) { - int c = getc_unlocked(fp_); + int c = getc_wrapper(); while(c >= 0 && (c == '\n' || c == '\r')) { - c = getc_unlocked(fp_); + c = getc_wrapper(); } EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_; size_t readi = 0; @@ -1054,10 +1097,10 @@ pair<bool, int> TabbedPatternSource::nextBatchFromFile( readbuf[readi].readOrigBuf.clear(); while(c >= 0 && c != '\n' && c != '\r') { readbuf[readi].readOrigBuf.append(c); - c = getc_unlocked(fp_); + c = getc_wrapper(); } while(c >= 0 && (c == '\n' || c == '\r')) { - c = getc_unlocked(fp_); + c = getc_wrapper(); } } return make_pair(c < 0, readi); @@ -1177,9 +1220,9 @@ pair<bool, int> RawPatternSource::nextBatchFromFile( PerThreadReadBuf& pt, bool batch_a) { - int c = getc_unlocked(fp_); + int c = getc_wrapper(); while(c >= 0 && (c == '\n' || c == '\r')) { - c = getc_unlocked(fp_); + c = getc_wrapper(); } EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_; size_t readi = 0; @@ -1188,15 +1231,15 @@ pair<bool, int> RawPatternSource::nextBatchFromFile( readbuf[readi].readOrigBuf.clear(); while(c >= 0 && c != '\n' && c != '\r') { readbuf[readi].readOrigBuf.append(c); - c = getc_unlocked(fp_); + c = getc_wrapper(); } while(c >= 0 && (c == '\n' || c == '\r')) { - c = getc_unlocked(fp_); + c = getc_wrapper(); } } // incase a valid character is consumed between batches - if (c >= 0 && c != '\n' && c != '\r') { - ungetc(c, fp_); + if (c >= 0 && c != '\n' && c != '\r') { + ungetc_wrapper(c); } return make_pair(c < 0, readi); } @@ -1252,7 +1295,7 @@ bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const { void wrongQualityFormat(const BTString& read_name) { cerr << "Error: Encountered one or more spaces while parsing the quality " - << "string for read " << read_name << ". If this is a FASTQ file " + << "string for read " << read_name << ". If this is a FASTQ file " << "with integer (non-ASCII-encoded) qualities, try re-running with " << "the --integer-quals option." << endl; throw 1; diff --git a/pat.h b/pat.h index e5071db..a2556c7 100644 --- a/pat.h +++ b/pat.h @@ -20,6 +20,9 @@ #ifndef PAT_H_ #define PAT_H_ +#include <stdio.h> +#include <sys/stat.h> +#include <zlib.h> #include <cassert> #include <string> #include <ctype.h> @@ -75,18 +78,18 @@ struct PatternParams { nthreads(nthreads_), fixName(fixName_) { } - int format; // file format - bool fileParallel; // true -> wrap files with separate PatternComposers - uint32_t seed; // pseudo-random seed - size_t max_buf; // number of reads to buffer in one read - bool solexa64; // true -> qualities are on solexa64 scale - bool phred64; // true -> qualities are on phred64 scale - bool intQuals; // true -> qualities are space-separated numbers - int sampleLen; // length of sampled reads for FastaContinuous... - int sampleFreq; // frequency of sampled reads for FastaContinuous... - size_t skip; // skip the first 'skip' patterns - int nthreads; // number of threads for locking - bool fixName; // + int format; // file format + bool fileParallel; // true -> wrap files with separate PatternComposers + uint32_t seed; // pseudo-random seed + size_t max_buf; // number of reads to buffer in one read + bool solexa64; // true -> qualities are on solexa64 scale + bool phred64; // true -> qualities are on phred64 scale + bool intQuals; // true -> qualities are space-separated numbers + int sampleLen; // length of sampled reads for FastaContinuous... + int sampleFreq; // frequency of sampled reads for FastaContinuous... + size_t skip; // skip the first 'skip' patterns + int nthreads; // number of threads for locking + bool fixName; // }; /** @@ -163,10 +166,10 @@ struct PerThreadReadBuf { } const size_t max_buf_; // max # reads to read into buffer at once - EList<Read> bufa_; // Read buffer for mate as - EList<Read> bufb_; // Read buffer for mate bs - size_t cur_buf_; // Read buffer currently active - TReadId rdid_; // index of read at offset 0 of bufa_/bufb_ + EList<Read> bufa_; // Read buffer for mate as + EList<Read> bufb_; // Read buffer for mate bs + size_t cur_buf_; // Read buffer currently active + TReadId rdid_; // index of read at offset 0 of bufa_/bufb_ }; extern void wrongQualityFormat(const BTString& read_name); @@ -256,7 +259,7 @@ public: /** * Read next batch. However, batch concept is not very applicable for this * PatternSource where all the info has already been parsed into the fields - * in the contsructor. This essentially modifies the pt as though we read + * in the contsructor. This essentially modifies the pt as though we read * in some number of patterns. */ virtual std::pair<bool, int> nextBatch( @@ -280,12 +283,12 @@ public: private: - size_t cur_; // index for first read of next batch - size_t skip_; // # reads to skip - bool paired_; // whether reads are paired - EList<string> tokbuf_; // buffer for storing parsed tokens + size_t cur_; // index for first read of next batch + size_t skip_; // # reads to skip + bool paired_; // whether reads are paired + EList<string> tokbuf_; // buffer for storing parsed tokens EList<Read::TBuf> bufs_; // per-read buffers - char nametmp_[20]; // temp buffer for constructing name + char nametmp_[20]; // temp buffer for constructing name }; /** @@ -303,9 +306,11 @@ public: infiles_(infiles), filecur_(0), fp_(NULL), + zfp_(NULL), is_open_(false), skip_(p.skip), - first_(true) + first_(true), + compressed_(false) { assert_gt(infiles.size(), 0); errs_.resize(infiles_.size()); @@ -319,14 +324,20 @@ public: */ virtual ~CFilePatternSource() { if(is_open_) { - assert(fp_ != NULL); - fclose(fp_); + if (compressed_) { + assert(zfp_ != NULL); + gzclose(zfp_); + } + else { + assert(fp_ != NULL); + fclose(fp_); + } } } /** * Fill Read with the sequence, quality and name for the next - * read in the list of read files. This function gets called by + * read in the list of read files. This function gets called by * all the search threads, so we must handle synchronization. * * Returns pair<bool, int> where bool indicates whether we're @@ -352,7 +363,7 @@ protected: /** * Light-parse a batch of unpaired reads from current file into the given - * buffer. Called from CFilePatternSource.nextBatch(). + * buffer. Called from CFilePatternSource.nextBatch(). */ virtual std::pair<bool, int> nextBatchFromFile( PerThreadReadBuf& pt, @@ -367,15 +378,42 @@ protected: * Open the next file in the list of input files. */ void open(); + + int getc_wrapper() { + return compressed_ ? gzgetc(zfp_) : getc_unlocked(fp_); + } + + int ungetc_wrapper(int c) { + return compressed_ ? gzungetc(c, zfp_) : ungetc(c, fp_); + } + + bool is_gzipped_file(const std::string& filename) { + struct stat s; + if (stat(filename.c_str(), &s) != 0) { + perror("stat"); + } + else { + if (S_ISFIFO(s.st_mode)) + return true; + } + size_t pos = filename.find_last_of("."); + std::string ext = (pos == std::string::npos) ? "" : filename.substr(pos + 1); + if (ext == "" || ext == "gz" || ext == "Z") { + return true; + } + return false; + } EList<std::string> infiles_; // filenames for read files - EList<bool> errs_; // whether we've already printed an error for each file - size_t filecur_; // index into infiles_ of next file to read - FILE *fp_; // read file currently being read from - bool is_open_; // whether fp_ is currently open - TReadId skip_; // number of reads to skip - bool first_; // parsing first record in first file? - char buf_[64*1024]; // file buffer + EList<bool> errs_; // whether we've already printed an error for each file + size_t filecur_; // index into infiles_ of next file to read + FILE *fp_; // read file currently being read from + gzFile zfp_; + bool is_open_; // whether fp_ is currently open + TReadId skip_; // number of reads to skip + bool first_; // parsing first record in first file? + char buf_[64*1024]; // file buffer + bool compressed_; }; /** @@ -469,10 +507,10 @@ protected: PerThreadReadBuf& pt, bool batch_a); - bool solQuals_; // base qualities are log odds + bool solQuals_; // base qualities are log odds bool phred64Quals_; // base qualities are on -64 scale - bool intQuals_; // base qualities are space-separated strings - bool secondName_; // true if --tab6, false if --tab5 + bool intQuals_; // base qualities are space-separated strings + bool secondName_; // true if --tab6, false if --tab5 }; /** @@ -487,8 +525,8 @@ protected: * 5. X coordinate of spot * 6. Y coordinate of spot * 7. Index: "Index sequence or 0. For no indexing, or for a file that - * has not been demultiplexed yet, this field should have a value of - * 0." + * has not been demultiplexed yet, this field should have a value of + * 0." * 8. Read number: 1 for unpaired, 1 or 2 for paired * 9. Sequence * 10. Quality @@ -520,9 +558,9 @@ protected: PerThreadReadBuf& pt, bool batch_a); - bool solQuals_; // base qualities are log odds + bool solQuals_; // base qualities are log odds bool phred64Quals_; // base qualities are on -64 scale - bool intQuals_; // base qualities are space-separated strings + bool intQuals_; // base qualities are space-separated strings EList<std::string> qualToks_; }; @@ -581,19 +619,19 @@ protected: private: const size_t length_; /// length of reads to generate const size_t freq_; /// frequency to sample reads - size_t eat_; /// number of characters we need to skip before - /// we have flushed all of the ambiguous or - /// non-existent characters out of our read - /// window - bool beginning_; /// skipping over the first read length? - char buf_[1024]; /// FASTA sequence buffer + size_t eat_; /// number of characters we need to skip before + /// we have flushed all of the ambiguous or + /// non-existent characters out of our read + /// window + bool beginning_; /// skipping over the first read length? + char buf_[1024]; /// FASTA sequence buffer Read::TBuf name_prefix_buf_; /// FASTA sequence name buffer char name_int_buf_[20]; /// for composing offsets for names - size_t bufCur_; /// buffer cursor; points to where we should - /// insert the next character + size_t bufCur_; /// buffer cursor; points to where we should + /// insert the next character uint64_t subReadCnt_;/// number to subtract from readCnt_ to get - /// the pat id to output (so it resets to 0 for - /// each new sequence) + /// the pat id to output (so it resets to 0 for + /// each new sequence) }; /** @@ -606,12 +644,13 @@ public: FastqPatternSource( const EList<std::string>& infiles, - const PatternParams& p) : + const PatternParams& p, bool interleaved = false) : CFilePatternSource(infiles, p), first_(true), solQuals_(p.solexa64), phred64Quals_(p.phred64), - intQuals_(p.intQuals) { } + intQuals_(p.intQuals), + interleaved_(interleaved) { } virtual void reset() { first_ = true; @@ -639,14 +678,15 @@ protected: first_ = true; } - bool first_; // parsing first read in file - bool solQuals_; // base qualities are log odds + bool first_; // parsing first read in file + bool solQuals_; // base qualities are log odds bool phred64Quals_; // base qualities are on -64 scale - bool intQuals_; // base qualities are space-separated strings + bool intQuals_; // base qualities are space-separated strings + bool interleaved_; // fastq reads are interleaved }; /** - * Read a Raw-format file (one sequence per line). No quality strings + * Read a Raw-format file (one sequence per line). No quality strings * allowed. All qualities are assumed to be 'I' (40 on the Phred-33 * scale). */ @@ -718,15 +758,15 @@ public: * dispense them. */ static PatternComposer* setupPatternComposer( - const EList<std::string>& si, // singles, from argv - const EList<std::string>& m1, // mate1's, from -1 arg - const EList<std::string>& m2, // mate2's, from -2 arg - const EList<std::string>& m12, // both mates on each line, from --12 - const EList<std::string>& q, // qualities associated with singles - const EList<std::string>& q1, // qualities associated with m1 - const EList<std::string>& q2, // qualities associated with m2 - const PatternParams& p, // read-in params - bool verbose); // be talkative? + const EList<std::string>& si, // singles, from argv + const EList<std::string>& m1, // mate1's, from -1 arg + const EList<std::string>& m2, // mate2's, from -2 arg + const EList<std::string>& m12, // both mates on each line, from --12 + const EList<std::string>& q, // qualities associated with singles + const EList<std::string>& q1, // qualities associated with m1 + const EList<std::string>& q2, // qualities associated with m2 + const PatternParams& p, // read-in params + bool verbose); // be talkative? protected: @@ -814,7 +854,7 @@ public: // srca_ and srcb_ must be parallel assert_eq(srca_->size(), srcb_->size()); for(size_t i = 0; i < srca_->size(); i++) { - // Can't have NULL first-mate sources. Second-mate sources + // Can't have NULL first-mate sources. Second-mate sources // can be NULL, in the case when the corresponding first- // mate source is unpaired. assert((*srca_)[i] != NULL); @@ -937,10 +977,10 @@ private: } PatternComposer& composer_; // pattern composer - PerThreadReadBuf buf_; // read data buffer - const PatternParams& pp_; // pattern-related parameters - bool last_batch_; // true if this is final batch - int last_batch_size_; // # reads read in previous batch + PerThreadReadBuf buf_; // read data buffer + const PatternParams& pp_; // pattern-related parameters + bool last_batch_; // true if this is final batch + int last_batch_size_; // # reads read in previous batch }; /** diff --git a/pthreadGC2.dll b/pthreadGC2.dll deleted file mode 100644 index 8b9116c..0000000 Binary files a/pthreadGC2.dll and /dev/null differ diff --git a/read_qseq.cpp b/read_qseq.cpp index be1e868..286e28d 100644 --- a/read_qseq.cpp +++ b/read_qseq.cpp @@ -55,9 +55,9 @@ pair<bool, int> QseqPatternSource::nextBatchFromFile( PerThreadReadBuf& pt, bool batch_a) { - int c = getc_unlocked(fp_); + int c = getc_wrapper(); while(c >= 0 && (c == '\n' || c == '\r')) { - c = getc_unlocked(fp_); + c = getc_wrapper(); } EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_; size_t readi = 0; @@ -66,10 +66,10 @@ pair<bool, int> QseqPatternSource::nextBatchFromFile( readbuf[readi].readOrigBuf.clear(); while(c >= 0 && c != '\n' && c != '\r') { readbuf[readi].readOrigBuf.append(c); - c = getc_unlocked(fp_); + c = getc_wrapper(); } while(c >= 0 && (c == '\n' || c == '\r')) { - c = getc_unlocked(fp_); + c = getc_wrapper(); } } return make_pair(c < 0, readi); diff --git a/scoring.h b/scoring.h index 2ac50a4..54038a6 100644 --- a/scoring.h +++ b/scoring.h @@ -51,8 +51,8 @@ // Linear coefficient a #define DEFAULT_MIN_LINEAR (-0.6f) // Different defaults for --local mode -#define DEFAULT_MIN_CONST_LOCAL (1.0f) -#define DEFAULT_MIN_LINEAR_LOCAL (10.0f) +#define DEFAULT_MIN_CONST_LOCAL (20.0f) +#define DEFAULT_MIN_LINEAR_LOCAL (8.0f) // Constant coefficient b in linear function f(x) = ax + b determining // maximum permitted number of Ns f in a read before it is filtered & diff --git a/scripts/test/simple_tests.pl b/scripts/test/simple_tests.pl index c27744e..6cc1fb4 100644 --- a/scripts/test/simple_tests.pl +++ b/scripts/test/simple_tests.pl @@ -36,10 +36,12 @@ use Test::Deep; my $bowtie2 = ""; my $bowtie2_build = ""; +my $compressed; GetOptions( - "bowtie2=s" => \$bowtie2, - "bowtie2-build=s" => \$bowtie2_build) || die "Bad options"; + "compressed-reads" => \$compressed, + "bowtie2=s" => \$bowtie2, + "bowtie2-build=s" => \$bowtie2_build) || die "Bad options"; if(! -x $bowtie2 || ! -x $bowtie2_build) { my $bowtie2_dir = `dirname $bowtie2`; @@ -4294,8 +4296,8 @@ sub writeReads($$$$$$$$$) { $fq1, $fq2) = @_; - open(FQ1, ">$fq1") || die "Could not open '$fq1' for writing"; - open(FQ2, ">$fq2") || die "Could not open '$fq2' for writing"; + open(FQ1, defined($compressed) ? "| gzip -c >$fq1.gz" : ">$fq1") || die "Could not open '$fq1' for writing"; + open(FQ2, defined($compressed) ? "| gzip -c >$fq2.gz" : ">$fq2") || die "Could not open '$fq2' for writing"; my $pe = (defined($mate1s) && $mate1s ne ""); if($pe) { for (0..scalar(@$mate1s)-1) { @@ -4392,7 +4394,7 @@ my $idx_type = ""; $formatarg = "-q"; $ext = ".fq"; } elsif($read_file_format eq "tabbed") { - $formatarg = "--12"; + $formatarg = "--tab5"; $ext = ".tab"; } elsif($read_file_format eq "cline_reads") { $formatarg = "-c"; @@ -4417,9 +4419,14 @@ my $idx_type = ""; die "Bad format: $read_file_format"; } if($formatarg ne "-c") { + my $cmd = ">"; + if (defined($compressed)) { + $cmd = "| gzip -c" . $cmd; + $ext = $ext . ".gz"; + } if(defined($read_file)) { # Unpaired - open(RD, ">.simple_tests$ext") || die; + open(RD, $cmd . ".simple_tests$ext") || die; print RD $read_file; close(RD); $readarg = ".simple_tests$ext"; @@ -4427,10 +4434,10 @@ my $idx_type = ""; defined($mate1_file) || die; defined($mate2_file) || die; # Paired - open(M1, ">.simple_tests.1$ext") || die; + open(M1, $cmd . ".simple_tests.1$ext") || die; print M1 $mate1_file; close(M1); - open(M2, ">.simple_tests.2$ext") || die; + open(M2, $cmd . ".simple_tests.2$ext") || die; print M2 $mate2_file; close(M2); $mate1arg = ".simple_tests.1$ext"; @@ -4448,8 +4455,8 @@ my $idx_type = ""; $names, ".simple_tests.1.fq", ".simple_tests.2.fq"); - $mate1arg = ".simple_tests.1.fq"; - $mate2arg = ".simple_tests.2.fq"; + $mate1arg = defined($compressed) ? ".simple_tests.1.fq.gz" : ".simple_tests.1.fq"; + $mate2arg = defined($compressed) ? ".simple_tests.2.fq.gz" : ".simple_tests.2.fq"; $formatarg = "-q"; $readarg = $mate1arg; } diff --git a/scripts/test/simple_tests.sh b/scripts/test/simple_tests.sh index 90b13f2..b71ade5 100644 --- a/scripts/test/simple_tests.sh +++ b/scripts/test/simple_tests.sh @@ -31,4 +31,5 @@ make $* bowtie2-align-s \ bowtie2-build-l-debug && \ perl scripts/test/simple_tests.pl \ --bowtie2=./bowtie2 \ - --bowtie2-build=./bowtie2-build + --bowtie2-build=./bowtie2-build \ + --compressed -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bowtie2.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
