This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository diamond-aligner.
commit baec593e94cc4cfd918fa334eb9493a638abdc70 Author: Andreas Tille <ti...@debian.org> Date: Mon Sep 18 13:30:39 2017 +0200 New upstream version 0.9.10+dfsg --- CMakeLists.txt | 4 + README.rst | 2 +- build_simple.sh | 4 + src/ChangeLog | 14 +- src/align/align.cpp | 288 +++++------------------ src/align/align.h | 64 +---- src/align/align_queries.h | 67 ------ src/align/align_target.cpp | 4 + src/align/query_mapper.cpp | 133 ++++++----- src/align/query_mapper.h | 16 ++ src/basic/basic.cpp | 4 +- src/basic/config.cpp | 28 ++- src/basic/config.h | 6 +- src/basic/const.h | 2 +- src/basic/hssp.cpp | 30 ++- src/basic/match.h | 2 + src/basic/sequence.h | 9 +- src/basic/value.h | 2 +- src/data/load_seqs.h | 12 +- src/data/reference.cpp | 22 ++ src/data/reference.h | 9 +- src/data/taxonomy.cpp | 81 ++++++- src/data/taxonomy.h | 15 ++ src/dp/dp.h | 7 +- src/dp/swipe.cpp | 2 +- src/extra/match_file.h | 1 + src/output/blast_pairwise_format.cpp | 5 +- src/output/blast_tab_format.cpp | 8 +- src/output/join_blocks.cpp | 51 +++- src/output/output.h | 39 ++- src/output/output_format.cpp | 25 +- src/output/output_format.h | 69 +++++- src/output/output_sink.cpp | 86 +++++++ src/output/sam_format.cpp | 2 +- src/{basic/const.h => output/target_culling.cpp} | 30 +-- src/output/target_culling.h | 73 ++++++ src/output/taxon_format.cpp | 51 ++++ src/output/{view.h => view.cpp} | 67 +++--- src/run/double_indexed.cpp | 10 +- src/run/main.cpp | 7 +- src/run/tools.cpp | 6 +- src/util/log_stream.h | 5 + src/{basic/const.h => util/queue.h} | 59 +++-- 43 files changed, 830 insertions(+), 591 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 05d6c12..35ee8bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,10 @@ add_executable(diamond src/run/main.cpp src/data/seed_set.cpp src/util/binary_file.cpp src/util/simd.cpp + src/output/taxon_format.cpp + src/output/view.cpp + src/output/output_sink.cpp + src/output/target_culling.cpp ) target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/README.rst b/README.rst index 7d043f3..40c77f1 100644 --- a/README.rst +++ b/README.rst @@ -8,7 +8,7 @@ Please read the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond Installing the software on your system may be done by downloading it in binary format for immediate use:: - wget http://github.com/bbuchfink/diamond/releases/download/v0.9.8/diamond-linux64.tar.gz + wget http://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz tar xzf diamond-linux64.tar.gz The extracted ``diamond`` binary file should be moved to a directory contained in your executable search path (PATH environment variable). diff --git a/build_simple.sh b/build_simple.sh index 17fb696..6a8a4dc 100755 --- a/build_simple.sh +++ b/build_simple.sh @@ -53,4 +53,8 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \ src/data/seed_set.cpp \ src/util/binary_file.cpp \ src/util/simd.cpp \ + src/output/taxon_format.cpp \ + src/output/view.cpp \ + src/output/output_sink.cpp \ + src/output/target_culling.cpp \ -lz -lpthread -o diamond diff --git a/src/ChangeLog b/src/ChangeLog index b1f7103..feaca8c 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,13 @@ +[0.9.10] +- added --strand option to choose query strand +- added dbinfo command + +[0.9.9] +- Added taxonomic classification format. +- Fixed a bug in getseq printing masked residues. +- Fixed parsing of UniRef100_ sequence id prefixes. +- Added support for using the staxids output field in diamond view. + [0.9.8] - Fixed a compiler errror. @@ -205,7 +215,7 @@ [0.7.11] - added --version switch -- added static build optiom for CMake build +- added static build option for CMake build - fixed a bug that would cause a return code of 1 without an error [0.7.10] @@ -270,4 +280,4 @@ - reduced usage of temporary disk space and memory - fixed a compiler error for GCC 4.1.2 - fixed a bug that could cause the program to hang when running out of temporary space or memory -- added --forwardonly option to view command +- added --forwardonly option to view command \ No newline at end of file diff --git a/src/align/align.cpp b/src/align/align.cpp index 8b6795c..2e60268 100644 --- a/src/align/align.cpp +++ b/src/align/align.cpp @@ -18,93 +18,65 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "../basic/value.h" #include "align.h" -#include "align_queries.h" #include "../data/reference.h" #include "../output/output_format.h" +#include "../util/queue.h" +#include "../output/output.h" +#include "query_mapper.h" +#include "../util/merge_sort.h" using std::map; -auto_ptr<Simple_query_queue> Simple_query_queue::instance; -auto_ptr<Output_sink> Output_sink::instance; - -size_t Simple_query_queue::get(vector<hit>::iterator &begin, vector<hit>::iterator &end) +struct Align_fetcher { - mtx_.lock(); - const unsigned query = (unsigned)(next_++), - c = align_mode.query_contexts; - if (query >= qend_) { - mtx_.unlock(); - return Simple_query_queue::end; + static void init(size_t qbegin, size_t qend, vector<hit>::iterator begin, vector<hit>::iterator end) + { + it_ = begin; + end_ = end; + queue_ = auto_ptr<Queue>(new Queue(qbegin, qend)); } - begin = it_; - while (it_ < end_ && it_->query_ / c == query) - ++it_; - end = it_; - mtx_.unlock(); - return query; -} - -void Output_sink::push(size_t n, Text_buffer *buf) -{ - mtx_.lock(); - //cout << "n=" << n << " next=" << next_ << endl; - if (n != next_) { - backlog_[n] = buf; - size_ += buf ? buf->alloc_size() : 0; - max_size_ = std::max(max_size_, size_); - mtx_.unlock(); + void operator()(size_t query) + { + const unsigned q = (unsigned)query, + c = align_mode.query_contexts; + begin = it_; + while (it_ < end_ && it_->query_ / c == q) + ++it_; + end = it_; + this->query = query; } - else - flush(buf); -} + bool get() + { + return queue_->get(*this) != Queue::end; + } + size_t query; + vector<hit>::iterator begin, end; +private: + static vector<hit>::iterator it_, end_; + static auto_ptr<Queue> queue_; +}; -void Output_sink::flush(Text_buffer *buf) -{ - size_t n = next_ + 1; - vector<Text_buffer*> out; - out.push_back(buf); - map<size_t, Text_buffer*>::iterator i; - do { - while ((i = backlog_.begin()) != backlog_.end() && i->first == n) { - out.push_back(i->second); - backlog_.erase(i); - ++n; - } - mtx_.unlock(); - size_t size = 0; - for (vector<Text_buffer*>::iterator j = out.begin(); j < out.end(); ++j) { - if (*j) { - f_->write((*j)->get_begin(), (*j)->size()); - if(*j != buf) - size += (*j)->alloc_size(); - delete *j; - } - } - out.clear(); - mtx_.lock(); - size_ -= size; - } while ((i = backlog_.begin()) != backlog_.end() && i->first == n); - next_ = n; - mtx_.unlock(); -} +auto_ptr<Queue> Align_fetcher::queue_; +vector<hit>::iterator Align_fetcher::it_; +vector<hit>::iterator Align_fetcher::end_; void align_worker(size_t thread_id) { - vector<hit>::iterator begin, end; - size_t query; + Align_fetcher hits; Statistics stat; - while ((query = Simple_query_queue::get().get(begin, end)) != Simple_query_queue::end) { - if (end == begin) { + while (hits.get()) { + if (hits.end == hits.begin) { Text_buffer *buf = 0; if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) { buf = new Text_buffer; - output_format->print_query_intro(query, query_ids::get()[query].c_str(), get_source_query_len((unsigned)query), *buf, true); - output_format->print_query_epilog(*buf, true); + const char *query_title = query_ids::get()[hits.query].c_str(); + output_format->print_query_intro(hits.query, query_title, get_source_query_len((unsigned)hits.query), *buf, true); + output_format->print_query_epilog(*buf, query_title, true); } - Output_sink::get().push(query, buf); + Output_sink::get().push(hits.query, buf); continue; } - Query_mapper mapper(query, begin, end); + Query_mapper mapper(hits.query, hits.begin, hits.end); mapper.init(); if (config.ext == Config::swipe) mapper.align_targets(stat); @@ -113,11 +85,13 @@ void align_worker(size_t thread_id) for (size_t i = 0; i < mapper.n_targets(); ++i) mapper.ungapped_stage(i); if (config.ext != Config::most_greedy) { + mapper.fill_source_ranges(); mapper.rank_targets(config.rank_ratio == -1 ? (mapper.query_seq(0).length() > 50 ? 0.6 : 0.9) : config.rank_ratio); stat.inc(Statistics::TARGET_HITS1, mapper.n_targets()); const int cutoff = int(mapper.raw_score_cutoff() * config.score_ratio); for (size_t i = 0; i < mapper.n_targets(); ++i) mapper.greedy_stage(i, stat, cutoff); + mapper.fill_source_ranges(); mapper.rank_targets(config.rank_ratio2 == -1 ? (mapper.query_seq(0).length() > 50 ? 0.95 : 1.0) : config.rank_ratio2); stat.inc(Statistics::TARGET_HITS2, mapper.n_targets()); for (size_t i = 0; i < mapper.n_targets(); ++i) @@ -129,157 +103,18 @@ void align_worker(size_t thread_id) buf = new Text_buffer; const bool aligned = mapper.generate_output(*buf, stat); if (aligned && !config.unaligned.empty()) - query_aligned[query] = true; - } - Output_sink::get().push(query, buf); - } - statistics += stat; -} - -void heartbeat_worker() -{ - static const int interval = 100; - int n = 0; - while (Output_sink::get().next() < Simple_query_queue::get().qend()) { - if (n == interval) { - const string title(query_ids::get()[Output_sink::get().next()].c_str()); - verbose_stream << "Queries=" << Simple_query_queue::get().next() << " size=" << megabytes(Output_sink::get().size()) << " max_size=" << megabytes(Output_sink::get().max_size()) - << " next=" << title.substr(0, title.find(' ')) << endl; - n = 0; - } - else - ++n; - tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10)); - } -} - -Query_queue query_queue; - -void Query_queue::init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end) -{ - trace_pt_pos = begin; - trace_pt_end = end; - assert(queue.empty()); - assert(out_queue.empty()); - n = 0; -} - -void Query_queue::flush(Output_stream *out, Statistics &stat) -{ - writing = true; - std::queue<Query_data*> q; - while (true) { - while (!out_queue.empty() && out_queue.front()->state == Query_data::finished) { - q.push(out_queue.front()); - out_queue.pop(); - } - if (q.empty()) { - writing = false; - lock.unlock(); - return; - } - lock.unlock(); - - unsigned k = 0; - while (!q.empty()) { - out->write(q.front()->buf.get_begin(), q.front()->buf.size()); - delete q.front(); - q.pop(); - ++k; + query_aligned[hits.query] = true; } - - lock.lock(); - n -= k; - /*if (n > 100) - cout << "qlen=" << out_queue.size() << " finished=" << n << endl;*/ + Output_sink::get().push(hits.query, buf); } -} - -Query_data* Query_queue::get() -{ - for (std::deque<Query_data*>::iterator i = queue.begin(); i != queue.end(); ++i) - if ((*i)->state == Query_data::free) - return *i; - return 0; -} - -void Query_queue::pop_busy() -{ - while (!queue.empty() && (queue.front()->state == Query_data::closing || queue.front()->state == Query_data::finished)) { - out_queue.push(queue.front()); - queue.pop_front(); - } -} - -void align_worker(Output_stream *out) -{ - Statistics stat; - Query_data *data = 0; - unsigned n_targets = 0; - - while (true) { - - query_queue.lock.lock(); - - if (data) { - data->mapper->targets_finished += n_targets; - if (data->mapper->finished()) { - query_queue.lock.unlock(); - const bool aligned = data->mapper->generate_output(data->buf, stat); - const unsigned query_id = data->mapper->query_id; - delete data->mapper; - query_queue.lock.lock(); - data->state = Query_data::finished; - if (aligned && !config.unaligned.empty()) - query_aligned[query_id] = true; - ++query_queue.n; - if (!query_queue.writing && !query_queue.out_queue.empty() && data == query_queue.out_queue.front()) { - query_queue.flush(out, stat); - data = 0; - continue; - } - } - } - - if (!(data = query_queue.get())) { - if (query_queue.trace_pt_pos >= query_queue.trace_pt_end) { - query_queue.lock.unlock(); - break; - } - else { - query_queue.queue.push_back(new Query_data(new Query_mapper())); - data = query_queue.queue.back(); - query_queue.lock.unlock(); - data->mapper->init(); - data->state = Query_data::free; - data = 0; - } - } - else { - size_t target = data->mapper->next_target; - n_targets = std::min(config.target_fetch_size, (unsigned)data->mapper->n_targets() - data->mapper->next_target); - data->mapper->next_target += n_targets; - if (target + n_targets == data->mapper->n_targets()) { - data->state = Query_data::closing; - query_queue.pop_busy(); - } - query_queue.lock.unlock(); - - for (unsigned i = 0; i < n_targets; ++i) - data->mapper->align_target(target + i, stat); - } - - } - statistics += stat; } void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file) { - query_queue.last_query = (unsigned)-1; const size_t max_size = (size_t)std::min(config.chunk_size*1e9 * 9 * 2 / config.lowmem, 2e9); pair<size_t, size_t> query_range; - while(true) { + while (true) { task_timer timer("Loading trace points", 3); Trace_pt_list *v = new Trace_pt_list; statistics.max(Statistics::TEMP_SPACE, trace_pts.load(*v, max_size, query_range)); @@ -291,32 +126,15 @@ void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file) merge_sort(v->begin(), v->end(), config.threads_); v->init(); timer.go("Computing alignments"); - if (config.load_balancing == Config::target_parallel) { - query_queue.init(v->begin(), v->end()); - Thread_pool threads; - for (unsigned i = 0; i < config.threads_; ++i) - threads.push_back(launch_thread(static_cast<void (*)(Output_stream*)>(&align_worker), output_file)); - threads.join_all(); - } - else { - Simple_query_queue::instance = auto_ptr<Simple_query_queue>(new Simple_query_queue(query_range.first, query_range.second, v->begin(), v->end())); - Output_sink::instance = auto_ptr<Output_sink>(new Output_sink(query_range.first, output_file)); - Thread_pool threads; - if (config.verbosity >= 3) - threads.push_back(launch_thread(heartbeat_worker)); - for (size_t i = 0; i < (config.threads_align == 0 ? config.threads_ : config.threads_align); ++i) - threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i)); - threads.join_all(); - } + Align_fetcher::init(query_range.first, query_range.second, v->begin(), v->end()); + Output_sink::instance = auto_ptr<Output_sink>(new Output_sink(query_range.first, output_file)); + Thread_pool threads; + if (config.verbosity >= 3) + threads.push_back(launch_thread(heartbeat_worker, query_range.second)); + for (size_t i = 0; i < (config.threads_align == 0 ? config.threads_ : config.threads_align); ++i) + threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i)); + threads.join_all(); timer.go("Deallocating buffers"); delete v; } - if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) { - Text_buffer buf; - for (unsigned i = query_queue.last_query + 1; i < query_ids::get().get_length(); ++i) { - output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buf, true); - output_format->print_query_epilog(buf, true); - } - output_file->write(buf.get_begin(), buf.size()); - } } \ No newline at end of file diff --git a/src/align/align.h b/src/align/align.h index 33588b7..07d4b27 100644 --- a/src/align/align.h +++ b/src/align/align.h @@ -64,68 +64,6 @@ private: Task_queue<_buffer, Output_writer> queue; }; -struct Simple_query_queue -{ - enum { end = 0xffffffffffffffffllu }; - Simple_query_queue(size_t qbegin, size_t qend, vector<hit>::iterator begin, vector<hit>::iterator end): - next_(qbegin), - qend_(qend), - it_(begin), - end_(end) - {} - size_t get(vector<hit>::iterator &begin, vector<hit>::iterator &end); - size_t next() const - { - return next_; - } - size_t qend() const - { - return qend_; - } - static Simple_query_queue& get() - { - return *instance; - } - static auto_ptr<Simple_query_queue> instance; -private: - tthread::mutex mtx_; - size_t next_; - const size_t qend_; - vector<hit>::iterator it_, end_; -}; - -struct Output_sink -{ - Output_sink(size_t begin, Output_stream *f): - f_(f), - next_(begin), - size_(0), - max_size_(0) - {} - void push(size_t n, Text_buffer *buf); - size_t size() const - { - return size_; - } - size_t max_size() const - { - return max_size_; - } - static Output_sink& get() - { - return *instance; - } - size_t next() const - { - return next_; - } - static auto_ptr<Output_sink> instance; -private: - void flush(Text_buffer *buf); - tthread::mutex mtx_; - Output_stream* const f_; - std::map<size_t, Text_buffer*> backlog_; - size_t next_, size_, max_size_; -}; +void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file); #endif \ No newline at end of file diff --git a/src/align/align_queries.h b/src/align/align_queries.h deleted file mode 100644 index 8409f46..0000000 --- a/src/align/align_queries.h +++ /dev/null @@ -1,67 +0,0 @@ -/**** -DIAMOND protein aligner -Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com> - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as -published by the Free Software Foundation, either version 3 of the -License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -****/ - -#ifndef ALIGN_QUERIES_H_ -#define ALIGN_QUERIES_H_ - -#include <deque> -#include "../util/merge_sort.h" -#include "../search/trace_pt_buffer.h" -#include "../util/map.h" -#include "../util/task_queue.h" -#include "align.h" -#include "query_mapper.h" - -using std::vector; - -struct Query_data -{ - Query_data(): - mapper(0) - {} - Query_data(Query_mapper *mapper): - mapper(mapper), - state(init) - {} - Query_mapper *mapper; - Text_buffer buf; - enum { init, free, closing, finished }; - unsigned state; -}; - -struct Query_queue -{ - void init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end); - void flush(Output_stream *out, Statistics &stat); - Query_data* get(); - void pop_busy(); - - tthread::mutex lock; - std::deque<Query_data*> queue; - std::queue<Query_data*> out_queue; - Trace_pt_list::iterator trace_pt_pos, trace_pt_end; - bool writing; - unsigned n, last_query; -}; - -extern Query_queue query_queue; - -void align_worker(Output_stream *out); -void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file); - -#endif /* ALIGN_QUERIES_H_ */ diff --git a/src/align/align_target.cpp b/src/align/align_target.cpp index 43af301..e027ac6 100644 --- a/src/align/align_target.cpp +++ b/src/align/align_target.cpp @@ -272,6 +272,10 @@ void Query_mapper::align_target(size_t idx, Statistics &stat) target.hsps.sort(); if(target.hsps.size() > 0) target.filter_score = target.hsps.front().score; + + target.ts.clear(); + for (list<Hsp_data>::iterator i = target.hsps.begin(); i != target.hsps.end(); ++i) + target.ts.push_back(Hsp_traits(i->query_source_range)); if (config.use_smith_waterman && !target.hsps.empty()) { int score; diff --git a/src/align/query_mapper.cpp b/src/align/query_mapper.cpp index 401a52e..12cb96e 100644 --- a/src/align/query_mapper.cpp +++ b/src/align/query_mapper.cpp @@ -16,24 +16,36 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. ****/ -#include "align_queries.h" #include "query_mapper.h" #include "../data/reference.h" #include "extend_ungapped.h" #include "../output/output.h" #include "../output/output_format.h" #include "../output/daa_write.h" +#include "../output/target_culling.h" -Query_mapper::Query_mapper() : - source_hits(get_query_data()), - query_id(source_hits.first->query_ / align_mode.query_contexts), - targets_finished(0), - next_target(0), - source_query_len(get_source_query_len(query_id)), - unaligned_from(query_queue.last_query+1) -{ - query_queue.last_query = query_id; - seed_hits.reserve(source_hits.second - source_hits.first); +bool Target::envelopes(const Hsp_traits &t, double p) const +{ + for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); ++i) + if (t.query_source_range.overlap_factor(i->query_source_range) >= p) + return true; + return false; +} + +bool Target::is_enveloped(const Target &t, double p) const +{ + for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); ++i) + if (!t.envelopes(*i, p)) + return false; + return true; +} + +bool Target::is_enveloped(Ptr_vector<Target>::const_iterator begin, Ptr_vector<Target>::const_iterator end, double p, int min_score) const +{ + for (; begin != end; ++begin) + if (is_enveloped(**begin, p) && (*begin)->filter_score >= min_score) + return true; + return false; } int Query_mapper::raw_score_cutoff() const @@ -70,18 +82,6 @@ void Query_mapper::init() rank_targets(config.rank_ratio == -1 ? 0.6 : config.rank_ratio); } -pair<Trace_pt_list::iterator, Trace_pt_list::iterator> Query_mapper::get_query_data() -{ - const Trace_pt_list::iterator begin = query_queue.trace_pt_pos; - if (begin == query_queue.trace_pt_end) - return pair<Trace_pt_list::iterator, Trace_pt_list::iterator>(begin, begin); - const unsigned c = align_mode.query_contexts, query = begin->query_ / c; - Trace_pt_list::iterator end = begin; - for (; end < query_queue.trace_pt_end && end->query_ / c == query; ++end); - query_queue.trace_pt_pos = end; - return pair<Trace_pt_list::iterator, Trace_pt_list::iterator>(begin, end); -} - unsigned Query_mapper::count_targets() { std::sort(source_hits.first, source_hits.second, hit::cmp_subject); @@ -132,49 +132,64 @@ void Query_mapper::rank_targets(double ratio) { std::sort(targets.begin(), targets.end(), Target::compare); - int score = 0; - if (config.toppercent < 100) { - score = int((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * ratio); + if (config.query_overlap_culling == 0.0) { + + int score = 0; + if (config.toppercent < 100) { + score = int((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * ratio); + } + else { + size_t min_idx = std::min(targets.size(), (size_t)config.max_alignments); + score = int((double)targets[min_idx - 1].filter_score * ratio); + } + + unsigned i = 0; + for (; i < targets.size(); ++i) + if (targets[i].filter_score < score) + break; + + if (config.benchmark_ranking) + for (unsigned j = i; j < targets.size(); ++j) + targets[j].outranked = true; + else + targets.erase(targets.begin() + i, targets.end()); + } else { - size_t min_idx = std::min(targets.size(), (size_t)config.max_alignments); - score = int((double)targets[min_idx - 1].filter_score * ratio); - } - unsigned i = 0; - for (; i < targets.size(); ++i) - if (targets[i].filter_score < score) - break; - - if (config.benchmark_ranking) - for (unsigned j = i; j < targets.size(); ++j) - targets[j].outranked = true; - else - targets.erase(targets.begin() + i, targets.end()); + int i = 0; + const double p = config.query_overlap_culling / 100.0; + while (i < (int)targets.size()) { + int min_score = int((double)targets[i].filter_score / (1.0 - config.toppercent / 100.0) / ratio); + if (targets[i].is_enveloped(targets.begin(), targets.begin() + i, p, min_score)) + targets.erase(targets.begin() + i, targets.begin() + i + 1); + else + ++i; + } + + } } bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat) { - if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) { - for (unsigned i = unaligned_from; i < query_id; ++i) { - output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buffer, true); - output_format->print_query_epilog(buffer, true); - } - } - std::sort(targets.begin(), targets.end(), Target::compare); unsigned n_hsp = 0, n_target_seq = 0, hit_hsps = 0; - const unsigned top_score = targets.empty() ? 0 : targets[0].filter_score, query_len = (unsigned)query_seq(0).length(); + auto_ptr<Target_culling> target_culling(Target_culling::get(targets.empty() ? 0 : &targets[0])); + const unsigned query_len = (unsigned)query_seq(0).length(); size_t seek_pos = 0; const char *query_title = query_ids::get()[query_id].c_str(); + auto_ptr<Output_format> f(output_format->clone()); for (size_t i = 0; i < targets.size(); ++i) { if ((config.min_bit_score == 0 && score_matrix.evalue(targets[i].filter_score, config.db_size, query_len) > config.max_evalue) || score_matrix.bitscore(targets[i].filter_score) < config.min_bit_score) break; - if (!config.output_range(n_target_seq, targets[i].filter_score, top_score)) + const int c = target_culling->cull(targets[i]); + if (c == Target_culling::next) + continue; + else if (c == Target_culling::finished) break; if (targets[i].outranked) @@ -202,15 +217,15 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat) } else { if (n_hsp == 0) { - if (*output_format == Output_format::daa) + if (*f == Output_format::daa) seek_pos = write_daa_query_record(buffer, query_title, align_mode.query_translated ? query_source_seqs::get()[query_id] : query_seqs::get()[query_id]); else - output_format->print_query_intro(query_id, query_title, source_query_len, buffer, false); + f->print_query_intro(query_id, query_title, source_query_len, buffer, false); } - if (*output_format == Output_format::daa) + if (*f == Output_format::daa) write_daa_record(buffer, *j, query_id, targets[i].subject_id); else - output_format->print_match(Hsp_context(*j, + f->print_match(Hsp_context(*j, query_id, query_seq(j->frame), query_source_seq(), @@ -223,8 +238,10 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat) hit_hsps), buffer); } - if(hit_hsps == 0) + if (hit_hsps == 0) { ++n_target_seq; + target_culling->add(&targets[i]); + } ++n_hsp; ++hit_hsps; if (config.alignment_traceback && j->gap_openings > 0) @@ -235,17 +252,17 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat) if (n_hsp > 0) { if (!blocked_processing) { - if (*output_format == Output_format::daa) + if (*f == Output_format::daa) finish_daa_query_record(buffer, seek_pos); else - output_format->print_query_epilog(buffer, false); + f->print_query_epilog(buffer, query_title, false); } else Intermediate_record::finish_query(buffer, seek_pos); } - else if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) { - output_format->print_query_intro(query_id, query_title, source_query_len, buffer, true); - output_format->print_query_epilog(buffer, true); + else if (!blocked_processing && *f != Output_format::daa && config.report_unaligned != 0) { + f->print_query_intro(query_id, query_title, source_query_len, buffer, true); + f->print_query_epilog(buffer, query_title, true); } if (!blocked_processing) { diff --git a/src/align/query_mapper.h b/src/align/query_mapper.h index 5da19c3..bbcd8b8 100644 --- a/src/align/query_mapper.h +++ b/src/align/query_mapper.h @@ -34,6 +34,9 @@ using std::list; struct Target { + Target(int score): + filter_score(score) + {} Target(size_t begin, unsigned subject_id) : subject_id(subject_id), filter_score(0), @@ -44,6 +47,14 @@ struct Target { return lhs->filter_score > rhs->filter_score; } + void fill_source_ranges(size_t query_len) + { + for (list<Hsp_traits>::iterator i = ts.begin(); i != ts.end(); ++i) + i->query_source_range = align_mode.query_translated ? untranslate_range(i->query_range, i->frame, query_len) : i->query_range; + } + bool envelopes(const Hsp_traits &t, double p) const; + bool is_enveloped(const Target &t, double p) const; + bool is_enveloped(Ptr_vector<Target>::const_iterator begin, Ptr_vector<Target>::const_iterator end, double p, int min_score) const; unsigned subject_id; int filter_score; float filter_time; @@ -78,6 +89,11 @@ struct Query_mapper { return query_seqs::get()[query_id*align_mode.query_contexts + frame]; } + void fill_source_ranges() + { + for (size_t i = 0; i < targets.size(); ++i) + targets[i].fill_source_ranges(source_query_len); + } pair<Trace_pt_list::iterator, Trace_pt_list::iterator> source_hits; unsigned query_id, targets_finished, next_target; private: diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp index aa6fab3..5455ea3 100644 --- a/src/basic/basic.cpp +++ b/src/basic/basic.cpp @@ -24,9 +24,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "sequence.h" #include "masking.h" -const char* Const::version_string = "0.9.8"; +const char* Const::version_string = "0.9.10"; const char* Const::program_name = "diamond"; -const char* Const::id_delimiters = " \a\b\f\n\r\t\v"; +const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1"; Value_traits::Value_traits(const char *alphabet, Letter mask_char, const char *ignore) : alphabet(alphabet), diff --git a/src/basic/config.cpp b/src/basic/config.cpp index bb3fe7a..edbace2 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -56,7 +56,8 @@ Config::Config(int argc, const char **argv) .add_command("model-seqs", "") .add_command("opt", "") .add_command("mask", "") - .add_command("fastq2fasta", ""); + .add_command("fastq2fasta", "") + .add_command("dbinfo", ""); Options_group general("General options"); general.add() @@ -111,10 +112,12 @@ Config::Config(int argc, const char **argv) Options_group aligner("Aligner options"); aligner.add() ("query", 'q', "input query file", query_file) + ("strand", 0, "query strands to search (both/minus/plus)", query_strands, string("both")) ("un", 0, "file for unaligned queries", unaligned) ("unal", 0, "report unaligned queries (0=no, 1=yes)", report_unaligned, -1) ("max-target-seqs", 'k', "maximum number of target sequences to report alignments for", max_alignments, uint64_t(25)) ("top", 0, "report alignments within this percentage range of top alignment score (overrides --max-target-seqs)", toppercent, 100.0) + ("overlap-culling", 0, "delete hits only if a higher scoring hit envelops the given percentage of the query range", query_overlap_culling) ("compress", 0, "compression for output files (0=none, 1=gzip)", compression) ("evalue", 'e', "maximum e-value to report alignments (default=0.001)", max_evalue, 0.001) ("min-score", 0, "minimum bit score to report alignments (overrides e-value setting)", min_bit_score) @@ -137,8 +140,10 @@ Config::Config(int argc, const char **argv) //("seg", 0, "enable SEG masking of queries (yes/no)", seg) ("query-gencode", 0, "genetic code to use to translate query (see user manual)", query_gencode, 1u) ("salltitles", 0, "include full subject titles in DAA file", salltitles) + ("sallseqid", 0, "include all subject ids in DAA file", sallseqid) ("no-self-hits", 0, "suppress reporting of identical self hits", no_self_hits) - ("taxonmap", 0, "protein accession to taxid mapping file", prot_accession2taxid); + ("taxonmap", 0, "protein accession to taxid mapping file", prot_accession2taxid) + ("taxonnodes", 0, "taxonomy nodes.dmp from NCBI", nodesdmp); Options_group advanced("Advanced options"); advanced.add() @@ -230,7 +235,7 @@ Config::Config(int argc, const char **argv) throw std::runtime_error("Invalid option: --block-size/-b. Block size is set for the alignment commands."); break; case Config::blastp: - case Config::blastx: + case Config::blastx: if (database == "") throw std::runtime_error("Missing parameter: database file (--db/-d)"); if (daa_file.length() > 0) { @@ -254,6 +259,12 @@ Config::Config(int argc, const char **argv) ; } + switch (command) { + case Config::dbinfo: + if (database == "") + throw std::runtime_error("Missing parameter: database file (--db/-d)"); + } + if (hit_cap != 0) throw std::runtime_error("Deprecated parameter: --max-hits/-C."); @@ -353,14 +364,6 @@ Config::Config(int argc, const char **argv) #ifdef __POPCNT__ verbose_stream << "POPCNT enabled." << endl; #endif - - message_stream << "#Target sequences to report alignments for: "; - if (max_alignments == 0) { - max_alignments = std::numeric_limits<uint64_t>::max(); - message_stream << "unlimited" << endl; - } - else - message_stream << max_alignments << endl; } Translator::init(query_gencode); @@ -371,6 +374,9 @@ Config::Config(int argc, const char **argv) if (command == help) parser.print_help(); + if (query_strands != "both" && query_strands != "minus" && query_strands != "plus") + throw std::runtime_error("Invalid value for parameter --strand"); + /*log_stream << "sizeof(hit)=" << sizeof(hit) << " sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t) << " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) << endl;*/ } diff --git a/src/basic/config.h b/src/basic/config.h index 0b1035b..a348d46 100644 --- a/src/basic/config.h +++ b/src/basic/config.h @@ -139,10 +139,14 @@ struct Config double score_ratio; bool small_query; bool hashed_seeds; + string nodesdmp; + bool sallseqid; + double query_overlap_culling; + string query_strands; enum { makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13, - match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18 + match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18, dbinfo=19 }; unsigned command; diff --git a/src/basic/const.h b/src/basic/const.h index 0dd7c7a..a2107ba 100644 --- a/src/basic/const.h +++ b/src/basic/const.h @@ -23,7 +23,7 @@ struct Const { enum { - build_version = 109, + build_version = 111, daa_version = 0, seedp_bits = 10, seedp = 1<<seedp_bits, diff --git a/src/basic/hssp.cpp b/src/basic/hssp.cpp index d0134d9..579523e 100644 --- a/src/basic/hssp.cpp +++ b/src/basic/hssp.cpp @@ -18,6 +18,21 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "../align/align.h" +interval untranslate_range(const interval &r, unsigned frame, size_t len) +{ + signed f = frame <= 2 ? frame + 1 : 2 - frame; + interval s; + if (f > 0) { + s.begin_ = (f - 1) + 3 * r.begin_; + s.end_ = s.begin_ + 3 * r.length(); + } + else { + s.end_ = (int)len + f - 3 * r.begin_ + 1; + s.begin_ = s.end_ - 3 * r.length(); + } + return s; +} + bool Hsp_data::pass_through(const Diagonal_segment &d) const { if (intersect(d.query_range(), query_range).length() != d.len @@ -80,19 +95,8 @@ void Hsp_data::set_source_range(unsigned frame, unsigned dna_len) this->frame = frame; if (!align_mode.query_translated) query_source_range = query_range; - else { - signed f = frame <= 2 ? frame + 1 : 2 - frame; - if (f > 0) { - query_source_range.begin_ = (f - 1) + 3 * query_range.begin_; - query_source_range.end_ = query_source_range.begin_ + 3 * query_range.length(); - //query_end_dna = (f-1) + 3 * (l.query_begin_+l.query_len_-1) + 3; - } - else { - query_source_range.end_ = dna_len + f - 3 * query_range.begin_ + 1; - //query_end_dna = dna_len + (f + 1) - 3 * (l.query_begin_+l.query_len_-1) - 2; - query_source_range.begin_ = query_source_range.end_ - 3 * query_range.length(); - } - } + else + query_source_range = untranslate_range(query_range, frame, dna_len); } Hsp_context& Hsp_context::parse() diff --git a/src/basic/match.h b/src/basic/match.h index 555ecb6..991d40d 100644 --- a/src/basic/match.h +++ b/src/basic/match.h @@ -38,6 +38,8 @@ inline interval normalized_range(unsigned pos, int len, Strand strand) : interval (pos + 1 + len, pos + 1); } +interval untranslate_range(const interval &r, unsigned frame, size_t l); + struct Diagonal_segment { Diagonal_segment(): diff --git a/src/basic/sequence.h b/src/basic/sequence.h index 349a9de..b52f3fb 100644 --- a/src/basic/sequence.h +++ b/src/basic/sequence.h @@ -100,8 +100,13 @@ struct sequence } std::ostream& print(std::ostream &os, const Value_traits &v) const { - for (unsigned i = 0; i<len_; ++i) - os << v.alphabet[(long)data_[i]]; + for (unsigned i = 0; i < len_; ++i) { + long l = (long)data_[i]; + if ((l & 128) == 0) + os << v.alphabet[l]; + else + os << (char)tolower(v.alphabet[l & 127]); + } return os; } std::ostream& print(std::ostream &os, const Value_traits &v, Reversed) const diff --git a/src/basic/value.h b/src/basic/value.h index 5f7a256..58467dc 100644 --- a/src/basic/value.h +++ b/src/basic/value.h @@ -76,7 +76,7 @@ private: struct Value_traits { - Value_traits(const char *alphabet, Letter mask_char, const char *ignore); + Value_traits(const char *alphabet, Letter mask_char, const char *ignore); const char *alphabet; unsigned alphabet_size; Letter mask_char; diff --git a/src/data/load_seqs.h b/src/data/load_seqs.h index a203779..fd67db3 100644 --- a/src/data/load_seqs.h +++ b/src/data/load_seqs.h @@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "../basic/translate.h" #include "../util/seq_file_format.h" -inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vector<Letter> &seq) +inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vector<Letter> &seq, unsigned frame_mask) { if (config.command == Config::blastp || config.command == Config::makedb || config.command == Config::random_seqs) { ss.push_back(seq); @@ -42,7 +42,7 @@ inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vecto unsigned bestFrames(Translator::computeGoodFrames(proteins, config.get_run_len((unsigned)seq.size() / 3))); for (unsigned j = 0; j < 6; ++j) { - if (bestFrames & (1 << j)) + if ((bestFrames & (1 << j)) && (frame_mask & (1 << j))) ss.push_back(proteins[j]); else ss.fill(proteins[j].size(), value_traits.mask_char); @@ -68,10 +68,16 @@ inline size_t load_seqs(Input_stream &file, vector<char> id; string id2; + unsigned frame_mask = (1 << 6) - 1; + if (config.query_strands == "plus") + frame_mask = (1 << 3) - 1; + else if (config.query_strands == "minus") + frame_mask = ((1 << 3) - 1) << 3; + while (letters < max_letters && format.get_seq(id, seq, file)) { if (seq.size() > 0 && (filter.empty() || id2.assign(id.data(), id.data() + id.size()).find(filter, 0) != string::npos)) { ids->push_back(id); - letters += push_seq(**seqs, source_seqs, seq); + letters += push_seq(**seqs, source_seqs, seq, frame_mask); ++n; if ((*seqs)->get_length() >(size_t)std::numeric_limits<int>::max()) throw std::runtime_error("Number of sequences in file exceeds supported maximum."); diff --git a/src/data/reference.cpp b/src/data/reference.cpp index c86caef..4e15766 100644 --- a/src/data/reference.cpp +++ b/src/data/reference.cpp @@ -39,6 +39,18 @@ bool blocked_processing; using std::cout; using std::endl; +string* get_allseqids(const char *s) +{ + string *r = new string; + const vector<string> t(tokenize(s, "\1")); + for (vector<string>::const_iterator i = t.begin(); i != t.end(); ++i) { + if (i != t.begin()) + r->append("\1"); + r->append(i->substr(0, find_first_of(i->c_str(), Const::id_delimiters))); + } + return r; +} + struct Pos_record { Pos_record() @@ -211,4 +223,14 @@ void Database_file::get_seq() seq.clear(); id.clear(); } +} + +void db_info() +{ + Database_file db_file; + cout << "Database format version = " << ref_header.db_version << endl; + cout << "Diamond build = " << ref_header.build << endl; + cout << "Sequences = " << ref_header.sequences << endl; + cout << "Letters = " << ref_header.letters << endl; + db_file.close(); } \ No newline at end of file diff --git a/src/data/reference.h b/src/data/reference.h index 7eb8bb5..c74f5e9 100644 --- a/src/data/reference.h +++ b/src/data/reference.h @@ -129,6 +129,8 @@ inline size_t max_id_len(const String_set<0> &ids) return max; } +string* get_allseqids(const char *s); + struct Ref_map { Ref_map(): @@ -157,10 +159,13 @@ struct Ref_map n = next_++; data_[block][i] = n; len_.push_back((uint32_t)ref_seqs::get().length(i)); + const char *title = ref_ids::get()[i].c_str(); if (config.salltitles) - name_.push_back(new string(ref_ids::get()[i].c_str())); + name_.push_back(new string(title)); + else if (config.sallseqid) + name_.push_back(get_allseqids(title)); else - name_.push_back(get_str(ref_ids::get()[i].c_str(), Const::id_delimiters)); + name_.push_back(get_str(title, Const::id_delimiters)); mtx_.unlock(); } return n; diff --git a/src/data/taxonomy.cpp b/src/data/taxonomy.cpp index 838f29f..9172a56 100644 --- a/src/data/taxonomy.cpp +++ b/src/data/taxonomy.cpp @@ -17,10 +17,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. ****/ #include <stdio.h> +#include <set> #include "taxonomy.h" #include "../util/compressed_stream.h" #include "../basic/config.h" #include "../util/merge_sort.h" +#include "../util/log_stream.h" + +using std::cout; +using std::endl; +using std::set; Taxonomy taxonomy; @@ -28,7 +34,7 @@ string& get_accession(string &t) { size_t i; if (t.compare(0, 6, "UniRef") == 0) - t.erase(0, 9); + t.erase(0, t.find('_', 0) + 1); else if ((i = t.find_first_of('|', 0)) != string::npos) { if (t.compare(0, 3, "gi|") == 0) { t.erase(0, t.find_first_of('|', i + 1) + 1); @@ -62,5 +68,78 @@ void Taxonomy::load() /*if (f.line_count % 10000 == 0) std::cout << f.line_count << endl;*/ } + f.close(); merge_sort(accession2taxid_.begin(), accession2taxid_.end(), config.threads_); +} + +void Taxonomy::load_nodes() +{ + Input_stream f(config.nodesdmp); + unsigned taxid, parent; + while (!f.eof() && (f.getline(), !f.line.empty())) { + if (sscanf(f.line.c_str(), "%u\t|\t%u", &taxid, &parent) != 2) + throw std::runtime_error("Invalid nodes.dmp file format."); + //cout << taxid << '\t' << parent << endl; + parent_.resize(taxid + 1); + parent_[taxid] = parent; + } + f.close(); +} + +void Taxonomy::init() +{ + task_timer timer; + if (!config.prot_accession2taxid.empty()) { + timer.go("Loading taxonomy"); + load(); + timer.finish(); + } + + if (!config.nodesdmp.empty()) { + timer.go("Loading taxonomy nodes"); + load_nodes(); + timer.finish(); + } +} + +void Taxonomy::get_taxids(const char *id, set<unsigned> &taxons) const +{ + const vector<string> t(tokenize(id, "\1")); + for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i) { + const unsigned id = get(Taxonomy::Accession(*i)); + if(id != 0) + taxons.insert(id); + } +} + +unsigned Taxonomy::get_lca(unsigned t1, unsigned t2) const +{ + static const int max = 64; + if (t1 == t2 || t2 == 0) + return t1; + if (t1 == 0) + return t2; + unsigned p = t2; + set<unsigned> l; + int n = 0; + do { + p = get_parent(p); + if (p == 0) + return t1; + l.insert(p); + if (++n > max) + throw std::runtime_error("Path in taxonomy too long (1)."); + } while (p != t1 && p != 1); + if (p == t1) + return p; + p = t1; + n = 0; + while (l.find(p) == l.end()) { + p = get_parent(p); + if (p == 0) + return t2; + if (++n > max) + throw std::runtime_error("Path in taxonomy too long (2)."); + } + return p; } \ No newline at end of file diff --git a/src/data/taxonomy.h b/src/data/taxonomy.h index 1ce6dc5..1a2537f 100644 --- a/src/data/taxonomy.h +++ b/src/data/taxonomy.h @@ -19,6 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include <vector> #include <algorithm> #include <string> +#include <set> #include "../basic/const.h" #include "../util/util.h" @@ -71,7 +72,10 @@ struct Taxonomy char s[max_accesion_len]; }; + void init(); void load(); + void load_nodes(); + void get_taxids(const char *s, std::set<unsigned> &taxons) const; unsigned get(const Accession &accession) const { @@ -82,9 +86,20 @@ struct Taxonomy return 0; } + unsigned get_parent(unsigned taxid) const + { + if (parent_.size() <= taxid) + throw std::runtime_error(string("No taxonomy node found for taxon id ") + to_string(taxid)); + return parent_[taxid]; + } + + unsigned get_lca(unsigned t1, unsigned t2) const; + private: std::vector<std::pair<Accession, unsigned> > accession2taxid_; + std::vector<unsigned> parent_; + }; extern Taxonomy taxonomy; \ No newline at end of file diff --git a/src/dp/dp.h b/src/dp/dp.h index dbdec70..6a76900 100644 --- a/src/dp/dp.h +++ b/src/dp/dp.h @@ -94,6 +94,11 @@ struct Hsp_traits score(0), frame((int)frame) {} + Hsp_traits(const interval &query_source_range): + query_source_range(query_source_range) + {} + Hsp_traits(const Hsp_data &hsp) + {} int partial_score(const Diagonal_segment &d) const { const double overlap = std::max(d.subject_range().overlap_factor(subject_range), d.query_range().overlap_factor(query_range)); @@ -142,7 +147,7 @@ struct Hsp_traits } }; int d_min, d_max, score, frame; - interval query_range, subject_range; + interval query_source_range, query_range, subject_range; }; template<typename _score> diff --git a/src/dp/swipe.cpp b/src/dp/swipe.cpp index d09b2ee..9b3e070 100644 --- a/src/dp/swipe.cpp +++ b/src/dp/swipe.cpp @@ -246,4 +246,4 @@ void swipe(const sequence &query, vector<sequence>::const_iterator subject_begin #ifdef __SSE2__ swipe<uint8_t>(query, subject_begin, subject_end, out); #endif -} +} \ No newline at end of file diff --git a/src/extra/match_file.h b/src/extra/match_file.h index cd2ef69..75252d7 100644 --- a/src/extra/match_file.h +++ b/src/extra/match_file.h @@ -21,6 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include <stdio.h> #include <vector> +#include "../util/binary_file.h" #include "blast_record.h" #include "../util/util.h" #include "../basic/reduction.h" diff --git a/src/output/blast_pairwise_format.cpp b/src/output/blast_pairwise_format.cpp index d4c4b20..1096e97 100644 --- a/src/output/blast_pairwise_format.cpp +++ b/src/output/blast_pairwise_format.cpp @@ -18,7 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "output_format.h" -void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) const +void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) { static const unsigned width = 60; out << '>'; @@ -67,9 +67,8 @@ void Pairwise_format::print_footer(Output_stream &out) const } -void Pairwise_format::print_query_epilog(Text_buffer &out, bool unaligned) const +void Pairwise_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const { - } void Pairwise_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const diff --git a/src/output/blast_tab_format.cpp b/src/output/blast_tab_format.cpp index a85147a..59fafd3 100644 --- a/src/output/blast_tab_format.cpp +++ b/src/output/blast_tab_format.cpp @@ -95,17 +95,17 @@ Blast_tab_format::Blast_tab_format() : void print_staxids(Text_buffer &out, const char *id) { - const vector<string> t(tokenize(id, "\1")); std::set<unsigned> taxons; - for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i) - taxons.insert(taxonomy.get(Taxonomy::Accession(*i))); + taxonomy.get_taxids(id, taxons); + if (taxons.empty()) + return; std::set<unsigned>::const_iterator i = taxons.begin(); out << *(i++); for (; i != taxons.end(); ++i) out << ';' << *i; } -void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) const +void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) { for (vector<unsigned>::const_iterator i = fields.begin(); i != fields.end(); ++i) { switch (*i) { diff --git a/src/output/join_blocks.cpp b/src/output/join_blocks.cpp index 0b2e63e..353e06b 100644 --- a/src/output/join_blocks.cpp +++ b/src/output/join_blocks.cpp @@ -21,6 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "../data/queries.h" #include "../output/daa_write.h" #include "output_format.h" +#include "../align/query_mapper.h" struct Join_fetcher { @@ -123,7 +124,35 @@ struct Join_record }; -void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statistics, unsigned query, const char *query_name, unsigned query_source_len) +struct Join_records +{ + Join_records(vector<Binary_buffer> &buf) + { + for (unsigned i = 0; i < current_ref_block; ++i) { + it.push_back(buf[i].begin()); + Join_record::push_next(i, std::numeric_limits<unsigned>::max(), it.back(), records); + } + std::make_heap(records.begin(), records.end()); + } + Target* get_target() + { + if (records.empty()) + return 0; + const unsigned block = records.front().block_; + const unsigned subject = records.front().info_.subject_id; + Target *t = new Target(records.front().info_.score); + do { + std::pop_heap(records.begin(), records.end()); + records.pop_back(); + if (Join_record::push_next(block, subject, it[block], records)) + std::push_heap(records.begin(), records.end()); + } while (!records.empty() && records.front().info_.subject_id == subject); + } + vector<Join_record> records; + vector<Binary_buffer::Iterator> it; +}; + +void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statistics, unsigned query, const char *query_name, unsigned query_source_len, Output_format &f) { sequence context[6]; for (unsigned i = 0; i < align_mode.query_contexts; ++i) @@ -148,11 +177,11 @@ void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statis if (config.output_range(n_target_seq, next.info_.score, top_score) || same_subject) { //printf("q=%u s=%u n=%u ss=%u\n",query, next.info_.subject_id, n_target_seq, same_subject, next.info_.score); - if(*output_format == Output_format::daa) + if(f == Output_format::daa) write_daa_record(out, next.info_); else { Hsp_data hsp(next.info_, query_source_len); - output_format->print_match(Hsp_context(hsp, + f.print_match(Hsp_context(hsp, query, context[align_mode.check_context(hsp.frame)], align_mode.query_translated ? query_source_seqs::get()[query] : context[0], @@ -205,21 +234,23 @@ void join_worker(Task_queue<Text_buffer,Join_writer> *queue) if (*output_format != Output_format::daa && config.report_unaligned != 0) { for (unsigned i = fetcher.unaligned_from; i < fetcher.query_id; ++i) { output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), *out, true); - output_format->print_query_epilog(*out, true); + output_format->print_query_epilog(*out, query_ids::get()[i].c_str(), true); } } - if (*output_format == Output_format::daa) + auto_ptr<Output_format> f(output_format->clone()); + + if (*f == Output_format::daa) seek_pos = write_daa_query_record(*out, query_name, query_seq); else - output_format->print_query_intro(fetcher.query_id, query_name, (unsigned)query_seq.length(), *out, false); + f->print_query_intro(fetcher.query_id, query_name, (unsigned)query_seq.length(), *out, false); - join_query(fetcher.buf, *out, stat, fetcher.query_id, query_name, (unsigned)query_seq.length()); + join_query(fetcher.buf, *out, stat, fetcher.query_id, query_name, (unsigned)query_seq.length(), *f); - if (*output_format == Output_format::daa) + if (*f == Output_format::daa) finish_daa_query_record(*out, seek_pos); else - output_format->print_query_epilog(*out, false); + f->print_query_epilog(*out, query_name, false); queue->push(n); } @@ -242,7 +273,7 @@ void join_blocks(unsigned ref_blocks, Output_stream &master_out, const vector<Te Text_buffer out; for (unsigned i = Join_fetcher::query_last + 1; i < query_ids::get().get_length(); ++i) { output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), out, true); - output_format->print_query_epilog(out, true); + output_format->print_query_epilog(out, query_ids::get()[i].c_str(), true); } writer(out); } diff --git a/src/output/output.h b/src/output/output.h index 03fe1fe..8bef7f5 100644 --- a/src/output/output.h +++ b/src/output/output.h @@ -19,6 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #ifndef OUTPUT_H_ #define OUTPUT_H_ +#include <map> #include "../util/binary_file.h" #include "../basic/packed_transcript.h" #include "../util/text_buffer.h" @@ -98,8 +99,42 @@ struct Intermediate_record Packed_transcript transcript; }; -#ifndef ST_JOIN void join_blocks(unsigned ref_blocks, Output_stream &master_out, const vector<Temp_file> &tmp_file); -#endif + +struct Output_sink +{ + Output_sink(size_t begin, Output_stream *f) : + f_(f), + next_(begin), + size_(0), + max_size_(0) + {} + void push(size_t n, Text_buffer *buf); + size_t size() const + { + return size_; + } + size_t max_size() const + { + return max_size_; + } + static Output_sink& get() + { + return *instance; + } + size_t next() const + { + return next_; + } + static auto_ptr<Output_sink> instance; +private: + void flush(Text_buffer *buf); + tthread::mutex mtx_; + Output_stream* const f_; + std::map<size_t, Text_buffer*> backlog_; + size_t next_, size_, max_size_; +}; + +void heartbeat_worker(size_t qend); #endif \ No newline at end of file diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp index 74786f6..ef6113b 100644 --- a/src/output/output_format.cpp +++ b/src/output/output_format.cpp @@ -84,12 +84,31 @@ Output_format* get_output_format() return new Pairwise_format; else if (f[0] == "null") return new Null_format; + else if (f[0] == "102") + return new Taxon_format; else - throw std::runtime_error("Invalid output format. Allowed values: 5,6,100,101"); + throw std::runtime_error("Invalid output format. Allowed values: 0,5,6,100,101,102"); } +void init_output() +{ + output_format = auto_ptr<Output_format>(get_output_format()); + if (*output_format == Output_format::taxon && config.toppercent == 100.0) + config.toppercent = 10.0; + if (config.toppercent == 100.0) { + message_stream << "#Target sequences to report alignments for: "; + if (config.max_alignments == 0) { + config.max_alignments = std::numeric_limits<uint64_t>::max(); + message_stream << "unlimited" << endl; + } + else + message_stream << config.max_alignments << endl; + } + else + message_stream << "Percentage range of top alignment score to report hits: " << config.toppercent << endl; +} -void XML_format::print_match(const Hsp_context &r, Text_buffer &out) const +void XML_format::print_match(const Hsp_context &r, Text_buffer &out) { if(r.hsp_num == 0) { if (r.hit_num > 0) @@ -183,7 +202,7 @@ void XML_format::print_query_intro(size_t query_num, const char *query_name, uns << "<Iteration_hits>" << '\n'; } -void XML_format::print_query_epilog(Text_buffer &out, bool unaligned) const +void XML_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const { if (!unaligned) { out << " </Hit_hsps>" << '\n' diff --git a/src/output/output_format.h b/src/output/output_format.h index 4ed34a5..15880de 100644 --- a/src/output/output_format.h +++ b/src/output/output_format.h @@ -34,14 +34,15 @@ struct Output_format {} virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const {} - virtual void print_query_epilog(Text_buffer &out, bool unaligned) const + virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const {} - virtual void print_match(const Hsp_context& r, Text_buffer &out) const + virtual void print_match(const Hsp_context& r, Text_buffer &out) {} virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const { } virtual void print_footer(Output_stream &f) const { } + virtual Output_format* clone() const = 0; virtual ~Output_format() { } static void print_title(Text_buffer &buf, const char *id, bool full_titles, bool all_titles, const char *separator); @@ -50,7 +51,7 @@ struct Output_format return code; } unsigned code; - enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null }; + enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null, taxon }; }; extern auto_ptr<Output_format> output_format; @@ -60,6 +61,10 @@ struct Null_format : public Output_format Null_format() : Output_format(null) {} + virtual Output_format* clone() const + { + return new Null_format(*this); + } }; struct DAA_format : public Output_format @@ -67,6 +72,10 @@ struct DAA_format : public Output_format DAA_format(): Output_format(daa) {} + virtual Output_format* clone() const + { + return new DAA_format(*this); + } }; struct Blast_tab_format : public Output_format @@ -74,9 +83,13 @@ struct Blast_tab_format : public Output_format static const char* field_str[]; Blast_tab_format(); virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const; - virtual void print_match(const Hsp_context& r, Text_buffer &out) const; + virtual void print_match(const Hsp_context& r, Text_buffer &out); virtual ~Blast_tab_format() { } + virtual Output_format* clone() const + { + return new Blast_tab_format(*this); + } vector<unsigned> fields; }; @@ -85,11 +98,15 @@ struct Sam_format : public Output_format Sam_format(): Output_format(sam) { } - virtual void print_match(const Hsp_context& r, Text_buffer &out) const; + virtual void print_match(const Hsp_context& r, Text_buffer &out); virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const; virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const; virtual ~Sam_format() { } + virtual Output_format* clone() const + { + return new Sam_format(*this); + } }; struct XML_format : public Output_format @@ -99,13 +116,17 @@ struct XML_format : public Output_format { config.salltitles = true; } - virtual void print_match(const Hsp_context &r, Text_buffer &out) const; + virtual void print_match(const Hsp_context &r, Text_buffer &out); virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const; virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const; - virtual void print_query_epilog(Text_buffer &out, bool unaligned) const; + virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const; virtual void print_footer(Output_stream &f) const; virtual ~XML_format() { } + virtual Output_format* clone() const + { + return new XML_format(*this); + } }; struct Pairwise_format : public Output_format @@ -115,16 +136,46 @@ struct Pairwise_format : public Output_format { config.salltitles = true; } - virtual void print_match(const Hsp_context &r, Text_buffer &out) const; + virtual void print_match(const Hsp_context &r, Text_buffer &out); virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const; virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const; - virtual void print_query_epilog(Text_buffer &out, bool unaligned) const; + virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const; virtual void print_footer(Output_stream &f) const; virtual ~Pairwise_format() { } + virtual Output_format* clone() const + { + return new Pairwise_format(*this); + } +}; + +struct Taxon_format : public Output_format +{ + Taxon_format() : + Output_format(taxon), + taxid(0), + evalue(std::numeric_limits<double>::max()) + { + config.salltitles = true; + if (config.prot_accession2taxid.empty()) + throw std::runtime_error("Output format requires setting the --taxonmap parameter."); + if (config.nodesdmp.empty()) + throw std::runtime_error("Output format requires setting the --taxonnodes parameter."); + } + virtual void print_match(const Hsp_context &r, Text_buffer &out); + virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const; + virtual ~Taxon_format() + { } + virtual Output_format* clone() const + { + return new Taxon_format(*this); + } + unsigned taxid; + double evalue; }; Output_format* get_output_format(); +void init_output(); void print_hsp(Hsp_data &hsp, sequence query); #endif /* OUTPUT_FORMAT_H_ */ diff --git a/src/output/output_sink.cpp b/src/output/output_sink.cpp new file mode 100644 index 0000000..d4e8f8b --- /dev/null +++ b/src/output/output_sink.cpp @@ -0,0 +1,86 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com> + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +****/ + +#include "output.h" +#include "../data/queries.h" + +using std::map; +using std::auto_ptr; + +auto_ptr<Output_sink> Output_sink::instance; + +void Output_sink::push(size_t n, Text_buffer *buf) +{ + mtx_.lock(); + //cout << "n=" << n << " next=" << next_ << endl; + if (n != next_) { + backlog_[n] = buf; + size_ += buf ? buf->alloc_size() : 0; + max_size_ = std::max(max_size_, size_); + mtx_.unlock(); + } + else + flush(buf); +} + +void Output_sink::flush(Text_buffer *buf) +{ + size_t n = next_ + 1; + vector<Text_buffer*> out; + out.push_back(buf); + map<size_t, Text_buffer*>::iterator i; + do { + while ((i = backlog_.begin()) != backlog_.end() && i->first == n) { + out.push_back(i->second); + backlog_.erase(i); + ++n; + } + mtx_.unlock(); + size_t size = 0; + for (vector<Text_buffer*>::iterator j = out.begin(); j < out.end(); ++j) { + if (*j) { + f_->write((*j)->get_begin(), (*j)->size()); + if (*j != buf) + size += (*j)->alloc_size(); + delete *j; + } + } + out.clear(); + mtx_.lock(); + size_ -= size; + } while ((i = backlog_.begin()) != backlog_.end() && i->first == n); + next_ = n; + mtx_.unlock(); +} + +void heartbeat_worker(size_t qend) +{ + static const int interval = 100; + int n = 0; + while (Output_sink::get().next() < qend) { + if (n == interval) { + const string title(query_ids::get()[Output_sink::get().next()].c_str()); + verbose_stream << "Queries=" << Output_sink::get().next() << " size=" << megabytes(Output_sink::get().size()) << " max_size=" << megabytes(Output_sink::get().max_size()) + << " next=" << title.substr(0, title.find(' ')) << endl; + n = 0; + } + else + ++n; + tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10)); + } +} \ No newline at end of file diff --git a/src/output/sam_format.cpp b/src/output/sam_format.cpp index 932e406..d8a4469 100644 --- a/src/output/sam_format.cpp +++ b/src/output/sam_format.cpp @@ -82,7 +82,7 @@ void Sam_format::print_query_intro(size_t query_num, const char *query_name, uns } } -void Sam_format::print_match(const Hsp_context& r, Text_buffer &out) const +void Sam_format::print_match(const Hsp_context& r, Text_buffer &out) { out.write_until(r.query_name, Const::id_delimiters); out << '\t' << '0' << '\t'; diff --git a/src/basic/const.h b/src/output/target_culling.cpp similarity index 62% copy from src/basic/const.h copy to src/output/target_culling.cpp index 0dd7c7a..52df2fe 100644 --- a/src/basic/const.h +++ b/src/output/target_culling.cpp @@ -16,31 +16,9 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. ****/ -#ifndef CONST_H_ -#define CONST_H_ +#include "target_culling.h" -struct Const +Target_culling* Target_culling::get(const Target* first) { - - enum { - build_version = 109, - daa_version = 0, - seedp_bits = 10, - seedp = 1<<seedp_bits, - max_seed_weight = 32, - max_shapes = 16, - max_shape_len = 32 - }; - - static const char* version_string; - static const char* program_name; - static const char* id_delimiters; - -}; - -#define SIMPLE_SEARCH -// #define FREQUENCY_MASKING -// #define ST_JOIN -// #define NO_COLLISION_FILTER - -#endif /* CONST_H_ */ + return config.query_overlap_culling == 0.0 ? new Target_culling(first) : new Overlap_culling; +} \ No newline at end of file diff --git a/src/output/target_culling.h b/src/output/target_culling.h new file mode 100644 index 0000000..365b548 --- /dev/null +++ b/src/output/target_culling.h @@ -0,0 +1,73 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com> + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +****/ + +#ifndef TARGET_CULLING_H_ +#define TARGET_CULLING_H_ + +#include <vector> +#include "../align/query_mapper.h" + +struct Target_culling +{ + Target_culling(): + top_score_(0) + {} + Target_culling(const Target *first): + n_(0), + top_score_(first ? first->filter_score : 0) + {} + virtual int cull(const Target &t) + { + return config.output_range((unsigned)n_, t.filter_score, top_score_) ? use : finished; + } + virtual void add(Target *t) + { + ++n_; + } + enum { finished = 0, next = 1, use = 2}; + static Target_culling* get(const Target* first); +private: + size_t n_; + const int top_score_; +}; + +struct Overlap_culling : Target_culling +{ + Overlap_culling() + {} + virtual int cull(const Target &t) + { + return t.is_enveloped(targets_.begin(), targets_.end(), config.query_overlap_culling / 100.0, int(double(t.filter_score) / (1.0 - config.toppercent / 100.0))) + ? next : use; + } + virtual void add(Target *t) + { + targets_.push_back(t); + } + virtual void add(unsigned n, const Hsp_data &hsp) + { + if (n >= targets_.size()) { + targets_.push_back(new Target(hsp.score)); + } + targets_[n]->ts.push_back(Hsp_traits(hsp)); + } +private: + std::vector<Target*> targets_; +}; + +#endif \ No newline at end of file diff --git a/src/output/taxon_format.cpp b/src/output/taxon_format.cpp new file mode 100644 index 0000000..5979ae4 --- /dev/null +++ b/src/output/taxon_format.cpp @@ -0,0 +1,51 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com> + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +****/ + +#include <set> +#include "output_format.h" +#include "../data/taxonomy.h" + +using std::set; + +void Taxon_format::print_match(const Hsp_context &r, Text_buffer &out) +{ + set<unsigned> taxons; + taxonomy.get_taxids(r.subject_name, taxons); + if (taxons.empty()) + return; + evalue = std::min(evalue, r.evalue()); + try { + for (set<unsigned>::const_iterator i = taxons.begin(); i != taxons.end(); ++i) + taxid = taxonomy.get_lca(taxid, *i); + } + catch (std::exception &) { + std::cerr << "Query=" << r.query_name << endl << "Subject=" << r.subject_name << endl; + throw; + } +} + +void Taxon_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const +{ + out.write_until(query_title, Const::id_delimiters); + out << '\t' << taxid << '\t'; + if (taxid != 0) + out.print_e(evalue); + else + out << '0'; + out << '\n'; +} \ No newline at end of file diff --git a/src/output/view.h b/src/output/view.cpp similarity index 73% rename from src/output/view.h rename to src/output/view.cpp index 21a2ba6..2a66b01 100644 --- a/src/output/view.h +++ b/src/output/view.cpp @@ -16,9 +16,6 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. ****/ -#ifndef VIEW_H_ -#define VIEW_H_ - #include "../basic/config.h" #include "../util/binary_file.h" #include "../util/text_buffer.h" @@ -28,13 +25,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "../util/task_queue.h" #include "../basic/score_matrix.h" #include "../util/thread.h" +#include "../data/taxonomy.h" const unsigned view_buf_size = 32; struct View_writer { - View_writer(): - f_ (config.compression == 1 + View_writer() : + f_(config.compression == 1 ? new Compressed_ostream(config.output_file) : new Output_stream(config.output_file)) { } @@ -52,17 +50,18 @@ struct View_writer struct View_fetcher { - View_fetcher(DAA_file &daa): - daa (daa) + View_fetcher(DAA_file &daa) : + daa(daa) { } bool operator()() { n = 0; - for(unsigned i=0;i<view_buf_size;++i) + for (unsigned i = 0; i<view_buf_size; ++i) if (!daa.read_query_buffer(buf[i], query_num)) { query_num -= n - 1; return false; - } else + } + else ++n; query_num -= n - 1; return true; @@ -73,63 +72,73 @@ struct View_fetcher DAA_file &daa; }; -void view_query(DAA_query_record &r, Text_buffer &out, const Output_format &format) +void view_query(DAA_query_record &r, Text_buffer &out, Output_format &format) { - format.print_query_intro(r.query_num, r.query_name.c_str(), (unsigned)r.query_len(), out, false); - for (DAA_query_record::Match_iterator i = r.begin(); i.good(); ++i) { + auto_ptr<Output_format> f(format.clone()); + f->print_query_intro(r.query_num, r.query_name.c_str(), (unsigned)r.query_len(), out, false); + DAA_query_record::Match_iterator i = r.begin(); + const unsigned top_score = i.good() ? i->score : 0; + for (; i.good(); ++i) { if (i->frame > 2 && config.forwardonly) continue; - format.print_match(i->context(), out); + if (!config.output_range(i->hit_num, i->score, top_score)) + break; + f->print_match(i->context(), out); } - format.print_query_epilog(out, false); + f->print_query_epilog(out, r.query_name.c_str(), false); } struct View_context { - View_context(DAA_file &daa, View_writer &writer, const Output_format &format): - daa (daa), - writer (writer), - queue (3*config.threads_, writer), - format (format) + View_context(DAA_file &daa, View_writer &writer, Output_format &format) : + daa(daa), + writer(writer), + queue(3 * config.threads_, writer), + format(format) { } void operator()(unsigned thread_id) { try { size_t n; - View_fetcher query_buf (daa); + View_fetcher query_buf(daa); Text_buffer *buffer = 0; - while(queue.get(n, buffer, query_buf)) { + while (queue.get(n, buffer, query_buf)) { for (unsigned j = 0; j < query_buf.n; ++j) { DAA_query_record r(daa, query_buf.buf[j], query_buf.query_num + j); view_query(r, *buffer, format); } queue.push(n); } - } catch(std::exception &e) { + } + catch (std::exception &e) { std::cout << e.what() << std::endl; std::terminate(); } } DAA_file &daa; View_writer &writer; - Task_queue<Text_buffer,View_writer> queue; - const Output_format &format; + Task_queue<Text_buffer, View_writer> queue; + Output_format &format; }; void view() { - DAA_file daa (config.daa_file); + task_timer timer("Loading subject IDs"); + DAA_file daa(config.daa_file); score_matrix = Score_matrix("", daa.lambda(), daa.kappa(), daa.gap_open_penalty(), daa.gap_extension_penalty()); + timer.finish(); message_stream << "Scoring parameters: " << score_matrix << endl; verbose_stream << "Build version = " << daa.diamond_build() << endl; message_stream << "DB sequences = " << daa.db_seqs() << endl; message_stream << "DB sequences used = " << daa.db_seqs_used() << endl; message_stream << "DB letters = " << daa.db_letters() << endl; - - task_timer timer("Generating output"); + + init_output(); + taxonomy.init(); + + timer.go("Generating output"); View_writer writer; - output_format = auto_ptr<Output_format>(get_output_format()); Binary_buffer buf; size_t query_num; @@ -152,5 +161,3 @@ void view() output_format->print_footer(*writer.f_); } - -#endif /* VIEW_H_ */ diff --git a/src/run/double_indexed.cpp b/src/run/double_indexed.cpp index a6fa614..ab2ede5 100644 --- a/src/run/double_indexed.cpp +++ b/src/run/double_indexed.cpp @@ -22,7 +22,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include "../data/queries.h" #include "../basic/statistics.h" #include "../basic/shape_config.h" -#include "../align/align_queries.h" #include "../search/align_range.h" #include "../util/seq_file_format.h" #include "../data/load_seqs.h" @@ -320,7 +319,7 @@ void master_thread_di() timer2.start(); align_mode = Align_mode(Align_mode::from_command(config.command)); - output_format = auto_ptr<Output_format>(get_output_format()); + init_output(); message_stream << "Temporary directory: " << Temp_file::get_temp_dir() << endl; @@ -343,12 +342,7 @@ void master_thread_di() Config::set_option(config.db_size, (uint64_t)ref_header.letters); set_max_open_files(config.query_bins * config.threads_ + unsigned(ref_header.letters / (size_t)(config.chunk_size * 1e9)) + 16); - - if (!config.prot_accession2taxid.empty()) { - timer.go("Loading taxonomy"); - taxonomy.load(); - timer.finish(); - } + taxonomy.init(); master_thread(db_file, timer2); } \ No newline at end of file diff --git a/src/run/main.cpp b/src/run/main.cpp index 79b1fe5..db7e36d 100644 --- a/src/run/main.cpp +++ b/src/run/main.cpp @@ -18,7 +18,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include <iostream> #include "../basic/config.h" -#include "../output/view.h" #include "tools.h" #include "../extra/compare.h" @@ -31,6 +30,9 @@ void model_seqs(); void opt(); void run_masker(); void fastq2fasta(); +void view(); +void make_db(); +void db_info(); int main(int ac, const char* av[]) { @@ -95,6 +97,9 @@ int main(int ac, const char* av[]) case Config::fastq2fasta: fastq2fasta(); break; + case Config::dbinfo: + db_info(); + break; default: return 1; } diff --git a/src/run/tools.cpp b/src/run/tools.cpp index 5e76d01..7b6b6c8 100644 --- a/src/run/tools.cpp +++ b/src/run/tools.cpp @@ -118,14 +118,14 @@ void run_masker() cout << '>' << string(id.data(), id.size()) << endl; seq2 = seq; Masking::get()(seq2.data(), seq2.size()); - for (size_t i = 0; i < seq.size(); ++i) { + /*for (size_t i = 0; i < seq.size(); ++i) { char c = value_traits.alphabet[(long)seq[i]]; if (seq2[i] == value_traits.mask_char) c = tolower(c); cout << c; } - cout << endl; - //cout << sequence(seq.data(), seq.size()) << endl; + cout << endl;*/ + cout << sequence(seq2.data(), seq2.size()) << endl; } } diff --git a/src/util/log_stream.h b/src/util/log_stream.h index 9aacba1..06bd34c 100644 --- a/src/util/log_stream.h +++ b/src/util/log_stream.h @@ -69,6 +69,11 @@ extern Message_stream log_stream; struct task_timer { + task_timer(unsigned level = 1) : + level_(level), + msg_(0) + { + } task_timer(const char *msg, unsigned level=1) : level_(level), msg_(msg) diff --git a/src/basic/const.h b/src/util/queue.h similarity index 60% copy from src/basic/const.h copy to src/util/queue.h index 0dd7c7a..cc24dfc 100644 --- a/src/basic/const.h +++ b/src/util/queue.h @@ -16,31 +16,38 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. ****/ -#ifndef CONST_H_ -#define CONST_H_ +#include "tinythread.h" -struct Const +struct Queue { - - enum { - build_version = 109, - daa_version = 0, - seedp_bits = 10, - seedp = 1<<seedp_bits, - max_seed_weight = 32, - max_shapes = 16, - max_shape_len = 32 - }; - - static const char* version_string; - static const char* program_name; - static const char* id_delimiters; - -}; - -#define SIMPLE_SEARCH -// #define FREQUENCY_MASKING -// #define ST_JOIN -// #define NO_COLLISION_FILTER - -#endif /* CONST_H_ */ + enum { end = 0xffffffffffffffffllu }; + Queue(size_t begin, size_t end) : + next_(begin), + end_(end) + {} + template<typename _f> + size_t get(_f &f) + { + mtx_.lock(); + const size_t q = next_++; + if (q >= end_) { + mtx_.unlock(); + return Queue::end; + } + f(q); + mtx_.unlock(); + return q; + } + size_t next() const + { + return next_; + } + size_t get_end() const + { + return end_; + } +private: + tthread::mutex mtx_; + size_t next_; + const size_t end_; +}; \ No newline at end of file -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/diamond-aligner.git _______________________________________________ debian-med-commit mailing list debian-med-commit@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit