This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository diamond-aligner.

commit baec593e94cc4cfd918fa334eb9493a638abdc70
Author: Andreas Tille <ti...@debian.org>
Date:   Mon Sep 18 13:30:39 2017 +0200

    New upstream version 0.9.10+dfsg
---
 CMakeLists.txt                                   |   4 +
 README.rst                                       |   2 +-
 build_simple.sh                                  |   4 +
 src/ChangeLog                                    |  14 +-
 src/align/align.cpp                              | 288 +++++------------------
 src/align/align.h                                |  64 +----
 src/align/align_queries.h                        |  67 ------
 src/align/align_target.cpp                       |   4 +
 src/align/query_mapper.cpp                       | 133 ++++++-----
 src/align/query_mapper.h                         |  16 ++
 src/basic/basic.cpp                              |   4 +-
 src/basic/config.cpp                             |  28 ++-
 src/basic/config.h                               |   6 +-
 src/basic/const.h                                |   2 +-
 src/basic/hssp.cpp                               |  30 ++-
 src/basic/match.h                                |   2 +
 src/basic/sequence.h                             |   9 +-
 src/basic/value.h                                |   2 +-
 src/data/load_seqs.h                             |  12 +-
 src/data/reference.cpp                           |  22 ++
 src/data/reference.h                             |   9 +-
 src/data/taxonomy.cpp                            |  81 ++++++-
 src/data/taxonomy.h                              |  15 ++
 src/dp/dp.h                                      |   7 +-
 src/dp/swipe.cpp                                 |   2 +-
 src/extra/match_file.h                           |   1 +
 src/output/blast_pairwise_format.cpp             |   5 +-
 src/output/blast_tab_format.cpp                  |   8 +-
 src/output/join_blocks.cpp                       |  51 +++-
 src/output/output.h                              |  39 ++-
 src/output/output_format.cpp                     |  25 +-
 src/output/output_format.h                       |  69 +++++-
 src/output/output_sink.cpp                       |  86 +++++++
 src/output/sam_format.cpp                        |   2 +-
 src/{basic/const.h => output/target_culling.cpp} |  30 +--
 src/output/target_culling.h                      |  73 ++++++
 src/output/taxon_format.cpp                      |  51 ++++
 src/output/{view.h => view.cpp}                  |  67 +++---
 src/run/double_indexed.cpp                       |  10 +-
 src/run/main.cpp                                 |   7 +-
 src/run/tools.cpp                                |   6 +-
 src/util/log_stream.h                            |   5 +
 src/{basic/const.h => util/queue.h}              |  59 +++--
 43 files changed, 830 insertions(+), 591 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05d6c12..35ee8bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,10 @@ add_executable(diamond src/run/main.cpp
   src/data/seed_set.cpp
   src/util/binary_file.cpp
   src/util/simd.cpp
+  src/output/taxon_format.cpp
+  src/output/view.cpp
+  src/output/output_sink.cpp
+  src/output/target_culling.cpp
 )
 
 target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/README.rst b/README.rst
index 7d043f3..40c77f1 100644
--- a/README.rst
+++ b/README.rst
@@ -8,7 +8,7 @@ Please read the `manual 
<https://github.com/bbuchfink/diamond/raw/master/diamond
 
 Installing the software on your system may be done by downloading it in binary 
format for immediate use::
 
-    wget 
http://github.com/bbuchfink/diamond/releases/download/v0.9.8/diamond-linux64.tar.gz
+    wget 
http://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz
     tar xzf diamond-linux64.tar.gz
 
 The extracted ``diamond`` binary file should be moved to a directory contained 
in your executable search path (PATH environment variable).
diff --git a/build_simple.sh b/build_simple.sh
index 17fb696..6a8a4dc 100755
--- a/build_simple.sh
+++ b/build_simple.sh
@@ -53,4 +53,8 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
   src/data/seed_set.cpp \
   src/util/binary_file.cpp \
   src/util/simd.cpp \
+  src/output/taxon_format.cpp \
+  src/output/view.cpp \
+  src/output/output_sink.cpp \
+  src/output/target_culling.cpp \
 -lz -lpthread -o diamond
diff --git a/src/ChangeLog b/src/ChangeLog
index b1f7103..feaca8c 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,13 @@
+[0.9.10]
+- added --strand option to choose query strand
+- added dbinfo command
+
+[0.9.9]
+- Added taxonomic classification format.
+- Fixed a bug in getseq printing masked residues.
+- Fixed parsing of UniRef100_ sequence id prefixes.
+- Added support for using the staxids output field in diamond view.
+
 [0.9.8]
 - Fixed a compiler errror.
 
@@ -205,7 +215,7 @@
 
 [0.7.11]
 - added --version switch
-- added static build optiom for CMake build
+- added static build option for CMake build
 - fixed a bug that would cause a return code of 1 without an error
 
 [0.7.10]
@@ -270,4 +280,4 @@
 - reduced usage of temporary disk space and memory
 - fixed a compiler error for GCC 4.1.2
 - fixed a bug that could cause the program to hang when running out of 
temporary space or memory
-- added --forwardonly option to view command
+- added --forwardonly option to view command
\ No newline at end of file
diff --git a/src/align/align.cpp b/src/align/align.cpp
index 8b6795c..2e60268 100644
--- a/src/align/align.cpp
+++ b/src/align/align.cpp
@@ -18,93 +18,65 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 
 #include "../basic/value.h"
 #include "align.h"
-#include "align_queries.h"
 #include "../data/reference.h"
 #include "../output/output_format.h"
+#include "../util/queue.h"
+#include "../output/output.h"
+#include "query_mapper.h"
+#include "../util/merge_sort.h"
 
 using std::map;
 
-auto_ptr<Simple_query_queue> Simple_query_queue::instance;
-auto_ptr<Output_sink> Output_sink::instance;
-
-size_t Simple_query_queue::get(vector<hit>::iterator &begin, 
vector<hit>::iterator &end)
+struct Align_fetcher
 {
-       mtx_.lock();
-       const unsigned query = (unsigned)(next_++),
-               c = align_mode.query_contexts;
-       if (query >= qend_) {
-               mtx_.unlock();
-               return Simple_query_queue::end;
+       static void init(size_t qbegin, size_t qend, vector<hit>::iterator 
begin, vector<hit>::iterator end)
+       {
+               it_ = begin;
+               end_ = end;
+               queue_ = auto_ptr<Queue>(new Queue(qbegin, qend));
        }
-       begin = it_;
-       while (it_ < end_ && it_->query_ / c == query)
-               ++it_;
-       end = it_;
-       mtx_.unlock();  
-       return query;
-}
-
-void Output_sink::push(size_t n, Text_buffer *buf)
-{
-       mtx_.lock();
-       //cout << "n=" << n << " next=" << next_ << endl;
-       if (n != next_) {
-               backlog_[n] = buf;
-               size_ += buf ? buf->alloc_size() : 0;
-               max_size_ = std::max(max_size_, size_);
-               mtx_.unlock();
+       void operator()(size_t query)
+       {
+               const unsigned q = (unsigned)query,
+                       c = align_mode.query_contexts;
+               begin = it_;
+               while (it_ < end_ && it_->query_ / c == q)
+                       ++it_;
+               end = it_;
+               this->query = query;
        }
-       else
-               flush(buf);
-}
+       bool get()
+       {
+               return queue_->get(*this) != Queue::end;
+       }
+       size_t query;
+       vector<hit>::iterator begin, end;
+private:       
+       static vector<hit>::iterator it_, end_;
+       static auto_ptr<Queue> queue_;
+};
 
-void Output_sink::flush(Text_buffer *buf)
-{
-       size_t n = next_ + 1;
-       vector<Text_buffer*> out;
-       out.push_back(buf);
-       map<size_t, Text_buffer*>::iterator i;
-       do {
-               while ((i = backlog_.begin()) != backlog_.end() && i->first == 
n) {
-                       out.push_back(i->second);
-                       backlog_.erase(i);
-                       ++n;
-               }
-               mtx_.unlock();
-               size_t size = 0;
-               for (vector<Text_buffer*>::iterator j = out.begin(); j < 
out.end(); ++j) {
-                       if (*j) {
-                               f_->write((*j)->get_begin(), (*j)->size());
-                               if(*j != buf)
-                                       size += (*j)->alloc_size();
-                               delete *j;
-                       }
-               }
-               out.clear();
-               mtx_.lock();
-               size_ -= size;
-       } while ((i = backlog_.begin()) != backlog_.end() && i->first == n);
-       next_ = n;
-       mtx_.unlock();
-}
+auto_ptr<Queue> Align_fetcher::queue_;
+vector<hit>::iterator Align_fetcher::it_;
+vector<hit>::iterator Align_fetcher::end_;
 
 void align_worker(size_t thread_id)
 {
-       vector<hit>::iterator begin, end;
-       size_t query;
+       Align_fetcher hits;
        Statistics stat;
-       while ((query = Simple_query_queue::get().get(begin, end)) != 
Simple_query_queue::end) {
-               if (end == begin) {
+       while (hits.get()) {
+               if (hits.end == hits.begin) {
                        Text_buffer *buf = 0;
                        if (!blocked_processing && *output_format != 
Output_format::daa && config.report_unaligned != 0) {
                                buf = new Text_buffer;
-                               output_format->print_query_intro(query, 
query_ids::get()[query].c_str(), get_source_query_len((unsigned)query), *buf, 
true);
-                               output_format->print_query_epilog(*buf, true);
+                               const char *query_title = 
query_ids::get()[hits.query].c_str();
+                               output_format->print_query_intro(hits.query, 
query_title, get_source_query_len((unsigned)hits.query), *buf, true);
+                               output_format->print_query_epilog(*buf, 
query_title, true);
                        }
-                       Output_sink::get().push(query, buf);
+                       Output_sink::get().push(hits.query, buf);
                        continue;
                }
-               Query_mapper mapper(query, begin, end);
+               Query_mapper mapper(hits.query, hits.begin, hits.end);
                mapper.init();
                if (config.ext == Config::swipe)
                        mapper.align_targets(stat);
@@ -113,11 +85,13 @@ void align_worker(size_t thread_id)
                        for (size_t i = 0; i < mapper.n_targets(); ++i)
                                mapper.ungapped_stage(i);
                        if (config.ext != Config::most_greedy) {
+                               mapper.fill_source_ranges();
                                mapper.rank_targets(config.rank_ratio == -1 ? 
(mapper.query_seq(0).length() > 50 ? 0.6 : 0.9) : config.rank_ratio);
                                stat.inc(Statistics::TARGET_HITS1, 
mapper.n_targets());
                                const int cutoff = 
int(mapper.raw_score_cutoff() * config.score_ratio);
                                for (size_t i = 0; i < mapper.n_targets(); ++i)
                                        mapper.greedy_stage(i, stat, cutoff);
+                               mapper.fill_source_ranges();
                                mapper.rank_targets(config.rank_ratio2 == -1 ? 
(mapper.query_seq(0).length() > 50 ? 0.95 : 1.0) : config.rank_ratio2);
                                stat.inc(Statistics::TARGET_HITS2, 
mapper.n_targets());
                                for (size_t i = 0; i < mapper.n_targets(); ++i)
@@ -129,157 +103,18 @@ void align_worker(size_t thread_id)
                        buf = new Text_buffer;
                        const bool aligned = mapper.generate_output(*buf, stat);
                        if (aligned && !config.unaligned.empty())
-                               query_aligned[query] = true;
-               }
-               Output_sink::get().push(query, buf);
-       }
-       statistics += stat;
-}
-
-void heartbeat_worker()
-{
-       static const int interval = 100;
-       int n = 0;
-       while (Output_sink::get().next() < Simple_query_queue::get().qend()) {
-               if (n == interval) {
-                       const string 
title(query_ids::get()[Output_sink::get().next()].c_str());
-                       verbose_stream << "Queries=" << 
Simple_query_queue::get().next() << " size=" << 
megabytes(Output_sink::get().size()) << " max_size=" << 
megabytes(Output_sink::get().max_size())
-                               << " next=" << title.substr(0, title.find(' ')) 
<< endl;
-                       n = 0;
-               }
-               else
-                       ++n;
-               
tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10));
-       }
-}
-
-Query_queue query_queue;
-
-void Query_queue::init(Trace_pt_list::iterator begin, Trace_pt_list::iterator 
end)
-{
-       trace_pt_pos = begin;
-       trace_pt_end = end;
-       assert(queue.empty());
-       assert(out_queue.empty());
-       n = 0;
-}
-       
-void Query_queue::flush(Output_stream *out, Statistics &stat)
-{
-       writing = true;
-       std::queue<Query_data*> q;
-       while (true) {
-               while (!out_queue.empty() && out_queue.front()->state == 
Query_data::finished) {
-                       q.push(out_queue.front());
-                       out_queue.pop();
-               }
-               if (q.empty()) {
-                       writing = false;
-                       lock.unlock();
-                       return;
-               }
-               lock.unlock();
-
-               unsigned k = 0;
-               while (!q.empty()) {
-                       out->write(q.front()->buf.get_begin(), 
q.front()->buf.size());
-                       delete q.front();
-                       q.pop();
-                       ++k;
+                               query_aligned[hits.query] = true;
                }
-
-               lock.lock();
-               n -= k;
-               /*if (n > 100)
-               cout << "qlen=" << out_queue.size() << " finished=" << n << 
endl;*/
+               Output_sink::get().push(hits.query, buf);
        }
-}
-
-Query_data* Query_queue::get()
-{
-       for (std::deque<Query_data*>::iterator i = queue.begin(); i != 
queue.end(); ++i)
-               if ((*i)->state == Query_data::free)
-                       return *i;
-       return 0;
-}
-
-void Query_queue::pop_busy()
-{
-       while (!queue.empty() && (queue.front()->state == Query_data::closing 
|| queue.front()->state == Query_data::finished)) {
-               out_queue.push(queue.front());
-               queue.pop_front();
-       }
-}
-
-void align_worker(Output_stream *out)
-{
-       Statistics stat;
-       Query_data *data = 0;
-       unsigned n_targets = 0;
-
-       while (true) {
-
-               query_queue.lock.lock();
-
-               if (data) {
-                       data->mapper->targets_finished += n_targets;
-                       if (data->mapper->finished()) {
-                               query_queue.lock.unlock();
-                               const bool aligned = 
data->mapper->generate_output(data->buf, stat);
-                               const unsigned query_id = 
data->mapper->query_id;
-                               delete data->mapper;
-                               query_queue.lock.lock();
-                               data->state = Query_data::finished;
-                               if (aligned && !config.unaligned.empty())
-                                       query_aligned[query_id] = true;
-                               ++query_queue.n;
-                               if (!query_queue.writing && 
!query_queue.out_queue.empty() && data == query_queue.out_queue.front()) {
-                                       query_queue.flush(out, stat);
-                                       data = 0;
-                                       continue;
-                               }
-                       }
-               }
-
-               if (!(data = query_queue.get())) {
-                       if (query_queue.trace_pt_pos >= 
query_queue.trace_pt_end) {
-                               query_queue.lock.unlock();
-                               break;
-                       }
-                       else {
-                               query_queue.queue.push_back(new Query_data(new 
Query_mapper()));
-                               data = query_queue.queue.back();
-                               query_queue.lock.unlock();
-                               data->mapper->init();
-                               data->state = Query_data::free;
-                               data = 0;
-                       }
-               }
-               else {
-                       size_t target = data->mapper->next_target;
-                       n_targets = std::min(config.target_fetch_size, 
(unsigned)data->mapper->n_targets() - data->mapper->next_target);
-                       data->mapper->next_target += n_targets;
-                       if (target + n_targets == data->mapper->n_targets()) {
-                               data->state = Query_data::closing;
-                               query_queue.pop_busy();
-                       }
-                       query_queue.lock.unlock();
-
-                       for (unsigned i = 0; i < n_targets; ++i)
-                               data->mapper->align_target(target + i, stat);
-               }
-
-       }
-
        statistics += stat;
 }
 
 void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file)
 {
-       query_queue.last_query = (unsigned)-1;
        const size_t max_size = (size_t)std::min(config.chunk_size*1e9 * 9 * 2 
/ config.lowmem, 2e9);
        pair<size_t, size_t> query_range;
-       while(true) {
+       while (true) {
                task_timer timer("Loading trace points", 3);
                Trace_pt_list *v = new Trace_pt_list;
                statistics.max(Statistics::TEMP_SPACE, trace_pts.load(*v, 
max_size, query_range));
@@ -291,32 +126,15 @@ void align_queries(Trace_pt_buffer &trace_pts, 
Output_stream* output_file)
                merge_sort(v->begin(), v->end(), config.threads_);
                v->init();
                timer.go("Computing alignments");
-               if (config.load_balancing == Config::target_parallel) {
-                       query_queue.init(v->begin(), v->end());
-                       Thread_pool threads;
-                       for (unsigned i = 0; i < config.threads_; ++i)
-                               
threads.push_back(launch_thread(static_cast<void 
(*)(Output_stream*)>(&align_worker), output_file));
-                       threads.join_all();
-               }
-               else {
-                       Simple_query_queue::instance = 
auto_ptr<Simple_query_queue>(new Simple_query_queue(query_range.first, 
query_range.second, v->begin(), v->end()));
-                       Output_sink::instance = auto_ptr<Output_sink>(new 
Output_sink(query_range.first, output_file));
-                       Thread_pool threads;
-                       if (config.verbosity >= 3)
-                               
threads.push_back(launch_thread(heartbeat_worker));
-                       for (size_t i = 0; i < (config.threads_align == 0 ? 
config.threads_ : config.threads_align); ++i)
-                               
threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), 
i));
-                       threads.join_all();
-               }
+               Align_fetcher::init(query_range.first, query_range.second, 
v->begin(), v->end());
+               Output_sink::instance = auto_ptr<Output_sink>(new 
Output_sink(query_range.first, output_file));
+               Thread_pool threads;
+               if (config.verbosity >= 3)
+                       threads.push_back(launch_thread(heartbeat_worker, 
query_range.second));
+               for (size_t i = 0; i < (config.threads_align == 0 ? 
config.threads_ : config.threads_align); ++i)
+                       
threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), 
i));
+               threads.join_all();
                timer.go("Deallocating buffers");
                delete v;
        }
-       if (!blocked_processing && *output_format != Output_format::daa && 
config.report_unaligned != 0 && config.load_balancing == 
Config::target_parallel) {
-               Text_buffer buf;
-               for (unsigned i = query_queue.last_query + 1; i < 
query_ids::get().get_length(); ++i) {
-                       output_format->print_query_intro(i, 
query_ids::get()[i].c_str(), get_source_query_len(i), buf, true);
-                       output_format->print_query_epilog(buf, true);
-               }
-               output_file->write(buf.get_begin(), buf.size());
-       }
 }
\ No newline at end of file
diff --git a/src/align/align.h b/src/align/align.h
index 33588b7..07d4b27 100644
--- a/src/align/align.h
+++ b/src/align/align.h
@@ -64,68 +64,6 @@ private:
        Task_queue<_buffer, Output_writer> queue;
 };
 
-struct Simple_query_queue
-{
-       enum { end = 0xffffffffffffffffllu };
-       Simple_query_queue(size_t qbegin, size_t qend, vector<hit>::iterator 
begin, vector<hit>::iterator end):
-               next_(qbegin),
-               qend_(qend),
-               it_(begin),
-               end_(end)
-       {}
-       size_t get(vector<hit>::iterator &begin, vector<hit>::iterator &end);
-       size_t next() const
-       {
-               return next_;
-       }
-       size_t qend() const
-       {
-               return qend_;
-       }
-       static Simple_query_queue& get()
-       {
-               return *instance;
-       }
-       static auto_ptr<Simple_query_queue> instance;
-private:
-       tthread::mutex mtx_;
-       size_t next_;
-       const size_t qend_;
-       vector<hit>::iterator it_, end_;
-};
-
-struct Output_sink
-{
-       Output_sink(size_t begin, Output_stream *f):
-               f_(f),
-               next_(begin),
-               size_(0),
-               max_size_(0)
-       {}
-       void push(size_t n, Text_buffer *buf);
-       size_t size() const
-       {
-               return size_;
-       }
-       size_t max_size() const
-       {
-               return max_size_;
-       }
-       static Output_sink& get()
-       {
-               return *instance;
-       }
-       size_t next() const
-       {
-               return next_;
-       }
-       static auto_ptr<Output_sink> instance;
-private:
-       void flush(Text_buffer *buf);
-       tthread::mutex mtx_;
-       Output_stream* const f_;
-       std::map<size_t, Text_buffer*> backlog_;
-       size_t next_, size_, max_size_;
-};
+void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file);
 
 #endif
\ No newline at end of file
diff --git a/src/align/align_queries.h b/src/align/align_queries.h
deleted file mode 100644
index 8409f46..0000000
--- a/src/align/align_queries.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/****
-DIAMOND protein aligner
-Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as
-published by the Free Software Foundation, either version 3 of the
-License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-****/
-
-#ifndef ALIGN_QUERIES_H_
-#define ALIGN_QUERIES_H_
-
-#include <deque>
-#include "../util/merge_sort.h"
-#include "../search/trace_pt_buffer.h"
-#include "../util/map.h"
-#include "../util/task_queue.h"
-#include "align.h"
-#include "query_mapper.h"
-
-using std::vector;
-
-struct Query_data
-{
-       Query_data():
-               mapper(0)
-       {}
-       Query_data(Query_mapper *mapper):
-               mapper(mapper),
-               state(init)
-       {}
-       Query_mapper *mapper;
-       Text_buffer buf;
-       enum { init, free, closing, finished };
-       unsigned state;
-};
-
-struct Query_queue
-{
-       void init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end);
-       void flush(Output_stream *out, Statistics &stat);
-       Query_data* get();
-       void pop_busy();
-
-       tthread::mutex lock;
-       std::deque<Query_data*> queue;
-       std::queue<Query_data*> out_queue;
-       Trace_pt_list::iterator trace_pt_pos, trace_pt_end;
-       bool writing;
-       unsigned n, last_query;
-};
-
-extern Query_queue query_queue;
-
-void align_worker(Output_stream *out);
-void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file);
-
-#endif /* ALIGN_QUERIES_H_ */
diff --git a/src/align/align_target.cpp b/src/align/align_target.cpp
index 43af301..e027ac6 100644
--- a/src/align/align_target.cpp
+++ b/src/align/align_target.cpp
@@ -272,6 +272,10 @@ void Query_mapper::align_target(size_t idx, Statistics 
&stat)
        target.hsps.sort();
        if(target.hsps.size() > 0)
                target.filter_score = target.hsps.front().score;
+
+       target.ts.clear();
+       for (list<Hsp_data>::iterator i = target.hsps.begin(); i != 
target.hsps.end(); ++i)
+               target.ts.push_back(Hsp_traits(i->query_source_range));
        
        if (config.use_smith_waterman && !target.hsps.empty()) {
                int score;
diff --git a/src/align/query_mapper.cpp b/src/align/query_mapper.cpp
index 401a52e..12cb96e 100644
--- a/src/align/query_mapper.cpp
+++ b/src/align/query_mapper.cpp
@@ -16,24 +16,36 @@ You should have received a copy of the GNU Affero General 
Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#include "align_queries.h"
 #include "query_mapper.h"
 #include "../data/reference.h"
 #include "extend_ungapped.h"
 #include "../output/output.h"
 #include "../output/output_format.h"
 #include "../output/daa_write.h"
+#include "../output/target_culling.h"
 
-Query_mapper::Query_mapper() :
-       source_hits(get_query_data()),
-       query_id(source_hits.first->query_ / align_mode.query_contexts),
-       targets_finished(0),
-       next_target(0),
-       source_query_len(get_source_query_len(query_id)),
-       unaligned_from(query_queue.last_query+1)
-{      
-       query_queue.last_query = query_id;
-       seed_hits.reserve(source_hits.second - source_hits.first);
+bool Target::envelopes(const Hsp_traits &t, double p) const
+{
+       for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); 
++i)
+               if (t.query_source_range.overlap_factor(i->query_source_range) 
>= p)
+                       return true;
+       return false;
+}
+
+bool Target::is_enveloped(const Target &t, double p) const
+{
+       for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); 
++i)
+               if (!t.envelopes(*i, p))
+                       return false;
+       return true;
+}
+
+bool Target::is_enveloped(Ptr_vector<Target>::const_iterator begin, 
Ptr_vector<Target>::const_iterator end, double p, int min_score) const
+{
+       for (; begin != end; ++begin)
+               if (is_enveloped(**begin, p) && (*begin)->filter_score >= 
min_score)
+                       return true;
+       return false;
 }
 
 int Query_mapper::raw_score_cutoff() const
@@ -70,18 +82,6 @@ void Query_mapper::init()
                rank_targets(config.rank_ratio == -1 ? 0.6 : config.rank_ratio);
 }
 
-pair<Trace_pt_list::iterator, Trace_pt_list::iterator> 
Query_mapper::get_query_data()
-{
-       const Trace_pt_list::iterator begin = query_queue.trace_pt_pos;
-       if (begin == query_queue.trace_pt_end)
-               return pair<Trace_pt_list::iterator, 
Trace_pt_list::iterator>(begin, begin);
-       const unsigned c = align_mode.query_contexts, query = begin->query_ / c;
-       Trace_pt_list::iterator end = begin;
-       for (; end < query_queue.trace_pt_end && end->query_ / c == query; 
++end);
-       query_queue.trace_pt_pos = end;
-       return pair<Trace_pt_list::iterator, Trace_pt_list::iterator>(begin, 
end);
-}
-
 unsigned Query_mapper::count_targets()
 {
        std::sort(source_hits.first, source_hits.second, hit::cmp_subject);
@@ -132,49 +132,64 @@ void Query_mapper::rank_targets(double ratio)
 {
        std::sort(targets.begin(), targets.end(), Target::compare);
 
-       int score = 0;
-       if (config.toppercent < 100) {
-               score = int((double)targets[0].filter_score * (1.0 - 
config.toppercent / 100.0) * ratio);
+       if (config.query_overlap_culling == 0.0) {
+
+               int score = 0;
+               if (config.toppercent < 100) {
+                       score = int((double)targets[0].filter_score * (1.0 - 
config.toppercent / 100.0) * ratio);
+               }
+               else {
+                       size_t min_idx = std::min(targets.size(), 
(size_t)config.max_alignments);
+                       score = int((double)targets[min_idx - 1].filter_score * 
ratio);
+               }
+
+               unsigned i = 0;
+               for (; i < targets.size(); ++i)
+                       if (targets[i].filter_score < score)
+                               break;
+
+               if (config.benchmark_ranking)
+                       for (unsigned j = i; j < targets.size(); ++j)
+                               targets[j].outranked = true;
+               else
+                       targets.erase(targets.begin() + i, targets.end());
+
        }
        else {
-               size_t min_idx = std::min(targets.size(), 
(size_t)config.max_alignments);
-               score = int((double)targets[min_idx - 1].filter_score * ratio);
-       }
 
-       unsigned i = 0;
-       for (; i < targets.size(); ++i)
-               if (targets[i].filter_score < score)
-                       break;
-       
-       if (config.benchmark_ranking)
-               for (unsigned j = i; j < targets.size(); ++j)
-                       targets[j].outranked = true;
-       else
-               targets.erase(targets.begin() + i, targets.end());
+               int i = 0;
+               const double p = config.query_overlap_culling / 100.0;
+               while (i < (int)targets.size()) {
+                       int min_score = int((double)targets[i].filter_score / 
(1.0 - config.toppercent / 100.0) / ratio);
+                       if (targets[i].is_enveloped(targets.begin(), 
targets.begin() + i, p, min_score))
+                               targets.erase(targets.begin() + i, 
targets.begin() + i + 1);
+                       else
+                               ++i;
+               }
+
+       }
 }
 
 bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
 {
-       if (!blocked_processing && *output_format != Output_format::daa && 
config.report_unaligned != 0 && config.load_balancing == 
Config::target_parallel) {
-               for (unsigned i = unaligned_from; i < query_id; ++i) {
-                       output_format->print_query_intro(i, 
query_ids::get()[i].c_str(), get_source_query_len(i), buffer, true);
-                       output_format->print_query_epilog(buffer, true);
-               }
-       }
-
        std::sort(targets.begin(), targets.end(), Target::compare);
 
        unsigned n_hsp = 0, n_target_seq = 0, hit_hsps = 0;
-       const unsigned top_score = targets.empty() ? 0 : 
targets[0].filter_score, query_len = (unsigned)query_seq(0).length();
+       auto_ptr<Target_culling> 
target_culling(Target_culling::get(targets.empty() ? 0 : &targets[0]));
+       const unsigned query_len = (unsigned)query_seq(0).length();
        size_t seek_pos = 0;
        const char *query_title = query_ids::get()[query_id].c_str();
+       auto_ptr<Output_format> f(output_format->clone());
 
        for (size_t i = 0; i < targets.size(); ++i) {
                if ((config.min_bit_score == 0 && 
score_matrix.evalue(targets[i].filter_score, config.db_size, query_len) > 
config.max_evalue)
                        || score_matrix.bitscore(targets[i].filter_score) < 
config.min_bit_score)
                        break;
 
-               if (!config.output_range(n_target_seq, targets[i].filter_score, 
top_score))
+               const int c = target_culling->cull(targets[i]);
+               if (c == Target_culling::next)
+                       continue;
+               else if (c == Target_culling::finished)
                        break;
 
                if (targets[i].outranked)
@@ -202,15 +217,15 @@ bool Query_mapper::generate_output(Text_buffer &buffer, 
Statistics &stat)
                        }
                        else {
                                if (n_hsp == 0) {
-                                       if (*output_format == 
Output_format::daa)
+                                       if (*f == Output_format::daa)
                                                seek_pos = 
write_daa_query_record(buffer, query_title, align_mode.query_translated ? 
query_source_seqs::get()[query_id] : query_seqs::get()[query_id]);
                                        else
-                                               
output_format->print_query_intro(query_id, query_title, source_query_len, 
buffer, false);
+                                               f->print_query_intro(query_id, 
query_title, source_query_len, buffer, false);
                                }
-                               if (*output_format == Output_format::daa)
+                               if (*f == Output_format::daa)
                                        write_daa_record(buffer, *j, query_id, 
targets[i].subject_id);
                                else
-                                       
output_format->print_match(Hsp_context(*j,
+                                       f->print_match(Hsp_context(*j,
                                                query_id,
                                                query_seq(j->frame),
                                                query_source_seq(),
@@ -223,8 +238,10 @@ bool Query_mapper::generate_output(Text_buffer &buffer, 
Statistics &stat)
                                                hit_hsps), buffer);
                        }
 
-                       if(hit_hsps == 0)
+                       if (hit_hsps == 0) {
                                ++n_target_seq;
+                               target_culling->add(&targets[i]);
+                       }
                        ++n_hsp;
                        ++hit_hsps;
                        if (config.alignment_traceback && j->gap_openings > 0)
@@ -235,17 +252,17 @@ bool Query_mapper::generate_output(Text_buffer &buffer, 
Statistics &stat)
 
        if (n_hsp > 0) {
                if (!blocked_processing) {
-                       if (*output_format == Output_format::daa)
+                       if (*f == Output_format::daa)
                                finish_daa_query_record(buffer, seek_pos);
                        else
-                               output_format->print_query_epilog(buffer, 
false);
+                               f->print_query_epilog(buffer, query_title, 
false);
                }
                else
                        Intermediate_record::finish_query(buffer, seek_pos);
        }
-       else if (!blocked_processing && *output_format != Output_format::daa && 
config.report_unaligned != 0) {
-               output_format->print_query_intro(query_id, query_title, 
source_query_len, buffer, true);
-               output_format->print_query_epilog(buffer, true);
+       else if (!blocked_processing && *f != Output_format::daa && 
config.report_unaligned != 0) {
+               f->print_query_intro(query_id, query_title, source_query_len, 
buffer, true);
+               f->print_query_epilog(buffer, query_title, true);
        }
 
        if (!blocked_processing) {
diff --git a/src/align/query_mapper.h b/src/align/query_mapper.h
index 5da19c3..bbcd8b8 100644
--- a/src/align/query_mapper.h
+++ b/src/align/query_mapper.h
@@ -34,6 +34,9 @@ using std::list;
 
 struct Target
 {
+       Target(int score):
+               filter_score(score)
+       {}
        Target(size_t begin, unsigned subject_id) :
                subject_id(subject_id),
                filter_score(0),
@@ -44,6 +47,14 @@ struct Target
        {
                return lhs->filter_score > rhs->filter_score;
        }
+       void fill_source_ranges(size_t query_len)
+       {
+               for (list<Hsp_traits>::iterator i = ts.begin(); i != ts.end(); 
++i)
+                       i->query_source_range = align_mode.query_translated ? 
untranslate_range(i->query_range, i->frame, query_len) : i->query_range;
+       }
+       bool envelopes(const Hsp_traits &t, double p) const;
+       bool is_enveloped(const Target &t, double p) const;
+       bool is_enveloped(Ptr_vector<Target>::const_iterator begin, 
Ptr_vector<Target>::const_iterator end, double p, int min_score) const;
        unsigned subject_id;
        int filter_score;
        float filter_time;
@@ -78,6 +89,11 @@ struct Query_mapper
        {
                return query_seqs::get()[query_id*align_mode.query_contexts + 
frame];
        }
+       void fill_source_ranges()
+       {
+               for (size_t i = 0; i < targets.size(); ++i)
+                       targets[i].fill_source_ranges(source_query_len);
+       }
        pair<Trace_pt_list::iterator, Trace_pt_list::iterator> source_hits;
        unsigned query_id, targets_finished, next_target;
 private:
diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp
index aa6fab3..5455ea3 100644
--- a/src/basic/basic.cpp
+++ b/src/basic/basic.cpp
@@ -24,9 +24,9 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 #include "sequence.h"
 #include "masking.h"
 
-const char* Const::version_string = "0.9.8";
+const char* Const::version_string = "0.9.10";
 const char* Const::program_name = "diamond";
-const char* Const::id_delimiters = " \a\b\f\n\r\t\v";
+const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
 
 Value_traits::Value_traits(const char *alphabet, Letter mask_char, const char 
*ignore) :
        alphabet(alphabet),
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index bb3fe7a..edbace2 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -56,7 +56,8 @@ Config::Config(int argc, const char **argv)
                .add_command("model-seqs", "")
                .add_command("opt", "")
                .add_command("mask", "")
-               .add_command("fastq2fasta", "");
+               .add_command("fastq2fasta", "")
+               .add_command("dbinfo", "");
 
        Options_group general("General options");
        general.add()
@@ -111,10 +112,12 @@ Config::Config(int argc, const char **argv)
        Options_group aligner("Aligner options");
        aligner.add()
                ("query", 'q', "input query file", query_file)
+               ("strand", 0, "query strands to search (both/minus/plus)", 
query_strands, string("both"))
                ("un", 0, "file for unaligned queries", unaligned)
                ("unal", 0, "report unaligned queries (0=no, 1=yes)", 
report_unaligned, -1)
                ("max-target-seqs", 'k', "maximum number of target sequences to 
report alignments for", max_alignments, uint64_t(25))
                ("top", 0, "report alignments within this percentage range of 
top alignment score (overrides --max-target-seqs)", toppercent, 100.0)
+               ("overlap-culling", 0, "delete hits only if a higher scoring 
hit envelops the given percentage of the query range", query_overlap_culling)
                ("compress", 0, "compression for output files (0=none, 
1=gzip)", compression)
                ("evalue", 'e', "maximum e-value to report alignments 
(default=0.001)", max_evalue, 0.001)
                ("min-score", 0, "minimum bit score to report alignments 
(overrides e-value setting)", min_bit_score)
@@ -137,8 +140,10 @@ Config::Config(int argc, const char **argv)
                //("seg", 0, "enable SEG masking of queries (yes/no)", seg)
                ("query-gencode", 0, "genetic code to use to translate query 
(see user manual)", query_gencode, 1u)
                ("salltitles", 0, "include full subject titles in DAA file", 
salltitles)
+               ("sallseqid", 0, "include all subject ids in DAA file", 
sallseqid)
                ("no-self-hits", 0, "suppress reporting of identical self 
hits", no_self_hits)
-               ("taxonmap", 0, "protein accession to taxid mapping file", 
prot_accession2taxid);
+               ("taxonmap", 0, "protein accession to taxid mapping file", 
prot_accession2taxid)
+               ("taxonnodes", 0, "taxonomy nodes.dmp from NCBI", nodesdmp);
 
        Options_group advanced("Advanced options");
        advanced.add()
@@ -230,7 +235,7 @@ Config::Config(int argc, const char **argv)
                        throw std::runtime_error("Invalid option: 
--block-size/-b. Block size is set for the alignment commands.");
                break;
        case Config::blastp:
-       case Config::blastx:    
+       case Config::blastx:
                if (database == "")
                        throw std::runtime_error("Missing parameter: database 
file (--db/-d)");
                if (daa_file.length() > 0) {
@@ -254,6 +259,12 @@ Config::Config(int argc, const char **argv)
                ;
        }
 
+       switch (command) {
+       case Config::dbinfo:
+               if (database == "")
+                       throw std::runtime_error("Missing parameter: database 
file (--db/-d)");
+       }
+
        if (hit_cap != 0)
                throw std::runtime_error("Deprecated parameter: 
--max-hits/-C.");
 
@@ -353,14 +364,6 @@ Config::Config(int argc, const char **argv)
 #ifdef __POPCNT__
                verbose_stream << "POPCNT enabled." << endl;
 #endif
-
-               message_stream << "#Target sequences to report alignments for: 
";
-               if (max_alignments == 0) {
-                       max_alignments = std::numeric_limits<uint64_t>::max();
-                       message_stream << "unlimited" << endl;
-               }
-               else
-                       message_stream << max_alignments << endl;
        }
 
        Translator::init(query_gencode);
@@ -371,6 +374,9 @@ Config::Config(int argc, const char **argv)
        if (command == help)
                parser.print_help();
 
+       if (query_strands != "both" && query_strands != "minus" && 
query_strands != "plus")
+               throw std::runtime_error("Invalid value for parameter 
--strand");
+
        /*log_stream << "sizeof(hit)=" << sizeof(hit) << " 
sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t)
                << " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) 
<< endl;*/
 }
diff --git a/src/basic/config.h b/src/basic/config.h
index 0b1035b..a348d46 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -139,10 +139,14 @@ struct Config
        double score_ratio;
        bool small_query;
        bool hashed_seeds;
+       string nodesdmp;
+       bool sallseqid;
+       double query_overlap_culling;
+       string query_strands;
 
        enum {
                makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version 
= 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 
11, db_stat = 12, model_sim = 13,
-               match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, 
fastq2fasta=18
+               match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, 
fastq2fasta=18, dbinfo=19
        };
        unsigned        command;
 
diff --git a/src/basic/const.h b/src/basic/const.h
index 0dd7c7a..a2107ba 100644
--- a/src/basic/const.h
+++ b/src/basic/const.h
@@ -23,7 +23,7 @@ struct Const
 {
 
        enum {
-               build_version = 109,
+               build_version = 111,
                daa_version = 0,
                seedp_bits = 10,
                seedp = 1<<seedp_bits,
diff --git a/src/basic/hssp.cpp b/src/basic/hssp.cpp
index d0134d9..579523e 100644
--- a/src/basic/hssp.cpp
+++ b/src/basic/hssp.cpp
@@ -18,6 +18,21 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 
 #include "../align/align.h"
 
+interval untranslate_range(const interval &r, unsigned frame, size_t len)
+{
+       signed f = frame <= 2 ? frame + 1 : 2 - frame;
+       interval s;
+       if (f > 0) {
+               s.begin_ = (f - 1) + 3 * r.begin_;
+               s.end_ = s.begin_ + 3 * r.length();
+       }
+       else {
+               s.end_ = (int)len + f - 3 * r.begin_ + 1;
+               s.begin_ = s.end_ - 3 * r.length();
+       }
+       return s;
+}
+
 bool Hsp_data::pass_through(const Diagonal_segment &d) const
 {
        if (intersect(d.query_range(), query_range).length() != d.len
@@ -80,19 +95,8 @@ void Hsp_data::set_source_range(unsigned frame, unsigned 
dna_len)
        this->frame = frame;
        if (!align_mode.query_translated)
                query_source_range = query_range;
-       else {
-               signed f = frame <= 2 ? frame + 1 : 2 - frame;
-               if (f > 0) {
-                       query_source_range.begin_ = (f - 1) + 3 * 
query_range.begin_;
-                       query_source_range.end_ = query_source_range.begin_ + 3 
* query_range.length();
-                       //query_end_dna = (f-1) + 3 * 
(l.query_begin_+l.query_len_-1) + 3;
-               }
-               else {
-                       query_source_range.end_ = dna_len + f - 3 * 
query_range.begin_ + 1;
-                       //query_end_dna = dna_len + (f + 1) - 3 * 
(l.query_begin_+l.query_len_-1) - 2;
-                       query_source_range.begin_ = query_source_range.end_ - 3 
* query_range.length();
-               }
-       }
+       else
+               query_source_range = untranslate_range(query_range, frame, 
dna_len);
 }
 
 Hsp_context& Hsp_context::parse()
diff --git a/src/basic/match.h b/src/basic/match.h
index 555ecb6..991d40d 100644
--- a/src/basic/match.h
+++ b/src/basic/match.h
@@ -38,6 +38,8 @@ inline interval normalized_range(unsigned pos, int len, 
Strand strand)
                        : interval (pos + 1 + len, pos + 1);
 }
 
+interval untranslate_range(const interval &r, unsigned frame, size_t l);
+
 struct Diagonal_segment
 {
        Diagonal_segment():
diff --git a/src/basic/sequence.h b/src/basic/sequence.h
index 349a9de..b52f3fb 100644
--- a/src/basic/sequence.h
+++ b/src/basic/sequence.h
@@ -100,8 +100,13 @@ struct sequence
        }
        std::ostream& print(std::ostream &os, const Value_traits &v) const
        {
-               for (unsigned i = 0; i<len_; ++i)
-                       os << v.alphabet[(long)data_[i]];
+               for (unsigned i = 0; i < len_; ++i) {
+                       long l = (long)data_[i];
+                       if ((l & 128) == 0)
+                               os << v.alphabet[l];
+                       else
+                               os << (char)tolower(v.alphabet[l & 127]);
+               }
                return os;
        }
        std::ostream& print(std::ostream &os, const Value_traits &v, Reversed) 
const
diff --git a/src/basic/value.h b/src/basic/value.h
index 5f7a256..58467dc 100644
--- a/src/basic/value.h
+++ b/src/basic/value.h
@@ -76,7 +76,7 @@ private:
 
 struct Value_traits
 {
-       Value_traits(const char *alphabet, Letter mask_char, const char 
*ignore);       
+       Value_traits(const char *alphabet, Letter mask_char, const char 
*ignore);
        const char *alphabet;
        unsigned alphabet_size;
        Letter mask_char;
diff --git a/src/data/load_seqs.h b/src/data/load_seqs.h
index a203779..fd67db3 100644
--- a/src/data/load_seqs.h
+++ b/src/data/load_seqs.h
@@ -24,7 +24,7 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 #include "../basic/translate.h"
 #include "../util/seq_file_format.h"
 
-inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const 
vector<Letter> &seq)
+inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const 
vector<Letter> &seq, unsigned frame_mask)
 {
        if (config.command == Config::blastp || config.command == 
Config::makedb || config.command == Config::random_seqs) {
                ss.push_back(seq);
@@ -42,7 +42,7 @@ inline size_t push_seq(Sequence_set &ss, Sequence_set** 
source_seqs, const vecto
 
                unsigned bestFrames(Translator::computeGoodFrames(proteins, 
config.get_run_len((unsigned)seq.size() / 3)));
                for (unsigned j = 0; j < 6; ++j) {
-                       if (bestFrames & (1 << j))
+                       if ((bestFrames & (1 << j)) && (frame_mask & (1 << j)))
                                ss.push_back(proteins[j]);
                        else
                                ss.fill(proteins[j].size(), 
value_traits.mask_char);
@@ -68,10 +68,16 @@ inline size_t load_seqs(Input_stream &file,
        vector<char> id;
        string id2;
 
+       unsigned frame_mask = (1 << 6) - 1;
+       if (config.query_strands == "plus")
+               frame_mask = (1 << 3) - 1;
+       else if (config.query_strands == "minus")
+               frame_mask = ((1 << 3) - 1) << 3;
+
        while (letters < max_letters && format.get_seq(id, seq, file)) {
                if (seq.size() > 0 && (filter.empty() || id2.assign(id.data(), 
id.data() + id.size()).find(filter, 0) != string::npos)) {
                        ids->push_back(id);
-                       letters += push_seq(**seqs, source_seqs, seq);
+                       letters += push_seq(**seqs, source_seqs, seq, 
frame_mask);
                        ++n;
                        if ((*seqs)->get_length() 
>(size_t)std::numeric_limits<int>::max())
                                throw std::runtime_error("Number of sequences 
in file exceeds supported maximum.");
diff --git a/src/data/reference.cpp b/src/data/reference.cpp
index c86caef..4e15766 100644
--- a/src/data/reference.cpp
+++ b/src/data/reference.cpp
@@ -39,6 +39,18 @@ bool blocked_processing;
 using std::cout;
 using std::endl;
 
+string* get_allseqids(const char *s)
+{
+       string *r = new string;
+       const vector<string> t(tokenize(s, "\1"));
+       for (vector<string>::const_iterator i = t.begin(); i != t.end(); ++i) {
+               if (i != t.begin())
+                       r->append("\1");
+               r->append(i->substr(0, find_first_of(i->c_str(), 
Const::id_delimiters)));
+       }
+       return r;
+}
+
 struct Pos_record
 {
        Pos_record()
@@ -211,4 +223,14 @@ void Database_file::get_seq()
                seq.clear();
                id.clear();
        }
+}
+
+void db_info()
+{
+       Database_file db_file;
+       cout << "Database format version = " << ref_header.db_version << endl;
+       cout << "Diamond build = " << ref_header.build << endl;
+       cout << "Sequences = " << ref_header.sequences << endl;
+       cout << "Letters = " << ref_header.letters << endl;
+       db_file.close();
 }
\ No newline at end of file
diff --git a/src/data/reference.h b/src/data/reference.h
index 7eb8bb5..c74f5e9 100644
--- a/src/data/reference.h
+++ b/src/data/reference.h
@@ -129,6 +129,8 @@ inline size_t max_id_len(const String_set<0> &ids)
        return max;
 }
 
+string* get_allseqids(const char *s);
+
 struct Ref_map
 {
        Ref_map():
@@ -157,10 +159,13 @@ struct Ref_map
                        n = next_++;
                        data_[block][i] = n;
                        len_.push_back((uint32_t)ref_seqs::get().length(i));
+                       const char *title = ref_ids::get()[i].c_str();
                        if (config.salltitles)
-                               name_.push_back(new 
string(ref_ids::get()[i].c_str()));
+                               name_.push_back(new string(title));
+                       else if (config.sallseqid)
+                               name_.push_back(get_allseqids(title));
                        else
-                               
name_.push_back(get_str(ref_ids::get()[i].c_str(), Const::id_delimiters));
+                               name_.push_back(get_str(title, 
Const::id_delimiters));
                        mtx_.unlock();
                }
                return n;
diff --git a/src/data/taxonomy.cpp b/src/data/taxonomy.cpp
index 838f29f..9172a56 100644
--- a/src/data/taxonomy.cpp
+++ b/src/data/taxonomy.cpp
@@ -17,10 +17,16 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 ****/
 
 #include <stdio.h>
+#include <set>
 #include "taxonomy.h"
 #include "../util/compressed_stream.h"
 #include "../basic/config.h"
 #include "../util/merge_sort.h"
+#include "../util/log_stream.h"
+
+using std::cout;
+using std::endl;
+using std::set;
 
 Taxonomy taxonomy;
 
@@ -28,7 +34,7 @@ string& get_accession(string &t)
 {
        size_t i;
        if (t.compare(0, 6, "UniRef") == 0)
-               t.erase(0, 9);
+               t.erase(0, t.find('_', 0) + 1);
        else if ((i = t.find_first_of('|', 0)) != string::npos) {
                if (t.compare(0, 3, "gi|") == 0) {
                        t.erase(0, t.find_first_of('|', i + 1) + 1);
@@ -62,5 +68,78 @@ void Taxonomy::load()
                /*if (f.line_count % 10000 == 0)
                        std::cout << f.line_count << endl;*/
        }
+       f.close();
        merge_sort(accession2taxid_.begin(), accession2taxid_.end(), 
config.threads_);
+}
+
+void Taxonomy::load_nodes()
+{
+       Input_stream f(config.nodesdmp);
+       unsigned taxid, parent;
+       while (!f.eof() && (f.getline(), !f.line.empty())) {
+               if (sscanf(f.line.c_str(), "%u\t|\t%u", &taxid, &parent) != 2)
+                       throw std::runtime_error("Invalid nodes.dmp file 
format.");
+               //cout << taxid << '\t' << parent << endl;
+               parent_.resize(taxid + 1);
+               parent_[taxid] = parent;
+       }
+       f.close();
+}
+
+void Taxonomy::init()
+{
+       task_timer timer;
+       if (!config.prot_accession2taxid.empty()) {
+               timer.go("Loading taxonomy");
+               load();
+               timer.finish();
+       }
+
+       if (!config.nodesdmp.empty()) {
+               timer.go("Loading taxonomy nodes");
+               load_nodes();
+               timer.finish();
+       }
+}
+
+void Taxonomy::get_taxids(const char *id, set<unsigned> &taxons) const
+{
+       const vector<string> t(tokenize(id, "\1"));
+       for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i) {
+               const unsigned id = get(Taxonomy::Accession(*i));
+               if(id != 0)
+                       taxons.insert(id);
+       }
+}
+
+unsigned Taxonomy::get_lca(unsigned t1, unsigned t2) const
+{
+       static const int max = 64;
+       if (t1 == t2 || t2 == 0)
+               return t1;
+       if (t1 == 0)
+               return t2;
+       unsigned p = t2;
+       set<unsigned> l;
+       int n = 0;
+       do {
+               p = get_parent(p);
+               if (p == 0)
+                       return t1;
+               l.insert(p);
+               if (++n > max)
+                       throw std::runtime_error("Path in taxonomy too long 
(1).");
+       } while (p != t1 && p != 1);
+       if (p == t1)
+               return p;
+       p = t1;
+       n = 0;
+       while (l.find(p) == l.end()) {
+               p = get_parent(p);
+               if (p == 0)
+                       return t2;
+               if (++n > max)
+                       throw std::runtime_error("Path in taxonomy too long 
(2).");
+       }
+       return p;
 }
\ No newline at end of file
diff --git a/src/data/taxonomy.h b/src/data/taxonomy.h
index 1ce6dc5..1a2537f 100644
--- a/src/data/taxonomy.h
+++ b/src/data/taxonomy.h
@@ -19,6 +19,7 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 #include <vector>
 #include <algorithm>
 #include <string>
+#include <set>
 #include "../basic/const.h"
 #include "../util/util.h"
 
@@ -71,7 +72,10 @@ struct Taxonomy
                char s[max_accesion_len];
        };
 
+       void init();
        void load();
+       void load_nodes();
+       void get_taxids(const char *s, std::set<unsigned> &taxons) const;
 
        unsigned get(const Accession &accession) const
        {
@@ -82,9 +86,20 @@ struct Taxonomy
                        return 0;
        }
 
+       unsigned get_parent(unsigned taxid) const
+       {
+               if (parent_.size() <= taxid)
+                       throw std::runtime_error(string("No taxonomy node found 
for taxon id ") + to_string(taxid));
+               return parent_[taxid];
+       }
+
+       unsigned get_lca(unsigned t1, unsigned t2) const;
+
 private:
        
        std::vector<std::pair<Accession, unsigned> > accession2taxid_;
+       std::vector<unsigned> parent_;
+
 };
 
 extern Taxonomy taxonomy;
\ No newline at end of file
diff --git a/src/dp/dp.h b/src/dp/dp.h
index dbdec70..6a76900 100644
--- a/src/dp/dp.h
+++ b/src/dp/dp.h
@@ -94,6 +94,11 @@ struct Hsp_traits
                score(0),
                frame((int)frame)
        {}
+       Hsp_traits(const interval &query_source_range):
+               query_source_range(query_source_range)
+       {}
+       Hsp_traits(const Hsp_data &hsp)
+       {}
        int partial_score(const Diagonal_segment &d) const
        {
                const double overlap = 
std::max(d.subject_range().overlap_factor(subject_range), 
d.query_range().overlap_factor(query_range));
@@ -142,7 +147,7 @@ struct Hsp_traits
                }
        };
        int d_min, d_max, score, frame;
-       interval query_range, subject_range;
+       interval query_source_range, query_range, subject_range;
 };
 
 template<typename _score>
diff --git a/src/dp/swipe.cpp b/src/dp/swipe.cpp
index d09b2ee..9b3e070 100644
--- a/src/dp/swipe.cpp
+++ b/src/dp/swipe.cpp
@@ -246,4 +246,4 @@ void swipe(const sequence &query, 
vector<sequence>::const_iterator subject_begin
 #ifdef __SSE2__
        swipe<uint8_t>(query, subject_begin, subject_end, out);
 #endif
-}
+}
\ No newline at end of file
diff --git a/src/extra/match_file.h b/src/extra/match_file.h
index cd2ef69..75252d7 100644
--- a/src/extra/match_file.h
+++ b/src/extra/match_file.h
@@ -21,6 +21,7 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 
 #include <stdio.h>
 #include <vector>
+#include "../util/binary_file.h"
 #include "blast_record.h"
 #include "../util/util.h"
 #include "../basic/reduction.h"
diff --git a/src/output/blast_pairwise_format.cpp 
b/src/output/blast_pairwise_format.cpp
index d4c4b20..1096e97 100644
--- a/src/output/blast_pairwise_format.cpp
+++ b/src/output/blast_pairwise_format.cpp
@@ -18,7 +18,7 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 
 #include "output_format.h"
 
-void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out)
 {
        static const unsigned width = 60;
        out << '>';
@@ -67,9 +67,8 @@ void Pairwise_format::print_footer(Output_stream &out) const
 
 }
 
-void Pairwise_format::print_query_epilog(Text_buffer &out, bool unaligned) 
const
+void Pairwise_format::print_query_epilog(Text_buffer &out, const char 
*query_title, bool unaligned) const
 {
-
 }
 
 void Pairwise_format::print_query_intro(size_t query_num, const char 
*query_name, unsigned query_len, Text_buffer &out, bool unaligned) const
diff --git a/src/output/blast_tab_format.cpp b/src/output/blast_tab_format.cpp
index a85147a..59fafd3 100644
--- a/src/output/blast_tab_format.cpp
+++ b/src/output/blast_tab_format.cpp
@@ -95,17 +95,17 @@ Blast_tab_format::Blast_tab_format() :
 
 void print_staxids(Text_buffer &out, const char *id)
 {
-       const vector<string> t(tokenize(id, "\1"));     
        std::set<unsigned> taxons;
-       for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i)
-               taxons.insert(taxonomy.get(Taxonomy::Accession(*i)));
+       taxonomy.get_taxids(id, taxons);
+       if (taxons.empty())
+               return;
        std::set<unsigned>::const_iterator i = taxons.begin();
        out << *(i++);
        for (; i != taxons.end(); ++i)
                out << ';' << *i;
 }
 
-void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) 
const
+void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out)
 {
        for (vector<unsigned>::const_iterator i = fields.begin(); i != 
fields.end(); ++i) {
                switch (*i) {
diff --git a/src/output/join_blocks.cpp b/src/output/join_blocks.cpp
index 0b2e63e..353e06b 100644
--- a/src/output/join_blocks.cpp
+++ b/src/output/join_blocks.cpp
@@ -21,6 +21,7 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 #include "../data/queries.h"
 #include "../output/daa_write.h"
 #include "output_format.h"
+#include "../align/query_mapper.h"
 
 struct Join_fetcher
 {
@@ -123,7 +124,35 @@ struct Join_record
 
 };
 
-void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics 
&statistics, unsigned query, const char *query_name, unsigned query_source_len)
+struct Join_records
+{
+       Join_records(vector<Binary_buffer> &buf)
+       {
+               for (unsigned i = 0; i < current_ref_block; ++i) {
+                       it.push_back(buf[i].begin());
+                       Join_record::push_next(i, 
std::numeric_limits<unsigned>::max(), it.back(), records);
+               }
+               std::make_heap(records.begin(), records.end());
+       }
+       Target* get_target()
+       {
+               if (records.empty())
+                       return 0;
+               const unsigned block = records.front().block_;
+               const unsigned subject = records.front().info_.subject_id;
+               Target *t = new Target(records.front().info_.score);
+               do {
+                       std::pop_heap(records.begin(), records.end());
+                       records.pop_back();
+                       if (Join_record::push_next(block, subject, it[block], 
records))
+                               std::push_heap(records.begin(), records.end());
+               } while (!records.empty() && records.front().info_.subject_id 
== subject);
+       }
+       vector<Join_record> records;
+       vector<Binary_buffer::Iterator> it;
+};
+
+void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics 
&statistics, unsigned query, const char *query_name, unsigned query_source_len, 
Output_format &f)
 {
        sequence context[6];
        for (unsigned i = 0; i < align_mode.query_contexts; ++i)
@@ -148,11 +177,11 @@ void join_query(vector<Binary_buffer> &buf, Text_buffer 
&out, Statistics &statis
                if (config.output_range(n_target_seq, next.info_.score, 
top_score) || same_subject) {
                        //printf("q=%u s=%u n=%u ss=%u\n",query, 
next.info_.subject_id, n_target_seq, same_subject, next.info_.score);
 
-                       if(*output_format == Output_format::daa)
+                       if(f == Output_format::daa)
                                write_daa_record(out, next.info_);
                        else {
                                Hsp_data hsp(next.info_, query_source_len);
-                               output_format->print_match(Hsp_context(hsp,
+                               f.print_match(Hsp_context(hsp,
                                        query,
                                        
context[align_mode.check_context(hsp.frame)],
                                        align_mode.query_translated ? 
query_source_seqs::get()[query] : context[0],
@@ -205,21 +234,23 @@ void join_worker(Task_queue<Text_buffer,Join_writer> 
*queue)
                if (*output_format != Output_format::daa && 
config.report_unaligned != 0) {
                        for (unsigned i = fetcher.unaligned_from; i < 
fetcher.query_id; ++i) {
                                output_format->print_query_intro(i, 
query_ids::get()[i].c_str(), get_source_query_len(i), *out, true);
-                               output_format->print_query_epilog(*out, true);
+                               output_format->print_query_epilog(*out, 
query_ids::get()[i].c_str(), true);
                        }
                }
 
-               if (*output_format == Output_format::daa)
+               auto_ptr<Output_format> f(output_format->clone());
+
+               if (*f == Output_format::daa)
                        seek_pos = write_daa_query_record(*out, query_name, 
query_seq);
                else
-                       output_format->print_query_intro(fetcher.query_id, 
query_name, (unsigned)query_seq.length(), *out, false);
+                       f->print_query_intro(fetcher.query_id, query_name, 
(unsigned)query_seq.length(), *out, false);
 
-               join_query(fetcher.buf, *out, stat, fetcher.query_id, 
query_name, (unsigned)query_seq.length());
+               join_query(fetcher.buf, *out, stat, fetcher.query_id, 
query_name, (unsigned)query_seq.length(), *f);
 
-               if (*output_format == Output_format::daa)
+               if (*f == Output_format::daa)
                        finish_daa_query_record(*out, seek_pos);
                else
-                       output_format->print_query_epilog(*out, false);
+                       f->print_query_epilog(*out, query_name, false);
 
                queue->push(n);
        }
@@ -242,7 +273,7 @@ void join_blocks(unsigned ref_blocks, Output_stream 
&master_out, const vector<Te
                Text_buffer out;
                for (unsigned i = Join_fetcher::query_last + 1; i < 
query_ids::get().get_length(); ++i) {
                        output_format->print_query_intro(i, 
query_ids::get()[i].c_str(), get_source_query_len(i), out, true);
-                       output_format->print_query_epilog(out, true);
+                       output_format->print_query_epilog(out, 
query_ids::get()[i].c_str(), true);
                }
                writer(out);
        }
diff --git a/src/output/output.h b/src/output/output.h
index 03fe1fe..8bef7f5 100644
--- a/src/output/output.h
+++ b/src/output/output.h
@@ -19,6 +19,7 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 #ifndef OUTPUT_H_
 #define OUTPUT_H_
 
+#include <map>
 #include "../util/binary_file.h"
 #include "../basic/packed_transcript.h"
 #include "../util/text_buffer.h"
@@ -98,8 +99,42 @@ struct Intermediate_record
        Packed_transcript transcript;
 };
 
-#ifndef ST_JOIN
 void join_blocks(unsigned ref_blocks, Output_stream &master_out, const 
vector<Temp_file> &tmp_file);
-#endif
+
+struct Output_sink
+{
+       Output_sink(size_t begin, Output_stream *f) :
+               f_(f),
+               next_(begin),
+               size_(0),
+               max_size_(0)
+       {}
+       void push(size_t n, Text_buffer *buf);
+       size_t size() const
+       {
+               return size_;
+       }
+       size_t max_size() const
+       {
+               return max_size_;
+       }
+       static Output_sink& get()
+       {
+               return *instance;
+       }
+       size_t next() const
+       {
+               return next_;
+       }
+       static auto_ptr<Output_sink> instance;
+private:
+       void flush(Text_buffer *buf);
+       tthread::mutex mtx_;
+       Output_stream* const f_;
+       std::map<size_t, Text_buffer*> backlog_;
+       size_t next_, size_, max_size_;
+};
+
+void heartbeat_worker(size_t qend);
 
 #endif
\ No newline at end of file
diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp
index 74786f6..ef6113b 100644
--- a/src/output/output_format.cpp
+++ b/src/output/output_format.cpp
@@ -84,12 +84,31 @@ Output_format* get_output_format()
                return new Pairwise_format;
        else if (f[0] == "null")
                return new Null_format;
+       else if (f[0] == "102")
+               return new Taxon_format;
        else
-               throw std::runtime_error("Invalid output format. Allowed 
values: 5,6,100,101");
+               throw std::runtime_error("Invalid output format. Allowed 
values: 0,5,6,100,101,102");
 }
 
+void init_output()
+{
+       output_format = auto_ptr<Output_format>(get_output_format());
+       if (*output_format == Output_format::taxon && config.toppercent == 
100.0)
+               config.toppercent = 10.0;
+       if (config.toppercent == 100.0) {
+               message_stream << "#Target sequences to report alignments for: 
";
+               if (config.max_alignments == 0) {
+                       config.max_alignments = 
std::numeric_limits<uint64_t>::max();
+                       message_stream << "unlimited" << endl;
+               }
+               else
+                       message_stream << config.max_alignments << endl;
+       }
+       else
+               message_stream << "Percentage range of top alignment score to 
report hits: " << config.toppercent << endl;
+}
 
-void XML_format::print_match(const Hsp_context &r, Text_buffer &out) const
+void XML_format::print_match(const Hsp_context &r, Text_buffer &out)
 {
        if(r.hsp_num == 0) {
                if (r.hit_num > 0)
@@ -183,7 +202,7 @@ void XML_format::print_query_intro(size_t query_num, const 
char *query_name, uns
                << "<Iteration_hits>" << '\n';
 }
 
-void XML_format::print_query_epilog(Text_buffer &out, bool unaligned) const
+void XML_format::print_query_epilog(Text_buffer &out, const char *query_title, 
bool unaligned) const
 {
        if (!unaligned) {
                out << "  </Hit_hsps>" << '\n'
diff --git a/src/output/output_format.h b/src/output/output_format.h
index 4ed34a5..15880de 100644
--- a/src/output/output_format.h
+++ b/src/output/output_format.h
@@ -34,14 +34,15 @@ struct Output_format
        {}
        virtual void print_query_intro(size_t query_num, const char 
*query_name, unsigned query_len, Text_buffer &out, bool unaligned) const
        {}
-       virtual void print_query_epilog(Text_buffer &out, bool unaligned) const
+       virtual void print_query_epilog(Text_buffer &out, const char 
*query_title, bool unaligned) const
        {}
-       virtual void print_match(const Hsp_context& r, Text_buffer &out) const
+       virtual void print_match(const Hsp_context& r, Text_buffer &out)
        {}
        virtual void print_header(Output_stream &f, int mode, const char 
*matrix, int gap_open, int gap_extend, double evalue, const char 
*first_query_name, unsigned first_query_len) const
        { }
        virtual void print_footer(Output_stream &f) const
        { }
+       virtual Output_format* clone() const = 0;
        virtual ~Output_format()
        { }
        static void print_title(Text_buffer &buf, const char *id, bool 
full_titles, bool all_titles, const char *separator);
@@ -50,7 +51,7 @@ struct Output_format
                return code;
        }
        unsigned code;
-       enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null };
+       enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null, taxon };
 };
 
 extern auto_ptr<Output_format> output_format;
@@ -60,6 +61,10 @@ struct Null_format : public Output_format
        Null_format() :
                Output_format(null)
        {}
+       virtual Output_format* clone() const
+       {
+               return new Null_format(*this);
+       }
 };
 
 struct DAA_format : public Output_format
@@ -67,6 +72,10 @@ struct DAA_format : public Output_format
        DAA_format():
                Output_format(daa)
        {}
+       virtual Output_format* clone() const
+       {
+               return new DAA_format(*this);
+       }
 };
 
 struct Blast_tab_format : public Output_format
@@ -74,9 +83,13 @@ struct Blast_tab_format : public Output_format
        static const char* field_str[];
        Blast_tab_format();
        virtual void print_query_intro(size_t query_num, const char 
*query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
-       virtual void print_match(const Hsp_context& r, Text_buffer &out) const;
+       virtual void print_match(const Hsp_context& r, Text_buffer &out);
        virtual ~Blast_tab_format()
        { }
+       virtual Output_format* clone() const
+       {
+               return new Blast_tab_format(*this);
+       }
        vector<unsigned> fields;
 };
 
@@ -85,11 +98,15 @@ struct Sam_format : public Output_format
        Sam_format():
                Output_format(sam)
        { }
-       virtual void print_match(const Hsp_context& r, Text_buffer &out) const;
+       virtual void print_match(const Hsp_context& r, Text_buffer &out);
        virtual void print_header(Output_stream &f, int mode, const char 
*matrix, int gap_open, int gap_extend, double evalue, const char 
*first_query_name, unsigned first_query_len) const;
        virtual void print_query_intro(size_t query_num, const char 
*query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
        virtual ~Sam_format()
        { }
+       virtual Output_format* clone() const
+       {
+               return new Sam_format(*this);
+       }
 };
 
 struct XML_format : public Output_format
@@ -99,13 +116,17 @@ struct XML_format : public Output_format
        {
                config.salltitles = true;
        }
-       virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
+       virtual void print_match(const Hsp_context &r, Text_buffer &out);
        virtual void print_header(Output_stream &f, int mode, const char 
*matrix, int gap_open, int gap_extend, double evalue, const char 
*first_query_name, unsigned first_query_len) const;
        virtual void print_query_intro(size_t query_num, const char 
*query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
-       virtual void print_query_epilog(Text_buffer &out, bool unaligned) const;
+       virtual void print_query_epilog(Text_buffer &out, const char 
*query_title, bool unaligned) const;
        virtual void print_footer(Output_stream &f) const;
        virtual ~XML_format()
        { }
+       virtual Output_format* clone() const
+       {
+               return new XML_format(*this);
+       }
 };
 
 struct Pairwise_format : public Output_format
@@ -115,16 +136,46 @@ struct Pairwise_format : public Output_format
        {
                config.salltitles = true;
        }
-       virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
+       virtual void print_match(const Hsp_context &r, Text_buffer &out);
        virtual void print_header(Output_stream &f, int mode, const char 
*matrix, int gap_open, int gap_extend, double evalue, const char 
*first_query_name, unsigned first_query_len) const;
        virtual void print_query_intro(size_t query_num, const char 
*query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
-       virtual void print_query_epilog(Text_buffer &out, bool unaligned) const;
+       virtual void print_query_epilog(Text_buffer &out, const char 
*query_title, bool unaligned) const;
        virtual void print_footer(Output_stream &f) const;
        virtual ~Pairwise_format()
        { }
+       virtual Output_format* clone() const
+       {
+               return new Pairwise_format(*this);
+       }
+};
+
+struct Taxon_format : public Output_format
+{
+       Taxon_format() :
+               Output_format(taxon),
+               taxid(0),
+               evalue(std::numeric_limits<double>::max())
+       {
+               config.salltitles = true;
+               if (config.prot_accession2taxid.empty())
+                       throw std::runtime_error("Output format requires 
setting the --taxonmap parameter.");
+               if (config.nodesdmp.empty())
+                       throw std::runtime_error("Output format requires 
setting the --taxonnodes parameter.");
+       }
+       virtual void print_match(const Hsp_context &r, Text_buffer &out);
+       virtual void print_query_epilog(Text_buffer &out, const char 
*query_title, bool unaligned) const;
+       virtual ~Taxon_format()
+       { }
+       virtual Output_format* clone() const
+       {
+               return new Taxon_format(*this);
+       }
+       unsigned taxid;
+       double evalue;
 };
 
 Output_format* get_output_format();
+void init_output();
 void print_hsp(Hsp_data &hsp, sequence query);
 
 #endif /* OUTPUT_FORMAT_H_ */
diff --git a/src/output/output_sink.cpp b/src/output/output_sink.cpp
new file mode 100644
index 0000000..d4e8f8b
--- /dev/null
+++ b/src/output/output_sink.cpp
@@ -0,0 +1,86 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "output.h"
+#include "../data/queries.h"
+
+using std::map;
+using std::auto_ptr;
+
+auto_ptr<Output_sink> Output_sink::instance;
+
+void Output_sink::push(size_t n, Text_buffer *buf)
+{
+       mtx_.lock();
+       //cout << "n=" << n << " next=" << next_ << endl;
+       if (n != next_) {
+               backlog_[n] = buf;
+               size_ += buf ? buf->alloc_size() : 0;
+               max_size_ = std::max(max_size_, size_);
+               mtx_.unlock();
+       }
+       else
+               flush(buf);
+}
+
+void Output_sink::flush(Text_buffer *buf)
+{
+       size_t n = next_ + 1;
+       vector<Text_buffer*> out;
+       out.push_back(buf);
+       map<size_t, Text_buffer*>::iterator i;
+       do {
+               while ((i = backlog_.begin()) != backlog_.end() && i->first == 
n) {
+                       out.push_back(i->second);
+                       backlog_.erase(i);
+                       ++n;
+               }
+               mtx_.unlock();
+               size_t size = 0;
+               for (vector<Text_buffer*>::iterator j = out.begin(); j < 
out.end(); ++j) {
+                       if (*j) {
+                               f_->write((*j)->get_begin(), (*j)->size());
+                               if (*j != buf)
+                                       size += (*j)->alloc_size();
+                               delete *j;
+                       }
+               }
+               out.clear();
+               mtx_.lock();
+               size_ -= size;
+       } while ((i = backlog_.begin()) != backlog_.end() && i->first == n);
+       next_ = n;
+       mtx_.unlock();
+}
+
+void heartbeat_worker(size_t qend)
+{
+       static const int interval = 100;
+       int n = 0;
+       while (Output_sink::get().next() < qend) {
+               if (n == interval) {
+                       const string 
title(query_ids::get()[Output_sink::get().next()].c_str());
+                       verbose_stream << "Queries=" << 
Output_sink::get().next() << " size=" << megabytes(Output_sink::get().size()) 
<< " max_size=" << megabytes(Output_sink::get().max_size())
+                               << " next=" << title.substr(0, title.find(' ')) 
<< endl;
+                       n = 0;
+               }
+               else
+                       ++n;
+               
tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10));
+       }
+}
\ No newline at end of file
diff --git a/src/output/sam_format.cpp b/src/output/sam_format.cpp
index 932e406..d8a4469 100644
--- a/src/output/sam_format.cpp
+++ b/src/output/sam_format.cpp
@@ -82,7 +82,7 @@ void Sam_format::print_query_intro(size_t query_num, const 
char *query_name, uns
        }
 }
 
-void Sam_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Sam_format::print_match(const Hsp_context& r, Text_buffer &out)
 {
        out.write_until(r.query_name, Const::id_delimiters);
        out << '\t' << '0' << '\t';
diff --git a/src/basic/const.h b/src/output/target_culling.cpp
similarity index 62%
copy from src/basic/const.h
copy to src/output/target_culling.cpp
index 0dd7c7a..52df2fe 100644
--- a/src/basic/const.h
+++ b/src/output/target_culling.cpp
@@ -16,31 +16,9 @@ You should have received a copy of the GNU Affero General 
Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#ifndef CONST_H_
-#define CONST_H_
+#include "target_culling.h"
 
-struct Const
+Target_culling* Target_culling::get(const Target* first)
 {
-
-       enum {
-               build_version = 109,
-               daa_version = 0,
-               seedp_bits = 10,
-               seedp = 1<<seedp_bits,
-               max_seed_weight = 32,
-               max_shapes = 16,
-               max_shape_len = 32
-       };
-
-       static const char* version_string;
-       static const char* program_name;
-       static const char* id_delimiters;
-
-};
-
-#define SIMPLE_SEARCH
-// #define FREQUENCY_MASKING
-// #define ST_JOIN
-// #define NO_COLLISION_FILTER
-
-#endif /* CONST_H_ */
+       return config.query_overlap_culling == 0.0 ? new Target_culling(first) 
: new Overlap_culling;
+}
\ No newline at end of file
diff --git a/src/output/target_culling.h b/src/output/target_culling.h
new file mode 100644
index 0000000..365b548
--- /dev/null
+++ b/src/output/target_culling.h
@@ -0,0 +1,73 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#ifndef TARGET_CULLING_H_
+#define TARGET_CULLING_H_
+
+#include <vector>
+#include "../align/query_mapper.h"
+
+struct Target_culling
+{
+       Target_culling():
+               top_score_(0)
+       {}
+       Target_culling(const Target *first):
+               n_(0),
+               top_score_(first ? first->filter_score : 0)
+       {}
+       virtual int cull(const Target &t)
+       {
+               return config.output_range((unsigned)n_, t.filter_score, 
top_score_) ? use : finished;
+       }
+       virtual void add(Target *t)
+       {
+               ++n_;
+       }
+       enum { finished = 0, next = 1, use = 2};
+       static Target_culling* get(const Target* first);
+private:
+       size_t n_;
+       const int top_score_;
+};
+
+struct Overlap_culling : Target_culling
+{
+       Overlap_culling()
+       {}
+       virtual int cull(const Target &t)
+       {
+               return t.is_enveloped(targets_.begin(), targets_.end(), 
config.query_overlap_culling / 100.0, int(double(t.filter_score) / (1.0 - 
config.toppercent / 100.0)))
+                       ? next : use;
+       }
+       virtual void add(Target *t)
+       {
+               targets_.push_back(t);
+       }
+       virtual void add(unsigned n, const Hsp_data &hsp)
+       {
+               if (n >= targets_.size()) {
+                       targets_.push_back(new Target(hsp.score));
+               }
+               targets_[n]->ts.push_back(Hsp_traits(hsp));
+       }
+private:
+       std::vector<Target*> targets_;
+};
+
+#endif
\ No newline at end of file
diff --git a/src/output/taxon_format.cpp b/src/output/taxon_format.cpp
new file mode 100644
index 0000000..5979ae4
--- /dev/null
+++ b/src/output/taxon_format.cpp
@@ -0,0 +1,51 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchf...@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <set>
+#include "output_format.h"
+#include "../data/taxonomy.h"
+
+using std::set;
+
+void Taxon_format::print_match(const Hsp_context &r, Text_buffer &out)
+{
+       set<unsigned> taxons;
+       taxonomy.get_taxids(r.subject_name, taxons);
+       if (taxons.empty())
+               return;
+       evalue = std::min(evalue, r.evalue());
+       try {
+               for (set<unsigned>::const_iterator i = taxons.begin(); i != 
taxons.end(); ++i)
+                       taxid = taxonomy.get_lca(taxid, *i);
+       }
+       catch (std::exception &) {
+               std::cerr << "Query=" << r.query_name << endl << "Subject=" << 
r.subject_name << endl;
+               throw;
+       }
+}
+
+void Taxon_format::print_query_epilog(Text_buffer &out, const char 
*query_title, bool unaligned) const
+{
+       out.write_until(query_title, Const::id_delimiters);
+       out << '\t' << taxid << '\t';
+       if (taxid != 0)
+               out.print_e(evalue);
+       else
+               out << '0';
+       out << '\n';
+}
\ No newline at end of file
diff --git a/src/output/view.h b/src/output/view.cpp
similarity index 73%
rename from src/output/view.h
rename to src/output/view.cpp
index 21a2ba6..2a66b01 100644
--- a/src/output/view.h
+++ b/src/output/view.cpp
@@ -16,9 +16,6 @@ You should have received a copy of the GNU Affero General 
Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#ifndef VIEW_H_
-#define VIEW_H_
-
 #include "../basic/config.h"
 #include "../util/binary_file.h"
 #include "../util/text_buffer.h"
@@ -28,13 +25,14 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 #include "../util/task_queue.h"
 #include "../basic/score_matrix.h"
 #include "../util/thread.h"
+#include "../data/taxonomy.h"
 
 const unsigned view_buf_size = 32;
 
 struct View_writer
 {
-       View_writer():
-               f_ (config.compression == 1
+       View_writer() :
+               f_(config.compression == 1
                        ? new Compressed_ostream(config.output_file)
                        : new Output_stream(config.output_file))
        { }
@@ -52,17 +50,18 @@ struct View_writer
 
 struct View_fetcher
 {
-       View_fetcher(DAA_file &daa):
-               daa (daa)
+       View_fetcher(DAA_file &daa) :
+               daa(daa)
        { }
        bool operator()()
        {
                n = 0;
-               for(unsigned i=0;i<view_buf_size;++i)
+               for (unsigned i = 0; i<view_buf_size; ++i)
                        if (!daa.read_query_buffer(buf[i], query_num)) {
                                query_num -= n - 1;
                                return false;
-                       } else
+                       }
+                       else
                                ++n;
                query_num -= n - 1;
                return true;
@@ -73,63 +72,73 @@ struct View_fetcher
        DAA_file &daa;
 };
 
-void view_query(DAA_query_record &r, Text_buffer &out, const Output_format 
&format)
+void view_query(DAA_query_record &r, Text_buffer &out, Output_format &format)
 {
-       format.print_query_intro(r.query_num, r.query_name.c_str(), 
(unsigned)r.query_len(), out, false);
-       for (DAA_query_record::Match_iterator i = r.begin(); i.good(); ++i) {
+       auto_ptr<Output_format> f(format.clone());
+       f->print_query_intro(r.query_num, r.query_name.c_str(), 
(unsigned)r.query_len(), out, false);
+       DAA_query_record::Match_iterator i = r.begin();
+       const unsigned top_score = i.good() ? i->score : 0;
+       for (; i.good(); ++i) {
                if (i->frame > 2 && config.forwardonly)
                        continue;
-               format.print_match(i->context(), out);
+               if (!config.output_range(i->hit_num, i->score, top_score))
+                       break;
+               f->print_match(i->context(), out);
        }
-       format.print_query_epilog(out, false);
+       f->print_query_epilog(out, r.query_name.c_str(), false);
 }
 
 struct View_context
 {
-       View_context(DAA_file &daa, View_writer &writer, const Output_format 
&format):
-               daa (daa),
-               writer (writer),
-               queue (3*config.threads_, writer),
-               format (format)
+       View_context(DAA_file &daa, View_writer &writer, Output_format &format) 
:
+               daa(daa),
+               writer(writer),
+               queue(3 * config.threads_, writer),
+               format(format)
        { }
        void operator()(unsigned thread_id)
        {
                try {
                        size_t n;
-                       View_fetcher query_buf (daa);
+                       View_fetcher query_buf(daa);
                        Text_buffer *buffer = 0;
-                       while(queue.get(n, buffer, query_buf)) {
+                       while (queue.get(n, buffer, query_buf)) {
                                for (unsigned j = 0; j < query_buf.n; ++j) {
                                        DAA_query_record r(daa, 
query_buf.buf[j], query_buf.query_num + j);
                                        view_query(r, *buffer, format);
                                }
                                queue.push(n);
                        }
-               } catch(std::exception &e) {
+               }
+               catch (std::exception &e) {
                        std::cout << e.what() << std::endl;
                        std::terminate();
                }
        }
        DAA_file &daa;
        View_writer &writer;
-       Task_queue<Text_buffer,View_writer> queue;
-       const Output_format &format;
+       Task_queue<Text_buffer, View_writer> queue;
+       Output_format &format;
 };
 
 void view()
 {
-       DAA_file daa (config.daa_file);
+       task_timer timer("Loading subject IDs");
+       DAA_file daa(config.daa_file);
        score_matrix = Score_matrix("", daa.lambda(), daa.kappa(), 
daa.gap_open_penalty(), daa.gap_extension_penalty());
+       timer.finish();
 
        message_stream << "Scoring parameters: " << score_matrix << endl;
        verbose_stream << "Build version = " << daa.diamond_build() << endl;
        message_stream << "DB sequences = " << daa.db_seqs() << endl;
        message_stream << "DB sequences used = " << daa.db_seqs_used() << endl;
        message_stream << "DB letters = " << daa.db_letters() << endl;
-       
-       task_timer timer("Generating output");
+
+       init_output();
+       taxonomy.init();
+
+       timer.go("Generating output");
        View_writer writer;
-       output_format = auto_ptr<Output_format>(get_output_format());
 
        Binary_buffer buf;
        size_t query_num;
@@ -152,5 +161,3 @@ void view()
 
        output_format->print_footer(*writer.f_);
 }
-
-#endif /* VIEW_H_ */
diff --git a/src/run/double_indexed.cpp b/src/run/double_indexed.cpp
index a6fa614..ab2ede5 100644
--- a/src/run/double_indexed.cpp
+++ b/src/run/double_indexed.cpp
@@ -22,7 +22,6 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 #include "../data/queries.h"
 #include "../basic/statistics.h"
 #include "../basic/shape_config.h"
-#include "../align/align_queries.h"
 #include "../search/align_range.h"
 #include "../util/seq_file_format.h"
 #include "../data/load_seqs.h"
@@ -320,7 +319,7 @@ void master_thread_di()
        timer2.start();
 
        align_mode = Align_mode(Align_mode::from_command(config.command));
-       output_format = auto_ptr<Output_format>(get_output_format());
+       init_output();
 
        message_stream << "Temporary directory: " << Temp_file::get_temp_dir() 
<< endl;
 
@@ -343,12 +342,7 @@ void master_thread_di()
        Config::set_option(config.db_size, (uint64_t)ref_header.letters);
 
        set_max_open_files(config.query_bins * config.threads_ + 
unsigned(ref_header.letters / (size_t)(config.chunk_size * 1e9)) + 16);
-
-       if (!config.prot_accession2taxid.empty()) {
-               timer.go("Loading taxonomy");
-               taxonomy.load();
-               timer.finish();
-       }
+       taxonomy.init();
 
        master_thread(db_file, timer2);
 }
\ No newline at end of file
diff --git a/src/run/main.cpp b/src/run/main.cpp
index 79b1fe5..db7e36d 100644
--- a/src/run/main.cpp
+++ b/src/run/main.cpp
@@ -18,7 +18,6 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>.
 
 #include <iostream>
 #include "../basic/config.h"
-#include "../output/view.h"
 #include "tools.h"
 #include "../extra/compare.h"
 
@@ -31,6 +30,9 @@ void model_seqs();
 void opt();
 void run_masker();
 void fastq2fasta();
+void view();
+void make_db();
+void db_info();
 
 int main(int ac, const char* av[])
 {
@@ -95,6 +97,9 @@ int main(int ac, const char* av[])
                case Config::fastq2fasta:
                        fastq2fasta();
                        break;
+               case Config::dbinfo:
+                       db_info();
+                       break;
                default:
                        return 1;
                }
diff --git a/src/run/tools.cpp b/src/run/tools.cpp
index 5e76d01..7b6b6c8 100644
--- a/src/run/tools.cpp
+++ b/src/run/tools.cpp
@@ -118,14 +118,14 @@ void run_masker()
                cout << '>' << string(id.data(), id.size()) << endl;
                seq2 = seq;
                Masking::get()(seq2.data(), seq2.size());
-               for (size_t i = 0; i < seq.size(); ++i) {
+               /*for (size_t i = 0; i < seq.size(); ++i) {
                        char c = value_traits.alphabet[(long)seq[i]];
                        if (seq2[i] == value_traits.mask_char)
                                c = tolower(c);
                        cout << c;
                }
-               cout << endl;
-               //cout << sequence(seq.data(), seq.size()) << endl;
+               cout << endl;*/
+               cout << sequence(seq2.data(), seq2.size()) << endl;
        }
 }
 
diff --git a/src/util/log_stream.h b/src/util/log_stream.h
index 9aacba1..06bd34c 100644
--- a/src/util/log_stream.h
+++ b/src/util/log_stream.h
@@ -69,6 +69,11 @@ extern Message_stream log_stream;
 
 struct task_timer
 {
+       task_timer(unsigned level = 1) :
+               level_(level),
+               msg_(0)
+       {
+       }
        task_timer(const char *msg, unsigned level=1) :
                level_(level),
                msg_(msg)
diff --git a/src/basic/const.h b/src/util/queue.h
similarity index 60%
copy from src/basic/const.h
copy to src/util/queue.h
index 0dd7c7a..cc24dfc 100644
--- a/src/basic/const.h
+++ b/src/util/queue.h
@@ -16,31 +16,38 @@ You should have received a copy of the GNU Affero General 
Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#ifndef CONST_H_
-#define CONST_H_
+#include "tinythread.h"
 
-struct Const
+struct Queue
 {
-
-       enum {
-               build_version = 109,
-               daa_version = 0,
-               seedp_bits = 10,
-               seedp = 1<<seedp_bits,
-               max_seed_weight = 32,
-               max_shapes = 16,
-               max_shape_len = 32
-       };
-
-       static const char* version_string;
-       static const char* program_name;
-       static const char* id_delimiters;
-
-};
-
-#define SIMPLE_SEARCH
-// #define FREQUENCY_MASKING
-// #define ST_JOIN
-// #define NO_COLLISION_FILTER
-
-#endif /* CONST_H_ */
+       enum { end = 0xffffffffffffffffllu };
+       Queue(size_t begin, size_t end) :
+               next_(begin),
+               end_(end)
+       {}
+       template<typename _f>
+       size_t get(_f &f)
+       {
+               mtx_.lock();
+               const size_t q = next_++;
+               if (q >= end_) {
+                       mtx_.unlock();
+                       return Queue::end;
+               }
+               f(q);
+               mtx_.unlock();
+               return q;
+       }
+       size_t next() const
+       {
+               return next_;
+       }
+       size_t get_end() const
+       {
+               return end_;
+       }
+private:
+       tthread::mutex mtx_;
+       size_t next_;
+       const size_t end_;
+};
\ No newline at end of file

-- 
Alioth's /usr/local/bin/git-commit-notice on 
/srv/git.debian.org/git/debian-med/diamond-aligner.git

_______________________________________________
debian-med-commit mailing list
debian-med-commit@lists.alioth.debian.org
http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit

Reply via email to