This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository last-align.
commit 28865eb7cc3f830147d0da6bebead15e97e99497 Author: Andreas Tille <[email protected]> Date: Mon Jan 9 14:16:08 2017 +0100 New upstream version 828 --- ChangeLog.txt | 109 +++++++++- doc/last-split.html | 28 ++- doc/last-split.txt | 25 ++- doc/last-tuning.html | 11 + doc/last-tuning.txt | 13 ++ doc/lastal.html | 5 + doc/lastal.txt | 7 + doc/lastdb.html | 42 +++- doc/lastdb.txt | 41 +++- makefile | 7 +- src/Centroid.cc | 438 +++++++++++++++++++++++---------------- src/Centroid.hh | 31 +-- src/DiagonalTable.hh | 9 +- src/GappedXdropAligner.cc | 21 +- src/GappedXdropAligner2qual.cc | 21 +- src/GappedXdropAlignerInl.hh | 7 + src/GappedXdropAlignerPssm.cc | 21 +- src/GeneralizedAffineGapCosts.cc | 1 - src/GeneralizedAffineGapCosts.hh | 7 +- src/LastalArguments.cc | 2 +- src/LastalArguments.hh | 20 +- src/LastdbArguments.cc | 2 +- src/LastdbArguments.hh | 12 +- src/MultiSequence.cc | 5 +- src/MultiSequence.hh | 2 +- src/SegmentPair.hh | 4 +- src/SegmentPairPot.cc | 2 +- src/SegmentPairPot.hh | 2 +- src/SubsetSuffixArray.cc | 13 +- src/SubsetSuffixArray.hh | 16 +- src/SubsetSuffixArraySearch.cc | 18 +- src/SubsetSuffixArraySort.cc | 9 +- src/lastal.cc | 34 +-- src/lastdb.cc | 3 +- src/makefile | 204 ++++++++++-------- src/split/cbrc_split_aligner.cc | 38 ++-- src/split/cbrc_split_aligner.hh | 9 +- src/split/last-split.cc | 31 +-- src/version.hh | 2 +- 39 files changed, 820 insertions(+), 452 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index c286315..157198b 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -1,8 +1,115 @@ +2016-12-19 Martin C. Frith <Martin C. Frith> + + * src/Centroid.cc: + Just re-ordered some probability calculations. + [c13b3ad70ddb] [tip] + + * src/Centroid.cc: + Refactoring. + [a31864221c04] + + * src/Centroid.cc: + Made lastal's probabilistic alignment faster. + [d2813c5ef29c] + + * src/Centroid.cc: + Refactoring. + [3323a24e4185] + + * src/Centroid.cc, src/Centroid.hh: + Refactoring. + [07185ef5ae09] + + * src/Centroid.cc, src/Centroid.hh: + Refactoring. + [23d7d4d65391] + +2016-12-16 Martin C. Frith <Martin C. Frith> + + * src/Centroid.cc: + Made lastal's probabilistic alignment faster, for FASTA queries. + [8a5fff275e58] + + * src/Centroid.cc, src/GappedXdropAligner.cc, + src/GappedXdropAligner2qual.cc, src/GappedXdropAlignerInl.hh, + src/GappedXdropAlignerPssm.cc, src/GeneralizedAffineGapCosts.cc, + src/GeneralizedAffineGapCosts.hh: + Made lastal with asymmetric gap costs faster (but symmetric gap + costs slower?) + [370ee034eebc] + + * src/MultiSequence.cc, test/last-test.out, test/last-test.sh: + Sped up fasta-reading, added tests. + [5be7ba2d1f50] + +2016-12-13 Martin C. Frith <Martin C. Frith> + + * makefile: + Bugfix: the previous update put junk in the zip-file. + [3e3e4e67c69d] + + * .hgignore, doc/last-split.txt, doc/last-tuning.txt, doc/lastal.txt, + doc/lastdb.txt, makefile, src/lastal.cc, src/lastdb.cc, + src/makefile, src/split/cbrc_split_aligner.cc: + Added 8-byte LAST! + [d6d939a854fd] + +2016-12-09 Martin C. Frith <Martin C. Frith> + + * makefile, src/makefile: + Refactoring. + [cd7349e81e02] + + * src/makefile: + Refactoring. + [59c5f7c8532e] + +2016-12-08 Martin C. Frith <Martin C. Frith> + + * src/DiagonalTable.hh, src/LastalArguments.hh, src/MultiSequence.hh, + src/SegmentPair.hh, src/SegmentPairPot.cc, src/SegmentPairPot.hh, + src/SubsetSuffixArray.hh, src/makefile: + Allow integers >= 2^32 in even more places. + [c24adb93f907] + +2016-12-06 Martin C. Frith <Martin C. Frith> + + * src/LastalArguments.hh, src/SubsetSuffixArray.hh, + src/SubsetSuffixArraySearch.cc, src/lastal.cc: + Allow integers >= 2^32 in still more places. + [4d18df171b8d] + + * src/LastdbArguments.cc, src/LastdbArguments.hh, + src/SubsetSuffixArray.cc, src/SubsetSuffixArray.hh: + Refactoring. + [a32c6d227241] + + * src/LastalArguments.hh, src/LastdbArguments.hh, + src/SubsetSuffixArray.hh, src/SubsetSuffixArraySearch.cc, + src/SubsetSuffixArraySort.cc, src/split/cbrc_split_aligner.cc, + src/split/cbrc_split_aligner.hh: + Allow integers >= 2^32 in yet more places. + [e55f2a3d8569] + + * src/LastalArguments.cc, src/LastalArguments.hh, + src/LastdbArguments.hh, src/SubsetSuffixArray.cc, + src/SubsetSuffixArray.hh, src/lastal.cc, src/lastdb.cc: + Allow integers >= 2^32 in more places. + [e8c20a8e0137] + +2016-12-02 Martin C. Frith <Martin C. Frith> + + * doc/last-split.txt, src/split/cbrc_split_aligner.cc, + src/split/cbrc_split_aligner.hh, src/split/last-split.cc, test/last- + split-test.out: + Added "sense" to last-split -d2 output. + [c332e3b4ece7] + 2016-11-24 Martin C. Frith <Martin C. Frith> * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh: Made last-split a bit faster. - [604434e30b9b] [tip] + [604434e30b9b] * src/split/cbrc_split_aligner.cc: Enabled last-split -g to read multi-volume genomes. diff --git a/doc/last-split.html b/doc/last-split.html index 32bad5c..f36c8b1 100644 --- a/doc/last-split.html +++ b/doc/last-split.html @@ -339,10 +339,10 @@ lastal -Q1 -D100 db q.fastq | last-split > out.maf </div> <div class="section" id="spliced-alignment-of-rna-reads-to-a-genome"> <h3>Spliced alignment of RNA reads to a genome</h3> -<p>Now we assume that "q.fastq" has reads from RNA forward strands. This -time, we provide the genome information to last-split, which causes it -to do spliced instead of split alignment, and also tells it where the -splice signals are (GT, AG, etc):</p> +<p>Now we assume that "q.fastq" has reads from RNA forward (sense) +strands. This time, we provide the genome information to last-split, +which causes it to do spliced instead of split alignment, and also +tells it where the splice signals are (GT, AG, etc):</p> <pre class="literal-block"> lastdb -uNEAR -R01 db genome.fasta lastal -Q1 -D10 db q.fastq | last-split -g db > out.maf @@ -357,6 +357,8 @@ between any two places in the genome.</p> very short parts of a spliced alignment (e.g. short exons). Note that last-split discards the lowest-significance alignments, but it uses them to estimate the ambiguity of higher-significance alignments.</p> +<p>If your reads are from unknown/mixed RNA strands, add -d2 to the +last-split options.</p> </div> <div class="section" id="alignment-of-two-whole-genomes"> <h3>Alignment of two whole genomes</h3> @@ -577,10 +579,16 @@ from the named genome. NAME should be the name of a lastdb database.</td></tr> <tr><td class="option-group"> <kbd><span class="option">-d</span>, <span class="option">--direction=<var>D</var></span></kbd></td> -<td>Do spliced alignment, and set the strandedness of the -queries: 0=reverse, 1=forward, 2=unknown/mixed. This +<td><p class="first">Do spliced alignment, and set the strandedness of the +queries: 0=antisense, 1=sense, 2=unknown/mixed. This determines whether forward and/or reverse-complement splice -signals are used.</td></tr> +signals are used.</p> +<p>If you use -d2, the output will have an extra "sense" field, +indicating the log-odds that the query is sense-stranded:</p> +<pre class="last literal-block"> +log2[ prob(sense) / prob(antisense) ] +</pre> +</td></tr> <tr><td class="option-group"> <kbd><span class="option">-c</span>, <span class="option">--cis=<var>PROB</var></span></kbd></td> <td>Do spliced alignment, and set the average probability per @@ -671,6 +679,12 @@ alignments are oriented to use the forward strand of the query.</p> </li> </ul> </div> +<div class="section" id="last-split8"> +<h2>last-split8</h2> +<p>last-split8 is almost identical to last-split. The only difference is +the -g option: last-split can only read the output of lastdb, whereas +last-split8 can only read the output of <a class="reference external" href="lastdb.html">lastdb8</a>.</p> +</div> <div class="section" id="limitations"> <h2>Limitations</h2> <p>last-split does not support:</p> diff --git a/doc/last-split.txt b/doc/last-split.txt index 0bda7ff..479b0e7 100644 --- a/doc/last-split.txt +++ b/doc/last-split.txt @@ -26,10 +26,10 @@ can do the alignment like this:: Spliced alignment of RNA reads to a genome ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Now we assume that "q.fastq" has reads from RNA forward strands. This -time, we provide the genome information to last-split, which causes it -to do spliced instead of split alignment, and also tells it where the -splice signals are (GT, AG, etc):: +Now we assume that "q.fastq" has reads from RNA forward (sense) +strands. This time, we provide the genome information to last-split, +which causes it to do spliced instead of split alignment, and also +tells it where the splice signals are (GT, AG, etc):: lastdb -uNEAR -R01 db genome.fasta lastal -Q1 -D10 db q.fastq | last-split -g db > out.maf @@ -46,6 +46,9 @@ very short parts of a spliced alignment (e.g. short exons). Note that last-split discards the lowest-significance alignments, but it uses them to estimate the ambiguity of higher-significance alignments. +If your reads are from unknown/mixed RNA strands, add -d2 to the +last-split options. + Alignment of two whole genomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -185,10 +188,15 @@ Options -d, --direction=D Do spliced alignment, and set the strandedness of the - queries: 0=reverse, 1=forward, 2=unknown/mixed. This + queries: 0=antisense, 1=sense, 2=unknown/mixed. This determines whether forward and/or reverse-complement splice signals are used. + If you use -d2, the output will have an extra "sense" field, + indicating the log-odds that the query is sense-stranded:: + + log2[ prob(sense) / prob(antisense) ] + -c, --cis=PROB Do spliced alignment, and set the average probability per base of cis-splicing. The default value roughly fits human @@ -273,6 +281,13 @@ The following points matter only if you are doing something unusual * It assumes this score matrix applies to all alignments, when the alignments are oriented to use the forward strand of the query. +last-split8 +----------- + +last-split8 is almost identical to last-split. The only difference is +the -g option: last-split can only read the output of lastdb, whereas +last-split8 can only read the output of `lastdb8 <lastdb.html>`_. + Limitations ----------- diff --git a/doc/last-tuning.html b/doc/last-tuning.html index 11ec408..a273b3b 100644 --- a/doc/last-tuning.html +++ b/doc/last-tuning.html @@ -367,6 +367,17 @@ alphabetically earliest.</p> <p>The fraction of positions that are "minimum" is roughly: 2 / (W + 1).</p> </div> </div> +<div class="section" id="lastdb8-lastal8"> +<h2>lastdb8 & lastal8</h2> +<p>If your reference has more than about 4 billion letters, 8-byte LAST +<em>may</em> be beneficial. Ordinary (4-byte) LAST cannot directly handle so +much data, so it splits it into volumes, which is inefficient. 8-byte +LAST can handle such data without voluming, but it uses more memory.</p> +<p>8-byte LAST combines well with lastdb option -w or -W, which reduce +memory usage. Something like <tt class="docutils literal">lastdb8 <span class="pre">-W63</span></tt> enables rapid, +huge-scale homology search, with moderate memory usage, but low +sensitivity.</p> +</div> <div class="section" id="other-options"> <h2>Other options</h2> <div class="section" id="lastal-m"> diff --git a/doc/last-tuning.txt b/doc/last-tuning.txt index 4fe79b2..6493e62 100644 --- a/doc/last-tuning.txt +++ b/doc/last-tuning.txt @@ -61,6 +61,19 @@ alphabetically earliest. The fraction of positions that are "minimum" is roughly: 2 / (W + 1). +lastdb8 & lastal8 +~~~~~~~~~~~~~~~~~ + +If your reference has more than about 4 billion letters, 8-byte LAST +*may* be beneficial. Ordinary (4-byte) LAST cannot directly handle so +much data, so it splits it into volumes, which is inefficient. 8-byte +LAST can handle such data without voluming, but it uses more memory. + +8-byte LAST combines well with lastdb option -w or -W, which reduce +memory usage. Something like ``lastdb8 -W63`` enables rapid, +huge-scale homology search, with moderate memory usage, but low +sensitivity. + Other options ~~~~~~~~~~~~~ diff --git a/doc/lastal.html b/doc/lastal.html index d58d90a..94f9541 100644 --- a/doc/lastal.html +++ b/doc/lastal.html @@ -912,6 +912,11 @@ enough memory to hold several volumes simultaneously, or run lastal on one volume at a time. An efficient scheme is to use a different computer for each volume.</p> </div> +<div class="section" id="lastal8"> +<h2>lastal8</h2> +<p>lastal8 has identical usage to lastal, and is used with <a class="reference external" href="lastdb.html">lastdb8</a>. lastal cannot read the output of lastdb8, and +lastal8 cannot read the output of lastdb.</p> +</div> </div> </body> </html> diff --git a/doc/lastal.txt b/doc/lastal.txt index ce62516..d8b2690 100644 --- a/doc/lastal.txt +++ b/doc/lastal.txt @@ -538,3 +538,10 @@ Therefore, with parallel processes, you should either ensure you have enough memory to hold several volumes simultaneously, or run lastal on one volume at a time. An efficient scheme is to use a different computer for each volume. + +lastal8 +------- + +lastal8 has identical usage to lastal, and is used with `lastdb8 +<lastdb.html>`_. lastal cannot read the output of lastdb8, and +lastal8 cannot read the output of lastdb. diff --git a/doc/lastdb.html b/doc/lastdb.html index 7fc9a57..1e6fafb 100644 --- a/doc/lastdb.html +++ b/doc/lastdb.html @@ -442,8 +442,8 @@ can use suffixes K, M, and G to specify KibiBytes, MebiBytes, and GibiBytes (e.g. "-s 5G").</p> <p>However, the output for one sequence is never split. Since the output files are several-fold bigger than the input (unless you -use -w), this means that mammalian chromosomes cannot be -processed using much less than 2G (unless you use -w).</p> +use -w or -W), this means that mammalian chromosomes cannot be +processed using much less than 2G (unless you use -w or -W).</p> <p class="last">There is a hard upper limit of about 4 billion sequence letters per volume. Together with the previous point, this means that lastdb will refuse to process any single sequence longer than @@ -525,6 +525,44 @@ counting is never case-sensitive.</td></tr> </blockquote> </div> </div> +<div class="section" id="lastdb8"> +<h2>lastdb8</h2> +<p>lastdb8 is identical to lastdb, except that it internally uses larger +(8-byte) integers. This means it can handle more than 4 billion +sequence letters per volume, but it uses more memory.</p> +</div> +<div class="section" id="memory-and-disk-usage"> +<h2>Memory and disk usage</h2> +<p>Suppose we give lastdb N letters of sequence data, of which M are +non-masked "real" letters (e.g. excluding N for DNA and X for +proteins). The output files will include:</p> +<ul> +<li><p class="first">The sequences (N bytes).</p> +</li> +<li><p class="first">An "index" consisting of: +positions (4M bytes), and "buckets" (<= M bytes).</p> +</li> +<li><p class="first">The sequence names (<em>usually</em> negligible).</p> +</li> +</ul> +<p>This is modified by several options.</p> +<ul> +<li><p class="first">-C1 adds M bytes to the index, -C2 adds 2M bytes, and -C3 adds 4M +bytes.</p> +</li> +<li><p class="first">-w STEP: makes the index STEP times smaller.</p> +</li> +<li><p class="first">-W SIZE: makes the index about (SIZE+1)/2 times smaller.</p> +</li> +<li><p class="first">lastdb8: makes the index twice as big.</p> +</li> +<li><p class="first">-u, -m: Multiple patterns multiply the index size. For example, +<a class="reference external" href="last-seeds.html">MAM8</a> makes it 8 times bigger.</p> +</li> +<li><p class="first">-s: does not change the total size, but splits it into volumes.</p> +</li> +</ul> +</div> <div class="section" id="limitations"> <h2>Limitations</h2> <p>lastdb can become catastrophically slow for highly redundant diff --git a/doc/lastdb.txt b/doc/lastdb.txt index caba336..0d2d017 100644 --- a/doc/lastdb.txt +++ b/doc/lastdb.txt @@ -110,8 +110,8 @@ Advanced Options However, the output for one sequence is never split. Since the output files are several-fold bigger than the input (unless you - use -w), this means that mammalian chromosomes cannot be - processed using much less than 2G (unless you use -w). + use -w or -W), this means that mammalian chromosomes cannot be + processed using much less than 2G (unless you use -w or -W). There is a hard upper limit of about 4 billion sequence letters per volume. Together with the previous point, this means that @@ -190,6 +190,43 @@ Advanced Options -V, --version Show version information, and exit. +lastdb8 +------- + +lastdb8 is identical to lastdb, except that it internally uses larger +(8-byte) integers. This means it can handle more than 4 billion +sequence letters per volume, but it uses more memory. + +Memory and disk usage +--------------------- + +Suppose we give lastdb N letters of sequence data, of which M are +non-masked "real" letters (e.g. excluding N for DNA and X for +proteins). The output files will include: + +* The sequences (N bytes). + +* An "index" consisting of: + positions (4M bytes), and "buckets" (<= M bytes). + +* The sequence names (*usually* negligible). + +This is modified by several options. + +* -C1 adds M bytes to the index, -C2 adds 2M bytes, and -C3 adds 4M + bytes. + +* -w STEP: makes the index STEP times smaller. + +* -W SIZE: makes the index about (SIZE+1)/2 times smaller. + +* lastdb8: makes the index twice as big. + +* -u, -m: Multiple patterns multiply the index size. For example, + `MAM8 <last-seeds.html>`_ makes it 8 times bigger. + +* -s: does not change the total size, but splits it into volumes. + Limitations ----------- diff --git a/makefile b/makefile index 545a6b9..7d1bb11 100644 --- a/makefile +++ b/makefile @@ -2,12 +2,15 @@ CXXFLAGS = -O3 -std=c++11 -pthread -DHAS_CXX_THREADS all: @cd src && $(MAKE) CXXFLAGS="$(CXXFLAGS)" +progs = src/lastdb src/lastal src/last-split src/last-merge-batches \ +src/last-pair-probs src/lastdb8 src/lastal8 src/last-split8 + prefix = /usr/local exec_prefix = $(prefix) bindir = $(exec_prefix)/bin install: all mkdir -p $(bindir) - cp src/last?? src/last-split src/last-merge-batches src/last-pair-probs scripts/* $(bindir) + cp $(progs) scripts/* $(bindir) clean: @cd src && $(MAKE) clean @@ -17,7 +20,7 @@ html: distdir = last-`hg id -n` -RSYNCFLAGS = -aC --exclude 'last??' --exclude last-split --exclude last-merge-batches --exclude last-pair-probs +RSYNCFLAGS = -aC --exclude '*8' --exclude 'last??' --exclude last-split --exclude last-merge-batches --exclude last-pair-probs dist: log html @cd src && $(MAKE) version.hh CyclicSubsetSeedData.hh ScoreMatrixData.hh diff --git a/src/Centroid.cc b/src/Centroid.cc index 792ed45..42c5286 100644 --- a/src/Centroid.cc +++ b/src/Centroid.cc @@ -110,11 +110,8 @@ namespace cbrc{ } void Centroid::initBackwardMatrix(){ - pp.resize( fM.size() ); mD.assign( numAntidiagonals + 2, 0.0 ); mI.assign( numAntidiagonals + 2, 0.0 ); - mX1.assign ( numAntidiagonals + 2, 1.0 ); - mX2.assign ( numAntidiagonals + 2, 1.0 ); size_t n = xa.scoreEndIndex( numAntidiagonals ); bM.assign( n, 0.0 ); @@ -170,11 +167,11 @@ namespace cbrc{ assert( gap.insExist == gap.delExist || eP <= 0.0 ); for( size_t k = 0; k < numAntidiagonals; ++k ){ // loop over antidiagonals - double sum_f = 0.0; // sum of forward values const size_t seq1beg = seq1start( k ); const size_t seq2pos = k - seq1beg; - const double scale12 = 1.0 / ( scale[k+1] * scale[k] ); - const double scale1 = 1.0 / scale[k+1]; + const double scale12 = scale[k+1] * scale[k]; + const double scale1 = scale[k+1]; + double sum_f = 0.0; // sum of forward values const double seE = eE * scale1; const double seEI = eEI * scale1; @@ -203,37 +200,57 @@ namespace cbrc{ if (! isPssm) { const uchar* s2 = seqPtr( seq2, isForward, seq2pos ); - while (1) { // start: inner most loop - const double xM = *fM2 * scale12; - const double xD = *fD1 * seE; - const double xI = *fI1 * seEI; - const double xP = *fP2 * seP; - *fD0 = xM * eF + xD + xP; - *fI0 = (xM + xD) * eFI + xI + xP; - *fM0 = (xM + xD + xI + xP) * match_score[ *s1 ][ *s2 ]; - *fP0 = xM * eF + xP; - sum_f += xM; - if( globality && (isDelimiter(*s2, *match_score) || - isDelimiter(*s1, *match_score)) ){ - Z += xM + xD + xI + xP; + if (isAffine) { + while (1) { + const double xM = *fM2 * scale12; + const double xD = *fD1 * seE; + const double xI = *fI1 * seEI; + *fD0 = xM * eF + xD; + *fI0 = (xM + xD) * eFI + xI; + *fM0 = (xM + xD + xI) * match_score[ *s1 ][ *s2 ]; + sum_f += xM; + if( globality && (isDelimiter(*s2, *match_score) || + isDelimiter(*s1, *match_score)) ){ + Z += xM + xD + xI; + } + if (fM0 == fM0last) break; + fM0++; fD0++; fI0++; + fM2++; fD1++; fI1++; + s1 += seqIncrement; + s2 -= seqIncrement; } - if (fM0 == fM0last) break; - fM0++; fD0++; fI0++; fP0++; - fM2++; fD1++; fI1++; fP2++; - s1 += seqIncrement; - s2 -= seqIncrement; - } // end: inner most loop - } // end: if (! isPssm) - else { + } else { + while (1) { + const double xM = *fM2 * scale12; + const double xD = *fD1 * seE; + const double xI = *fI1 * seEI; + const double xP = *fP2 * seP; + *fD0 = xM * eF + xD + xP; + *fI0 = (xM + xD) * eFI + xI + xP; + *fM0 = (xM + xD + xI + xP) * match_score[ *s1 ][ *s2 ]; + *fP0 = xM * eF + xP; + sum_f += xM; + if( globality && (isDelimiter(*s2, *match_score) || + isDelimiter(*s1, *match_score)) ){ + Z += xM + xD + xI + xP; + } + if (fM0 == fM0last) break; + fM0++; fD0++; fI0++; fP0++; + fM2++; fD1++; fI1++; fP2++; + s1 += seqIncrement; + s2 -= seqIncrement; + } + } + } else { const ExpMatrixRow* p2 = seqPtr( pssm, isForward, seq2pos ); if (isAffine) { - while (1) { // start: inner most loop + while (1) { const double xM = *fM2 * scale12; const double xD = *fD1 * seE; - const double xI = *fI1 * seE; + const double xI = *fI1 * seEI; *fD0 = xM * eF + xD; - *fI0 = (xM + xD) * eF + xI; + *fI0 = (xM + xD) * eFI + xI; *fM0 = (xM + xD + xI) * (*p2)[ *s1 ]; sum_f += xM; if ( globality && (isDelimiter(0, *p2) || @@ -245,9 +262,9 @@ namespace cbrc{ fM2++; fD1++; fI1++; s1 += seqIncrement; p2 -= seqIncrement; - } // end: inner most loop - }else{ - while (1) { // start: inner most loop + } + } else { + while (1) { const double xM = *fM2 * scale12; const double xD = *fD1 * seE; const double xI = *fI1 * seEI; @@ -266,21 +283,22 @@ namespace cbrc{ fM2++; fD1++; fI1++; fP2++; s1 += seqIncrement; p2 -= seqIncrement; - } // end: inner most loop + } } } + if( !globality ) Z += sum_f; - scale[k+2] = sum_f + 1.0; // seems ugly - Z /= scale[k+2]; // scaling + scale[k+2] = 1.0 / (sum_f + 1.0); // seems ugly + Z *= scale[k+2]; // scaling } // k + //std::cout << "# Z=" << Z << std::endl; assert( Z > 0.0 ); - scale[ numAntidiagonals + 1 ] *= Z; // this causes scaled Z to equal 1 + scale[ numAntidiagonals + 1 ] /= Z; // this causes scaled Z to equal 1 } // added by M. Hamada // compute posterior probabilities while executing backward algorithm - // posterior probabilities are stored in pp void Centroid::backward( const uchar* seq1, const uchar* seq2, size_t start1, size_t start2, bool isForward, int globality, @@ -312,9 +330,9 @@ namespace cbrc{ for( size_t k = numAntidiagonals; k-- > 0; ){ const size_t seq1beg = seq1start( k ); const size_t seq2pos = k - seq1beg; - const double scale12 = 1.0 / ( scale[k+1] * scale[k] ); - const double scale1 = 1.0 / scale[k+1]; - scaledUnit /= scale[k+2]; + const double scale12 = scale[k+1] * scale[k]; + const double scale1 = scale[k+1]; + scaledUnit *= scale[k+2]; const double seE = eE * scale1; const double seEI = eEI * scale1; @@ -326,8 +344,6 @@ namespace cbrc{ const double* bI0 = &bI[ scoreEnd + 1 ]; const double* bP0 = &bP[ scoreEnd + 1 ]; - double* pp0 = &pp[ scoreEnd ]; - const size_t horiBeg = xa.hori( k, seq1beg ); const size_t vertBeg = xa.vert( k, seq1beg ); const size_t diagBeg = xa.diag( k, seq1beg ); @@ -336,73 +352,98 @@ namespace cbrc{ double* bM2 = &bM[ diagBeg ]; double* bP2 = &bP[ diagBeg ]; - const double* fM2 = &fM[ diagBeg ]; const double* fD1 = &fD[ horiBeg ]; const double* fI1 = &fI[ vertBeg ]; const double* fP2 = &fP[ diagBeg ]; const double* bM0last = bM0 + xa.numCellsAndPads( k ) - 2; - int i = seq1beg; int j = seq2pos; + double* mDout = &mD[ seq1beg ]; + double* mIout = &mI[ seq2pos ]; const uchar* s1 = seqPtr( seq1, isForward, seq1beg ); if (! isPssm ) { const uchar* s2 = seqPtr( seq2, isForward, seq2pos ); - while (1) { // inner most loop - double yM = *bM0 * match_score[ *s1 ][ *s2 ]; - double yD = *bD0; - double yI = *bI0; - double yP = *bP0; - double zM = yM + yD * eF + yI * eFI + yP * eF; - double zD = yM + yD + yI * eFI; - double zI = yM + yI; - double zP = yM + yP + yD + yI; - if( globality ){ - if( isDelimiter(*s2, *match_score) || - isDelimiter(*s1, *match_score) ){ - zM += scaledUnit; zD += scaledUnit; zI += scaledUnit; - zP += scaledUnit; + if (isAffine) { + while (1) { + double yM = *bM0 * match_score[ *s1 ][ *s2 ]; + double yD = *bD0; + double yI = *bI0; + double zM = yM + yD * eF + yI * eFI; + double zD = yM + yD + yI * eFI; + double zI = yM + yI; + if( globality ){ + if( isDelimiter(*s2, *match_score) || + isDelimiter(*s1, *match_score) ){ + zM += scaledUnit; zD += scaledUnit; zI += scaledUnit; + } + }else{ + zM += scaledUnit; + } + *bM2 = zM * scale12; + *bD1 = zD * seE; + *bI1 = zI * seEI; + + *mDout += (*fD1) * (*bD1); + *mIout += (*fI1) * (*bI1); + + if (bM0 == bM0last) break; + mDout++; mIout--; + bM2++; bD1++; bI1++; + bM0++; bD0++; bI0++; + fD1++; fI1++; + s1 += seqIncrement; + s2 -= seqIncrement; + } + } else { + while (1) { + double yM = *bM0 * match_score[ *s1 ][ *s2 ]; + double yD = *bD0; + double yI = *bI0; + double yP = *bP0; + double zM = yM + yD * eF + yI * eFI + yP * eF; + double zD = yM + yD + yI * eFI; + double zI = yM + yI; + double zP = yM + yP + yD + yI; + if( globality ){ + if( isDelimiter(*s2, *match_score) || + isDelimiter(*s1, *match_score) ){ + zM += scaledUnit; zD += scaledUnit; zI += scaledUnit; + zP += scaledUnit; + } + }else{ + zM += scaledUnit; } - }else{ - zM += scaledUnit; + *bM2 = zM * scale12; + *bD1 = zD * seE; + *bI1 = zI * seEI; + *bP2 = zP * seP; + + double probp = *fP2 * *bP2; + *mDout += (*fD1) * (*bD1) + probp; + *mIout += (*fI1) * (*bI1) + probp; + + if (bM0 == bM0last) break; + mDout++; mIout--; + bM2++; bD1++; bI1++; bP2++; + bM0++; bD0++; bI0++; bP0++; + fD1++; fI1++; fP2++; + s1 += seqIncrement; + s2 -= seqIncrement; } - *bM2 = zM * scale12; - *bD1 = zD * seE; - *bI1 = zI * seEI; - *bP2 = zP * seP; - - double prob = *fM2 * *bM2; - *pp0 = prob; - double probd = *fD1 * *bD1; - double probi = *fI1 * *bI1; - double probp = *fP2 * *bP2; - mD[ i ] += probd + probp; - mI[ j ] += probi + probp; - mX1 [ i ] -= ( prob + probd + probp ); - mX2 [ j ] -= ( prob + probi + probp ); - - if (bM0 == bM0last) break; - i++; j--; - bM2++; bD1++; bI1++; bP2++; - bM0++; bD0++; bI0++; bP0++; - fM2++; fD1++; fI1++; fP2++; - pp0++; - s1 += seqIncrement; - s2 -= seqIncrement; } - } - else { + } else { const ExpMatrixRow* p2 = seqPtr( pssm, isForward, seq2pos ); if (isAffine) { - while (1) { // inner most loop + while (1) { double yM = *bM0 * ( *p2 )[ *s1 ]; double yD = *bD0; double yI = *bI0; - double zM = yM + yD * eF + yI * eF; - double zD = yM + yD + yI * eF; + double zM = yM + yD * eF + yI * eFI; + double zD = yM + yD + yI * eFI; double zI = yM + yI; if( globality ){ if( isDelimiter(0, *p2) || @@ -414,27 +455,20 @@ namespace cbrc{ } *bM2 = zM * scale12; *bD1 = zD * seE; - *bI1 = zI * seE; + *bI1 = zI * seEI; - double prob = *fM2 * *bM2; - *pp0 = prob; - double probd = *fD1 * *bD1; - double probi = *fI1 * *bI1; - mD[ i ] += probd; - mI[ j ] += probi; - mX1 [ i ] -= ( prob + probd ); - mX2 [ j ] -= ( prob + probi ); + *mDout += (*fD1) * (*bD1); + *mIout += (*fI1) * (*bI1); if (bM0 == bM0last) break; - i++; j--; + mDout++; mIout--; bM2++; bD1++; bI1++; bM0++; bD0++; bI0++; - fM2++; fD1++; fI1++; - pp0++; + fD1++; fI1++; s1 += seqIncrement; p2 -= seqIncrement; } - }else{ + } else { while (1) { double yM = *bM0 * ( *p2 )[ *s1 ]; double yD = *bD0; @@ -458,22 +492,15 @@ namespace cbrc{ *bI1 = zI * seEI; *bP2 = zP * seP; - double prob = *fM2 * *bM2; - *pp0 = prob; - double probd = *fD1 * *bD1; - double probi = *fI1 * *bI1; double probp = *fP2 * *bP2; - mD[ i ] += probd + probp; - mI[ j ] += probi + probp; - mX1 [ i ] -= ( prob + probd + probp ); - mX2 [ j ] -= ( prob + probi + probp ); + *mDout += (*fD1) * (*bD1) + probp; + *mIout += (*fI1) * (*bI1) + probp; if (bM0 == bM0last) break; - i++; j--; + mDout++; mIout--; bM2++; bD1++; bI1++; bP2++; bM0++; bD0++; bI0++; bP0++; - fM2++; fD1++; fI1++; fP2++; - pp0++; + fD1++; fI1++; fP2++; s1 += seqIncrement; p2 -= seqIncrement; } @@ -506,23 +533,27 @@ namespace cbrc{ for( size_t k = 1; k < numAntidiagonals; ++k ){ // loop over antidiagonals const size_t scoreEnd = xa.scoreEndIndex( k ); double* X0 = &X[ scoreEnd ]; - const double* P0 = &pp[ scoreEnd ]; - size_t cur = seq1start( k ); + size_t seq1pos = seq1start( k ); const double* const x0end = X0 + xa.numCellsAndPads( k ); - const double* X1 = &X[xa.hori(k, cur)]; - const double* X2 = &X[xa.diag(k, cur)]; + const size_t h = xa.hori( k, seq1pos ); + const size_t d = xa.diag( k, seq1pos ); + const double* X1 = &X[h]; + const double* X2 = &X[d]; + const double* fM2 = &fM[d]; + const double* bM2 = &bM[d]; *X0++ = -DINF; // add one pad cell do{ - const double s = ( gamma + 1 ) * ( *P0++ ) - 1; + const double matchProb = (*fM2++) * (*bM2++); + const double s = ( gamma + 1 ) * matchProb - 1; const double oldX1 = *X1++; // Added by MCF const double score = std::max( std::max( oldX1, *X1 ), *X2++ + s ); //assert ( score >= 0 ); - updateScore ( score, k, cur ); + updateScore ( score, k, seq1pos ); *X0++ = score; - cur++; + seq1pos++; }while( X0 != x0end ); } return bestScore; @@ -537,10 +568,11 @@ namespace cbrc{ size_t oldPos1 = i; while( k > 0 ){ - const int m = - maxIndex( diagx( X, k, i ) + ( gamma + 1 ) * cellx( pp, k, i ) - 1, - horix( X, k, i ), - vertx( X, k, i ) ); + const size_t h = xa.hori( k, i ); + const size_t v = xa.vert( k, i ); + const size_t d = xa.diag( k, i ); + const double matchProb = fM[d] * bM[d]; + const int m = maxIndex( X[d] + (gamma + 1) * matchProb - 1, X[h], X[v] ); if( m == 0 ){ k -= 2; i -= 1; @@ -560,28 +592,51 @@ namespace cbrc{ initDecodingMatrix(); + mX1.assign ( numAntidiagonals + 2, 1.0 ); + mX2.assign ( numAntidiagonals + 2, 1.0 ); + + for (size_t k = 0; k < numAntidiagonals; ++k) { + size_t seq1pos = seq1start(k); + size_t seq2pos = k - seq1pos; + size_t loopBeg = xa.diag(k, seq1pos); + size_t loopEnd = loopBeg + xa.numCellsAndPads(k) - 1; + for (size_t i = loopBeg; i < loopEnd; ++i) { + const double matchProb = fM[i] * bM[i]; + mX1[seq1pos++] -= matchProb; + mX2[seq2pos--] -= matchProb; + } + } + for( size_t k = 1; k < numAntidiagonals; ++k ){ // loop over antidiagonals const size_t scoreEnd = xa.scoreEndIndex( k ); double* X0 = &X[ scoreEnd ]; - const double* P0 = &pp[ scoreEnd ]; - size_t cur = seq1start( k ); - size_t seq2pos = k - cur; + size_t seq1pos = seq1start( k ); + size_t seq2pos = k - seq1pos; const double* const x0end = X0 + xa.numCellsAndPads( k ); - const double* X1 = &X[ xa.hori(k, cur) ]; - const double* X2 = &X[ xa.diag(k, cur) ]; + const size_t h = xa.hori( k, seq1pos ); + const size_t d = xa.diag( k, seq1pos ); + const double* X1 = &X[h]; + const double* X2 = &X[d]; + const double* fM2 = &fM[d]; + const double* bM2 = &bM[d]; *X0++ = -DINF; // add one pad cell do{ - const double s = 2 * gamma * *P0++ - ( mX1[ cur ] + mX2[ seq2pos ] ); + const double matchProb = (*fM2++) * (*bM2++); + const double thisD = mD[seq1pos]; + const double thisI = mI[seq2pos]; + const double thisXD = mX1[seq1pos] - thisD; + const double thisXI = mX2[seq2pos] - thisI; + const double s = 2 * gamma * matchProb - (thisXD + thisXI); + const double u = gamma * thisD - thisXD; + const double t = gamma * thisI - thisXI; const double oldX1 = *X1++; // Added by MCF - const double u = gamma * mD[ cur ] - mX1[ cur ]; - const double t = gamma * mI[ seq2pos ] - mX2[ seq2pos ]; const double score = std::max( std::max( oldX1 + u, *X1 + t), *X2++ + s ); - updateScore ( score, k, cur ); + updateScore ( score, k, seq1pos ); *X0++ = score; - cur++; + seq1pos++; seq2pos--; }while( X0 != x0end ); } @@ -599,13 +654,18 @@ namespace cbrc{ while( k > 0 ){ const size_t j = k - i; - const double s = 2 * gamma * cellx( pp, k, i ) - ( mX1[ i ] + mX2[ j ] ); - const double t = gamma * mI[ j ] - mX2[ j ]; - const double u = gamma * mD[ i ] - mX1[ i ]; - const int m = - maxIndex( diagx( X, k, i ) + s, - horix( X, k, i ) + u, - vertx( X, k, i ) + t ); + const size_t h = xa.hori( k, i ); + const size_t v = xa.vert( k, i ); + const size_t d = xa.diag( k, i ); + const double matchProb = fM[d] * bM[d]; + const double thisD = mD[i]; + const double thisI = mI[j]; + const double thisXD = mX1[i] - thisD; + const double thisXI = mX2[j] - thisI; + const double s = 2 * gamma * matchProb - (thisXD + thisXI); + const double t = gamma * thisI - thisXI; + const double u = gamma * thisD - thisXD; + const int m = maxIndex( X[d] + s, X[h] + u, X[v] + t ); if( m == 0 ){ k -= 2; i -= 1; @@ -653,7 +713,8 @@ namespace cbrc{ size_t seq2pos = i->end2(); for( size_t j = 0; j < i->size; ++j ){ - double p = cellx( pp, seq1pos + seq2pos, seq1pos ); + size_t d = xa.diag( seq1pos + seq2pos, seq1pos ); + double p = fM[d] * bM[d]; ambiguityCodes.push_back( asciiProbability(p) ); --seq1pos; --seq2pos; @@ -679,7 +740,7 @@ namespace cbrc{ double Centroid::logPartitionFunction() const{ double x = 0.0; for( size_t k = 0; k < numAntidiagonals; ++k ){ - x += std::log( scale[k+2] ); + x -= std::log( scale[k+2] ); } return T * x; } @@ -711,8 +772,8 @@ namespace cbrc{ for( size_t k = 0; k < numAntidiagonals; ++k ){ // loop over antidiagonals const size_t seq1beg = seq1start( k ); const size_t seq2pos = k - seq1beg; - const double scale12 = 1.0 / ( scale[k+1] * scale[k] ); - const double scale1 = 1.0 / scale[k+1]; + const double scale12 = scale[k+1] * scale[k]; + const double scale1 = scale[k+1]; const double seE = eE * scale1; const double seEI = eEI * scale1; @@ -738,34 +799,59 @@ namespace cbrc{ const double* bM0last = bM0 + xa.numCellsAndPads( k ) - 2; if (! isPssm ) { - while (1) { // inner most loop - const double xM = *fM2 * scale12; - const double xD = *fD1 * seE; - const double xI = *fI1 * seEI; - const double xP = *fP2 * seP; - const double yM = *bM0 * match_score[ *s1 ][ *s2 ]; - const double yD = *bD0; - const double yI = *bI0; - const double yP = *bP0; - c.emit[*s1][*s2] += (xM + xD + xI + xP) * yM; - c.MM += xM * yM; - c.DM += xD * yM; - c.IM += xI * yM; - c.PM += xP * yM; - c.MD += xM * yD * eF; - c.DD += xD * yD; - c.PD += xP * yD; - c.MI += xM * yI * eFI; - c.DI += xD * yI * eFI; - c.II += xI * yI; - c.PI += xP * yI; - c.MP += xM * yP * eF; - c.PP += xP * yP; - if (bM0 == bM0last) break; - fM2++; fD1++; fI1++; fP2++; - bM0++; bD0++; bI0++; bP0++; - s1 += seqIncrement; - s2 -= seqIncrement; + if (isAffine) { + while (1) { + const double xM = *fM2 * scale12; + const double xD = *fD1 * seE; + const double xI = *fI1 * seEI; + const double yM = *bM0 * match_score[ *s1 ][ *s2 ]; + const double yD = *bD0; + const double yI = *bI0; + c.emit[*s1][*s2] += (xM + xD + xI) * yM; + c.MM += xM * yM; + c.DM += xD * yM; + c.IM += xI * yM; + c.MD += xM * yD * eF; + c.DD += xD * yD; + c.MI += xM * yI * eFI; + c.DI += xD * yI * eFI; + c.II += xI * yI; + if (bM0 == bM0last) break; + fM2++; fD1++; fI1++; + bM0++; bD0++; bI0++; + s1 += seqIncrement; + s2 -= seqIncrement; + } + } else { + while (1) { + const double xM = *fM2 * scale12; + const double xD = *fD1 * seE; + const double xI = *fI1 * seEI; + const double xP = *fP2 * seP; + const double yM = *bM0 * match_score[ *s1 ][ *s2 ]; + const double yD = *bD0; + const double yI = *bI0; + const double yP = *bP0; + c.emit[*s1][*s2] += (xM + xD + xI + xP) * yM; + c.MM += xM * yM; + c.DM += xD * yM; + c.IM += xI * yM; + c.PM += xP * yM; + c.MD += xM * yD * eF; + c.DD += xD * yD; + c.PD += xP * yD; + c.MI += xM * yI * eFI; + c.DI += xD * yI * eFI; + c.II += xI * yI; + c.PI += xP * yI; + c.MP += xM * yP * eF; + c.PP += xP * yP; + if (bM0 == bM0last) break; + fM2++; fD1++; fI1++; fP2++; + bM0++; bD0++; bI0++; bP0++; + s1 += seqIncrement; + s2 -= seqIncrement; + } } } else { @@ -775,7 +861,7 @@ namespace cbrc{ while (1) { // inner most loop const double xM = *fM2 * scale12; const double xD = *fD1 * seE; - const double xI = *fI1 * seE; + const double xI = *fI1 * seEI; const double yM = *bM0 * ( *p2 )[ *s1 ]; const double yD = *bD0; const double yI = *bI0; @@ -785,8 +871,8 @@ namespace cbrc{ c.IM += xI * yM; c.MD += xM * yD * eF; c.DD += xD * yD; - c.MI += xM * yI * eF; - c.DI += xD * yI * eF; + c.MI += xM * yI * eFI; + c.DI += xD * yI * eFI; c.II += xI * yI; if (bM0 == bM0last) break; fM2++; fD1++; fI1++; diff --git a/src/Centroid.hh b/src/Centroid.hh index 14cef9e..516bbb3 100644 --- a/src/Centroid.hh +++ b/src/Centroid.hh @@ -101,8 +101,6 @@ namespace cbrc{ dvec_t bI; // b^I(i,j) dvec_t bP; // b^P(i,j) - dvec_t pp; // posterior match probabilities - dvec_t mD; dvec_t mI; dvec_t mX1; @@ -128,30 +126,6 @@ namespace cbrc{ return xa.scoreEndIndex( antidiagonal ) - xa.scoreOrigin( antidiagonal ); } - // get DP matrix value at the given position - double cellx( const dvec_t& matrix, - size_t antiDiagonal, size_t seq1pos ) const{ - return matrix[ xa.scoreOrigin( antiDiagonal ) + seq1pos ]; - } - - // get DP matrix value "left of" the given position - double horix( const dvec_t& matrix, - size_t antiDiagonal, size_t seq1pos ) const{ - return matrix[ xa.hori( antiDiagonal, seq1pos ) ]; - } - - // get DP matrix value "above" the given position - double vertx( const dvec_t& matrix, - size_t antiDiagonal, size_t seq1pos ) const{ - return matrix[ xa.vert( antiDiagonal, seq1pos ) ]; - } - - // get DP matrix value "diagonal from" the given position - double diagx( const dvec_t& matrix, - size_t antiDiagonal, size_t seq1pos ) const{ - return matrix[ xa.diag( antiDiagonal, seq1pos ) ]; - } - // get a pointer into a sequence, taking start and direction into account template < typename T > static const T* seqPtr( const T* seq, bool isForward, size_t pos ){ @@ -160,5 +134,6 @@ namespace cbrc{ } }; -} // end namespace cbrc -#endif // CENTROID_HH +} + +#endif diff --git a/src/DiagonalTable.hh b/src/DiagonalTable.hh index ff3f9df..90acd2f 100644 --- a/src/DiagonalTable.hh +++ b/src/DiagonalTable.hh @@ -15,13 +15,15 @@ #ifndef DIAGONALTABLE_HH #define DIAGONALTABLE_HH + +#include <stddef.h> #include <utility> // pair #include <vector> namespace cbrc{ struct DiagonalTable{ - typedef unsigned indexT; + typedef LAST_INT_TYPE indexT; typedef std::pair<indexT, indexT> pairT; enum { BINS = 256 }; // use a power-of-two for faster modulus (maybe) @@ -36,5 +38,6 @@ struct DiagonalTable{ std::vector<pairT> hits[BINS]; }; -} // end namespace cbrc -#endif // DIAGONALTABLE_HH +} + +#endif diff --git a/src/GappedXdropAligner.cc b/src/GappedXdropAligner.cc index 4c8a532..479fdbe 100644 --- a/src/GappedXdropAligner.cc +++ b/src/GappedXdropAligner.cc @@ -92,10 +92,9 @@ int GappedXdropAligner::align(const uchar *seq1, int gapUnalignedCost, int maxScoreDrop, int maxMatchScore) { - bool isAffine = - insExistenceCost == delExistenceCost && - insExtensionCost == delExtensionCost && - gapUnalignedCost >= delExistenceCost + 2 * delExtensionCost; + const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost, + insExistenceCost, insExtensionCost, + gapUnalignedCost); std::size_t maxSeq1begs[] = { 0, 9 }; std::size_t minSeq1ends[] = { 1, 0 }; @@ -155,14 +154,13 @@ int GappedXdropAligner::align(const uchar *seq1, while (1) { int x = *x2; int y = *y1 - delExtensionCost; - int z = *z1 - delExtensionCost; + int z = *z1 - insExtensionCost; int b = maxValue(x, y, z); if (b >= minScore) { updateBest(bestScore, b, antidiagonal, x0, x0base); *x0 = b + scorer[*s1][*s2]; - int g = b - delExistenceCost; - *y0 = maxValue(g, y); - *z0 = maxValue(g, z); + *y0 = maxValue(b - delExistenceCost, y); + *z0 = maxValue(b - insExistenceCost, z); } else *x0 = *y0 = *z0 = -INF; if (x0 == x0last) break; @@ -172,14 +170,13 @@ int GappedXdropAligner::align(const uchar *seq1, while (1) { int x = *x2; int y = *y1 - delExtensionCost; - int z = *z1 - delExtensionCost; + int z = *z1 - insExtensionCost; int b = maxValue(x, y, z); if (b >= minScore) { updateBest(bestScore, b, antidiagonal, x0, x0base); *x0 = b + scorer[*s1][*s2]; - int g = b - delExistenceCost; - *y0 = maxValue(g, y); - *z0 = maxValue(g, z); + *y0 = maxValue(b - delExistenceCost, y); + *z0 = maxValue(b - insExistenceCost, z); } else *x0 = *y0 = *z0 = -INF; if (x0 == x0last) break; diff --git a/src/GappedXdropAligner2qual.cc b/src/GappedXdropAligner2qual.cc index 00c1559..b69cb32 100644 --- a/src/GappedXdropAligner2qual.cc +++ b/src/GappedXdropAligner2qual.cc @@ -24,10 +24,9 @@ int GappedXdropAligner::align2qual(const uchar *seq1, int gapUnalignedCost, int maxScoreDrop, int maxMatchScore) { - bool isAffine = - insExistenceCost == delExistenceCost && - insExtensionCost == delExtensionCost && - gapUnalignedCost >= delExistenceCost + 2 * delExtensionCost; + const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost, + insExistenceCost, insExtensionCost, + gapUnalignedCost); std::size_t maxSeq1begs[] = { 0, 9 }; std::size_t minSeq1ends[] = { 1, 0 }; @@ -88,14 +87,13 @@ int GappedXdropAligner::align2qual(const uchar *seq1, while (1) { int x = *x2; int y = *y1 - delExtensionCost; - int z = *z1 - delExtensionCost; + int z = *z1 - insExtensionCost; int b = maxValue(x, y, z); if (b >= minScore) { updateBest(bestScore, b, antidiagonal, x0, x0base); *x0 = b + scorer(*s1, *s2, *q1, *q2); - int g = b - delExistenceCost; - *y0 = maxValue(g, y); - *z0 = maxValue(g, z); + *y0 = maxValue(b - delExistenceCost, y); + *z0 = maxValue(b - insExistenceCost, z); } else *x0 = *y0 = *z0 = -INF; if (x0 == x0last) break; @@ -105,14 +103,13 @@ int GappedXdropAligner::align2qual(const uchar *seq1, while (1) { int x = *x2; int y = *y1 - delExtensionCost; - int z = *z1 - delExtensionCost; + int z = *z1 - insExtensionCost; int b = maxValue(x, y, z); if (b >= minScore) { updateBest(bestScore, b, antidiagonal, x0, x0base); *x0 = b + scorer(*s1, *s2, *q1, *q2); - int g = b - delExistenceCost; - *y0 = maxValue(g, y); - *z0 = maxValue(g, z); + *y0 = maxValue(b - delExistenceCost, y); + *z0 = maxValue(b - insExistenceCost, z); } else *x0 = *y0 = *z0 = -INF; if (x0 == x0last) break; diff --git a/src/GappedXdropAlignerInl.hh b/src/GappedXdropAlignerInl.hh index 21f06c5..28ca280 100644 --- a/src/GappedXdropAlignerInl.hh +++ b/src/GappedXdropAlignerInl.hh @@ -59,6 +59,13 @@ T whichFrame(std::size_t antidiagonal, T frame0, T frame1, T frame2) { } } +inline bool isAffineGaps(int delExistenceCost, int delExtensionCost, + int insExistenceCost, int insExtensionCost, + int gapUnalignedCost) { + return gapUnalignedCost >= delExtensionCost + insExtensionCost + + std::max(delExistenceCost, insExistenceCost); +} + // The next two functions will stop the alignment at delimiters. But // this is not guaranteed if bestScore > INF / 2. We could avoid this // restriction by replacing -INF / 2 with bestScore - INF. diff --git a/src/GappedXdropAlignerPssm.cc b/src/GappedXdropAlignerPssm.cc index 6d776d3..470e025 100644 --- a/src/GappedXdropAlignerPssm.cc +++ b/src/GappedXdropAlignerPssm.cc @@ -16,10 +16,9 @@ int GappedXdropAligner::alignPssm(const uchar *seq, int gapUnalignedCost, int maxScoreDrop, int maxMatchScore) { - bool isAffine = - insExistenceCost == delExistenceCost && - insExtensionCost == delExtensionCost && - gapUnalignedCost >= delExistenceCost + 2 * delExtensionCost; + const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost, + insExistenceCost, insExtensionCost, + gapUnalignedCost); std::size_t maxSeq1begs[] = { 0, 9 }; std::size_t minSeq1ends[] = { 1, 0 }; @@ -78,14 +77,13 @@ int GappedXdropAligner::alignPssm(const uchar *seq, while (1) { int x = *x2; int y = *y1 - delExtensionCost; - int z = *z1 - delExtensionCost; + int z = *z1 - insExtensionCost; int b = maxValue(x, y, z); if (b >= minScore) { updateBest(bestScore, b, antidiagonal, x0, x0base); *x0 = b + (*s2)[*s1]; - int g = b - delExistenceCost; - *y0 = maxValue(g, y); - *z0 = maxValue(g, z); + *y0 = maxValue(b - delExistenceCost, y); + *z0 = maxValue(b - insExistenceCost, z); } else *x0 = *y0 = *z0 = -INF; if (x0 == x0last) break; @@ -95,14 +93,13 @@ int GappedXdropAligner::alignPssm(const uchar *seq, while (1) { int x = *x2; int y = *y1 - delExtensionCost; - int z = *z1 - delExtensionCost; + int z = *z1 - insExtensionCost; int b = maxValue(x, y, z); if (b >= minScore) { updateBest(bestScore, b, antidiagonal, x0, x0base); *x0 = b + (*s2)[*s1]; - int g = b - delExistenceCost; - *y0 = maxValue(g, y); - *z0 = maxValue(g, z); + *y0 = maxValue(b - delExistenceCost, y); + *z0 = maxValue(b - insExistenceCost, z); } else *x0 = *y0 = *z0 = -INF; if (x0 == x0last) break; diff --git a/src/GeneralizedAffineGapCosts.cc b/src/GeneralizedAffineGapCosts.cc index f5fdd32..070a0ab 100644 --- a/src/GeneralizedAffineGapCosts.cc +++ b/src/GeneralizedAffineGapCosts.cc @@ -1,7 +1,6 @@ // Copyright 2008, 2009, 2012 Martin C. Frith #include "GeneralizedAffineGapCosts.hh" -#include <algorithm> namespace cbrc{ diff --git a/src/GeneralizedAffineGapCosts.hh b/src/GeneralizedAffineGapCosts.hh index 6902718..8c8f273 100644 --- a/src/GeneralizedAffineGapCosts.hh +++ b/src/GeneralizedAffineGapCosts.hh @@ -19,6 +19,8 @@ #ifndef GENERALIZEDAFFINEGAPCOSTS_HH #define GENERALIZEDAFFINEGAPCOSTS_HH +#include <algorithm> + namespace cbrc{ struct GeneralizedAffineGapCosts{ @@ -36,8 +38,9 @@ struct GeneralizedAffineGapCosts{ { return insExist == delExist && insExtend == delExtend; } // Will standard affine gaps always suffice for maximal alignment scores? - bool isAffine() const - { return isSymmetric() && pairExtend >= delExist + 2 * delExtend; } + bool isAffine() const{ + return pairExtend >= std::max(delExist, insExist) + delExtend + insExtend; + } // Return the score of a gap with the given sizes in a pair of // sequences, considering that it might be either one "generalized" diff --git a/src/LastalArguments.cc b/src/LastalArguments.cc index 0a16f11..709ce9d 100644 --- a/src/LastalArguments.cc +++ b/src/LastalArguments.cc @@ -405,7 +405,7 @@ void LastalArguments::setDefaultsFromAlphabet( bool isDna, bool isProtein, int refTantanSetting, bool isCaseSensitiveSeeds, bool isVolumes, - indexT refMinimizerWindow, + size_t refMinimizerWindow, unsigned realNumOfThreads ){ if( strand < 0 ) strand = (isDna || isTranslated()) ? 2 : 1; diff --git a/src/LastalArguments.hh b/src/LastalArguments.hh index 9f03b33..708f999 100644 --- a/src/LastalArguments.hh +++ b/src/LastalArguments.hh @@ -14,8 +14,6 @@ namespace cbrc{ struct LastalArguments{ - typedef unsigned indexT; - // set the parameters to their default values: LastalArguments(); @@ -38,7 +36,7 @@ struct LastalArguments{ double numLettersInReference, bool isKeepRefLowercase, int refTantanSetting, bool isCaseSensitiveSeeds, bool isVolumes, - indexT refMinimizerWindow, + size_t refMinimizerWindow, unsigned realNumOfThreads ); void setDefaultsFromMatrix( double lambda, int minScore ); @@ -85,17 +83,17 @@ struct LastalArguments{ int maxDropGapless; int maxDropFinal; sequenceFormat::Enum inputFormat; - indexT minHitDepth; - indexT maxHitDepth; - indexT oneHitMultiplicity; - indexT maxGaplessAlignmentsPerQueryPosition; + size_t minHitDepth; + size_t maxHitDepth; + size_t oneHitMultiplicity; + size_t maxGaplessAlignmentsPerQueryPosition; size_t cullingLimitForGaplessAlignments; size_t cullingLimitForFinalAlignments; - indexT queryStep; - indexT minimizerWindow; - indexT batchSize; // approx size of query sequences to scan in 1 batch + size_t queryStep; + size_t minimizerWindow; + size_t batchSize; // approx size of query sequences to scan in 1 batch unsigned numOfThreads; - indexT maxRepeatDistance; // suppress repeats <= this distance apart + size_t maxRepeatDistance; // suppress repeats <= this distance apart double temperature; // probability = exp( score / temperature ) / Z double gamma; // parameter for gamma-centroid alignment std::string geneticCodeFile; diff --git a/src/LastdbArguments.cc b/src/LastdbArguments.cc index 274d248..7393685 100644 --- a/src/LastdbArguments.cc +++ b/src/LastdbArguments.cc @@ -38,7 +38,7 @@ LastdbArguments::LastdbArguments() : subsetSeedFile(""), userAlphabet(""), minSeedLimit(0), - bucketDepth(indexT(-1)), // means: use the default (adapts to the data) + bucketDepth(-1), // means: use the default (adapts to the data) childTableType(0), isCountsOnly(false), verbosity(0), diff --git a/src/LastdbArguments.hh b/src/LastdbArguments.hh index 73018af..4eb25db 100644 --- a/src/LastdbArguments.hh +++ b/src/LastdbArguments.hh @@ -14,8 +14,6 @@ namespace cbrc{ struct LastdbArguments{ - typedef unsigned indexT; - // set the parameters to their default values: LastdbArguments(); @@ -36,14 +34,14 @@ struct LastdbArguments{ int tantanSetting; bool isCaseSensitive; std::vector< std::string > seedPatterns; - size_t volumeSize; // type? - indexT indexStep; - indexT minimizerWindow; + size_t volumeSize; + size_t indexStep; + size_t minimizerWindow; unsigned numOfThreads; std::string subsetSeedFile; std::string userAlphabet; - indexT minSeedLimit; - indexT bucketDepth; + size_t minSeedLimit; + unsigned bucketDepth; int childTableType; bool isCountsOnly; int verbosity; diff --git a/src/MultiSequence.cc b/src/MultiSequence.cc index df27f2e..f9b0077 100644 --- a/src/MultiSequence.cc +++ b/src/MultiSequence.cc @@ -5,7 +5,6 @@ #include <sstream> #include <algorithm> // upper_bound #include <cassert> -#include <cctype> // isspace #include <iterator> // istreambuf_iterator using namespace cbrc; @@ -91,8 +90,8 @@ MultiSequence::appendFromFasta( std::istream& stream, indexT maxSeqLen ){ std::istreambuf_iterator<char> endpos; while( inpos != endpos ){ uchar c = *inpos; - if( c == '>' ) break; // we have hit the next FASTA sequence - if( !std::isspace(c) ){ + if( c > ' ' ){ // faster than isspace + if( c == '>' ) break; // we have hit the next FASTA sequence if( seq.v.size() >= maxSeqLen ) break; seq.v.push_back(c); } diff --git a/src/MultiSequence.hh b/src/MultiSequence.hh index 0c8dab6..df388dc 100644 --- a/src/MultiSequence.hh +++ b/src/MultiSequence.hh @@ -19,7 +19,7 @@ namespace cbrc{ class MultiSequence{ public: - typedef unsigned indexT; + typedef LAST_INT_TYPE indexT; typedef unsigned char uchar; // initialize with leftmost delimiter pad, ready for appending sequences diff --git a/src/SegmentPair.hh b/src/SegmentPair.hh index 97d6357..8b0fb18 100644 --- a/src/SegmentPair.hh +++ b/src/SegmentPair.hh @@ -6,10 +6,12 @@ #ifndef SEGMENT_PAIR_HH #define SEGMENT_PAIR_HH +#include <stddef.h> + namespace cbrc{ struct SegmentPair{ - typedef unsigned indexT; + typedef LAST_INT_TYPE indexT; typedef unsigned char uchar; SegmentPair(){} diff --git a/src/SegmentPairPot.cc b/src/SegmentPairPot.cc index 58328b6..5c7e34d 100644 --- a/src/SegmentPairPot.cc +++ b/src/SegmentPairPot.cc @@ -113,7 +113,7 @@ void SegmentPairPot::markAllOverlaps( const std::vector<SegmentPair>& sps ){ } void SegmentPairPot::markTandemRepeats( const SegmentPair& sp, - indexT maxDistance ){ + size_t maxDistance ){ assert( !items.empty() ); // Careful: if we are self-comparing lots of short sequences, there diff --git a/src/SegmentPairPot.hh b/src/SegmentPairPot.hh index a8670bf..9890eb9 100644 --- a/src/SegmentPairPot.hh +++ b/src/SegmentPairPot.hh @@ -51,7 +51,7 @@ struct SegmentPairPot{ // mark (near-)tandem repeats within sp // to avoid death by dynamic programming when self-aligning a large sequence - void markTandemRepeats( const SegmentPair& sp, indexT maxDistance ); + void markTandemRepeats( const SegmentPair& sp, size_t maxDistance ); // data: std::vector<SegmentPair> items; diff --git a/src/SubsetSuffixArray.cc b/src/SubsetSuffixArray.cc index 880c36c..009494c 100644 --- a/src/SubsetSuffixArray.cc +++ b/src/SubsetSuffixArray.cc @@ -10,7 +10,7 @@ using namespace cbrc; void SubsetSuffixArray::addPositions(const uchar* text, indexT beg, indexT end, - indexT step, indexT minimizerWindow) { + size_t step, size_t minimizerWindow) { if (beg >= end) return; assert(step > 0); const uchar *subsetMap = seed.firstMap(); @@ -115,7 +115,7 @@ void SubsetSuffixArray::toFiles( const std::string& baseName, memoryToBinaryFile( chibiTable.begin(), chibiTable.end(), fileName ); } -void SubsetSuffixArray::makeBuckets( const uchar* text, indexT bucketDepth ){ +void SubsetSuffixArray::makeBuckets( const uchar* text, unsigned bucketDepth ){ if( bucketDepth+1 == 0 ) bucketDepth = defaultBucketDepth(); makeBucketSteps( bucketDepth ); @@ -126,7 +126,7 @@ void SubsetSuffixArray::makeBuckets( const uchar* text, indexT bucketDepth ){ const uchar* textPtr = text + suffixArray[i]; const uchar* subsetMap = seed.firstMap(); indexT bucketIndex = 0; - indexT depth = 0; + unsigned depth = 0; while( depth < bucketDepth ){ uchar subset = subsetMap[ *textPtr ]; @@ -136,8 +136,7 @@ void SubsetSuffixArray::makeBuckets( const uchar* text, indexT bucketDepth ){ } ++textPtr; ++depth; - indexT step = bucketSteps[depth]; - bucketIndex += subset * step; + bucketIndex += subset * bucketSteps[depth]; subsetMap = seed.nextMap( subsetMap ); } @@ -159,11 +158,11 @@ void SubsetSuffixArray::makeBucketSteps( indexT bucketDepth ){ } } -SubsetSuffixArray::indexT SubsetSuffixArray::defaultBucketDepth(){ +unsigned SubsetSuffixArray::defaultBucketDepth(){ indexT maxBucketEntries = suffixArray.size() / 4; - indexT bucketDepth = 0; indexT kmerEntries = 1; indexT bucketEntries = 1; + unsigned bucketDepth = 0; while(true){ indexT nextSubsetCount = seed.subsetCount(bucketDepth); diff --git a/src/SubsetSuffixArray.hh b/src/SubsetSuffixArray.hh index c77c858..de1422e 100644 --- a/src/SubsetSuffixArray.hh +++ b/src/SubsetSuffixArray.hh @@ -30,7 +30,7 @@ namespace cbrc{ class SubsetSuffixArray{ public: - typedef unsigned indexT; + typedef LAST_INT_TYPE indexT; CyclicSubsetSeed& getSeed() { return seed; } const CyclicSubsetSeed& getSeed() const { return seed; } @@ -41,17 +41,17 @@ public: // If minimizerWindow > 1 then the positions are added only if they // are "minimizers" for the given window and seed pattern. void addPositions( const uchar* text, indexT beg, indexT end, - indexT step, indexT minimizerWindow ); + size_t step, size_t minimizerWindow ); // Sort the suffix array (but don't make the buckets). void sortIndex( const uchar* text, - indexT maxUnsortedInterval, int childTableType ); + size_t maxUnsortedInterval, int childTableType ); // Make the buckets. If bucketDepth+1 == 0, then a default // bucketDepth is used. The default is: the maximum possible // bucketDepth such that the number of bucket entries is at most 1/4 // the number of suffix array entries. - void makeBuckets( const uchar* text, indexT bucketDepth ); + void makeBuckets( const uchar* text, unsigned bucketDepth ); void fromFiles( const std::string& baseName, bool isMaskLowercase, const uchar letterCode[] ); @@ -66,13 +66,13 @@ public: // via begPtr and endPtr. void match( const indexT*& begPtr, const indexT*& endPtr, const uchar* queryPtr, const uchar* text, - indexT maxHits, indexT minDepth, indexT maxDepth ) const; + size_t maxHits, size_t minDepth, size_t maxDepth ) const; // Count matches of all sizes (up to maxDepth), starting at the // given position in the query. void countMatches( std::vector<unsigned long long>& counts, const uchar* queryPtr, const uchar* text, - indexT maxDepth ) const; + size_t maxDepth ) const; private: CyclicSubsetSeed seed; @@ -113,9 +113,9 @@ private: const uchar* textBase, const uchar* subsetMap ) const; // Return the maximum prefix size covered by the buckets. - indexT maxBucketPrefix() const { return bucketSteps.size() - 1; } + size_t maxBucketPrefix() const { return bucketSteps.size() - 1; } - indexT defaultBucketDepth(); + unsigned defaultBucketDepth(); void makeBucketSteps( indexT bucketDepth ); diff --git a/src/SubsetSuffixArraySearch.cc b/src/SubsetSuffixArraySearch.cc index 2c0efff..a505030 100644 --- a/src/SubsetSuffixArraySearch.cc +++ b/src/SubsetSuffixArraySearch.cc @@ -9,17 +9,17 @@ using namespace cbrc; // could & probably should return the match depth void SubsetSuffixArray::match( const indexT*& begPtr, const indexT*& endPtr, const uchar* queryPtr, const uchar* text, - indexT maxHits, - indexT minDepth, indexT maxDepth ) const{ + size_t maxHits, + size_t minDepth, size_t maxDepth ) const{ // the next line is unnecessary, but makes it faster in some cases: if( maxHits == 0 && minDepth < maxDepth ) minDepth = maxDepth; - indexT depth = 0; + size_t depth = 0; const uchar* subsetMap = seed.firstMap(); // match using buckets: - indexT bucketDepth = maxBucketPrefix(); - indexT startDepth = std::min( bucketDepth, maxDepth ); + size_t bucketDepth = maxBucketPrefix(); + size_t startDepth = std::min( bucketDepth, maxDepth ); const indexT* bucketPtr = &buckets[0]; while( depth < startDepth ){ @@ -51,7 +51,7 @@ void SubsetSuffixArray::match( const indexT*& begPtr, const indexT*& endPtr, // match using binary search: if( depth < minDepth ){ - indexT d = depth; + size_t d = depth; const uchar* s = subsetMap; while( depth < minDepth ){ uchar subset = subsetMap[ queryPtr[depth] ]; @@ -85,12 +85,12 @@ void SubsetSuffixArray::match( const indexT*& begPtr, const indexT*& endPtr, void SubsetSuffixArray::countMatches( std::vector<unsigned long long>& counts, const uchar* queryPtr, const uchar* text, - indexT maxDepth ) const{ - indexT depth = 0; + size_t maxDepth ) const{ + size_t depth = 0; const uchar* subsetMap = seed.firstMap(); // match using buckets: - indexT bucketDepth = maxBucketPrefix(); + size_t bucketDepth = maxBucketPrefix(); const indexT* bucketPtr = &buckets[0]; indexT beg = 0; indexT end = suffixArray.size(); diff --git a/src/SubsetSuffixArraySort.cc b/src/SubsetSuffixArraySort.cc index e460b8c..5fe3a6c 100644 --- a/src/SubsetSuffixArraySort.cc +++ b/src/SubsetSuffixArraySort.cc @@ -307,7 +307,7 @@ void SubsetSuffixArray::radixSortN( const uchar* text, const uchar* subsetMap, } void SubsetSuffixArray::sortIndex( const uchar* text, - indexT maxUnsortedInterval, + size_t maxUnsortedInterval, int childTableType ){ const indexT minLength = 1; @@ -324,18 +324,19 @@ void SubsetSuffixArray::sortIndex( const uchar* text, indexT depth; POP( beg, end, depth ); - if( end - beg <= maxUnsortedInterval && depth >= minLength ) continue; + size_t interval = end - beg; + if( interval <= maxUnsortedInterval && depth >= minLength ) continue; const uchar* textBase = text + depth; const uchar* subsetMap = seed.subsetMap(depth); if( childTableType == 0 ){ - if( end - beg < 10 ){ // ??? + if( interval < 10 ){ // ??? insertionSort( textBase, seed, beg, end, subsetMap ); continue; } }else{ - if( end - beg == 2 ){ + if( interval == 2 ){ sort2( textBase, beg, subsetMap ); continue; } diff --git a/src/lastal.cc b/src/lastal.cc index 3219c3e..467fad7 100644 --- a/src/lastal.cc +++ b/src/lastal.cc @@ -292,12 +292,13 @@ void calculateScoreStatistics( const std::string& matrixName, // Read the .prj file for the whole database void readOuterPrj( const std::string& fileName, unsigned& volumes, - indexT& refMinimizerWindow, indexT& minSeedLimit, + size_t& refMinimizerWindow, size_t& minSeedLimit, bool& isKeepRefLowercase, int& refTantanSetting, countT& refSequences, countT& refLetters ){ std::ifstream f( fileName.c_str() ); if( !f ) ERR( "can't open file: " + fileName ); unsigned version = 0; + size_t fileBitsPerInt = 32; std::string trigger = "#lastal"; std::string line, word; @@ -320,6 +321,7 @@ void readOuterPrj( const std::string& fileName, unsigned& volumes, if( word == "minimizerwindow" ) iss >> refMinimizerWindow; if( word == "volumes" ) iss >> volumes; if( word == "numofindexes" ) iss >> numOfIndexes; + if( word == "integersize" ) iss >> fileBitsPerInt; } if( f.eof() && !f.bad() ) f.clear(); @@ -331,6 +333,12 @@ void readOuterPrj( const std::string& fileName, unsigned& volumes, if( !f ) ERR( "can't read file: " + fileName ); if( version < 294 && version > 0) ERR( "the lastdb files are old: please re-run lastdb" ); + + if( fileBitsPerInt != sizeof(indexT) * CHAR_BIT ){ + if( fileBitsPerInt == 32 ) ERR( "please use lastal for " + fileName ); + if( fileBitsPerInt == 64 ) ERR( "please use lastal8 for " + fileName ); + ERR( "weird integersize in " + fileName ); + } } // Read a per-volume .prj file, with info about a database volume @@ -360,7 +368,7 @@ void writeCounts( std::ostream& out ){ for( indexT i = 0; i < matchCounts.size(); ++i ){ out << query.seqName(i) << '\n'; - for( indexT j = args.minHitDepth; j < matchCounts[i].size(); ++j ){ + for( size_t j = args.minHitDepth; j < matchCounts[i].size(); ++j ){ out << j << '\t' << matchCounts[i][j] << '\n'; } @@ -370,12 +378,12 @@ void writeCounts( std::ostream& out ){ // Count all matches, of all sizes, of a query sequence against a suffix array void countMatches( size_t queryNum, const uchar* querySeq ){ - indexT loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum); - indexT loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum); + size_t loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum); + size_t loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum); if( args.minHitDepth > 1 ) loopEnd -= std::min( args.minHitDepth - 1, loopEnd ); - for( indexT i = loopBeg; i < loopEnd; i += args.queryStep ){ + for( size_t i = loopBeg; i < loopEnd; i += args.queryStep ){ for( unsigned x = 0; x < numOfIndexes; ++x ) suffixArrays[x].countMatches( matchCounts[queryNum], querySeq + i, text.seqReader(), args.maxHitDepth ); @@ -545,8 +553,8 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns, DiagonalTable dt; // record already-covered positions on each diagonal countT matchCount = 0, gaplessExtensionCount = 0, gaplessAlignmentCount = 0; - indexT loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum); - indexT loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum); + size_t loopBeg = query.seqBeg(queryNum) - query.padBeg(queryNum); + size_t loopEnd = query.seqEnd(queryNum) - query.padBeg(queryNum); if( args.minHitDepth > 1 ) loopEnd -= std::min( args.minHitDepth - 1, loopEnd ); @@ -571,7 +579,7 @@ void alignGapless( LastAligner& aligner, SegmentPairPot& gaplessAlns, // increase "i" to the delimiter position. This gave a speed-up // of only 3%, with 34-nt tags. - indexT gaplessAlignmentsPerQueryPosition = 0; + size_t gaplessAlignmentsPerQueryPosition = 0; for( /* noop */; beg < end; ++beg ){ // loop over suffix-array matches if( gaplessAlignmentsPerQueryPosition == @@ -639,7 +647,7 @@ void alignGapped( LastAligner& aligner, size_t queryNum, char strand, const uchar* querySeq, Phase::Enum phase ){ Dispatcher dis( phase, aligner, queryNum, strand, querySeq ); - indexT frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0; + size_t frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0; countT gappedExtensionCount = 0, gappedAlignmentCount = 0; // Redo the gapless extensions, using gapped score parameters. @@ -714,7 +722,7 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns, size_t queryNum, char strand, const uchar* querySeq ){ Centroid& centroid = aligner.centroid; Dispatcher dis( Phase::final, aligner, queryNum, strand, querySeq ); - indexT frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0; + size_t frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0; if( args.outputType > 3 ){ if( dis.p ){ @@ -751,7 +759,7 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns, static void eraseWeakAlignments(LastAligner &aligner, AlignmentPot &gappedAlns, size_t queryNum, char strand, const uchar *querySeq) { - indexT frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0; + size_t frameSize = args.isFrameshift() ? (query.padLen(queryNum) / 3) : 0; Dispatcher dis(Phase::gapless, aligner, queryNum, strand, querySeq); for (size_t i = 0; i < gappedAlns.size(); ++i) { Alignment &a = gappedAlns.items[i]; @@ -1146,8 +1154,8 @@ void lastal( int argc, char** argv ){ args.resetCumulativeOptions(); // because we will do fromArgs again unsigned volumes = unsigned(-1); - indexT refMinimizerWindow = 1; // assume this value, if not specified - indexT minSeedLimit = 0; + size_t refMinimizerWindow = 1; // assume this value, if not specified + size_t minSeedLimit = 0; countT refSequences = -1; countT refLetters = -1; bool isKeepRefLowercase = true; diff --git a/src/lastdb.cc b/src/lastdb.cc index 57cf321..085d9d3 100644 --- a/src/lastdb.cc +++ b/src/lastdb.cc @@ -38,7 +38,7 @@ bool isDubiousDna( const Alphabet& alph, const MultiSequence& multi ){ const uchar* seq = multi.seqReader() + multi.seqBeg(0); unsigned dnaCount = 0; - for( indexT i = 0; i < 100; ++i ){ // look at the first 100 letters + for( unsigned i = 0; i < 100; ++i ){ // look at the first 100 letters uchar c = alph.numbersToUppercase[ seq[i] ]; if( c == alph.size ) return false; // we hit the end of the sequence early if( c < alph.size || c == alph.encode[ (uchar)'N' ] ) ++dnaCount; @@ -140,6 +140,7 @@ void writePrjFile( const std::string& fileName, const LastdbArguments& args, else{ f << "numofindexes=" << numOfIndexes << '\n'; } + f << "integersize=" << (sizeof(indexT) * CHAR_BIT) << '\n'; writeLastalOptions( f, seedText ); } diff --git a/src/makefile b/src/makefile index efbfb2c..cddf3b7 100644 --- a/src/makefile +++ b/src/makefile @@ -8,51 +8,78 @@ CXXFLAGS = -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum -Wundef \ CFLAGS = -Wall -O2 -DBOBJ = Alphabet.o MultiSequence.o CyclicSubsetSeed.o \ -SubsetSuffixArray.o LastdbArguments.o io.o fileMap.o TantanMasker.o \ -ScoreMatrix.o SubsetMinimizerFinder.o LambdaCalculator.o tantan.o \ -SubsetSuffixArraySort.o MultiSequenceQual.o lastdb.o - -ALOBJ = Alphabet.o MultiSequence.o CyclicSubsetSeed.o \ -SubsetSuffixArray.o LastalArguments.o io.o fileMap.o TantanMasker.o \ -ScoreMatrix.o SubsetMinimizerFinder.o tantan.o DiagonalTable.o \ -SegmentPair.o Alignment.o GappedXdropAligner.o SegmentPairPot.o \ -AlignmentPot.o GeneralizedAffineGapCosts.o Centroid.o \ -LambdaCalculator.o TwoQualityScoreMatrix.o OneQualityScoreMatrix.o \ -QualityPssmMaker.o GeneticCode.o LastEvaluer.o GreedyXdropAligner.o \ -gaplessXdrop.o gaplessPssmXdrop.o gaplessTwoQualityXdrop.o \ -SubsetSuffixArraySearch.o AlignmentWrite.o MultiSequenceQual.o \ +alpObj = alp/sls_alignment_evaluer.o alp/sls_pvalues.o \ +alp/sls_alp_sim.o alp/sls_alp_regression.o alp/sls_alp_data.o \ +alp/sls_alp.o alp/sls_basic.o alp/njn_localmaxstatmatrix.o \ +alp/njn_localmaxstat.o alp/njn_localmaxstatutil.o \ +alp/njn_dynprogprob.o alp/njn_dynprogprobproto.o \ +alp/njn_dynprogproblim.o alp/njn_ioutil.o alp/njn_random.o \ +alp/sls_falp_alignment_evaluer.o alp/sls_fsa1_pvalues.o \ +alp/sls_fsa1_utils.o alp/sls_fsa1.o alp/sls_fsa1_parameters.o + +indexObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o \ +ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o io.o \ +tantan.o LastdbArguments.o + +indexObj4 = MultiSequence.o MultiSequenceQual.o SubsetSuffixArray.o \ +SubsetSuffixArraySort.o lastdb.o + +alignObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o \ +ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o io.o \ +tantan.o LastalArguments.o GappedXdropAligner.o \ GappedXdropAlignerPssm.o GappedXdropAligner2qual.o \ -GappedXdropAligner3frame.o lastal.o alp/sls_alignment_evaluer.o \ -alp/sls_pvalues.o alp/sls_alp_sim.o alp/sls_alp_regression.o \ -alp/sls_alp_data.o alp/sls_alp.o alp/sls_basic.o \ -alp/njn_localmaxstatmatrix.o alp/njn_localmaxstat.o \ -alp/njn_localmaxstatutil.o alp/njn_dynprogprob.o \ -alp/njn_dynprogprobproto.o alp/njn_dynprogproblim.o alp/njn_ioutil.o \ -alp/njn_random.o alp/sls_falp_alignment_evaluer.o \ -alp/sls_fsa1_pvalues.o alp/sls_fsa1_utils.o alp/sls_fsa1.o \ -alp/sls_fsa1_parameters.o - -SPOBJ = Alphabet.o MultiSequence.o fileMap.o split/cbrc_linalg.o \ -split/last-split.o split/cbrc_split_aligner.o split/last-split-main.o \ -split/cbrc_unsplit_alignment.o +GappedXdropAligner3frame.o GeneralizedAffineGapCosts.o GeneticCode.o \ +GreedyXdropAligner.o LastEvaluer.o OneQualityScoreMatrix.o \ +QualityPssmMaker.o TwoQualityScoreMatrix.o gaplessXdrop.o \ +gaplessPssmXdrop.o gaplessTwoQualityXdrop.o $(alpObj) + +alignObj4 = Alignment.o AlignmentPot.o AlignmentWrite.o Centroid.o \ +DiagonalTable.o MultiSequence.o MultiSequenceQual.o SegmentPair.o \ +SegmentPairPot.o SubsetSuffixArray.o SubsetSuffixArraySearch.o \ +lastal.o + +splitObj0 = Alphabet.o fileMap.o split/cbrc_linalg.o \ +split/cbrc_unsplit_alignment.o split/last-split-main.o + +splitObj4 = MultiSequence.o split/cbrc_split_aligner.o \ +split/last-split.o PPOBJ = last-pair-probs.o last-pair-probs-main.o io.o MBOBJ = last-merge-batches.o -ALL = lastdb lastal last-split last-merge-batches last-pair-probs +ALL = lastdb lastal last-split last-merge-batches last-pair-probs \ +lastdb8 lastal8 last-split8 + +indexObj8 = $(indexObj4:.o=.o8) +alignObj8 = $(alignObj4:.o=.o8) +splitObj8 = $(splitObj4:.o=.o8) all: $(ALL) -lastdb: $(DBOBJ) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(DBOBJ) +indexAllObj4 = $(indexObj0) $(indexObj4) +lastdb: $(indexAllObj4) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4) + +indexAllObj8 = $(indexObj0) $(indexObj8) +lastdb8: $(indexAllObj8) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8) + +alignAllObj4 = $(alignObj0) $(alignObj4) +lastal: $(alignAllObj4) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4) -lastal: $(ALOBJ) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(ALOBJ) +alignAllObj8 = $(alignObj0) $(alignObj8) +lastal8: $(alignAllObj8) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8) -last-split: $(SPOBJ) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(SPOBJ) +splitAllObj4 = $(splitObj0) $(splitObj4) +last-split: $(splitAllObj4) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj4) + +splitAllObj8 = $(splitObj0) $(splitObj8) +last-split8: $(splitAllObj8) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj8) last-pair-probs: $(PPOBJ) $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ) @@ -61,19 +88,22 @@ last-merge-batches: $(MBOBJ) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(MBOBJ) .SUFFIXES: -.SUFFIXES: .o .c .cc .cpp +.SUFFIXES: .o .c .cc .cpp .o8 .c.o: $(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $< .cc.o: - $(CXX) $(CPPFLAGS) $(CXXFLAGS) -I. -c -o $@ $< + $(CXX) -DLAST_INT_TYPE=unsigned $(CPPFLAGS) $(CXXFLAGS) -I. -c -o $@ $< + +.cc.o8: + $(CXX) -DLAST_INT_TYPE=size_t $(CPPFLAGS) $(CXXFLAGS) -I. -c -o $@ $< .cpp.o: $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< clean: - rm -f $(ALL) *.o */*.o + rm -f $(ALL) *.o* */*.o* CyclicSubsetSeedData.hh: ../data/*.seed ../build/seed-inc.sh ../data/*.seed > $@ @@ -95,96 +125,98 @@ version.hh: FORCE FORCE: +fix8 = sed 's/.*\.o/& &8/' + depend: sed '/[m][v]/q' makefile > m - $(CXX) -MM *.cc >> m + $(CXX) -MM *.cc | $(fix8) >> m $(CC) -MM *.c >> m $(CXX) -MM alp/*.cpp | sed 's|.*:|alp/&|' >> m - $(CXX) -MM -I. split/*.cc | sed 's|.*:|split/&|' >> m + $(CXX) -MM -I. split/*.cc | sed 's|.*:|split/&|' | $(fix8) >> m mv m makefile -Alignment.o: Alignment.cc Alignment.hh ScoreMatrixRow.hh SegmentPair.hh \ +Alignment.o Alignment.o8: Alignment.cc Alignment.hh ScoreMatrixRow.hh SegmentPair.hh \ Alphabet.hh Centroid.hh GappedXdropAligner.hh \ GeneralizedAffineGapCosts.hh OneQualityScoreMatrix.hh GeneticCode.hh \ GreedyXdropAligner.hh TwoQualityScoreMatrix.hh -AlignmentPot.o: AlignmentPot.cc AlignmentPot.hh Alignment.hh \ +AlignmentPot.o AlignmentPot.o8: AlignmentPot.cc AlignmentPot.hh Alignment.hh \ ScoreMatrixRow.hh SegmentPair.hh -AlignmentWrite.o: AlignmentWrite.cc Alignment.hh ScoreMatrixRow.hh \ +AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh ScoreMatrixRow.hh \ SegmentPair.hh GeneticCode.hh LastEvaluer.hh \ alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \ alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \ MultiSequence.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \ Alphabet.hh -Alphabet.o: Alphabet.cc Alphabet.hh -Centroid.o: Centroid.cc Centroid.hh GappedXdropAligner.hh \ +Alphabet.o Alphabet.o8: Alphabet.cc Alphabet.hh +Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \ ScoreMatrixRow.hh GeneralizedAffineGapCosts.hh SegmentPair.hh \ OneQualityScoreMatrix.hh GappedXdropAlignerInl.hh -CyclicSubsetSeed.o: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \ +CyclicSubsetSeed.o CyclicSubsetSeed.o8: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \ CyclicSubsetSeedData.hh io.hh stringify.hh -DiagonalTable.o: DiagonalTable.cc DiagonalTable.hh -GappedXdropAligner.o: GappedXdropAligner.cc GappedXdropAligner.hh \ +DiagonalTable.o DiagonalTable.o8: DiagonalTable.cc DiagonalTable.hh +GappedXdropAligner.o GappedXdropAligner.o8: GappedXdropAligner.cc GappedXdropAligner.hh \ ScoreMatrixRow.hh GappedXdropAlignerInl.hh -GappedXdropAligner2qual.o: GappedXdropAligner2qual.cc \ +GappedXdropAligner2qual.o GappedXdropAligner2qual.o8: GappedXdropAligner2qual.cc \ GappedXdropAligner.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh \ TwoQualityScoreMatrix.hh -GappedXdropAligner3frame.o: GappedXdropAligner3frame.cc \ +GappedXdropAligner3frame.o GappedXdropAligner3frame.o8: GappedXdropAligner3frame.cc \ GappedXdropAligner.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh -GappedXdropAligner3framePssm.o: GappedXdropAligner3framePssm.cc \ +GappedXdropAligner3framePssm.o GappedXdropAligner3framePssm.o8: GappedXdropAligner3framePssm.cc \ GappedXdropAligner.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh -GappedXdropAlignerPssm.o: GappedXdropAlignerPssm.cc GappedXdropAligner.hh \ +GappedXdropAlignerPssm.o GappedXdropAlignerPssm.o8: GappedXdropAlignerPssm.cc GappedXdropAligner.hh \ ScoreMatrixRow.hh GappedXdropAlignerInl.hh -GeneralizedAffineGapCosts.o: GeneralizedAffineGapCosts.cc \ +GeneralizedAffineGapCosts.o GeneralizedAffineGapCosts.o8: GeneralizedAffineGapCosts.cc \ GeneralizedAffineGapCosts.hh -GeneticCode.o: GeneticCode.cc GeneticCode.hh Alphabet.hh -GreedyXdropAligner.o: GreedyXdropAligner.cc GreedyXdropAligner.hh \ +GeneticCode.o GeneticCode.o8: GeneticCode.cc GeneticCode.hh Alphabet.hh +GreedyXdropAligner.o GreedyXdropAligner.o8: GreedyXdropAligner.cc GreedyXdropAligner.hh \ ScoreMatrixRow.hh -LambdaCalculator.o: LambdaCalculator.cc LambdaCalculator.hh -LastEvaluer.o: LastEvaluer.cc LastEvaluer.hh ScoreMatrixRow.hh \ +LambdaCalculator.o LambdaCalculator.o8: LambdaCalculator.cc LambdaCalculator.hh +LastEvaluer.o LastEvaluer.o8: LastEvaluer.cc LastEvaluer.hh ScoreMatrixRow.hh \ alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \ alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \ GeneticCode.hh -LastalArguments.o: LastalArguments.cc LastalArguments.hh \ +LastalArguments.o LastalArguments.o8: LastalArguments.cc LastalArguments.hh \ SequenceFormat.hh stringify.hh version.hh -LastdbArguments.o: LastdbArguments.cc LastdbArguments.hh \ +LastdbArguments.o LastdbArguments.o8: LastdbArguments.cc LastdbArguments.hh \ SequenceFormat.hh stringify.hh version.hh -MultiSequence.o: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \ +MultiSequence.o MultiSequence.o8: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \ VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh -MultiSequenceQual.o: MultiSequenceQual.cc MultiSequence.hh \ +MultiSequenceQual.o MultiSequenceQual.o8: MultiSequenceQual.cc MultiSequence.hh \ ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh -OneQualityScoreMatrix.o: OneQualityScoreMatrix.cc \ +OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \ OneQualityScoreMatrix.hh ScoreMatrixRow.hh qualityScoreUtil.hh \ stringify.hh -QualityPssmMaker.o: QualityPssmMaker.cc QualityPssmMaker.hh \ +QualityPssmMaker.o QualityPssmMaker.o8: QualityPssmMaker.cc QualityPssmMaker.hh \ ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh -ScoreMatrix.o: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh -SegmentPair.o: SegmentPair.cc SegmentPair.hh -SegmentPairPot.o: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh -SubsetMinimizerFinder.o: SubsetMinimizerFinder.cc \ +ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh +SegmentPair.o SegmentPair.o8: SegmentPair.cc SegmentPair.hh +SegmentPairPot.o SegmentPairPot.o8: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh +SubsetMinimizerFinder.o SubsetMinimizerFinder.o8: SubsetMinimizerFinder.cc \ SubsetMinimizerFinder.hh CyclicSubsetSeed.hh -SubsetSuffixArray.o: SubsetSuffixArray.cc SubsetSuffixArray.hh \ +SubsetSuffixArray.o SubsetSuffixArray.o8: SubsetSuffixArray.cc SubsetSuffixArray.hh \ CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \ SubsetMinimizerFinder.hh io.hh -SubsetSuffixArraySearch.o: SubsetSuffixArraySearch.cc \ +SubsetSuffixArraySearch.o SubsetSuffixArraySearch.o8: SubsetSuffixArraySearch.cc \ SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \ fileMap.hh stringify.hh -SubsetSuffixArraySort.o: SubsetSuffixArraySort.cc SubsetSuffixArray.hh \ +SubsetSuffixArraySort.o SubsetSuffixArraySort.o8: SubsetSuffixArraySort.cc SubsetSuffixArray.hh \ CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh -TantanMasker.o: TantanMasker.cc TantanMasker.hh ScoreMatrixRow.hh \ +TantanMasker.o TantanMasker.o8: TantanMasker.cc TantanMasker.hh ScoreMatrixRow.hh \ tantan.hh ScoreMatrix.hh LambdaCalculator.hh -TwoQualityScoreMatrix.o: TwoQualityScoreMatrix.cc \ +TwoQualityScoreMatrix.o TwoQualityScoreMatrix.o8: TwoQualityScoreMatrix.cc \ TwoQualityScoreMatrix.hh ScoreMatrixRow.hh qualityScoreUtil.hh \ stringify.hh -fileMap.o: fileMap.cc fileMap.hh stringify.hh -gaplessPssmXdrop.o: gaplessPssmXdrop.cc gaplessPssmXdrop.hh \ +fileMap.o fileMap.o8: fileMap.cc fileMap.hh stringify.hh +gaplessPssmXdrop.o gaplessPssmXdrop.o8: gaplessPssmXdrop.cc gaplessPssmXdrop.hh \ ScoreMatrixRow.hh -gaplessTwoQualityXdrop.o: gaplessTwoQualityXdrop.cc \ +gaplessTwoQualityXdrop.o gaplessTwoQualityXdrop.o8: gaplessTwoQualityXdrop.cc \ gaplessTwoQualityXdrop.hh TwoQualityScoreMatrix.hh ScoreMatrixRow.hh -gaplessXdrop.o: gaplessXdrop.cc gaplessXdrop.hh ScoreMatrixRow.hh -io.o: io.cc io.hh -last-pair-probs-main.o: last-pair-probs-main.cc last-pair-probs.hh \ +gaplessXdrop.o gaplessXdrop.o8: gaplessXdrop.cc gaplessXdrop.hh ScoreMatrixRow.hh +io.o io.o8: io.cc io.hh +last-pair-probs-main.o last-pair-probs-main.o8: last-pair-probs-main.cc last-pair-probs.hh \ stringify.hh version.hh -last-pair-probs.o: last-pair-probs.cc last-pair-probs.hh io.hh \ +last-pair-probs.o last-pair-probs.o8: last-pair-probs.cc last-pair-probs.hh io.hh \ stringify.hh -lastal.o: lastal.cc LastalArguments.hh SequenceFormat.hh \ +lastal.o lastal.o8: lastal.cc LastalArguments.hh SequenceFormat.hh \ QualityPssmMaker.hh ScoreMatrixRow.hh OneQualityScoreMatrix.hh \ TwoQualityScoreMatrix.hh qualityScoreUtil.hh stringify.hh \ LambdaCalculator.hh LastEvaluer.hh alp/sls_alignment_evaluer.hpp \ @@ -197,12 +229,12 @@ lastal.o: lastal.cc LastalArguments.hh SequenceFormat.hh \ TantanMasker.hh tantan.hh DiagonalTable.hh GreedyXdropAligner.hh \ gaplessXdrop.hh gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh io.hh \ threadUtil.hh version.hh -lastdb.o: lastdb.cc LastdbArguments.hh SequenceFormat.hh \ +lastdb.o lastdb.o8: lastdb.cc LastdbArguments.hh SequenceFormat.hh \ SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \ fileMap.hh stringify.hh Alphabet.hh MultiSequence.hh ScoreMatrixRow.hh \ TantanMasker.hh tantan.hh io.hh qualityScoreUtil.hh threadUtil.hh \ version.hh -tantan.o: tantan.cc tantan.hh +tantan.o tantan.o8: tantan.cc tantan.hh last-merge-batches.o: last-merge-batches.c version.hh alp/njn_dynprogprob.o: alp/njn_dynprogprob.cpp alp/njn_dynprogprob.hpp \ alp/njn_dynprogprobproto.hpp alp/njn_memutil.hpp alp/njn_ioutil.hpp @@ -275,17 +307,17 @@ alp/sls_fsa1_utils.o: alp/sls_fsa1_utils.cpp alp/sls_fsa1_utils.hpp \ alp/sls_pvalues.o: alp/sls_pvalues.cpp alp/sls_pvalues.hpp alp/sls_basic.hpp \ alp/sls_alp_data.hpp alp/sls_alp_regression.hpp alp/njn_random.hpp \ alp/njn_uniform.hpp alp/sls_normal_distr_array.hpp -split/cbrc_linalg.o: split/cbrc_linalg.cc split/cbrc_linalg.hh -split/cbrc_split_aligner.o: split/cbrc_split_aligner.cc \ +split/cbrc_linalg.o split/cbrc_linalg.o8: split/cbrc_linalg.cc split/cbrc_linalg.hh +split/cbrc_split_aligner.o split/cbrc_split_aligner.o8: split/cbrc_split_aligner.cc \ split/cbrc_split_aligner.hh split/cbrc_unsplit_alignment.hh \ split/cbrc_int_exponentiator.hh Alphabet.hh MultiSequence.hh \ ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \ split/cbrc_linalg.hh -split/cbrc_unsplit_alignment.o: split/cbrc_unsplit_alignment.cc \ +split/cbrc_unsplit_alignment.o split/cbrc_unsplit_alignment.o8: split/cbrc_unsplit_alignment.cc \ split/cbrc_unsplit_alignment.hh -split/last-split-main.o: split/last-split-main.cc split/last-split.hh \ +split/last-split-main.o split/last-split-main.o8: split/last-split-main.cc split/last-split.hh \ stringify.hh version.hh -split/last-split.o: split/last-split.cc split/last-split.hh \ +split/last-split.o split/last-split.o8: split/last-split.cc split/last-split.hh \ split/cbrc_split_aligner.hh split/cbrc_unsplit_alignment.hh \ split/cbrc_int_exponentiator.hh Alphabet.hh MultiSequence.hh \ ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh diff --git a/src/split/cbrc_split_aligner.cc b/src/split/cbrc_split_aligner.cc index c395a5f..bea2d07 100644 --- a/src/split/cbrc_split_aligner.cc +++ b/src/split/cbrc_split_aligner.cc @@ -714,8 +714,8 @@ void SplitAligner::initSpliceSignals(unsigned i) { StringNumMap::const_iterator f = chromosomeIndex.find(a.rname); if (f == chromosomeIndex.end()) err("can't find " + std::string(a.rname) + " in the genome"); - unsigned v = f->second % maxGenomeVolumes(); - unsigned c = f->second / maxGenomeVolumes(); + size_t v = f->second % maxGenomeVolumes(); + size_t c = f->second / maxGenomeVolumes(); const uchar *chromBeg = genome[v].seqReader() + genome[v].seqBeg(c); const uchar *chromEnd = genome[v].seqReader() + genome[v].seqEnd(c); if (a.rend > chromEnd - chromBeg) @@ -916,13 +916,13 @@ void SplitAligner::flipSpliceSignals() { } } -double SplitAligner::spliceSignalStrandProb() const { +double SplitAligner::spliceSignalStrandLogOdds() const { assert(rescales.size() == rescalesRev.size()); - double r = 1.0; + double logOdds = 0; for (unsigned j = 0; j < rescales.size(); ++j) { - r *= rescalesRev[j] / rescales[j]; - } // r might overflow to inf, but that should be OK - return 1.0 / (1.0 + r); + logOdds += std::log(rescales[j] / rescalesRev[j]); + } + return logOdds; } // 1st 1 million reads from SRR359290.fastq: @@ -1088,8 +1088,9 @@ void SplitAligner::printParameters() const { static void readPrjFile(const std::string& baseName, std::string& alphabetLetters, - unsigned& seqCount, - unsigned& volumes) { + size_t& seqCount, + size_t& volumes) { + size_t fileBitsPerInt = 32; seqCount = volumes = -1; std::string fileName = baseName + ".prj"; @@ -1102,15 +1103,22 @@ static void readPrjFile(const std::string& baseName, getline(iss, word, '='); if (word == "alphabet") iss >> alphabetLetters; if (word == "numofsequences") iss >> seqCount; - if( word == "volumes" ) iss >> volumes; + if (word == "volumes") iss >> volumes; + if (word == "integersize") iss >> fileBitsPerInt; } if (alphabetLetters != "ACGT") err("can't read file: " + fileName); + + if (fileBitsPerInt != sizeof(MultiSequence::indexT) * CHAR_BIT) { + if (fileBitsPerInt == 32) err("please use last-split for " + baseName); + if (fileBitsPerInt == 64) err("please use last-split8 for " + baseName); + err("weird integersize in " + fileName); + } } void SplitAligner::readGenomeVolume(const std::string& baseName, - unsigned seqCount, - unsigned volumeNumber) { + size_t seqCount, + size_t volumeNumber) { if (seqCount + 1 == 0) err("can't read: " + baseName); genome[volumeNumber].fromFiles(baseName, seqCount, 0); @@ -1125,14 +1133,14 @@ void SplitAligner::readGenomeVolume(const std::string& baseName, void SplitAligner::readGenome(const std::string& baseName) { std::string alphabetLetters; - unsigned seqCount, volumes; + size_t seqCount, volumes; readPrjFile(baseName, alphabetLetters, seqCount, volumes); if (volumes + 1 > 0 && volumes > 1) { if (volumes > maxGenomeVolumes()) err("too many volumes: " + baseName); - for (unsigned i = 0; i < volumes; ++i) { + for (size_t i = 0; i < volumes; ++i) { std::string b = baseName + stringify(i); - unsigned c, v; + size_t c, v; readPrjFile(b, alphabetLetters, c, v); readGenomeVolume(b, c, i); } diff --git a/src/split/cbrc_split_aligner.hh b/src/split/cbrc_split_aligner.hh index 53db5b0..a2ca7ac 100644 --- a/src/split/cbrc_split_aligner.hh +++ b/src/split/cbrc_split_aligner.hh @@ -102,9 +102,10 @@ public: // Toggles between forward and reverse-complement splice signals void flipSpliceSignals(); - // The probability that the query uses splice signals in the - // orientation currently set by flipSpliceSignals() - double spliceSignalStrandProb() const; + // This returns log(p / (1-p)), where p is the probability that + // the query uses splice signals in the orientation currently set + // by flipSpliceSignals() + double spliceSignalStrandLogOdds() const; private: static const int numQualCodes = 64; @@ -203,7 +204,7 @@ private: void initRbegsAndEnds(); static size_t maxGenomeVolumes() { return sizeof genome / sizeof *genome; } void readGenomeVolume(const std::string& baseName, - unsigned seqCount, unsigned volumeNumber); + size_t seqCount, size_t volumeNumber); void dpExtensionMinScores(int maxJumpScore, size_t& minScore1, size_t& minScore2) const; diff --git a/src/split/last-split.cc b/src/split/last-split.cc index b18a156..04790bc 100644 --- a/src/split/last-split.cc +++ b/src/split/last-split.cc @@ -70,7 +70,7 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa, const cbrc::UnsplitAlignment& a, unsigned alnNum, unsigned qSliceBeg, unsigned qSliceEnd, - double forwardDirectionProb, + double senseStrandLogOdds, const LastSplitOptions& opts, bool isAlreadySplit) { unsigned alnBeg, alnEnd; @@ -92,9 +92,11 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa, } if (opts.direction == 0) p.swap(pRev); if (opts.direction == 2) { - double reverseDirectionProb = 1.0 - forwardDirectionProb; + double reverseProb = 1 / (1 + std::exp(senseStrandLogOdds)); + // the exp might overflow to inf, but that should be OK + double forwardProb = 1 - reverseProb; for (unsigned i = 0; i < p.size(); ++i) { - p[i] = forwardDirectionProb * p[i] + reverseDirectionProb * pRev[i]; + p[i] = forwardProb * p[i] + reverseProb * pRev[i]; } } @@ -115,8 +117,16 @@ static void doOneAlignmentPart(cbrc::SplitAligner& sa, } std::cout << std::setprecision(mismapPrecision) - << "a score=" << score << " mismap=" << mismap << "\n" - << std::setprecision(6); + << "a score=" << score << " mismap=" << mismap; + if (opts.direction == 2) { + double b = senseStrandLogOdds / std::log(2.0); + if (b < 0.1 && b > -0.1) b = 0; + else if (b > 10) b = std::floor(b + 0.5); + else if (b < -10) b = std::ceil(b - 0.5); + int sensePrecision = (b < 10 && b > -10) ? 2 : 3; + std::cout << std::setprecision(sensePrecision) << " sense=" << b; + } + std::cout << "\n" << std::setprecision(6); if (a.qstrand == '-') cbrc::flipMafStrands(s.begin(), s.end()); if (opts.no_split && a.linesEnd[-1][0] == 'c') s.push_back(a.linesEnd[-1]); cbrc::printMaf(s); @@ -149,17 +159,14 @@ static void doOneQuery(std::vector<cbrc::UnsplitAlignment>::const_iterator beg, sa.flipSpliceSignals(); } - double forwardDirectionProb = -1; - if (opts.direction == 2) { - forwardDirectionProb = sa.spliceSignalStrandProb(); - if (opts.verbose) std::cerr << "\tforwardProb=" << forwardDirectionProb; - } + double senseStrandLogOdds = + (opts.direction == 2) ? sa.spliceSignalStrandLogOdds() : 0; if (opts.no_split) { if (opts.verbose) std::cerr << "\n"; for (unsigned i = 0; i < end - beg; ++i) { doOneAlignmentPart(sa, beg[i], i, beg[i].qstart, beg[i].qend, - forwardDirectionProb, opts, isAlreadySplit); + senseStrandLogOdds, opts, isAlreadySplit); } } else { long viterbiScore = LONG_MIN; @@ -192,7 +199,7 @@ static void doOneQuery(std::vector<cbrc::UnsplitAlignment>::const_iterator beg, for (unsigned k = 0; k < alnNums.size(); ++k) { unsigned i = alnNums[k]; doOneAlignmentPart(sa, beg[i], i, queryBegs[k], queryEnds[k], - forwardDirectionProb, opts, isAlreadySplit); + senseStrandLogOdds, opts, isAlreadySplit); } } } diff --git a/src/version.hh b/src/version.hh index 9420baa..25bdae7 100644 --- a/src/version.hh +++ b/src/version.hh @@ -1 +1 @@ -"809" +"828" -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
