This is an automated email from the git hooks/post-receive script. plessy pushed a commit to branch master in repository last-align.
commit 899a68165f8864d3cfb67aca39ac97a2add9c8ac Author: Charles Plessy <[email protected]> Date: Sun Oct 15 22:05:37 2017 +0900 New upstream version 885 --- ChangeLog.txt | 87 +++++++++++++++++++++++++++++++++++++++++++- doc/last-dotplot.html | 17 +++++++-- doc/last-dotplot.txt | 15 ++++++-- doc/last-parallel.html | 4 +-- doc/last-parallel.txt | 4 +-- doc/last-train.html | 4 +++ doc/last-train.txt | 2 ++ doc/last-tuning.html | 7 ++-- doc/last-tuning.txt | 8 ++--- doc/last.html | 4 +-- doc/last.txt | 4 +-- doc/lastal.html | 13 ++++--- doc/lastal.txt | 9 +++-- doc/lastdb.html | 5 +-- doc/lastdb.txt | 5 +-- doc/maf-convert.html | 12 +++---- doc/maf-convert.txt | 18 +++++----- scripts/fastq-interleave | 10 ++---- scripts/last-dotplot | 75 +++++++++++++++++++++++--------------- scripts/last-postmask | 23 +++++++----- scripts/last-train | 79 ++++++++++++++++++++++++++-------------- scripts/maf-convert | 93 +++++++++++++++++++++++++++++++++++------------- src/Alphabet.cc | 8 ----- src/Alphabet.hh | 3 -- src/LastEvaluer.cc | 2 +- src/LastalArguments.cc | 33 ++++++++++++----- src/MultiSequence.cc | 7 ---- src/MultiSequence.hh | 66 +++++++++++++++++++++++++++++++--- src/MultiSequenceQual.cc | 22 ++++-------- src/io.cc | 11 ++---- src/io.hh | 7 ++-- src/last-pair-probs.cc | 4 +-- src/lastal.cc | 32 ++--------------- src/lastdb.cc | 43 ++++++++++------------ src/makefile | 32 ++++++++--------- src/mcf_zstream.hh | 82 ++++++++++++++++++++++++++++++++++++++++++ src/version.hh | 2 +- 37 files changed, 569 insertions(+), 283 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index 2203b24..7c46466 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -1,9 +1,94 @@ +2017-10-10 Martin C. Frith <Martin C. Frith> + + * scripts/last-train: + Make last-train print the lastal version + [c2ce08b5a76a] [tip] + + * src/Alphabet.cc, src/Alphabet.hh, src/MultiSequence.hh, + src/lastal.cc, src/lastdb.cc: + Refactor + [52565bc1c7d7] + + * src/lastdb.cc: + Refactor (I think) + [02a9cb451bc7] + + * src/MultiSequence.cc, src/MultiSequence.hh, + src/MultiSequenceQual.cc, src/lastal.cc: + Refactor some code + [88fa35342a52] + + * scripts/last-train, test/last-train-test.out, test/last-train- + test.sh: + Make last-train work with lastdb8 + [a7ef7848ad1e] + +2017-10-06 Martin C. Frith <Martin C. Frith> + + * doc/maf-convert.txt, scripts/maf-convert, test/maf-convert-test.out, + test/maf-convert-test.sh: + Add maf-convert -> chain + [6635b482e6e2] + + * doc/last.txt, src/LastEvaluer.cc: + Avoid some "can't get E-value parameters" errors + [2c0cb7ef8842] + +2017-10-03 Martin C. Frith <Martin C. Frith> + + * doc/last-dotplot.txt, scripts/last-dotplot: + Add text-rotation options to last-dotplot + [20f5c97a3cfd] + + * scripts/last-dotplot: + Fix mixed-up margin sizes in last-dotplot + [f5f93e51ef53] + +2017-09-27 Martin C. Frith <Martin C. Frith> + + * src/makefile: + Try to make the gz compilation work more portably + [bf8efe69f160] + +2017-09-26 Martin C. Frith <Martin C. Frith> + + * scripts/last-dotplot, scripts/last-postmask, scripts/last-train, + scripts/maf-convert, test/last-postmask-test.out, test/last- + postmask-test.sh, test/last-test.sh: + Add gz reading to: dotplot postmask train convert + [592295375eb1] + +2017-09-22 Martin C. Frith <Martin C. Frith> + + * doc/last-parallel.txt, src/mcf_zstream.hh: + Make the gz stuff compile on old computers + [0602563ef4e2] + + * src/makefile: + Try to make compilation more robust & portable + [8a2e4af79c99] + + * doc/lastal.txt, doc/lastdb.txt, scripts/fastq-interleave, src/io.cc, + src/io.hh, src/last-pair-probs.cc, src/lastal.cc, src/lastdb.cc, + src/makefile, src/mcf_zstream.hh: + Enable lastal, lastdb, etc to read .gz files + [a530b1ee48f4] + + * doc/last-train.txt, scripts/last-train: + Add option -k to last-train + [80a016b17089] + + * doc/last-tuning.txt, doc/lastal.txt, src/LastalArguments.cc, test + /last-test.out, test/last-test.sh: + Add -x % option to lastal + [ad7f5f953412] + 2017-06-20 Martin C. Frith <Martin C. Frith> * doc/last-train.txt, scripts/last-train, test/last-train-test.out, test/last-train-test.sh: Make it easier to feed stdin/pipe to last-train - [b73f1146e688] [tip] + [b73f1146e688] * doc/lastal.txt, src/AlignmentWrite.cc, test/last-test.out: Add raw score to BlastTab+ format diff --git a/doc/last-dotplot.html b/doc/last-dotplot.html index e1717a0..21cc1d9 100644 --- a/doc/last-dotplot.html +++ b/doc/last-dotplot.html @@ -319,8 +319,9 @@ table.field-list { border: thin solid green } <h1 class="title">last-dotplot</h1> <p>This script makes a dotplot, a.k.a. Oxford Grid, of pair-wise sequence -alignments in MAF or LAST tabular format. It requires the Python -Imaging Library to be installed. It can be used like this:</p> +alignments in MAF or LAST tabular format. It requires the <a class="reference external" href="https://pillow.readthedocs.io/">Python +Imaging Library</a> to be installed. +It can be used like this:</p> <pre class="literal-block"> last-dotplot my-alignments my-plot.png </pre> @@ -330,7 +331,11 @@ last-dotplot alns alns.gif </pre> <p>To get a nicer font, try something like:</p> <pre class="literal-block"> -last-dotplot -f /usr/share/fonts/truetype/freefont/FreeSans.ttf alns alns.png +last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png +</pre> +<p>or:</p> +<pre class="literal-block"> +last-dotplot -f /Library/Fonts/Arial.ttf alns alns.png </pre> <p>If the fonts are located somewhere different on your computer, change this as appropriate.</p> @@ -450,6 +455,12 @@ appearance in the input (0), their names (1), their lengths (2).</td></tr> <kbd><span class="option">-s <var>SIZE</var></span>, <span class="option">--fontsize=<var>SIZE</var></span></kbd></td> <td>TrueType or OpenType font size.</td></tr> <tr><td class="option-group"> +<kbd><span class="option">--rot1=<var>ROT</var></span></kbd></td> +<td>Text rotation for the 1st genome: h(orizontal) or v(ertical).</td></tr> +<tr><td class="option-group"> +<kbd><span class="option">--rot2=<var>ROT</var></span></kbd></td> +<td>Text rotation for the 2nd genome: h(orizontal) or v(ertical).</td></tr> +<tr><td class="option-group"> <kbd><span class="option">--lengths1</span></kbd></td> <td>Show sequence lengths for the 1st (horizontal) genome.</td></tr> <tr><td class="option-group"> diff --git a/doc/last-dotplot.txt b/doc/last-dotplot.txt index 8642e59..2c1ff46 100644 --- a/doc/last-dotplot.txt +++ b/doc/last-dotplot.txt @@ -2,8 +2,9 @@ last-dotplot ============ This script makes a dotplot, a.k.a. Oxford Grid, of pair-wise sequence -alignments in MAF or LAST tabular format. It requires the Python -Imaging Library to be installed. It can be used like this:: +alignments in MAF or LAST tabular format. It requires the `Python +Imaging Library <https://pillow.readthedocs.io/>`_ to be installed. +It can be used like this:: last-dotplot my-alignments my-plot.png @@ -13,7 +14,11 @@ The output can be in any format supported by the Imaging Library:: To get a nicer font, try something like:: - last-dotplot -f /usr/share/fonts/truetype/freefont/FreeSans.ttf alns alns.png + last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png + +or:: + + last-dotplot -f /Library/Fonts/Arial.ttf alns alns.png If the fonts are located somewhere different on your computer, change this as appropriate. @@ -95,6 +100,10 @@ Text options TrueType or OpenType font file. -s SIZE, --fontsize=SIZE TrueType or OpenType font size. + --rot1=ROT + Text rotation for the 1st genome: h(orizontal) or v(ertical). + --rot2=ROT + Text rotation for the 2nd genome: h(orizontal) or v(ertical). --lengths1 Show sequence lengths for the 1st (horizontal) genome. --lengths2 diff --git a/doc/last-parallel.html b/doc/last-parallel.html index 5d2ad15..1cfe214 100644 --- a/doc/last-parallel.html +++ b/doc/last-parallel.html @@ -372,11 +372,11 @@ parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > ou </pre> <p>Instead of this:</p> <pre class="literal-block"> -zcat queries.fa.gz | lastal mydb > myalns.maf +bzcat queries.fa.bz2 | lastal mydb > myalns.maf </pre> <p>try this:</p> <pre class="literal-block"> -zcat queries.fa.gz | parallel-fasta "lastal mydb" > myalns.maf +bzcat queries.fa.bz2 | parallel-fasta "lastal mydb" > myalns.maf </pre> <p>Notes:</p> <ul> diff --git a/doc/last-parallel.txt b/doc/last-parallel.txt index 730157f..c98a451 100644 --- a/doc/last-parallel.txt +++ b/doc/last-parallel.txt @@ -61,11 +61,11 @@ try this:: Instead of this:: - zcat queries.fa.gz | lastal mydb > myalns.maf + bzcat queries.fa.bz2 | lastal mydb > myalns.maf try this:: - zcat queries.fa.gz | parallel-fasta "lastal mydb" > myalns.maf + bzcat queries.fa.bz2 | parallel-fasta "lastal mydb" > myalns.maf Notes: diff --git a/doc/last-train.html b/doc/last-train.html index a4889bd..f510a51 100644 --- a/doc/last-train.html +++ b/doc/last-train.html @@ -455,6 +455,10 @@ reduce run time.</td></tr> <kbd><span class="option">-m <var>COUNT</var></span></kbd></td> <td>Maximum number of initial matches per query position.</td></tr> <tr><td class="option-group"> +<kbd><span class="option">-k <var>STEP</var></span></kbd></td> +<td>Look for initial matches starting only at every STEP-th +position in each query.</td></tr> +<tr><td class="option-group"> <kbd><span class="option">-P <var>COUNT</var></span></kbd></td> <td>Number of parallel threads.</td></tr> <tr><td class="option-group"> diff --git a/doc/last-train.txt b/doc/last-train.txt index c9fcd93..c965ff3 100644 --- a/doc/last-train.txt +++ b/doc/last-train.txt @@ -84,6 +84,8 @@ Alignment options reduce run time. -T NUMBER Type of alignment: 0=local, 1=overlap. -m COUNT Maximum number of initial matches per query position. + -k STEP Look for initial matches starting only at every STEP-th + position in each query. -P COUNT Number of parallel threads. -Q NUMBER Query sequence format: 0=fasta, 1=fastq-sanger. **Important:** if you use option -Q, last-train will only diff --git a/doc/last-tuning.html b/doc/last-tuning.html index a273b3b..a7c04de 100644 --- a/doc/last-tuning.html +++ b/doc/last-tuning.html @@ -408,11 +408,8 @@ whose query coordinates lie in those of 2 or more stronger alignments.</p> <p>This option can make lastal <strong>faster</strong> but <strong>less sensitive</strong>. It sets the maximum score drop in alignments, in the gapped extension phase. Lower values make it faster, by quitting unpromising -extensions sooner. The default aims at best accuracy.</p> -<p>Unfortunately, the default is a complex function of the other -parameters and the database size. You can see it in the lastal header -after "x=", e.g. by running lastal with no queries. Then try, say, -halving it.</p> +extensions sooner. The default aims at best accuracy. For example, +use -x50% to specify 50% of the default value.</p> </div> <div class="section" id="id2"> <h3>lastal -M</h3> diff --git a/doc/last-tuning.txt b/doc/last-tuning.txt index 6493e62..9fc4c9b 100644 --- a/doc/last-tuning.txt +++ b/doc/last-tuning.txt @@ -109,12 +109,8 @@ lastal -x This option can make lastal **faster** but **less sensitive**. It sets the maximum score drop in alignments, in the gapped extension phase. Lower values make it faster, by quitting unpromising -extensions sooner. The default aims at best accuracy. - -Unfortunately, the default is a complex function of the other -parameters and the database size. You can see it in the lastal header -after "x=", e.g. by running lastal with no queries. Then try, say, -halving it. +extensions sooner. The default aims at best accuracy. For example, +use -x50% to specify 50% of the default value. lastal -M --------- diff --git a/doc/last.html b/doc/last.html index 7c0f7f7..ef87985 100644 --- a/doc/last.html +++ b/doc/last.html @@ -354,8 +354,8 @@ compile the programs, type:</p> <pre class="literal-block"> make </pre> -<p>If your compiler is really old, you might get an error message like -this:</p> +<p>You might get some harmless warning messages. If your compiler is +really old, you might get an error message like this:</p> <pre class="literal-block"> unrecognized command line option "-std=c++11" </pre> diff --git a/doc/last.txt b/doc/last.txt index 70123f1..20d2102 100644 --- a/doc/last.txt +++ b/doc/last.txt @@ -33,8 +33,8 @@ compile the programs, type:: make -If your compiler is really old, you might get an error message like -this:: +You might get some harmless warning messages. If your compiler is +really old, you might get an error message like this:: unrecognized command line option "-std=c++11" diff --git a/doc/lastal.html b/doc/lastal.html index 191dde2..22e63d2 100644 --- a/doc/lastal.html +++ b/doc/lastal.html @@ -329,9 +329,10 @@ lastal humanDb dna*.fasta > myalns.maf writes several files whose names begin with <tt class="docutils literal">humanDb</tt>. The lastal command reads files called <tt class="docutils literal"><span class="pre">dna*.fasta</span></tt>, compares them to <tt class="docutils literal">humanDb</tt>, and writes alignments to a file called <tt class="docutils literal">myalns.maf</tt>.</p> -<p>You can also pipe query sequences into lastal, for example:</p> +<p>You can use gzip (.gz) compressed query files. You can also pipe +query sequences into lastal, for example:</p> <pre class="literal-block"> -zcat seqs.fasta.gz | lastal humanDb > myalns.maf +bzcat seqs.fasta.bz2 | lastal humanDb > myalns.maf </pre> <div class="section" id="steps-in-lastal"> <h2>Steps in lastal</h2> @@ -516,10 +517,14 @@ looks like this:</p> </td></tr> <tr><td class="option-group"> <kbd><span class="option">-x <var>DROP</var></span></kbd></td> -<td>Maximum score drop for gapped alignments. Gapped alignments are +<td><p class="first">Maximum score drop for gapped alignments. Gapped alignments are forbidden from having any internal region with score < -DROP. This serves two purposes: accuracy (avoid spurious internal -regions in alignments) and speed (the smaller the faster).</td></tr> +regions in alignments) and speed (the smaller the faster).</p> +<p class="last">You can specify either a score (e.g. -x20), or a percentage; for +example, -x50% specifies 50% of the default value (rounded down +to the nearest integer).</p> +</td></tr> <tr><td class="option-group"> <kbd><span class="option">-y <var>DROP</var></span></kbd></td> <td>Maximum score drop for gapless alignments.</td></tr> diff --git a/doc/lastal.txt b/doc/lastal.txt index db7eeb1..476d9f8 100644 --- a/doc/lastal.txt +++ b/doc/lastal.txt @@ -13,9 +13,10 @@ writes several files whose names begin with ``humanDb``. The lastal command reads files called ``dna*.fasta``, compares them to ``humanDb``, and writes alignments to a file called ``myalns.maf``. -You can also pipe query sequences into lastal, for example:: +You can use gzip (.gz) compressed query files. You can also pipe +query sequences into lastal, for example:: - zcat seqs.fasta.gz | lastal humanDb > myalns.maf + bzcat seqs.fasta.bz2 | lastal humanDb > myalns.maf Steps in lastal --------------- @@ -186,6 +187,10 @@ Score options This serves two purposes: accuracy (avoid spurious internal regions in alignments) and speed (the smaller the faster). + You can specify either a score (e.g. -x20), or a percentage; for + example, -x50% specifies 50% of the default value (rounded down + to the nearest integer). + -y DROP Maximum score drop for gapless alignments. diff --git a/doc/lastdb.html b/doc/lastdb.html index 974c588..4b54522 100644 --- a/doc/lastdb.html +++ b/doc/lastdb.html @@ -336,9 +336,10 @@ TTTGGATATG >My2ndSequence TTTAGAGGGTTCTTCGGGATT </pre> -<p>You can also pipe sequences into lastdb, for example:</p> +<p>These files may be compressed in gzip (.gz) format. You can also pipe +sequences into lastdb, for example:</p> <pre class="literal-block"> -zcat humanChromosome*.fasta.gz | lastdb humanDb +bzcat humanChromosome*.fasta.bz2 | lastdb humanDb </pre> </div> <div class="section" id="options"> diff --git a/doc/lastdb.txt b/doc/lastdb.txt index c0b5fdb..38e5fe8 100644 --- a/doc/lastdb.txt +++ b/doc/lastdb.txt @@ -21,9 +21,10 @@ like this:: >My2ndSequence TTTAGAGGGTTCTTCGGGATT -You can also pipe sequences into lastdb, for example:: +These files may be compressed in gzip (.gz) format. You can also pipe +sequences into lastdb, for example:: - zcat humanChromosome*.fasta.gz | lastdb humanDb + bzcat humanChromosome*.fasta.bz2 | lastdb humanDb Options ------- diff --git a/doc/maf-convert.html b/doc/maf-convert.html index a98040e..295e74e 100644 --- a/doc/maf-convert.html +++ b/doc/maf-convert.html @@ -318,9 +318,9 @@ table.field-list { border: thin solid green } <div class="document" id="maf-convert"> <h1 class="title">maf-convert</h1> -<p>This script reads alignments in maf format, and writes them in another -format. It can write them in these formats: axt, blast, blasttab, -html, psl, sam, tab. You can use it like this:</p> +<p>This script reads alignments in <a class="reference external" href="http://genome.ucsc.edu/FAQ/FAQformat.html#format5">maf</a> format, and writes them in +another format. It can write them in these formats: <a class="reference external" href="https://genome.ucsc.edu/goldenPath/help/axt.html">axt</a>, blast, +blasttab, <a class="reference external" href="https://genome.ucsc.edu/goldenPath/help/chain.html">chain</a>, html, <a class="reference external" href="https://genome.ucsc.edu/FAQ/FAQformat.html#format2">psl</a>, sam, tab. You can use it like this:</p> <pre class="literal-block"> maf-convert psl my-alignments.maf > my-alignments.psl </pre> @@ -328,9 +328,7 @@ maf-convert psl my-alignments.maf > my-alignments.psl <pre class="literal-block"> ... | maf-convert psl > my-alignments.psl </pre> -<p>The input should be "multiple alignment format" as described in the -UCSC Genome FAQ (not "MIRA assembly format" or any other maf).</p> -<p>This script takes the first (topmost) MAF sequence as the "reference" +<p>This script takes the first (topmost) maf sequence as the "reference" / "subject" / "target", and the second sequence as the "query".</p> <p>For html: if the input includes probability lines starting with 'p', then the output will be coloured by column probability. (To get lines @@ -351,7 +349,7 @@ starting with 'p', run lastal with option -j set to 4 or higher.)</p> nucleotides. This affects psl format only (the first 4 columns).</td></tr> <tr><td class="option-group"> -<kbd><span class="option">-j</span>, <span class="option">--join=<var>N</var></span></kbd></td> +<kbd><span class="option">-j <var>N</var></span>, <span class="option">--join=<var>N</var></span></kbd></td> <td>Join neighboring alignments if they are co-linear and separated by at most N letters. This affects psl format only.</td></tr> diff --git a/doc/maf-convert.txt b/doc/maf-convert.txt index f78acee..d2d3f4d 100644 --- a/doc/maf-convert.txt +++ b/doc/maf-convert.txt @@ -1,9 +1,9 @@ maf-convert =========== -This script reads alignments in maf format, and writes them in another -format. It can write them in these formats: axt, blast, blasttab, -html, psl, sam, tab. You can use it like this:: +This script reads alignments in maf_ format, and writes them in +another format. It can write them in these formats: axt_, blast, +blasttab, chain_, html, psl_, sam, tab. You can use it like this:: maf-convert psl my-alignments.maf > my-alignments.psl @@ -11,16 +11,18 @@ It's often convenient to pipe in the input, like this:: ... | maf-convert psl > my-alignments.psl -The input should be "multiple alignment format" as described in the -UCSC Genome FAQ (not "MIRA assembly format" or any other maf). - -This script takes the first (topmost) MAF sequence as the "reference" +This script takes the first (topmost) maf sequence as the "reference" / "subject" / "target", and the second sequence as the "query". For html: if the input includes probability lines starting with 'p', then the output will be coloured by column probability. (To get lines starting with 'p', run lastal with option -j set to 4 or higher.) +.. _maf: http://genome.ucsc.edu/FAQ/FAQformat.html#format5 +.. _axt: https://genome.ucsc.edu/goldenPath/help/axt.html +.. _chain: https://genome.ucsc.edu/goldenPath/help/chain.html +.. _psl: https://genome.ucsc.edu/FAQ/FAQformat.html#format2 + Options ------- @@ -32,7 +34,7 @@ Options nucleotides. This affects psl format only (the first 4 columns). - -j, --join=N + -j N, --join=N Join neighboring alignments if they are co-linear and separated by at most N letters. This affects psl format only. diff --git a/scripts/fastq-interleave b/scripts/fastq-interleave index ca33604..dd9174f 100755 --- a/scripts/fastq-interleave +++ b/scripts/fastq-interleave @@ -3,6 +3,7 @@ test $# = 2 || { cat <<EOF Usage: $0 x.fastq y.fastq +or: $0 x.fastq.gz y.fastq.gz Read 2 fastq files, and write them interleaved. Assumes 1 fastq per 4 lines, i.e. no line wrapping. @@ -10,10 +11,5 @@ EOF exit } -paste <(cat "$1" | paste - - - -) <(cat "$2" | paste - - - -) | tr '\t' '\n' - -# Is this better? -#paste <(zcat -f "$1"|paste - - - -) <(zcat -f "$2"|paste - - - -)|tr '\t' '\n' - -# This does not interpret "-" as stdin: -#paste <(paste - - - - < "$1") <(paste - - - - < "$2") | tr '\t' '\n' +paste <(gzip -cdf "$1" | paste - - - -) <(gzip -cdf "$2" | paste - - - -) | + tr '\t' '\n' diff --git a/scripts/last-dotplot b/scripts/last-dotplot index 6d4f1d2..f29fed2 100755 --- a/scripts/last-dotplot +++ b/scripts/last-dotplot @@ -9,6 +9,7 @@ # according to the number of aligned nt-pairs within it, but the # result is too faint. How can this be done better? +import gzip import fnmatch, itertools, optparse, os, re, sys # Try to make PIL/PILLOW work: @@ -18,6 +19,8 @@ except ImportError: import Image, ImageDraw, ImageFont, ImageColor def myOpen(fileName): # faster than fileinput if fileName == "-": return sys.stdin + if fileName.endswith(".gz"): + return gzip.open(fileName) return open(fileName) def warn(message): @@ -160,13 +163,17 @@ def natural_sort_key(my_string): parts[1::2] = map(int, parts[1::2]) return parts -def get_text_sizes(my_strings, font, fontsize, image_mode): +def textDimensions(imageDraw, font, textRot, text): + x, y = imageDraw.textsize(text, font=font) + return (y, x) if textRot else (x, y) + +def get_text_sizes(my_strings, font, fontsize, image_mode, textRot): '''Get widths & heights, in pixels, of some strings.''' if fontsize == 0: return [(0, 0) for i in my_strings] image_size = 1, 1 im = Image.new(image_mode, image_size) draw = ImageDraw.Draw(im) - return [draw.textsize(i, font=font) for i in my_strings] + return [textDimensions(draw, font, textRot, i) for i in my_strings] def sizeText(size): suffixes = "bp", "kb", "Mb", "Gb" @@ -181,7 +188,7 @@ def seqNameAndSizeText(seqName, seqSize): return seqName + ": " + sizeText(seqSize) def getSeqInfo(sortOpt, seqNames, seqLimits, - font, fontsize, image_mode, isShowSize): + font, fontsize, image_mode, isShowSize, textRot): '''Return miscellaneous information about the sequences.''' if sortOpt == 1: seqNames.sort(key=natural_sort_key) @@ -199,8 +206,9 @@ def getSeqInfo(sortOpt, seqNames, seqLimits, seqLabels = map(seqNameAndSizeText, seqNames, seqSizes) else: seqLabels = seqNames - labelSizes = get_text_sizes(seqLabels, font, fontsize, image_mode) - margin = max(zip(*labelSizes)[1]) # maximum text height + labelSizes = get_text_sizes(seqLabels, font, fontsize, image_mode, textRot) + margin = max(i[1] for i in labelSizes) + # xxx the margin may be too big, because some labels may get omitted return seqNames, seqSizes, seqLabels, labelSizes, margin def div_ceil(x, y): @@ -410,11 +418,11 @@ def drawAnnotations(im, boxes): def make_label(text, text_size, range_start, range_size): '''Return an axis label with endpoint & sort-order information.''' - text_width = text_size[0] + text_width, text_height = text_size label_start = range_start + (range_size - text_width) // 2 label_end = label_start + text_width sort_key = text_width - range_size - return sort_key, label_start, label_end, text + return sort_key, label_start, label_end, text, text_height def get_nonoverlapping_labels(labels, label_space): '''Get a subset of non-overlapping axis labels, greedily.''' @@ -425,21 +433,23 @@ def get_nonoverlapping_labels(labels, label_space): nonoverlapping_labels.append(i) return nonoverlapping_labels -def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix, - font, image_mode, opts): +def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix, textRot, + textAln, font, image_mode, opts): '''Make an image of axis labels.''' min_pos = seq_starts[0] max_pos = seq_starts[-1] + seq_pix[-1] - height = max(zip(*name_sizes)[1]) + margin = max(i[1] for i in name_sizes) labels = map(make_label, seqNames, name_sizes, seq_starts, seq_pix) labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos] labels.sort() - labels = get_nonoverlapping_labels(labels, opts.label_space) - image_size = max_pos, height + minPixTweenLabels = 0 if textRot else opts.label_space + labels = get_nonoverlapping_labels(labels, minPixTweenLabels) + image_size = (margin, max_pos) if textRot else (max_pos, margin) im = Image.new(image_mode, image_size, opts.border_color) draw = ImageDraw.Draw(im) for i in labels: - position = i[1], 0 + base = margin - i[4] if textAln else 0 + position = (base, i[1]) if textRot else (i[1], base) draw.text(position, i[3], font=font, fill=opts.text_color) return im @@ -463,26 +473,28 @@ def lastDotplot(opts, args): warn("done") if not alignments: raise Exception("there are no alignments") + textRot1 = "vertical".startswith(opts.rot1) i1 = getSeqInfo(opts.sort1, seqNames1, seqLimits1, - font, opts.fontsize, image_mode, opts.lengths1) - seqNames1, seqSizes1, seqLabels1, labelSizes1, margin1 = i1 + font, opts.fontsize, image_mode, opts.lengths1, textRot1) + seqNames1, seqSizes1, seqLabels1, labelSizes1, tMargin = i1 + textRot2 = "horizontal".startswith(opts.rot2) i2 = getSeqInfo(opts.sort2, seqNames2, seqLimits2, - font, opts.fontsize, image_mode, opts.lengths2) - seqNames2, seqSizes2, seqLabels2, labelSizes2, margin2 = i2 + font, opts.fontsize, image_mode, opts.lengths2, textRot2) + seqNames2, seqSizes2, seqLabels2, labelSizes2, lMargin = i2 warn("choosing bp per pixel...") - pix_limit1 = opts.width - margin1 - pix_limit2 = opts.height - margin2 + pix_limit1 = opts.width - lMargin + pix_limit2 = opts.height - tMargin bpPerPix1 = get_bp_per_pix(seqSizes1, opts.border_pixels, pix_limit1) bpPerPix2 = get_bp_per_pix(seqSizes2, opts.border_pixels, pix_limit2) bpPerPix = max(bpPerPix1, bpPerPix2) warn("bp per pixel = " + str(bpPerPix)) seq_pix1, seq_starts1, width = get_pix_info(seqSizes1, bpPerPix, - opts.border_pixels, margin1) + opts.border_pixels, lMargin) seq_pix2, seq_starts2, height = get_pix_info(seqSizes2, bpPerPix, - opts.border_pixels, margin2) + opts.border_pixels, tMargin) warn("width: " + str(width)) warn("height: " + str(height)) @@ -506,13 +518,13 @@ def lastDotplot(opts, args): readRmsk(opts.rmsk1, seqLimits1), readGenePred(opts, opts.genePred1, seqLimits1), readGaps(opts, opts.gap1, seqLimits1)) - b1 = bedBoxes(beds1, seqLimits1, origins1, margin2, height, True, bpPerPix) + b1 = bedBoxes(beds1, seqLimits1, origins1, tMargin, height, True, bpPerPix) beds2 = itertools.chain(readBed(opts.bed2, seqLimits2), readRmsk(opts.rmsk2, seqLimits2), readGenePred(opts, opts.genePred2, seqLimits2), readGaps(opts, opts.gap2, seqLimits2)) - b2 = bedBoxes(beds2, seqLimits2, origins2, margin1, width, False, bpPerPix) + b2 = bedBoxes(beds2, seqLimits2, origins2, lMargin, width, False, bpPerPix) boxes = sorted(itertools.chain(b1, b2)) drawAnnotations(im, boxes) @@ -527,19 +539,22 @@ def lastDotplot(opts, args): if opts.fontsize != 0: axis1 = get_axis_image(seqLabels1, labelSizes1, seq_starts1, seq_pix1, - font, image_mode, opts) + textRot1, False, font, image_mode, opts) + if textRot1: + axis1 = axis1.transpose(Image.ROTATE_90) axis2 = get_axis_image(seqLabels2, labelSizes2, seq_starts2, seq_pix2, - font, image_mode, opts) - axis2 = axis2.transpose(Image.ROTATE_270) # !!! bug hotspot + textRot2, textRot2, font, image_mode, opts) + if not textRot2: + axis2 = axis2.transpose(Image.ROTATE_270) im.paste(axis1, (0, 0)) im.paste(axis2, (0, 0)) for i in seq_starts1[1:]: - box = i - opts.border_pixels, margin2, i, height + box = i - opts.border_pixels, tMargin, i, height im.paste(opts.border_color, box) for i in seq_starts2[1:]: - box = margin1, i - opts.border_pixels, width, i + box = lMargin, i - opts.border_pixels, width, i im.paste(opts.border_color, box) im.save(args[1]) @@ -589,6 +604,10 @@ if __name__ == "__main__": help="TrueType or OpenType font file") og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=11, help="TrueType or OpenType font size (default: %default)") + og.add_option("--rot1", metavar="ROT", default="h", + help="text rotation for the 1st genome (default=%default)") + og.add_option("--rot2", metavar="ROT", default="v", + help="text rotation for the 2nd genome (default=%default)") og.add_option("--lengths1", action="store_true", help="show sequence lengths for the 1st (horizontal) genome") og.add_option("--lengths2", action="store_true", diff --git a/scripts/last-postmask b/scripts/last-postmask index 669be10..b17f376 100755 --- a/scripts/last-postmask +++ b/scripts/last-postmask @@ -12,8 +12,16 @@ # Limitations: doesn't (yet) handle sequence quality data, # frameshifts, or generalized affine gaps. +import gzip import itertools, optparse, os, signal, sys +def myOpen(fileName): + if fileName == "-": + return sys.stdin + if fileName.endswith(".gz"): + return gzip.open(fileName) + return open(fileName) + def getScoreMatrix(rowHeads, colHeads, matrix, deleteCost, insertCost): defaultScore = min(map(min, matrix)) scoreMatrix = [[defaultScore for i in range(128)] for j in range(128)] @@ -91,15 +99,12 @@ def doOneFile(lines): if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore) def lastPostmask(args): - if args: - for i in args: - if i == "-": - doOneFile(sys.stdin) - else: - with open(i) as f: - doOneFile(f) - else: - doOneFile(sys.stdin) + if not args: + args = ["-"] + for i in args: + f = myOpen(i) + doOneFile(f) + f.close() if __name__ == "__main__": signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message diff --git a/scripts/last-train b/scripts/last-train index 7fe5947..4b360c7 100755 --- a/scripts/last-train +++ b/scripts/last-train @@ -1,11 +1,14 @@ #! /usr/bin/env python # Copyright 2015 Martin C. Frith +import gzip import math, optparse, os, random, signal, subprocess, sys, tempfile def myOpen(fileName): # faster than fileinput if fileName == "-": return sys.stdin + if fileName.endswith(".gz"): + return gzip.open(fileName) return open(fileName) def randomSample(things, sampleSize): @@ -27,29 +30,30 @@ def seqInput(fileNames): if not fileNames: fileNames = ["-"] for name in fileNames: - with myOpen(name) as f: - seqType = 0 - for line in f: - if seqType == 0: - if line[0] == ">": - seqType = 1 - seq = [] - elif line[0] == "@": - seqType = 2 - lineType = 1 - elif seqType == 1: # fasta - if line[0] == ">": - yield "".join(seq), "" - seq = [] - else: - seq.append(line.rstrip()) - elif seqType == 2: # fastq - if lineType == 1: - seq = line.rstrip() - elif lineType == 3: - yield seq, line.rstrip() - lineType = (lineType + 1) % 4 - if seqType == 1: yield "".join(seq), "" + f = myOpen(name) + seqType = 0 + for line in f: + if seqType == 0: + if line[0] == ">": + seqType = 1 + seq = [] + elif line[0] == "@": + seqType = 2 + lineType = 1 + elif seqType == 1: # fasta + if line[0] == ">": + yield "".join(seq), "" + seq = [] + else: + seq.append(line.rstrip()) + elif seqType == 2: # fastq + if lineType == 1: + seq = line.rstrip() + elif lineType == 3: + yield seq, line.rstrip() + lineType = (lineType + 1) % 4 + if seqType == 1: yield "".join(seq), "" + f.close() def isGoodChunk(chunk): for i in chunk: @@ -112,6 +116,13 @@ def getSeqSample(opts, queryFiles, outfile): x = y writeSegment(outfile, x) +def versionFromHeader(lines): + for line in lines: + words = line.split() + if "version" in words: + return words[-1] + raise Exception("couldn't read the version") + def scaleFromHeader(lines): for line in lines: for i in line.split(): @@ -329,8 +340,16 @@ def tryToMakeChildProgramsFindable(): # put srcDir first, to avoid getting older versions of LAST: os.environ["PATH"] = srcDir + os.pathsep + os.environ["PATH"] -def fixedLastalArgs(opts): - x = ["lastal", "-j7"] +def readLastalProgName(lastdbIndexName): + bitsPerInt = "32" + with open(lastdbIndexName + ".prj") as f: + for line in f: + if line.startswith("integersize="): + bitsPerInt = line.split("=")[1].strip() + return "lastal8" if bitsPerInt == "64" else "lastal" + +def fixedLastalArgs(opts, lastalProgName): + x = [lastalProgName, "-j7"] if opts.D: x.append("-D" + opts.D) if opts.E: x.append("-E" + opts.E) if opts.s: x.append("-s" + opts.s) @@ -338,14 +357,16 @@ def fixedLastalArgs(opts): if opts.C: x.append("-C" + opts.C) if opts.T: x.append("-T" + opts.T) if opts.m: x.append("-m" + opts.m) + if opts.k: x.append("-k" + opts.k) if opts.P: x.append("-P" + opts.P) if opts.Q: x.append("-Q" + opts.Q) return x def doTraining(opts, args): tryToMakeChildProgramsFindable() + lastalProgName = readLastalProgName(args[0]) scaleIncrease = 20 # while training, up-scale the scores by this amount - x = fixedLastalArgs(opts) + x = fixedLastalArgs(opts, lastalProgName) if opts.r: x.append("-r" + opts.r) if opts.q: x.append("-q" + opts.q) if opts.p: x.append("-p" + opts.p) @@ -357,6 +378,7 @@ def doTraining(opts, args): y = ["last-split", "-n"] p = subprocess.Popen(x, stdout=subprocess.PIPE) q = subprocess.Popen(y, stdin=p.stdout, stdout=subprocess.PIPE) + lastalVersion = versionFromHeader(q.stdout) externalScale = scaleFromHeader(q.stdout) internalScale = externalScale * scaleIncrease if opts.Q: @@ -364,6 +386,7 @@ def doTraining(opts, args): internalMatrix = scaledMatrix(externalMatrix, scaleIncrease) oldParameters = [] + print "# lastal version:", lastalVersion print "# maximum percent identity:", opts.pid print "# scale of score parameters:", externalScale print "# scale used while training:", internalScale @@ -390,7 +413,7 @@ def doTraining(opts, args): if parameters in oldParameters: break oldParameters.append(parameters) internalMatrix = matrixWithLetters(matScores) - x = fixedLastalArgs(opts) + x = fixedLastalArgs(opts, lastalProgName) x.append("-p-") x += args p = subprocess.Popen(x, stdin=subprocess.PIPE, stdout=subprocess.PIPE) @@ -472,6 +495,8 @@ if __name__ == "__main__": help="type of alignment: 0=local, 1=overlap (default: 0)") og.add_option("-m", metavar="COUNT", help= "maximum initial matches per query position (default: 10)") + og.add_option("-k", metavar="STEP", help="use initial matches starting at " + "every STEP-th position in each query (default: 1)") og.add_option("-P", metavar="THREADS", help="number of parallel threads") og.add_option("-Q", metavar="NUMBER", diff --git a/scripts/maf-convert b/scripts/maf-convert index 850f3b1..97e78b3 100755 --- a/scripts/maf-convert +++ b/scripts/maf-convert @@ -7,8 +7,16 @@ # Genome FAQ, not e.g. "MIRA assembly format". from itertools import * +import gzip import math, optparse, os, signal, sys +def myOpen(fileName): + if fileName == "-": + return sys.stdin + if fileName.endswith(".gz"): + return gzip.open(fileName) + return open(fileName) + def maxlen(s): return max(map(len, s)) @@ -161,7 +169,7 @@ def writeAxt(maf): head, body = ranges[0], ranges[1:] - outWords = [str(axtCounter.next())] + outWords = [str(next(axtCounter))] outWords.extend(head[:3]) for i in body: outWords.extend(i) @@ -210,6 +218,42 @@ def mafConvertToTab(opts, lines): for maf in mafInput(opts, lines): writeTab(maf) +##### Routines for converting to chain format: ##### + +def writeChain(maf): + aLine, sLines, qLines, pLines = maf + + score = "0" + for i in aLine.split(): + if i.startswith("score="): + score = i[6:] + + outWords = ["chain", score] + + for seqName, seqLen, strand, letterSize, beg, end, row in sLines: + x = seqName, str(seqLen), strand, str(beg), str(end) + outWords.extend(x) + + outWords.append(str(next(axtCounter) + 1)) + + print " ".join(outWords) + + letterSizes = [i[3] for i in sLines] + rows = [i[6] for i in sLines] + alignmentColumns = izip(*rows) + size = "0" + for i in matchAndInsertSizes(alignmentColumns, letterSizes): + if ":" in i: + print size + "\t" + i.replace(":", "\t") + size = "0" + else: + size = i + print size + +def mafConvertToChain(opts, lines): + for maf in mafInput(opts, lines): + writeChain(maf) + ##### Routines for converting to PSL format: ##### def pslBlocks(opts, mafs, outCounts): @@ -335,16 +379,17 @@ def readGroupId(readGroupItems): def readSequenceLengths(fileNames): """Read name & length of topmost sequence in each maf block.""" for i in fileNames: - with open(i) as f: - fields = None - for line in f: - if fields: - if line.isspace(): - fields = None - else: - if line[0] == "s": - fields = line.split() - yield fields[1], fields[5] + f = myOpen(i) + fields = None + for line in f: + if fields: + if line.isspace(): + fields = None + else: + if line[0] == "s": + fields = line.split() + yield fields[1], fields[5] + f.close() def naturalSortKey(s): """Return a key that sorts strings in "natural" order.""" @@ -372,11 +417,9 @@ def writeSamHeader(opts, fileNames): print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k]) if opts.dictfile: - if opts.dictfile == "-": - copyDictFile(sys.stdin) - else: - with open(opts.dictfile) as f: - copyDictFile(f) + f = myOpen(opts.dictfile) + copyDictFile(f) + f.close() if opts.readgroup: print "@RG\t" + "\t".join(opts.readgroup.split()) @@ -760,6 +803,8 @@ def mafConvertOneFile(opts, formatName, lines): mafConvertToBlast(opts, lines) elif isFormat(formatName, "blasttab"): mafConvertToBlastTab(opts, lines) + elif isFormat(formatName, "chain"): + mafConvertToChain(opts, lines) elif isFormat(formatName, "html"): mafConvertToHtml(opts, lines) elif isFormat(formatName, "psl"): @@ -787,15 +832,12 @@ def mafConvert(opts, args): if isFormat(formatName, "tabular"): opts.isKeepComments = True - if fileNames: - for i in fileNames: - if i == "-": - mafConvertOneFile(opts, formatName, sys.stdin) - else: - with open(i) as f: - mafConvertOneFile(opts, formatName, f) - else: - mafConvertOneFile(opts, formatName, sys.stdin) + if not fileNames: + fileNames = ["-"] + for i in fileNames: + f = myOpen(i) + mafConvertOneFile(opts, formatName, f) + f.close() if not opts.noheader: if isFormat(formatName, "html"): @@ -809,6 +851,7 @@ if __name__ == "__main__": %prog axt mafFile(s) %prog blast mafFile(s) %prog blasttab mafFile(s) + %prog chain mafFile(s) %prog html mafFile(s) %prog psl mafFile(s) %prog sam mafFile(s) diff --git a/src/Alphabet.cc b/src/Alphabet.cc index 961b1f6..d0d2d2b 100644 --- a/src/Alphabet.cc +++ b/src/Alphabet.cc @@ -51,14 +51,6 @@ char* Alphabet::rtCopy( const uchar* beg, const uchar* end, char* dest ) const{ return dest; } -void Alphabet::rc( uchar* beg, uchar* end ) const{ - std::reverse( beg, end ); - - for( /* noop */; beg < end; ++beg ){ - *beg = complement[ *beg ]; - } -} - void Alphabet::init(){ for( std::string::iterator i = letters.begin(); i < letters.end(); ++i ) *i = std::toupper( *i ); diff --git a/src/Alphabet.hh b/src/Alphabet.hh index 0ec118c..540c884 100644 --- a/src/Alphabet.hh +++ b/src/Alphabet.hh @@ -42,9 +42,6 @@ struct Alphabet{ // return the position after the last written position in dest char* rtCopy( const uchar* beg, const uchar* end, char* dest ) const; - // reverse and complement a sequence of numbers, in place - void rc( uchar* beg, uchar* end ) const; - std::string letters; // the "proper" letters, e.g. ACGT for DNA unsigned size; // same as letters.size(): excludes delimiters uchar encode[capacity]; // translate ASCII letters to codes (small integers) diff --git a/src/LastEvaluer.cc b/src/LastEvaluer.cc index b1951ca..72b1dae 100644 --- a/src/LastEvaluer.cc +++ b/src/LastEvaluer.cc @@ -367,7 +367,7 @@ void LastEvaluer::init(const char *matrixName, 0, maxMegabytes, randomSeed, t); break; } catch (const Sls::error& e) { - if (i == 10) throw; + if (i == 20) throw; } } } else { diff --git a/src/LastalArguments.cc b/src/LastalArguments.cc index 8d389ff..995f2e1 100644 --- a/src/LastalArguments.cc +++ b/src/LastalArguments.cc @@ -41,6 +41,18 @@ static char parseOutputFormat( const char* text ){ return 0; } +static int parseScoreOrPercent(const std::string &s) { + int x; + std::istringstream iss(s); + if(!(iss >> x) || x < 0) ERR("bad value: " + s); + char c; + if (iss >> c) { + if (c != '%' || iss >> c) ERR("bad value: " + s); + x = -x; // negative value indicates percentage (kludge) + } + return x; +} + namespace cbrc{ LastalArguments::LastalArguments() : @@ -66,7 +78,7 @@ LastalArguments::LastalArguments() : gapPairCost(-1), // this means: OFF frameshiftCost(-1), // this means: ordinary, non-translated alignment matrixFile(""), - maxDropGapped(-1), // depends on minScoreGapped & maxDropGapless + maxDropGapped(-100), // depends on minScoreGapped & maxDropGapless maxDropGapless(-1), // depends on the score matrix maxDropFinal(-1), // depends on maxDropGapped inputFormat(sequenceFormat::fasta), @@ -116,8 +128,8 @@ Score options (default settings):\n\ -B: insertion extension cost (b)\n\ -c: unaligned residue pair cost (off)\n\ -F: frameshift cost (off)\n\ --x: maximum score drop for gapped alignments (max[y, e-1])\n\ --y: maximum score drop for gapless alignments (t*10)\n\ +-x: maximum score drop for gapped alignments (e-1)\n\ +-y: maximum score drop for gapless alignments (min[t*10, x])\n\ -z: maximum score drop for final gapped alignments (x)\n\ -d: minimum score for gapless alignments (min[e, t*ln(1000*refSize/n)])\n\ -e: minimum score for gapped alignments\n\ @@ -222,8 +234,7 @@ LAST home page: http://last.cbrc.jp/\n\ if( frameshiftCost < 0 ) badopt( c, optarg ); break; case 'x': - unstringify( maxDropGapped, optarg ); - if( maxDropGapped < 0 ) badopt( c, optarg ); + maxDropGapped = parseScoreOrPercent(optarg); break; case 'y': unstringify( maxDropGapless, optarg ); @@ -525,13 +536,17 @@ To proceed without E-values, set a score threshold with option -e."); if( temperature < 0 ) temperature = 1 / lambda; + if( maxDropGapped < 0 ){ + int percent = -maxDropGapped; + maxDropGapped = std::max( minScoreGapped - 1, 0 ); + maxDropGapped = std::max( maxDropGapped, maxDropGapless ); + if (percent != 100) maxDropGapped = maxDropGapped * percent / 100; + } + if( maxDropGapless < 0 ){ // should it depend on temperature or lambda? if( temperature < 0 ) maxDropGapless = 0; // shouldn't happen else maxDropGapless = int( 10.0 * temperature + 0.5 ); - } - - if( maxDropGapped < 0 ){ - maxDropGapped = std::max( minScoreGapped - 1, maxDropGapless ); + maxDropGapless = std::min( maxDropGapless, maxDropGapped ); } if( maxDropFinal < 0 ) maxDropFinal = maxDropGapped; diff --git a/src/MultiSequence.cc b/src/MultiSequence.cc index f9b0077..1f1d05a 100644 --- a/src/MultiSequence.cc +++ b/src/MultiSequence.cc @@ -58,13 +58,6 @@ void MultiSequence::toFiles( const std::string& baseName ) const{ baseName + ".qua" ); } -void MultiSequence::addName( std::string& name ){ - names.v.insert( names.v.end(), name.begin(), name.end() ); - nameEnds.v.push_back( names.v.size() ); - if( nameEnds.v.back() < names.v.size() ) - throw std::runtime_error("the sequence names are too long"); -} - std::istream& MultiSequence::readFastaName( std::istream& stream ){ std::string line, word; getline( stream, line ); diff --git a/src/MultiSequence.hh b/src/MultiSequence.hh index df388dc..3876058 100644 --- a/src/MultiSequence.hh +++ b/src/MultiSequence.hh @@ -17,10 +17,11 @@ namespace cbrc{ +typedef unsigned char uchar; + class MultiSequence{ public: typedef LAST_INT_TYPE indexT; - typedef unsigned char uchar; // initialize with leftmost delimiter pad, ready for appending sequences void initForAppending( indexT padSizeIn ); @@ -130,13 +131,67 @@ class MultiSequence{ // read the letters above PSSM columns, so we know which column is which std::istream& readPssmHeader( std::istream& stream ); - // add a new sequence name - void addName( std::string& name ); + void finishName() { // finish adding a sequence name: store its end coord + nameEnds.v.push_back(names.v.size()); + if (nameEnds.v.back() < names.v.size()) { + throw std::runtime_error("the sequence names are too long"); + } + } + + void addName(const std::string &name) { // add a new sequence name + names.v.insert(names.v.end(), name.begin(), name.end()); + finishName(); + } + + void finishQual() { // add delimiter to the end of the quality scores + uchar padQualityScore = 64; // should never be used, but a valid value + size_t s = padSize * qualityScoresPerLetter; + qualityScores.v.insert(qualityScores.v.end(), s, padQualityScore); + } + + void finishPssm() { // add delimiter to the end of the PSSM + pssm.insert(pssm.end(), padSize * scoreMatrixRowSize, -INF); + } // can we finish the last sequence and stay within the memory limit? bool isFinishable( indexT maxSeqLen ) const; }; +inline void reverseComplementPssm(ScoreMatrixRow *beg, ScoreMatrixRow *end, + const uchar *complement) { + while (beg < end) { + --end; + for (unsigned i = 0; i < scoreMatrixRowSize; ++i) { + unsigned j = complement[i]; + if (beg < end || i < j) std::swap((*beg)[i], (*end)[j]); + } + ++beg; + } +} + +inline void reverseComplementOneSequence(MultiSequence &m, + const uchar *complement, + size_t seqNum) { + size_t b = m.seqBeg(seqNum); + size_t e = m.seqEnd(seqNum); + + uchar *s = m.seqWriter(); + std::reverse(s + b, s + e); + for (size_t i = b; i < e; ++i) { + s[i] = complement[s[i]]; + } + + size_t qpl = m.qualsPerLetter(); + if (qpl) { + std::reverse(m.qualityWriter() + b * qpl, m.qualityWriter() + e * qpl); + } + + ScoreMatrixRow *p = m.pssmWriter(); + if (p) { + reverseComplementPssm(p + b, p + e, complement); + } +} + // Divide the sequences into a given number of roughly-equally-sized // chunks, and return the first sequence in the Nth chunk. inline size_t firstSequenceInChunk(const MultiSequence &m, @@ -152,5 +207,6 @@ inline size_t firstSequenceInChunk(const MultiSequence &m, return (begDistance < endDistance) ? seqNum : seqNum + 1; } -} // end namespace cbrc -#endif // MULTISEQUENCE_HH +} + +#endif diff --git a/src/MultiSequenceQual.cc b/src/MultiSequenceQual.cc index 8e33f84..415fa14 100644 --- a/src/MultiSequenceQual.cc +++ b/src/MultiSequenceQual.cc @@ -15,12 +15,9 @@ using namespace cbrc; std::istream& MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){ - const uchar padQualityScore = 64; // should never be used, but a valid value - // initForAppending: qualityScoresPerLetter = 1; - if( qualityScores.v.empty() ) - qualityScores.v.insert( qualityScores.v.end(), padSize, padQualityScore ); + if( qualityScores.v.empty() ) finishQual(); // reinitForAppending: if( qualityScores.v.size() > seq.v.size() ) @@ -51,7 +48,7 @@ MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){ if( isFinishable(maxSeqLen) ){ finish(); - qualityScores.v.insert( qualityScores.v.end(), padSize, padQualityScore ); + finishQual(); } return stream; @@ -60,15 +57,11 @@ MultiSequence::appendFromFastq( std::istream& stream, indexT maxSeqLen ){ std::istream& MultiSequence::appendFromPrb( std::istream& stream, indexT maxSeqLen, unsigned alphSize, const uchar decode[] ){ - const uchar padQualityScore = 64; // should never be used, but a valid value - size_t qualPadSize = padSize * alphSize; size_t qualSize = seq.v.size() * alphSize; // initForAppending: qualityScoresPerLetter = alphSize; - if( qualityScores.v.empty() ) - qualityScores.v.insert( qualityScores.v.end(), qualPadSize, - padQualityScore ); + if( qualityScores.v.empty() ) finishQual(); // reinitForAppending: if( qualityScores.v.size() > qualSize ) @@ -104,8 +97,7 @@ MultiSequence::appendFromPrb( std::istream& stream, indexT maxSeqLen, if( isFinishable(maxSeqLen) ){ finish(); - qualityScores.v.insert( qualityScores.v.end(), qualPadSize, - padQualityScore ); + finishQual(); } return stream; @@ -148,12 +140,10 @@ std::istream& MultiSequence::appendFromPssm( std::istream& stream, indexT maxSeqLen, const uchar* lettersToNumbers, bool isMaskLowercase ){ - size_t pssmPadSize = padSize * scoreMatrixRowSize; size_t pssmSize = seq.v.size() * scoreMatrixRowSize; // initForAppending: - if( pssm.empty() ) - pssm.insert( pssm.end(), pssmPadSize, -INF ); + if( pssm.empty() ) finishPssm(); // reinitForAppending: if( pssm.size() > pssmSize ) @@ -200,7 +190,7 @@ MultiSequence::appendFromPssm( std::istream& stream, indexT maxSeqLen, if( isFinishable(maxSeqLen) ){ finish(); - pssm.insert( pssm.end(), pssmPadSize, -INF ); + finishPssm(); } if( !stream.bad() ) stream.clear(); diff --git a/src/io.cc b/src/io.cc index 0c8d3d3..f1abcce 100644 --- a/src/io.cc +++ b/src/io.cc @@ -6,22 +6,15 @@ namespace cbrc{ -std::istream& openIn( const std::string& fileName, std::ifstream& ifs ){ +std::istream& openIn( const std::string& fileName, mcf::izstream& ifs ){ if( fileName == "-" ) return std::cin; ifs.open( fileName.c_str() ); if( !ifs ) throw std::runtime_error("can't open file: " + fileName); return ifs; } -std::ostream& openOut( const std::string& fileName, std::ofstream& ofs ){ - if( fileName == "-" ) return std::cout; - ofs.open( fileName.c_str() ); - if( !ofs ) throw std::runtime_error("can't open file: " + fileName); - return ofs; -} - std::string slurp( const std::string& fileName ){ - std::ifstream inFileStream; + mcf::izstream inFileStream; std::istream& in = openIn( fileName, inFileStream ); std::istreambuf_iterator<char> beg(in), end; return std::string( beg, end ); diff --git a/src/io.hh b/src/io.hh index 9162898..c08cd8c 100644 --- a/src/io.hh +++ b/src/io.hh @@ -6,6 +6,8 @@ #ifndef IO_H #define IO_H +#include "mcf_zstream.hh" + #include <vector> #include <string> #include <stdexcept> @@ -15,10 +17,7 @@ namespace cbrc{ // open an input file, but if the name is "-", just return cin -std::istream& openIn( const std::string& fileName, std::ifstream& ifs ); - -// open an output file, but if the name is "-", just return cout -std::ostream& openOut( const std::string& fileName, std::ofstream& ofs ); +std::istream& openIn( const std::string& fileName, mcf::izstream& ifs ); // read a file into a string, but if the name is "-", read cin std::string slurp( const std::string& fileName ); diff --git a/src/last-pair-probs.cc b/src/last-pair-probs.cc index 5d4436d..21adc33 100644 --- a/src/last-pair-probs.cc +++ b/src/last-pair-probs.cc @@ -616,7 +616,7 @@ void lastPairProbs(LastPairProbsOptions& opts) { size_t n = inputs.size(); if (!opts.isFraglen || !opts.isSdev) { - std::ifstream inFile1, inFile2; + mcf::izstream inFile1, inFile2; std::istream& in1 = (n > 0) ? cbrc::openIn(inputs[0], inFile1) : std::cin; std::istream& in2 = (n > 1) ? cbrc::openIn(inputs[1], inFile2) : in1; if (n < 2) skipOneBatchMarker(in1); @@ -626,7 +626,7 @@ void lastPairProbs(LastPairProbsOptions& opts) { } if (!opts.estdist) { - std::ifstream inFile1, inFile2; + mcf::izstream inFile1, inFile2; std::istream& in1 = (n > 0) ? cbrc::openIn(inputs[0], inFile1) : std::cin; std::istream& in2 = (n > 1) ? cbrc::openIn(inputs[1], inFile2) : in1; AlignmentParameters params1 = readHeaderOrDie(in1); diff --git a/src/lastal.cc b/src/lastal.cc index 0e90c61..47bff6e 100644 --- a/src/lastal.cc +++ b/src/lastal.cc @@ -950,42 +950,16 @@ void translateAndScan( LastAligner& aligner, size_t queryNum, char strand ){ cullFinalAlignments( aligner.textAlns, oldNumOfAlns ); } -static void reverseComplementPssm( size_t queryNum ){ - ScoreMatrixRow* beg = query.pssmWriter() + query.seqBeg(queryNum); - ScoreMatrixRow* end = query.pssmWriter() + query.seqEnd(queryNum); - - while( beg < end ){ - --end; - for( unsigned i = 0; i < scoreMatrixRowSize; ++i ){ - unsigned j = queryAlph.complement[i]; - if( beg < end || i < j ) std::swap( (*beg)[i], (*end)[j] ); - } - ++beg; - } -} - -static void reverseComplementQuery( size_t queryNum ){ - size_t b = query.seqBeg(queryNum); - size_t e = query.seqEnd(queryNum); - queryAlph.rc( query.seqWriter() + b, query.seqWriter() + e ); - if( isQuality( args.inputFormat ) ){ - std::reverse( query.qualityWriter() + b * query.qualsPerLetter(), - query.qualityWriter() + e * query.qualsPerLetter() ); - }else if( args.inputFormat == sequenceFormat::pssm ){ - reverseComplementPssm(queryNum); - } -} - static void alignOneQuery(LastAligner &aligner, size_t queryNum, bool isFirstVolume) { if (args.strand == 2 && !isFirstVolume) - reverseComplementQuery(queryNum); + reverseComplementOneSequence(query, queryAlph.complement, queryNum); if (args.strand != 0) translateAndScan(aligner, queryNum, '+'); if (args.strand == 2 || (args.strand == 0 && isFirstVolume)) - reverseComplementQuery(queryNum); + reverseComplementOneSequence(query, queryAlph.complement, queryNum); if (args.strand != 1) translateAndScan(aligner, queryNum, '-'); @@ -1251,7 +1225,7 @@ void lastal( int argc, char** argv ){ char** inputBegin = argv + args.inputStart; for( char** i = *inputBegin ? inputBegin : defaultInput; *i; ++i ){ - std::ifstream inFileStream; + mcf::izstream inFileStream; std::istream& in = openIn( *i, inFileStream ); LOG( "reading " << *i << "..." ); diff --git a/src/lastdb.cc b/src/lastdb.cc index 085d9d3..daea418 100644 --- a/src/lastdb.cc +++ b/src/lastdb.cc @@ -180,12 +180,21 @@ static void preprocessSeqs(MultiSequence &multi, // Make one database volume, from one batch of sequences void makeVolume( std::vector< CyclicSubsetSeed >& seeds, MultiSequence& multi, const LastdbArguments& args, - const Alphabet& alph, const std::vector<countT>& letterCounts, + const Alphabet& alph, std::vector<countT>& letterCounts, const TantanMasker& masker, unsigned numOfThreads, const std::string& seedText, const std::string& baseName ){ size_t numOfIndexes = seeds.size(); size_t numOfSequences = multi.finishedSequences(); size_t textLength = multi.finishedSize(); + const uchar* seq = multi.seqReader(); + + std::vector<countT> letterCountsInThisVolume(alph.size); + alph.count(seq, seq + textLength, &letterCountsInThisVolume[0]); + for (unsigned c = 0; c < alph.size; ++c) { + letterCounts[c] += letterCountsInThisVolume[c]; + } + + if (args.isCountsOnly) return; if( args.tantanSetting ){ LOG( "masking..." ); @@ -194,9 +203,8 @@ void makeVolume( std::vector< CyclicSubsetSeed >& seeds, LOG( "writing..." ); writePrjFile( baseName + ".prj", args, alph, numOfSequences, - letterCounts, -1, numOfIndexes, seedText ); + letterCountsInThisVolume, -1, numOfIndexes, seedText ); multi.toFiles( baseName ); - const uchar* seq = multi.seqReader(); for( unsigned x = 0; x < numOfIndexes; ++x ){ SubsetSuffixArray myIndex; @@ -244,12 +252,11 @@ static indexT maxLettersPerVolume( const LastdbArguments& args, return z; } -// Read the next sequence, adding it to the MultiSequence and the SuffixArray +// Read the next sequence, adding it to the MultiSequence std::istream& -appendFromFasta( MultiSequence& multi, unsigned numOfIndexes, +appendFromFasta( MultiSequence& multi, indexT maxSeqLen, const LastdbArguments& args, const Alphabet& alph, std::istream& in ){ - indexT maxSeqLen = maxLettersPerVolume( args, numOfIndexes ); if( multi.finishedSequences() == 0 ) maxSeqLen = indexT(-1); size_t oldSize = multi.unfinishedSize(); @@ -303,18 +310,18 @@ void lastdb( int argc, char** argv ){ unsigned volumeNumber = 0; countT sequenceCount = 0; std::vector<countT> letterCounts( alph.size ); - std::vector<countT> letterTotals( alph.size ); + indexT maxSeqLen = maxLettersPerVolume( args, seeds.size() ); char defaultInputName[] = "-"; char* defaultInput[] = { defaultInputName, 0 }; char** inputBegin = argv + args.inputStart; for( char** i = *inputBegin ? inputBegin : defaultInput; *i; ++i ){ - std::ifstream inFileStream; + mcf::izstream inFileStream; std::istream& in = openIn( *i, inFileStream ); LOG( "reading " << *i << "..." ); - while( appendFromFasta( multi, seeds.size(), args, alph, in ) ){ + while( appendFromFasta( multi, maxSeqLen, args, alph, in ) ){ if( !args.isProtein && args.userAlphabet.empty() && sequenceCount == 0 && isDubiousDna( alph, multi ) ){ std::cerr << args.programName << ": that's some funny-lookin DNA\n"; @@ -322,30 +329,18 @@ void lastdb( int argc, char** argv ){ if( multi.isFinished() ){ ++sequenceCount; - const uchar* seq = multi.seqReader(); - size_t lastSeq = multi.finishedSequences() - 1; - size_t beg = multi.seqBeg( lastSeq ); - size_t end = multi.seqEnd( lastSeq ); - alph.count( seq + beg, seq + end, &letterCounts[0] ); - if( args.isCountsOnly ){ - // memory-saving, which seems to be important on 32-bit systems: - multi.reinitForAppending(); - } } else{ std::string baseName = args.lastdbName + stringify(volumeNumber++); makeVolume( seeds, multi, args, alph, letterCounts, tantanMasker, numOfThreads, seedText, baseName ); - for( unsigned c = 0; c < alph.size; ++c ) - letterTotals[c] += letterCounts[c]; - letterCounts.assign( alph.size, 0 ); multi.reinitForAppending(); } } } if( multi.finishedSequences() > 0 ){ - if( volumeNumber == 0 ){ + if( volumeNumber == 0 && !args.isCountsOnly ){ makeVolume( seeds, multi, args, alph, letterCounts, tantanMasker, numOfThreads, seedText, args.lastdbName ); return; @@ -355,10 +350,8 @@ void lastdb( int argc, char** argv ){ tantanMasker, numOfThreads, seedText, baseName ); } - for( unsigned c = 0; c < alph.size; ++c ) letterTotals[c] += letterCounts[c]; - writePrjFile( args.lastdbName + ".prj", args, alph, sequenceCount, - letterTotals, volumeNumber, seeds.size(), seedText ); + letterCounts, volumeNumber, seeds.size(), seedText ); } int main( int argc, char** argv ) diff --git a/src/makefile b/src/makefile index df3581a..de49694 100644 --- a/src/makefile +++ b/src/makefile @@ -1,6 +1,3 @@ -CXX = g++ -CC = gcc - CXXFLAGS = -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum -Wundef \ -Wcast-align -pedantic -g -std=c++11 -pthread -DHAS_CXX_THREADS # -Wconversion @@ -59,19 +56,19 @@ all: $(ALL) indexAllObj4 = $(indexObj0) $(indexObj4) lastdb: $(indexAllObj4) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4) -lz indexAllObj8 = $(indexObj0) $(indexObj8) lastdb8: $(indexAllObj8) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8) -lz alignAllObj4 = $(alignObj0) $(alignObj4) lastal: $(alignAllObj4) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4) -lz alignAllObj8 = $(alignObj0) $(alignObj8) lastal8: $(alignAllObj8) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8) -lz splitAllObj4 = $(splitObj0) $(splitObj4) last-split: $(splitAllObj4) @@ -82,7 +79,7 @@ last-split8: $(splitAllObj8) $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj8) last-pair-probs: $(PPOBJ) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ) -lz last-merge-batches: $(MBOBJ) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(MBOBJ) @@ -151,7 +148,7 @@ Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \ ScoreMatrixRow.hh GeneralizedAffineGapCosts.hh SegmentPair.hh \ OneQualityScoreMatrix.hh GappedXdropAlignerInl.hh CyclicSubsetSeed.o CyclicSubsetSeed.o8: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \ - CyclicSubsetSeedData.hh io.hh stringify.hh + CyclicSubsetSeedData.hh io.hh mcf_zstream.hh stringify.hh DiagonalTable.o DiagonalTable.o8: DiagonalTable.cc DiagonalTable.hh GappedXdropAligner.o GappedXdropAligner.o8: GappedXdropAligner.cc GappedXdropAligner.hh \ ScoreMatrixRow.hh GappedXdropAlignerInl.hh @@ -179,7 +176,7 @@ LastalArguments.o LastalArguments.o8: LastalArguments.cc LastalArguments.hh \ LastdbArguments.o LastdbArguments.o8: LastdbArguments.cc LastdbArguments.hh \ SequenceFormat.hh stringify.hh getoptUtil.hh version.hh MultiSequence.o MultiSequence.o8: MultiSequence.cc MultiSequence.hh ScoreMatrixRow.hh \ - VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh + VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh io.hh mcf_zstream.hh MultiSequenceQual.o MultiSequenceQual.o8: MultiSequenceQual.cc MultiSequence.hh \ ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \ @@ -187,14 +184,15 @@ OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \ stringify.hh QualityPssmMaker.o QualityPssmMaker.o8: QualityPssmMaker.cc QualityPssmMaker.hh \ ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh -ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh +ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh io.hh \ + mcf_zstream.hh SegmentPair.o SegmentPair.o8: SegmentPair.cc SegmentPair.hh SegmentPairPot.o SegmentPairPot.o8: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh SubsetMinimizerFinder.o SubsetMinimizerFinder.o8: SubsetMinimizerFinder.cc \ SubsetMinimizerFinder.hh CyclicSubsetSeed.hh SubsetSuffixArray.o SubsetSuffixArray.o8: SubsetSuffixArray.cc SubsetSuffixArray.hh \ CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \ - SubsetMinimizerFinder.hh io.hh + SubsetMinimizerFinder.hh io.hh mcf_zstream.hh SubsetSuffixArraySearch.o SubsetSuffixArraySearch.o8: SubsetSuffixArraySearch.cc \ SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \ fileMap.hh stringify.hh @@ -211,11 +209,11 @@ gaplessPssmXdrop.o gaplessPssmXdrop.o8: gaplessPssmXdrop.cc gaplessPssmXdrop.hh gaplessTwoQualityXdrop.o gaplessTwoQualityXdrop.o8: gaplessTwoQualityXdrop.cc \ gaplessTwoQualityXdrop.hh TwoQualityScoreMatrix.hh ScoreMatrixRow.hh gaplessXdrop.o gaplessXdrop.o8: gaplessXdrop.cc gaplessXdrop.hh ScoreMatrixRow.hh -io.o io.o8: io.cc io.hh +io.o io.o8: io.cc io.hh mcf_zstream.hh last-pair-probs-main.o last-pair-probs-main.o8: last-pair-probs-main.cc last-pair-probs.hh \ stringify.hh version.hh last-pair-probs.o last-pair-probs.o8: last-pair-probs.cc last-pair-probs.hh io.hh \ - stringify.hh + mcf_zstream.hh stringify.hh lastal.o lastal.o8: lastal.cc LastalArguments.hh SequenceFormat.hh \ QualityPssmMaker.hh ScoreMatrixRow.hh OneQualityScoreMatrix.hh \ TwoQualityScoreMatrix.hh qualityScoreUtil.hh stringify.hh \ @@ -228,12 +226,12 @@ lastal.o lastal.o8: lastal.cc LastalArguments.hh SequenceFormat.hh \ SegmentPairPot.hh ScoreMatrix.hh Alphabet.hh MultiSequence.hh \ TantanMasker.hh tantan.hh DiagonalTable.hh GreedyXdropAligner.hh \ gaplessXdrop.hh gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh io.hh \ - threadUtil.hh version.hh + mcf_zstream.hh threadUtil.hh version.hh lastdb.o lastdb.o8: lastdb.cc LastdbArguments.hh SequenceFormat.hh \ SubsetSuffixArray.hh CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh \ fileMap.hh stringify.hh Alphabet.hh MultiSequence.hh ScoreMatrixRow.hh \ - TantanMasker.hh tantan.hh io.hh qualityScoreUtil.hh threadUtil.hh \ - version.hh + TantanMasker.hh tantan.hh io.hh mcf_zstream.hh qualityScoreUtil.hh \ + threadUtil.hh version.hh tantan.o tantan.o8: tantan.cc tantan.hh last-merge-batches.o: last-merge-batches.c version.hh alp/njn_dynprogprob.o: alp/njn_dynprogprob.cpp alp/njn_dynprogprob.hpp \ diff --git a/src/mcf_zstream.hh b/src/mcf_zstream.hh new file mode 100644 index 0000000..fd527c5 --- /dev/null +++ b/src/mcf_zstream.hh @@ -0,0 +1,82 @@ +// Copyright 2017 Martin C. Frith + +// mcf::izstream is similar to std::ifstream. The difference is, if +// you give it a gzip-compressed file, it will decompress what it +// reads. + +#ifndef MCF_ZSTREAM +#define MCF_ZSTREAM + +#include <zlib.h> + +#include <cstdio> // BUFSIZ +#include <istream> +#include <streambuf> + +namespace mcf { + +class zbuf : public std::streambuf { +public: + zbuf() : input(0) {} + + ~zbuf() { close(); } + + bool is_open() const { return input; } + + zbuf *open(const char *fileName) { + if (is_open()) return 0; + input = gzopen(fileName, "rb"); + if (!is_open()) return 0; + return this; + } + + zbuf *close() { + if (!is_open()) return 0; + int e = gzclose(input); + input = 0; + return (e == Z_OK || e == Z_BUF_ERROR) ? this : 0; + } + +protected: + int underflow() { + if (gptr() == egptr()) { + int size = gzread(input, buffer, BUFSIZ); + if (size < 0) throw std::ios_base::failure("gzread error"); + setg(buffer, buffer, buffer + size); + } + return (gptr() == egptr()) ? + traits_type::eof() : traits_type::to_int_type(*gptr()); + } + +private: + gzFile input; + char buffer[BUFSIZ]; +}; + +class izstream : public std::istream { +public: + izstream() : std::istream(&buf) {} + + izstream(const char *fileName) : std::istream(&buf) { + open(fileName); + } + + bool is_open() const { return buf.is_open(); } + + void open(const char *fileName) { + // do something special if fileName is "-"? + if (!buf.open(fileName)) setstate(failbit); + else clear(); + } + + void close() { + if (!buf.close()) setstate(failbit); + } + +private: + zbuf buf; +}; + +} + +#endif diff --git a/src/version.hh b/src/version.hh index 7104b42..0fe2c52 100644 --- a/src/version.hh +++ b/src/version.hh @@ -1 +1 @@ -"869" +"885" -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
