This is an automated email from the git hooks/post-receive script. plessy pushed a commit to branch debian/unstable in repository samtools.
commit 399b28378ebd22623ab4b7a48b519b1f3a1da02b Author: Petr Danecek <[email protected]> Date: Thu Nov 28 11:23:43 2013 +0000 New "flags" command plus moved filtering from htslib/pileup to BAM reading callbacks --- Makefile | 3 ++- bam2depth.c | 15 +++++++---- bam_flags.c | 43 +++++++++++++++++++++++++++++++ bam_plcmd.c | 83 ++++++++++++++++++++++++++++++++++-------------------------- bamtk.c | 5 +++- bedcov.c | 13 +++++++--- cut_target.c | 27 ++++++++++++-------- phase.c | 23 ++++++++++------- 8 files changed, 146 insertions(+), 66 deletions(-) diff --git a/Makefile b/Makefile index bf5557f..e208ac5 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ AOBJS= bam_index.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o \ - faidx.o stats.o + faidx.o stats.o bam_flags.o # tview todo: bam_tview.o bam_tview_curses.o bam_tview_html.o bam_lpileup.o INCLUDES= -I. -I$(HTSDIR) LIBCURSES= -lcurses # -lXCurses @@ -121,6 +121,7 @@ bam_stat.o: bam_stat.c $(bam_h) bam_tview.o: bam_tview.c $(bam_tview_h) bam_tview_curses.o: bam_tview_curses.c $(bam_tview_h) bam_tview_html.o: bam_tview_html.c $(bam_tview_h) +bam_flags.o: bam_flags.c $(sam_h) bamshuf.o: bamshuf.c $(htslib_sam_h) $(HTSDIR)/htslib/ksort.h bamtk.o: bamtk.c $(bam_h) version.h samtools.h bedcov.o: bedcov.c $(htslib_bgzf_h) $(bam_h) $(HTSDIR)/htslib/kseq.h diff --git a/bam2depth.c b/bam2depth.c index 6fd023a..c884741 100644 --- a/bam2depth.c +++ b/bam2depth.c @@ -24,11 +24,16 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if c static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure - int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); - if (!(b->core.flag&BAM_FUNMAP)) { - if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; - else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP; - } + int ret; + while (1) + { + ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + if ( ret<0 ) break; + if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; + if ( (int)b->core.qual < aux->min_mapQ ) continue; + if ( aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len ) continue; + break; + } return ret; } diff --git a/bam_flags.c b/bam_flags.c new file mode 100644 index 0000000..f2d0be3 --- /dev/null +++ b/bam_flags.c @@ -0,0 +1,43 @@ +#include <ctype.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <stdarg.h> +#include <htslib/sam.h> + +static void usage(void) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "About: Convert between textual and numeric flag representation\n"); + fprintf(stderr, "Usage: samtools flags [INT|STR]\n"); + fprintf(stderr, "Flags:\n"); + fprintf(stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); + fprintf(stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the alig\n", BAM_FPROPER_PAIR); + fprintf(stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); + fprintf(stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); + fprintf(stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE); + fprintf(stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); + fprintf(stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); + fprintf(stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); + fprintf(stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY); + fprintf(stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL); + fprintf(stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); + fprintf(stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY); + fprintf(stderr, "\n"); +} + + +int main_flags(int argc, char *argv[]) +{ + if ( argc!=2 ) usage(); + else + { + int mask = bam_str2flag(argv[1]); + if ( mask<0 ) { fprintf(stderr,"Could not parse \"%s\"\n", argv[1]); return 1; } + printf("0x%x\t%s\n", mask, bam_flag2str(mask)); + } + return 0; +} + diff --git a/bam_plcmd.c b/bam_plcmd.c index eb56253..efdf4ab 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -592,6 +592,7 @@ int bam_mpileup(int argc, char *argv[]) mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; + mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; static struct option lopts[] = { {"rf",1,0,1}, // require flag @@ -601,8 +602,14 @@ int bam_mpileup(int argc, char *argv[]) while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:vx",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; - case 1 : mplp.rflag_require = strtol(optarg,0,0); break; - case 2 : mplp.rflag_filter = strtol(optarg,0,0); break; + case 1 : + mplp.rflag_require = bam_str2flag(optarg); + if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } + break; + case 2 : + mplp.rflag_filter = bam_str2flag(optarg); + if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } + break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; @@ -657,47 +664,51 @@ int bam_mpileup(int argc, char *argv[]) } } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; - if (argc == 1) { + if (argc == 1) + { + char *tmp_require = bam_flag2str(mplp.rflag_require); + char *tmp_filter = bam_flag2str(mplp.rflag_filter); fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); fprintf(stderr, "Input options:\n\n"); - fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); - fprintf(stderr, " -A count anomalous read pairs\n"); - fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n"); - fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); - fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); - fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n"); - fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); - fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); - fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); - fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); - fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); - fprintf(stderr, " -R ignore RG tags\n"); - fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); - fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); - fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n"); - fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n"); - fprintf(stderr, " -x disable read-pair overlap detection\n"); + fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); + fprintf(stderr, " -A count anomalous read pairs\n"); + fprintf(stderr, " -B disable BAQ computation\n"); + fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n"); + fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); + fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); + fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n"); + fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); + fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); + fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); + fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); + fprintf(stderr, " -R ignore RG tags\n"); + fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); + fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, " --rf STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); + fprintf(stderr, " --ff STR|INT filter flags: skip reads with mask bits set [%s]\n", tmp_filter); + fprintf(stderr, " -x disable read-pair overlap detection\n"); fprintf(stderr, "\nOutput options:\n\n"); - fprintf(stderr, " -D/V output per-sample DP/DV in BCF (requires -g/-v)\n"); - fprintf(stderr, " -g/v generate BCF/VCF output (genotype likelihoods)\n"); - fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); - fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); - fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); - fprintf(stderr, " -u generate uncompressed BCF/VCF output\n"); + fprintf(stderr, " -D/V output per-sample DP/DV in BCF (requires -g/-v)\n"); + fprintf(stderr, " -g/v generate BCF/VCF output (genotype likelihoods)\n"); + fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); + fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); + fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); + fprintf(stderr, " -u generate uncompressed BCF/VCF output\n"); fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); - fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); - fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); - fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); - fprintf(stderr, " -I do not perform indel calling\n"); - fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); - fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); - fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); - fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n"); - fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); + fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); + fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); + fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); + fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); + fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); + fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n"); + fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); + free(tmp_require); free(tmp_filter); return 1; } int ret; diff --git a/bamtk.c b/bamtk.c index cdf69a0..9c0dbe7 100644 --- a/bamtk.c +++ b/bamtk.c @@ -29,6 +29,7 @@ int main_pad2unpad(int argc, char *argv[]); int main_bedcov(int argc, char *argv[]); int main_bamshuf(int argc, char *argv[]); int main_stats(int argc, char *argv[]); +int main_flags(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); @@ -71,7 +72,8 @@ static void usage(FILE *fp) " phase phase heterozygotes\n" " stats generate stats (former bamcheck)\n" " -- viewing\n" -" tview text alignment viewer (todo)\n" +" flags explain BAM flags\n" +" tview text alignment viewer\n" " view SAM<->BAM conversion\n" //" depad convert padded BAM to unpadded BAM\n" // not stable "\n"); @@ -127,6 +129,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1); else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1); else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); + else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) { fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); return 1; diff --git a/bedcov.c b/bedcov.c index 5fb9b37..8c143c1 100644 --- a/bedcov.c +++ b/bedcov.c @@ -19,9 +19,16 @@ typedef struct { static int read_bam(void *data, bam1_t *b) { - aux_t *aux = (aux_t*)data; - int ret = bam_iter_read(aux->fp, aux->iter, b); - if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure + int ret; + while (1) + { + ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + if ( ret<0 ) break; + if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; + if ( (int)b->core.qual < aux->min_mapQ ) continue; + break; + } return ret; } diff --git a/cut_target.c b/cut_target.c index 8c56f33..cd266a5 100644 --- a/cut_target.c +++ b/cut_target.c @@ -121,21 +121,27 @@ static int read_aln(void *data, bam1_t *b) extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag); ct_t *g = (ct_t*)data; int ret, len; - ret = bam_read1(g->fp, b); - if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) { - if (b->core.tid != g->tid) { // then load the sequence - free(g->ref); - g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); - g->tid = b->core.tid; - } - bam_prob_realn_core(b, g->ref, 1<<1|1); - } + while (1) + { + ret = bam_read1(g->fp, b); + if ( ret<0 ) break; + if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; + if ( g->fai && b->core.tid >= 0 ) { + if (b->core.tid != g->tid) { // then load the sequence + free(g->ref); + g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); + g->tid = b->core.tid; + } + bam_prob_realn_core(b, g->ref, 1<<1|1); + } + break; + } return ret; } int main_cut_target(int argc, char *argv[]) { - int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l; + int c, tid, pos, n, lasttid = -1, l, max_l; const bam_pileup1_t *p; bam_plp_t plp; uint16_t *cns; @@ -178,7 +184,6 @@ int main_cut_target(int argc, char *argv[]) lasttid = tid; } cns[pos] = gencns(&g, n, p); - lastpos = pos; } process_cns(g.h, lasttid, l, cns); free(cns); diff --git a/phase.c b/phase.c index bf8d6d3..b086b5f 100644 --- a/phase.c +++ b/phase.c @@ -450,15 +450,20 @@ static int readaln(void *data, bam1_t *b) { phaseg_t *g = (phaseg_t*)data; int ret; - ret = bam_read1(g->fp, b); - if (ret < 0) return ret; - if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) { - if (g->n == g->m) { - g->m = g->m? g->m<<1 : 16; - g->b = realloc(g->b, g->m * sizeof(bam1_t*)); - } - g->b[g->n++] = bam_dup1(b); - } + while (1) + { + ret = bam_read1(g->fp, b); + if (ret < 0) break; + if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; + if ( g->pre ) { + if (g->n == g->m) { + g->m = g->m? g->m<<1 : 16; + g->b = realloc(g->b, g->m * sizeof(bam1_t*)); + } + g->b[g->n++] = bam_dup1(b); + } + break; + } return ret; } -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
