This is an automated email from the git hooks/post-receive script. plessy pushed a commit to branch debian/unstable in repository samtools.
commit 9e782306ac8092a80d53b7e2cf6108c48c8ea11c Author: Petr Danecek <[email protected]> Date: Wed Dec 4 16:52:29 2013 +0000 stats: Fix of mismmatch-per-cycle calculation in presence of hard-clipped reads --- INSTALL | 3 +++ stats.c | 30 +++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/INSTALL b/INSTALL index 37d84a9..f60ac70 100644 --- a/INSTALL +++ b/INSTALL @@ -12,6 +12,9 @@ the modern Linux/Unix distributions. If you do not have this library installed, you can still compile the rest of SAMtools by manually changing: `-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and comment out the line starting with `LIBCURSES='. +Note that on some systems the library is available as -lcurses while on others +as -lnurses. This can be set in Makefile by setting LIBCURSES= -lcurses vs +-lncurses. Compilation diff --git a/stats.c b/stats.c index 7500271..45864d9 100644 --- a/stats.c +++ b/stats.c @@ -306,12 +306,24 @@ void count_indels(stats_t *stats,bam1_t *bam_line) } } +int unclipped_length(bam1_t *bam_line) +{ + int icig, read_len = bam_line->core.l_qseq; + for (icig=0; icig<bam_line->core.n_cigar; icig++) + { + int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; + if ( cig==BAM_CHARD_CLIP ) + read_len += bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; + } + return read_len; +} + void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) { int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; int icig,iread=0,icycle=0; int iref = bam_line->core.pos - stats->rseq_pos; - int read_len = bam_line->core.l_qseq; + int read_len = unclipped_length(bam_line); uint8_t *read = bam1_seq(bam_line); uint8_t *quals = bam1_qual(bam_line); uint64_t *mpc_buf = stats->mpc_buf; @@ -322,18 +334,18 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) // MIDNSHP int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; - if ( cig==1 ) + if ( cig==BAM_CINS ) { iread += ncig; icycle += ncig; continue; } - if ( cig==2 ) + if ( cig==BAM_CDEL ) { iref += ncig; continue; } - if ( cig==4 ) + if ( cig==BAM_CSOFT_CLIP ) { icycle += ncig; // Soft-clips are present in the sequence, but the position of the read marks a start of the sequence after clipping @@ -341,15 +353,15 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) iread += ncig; continue; } - if ( cig==5 ) + if ( cig==BAM_CHARD_CLIP ) { icycle += ncig; continue; } // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large // chunk of refseq in memory. Not very frequent and not noticable in the stats. - if ( cig==3 || cig==5 ) continue; - if ( cig!=0 ) + if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP ) continue; + if ( cig!=BAM_CMATCH ) error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); if ( ncig+iref > stats->nrseq_buf ) @@ -599,7 +611,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) stats->read_lengths[seq_len]++; - // Count GC and ACGT per cycle + // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored uint8_t base, *seq = bam1_seq(bam_line); int gc_count = 0; int i; @@ -644,7 +656,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) if ( stats->trim_qual>0 ) stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse); - // Quality histogram and average quality + // Quality histogram and average quality. Clipping is neglected. for (i=0; i<seq_len; i++) { uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i]; -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
