This is an automated email from the git hooks/post-receive script. plessy pushed a commit to branch develop in repository samtools.
commit c8e1a2bf94c48787b26a0e5d4f2b4d3c488ebc07 Author: Petr Danecek <[email protected]> Date: Wed Aug 13 08:48:13 2014 +0100 stats: Compute checksum for names/seqs/quals --- stats.c | 26 ++++++++++++++++++++++++++ test/stat/1.stats.expected | 3 +++ test/stat/2.stats.expected | 3 +++ test/stat/3.stats.expected | 3 +++ test/stat/4.stats.expected | 3 +++ test/stat/5.stats.expected | 3 +++ test/stat/6.stats.expected | 3 +++ 7 files changed, 44 insertions(+) diff --git a/stats.c b/stats.c index 8eb24e8..f72165e 100644 --- a/stats.c +++ b/stats.c @@ -26,6 +26,7 @@ #include <getopt.h> #include <errno.h> #include <assert.h> +#include <zlib.h> // for crc32 #include <htslib/faidx.h> #include <htslib/sam.h> #include <htslib/hts.h> @@ -122,6 +123,9 @@ typedef struct uint64_t nbases_trimmed; // bwa trimmed bases uint64_t nmismatches; uint64_t nreads_QCfailed, nreads_secondary; + struct { + uint32_t names, reads, quals; + } checksum; // GC-depth related data uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin @@ -572,6 +576,23 @@ void realloc_buffers(stats_t *stats, int seq_len) realloc_rseq_buffer(stats); } +void update_checksum(bam1_t *bam_line, stats_t *stats) +{ + uint8_t *name = (uint8_t*) bam_get_qname(bam_line); + int len = 0; + while ( name[len] ) len++; + stats->checksum.names += crc32(0L, name, len); + + int seq_len = bam_line->core.l_qseq; + if ( !seq_len ) return; + + uint8_t *seq = bam_get_seq(bam_line); + stats->checksum.reads += crc32(0L, seq, (seq_len+1)/2); + + uint8_t *qual = bam_get_qual(bam_line); + stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2); +} + void collect_stats(bam1_t *bam_line, stats_t *stats) { if ( stats->rg_hash ) @@ -599,6 +620,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++; if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++; + update_checksum(bam_line, stats); + int seq_len = bam_line->core.l_qseq; if ( !seq_len ) return; @@ -909,6 +932,9 @@ void output_stats(stats_t *stats, int sparse) for (i=1; i<stats->argc; i++) printf(" %s",stats->argv[i]); printf("\n"); + printf("# CHK, Checksum\t[2]Read Names\t[3]Sequences\t[4]Qualities\n"); + printf("# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); + printf("CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below) printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); diff --git a/test/stat/1.stats.expected b/test/stat/1.stats.expected index fa8a3b9..0604069 100644 --- a/test/stat/1.stats.expected +++ b/test/stat/1.stats.expected @@ -1,3 +1,6 @@ +# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities +# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) +CHK 1a1c1362 29c426ae 7bab45da # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN raw total sequences: 2 SN filtered sequences: 0 diff --git a/test/stat/2.stats.expected b/test/stat/2.stats.expected index a017dad..2024271 100644 --- a/test/stat/2.stats.expected +++ b/test/stat/2.stats.expected @@ -1,3 +1,6 @@ +# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities +# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) +CHK 1a1c1362 29c426ae 7bab45da # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN raw total sequences: 2 SN filtered sequences: 0 diff --git a/test/stat/3.stats.expected b/test/stat/3.stats.expected index bd2e000..157c321 100644 --- a/test/stat/3.stats.expected +++ b/test/stat/3.stats.expected @@ -1,3 +1,6 @@ +# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities +# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) +CHK 1a1c1362 ce379e9a 7bab45da # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN raw total sequences: 2 SN filtered sequences: 0 diff --git a/test/stat/4.stats.expected b/test/stat/4.stats.expected index 8aafd23..d6c5622 100644 --- a/test/stat/4.stats.expected +++ b/test/stat/4.stats.expected @@ -1,3 +1,6 @@ +# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities +# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) +CHK 1a1c1362 e7724556 7bab45da # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN raw total sequences: 2 SN filtered sequences: 0 diff --git a/test/stat/5.stats.expected b/test/stat/5.stats.expected index 4e0ffcd..6588d66 100644 --- a/test/stat/5.stats.expected +++ b/test/stat/5.stats.expected @@ -1,3 +1,6 @@ +# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities +# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) +CHK 1a1c1362 32507d92 7bab45da # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN raw total sequences: 2 SN filtered sequences: 0 diff --git a/test/stat/6.stats.expected b/test/stat/6.stats.expected index 4e0ffcd..6588d66 100644 --- a/test/stat/6.stats.expected +++ b/test/stat/6.stats.expected @@ -1,3 +1,6 @@ +# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities +# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) +CHK 1a1c1362 32507d92 7bab45da # Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN raw total sequences: 2 SN filtered sequences: 0 -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/samtools.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
