This is an automated email from the git hooks/post-receive script. daube-guest pushed a commit to branch master in repository wgsim.
commit e33dee6300feb5ab2c5c4cbf9f6d13767cfcfba1 Author: Kevin Murray <s...@kdmurray.id.au> Date: Sat Jul 4 20:09:58 2015 +1000 Imported Upstream version 0.3.1-r13 --- .gitignore | 1 + README | 93 +++++++++++++ kseq.h | 225 ++++++++++++++++++++++++++++++ wgsim.c | 437 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ wgsim_eval.pl | 294 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 1050 insertions(+) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a01ee28 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.*.swp diff --git a/README b/README new file mode 100644 index 0000000..3b3f105 --- /dev/null +++ b/README @@ -0,0 +1,93 @@ +Introduction +============ + +Wgsim is a small tool for simulating sequence reads from a reference genome. +It is able to simulate diploid genomes with SNPs and insertion/deletion (INDEL) +polymorphisms, and simulate reads with uniform substitution sequencing errors. +It does not generate INDEL sequencing errors, but this can be partly +compensated by simulating INDEL polymorphisms. + +Wgsim outputs the simulated polymorphisms, and writes the true read coordinates +as well as the number of polymorphisms and sequencing errors in read names. +One can evaluate the accuracy of a mapper or a SNP caller with wgsim_eval.pl +that comes with the package. + + +Compilation +=========== + +gcc -g -O2 -Wall -o wgsim wgsim.c -lz -lm + + +History +======= + +Wgsim was modified from MAQ's read simulator by dropping dependencies on other +source code in the MAQ package and incorporating patches from Colin Hercus +which allow simulating INDELs longer than 1bp. Wgsim was originally released +in the SAMtools software package. I forked it out in 2011 as a standalone +project. A few improvements were also added along the way.
+ + +Evaluation +========== + +Simulation and evaluation +------------------------- + +The command line for simulation: + + wgsim -Nxxx -1yyy -d0 -S11 -e0 -rzzz hs37m.fa yyy-zzz.fq /dev/null + +where yyy is the read length, zzz is the error rate and $xxx * $yyy = 10000000. +By default, 15% of polymorphisms are INDELs and their lengths are drawn from a +geometric distribution with density 0.7*0.3^{l-1}. + +The command line for evaluation: + + wgsim_eval.pl unique aln.sam | wgsim_eval.pl alneval -g 20 + +The '-g' option may be changed with mappers. + + +System +------ + +GCC: 4.1.2 +CPU: AMD Opteron 8350 @ 2.0GHz +Mem: 128GB + + +Results +------- + +================================================================================================================== + 100bp 200bp 500bp 1000bp 10000bp + ------------------ ----------------- ----------------- ----------------- ----------------- + Program Metrics 2% 5% 10% 2% 5% 10% 2% 5% 10% 2% 5% 10% 2% 5% 10% +------------------------------------------------------------------------------------------------------------------ + CPU 249 198 136 325 262 163 332 243 232 320 235 215 235 197 189 + BWA-SW Q20% 85.1 63.6 21.4 93.7 88.9 53.5 96.4 95.7 89.2 96.6 96.2 95.1 97.7 98.3 97.7 + err% 0.01 0.06 0.20 0.00 0.01 0.14 0.00 0.01 0.01 0.00 0.00 0.01 0.00 0.00 0.00 + one% 94.6 77.4 35.7 97.5 95.1 67.6 98.6 98.5 93.4 99.0 98.9 98.3 99.7 99.8 99.7 +------------------------------------------------------------------------------------------------------------------ + CPU 302 484 1060 330 352 607 381 480 919 + AGILE Q20% 98.6 98.4 98.4 98.4 98.4 98.6 98.2 98.6 99.3 + err% 0.66 0.69 2.31 0.34 0.40 0.70 0.10 0.00 0.20 + one% 100 99.4 0 100 100 100 100 100 100 +================================================================================================================== + +1) AGILE throws "Floating point exception" halfway for 100/200bp reads. The + default output is supposed to be PSL, but actually has an additional "score" + column. 
AGILE is reportedly faster than BWA-SW for 1000bp reads. It is + slower here possibly because of suboptimal command line options. + +2) Gassst uses over 27GB memory in 20 minutes. The memory then quickly + increases to over 40GB. It gets killed. + +3) Lastz complains: "FAILURE: bad fasta character in hs37m.fa ...". + +4) Pash only gives 'unique mapping'. Its unique mapping is better than BWA-SW's + Q1 mapping. It is very slow, though, possibly because of suboptimal + options. + diff --git a/kseq.h b/kseq.h new file mode 100644 index 0000000..5ba985e --- /dev/null +++ b/kseq.h @@ -0,0 +1,225 @@ +/* The MIT License + + Copyright (c) 2008, 2009 Attractive Chaos <attrac...@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE.
+*/ + +/* + 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" + */ + +/* Last Modified: 12APR2009 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include <ctype.h> +#include <string.h> +#include <stdlib.h> + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_MAX 1 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < 
__bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->l == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = 
ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff --git a/wgsim.c b/wgsim.c new file mode 100644 index 
0000000..5c82192 --- /dev/null +++ b/wgsim.c @@ -0,0 +1,437 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + 2011 Heng Li <l...@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* This program is separated from maq's read simulator with Colin + * Hercus' modification to allow longer indels. 
*/ + +#include <stdlib.h> +#include <math.h> +#include <time.h> +#include <assert.h> +#include <stdio.h> +#include <unistd.h> +#include <stdint.h> +#include <ctype.h> +#include <string.h> +#include <zlib.h> +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +#define PACKAGE_VERSION "0.3.1-r13" + +const uint8_t nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +/* Simple normal random number generator, copied from genran.c */ + +double ran_normal() +{ + static int iset = 0; + static double gset; + double fac, rsq, v1, v2; + if (iset == 0) { + do { + v1 = 2.0 * drand48() - 1.0; + v2 = 2.0 * drand48() - 1.0; + rsq = v1 * v1 + v2 * v2; + } while (rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0 * log(rsq) / rsq); + gset = v1 * fac; + iset = 1; + return v2 * fac; + } else { + iset = 0; + return gset; + } +} + +/* wgsim */ + +enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; +typedef unsigned short mut_t; +static mut_t mutmsk = (mut_t)0xf000; + +typedef struct { + int l, m; /* length and maximum buffer size */ + mut_t *s; /* sequence */ +} mutseq_t; + +static double ERR_RATE = 0.02; +static double MUT_RATE = 0.001; +static double INDEL_FRAC = 0.15; +static double 
INDEL_EXTEND = 0.3; +static double MAX_N_RATIO = 0.05; + +void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +{ + int i, deleting = 0; + mutseq_t *ret[2]; + + ret[0] = hap1; ret[1] = hap2; + ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l; + ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m; + ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + for (i = 0; i != ks->seq.l; ++i) { + int c; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]]; + if (deleting) { + if (drand48() < INDEL_EXTEND) { + if (deleting & 1) ret[0]->s[i] |= DELETE; + if (deleting & 2) ret[1]->s[i] |= DELETE; + continue; + } else deleting = 0; + } + if (c < 4 && drand48() < MUT_RATE) { // mutation + if (drand48() >= INDEL_FRAC) { // substitution + double r = drand48(); + c = (c + (int)(r * 3.0 + 1)) & 3; + if (is_hap || drand48() < 0.333333) { // hom + ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c; + } else { // het + ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c; + } + } else { // indel + if (drand48() < 0.5) { // deletion + if (is_hap || drand48() < 0.333333) { // hom-del + ret[0]->s[i] = ret[1]->s[i] = DELETE; + deleting = 3; + } else { // het-del + deleting = drand48()<0.5?1:2; + ret[deleting-1]->s[i] = DELETE; + } + } else { // insertion + int num_ins = 0, ins = 0; + do { + num_ins++; + ins = (ins << 2) | (int)(drand48() * 4.0); + } while (num_ins < 4 && drand48() < INDEL_EXTEND); + + if (is_hap || drand48() < 0.333333) { // hom-ins + ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } else { // het-ins + ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } + } + } + } + } +} +void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) +{ + int i, j = 0; // j keeps the end of the last deletion + for (i = 0; i != ks->seq.l; ++i) { + int c[3]; + c[0] = nst_nt4_table[(int)ks->seq.s[i]]; + c[1] = hap1->s[i]; c[2] = hap2->s[i]; + 
if (c[0] >= 4) continue; + if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { + if (c[1] == c[2]) { // hom + if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution + printf("%s\t%d\t%c\t%c\t-\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); + } else if ((c[1]&mutmsk) == DELETE) { // del + if (i >= j) { + printf("%s\t%d\t", name, i+1); + for (j = i; j < ks->seq.l && hap1->s[j] == hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j) + putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]); + printf("\t-\t-\n"); + } + } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins + printf("%s\t%d\t-\t", name, i+1); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t-\n"); + } // else: deleted base in a long deletion + } else { // het + if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution + printf("%s\t%d\t%c\t%c\t+\n", name, i+1, "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]); + } else if ((c[1]&mutmsk) == DELETE) { + if (i >= j) { + printf("%s\t%d\t", name, i+1); + for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j) + putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]); + printf("\t-\t-\n"); + } + } else if ((c[2]&mutmsk) == DELETE) { + if (i >= j) { + printf("%s\t%d\t", name, i+1); + for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap2->s[j]&mutmsk) == DELETE; ++j) + putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]); + printf("\t-\t-\n"); + } + } else if (((c[1] & mutmsk) >> 12) <= 4 && ((c[1] & mutmsk) >> 12) > 0) { // ins1 + printf("%s\t%d\t-\t", name, i+1); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t+\n"); + } else if (((c[2] & mutmsk) >> 12) <= 4 || ((c[2] & mutmsk) >> 12) > 0) { // ins2 + printf("%s\t%d\t-\t", name, i+1); + int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4; + while (n > 0) { + 
putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t+\n"); + } // else: deleted base in a long deletion + } + } + } +} + +void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +{ + kseq_t *ks; + mutseq_t rseq[2]; + gzFile fp_fa; + uint64_t tot_len, ii; + int i, l, n_ref; + char *qstr; + int size[2], Q, max_size; + uint8_t *tmp_seq[2]; + mut_t *target; + + l = size_l > size_r? size_l : size_r; + qstr = (char*)calloc(l+1, 1); + tmp_seq[0] = (uint8_t*)calloc(l+2, 1); + tmp_seq[1] = (uint8_t*)calloc(l+2, 1); + size[0] = size_l; size[1] = size_r; + max_size = size_l > size_r? size_l : size_r; + + Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + tot_len = n_ref = 0; + fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__); + while ((l = kseq_read(ks)) >= 0) { + tot_len += l; + ++n_ref; + } + fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len); + kseq_destroy(ks); + gzclose(fp_fa); + + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + while ((l = kseq_read(ks)) >= 0) { + uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); + if (l < dist + 3 * std_dev) { + fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev); + continue; + } + + // generate mutations and print them out + wgsim_mut_diref(ks, is_hap, rseq, rseq+1); + wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1); + + for (ii = 0; ii != n_pairs; ++ii) { // the core loop + double ran; + int d, pos, s[2], is_flip = 0; + int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k; + FILE *fpo[2]; + + do { // avoid boundary failure + ran = ran_normal(); + ran = ran * std_dev + dist; + d = (int)(ran + 0.5); + d = d > max_size? 
d : max_size; + pos = (int)((l - d + 1) * drand48()); + } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l); + + // flip or not + if (drand48() < 0.5) { + fpo[0] = fpout1; fpo[1] = fpout2; + s[0] = size[0]; s[1] = size[1]; + } else { + fpo[1] = fpout1; fpo[0] = fpout2; + s[1] = size[0]; s[0] = size[1]; + is_flip = 1; + } + + // generate the read sequences + target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated + n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; + +#define __gen_read(x, start, iter) do { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \ + int c = target[i], mut_type = c & mutmsk; \ + if (ext_coor[x] < 0) { \ + if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ + ext_coor[x] = i; \ + } \ + if (mut_type == DELETE) ++n_indel[x]; \ + else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \ + tmp_seq[x][k++] = c & 0xf; \ + if (mut_type == SUBSTITUTE) ++n_sub[x]; \ + } else { \ + int n, ins; \ + ++n_indel[x]; \ + tmp_seq[x][k++] = c & 0xf; \ + for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \ + tmp_seq[x][k++] = ins & 0x3; \ + } \ + } \ + if (k != s[x]) ext_coor[x] = -10; \ + } while (0) + + __gen_read(0, pos, ++i); + __gen_read(1, pos + d - 1, --i); + for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 
3 - tmp_seq[1][k] : 4; // complement + if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s) + --ii; + continue; + } + + // generate sequencing errors + for (j = 0; j < 2; ++j) { + int n_n = 0; + for (i = 0; i < s[j]; ++i) { + int c = tmp_seq[j][i]; + if (c >= 4) { // actually c should be never larger than 4 if everything is correct + c = 4; + ++n_n; + } else if (drand48() < ERR_RATE) { + // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors + c = (c + 1) & 3; // recurrent sequencing errors + ++n_err[j]; + } + tmp_seq[j][i] = c; + } + if ((double)n_n / s[j] > MAX_N_RATIO) break; + } + if (j < 2) { // too many ambiguous bases on one of the reads + --ii; + continue; + } + + // print + for (j = 0; j < 2; ++j) { + for (i = 0; i < s[j]; ++i) qstr[i] = Q; + qstr[i] = 0; + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? 
is_flip+1 : 2-is_flip); + for (i = 0; i < s[j]; ++i) + fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); + fprintf(fpo[j], "\n+\n%s\n", qstr); + } + } + free(rseq[0].s); free(rseq[1].s); + } + kseq_destroy(ks); + gzclose(fp_fa); + free(qstr); + free(tmp_seq[0]); free(tmp_seq[1]); +} + +static int simu_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: wgsim (short read simulator)\n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Contact: Heng Li <l...@sanger.ac.uk>\n\n"); + fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n"); + fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE); + fprintf(stderr, " -d INT outer distance between the two ends [500]\n"); + fprintf(stderr, " -s INT standard deviation [50]\n"); + fprintf(stderr, " -N INT number of read pairs [1000000]\n"); + fprintf(stderr, " -1 INT length of the first read [70]\n"); + fprintf(stderr, " -2 INT length of the second read [70]\n"); + fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); + fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); + fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); + fprintf(stderr, " -S INT seed for random generator [-1]\n"); + fprintf(stderr, " -A FLOAT disgard if the fraction of ambiguous bases higher than FLOAT [%.2f]\n", MAX_N_RATIO); + fprintf(stderr, " -h haplotype mode\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + int64_t N; + int dist, std_dev, c, size_l, size_r, is_hap = 0; + FILE *fpout1, *fpout2; + int seed = -1; + + N = 1000000; dist = 500; std_dev = 50; + size_l = size_r = 70; + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:A:")) >= 0) { + switch (c) { + case 'd': dist = atoi(optarg); break; + case 's': std_dev = atoi(optarg); break; + case 'N': N = atoi(optarg); break; + case '1': size_l = atoi(optarg); break; + case '2': size_r = atoi(optarg); break; + case 'e': 
ERR_RATE = atof(optarg); break; + case 'r': MUT_RATE = atof(optarg); break; + case 'R': INDEL_FRAC = atof(optarg); break; + case 'X': INDEL_EXTEND = atof(optarg); break; + case 'A': MAX_N_RATIO = atof(optarg); break; + case 'S': seed = atoi(optarg); break; + case 'h': is_hap = 1; break; + } + } + if (argc - optind < 3) return simu_usage(); + fpout1 = fopen(argv[optind+1], "w"); + fpout2 = fopen(argv[optind+2], "w"); + if (!fpout1 || !fpout2) { + fprintf(stderr, "[wgsim] file open error\n"); + return 1; + } + if (seed <= 0) seed = time(0)&0x7fffffff; + fprintf(stderr, "[wgsim] seed = %d\n", seed); + srand48(seed); + wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r); + + fclose(fpout1); fclose(fpout2); + return 0; +} diff --git a/wgsim_eval.pl b/wgsim_eval.pl new file mode 100755 index 0000000..5342333 --- /dev/null +++ b/wgsim_eval.pl @@ -0,0 +1,294 @@ +#!/usr/bin/perl -w + +# Author: lh3 + +use strict; +use warnings; +use Getopt::Std; + +die (qq/ +Usage: wgsim_eval.pl <command> <arguments> + +Command: alneval evaluate alignment in the SAM format + vareval evaluate variant calls in the pileup format + unique keep the top scoring hit in SAM + uniqcmp compare two alignments without multiple hits +\n/) if (@ARGV == 0); +my $command = shift(@ARGV); +if ($command eq 'alneval') { + &alneval; +} elsif ($command eq 'vareval') { + &vareval; +} elsif ($command eq 'unique') { + &unique; +} elsif ($command eq 'uniqcmp') { + &uniqcmp; +} else { + die("[wgsim_eval] unrecognized command.\n"); +} +exit; + +sub alneval { + my %opts = (g=>20); + getopts('pag:', \%opts); + die(qq/ +Usage: wgsim_eval.pl alneval [options] <in.sam>\n +Options: -p print wrong alignments + -g INT correct if withint INT of the true coordinate +\n/) if (@ARGV == 0 && -t STDIN); + my (@c0, @c1, %fnfp); + my ($max_q, $flag) = (0, 0); + my $gap = $opts{g}; + $flag |= 1 if (defined $opts{p}); + while (<>) { + next if (/^\@/); + my @t = split("\t"); + next if (@t < 11); + my $line 
= $_; + my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]); + $max_q = $q if ($q > $max_q); + # right coordinate + $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; + --$rght; + # correct for clipping + my ($left0, $rght0) = ($left, $rght); + $left -= $1 if (/^(\d+)[SH]/); + $rght += $1 if (/(\d+)[SH]$/); + $left0 -= $1 if (/(\d+)[SH]$/); + $rght0 += $1 if (/^(\d+)[SH]/); + # skip unmapped reads + next if (($t[1]&0x4) || $chr eq '*'); + # parse read name and check + if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) { + if ($1 ne $chr) { # different chr + $is_correct = 0; + } else { + if ($t[1] & 0x10) { # reverse + $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads + } else { + $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap); + } + } + } else { + warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n"); + next; + } + ++$c0[$q]; + ++$c1[$q] unless ($is_correct); + @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]}); + ++$fnfp{$t[4]}[0]; + ++$fnfp{$t[4]}[1] unless ($is_correct); + print STDERR $line if (($flag&1) && !$is_correct && $q > 0); + } + # print + my ($cc0, $cc1) = (0, 0); + if (!defined($opts{a})) { + for (my $i = $max_q; $i >= 0; --$i) { + $c0[$i] = 0 unless (defined $c0[$i]); + $c1[$i] = 0 unless (defined $c1[$i]); + $cc0 += $c0[$i]; $cc1 += $c1[$i]; + printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0); + } + } else { + for (reverse(sort {$a<=>$b} (keys %fnfp))) { + next if ($_ == 0); + $cc0 += $fnfp{$_}[0]; + $cc1 += $fnfp{$_}[1]; + print join("\t", $_, $cc0, $cc1), "\n"; + } + } +} + +sub vareval { + my %opts = (g=>10, Q=>200); + getopts('g:p', \%opts); + my $skip = $opts{g}; + die("Usage: wgsim_eval.pl vareval [-g $opts{g}] <wgsim.snp> <pileup.flt>\n") if (@ARGV < 1); + + my $is_print = defined($opts{p})? 
1 : 0; + + my ($fh, %snp, %indel); + # read simulated variants + open($fh, $ARGV[0]) || die; + while (<$fh>) { + my @t = split; + if (@t != 5 || $t[2] eq '-' || $t[3] eq '-') { + $indel{$t[0]}{$t[1]} = 1; + } else { + $snp{$t[0]}{$t[1]} = $t[3]; + } + } + close($fh); + + shift(@ARGV); + my (@cnt); + for my $i (0 .. 3) { + for my $j (0 .. $opts{Q}) { + $cnt[$i][$j] = 0; + } + } + while (<>) { + my @t = split; + my $q = $t[5]; + next if ($t[2] eq $t[3]); + $q = $opts{Q} if ($q > $opts{Q}); + if ($t[2] eq '*') { + my $hit = 0; + ++$cnt[2][$q]; + for my $i ($t[1] - $skip .. $t[1] + $skip) { + if (defined $indel{$t[0]}{$i}) { + $hit = 1; last; + } + } + ++$cnt[3][$q] if ($hit == 0); + print STDERR $_ if ($hit == 0 && $is_print); + } else { + ++$cnt[0][$q]; + ++$cnt[1][$q] unless (defined $snp{$t[0]}{$t[1]}); + print STDERR $_ if (!defined($snp{$t[0]}{$t[1]}) && $is_print); + } + } + + for (my $i = $opts{Q} - 1; $i >= 0; --$i) { + $cnt[$_][$i] += $cnt[$_][$i+1] for (0 .. 3); + } + + for (my $i = $opts{Q}; $i >= 0; --$i) { + print join("\t", $i, $cnt[0][$i], $cnt[1][$i], $cnt[2][$i], $cnt[3][$i]), "\n"; + } +} + +sub unique { + # -f: parameter to recalute mapping quality + # -Q: do not recalculate mapping quality + # -a, -b, -q, -r: scoring system + my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3); + getopts('Qf:q:r:a:b:m', \%opts); + die(qq/ +Usage: wgsim_eval.pl unique [options] <in.sam>\n +Options: -Q recompuate mapping quality from multiple hits + -f FLOAT mapQ=FLOAT*(best1-best2)\/best1 [opts{f}] + -a INT matching score (when AS tag is absent) [$opts{a}] + -q INT gap open penalty [$opts{q}] + -r INT gap extension penalty [$opts{r}] +\n/) if (@ARGV == 0 && -t STDIN); + my $last = ''; + my $recal_Q = defined($opts{Q}); + my $multi_only = defined($opts{m}); + my @a; + while (<>) { + my $score = -1; + print $_ if (/^\@/); + $score = $1 if (/AS:i:(\d+)/); + my @t = split("\t"); + next if (@t < 11); + if ($score < 0) { # AS tag is unavailable + my $cigar = $t[5]; + my ($mm, 
$go, $ge) = (0, 0, 0); + $cigar =~ s/(\d+)[ID]/++$go,$ge+=$1/eg; + $cigar = $t[5]; + $cigar =~ s/(\d+)M/$mm+=$1/eg; + $score = $mm * $opts{a} - $go * $opts{q} - $ge * $opts{r}; # no mismatches... + } + $score = 1 if ($score < 1); + if ($t[0] ne $last) { + &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); + $last = $t[0]; + } + push(@a, [$score, \@t]); + } + &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); +} + +sub unique_aux { + my ($a, $fac, $is_recal, $multi_only) = @_; + my ($max, $max2, $max_i) = (0, 0, -1); + for (my $i = 0; $i < @$a; ++$i) { + if ($a->[$i][0] > $max) { + $max2 = $max; $max = $a->[$i][0]; $max_i = $i; + } elsif ($a->[$i][0] > $max2) { + $max2 = $a->[$i][0]; + } + } + if ($is_recal) { + if (!$multi_only || @$a > 1) { + my $q = int($fac * ($max - $max2) / $max + .499); + $q = 250 if ($q > 250); + $a->[$max_i][1][4] = $q < 250? $q : 250; + } + } + print join("\t", @{$a->[$max_i][1]}); + @$a = (); +} + +sub uniqcmp { + my %opts = (q=>20, s=>100, b=>4); + getopts('pq:s:b:', \%opts); + die(qq/ +Usage: wgsim_eval.pl uniqcmp [options] <in1.sam> <in2.sam>\n +Options: -q INT confident mapping if mapping quality above INT [$opts{q}] + -s INT same mapping if the distance below INT [$opts{s}] + -b INT penalty for a difference [$opts{b}] +\n/) if (@ARGV < 2); + my ($fh, %a); + warn("[uniqcmp] read the first file...\n"); + &uniqcmp_aux($ARGV[0], \%a, 0, $opts{b}); + warn("[uniqcmp] read the second file...\n"); + &uniqcmp_aux($ARGV[1], \%a, 1, $opts{b}); + warn("[uniqcmp] stats...\n"); + my @cnt; + $cnt[$_] = 0 for (0..9); + for my $x (keys %a) { + my $p = $a{$x}; + my $z; + if (defined($p->[0]) && defined($p->[1])) { + $z = ($p->[0][0] == $p->[1][0] && $p->[0][1] eq $p->[1][1] && abs($p->[0][2] - $p->[1][2]) < $opts{s})? 
0 : 1; + if ($p->[0][3] >= $opts{q} && $p->[1][3] >= $opts{q}) { + ++$cnt[$z*3+0]; + } elsif ($p->[0][3] >= $opts{q}) { + ++$cnt[$z*3+1]; + } elsif ($p->[1][3] >= $opts{q}) { + ++$cnt[$z*3+2]; + } + print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t", + $p->[0][5]-$p->[1][5], "\n" if ($z && defined($opts{p}) && ($p->[0][3] >= $opts{q} || $p->[1][3] >= $opts{q})); + } elsif (defined($p->[0])) { + ++$cnt[$p->[0][3]>=$opts{q}? 6 : 7]; + print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t*\t0\t*\t", + $p->[0][5], "\n" if (defined($opts{p}) && $p->[0][3] >= $opts{q}); + } else { + print STDERR "$x\t*\t0\t*\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t", + -$p->[1][5], "\n" if (defined($opts{p}) && $p->[1][3] >= $opts{q}); + ++$cnt[$p->[1][3]>=$opts{q}? 8 : 9]; + } + } + print "Consistent (high, high): $cnt[0]\n"; + print "Consistent (high, low ): $cnt[1]\n"; + print "Consistent (low , high): $cnt[2]\n"; + print "Inconsistent (high, high): $cnt[3]\n"; + print "Inconsistent (high, low ): $cnt[4]\n"; + print "Inconsistent (low , high): $cnt[5]\n"; + print "Second missing (high): $cnt[6]\n"; + print "Second missing (low ): $cnt[7]\n"; + print "First missing (high): $cnt[8]\n"; + print "First missing (low ): $cnt[9]\n"; +} + +sub uniqcmp_aux { + my ($fn, $a, $which, $b) = @_; + my $fh; + $fn = "samtools view $fn |" if ($fn =~ /\.bam/); + open($fh, $fn) || die; + while (<$fh>) { + my @t = split; + next if (@t < 11); +# my $l = ($t[5] =~ /^(\d+)S/)? $1 : 0; + my $l = 0; + my ($x, $nm) = (0, 0); + $nm = $1 if (/NM:i:(\d+)/); + $_ = $t[5]; + s/(\d+)[MI]/$x+=$1/eg; + @{$a->{$t[0]}[$which]} = (($t[1]&0x10)? 
1 : 0, $t[2], $t[3]-$l, $t[4], "$x:$nm", $x - $b * $nm); + } + close($fh); +} -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/wgsim.git _______________________________________________ debian-med-commit mailing list debian-med-commit@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit