This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository gasic.
commit 1e9230efd6ba28de5d4f02c12f71f39eca0fb8bc Author: Andreas Tille <[email protected]> Date: Sat Feb 8 09:20:48 2014 +0100 Imported Upstream version 0.0.r18 --- LICENSE | 24 +++++++ README | 126 ++++++++++++++++++++++++++++++++++ core/__init__.py | 0 core/gasic.py | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++ core/tools.py | 152 +++++++++++++++++++++++++++++++++++++++++ correct_abundances.py | 177 +++++++++++++++++++++++++++++++++++++++++++++++ create_matrix.py | 158 ++++++++++++++++++++++++++++++++++++++++++ doc/MANUAL | 183 +++++++++++++++++++++++++++++++++++++++++++++++++ doc/example_script.py | 172 ++++++++++++++++++++++++++++++++++++++++++++++ quality_check.py | 131 +++++++++++++++++++++++++++++++++++ run_mappers.py | 85 +++++++++++++++++++++++ 11 files changed, 1394 insertions(+) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2bba2f5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README b/README new file mode 100644 index 0000000..10d94ab --- /dev/null +++ b/README @@ -0,0 +1,126 @@ +GASiC - Genome Abundance Similarity Correction + + +Downloading +----------- + +GASiC is hosted on sourceforge.net + + http://sourceforge.net/projects/gasic/ + +Further information can be found on the group homepage: + + http://www.rki-ng4.de/ + + +System Requirements +------------------- + +GASiC was built and tested on Linux operating systems. + +For Windows or Mac OS users, we provide a virtual machine (VM) with a +pre-installed and tested version of GASiC. Download links and further +information can be found at: + + http://sourceforge.net/projects/gasic/ + +and + + http://www.rki-ng4.de/ + +The VirtualBox software is reqired to run the virtual machine. It can +can be obtained at: + + https://www.virtualbox.org/ + + +Documentation +------------- + +Documentation can be found in the folder 'doc/'. The sourceforge wiki +pages also contain useful information. + +GASiC is self-documenting. Calling a script with the '--help' option +displays help on the script functionality. + +The GASiC source code is thoroughly commented. Developers might find +answers there. + + +Installation +------------ + +GASiC is a Python script library and does not need installation. If all +dependencies are installed correctly, GASiC scripts can be run directly +from the main folder. + +We recommend to add the GASiC main directory to your PYTHONPATH environment +variable. On the console type: + export PYTHONPATH=$PYTHONPATH:/path/to/GASiC + +To use GASiC from any directory, you may also adjust your PATH environment +variable: + export PATH=$PATH:/path/to/GASiC + + +Dependencies +------------ + +GASiC was tested with the software listed below. Older versions might still +work, but are not tested. + + * Python 2.7, http://www.python.org/ + + * NumPy 1.6.1, http://numpy.scipy.org/ + * SciPy 0.10.0, http://www.scipy.org/ + * BioPython 1.58, http://biopython.org/wiki/Biopython + * pysam 0.6, http://code.google.com/p/pysam/ + +GASiC calls alignment tools and read simulators from the command line. These +tools must be installed and added to the PATH before GASiC can be used. +We tested GASiC with the following tools: + + * bowtie, http://bowtie-bio.sourceforge.net/index.shtml + * bowtie2, http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + * bwa, http://bio-bwa.sourceforge.net/bwa.shtml + + * Mason, http://www.seqan.de/projects/mason/ + * dwgsim, http://dnaa.sourceforge.net/ + + +Bug Reporting +------------- + +You can send GASiC bug reports to <[email protected]>. + +Please provide a minimal example of the code producing the error and the +error message(s). Also check which libraries (see Dependencies) you use. + + + + +------------------------------------------------------------------------------- +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/gasic.py b/core/gasic.py new file mode 100644 index 0000000..687e1be --- /dev/null +++ b/core/gasic.py @@ -0,0 +1,186 @@ +""" +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" +import sys +import numpy as np +import numpy.linalg as la +import scipy.optimize as opt + + +def similarity_correction(sim, reads, N): + """ Calculate corrected abundances given a similarity matrix and observations using optimization. + + Input: + sim: [numpy.array (M,M)]: with pairwise similarities between species + reads [numpy.array (M,)]: with number of observed reads for each species + N [int]: total number of reads + + Output: + abundances [numpy.array (M,)]: estimated abundance of each species in the sample + """ + + # transform reads to abundances and rename similarity matrix + A = sim + r = reads.astype(np.float) / N + + rng = range(len(reads)) + + # Now solve the optimization problem: min_c |Ac-r|^2 s.t. c_i >= 0 and 1-sum(c_i) >= 0 + # construct objective function + def objective (c): + # numpy implementation of objective function |A*c-r|^2 + return np.sum(np.square(np.dot(A,c)-r)) + + # construct constraints + def build_con(k): + # constraint: k'th component of x should be >0 + return lambda c: c[k] + cons = [build_con(k) for k in rng] + # constraint: the sum of all components of x should not exceed 1 + cons.append( lambda c: 1-np.sum(c) ) + + # initial guess + c_0 = np.array([0.5 for i in range(len(reads))]) + + # finally: optimization procedure + abundances = opt.fmin_cobyla(objective, c_0, cons, disp=0, rhoend=1e-10, maxfun=10000) + + return abundances + + + + +def similarity_correction_smp(sim, reads, N, subsets=1): + """ Calculate corrected abundances given a similarity matrix and observations + using optimization. Sample subsets of the reads to obtain a more stable + estimation of the abundance. + + Input: + sim: [numpy.array (M,M)]: with pairwise similarities between species + reads [numpy.array (M,N)]: read mapping information about every species + N [int]: total number of reads + subsets [int]: divide the reads in subsets and use the median of the estimated abundances + + Output: + abundances [numpy.array (M,)]: estimated relative abundance of each species in the sample + """ + # Each subset consists of every subsets'th read. Count the number of matching reads + # per species in each subset + M = reads.shape[0] + reads_sbs = np.zeros((M, subsets)) + for s in range(subsets): + reads_sbs[:,s] = np.sum(reads[:,s::subsets], axis=1) + + N_sbs = np.ceil(float(N)/subsets) + + # calculate the corrected abundances for each subset + sbs_corr = np.zeros( (M,subsets) ) + for s in range(subsets): + sbs_corr[:,s] = similarity_correction(sim, reads_sbs[:,s], N_sbs) + + abundances = np.median(sbs_corr,axis=1) + + return abundances + + + +def bootstrap_similarity_matrix(mapped_reads): + """ + Calculate a similarity matrix by bootstrapping. + + INPUT: + mapped_reads: mapping information as generated by similarity_matrix_raw(). + + OUTPUT: + d_matrix: similarity matrix. Ordering is the same as in 'names' used in the + similarity_matrix_raw() function. + """ + # get the number of sequences and simulated reads from the shape of the mapped reads matrix + num_reads = mapped_reads.shape[2] + num_seq = mapped_reads.shape[0] + + # create a bootstrap index vector + bootstrap_indices = np.random.randint(num_reads, size=(num_reads,)) + + # count number of mapped reads in bootstrap sample + counts = np.sum( mapped_reads[:,:,bootstrap_indices], axis=2 ) + + # normalize read counts and build similarity matrix + s_matrix = np.zeros((num_seq,num_seq)) + for i in range(num_seq): + for j in range(num_seq): + # similarity matrix entries as described in the paper + s_matrix[i,j] = float(counts[j,i])/counts[i,i] + + return s_matrix + + + +def bootstrap(reads, smat_raw, B, test_c=0.01): + """ + Similarity correction using a bootstrapping procedure for more robust corrections and error + estimates. + reads: [numpy.array (M,N)] array with mapping information; reads[m,n]==1, if read n mapped to + species m. + smat_raw: mapping information for similarity matrix. species have same ordering as reads array + B: Number of bootstrap samples + test_c: For testing: treat species as not present, if estimated concentration is below test_c. + """ + # M: Number of species, N: Number of reads + M,N = reads.shape + + # initialize arrays to store results + found = np.zeros( (B,M) ) + corr = np.zeros( (B,M) ) + fails = np.zeros( (B,M) ) + + for b in range(B): + msg = " bootstrapping %i of %i"%(b+1,B) + sys.stdout.write("\r" + " "*(len(msg)+5) + "\r") + sys.stdout.write(msg) + sys.stdout.flush() + + # select a bootstrap sample + random_set = np.random.randint(N,size=N) + + # count the number of matching reads in bootstrap sample + found[b,:] = np.sum(reads[:,random_set], axis=1) + #for r in range(N): + # found[b,:] += reads[:,random_set[r]] + + # bootstrap a similarity matrix + smat = bootstrap_similarity_matrix(smat_raw) + + # calculate abundances + corr[b,:] = similarity_correction(smat,found[b,:],N) + + # check if the calculated abundance is below the test abundance + fails[b,:] = corr[b,:] < test_c + print + p_values = np.mean(fails, axis=0) + abundances = np.mean(corr, axis=0) + variances = np.var(corr, axis=0) + return p_values, abundances, variances diff --git a/core/tools.py b/core/tools.py new file mode 100644 index 0000000..6faa73f --- /dev/null +++ b/core/tools.py @@ -0,0 +1,152 @@ +""" +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import os + +# Names File Reader +# +# Reads a names file (plain text, one name per line) and returns an array of strings +def read_names(filename): + f = open(filename,'r') + names = [nm.rstrip('\n\r') for nm in f] + f.close() + return names + + +# Read Mappers +# +# Define caller functions for the read mappers here. These fuctions call the +# read mappers on the command line with the correspondig parameters. Output +# is a SAM file at the specified location. Each function takes 3 mandatory +# string arguments: +# index: name of the reference sequence or reference index +# reads: name of the file containing the reads +# out: name of the output SAM file +# Additional mapper parameters may be speciefied: +# param: string with parameters for the mapper [default: ""] +# +# Note: the mapper should be configured in that way, that it only reports ONE +# match for each read (e.g. the best match). + +def run_bowtie2(index, reads, out, param=""): + command = "bowtie2 -U {reads} -x {index} -S {samfile} {param} --local -M 0 ".format(reads=reads, index=index, samfile=out, param=param) + print "Executing:",command + os.system(command) + return 1 + +def run_bowtie(index, reads, out, param=""): + command = "bowtie -S -p 2 -q -3 30 -v 2 {param} {index} {reads} > {samfile}".format(index=index, reads=reads, samfile=out, param=param) + print "Executing:",command + os.system(command) + return 1 + +def run_bwa(index, reads, out, param=""): + command = "bwa aln {param} {index} {reads} > /tmp/res.sai".format(index=index, reads=reads, param=param) + print "Executing:",command + os.system(command) + # convert the bwa output to SAM and remove temporary file + command_sam = "bwa samse %s /tmp/res.sai %s > %s && rm /tmp/res.sai"%(index,reads,out) + os.system(command_sam) + return 1 + +def run_bwasw(index, reads, out, param=""): + command = "bwa bwasw {param} {index} {reads} > {samfile}".format(index=index, reads=reads, samfile=out, param=param) + print "Executing:",command + os.system(command) + return 1 + + +run_mapper = dict( bowtie=run_bowtie, + bowtie2=run_bowtie2, + bwa = run_bwa, + bwasw = run_bwasw,) + +""" +How to add your custom mapper + +1. Create a caller function + - Create a copy of one of the existing functions, e.g. run_bowtie + - Rename it, customize it, but DO NOT TOUCH the interface! + +2. Add the caller function to the run_mapper dict + - The dict entry should have the format: [name] = [caller function] + +3. Now you can use your mapper: + >>> import tools_lib + >>> tools_lib.run_mapper["name"] +""" + + + +# Read Simulators +# +# Define caller functions for the read simulators here. These fuctions call +# the read simulators on the command line with the correspondig parameters. +# Output is a SAM file at the specified location. Each function takes 3 +# mandatory string arguments: +# ref: name of the reference sequence file +# out: name of the output reads file +# +# Note: since simulators tend to have many tuning parameters, we encourage +# to create a seperate caller function for every scenario. + +def run_mason_illumina(ref, out): + command = "mason illumina -N 10000 -hi 0 -hs 0 -n 72 -sq -o %s %s"%(out,ref) + print "Executing:",command + os.system(command) + # remove the needless SAM file + command = "rm %s.sam"%out + os.system(command) + return 1 + +def run_dwgsim(ref, out): + command = "dwgsim -c 2 -1 80 -2 0 -r 0 -y 0 -e 0.002 -N 100000 -f TACG %s %s"%(ref, out) + print "Executing:",command + os.system(command) + # remove all additional files and rename reads file + command = "mv {out}.bfast.fastq {out} && rm {out}.bwa.read1.fastq && rm {out}.bwa.read2.fastq && rm {out}.mutations.txt".format(out=out) + os.system(command) + return 1 + + +run_simulator = dict( mason_illumina=run_mason_illumina, + dwgsim=run_dwgsim,) + +""" +How to add your custom read simulator + +1. Create a caller function + - Create a copy of one of the existing functions, e.g. run_dwgsim + - Rename it, customize it, but DO NOT TOUCH the interface! + +2. Add the caller function to the run_simulator dict + - The dict entry should have the format: [name] = [caller function] + +3. Now you can use your simulator: + >>> import tools_lib + >>> tools_lib.run_simulator["name"] +""" diff --git a/correct_abundances.py b/correct_abundances.py new file mode 100755 index 0000000..db367db --- /dev/null +++ b/correct_abundances.py @@ -0,0 +1,177 @@ +#!/usr/bin/python +""" +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import numpy as np +import pysam +import sys +import glob +import optparse + +from core import gasic +from core import tools + + +def similarity_correction(names, smat_raw, sam_pattern, bootstrap_samples): + """ + Perform similarity correction step. The similarity matrix and mapping + results must be available. + + INPUT: + names: array of genome names + smat_raw: mapping information for similarity matrix with same ordering of genomes as in 'names' + sam_pattern: pattern pointing to the filenames of the SAM-files to analyze + bootstrap_samples: number of bootstrap samples, use 1 to disable bootstrapping + + OUTPUT: + total: total number of reads in the dataset + num_reads: number of reads mapped to each genome (array) + corr: abundance of each genome after similarity correction + err: estimated standard error + p: p-value for the confidence, that the true abundance is above some threshold + """ + + print "\n--- GASiC correction\n" + # find out the number of reads + total = len( [1 for read in pysam.Samfile(sam_pattern%(names[0]), "r")] ) + print " found %i reads"%total + + # initialize some arrays + # mapping information; mapped[i,j]=1 if read j was successfully mapped to i. + mapped = np.zeros( (len(names), total) ) + + # total number of successfully mapped reads per reference + num_reads = np.zeros( (len(names),) ) + + print "\n--- Analyzing SAM files\n" + # analyze the SAM files + for n_ind,nm in enumerate(names): + msg = " SAM file %i of %i"%(n_ind+1,len(names)) + sys.stdout.write("\r" + " "*(len(msg)+5) + "\r") + sys.stdout.write(msg) + sys.stdout.flush() + + # open SAM file + sf = pysam.Samfile(sam_pattern%nm, "r") + + # go through reads in samfile and check if it was successfully mapped + mapped[n_ind,:] = np.array([int(not rd.is_unmapped) for rd in sf]) + num_reads[n_ind] = sum(mapped[n_ind,:]) + + print "\n\n--- Bootstrapping abundances\n" + # run similarity correction step + p,corr,var = gasic.bootstrap(mapped, smat_raw, bootstrap_samples) + err = np.sqrt(var) + + print "\nDone estimating abundances\n" + return total,num_reads,corr,err,p + + +def unique(names, sam_pattern): + """ Determine the number of unique reads for every species based on the read names. + INPUT: + names: array of genome names + sam_pattern: pattern pointing to the filenames of the SAM-files to analyze + + OUTPUT: + unique: number of unique reads per species. + """ + # one set for the names of mapped reads for each species + mapped_read_names = [set() for nm in names] + + for n,nm in enumerate(names): + # parse the samfile + sf = pysam.Samfile(sam_pattern%nm, "r") + for read in sf: + # add the hash of read name to the set, if read was mapped + if not read.is_unmapped: + mapped_read_names[n].add(hash(read.qname)) + + unique_read_names = [set() for nm in names] + for n in range(len(names)): + others = set() + for m in range(len(names)): + if n!=m: + others |= mapped_read_names[m] + unique_read_names[n] = mapped_read_names[n] - others + + return np.array([len(unq) for unq in unique_read_names]) + + +if __name__=="__main__": + usage = """%prog NAMES + +Run the similarity correction step. + + +Note: Although it is possible to run the read mappers by hand or to create the +similarity matrix manually, we strongly recommend to use the provided Python +scripts 'run_mappers.py' and 'create_similarity_matrix.py'. + +Input: +NAMES: Filename of the names file; the plain text names file should + contain one name per line. The name is used as identifier in + the whole algorithm. + +See the provided LICENSE file or source code for license information. +""" + + # configure the parser + parser = optparse.OptionParser(usage=usage) + parser.add_option('-m', '--similarity-matrix', type='string', dest='smat', default='./similarity_matrix.npy', help='Path to similarity matrix file. The similarity matrix must be created with the same NAMES file. [default: %default]') + parser.add_option('-s', '--samfiles', type='string', dest='sam', default='./SAM/%s.sam', help='Pattern pointing to the SAM files created by the mapper. Placeholder for the name is "%s". [default: %default]') + + parser.add_option('-b', '--bootstrap-samples', type='int', dest='boot', default=100, help='Set the number of bootstrap samples. Use 1 to disable bootstrapping [default: %default]') + parser.add_option('-o', '--output', type='string', dest='out', default='./results.txt', help='Plain text output file containing the results. [default: %default]') + # parse arguments + opt, args = parser.parse_args() + + numArgs = len(args) + if numArgs == 1: + # read the Names file + names_file = args[0] + names = tools.read_names(names_file) + + # load the similarity matrix + smat = np.load(opt.smat) + + # start similarity correction + total,num_reads,corr,err,p = similarity_correction(names, smat, opt.sam, opt.boot) + + # write results into tab separated file. + ofile = open(opt.out,'w') + ofile.write("#genome name\tmapped reads\testimated reads\testimated error\tp-value\n") + for n_ind,nm in enumerate(names): + # Name, mapped reads, corrected reads, estimated error, p-value + out = "{name}\t{mapped}\t{corr}\t{error}\t{pval}\n" + ofile.write(out.format(name=nm,mapped=num_reads[n_ind],corr=corr[n_ind]*total,error=err[n_ind]*total,pval=p[n_ind])) + ofile.close() + print "--- wrote results to", opt.out + + else: + parser.print_help() + sys.exit(1) diff --git a/create_matrix.py b/create_matrix.py new file mode 100755 index 0000000..fa412a6 --- /dev/null +++ b/create_matrix.py @@ -0,0 +1,158 @@ +#!/usr/bin/python +""" +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import sys +import os +import glob +import optparse + +import numpy as np +from Bio import SeqIO +import pysam + +from core import tools + + +def similarity_matrix_raw(names, ref_pattern, index_pattern, temp_dir, simulator, mapper): + """ + Perform the read generation and mapping step for the similarity matrix + calculation. The raw output is used to bootstrap a similarity matrix. + + INPUT: + names: array of genome names + ref_pattern: pattern pointing to the reference genome files used by the read simulator + index_pattern: pattern pointing to the mapping index files used by the read mapper + temp_dir: directory where temporary files are stored (simulated reads, SAM files). + Attention: make sure that there is enough space available! + + OUTPUT: + mapped_reads: Mapping information about mapped reads readily usable by + bootstrap_similarity_matrix(). + """ + + if not mapper in tools.run_mapper: + raise Exception('Aborting. Mapper "%s" not found in tools.py'%mapper) + if not simulator in tools.run_simulator: + raise Exception('Aborting. Simulator "%s" not found in tools.py'%simulator) + if not os.path.isdir(temp_dir): + os.makedirs(temp_dir) + + n_seq = len(names) + rng = range(n_seq) + + # construct arrays with real file names from pattern + ref_files = [ ref_pattern%nm for nm in names ] # filenames of reference sequences + index_files = [ index_pattern%nm for nm in names ] # filenames of mapper index files + sim_files = [ temp_dir+'/'+nm+'.fastq' for nm in names ] # filenames of simulated read files + + print "\n--- Simulating reads for each reference genome with %s\n"%simulator + # generate reads for every reference genome + for i in rng: + tools.run_simulator[simulator](ref_files[i], sim_files[i]) + + # find out how many reads were generated + # Attention: Here we assume that all files contain the same number of reads + # and are stored in fastq format + num_reads = len( [ True for i in SeqIO.parse(sim_files[0],'fastq') ] ) + + print "\n--- Mapping simulated reads to reference genomes with %s\n"%mapper + # map the reads of every reference to all references + for i in rng: + for j in rng: + samfile = temp_dir+'/'+names[i]+'-'+names[j]+'.sam' + tools.run_mapper[mapper](index_files[j], sim_files[i], samfile) + + print "\n--- Parsing SAM files\n" + # parse SAM files + mapped_reads = np.zeros((n_seq,n_seq, num_reads)) + for i in rng: + for j in rng: + msg = " SAM file %i of %i"%(i*n_seq+j+1,n_seq*n_seq) + sys.stdout.write("\r" + " "*(len(msg)+5) + "\r") + sys.stdout.write(msg) + sys.stdout.flush() + + # count the reads in i mapping to refernce j + samfile = temp_dir+'/'+names[i]+'-'+names[j]+'.sam' + samhandle = pysam.Samfile(samfile, "r") + mapped_reads[i,j,:] = np.array( [int(not read.is_unmapped) for read in samhandle] ) + + print "\n\nMatrix creation done\n" + + return mapped_reads + + + + + +if __name__=="__main__": + usage = """%prog [options] NAMES + +Calculate the similarity matrix. + +First, a set of reads is simulated for every reference genome using a read +simulator from core/tools.py specified via -s. +Second, the simulated reads of each species are mapped against all reference +genomes using the mapper specified with -m. +Third, the resulting SAM-files are analyzed to calculate the similarity +matrix. The similarity matrix is stored as a numpy file (-o). + +Input: +NAMES: Filename of the names file; the plain text names file should + contain one name per line. The name is used as identifier in + the whole algorithm. + +See the provided LICENSE file or source code for license information. +""" + + # configure the parser + parser = optparse.OptionParser(usage=usage) + parser.add_option('-s', '--simulator', type='string', dest='simulator', default=None, help='Identifier of read simulator defined in core/tools.py [default: %default]') + parser.add_option('-r', '--reference', type='string', dest='ref', default='./ref/%s.fasta', help='Reference sequence file pattern for the read simulator. Placeholder for the name is "%s". [default: %default]') + parser.add_option('-m', '--mapper', type='string', dest='mapper', default=None, help='Identifier of mapper defined in core/tools.py [default: %default]') + parser.add_option('-i', '--index', type='string', dest='index', default='./ref/%s.fasta', help='Reference index files for the read mapper. Placeholder for the name is "%s". [default: %default]') + parser.add_option('-t', '--temp', type='string', dest='temp', default='./temp', help='Directory to store temporary simulated datasets and SAM files. [default: %default]') + parser.add_option('-o', '--output', type='string', dest='out', default='./similarity_matrix.npy', help='Output similarity matrix file. [default: %default]') + # parse arguments + opt, args = parser.parse_args() + + numArgs = len(args) + if numArgs == 1: + # read the Names file + names_file = args[0] + names = tools.read_names(names_file) + + # construct the similarity matrix + smat = similarity_matrix_raw(names, opt.ref, opt.index, opt.temp, opt.simulator, opt.mapper) + + # save the similarity matrix + np.save(opt.out, smat) + print "Wrote similarity matrix to",opt.out + else: + parser.print_help() + sys.exit(1) diff --git a/doc/MANUAL b/doc/MANUAL new file mode 100644 index 0000000..493cec2 --- /dev/null +++ b/doc/MANUAL @@ -0,0 +1,183 @@ +GASiC Overview +-------------- + +GASiC (Genome Abundance Similarity Correction) analyzes results +obtained from aligning sequence reads against a set of reference +sequences and estimates the abundance of each reference in the +dataset. + + +1. Input / Format Prerequisites +------------------------------- + +In addition to the raw sequence read dataset and the reference +genomes, GASiC requires a read alignment/mapping tool and a read +simulator installed. + +The reference genomes must be stored in separate files with a +unique, brief filename entitling the content of the file, e.g. +'e.coli.fasta'. The format is not fixed, but must be compatible +to the read mapper and read simulator. + +If the read mapper requires a reference index (e.g. bowtie), the +index must have the same base name as the reference genome file, +e.g. 'e.coli.index'. + +The raw data requires no special format; everything is possible, +if the read mapper is able to read the file. + +The read alignment/mapping tool should be appropriate for the +raw data, i.e. do not use a short read mapper to align Sanger +reads! The output of the read mapper must be in the SAM format. +Most tools natively support SAM output or provide scripts to +convert their output to SAM format (e.g. SAMSE for BWA). + +The read simulator should be able to simulate reads with the +same characteristics as in the raw data. The output format must +be readable by the alignment/mapping tool. + + +2. Command Line Interface +------------------------- + +One way to use GASiC is the command line interface. This +requires the least knowledge about programming and Linux. + +2.1 Names File +-------------- + +One central element for practically using GASiC via the command +line interface is the so called Names File. The Names File +contains the brief uniqe names identifying the reference +genomes involved in the analysis. The names are listed as plain +text, using one name per line. An example Names File could look +like (without the dashed lines): + +--- Names File --- +e.coli +EHEC +shigella +--- End of file --- + +This has the advantage, that you only need to pass the Names +File and a general path pattern to the scripts. Let us make an +example. Say, you stored your reference sequences in the +directory 'references/' as 'e.coli.fasta', 'EHEC.fasta', and +'shigella.fasta'. The mapper index files are stored accordingly +in 'references/map_index/'. Instead of passing the full +paths of all files to the GASiC scripts, you only need to pass +the names file and the pattern string 'references/%s.fasta' +for the references and 'references/map_index/%s.index' for the +index files. '%s' is the placeholder for the name listed in +the index file. + + +2.2 Read Alignment/Mapping +-------------------------- + +The first stage of GASiC is to map/align the reads to the +reference genomes. Although this can also be done by hand, we +recommend to use our script 'run_mappers.py'. + +'run_mappers.py' can be called from the command line via: + python run_mappers.py +Since no parameters were passed, this will simply show the +help text and exit. The behaviour is controlled by the +parameters described in the help text. + +The general call looks like this: + python run_mappers.py -m [mapper] -i [index] -o [output] [names] + +[mapper] is a string identifying which mapping tool is used. +The identifier must be associated to a call function in the +GASiC file 'core/gasic.py'. More information can be found +there. If you specify 'bowtie', the bowtie mapper must be +installed on your system. + +[index] is the string pattern pointing to the reference +genome index files. For example: 'references/index/%s.idx'. + +[output] is the string pattern pointing to the output files +created by the mapping tool. For example: 'SAM/%s.sam'. + +[names] is the path to the Names File, e.g. 'names.txt'. + + +2.3 Similarity Estimation +------------------------- + +The genome similarity is estimated with the GASiC script +'create_matrix.py'. It is called via + python create_matrix.py -s [simulator] -r [reference] -m [mapper] -i [index] -t [temp] -o [output] [names] + +[simulator] is the identifier string for the read simulator, +see the [mapper] description above. + +[reference] is the string pattern pointing to the reference +sequences serving as input for the read simulator, e.g. +'references/%s.fasta' + +[mapper] see above. + +[index] see above. + +[temp] is the path of a temporary directory, where the +simulated reads and intermediate SAM files can be stored. +For example: '/tmp'. + +[output] is the filename of the output similarity matrix +file. The file is in the NumPy format. Example: +'similarity_matrix.npy' + +[names] Names File, see above. + + +2.4 Similarity Correction +------------------------- + +The GASiC script 'correct_abundances.py' estimates the +true abundances for each reference genome and calculates +p-values. The script is called via + python correct_abundances.py -m [matrix] -s [samfiles] -b [bootstrap] -o [output] [names] + +[matrix] is the filename of the similarity matrix file +obtained in the previous step. For example +'similarity_matrix.npy'. + +[samfiles] is a string pointing to the SAM files created +in the mapping step. For example: 'SAM/%s.sam'. + +[bootstrap] is the number of bootstrap repetitions of +the experiment. This highly influences the runtime. +For example: '100' + +[output] is the filename where the results will be saved. +For example: 'results.txt'. + + +3. Scripting Interface +---------------------- + +The command line scripts described in the previous +section can also be imported in python, such that they +can easily be used to create more sophisticated analysis +scripts. One example script is provided in the file +'doc/example_script.py'. The script is documented, +please find information there. + + +4. Developer Information +------------------------ + +Since the GASiC source code is freely available and well +documented, users can easily extend GASiC or tune it to +their needs. This may become necessary when very large +datasets need to be analyzed and for example parallel +computation needs to be involved, or the calculation of +the similarity matrix needs to be accelerated and some +prior knowledge about the genome similarities needs to +be included. + +GASiC is published under a BSD-like license and can thus +be used, modified, and distributed easily. See the +LICENSE file for more information. \ No newline at end of file diff --git a/doc/example_script.py b/doc/example_script.py new file mode 100644 index 0000000..88dc372 --- /dev/null +++ b/doc/example_script.py @@ -0,0 +1,172 @@ +""" +File: example_script.py + +Description: + This file contains commented analysis script demonstrating how GASiC + can be used via the scripting interface. The contained code only has + exemplary character and is not meant to be executed directly. + +Author: Martin Lindner, [email protected] +""" + + +""" + Example 1: + ---------- + + Simple GASiC workflow. Reproduces the command line tool + functionalities. + + Assumptions: + - Data: FASTQ reads in './data/reads.fastq' + - Reference genomes: FASTA files in './ref/' + - Mapper index: Index files in './ref/index/' + - bowtie mapper + - Mason simulator (configured for illumina reads, "mason_illumina") + - Names File in 'names.txt' +""" + +# load module for system operations +import os + + +# load GASiC modules +from core import tools +import create_matrix +import correct_abundances + + +# set all paths and patterns +p_data = './data/reads.fastq' +p_ref = './ref/%s.fasta' +p_index = './ref/index/%s' +p_SAM = './SAM/%s.sam' +p_temp = './distance' + + +# create directories for SAM files and temporary files +os.makedirs(p_SAM) +os.makedirs(p_temp) + + +# read the Names File +names = tools.read_names('names.txt') + + +# Step 1: map the reads to every reference genome +for name in names: + # fill the current name into the pattern of reference index and SAM path + index_name = p_index%name + SAM_name = p_SAM%name + + # run the read mapper + tools.run_mapper['bowtie'](index_name, p_data, SAM_name) + + +# Step 2: calculate the similarity matrix +sim_matrix = create_matrix.similarity_matrix_raw(names, p_ref, p_index, p_temp, 'mason_illumina', 'bowtie') + + +# Step 3: estimate the true abundances +total,mapped,est,err,p = correct_abundances.similarity_correction(names, sim_matrix, p_SAM, 100) + +print "Finished." + + + + +""" + Example 2: + ---------- + + Apply GASiC to 5 datasets, e.g. containing repetitions of the + same experiment + + Assumptions: + - Data: FASTQ reads in './data/' with names 'rep0.fastq' - 'rep5.fastq' + - Reference genomes: FASTA files in './ref/' + - Mapper index: Index files in './ref/index/' + - a custom mapper + - Mason simulator (configured for illumina reads, "mason_illumina") + - Names File in 'names.txt' +""" + +# load module for system operations +import os + + +# load GASiC modules +from core import tools +import create_matrix +import correct_abundances + + +# set all paths and patterns +p_data = './data/rep%i.fastq' +p_ref = './ref/%s.fasta' +p_index = './ref/index/%s' +p_temp = './distance' + + +# create directories for SAM files and temporary files +os.makedirs('./SAM') +os.makedirs(p_temp) + + +# read the Names File +names = tools.read_names('names.txt') + + +# define a wrapper function for 'my_mapper' +def run_my_mapper(index, reads, out, param=""): + command = "my_mapper -d some_default {param} {index} {reads} > {samfile}".format(index=index, reads=reads, samfile=out, param=param) + print "Executing:",command + os.system(command) + return 1 + + +# register wrapper function temporarily as 'my_mapper' +tools.run_mapper['my_mapper'] = run_my_mapper + + +# calculate the similarity matrix only once +sim_matrix = create_matrix.similarity_matrix_raw(names, p_ref, p_index, p_temp, 'mason_illumina', 'my_mapper') + + +# set up numpy arrays to store all results +num_genomes = len(names) +import numpy as np +total = np.zeros((5,)) +mapped= np.zeros((5,num_genomes)) +est = np.zeros((5,num_genomes)) +err = np.zeros((5,num_genomes)) +p = np.zeros((5,num_genomes)) +unique= np.zeros((5,num_genomes)) + +# iterate over all datasets +for it in range(5): + dataset = p_data%it + + # map the reads to every reference genome + for name in names: + # fill the current name into the pattern of reference index and SAM path + index_name = p_index%name + SAM_name = './SAM/rep%i_%s.sam'%(it,name) + + # run the read mapper + tools.run_mapper['bowtie'](index_name, dataset, SAM_name) + + # estimate the true abundances and count unique reads + SAM_pattern = './SAM/rep{num}_%s.sam'.format(num=it) + total[it],mapped[it,:],est[it,:],err[it,:],p[it,:] = correct_abundances.similarity_correction(names, sim_matrix, SAM_pattern, 100) + unique[it,:] = correct_abundances.unique(names, SAM_pattern) + +# GASiC calculation finished. Now we can post-process the results +# for example, calculate mean and variance for each genome over all datasets +mean_est = np.mean(est,axis=0) +var_est = np.var(est,axis=0) + +print mean_est +print var_est + +print "Finished." diff --git a/quality_check.py b/quality_check.py new file mode 100755 index 0000000..1a79cf6 --- /dev/null +++ b/quality_check.py @@ -0,0 +1,131 @@ +#!/usr/bin/python +""" +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import optparse +import glob +import os +import sys +try: + import matplotlib.pyplot as plt +except ImportError: + raise Exception('Import Error: Failed to load matplotlib module. Please install matplotlib properly (http://matplotlib.sourceforge.net/).') + +try: + import pysam +except ImportError: + raise Exception('Import Error: Failed to load pysam module. Please install pysam properly (http://code.google.com/p/pysam/).') + +import numpy as np +import scipy.stats as st +from core import tools + +if __name__=="__main__": + usage = """%prog [Options] SAMFILE + +Perform a sanity check on the mapping results (SAM file). + +This tool analyzes the output of the read mapper and provides useful +information to the user to decide whether the set of reference sequences +and the mapper settings are feasible for the given dataset. + +Input: +SAMFILE: Name of the SAM file to analyze + +""" + + # configure the parser + parser = optparse.OptionParser(usage=usage) + parser.add_option('-o', '--outpath', type='string', dest='out', default='./', help='Path to the directory for the analysis output. [default: %default]') + # parse arguments + options, args = parser.parse_args() + + numArgs = len(args) + if numArgs >= 1: + sam_names = args + + # create output directory if necessary + if not os.path.exists(os.path.dirname(options.out)): + os.makedirs(os.path.dirname(options.out)) + + # analyze all files and gather statistics + for i,nm in enumerate(sam_names): + dataset_name = os.path.splitext(os.path.split(nm)[1])[0] + sf = pysam.Samfile(nm,'r') + cov = np.zeros( (sum(sf.lengths),) ) + start_pos = np.cumsum(sf.lengths)-sf.lengths[0] + total_reads = 0 + mapped_reads = 0 + for read in sf: + total_reads += 1 + if not read.is_unmapped: + r_start = start_pos[read.tid] + read.pos + r_end = start_pos[read.tid] + read.pos + read.qlen + cov[r_start:r_end] += 1 + mapped_reads += 1 + + # calculate coverage + max_cov = np.max(cov) + mean_cov = np.mean(cov) + + # plot the coverage + plt.figure() + h = plt.hist(cov, bins=min(max_cov,100), range=(0,max_cov)) + plt.xlabel('coverage') + plt.ylabel('# occurrences') + plt.title('Coverage histogram for '+dataset_name) + plt.savefig(options.out+'/'+dataset_name+'_coverage.png', dpi=300) + + # write out user information + f = open(options.out+'/'+dataset_name+'_info.txt','w') + f.write(' Name:\t'+nm+'\n') + f.write('Total Genome Length:\t%i\n'%(sum(sf.lengths))) + f.write('Num. Contigs:\t%i\n\n'%(len(sf.lengths))) + f.write('Mapping Results:\n') + f.write('Total Reads:\t%i\n'%(total_reads)) + f.write('Mapped Reads:\t%i\n'%(mapped_reads)) + f.write('Average Coverage:\t%f\n'%(mean_cov)) + zero_cov = h[0][0]/float(sum(h[0])) + f.write('Fraction of bases with 0 Coverage:\t%f\n\n'%(zero_cov) ) + + f.write('Comments:\n') + if mean_cov < 1.: + f.write('Low overall coverage. ') + if mapped_reads/float(total_reads)>0.01: + f.write('Consider using a larger dataset.\n') + else: + f.write('Abundance of this or related sequences is below 0.01.\n') + + p = st.poisson(mean_cov) + if zero_cov > 2*p.pmf(0): + f.write('Unnaturally many bases with zero coverage. Consider that this species is not present in the dataset.\n') + + f.close() + else: + parser.print_help() + sys.exit(1) + diff --git a/run_mappers.py b/run_mappers.py new file mode 100755 index 0000000..41114da --- /dev/null +++ b/run_mappers.py @@ -0,0 +1,85 @@ +#!/usr/bin/python +""" +Copyright (c) 2012, Martin S. Lindner, [email protected], +Robert Koch-Institut, Germany, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import optparse +import glob +import os +import sys + +from core import tools + +if __name__=="__main__": + usage = """%prog NAMES READS -i REF -o OUT -m MAPPER + +Run a read mapper to map reads to reference genomes. + +The names in the NAMES file will be inserted in the provided string +patterns. Each pattern must contain exactly one "%s" placeholder +(python string formatting). + +Input: +NAMES: Filename of the names file; the plain text names file should + contain one name per line. The name is used as identifier in + the whole algorithm. + +READS: File containing the reads to be mapped. + +""" + + # configure the parser + parser = optparse.OptionParser(usage=usage) + parser.add_option('-m', '--mapper', type='string', dest='mapper', default=None, help='Identifier of mapper defined in core/tools.py [default: %default]') + parser.add_option('-i', '--index', type='string', dest='ref', default='./ref/%s.fasta', help='Pattern, that points to the reference sequences/indices when used with a name. Placeholder for the name is "%s". [default: %default]') + parser.add_option('-o', '--output', type='string', dest='out', default='./SAM/%s.sam', help='Pattern, that points to the output SAM file, when used with a name. Placeholder for the name is "%s". [default: %default]') + # parse arguments + options, args = parser.parse_args() + + numArgs = len(args) + if numArgs == 2: + reads = args[1] + names = tools.read_names(args[0]) + out = [options.out%nm for nm in names] + ref = [options.ref%nm for nm in names] + + if not options.mapper: + parser.print_help() + raise Exception('Aborting. No mapper specified.') + + # create output directory if necessary + if not os.path.exists(os.path.dirname(options.out)): + os.makedirs(os.path.dirname(options.out)) + + for i in range(len(names)): + msg = "--- mapping reads with %s to %s"%(options.mapper, ref[i]) + print msg + tools.run_mapper[options.mapper](ref[i], reads, out[i]) + print "\nMapping done.\n" + else: + parser.print_help() + sys.exit(1) + -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gasic.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
