This is an automated email from the git hooks/post-receive script. afif-guest pushed a commit to branch master in repository pbbarcode.
commit 266491af26faabf51816b3a5314c77c8f525b06d Author: Afif Elghraoui <[email protected]> Date: Sun Nov 29 01:36:48 2015 -0800 Imported Upstream version 0.8.0 --- Makefile | 47 ++ README.rst | 36 ++ doc/Makefile | 153 +++++++ doc/PbbarcodeFunctionalSpecification.rst | 405 +++++++++++++++++ doc/conf.py | 242 ++++++++++ doc/index.rst | 16 + etc/barcode.fasta | 8 + etc/barcode_complete.fasta | 192 ++++++++ etc/pacbio_barcodes_paired.fasta | 192 ++++++++ setup.py | 37 ++ src/C/Makefile | 11 + src/C/sw.c | 56 +++ src/python/pbbarcode/BarcodeLabeler.py | 225 +++++++++ src/python/pbbarcode/SWaligner.py | 69 +++ src/python/pbbarcode/__init__.py | 0 src/python/pbbarcode/_version.py | 1 + src/python/pbbarcode/main.py | 751 +++++++++++++++++++++++++++++++ tests/cram/consensus.t.disabled | 88 ++++ tests/cram/sanity.t | 55 +++ tests/test_basic.py | 32 ++ 20 files changed, 2616 insertions(+) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..982f5ce --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +.PHONY: doc doc-clean + +SHELL = /bin/bash -e + +all: build install + +build: + python setup.py build --executable="/usr/bin/env python" + +bdist: + python setup.py build --executable="/usr/bin/env python" + python setup.py bdist --formats=egg + +install: + python setup.py install + +develop: + python setup.py develop + +test: + find tests -name "*.py" | xargs nosetests + find tests/cram -name "*.t" | grep -v consensus.t | xargs cram --verbose + +clean: doc-clean + rm -rf build/;\ + find . -name "*.egg-info" | xargs rm -rf;\ + find . 
-name "*.pyc" | xargs rm -rf;\ + rm -rf dist/ + make -C src/C clean + +doc-clean: + make -C doc clean + +doc: + make -C doc html + +pip-install: + @which pip > /dev/null + @pip freeze|grep 'pbtools.barcode=='>/dev/null \ + && pip uninstall -y pbtools.barcode \ + || true + @pip freeze|grep 'pbbarcode=='>/dev/null \ + && pip uninstall -y pbbarcode \ + || true + @pip install --no-index \ + --install-option="--install-scripts=$(PREFIX)/bin" \ + ./ diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..42610c7 --- /dev/null +++ b/README.rst @@ -0,0 +1,36 @@ +Overview of the pbbarcode package +================================= + +The *pbbarcode* package provides tools for annotating PacBio +sequencing reads with barcode information. Typically, *pbbarcode* +is called in context of a SMRTPipe workflow as opposed to directly on +the command line, however, users are encouraged to utilize the +command-line utility directly, as more options are available. + +The *pbbarcode* package provides a multi-command line tool +*pbbarcode* which currently has the following sub-commands: + +* labelZmws +* labelAlignments +* emitFastqs +* consensus + +The first three sub-commands depend on only *pbcore* and its +dependencies, the fourth, *consensus*, depends on the *pbdagcon* +package and is considered experimental. + +For more details on the package, please see docs/index.rst for more +information. + +Installation +============ + +Typically, the *pbbarcode* package is installed within an installation +of SMRTPipe, however, it can be installed by itself using:: + + make install + +To test that everything is installed correctly, one should +additionally issue a:: + + make test diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..a37efe5 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,153 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. 
+SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
+ +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbbarcode.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbbarcode.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pbbarcode" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbbarcode" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
+ +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/doc/PbbarcodeFunctionalSpecification.rst b/doc/PbbarcodeFunctionalSpecification.rst new file mode 100644 index 0000000..b00fe1f --- /dev/null +++ b/doc/PbbarcodeFunctionalSpecification.rst @@ -0,0 +1,405 @@ +.. pbbarcode Functional Specification +.. ======================================= + +.. Version + + +Introduction +```````````` +This document describes the interface and input/output formats of the +``pbbarcode`` package command line tools. 
The package provides +utilities for annotating individual ZMWs directly from a bas.h5 file, +emitting fast[a|q] files for each barcode, labeling alignments stored +in a cmp.h5 file, and calling consensus on small amplicons (requires +``pbdagcon``). + +At the moment, Barcodes can be scored in two different ways: +``symmetric`` and ``paired``. Symmetric mode supports barcode designs +with two identical barcodes on both sides of a SMRTbell, e.g., for +barcodes (A, B), molecules are labeled as A--A or B--B. The ``paired`` +mode supports designs with two distinct barcodes on each side of the +molecule, but neither barcode appears without its mate. The minimum +example is given with the following barcodes: (ALeft, ARight, BLeft, +BRight), where the following barcode sets are checked: ALeft--ARight, +BLeft--BRight. + +It is important to highlight that a barcode FASTA file specifies a +list of available barcodes to evaluate. Depending on the scoring mode, +the barcodes are grouped together in different ways. For instance, in +the ``symmetric`` case, the number of possible barcode outcomes is +simply the number of barcodes that are supplied to the routine in the +FASTA file (see below for usage) plus an additional ``NULL`` barcode +indicating that no barcode could be evaluated (denoted by: +'--'). Labels like this (A--A) are used in the final outputs. In the +``paired`` mode, the number of possible barcode outcomes is half the +number of the sequences in the FASTA file plus the ``NULL`` +barcode. The ``NULL`` barcode indicates that no attempt was made to +score the molecule or it was filtered out by the user's criteria. The +majority of cases when a molecule is not scored are related to not +observing any adapters. If a user has executed a "hot-start" run, the +user can try the '--scoreFirst' parameter to attempt to label the +first adapter's barcode. This increases the yield of the labeling +procedure at the expense of some probable false positives. 
+ +The software is implemented as a standard python package. Barcodes are +labeled according to the following high-level logic. For each +molecule, all adapters are found. For each adapter, we align (using +standard Smith-Waterman alignment) each barcode and its reverse +complement to the flanking sequence of the adapter. If two complete +flanking sequences are available, we divide by 2, else 1 if only one +flanking sequence was available (average score at adapter). This +allows the scores across adapters to be on the same scale (chimera +detection). Depending on the ``mode``, we then determine which +barcode(s) are maximally scoring. We store the two maximally scoring +barcodes and the sum of their alignment scores across the adapters. The +average barcode score then can be given approximately by: +total-score/number-of-adapters. At the moment, the alignment +parameters are fixed at: + + +.. table:: SW Match Parameters ++----------+----------+ +|type |score | +| | | ++----------+----------+ +|insertion |-1 | +| | | ++----------+----------+ +|deletion |-1 | +| | | ++----------+----------+ +|mismatch |-2 | +| | | ++----------+----------+ +|match |2 | +| | | ++----------+----------+ + +Input and output +```````````````` + +labelZmws +--------- + usage: pbbarcode labelZmws [-h] [--outDir OUTDIR] [--outFofn OUTFOFN] + [--adapterSidePad ADAPTERSIDEPAD] + [--insertSidePad INSERTSIDEPAD] + [--scoreMode {symmetric,paired}] + [--maxAdapters MAXADAPTERS] [--scoreFirst] + [--startTimeCutoff STARTTIMECUTOFF] + [--nZmws NZMWS] [--nProcs NPROCS] + [--saveExtendedInfo] + barcode.fasta input.fofn + + Creates a barcode.h5 file from base h5 files. + + positional arguments: + barcode.fasta Input barcode fasta file + input.fofn Input base fofn + + optional arguments: + -h, --help show this help message and exit + --outDir OUTDIR Where to write the newly created barcode.h5 files. 
+ (default: /home/UNIXHOME/jbullard/projects/software/bi + oinformatics/tools/pbbarcode/doc) + --outFofn OUTFOFN Write to outFofn (default: barcode.fofn) + --adapterSidePad ADAPTERSIDEPAD + Pad with adapterSidePad bases (default: 4) + --insertSidePad INSERTSIDEPAD + Pad with insertSidePad bases (default: 4) + --scoreMode {symmetric,paired} + The mode in which the barcodes should be scored. + (default: symmetric) + --maxAdapters MAXADAPTERS + Only score the first maxAdapters (default: 20) + --scoreFirst Whether to try to score the leftmost barcode in a + trace. (default: False) + --startTimeCutoff STARTTIMECUTOFF + Reads must start before this value in order to be + included when scoreFirst is set. (default: 10.0) + --nZmws NZMWS Use the first n ZMWs for testing (default: -1) + --nProcs NPROCS How many processes to use (default: 8) + --saveExtendedInfo Whether to save extended information tothe barcode.h5 + files; this information is useful for debugging and + chimera detection (default: False) + +The ``labelZmws`` command takes an input.fofn representing a set of +bas.h5 files to operate on. Additionally, it takes a barcode.fasta +file. Depending on ``scoreMode``, the FASTA file will be processed in +different ways. Specifically, in ``paired`` mode, each two consecutive +barcodes in the file are considered a set. + +The parameters, ``adapterSidePad`` and ``insertSidePad`` represents +how many bases should be considered on each side of the putative +barcode. These parameters are constrained such that: +``|adapterSidePad| + |insertSidePad| + |barcode| < 65``. + +Users have the option to specify a different output location +for the various outputs. Specifically, for each bas.h5 file in +input.fofn, a bc.h5 (barcode hdf5) file is generated. These files are +listed in the file ``outFofn`` which is typically just called +``barcode.fofn``. See below for a description of the barcode hdf5 +file. 
+ + +labelAlignments +--------------- + usage: pbbarcode labelAlignments [-h] + [--minAvgBarcodeScore MINAVGBARCODESCORE] + [--minNumBarcodes MINNUMBARCODES] + [--minScoreRatio MINSCORERATIO] + barcode.fofn aligned_reads.cmp.h5 + + Adds information about barcode alignments to a cmp.h5 file from a previous + call to "labelZmws". + + positional arguments: + barcode.fofn input barcode fofn file + aligned_reads.cmp.h5 cmp.h5 file to add barcode labels + + optional arguments: + -h, --help show this help message and exit + --minAvgBarcodeScore MINAVGBARCODESCORE + ZMW Filter: exclude ZMW if average barcode score is + less than this value (default: 0.0) + --minNumBarcodes MINNUMBARCODES + ZMW Filter: exclude ZMW if number of barcodes observed + is less than this value (default: 1) + --minScoreRatio MINSCORERATIO + ZMW Filter: exclude ZMWs whose best score divided by + the 2nd best score is less than this ratio (default: + 1.0) + + +The ``labelAlignments`` command takes as input a barcode.fofn computed +from a call to ``labelZMWs`` and a cmp.h5 file where the barcode +information is written to. See below for a description of the cmp.h5 +file additions. + + + +emitFastqs +---------- + usage: pbbarcode emitFastqs [-h] [--outDir output.dir] [--subreads] + [--unlabeledZmws] [--trim TRIM] [--fasta] + [--minMaxInsertLength MINMAXINSERTLENGTH] + [--hqStartTime HQSTARTTIME] + [--minReadScore MINREADSCORE] + [--minAvgBarcodeScore MINAVGBARCODESCORE] + [--minNumBarcodes MINNUMBARCODES] + [--minScoreRatio MINSCORERATIO] + input.fofn barcode.fofn + + Takes a bas.h5 fofn and a barcode.h5 fofn and produces a fast[a|q] file for + each barcode. 
+ + positional arguments: + input.fofn input base or CCS fofn file + barcode.fofn input barcode.h5 fofn file + + optional arguments: + -h, --help show this help message and exit + --outDir output.dir output directory to write fastq files (default: /home/ + UNIXHOME/jbullard/projects/software/bioinformatics/too + ls/pbbarcode/doc) + --subreads whether to produce fastq files for the subreads;the + default is to use the CCS reads. This option + onlyapplies when input.fofn has both consensus and raw + reads,otherwise the read type from input.fofn will be + returned. (default: False) + --unlabeledZmws whether to emit a fastq file for the unlabeled ZMWs. + These are the ZMWs where no adapters are found + typically (default: False) + --trim TRIM trim off barcodes and any excess constant sequence + (default: 20) + --fasta whether the files produced should be FASTA files + asopposed to FASTQ (default: False) + --minMaxInsertLength MINMAXINSERTLENGTH + ZMW Filter: exclude ZMW if the longest subreadis less + than this amount (default: 0) + --hqStartTime HQSTARTTIME + ZMW Filter: exclude ZMW if start time of HQ + regiongreater than this value (seconds) (default: inf) + --minReadScore MINREADSCORE + ZMW Filter: exclude ZMW if readScore is less thanthis + value (default: 0) + --minAvgBarcodeScore MINAVGBARCODESCORE + ZMW Filter: exclude ZMW if average barcode score is + less than this value (default: 0.0) + --minNumBarcodes MINNUMBARCODES + ZMW Filter: exclude ZMW if number of barcodes observed + is less than this value (default: 1) + --minScoreRatio MINSCORERATIO + ZMW Filter: exclude ZMWs whose best score divided by + the 2nd best score is less than this ratio (default: + 1.0) + + +The ``emitFastqs`` command takes as input both an input.fofn for the +bas.h5 files as well as a barcode.fofn from a call to labelZmws. The +optional parameter ``outDir`` dictates where the files will be +written. 
For each detected barcode, a fast[a|q] file will be emitted +with all of the reads for that barcode. The ``trim`` parameter +dictates how much of the read should be trimmed off. The default +parameter for ``trim`` is the length of the barcode (which is stored +in the barcode hdf5 files). At the moment, all barcodes in the barcode +FASTA file must be the same length, therefore only a constant trim +value is supported. In practice, one can aggressively trim in order to +ensure that extra bases aren't left on the ends of reads. Finally, the +``subreads`` parameter dictates whether subreads or CCS reads should +be returned with the default being the appropriate reads according to +the input file type, either CCS or subreads. This parameter is only +inspected if the input.fofn contains both CCS and subread data; if the +input.fofn contains only subread or CCS data then that is returned +irrespective of the state of the ``subreads`` parameter and a +warning is issued. + +consensus +--------- + usage: pbbarcode consensus [-h] [--subsample SUBSAMPLE] [--nZmws NZMWS] + [--outDir OUTDIR] [--keepTmpDir] + [--ccsFofn CCSFOFN] [--nProcs NPROCS] + [--noQuiver] + [--minMaxInsertLength MINMAXINSERTLENGTH] + [--hqStartTime HQSTARTTIME] + [--minReadScore MINREADSCORE] + [--minAvgBarcodeScore MINAVGBARCODESCORE] + [--minNumBarcodes MINNUMBARCODES] + [--minScoreRatio MINSCORERATIO] + [--barcode BARCODE [BARCODE ...]] + input.fofn barcode.fofn + + Compute consensus sequences for each barcode. + + positional arguments: + input.fofn input bas.h5 fofn file + barcode.fofn input bc.h5 fofn file + + optional arguments: + -h, --help show this help message and exit + --subsample SUBSAMPLE + Subsample ZMWs (default: 1) + --nZmws NZMWS Take n ZMWs (default: -1) + --outDir OUTDIR Use this directory to output results (default: .) + --keepTmpDir + --ccsFofn CCSFOFN Obtain CCS data from ccsFofn instead of input.fofn + (default: ) + --nProcs NPROCS Use nProcs to execute. 
(default: 16) + --noQuiver + --minMaxInsertLength MINMAXINSERTLENGTH + ZMW Filter: exclude ZMW if the longest subreadis less + than this amount (default: 0) + --hqStartTime HQSTARTTIME + ZMW Filter: exclude ZMW if start time of HQ + regiongreater than this value (seconds) (default: inf) + --minReadScore MINREADSCORE + ZMW Filter: exclude ZMW if readScore is less thanthis + value (default: 0) + --minAvgBarcodeScore MINAVGBARCODESCORE + ZMW Filter: exclude ZMW if average barcode score is + less than this value (default: 0.0) + --minNumBarcodes MINNUMBARCODES + ZMW Filter: exclude ZMW if number of barcodes observed + is less than this value (default: 1) + --minScoreRatio MINSCORERATIO + ZMW Filter: exclude ZMWs whose best score divided by + the 2nd best score is less than this ratio (default: + 1.0) + --barcode BARCODE [BARCODE ...] + Use this to extract consensus for just one barcode. + (default: None) + +The ``consensus`` command takes as input both an input.fofn for the +bas.h5 files as well as a barcode.fofn from a call to labelZmws. The +results are a FASTA file with an entry for each barcode containing the +consensus amplicon sequence. This mode utilizes ``Quiver`` and +``pbdagcon`` to compute consensus. + +In cases where the amplicon is fewer than 2.5k bases, using CCS data +is quite helpful. The ``--ccsFofn`` allows one to pass directly the +ccs files. In many cases, both the CCS and raw basecalls are in the +same file so you can check by passing the same parameter to input.fofn +as to ccsFofn. + +Dependencies +```````````` + +The pbbarcode package depends on a standard pbcore installation +(https://github.com/PacificBiosciences/pbcore). If one wishes to use +the ``consensus`` tool, ``pbdagcon`` needs to be installed +(https://github.com/PacificBiosciences/pbdagcon). + + +Barcode HDF5 File +````````````````` + +The barcode hdf5 file, ``bc.h5``, represents a simple data store for +barcode calls and their scores for each ZMW. 
Generally, a user need +not interact with barcode hdf5 files, but can use the results stored in +either the resulting cmp.h5 file or fast[a|q] files. The barcode hdf5 +file contains the following structure: + +/BarcodeCalls/best - (nZMWs, 6)[32-bit integer] dataset with the +following columns: + + ``holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2,barcodeScore2`` + +Additionally, the ``best`` dataset has the following attributes: + ++-----------+------------------------------------------------------------------------+ +|movieName |m120408_042614_richard_c100309392550000001523011508061222_s1_p0 | +| | | ++-----------+------------------------------------------------------------------------+ +|columnNames|holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2, | +| |barcodeScore2 | ++-----------+------------------------------------------------------------------------+ +|scoreMode |[symmetric|paired] | +| | | ++-----------+------------------------------------------------------------------------+ +|barcodes |'bc_1', 'bc_2', ...., 'bc_N' | +| | | ++-----------+------------------------------------------------------------------------+ + +The two barcodeIdx1 and barcodeIdx2 columns are indices into the +``barcodes`` attribute. The ``scoreMode`` is the scoring mode used to +align the barcodes. The ``barcodes`` attribute corresponds to the +barcode.fasta sequence names. + +Additionally, in some circumstances, it is useful to retain the entire +history of the scoring, i.e., each barcode scored to each adapter +across all ZMWs. In order to retain this information, one must call: + + ``pbbarcode labelZmws --saveExtendedInfo ...`` + +In this mode, the resultant HDF5 file will have an additional dataset +under the BarcodeCalls group, named: ``all``. This dataset has the +following format: + +/BarcodeCalls/all - (nbarcodes * nadapters[zmw_i], 4) \forall i in 1 ... 
nZMWs + + ``holeNumber, adapterIdx, barcodeIdx, score`` + +The ``adapterIdx`` is the index of the adapter along the molecule, +i.e., adapterIdx 1 is the first adapter scored. + +Additions to the compare HDF5 (cmp.h5) File +``````````````````````````````````````````` + +In addition to the barcode hdf5 file, a call to ``labelAlignments`` +will annotate a cmp.h5 file. This annotation is stored in ways +consistent with the cmp.h5 file format. Specifically, a new group: + +| /BarcodeInfo/ +| ID (nBarcodeLabels + 1, 1)[32-bit integer] +| Name (nBarcodeLabels + 1, 1)[variable length string] + +In addition to the /BarcodeInfo/ group, the key dataset which assigns +alignments to barcodes is located at: + +/AlnInfo/Barcode (nAlignments, 3)[32-bit integer] with the following +columns: + + ``index,count,bestIndex,bestScore,secondBestIndex,secondBestScore`` + +Here index refers to the index into the ``Name`` vector, score +corresponds to the sum of the scores for the barcodes, and finally, +count refers to the number of adapters found in the molecule. diff --git a/doc/conf.py b/doc/conf.py new file mode 100755 index 0000000..7a1cd30 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +# +# pbbarcode documentation build configuration file, created by +# sphinx-quickstart on Mon Apr 30 18:28:57 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.viewcode'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'pbbarcode' +copyright = u'2012, PacBio' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '.1' +# The full version, including alpha/beta/rc tags. +release = '.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. 
+#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. 
+#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pbbarcodedoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). 
+latex_documents = [ + ('index', 'pbbarcode.tex', u'pbbarcode Documentation', + u'PacBio', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'pbbarcode', u'pbbarcode Documentation', + [u'PacBio'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'pbbarcode', u'pbbarcode Documentation', + u'PacBio', 'pbbarcode', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..ea69335 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,16 @@ +.. pbbarcode documentation master file, created by + sphinx-quickstart on Mon Apr 30 18:28:57 2012. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +pbbarcode +========= + +Contents: + +.. toctree:: + :maxdepth: 2 + + PbbarcodeFunctionalSpecification + + diff --git a/etc/barcode.fasta b/etc/barcode.fasta new file mode 100644 index 0000000..e524539 --- /dev/null +++ b/etc/barcode.fasta @@ -0,0 +1,8 @@ +>bc3 +tatctatcgtatacgc +>bc4 +atcacactgcatctga +>bc5 +acgtacgctcgtcata +>bc10 +tcatgcacgtctcgct diff --git a/etc/barcode_complete.fasta b/etc/barcode_complete.fasta new file mode 100644 index 0000000..9803f37 --- /dev/null +++ b/etc/barcode_complete.fasta @@ -0,0 +1,192 @@ +>bc_1 +GCGCTCTGTGTGCAGC +>bc_2 +TCATGAGTCGACACTA +>bc_3 +TATCTATCGTATACGC +>bc_4 +ATCACACTGCATCTGA +>bc_5 +ACGTACGCTCGTCATA +>bc_6 +TGTGAGTCAGTACGCG +>bc_7 +AGAGACACGATACTCA +>bc_8 +CTGCTAGAGTCTACAG +>bc_9 +AGCACTCGCGTCAGTG +>bc_10 +TCATGCACGTCTCGCT +>bc_11 +AGAGCATCTCTGTACT +>bc_12 +CGCATCGACTACGCTA +>bc_13 +CGTAGCGTGCTATCAC +>bc_14 +ATGCTGATGACTGCGA +>bc_15 +TGCGTGAGCTGTACAT +>bc_16 +CGATCATCTATAGACA +>bc_17 +CGACGTATCTGACAGT +>bc_18 +CACGTCACTAGAGCGA +>bc_19 +TGTCGCAGCTACTAGT +>bc_20 +CATACGCTGTGTAGCA +>bc_21 +AGTCGCATGACTGTGT +>bc_22 +CAGTACTGCACGATCG +>bc_23 +GTGCTGAGCATCAGAC +>bc_24 +CACTGATCGATATGCA +>bc_25 +TACAGTGTCTGCTGCG +>bc_26 +TACAGATAGTGTAGCG +>bc_27 +TCGTAGAGCTCGAGAC +>bc_28 +GAGCTGCGCACTCGAT +>bc_29 +GCGATGTCGCTATGTG +>bc_30 +CGAGAGTCAGCGCATA +>bc_31 +TCACGATGAGCACGTA +>bc_32 +GACTGAGATCATGATC +>bc_33 +ACGACATGATACTGCT +>bc_34 +ATACAGCACAGATGTG +>bc_35 +ACAGTCGATATCTCTC +>bc_36 +GCTCGATCACATGACG +>bc_37 +GTCGTACACGTGCGAC +>bc_38 +ACTCATATCTAGAGTG +>bc_39 +ACTGATCTGTCGCGCT +>bc_40 +CACTAGCTCTGACTAC +>bc_41 +GCTGTCATGTACTAGC +>bc_42 +TATACATACACGCACT +>bc_43 +TGTGACGACGCGTCTC +>bc_44 +GACGTGAGCATGCACT +>bc_45 +CTCGATACGTGTAGCT +>bc_46 +GTGTCTAGACAGCTGT +>bc_47 +GATGCATGCGTACGCA +>bc_48 +TATCAGAGCAGCGATG +>bc_49 +TCATATGTAGTACTCT +>bc_50 +GCGATCTATGCACACG +>bc_51 +TGCAGTCGAGATACAT +>bc_52 +GACTCTGCGTCGAGTC +>bc_53 
+TACAGCGACGTCATCG +>bc_54 +GCGCAGACTACGTGTG +>bc_55 +GTCTCTGCGATACAGC +>bc_56 +AGTATGAGATAGCTCG +>bc_57 +GCGACGAGTACTCATG +>bc_58 +AGTATCACAGTCGCTG +>bc_59 +ATCATATGATGCGACA +>bc_60 +AGACGTAGATCACAGC +>bc_61 +CGTGTCATGCTACTCA +>bc_62 +TGTGAGACTGCATGTC +>bc_63 +GCTCAGTGCGCTACTG +>bc_64 +ACTATCGCGCACGCAG +>bc_65 +TGACACTCTGCACGCG +>bc_66 +CAGACGTGACTGATAT +>bc_67 +GCACTGTAGTGATCGT +>bc_68 +CAGTGCGAGACAGTAG +>bc_69 +AGTAGTGCTACTCGAC +>bc_70 +ATGCGAGATCTGCTCA +>bc_71 +TGAGACATACTGAGTG +>bc_72 +ATGTGCACTAGTGTAC +>bc_73 +TCAGCTGACGATGTGA +>bc_74 +ACTGATGCGCACATGT +>bc_75 +CTACTCTCAGCAGTGA +>bc_76 +ATCTACATCACGACTC +>bc_77 +ATATAGTACAGCGTCT +>bc_78 +GACACGACTAGATCGC +>bc_79 +TACGAGTCTGTCATAC +>bc_80 +ACTCAGCTACATAGTG +>bc_81 +ACGTATCATAGTGAGA +>bc_82 +GAGTCGTATCGCTCAT +>bc_83 +GCGATCACGAGTAGAC +>bc_84 +CTAGACGTACATGTCG +>bc_85 +TAGCAGTCACTGTGCG +>bc_86 +GCTCATGCGATAGCTA +>bc_87 +GCGCAGTCGTCTGTAT +>bc_88 +ATGAGCTACGTACAGA +>bc_89 +GTCGCGAGTCTATCAG +>bc_90 +ACATCGATCTGCACTA +>bc_91 +AGTATAGCATAGACGC +>bc_92 +GTGAGAGCGTGACTCT +>bc_93 +TGTCAGTAGATGACTC +>bc_94 +TCGTACGAGATCGACA +>bc_95 +CTACATGTGACTCGAG +>bc_96 +GCGCTATAGTGCTCGT diff --git a/etc/pacbio_barcodes_paired.fasta b/etc/pacbio_barcodes_paired.fasta new file mode 100755 index 0000000..1ba2e7a --- /dev/null +++ b/etc/pacbio_barcodes_paired.fasta @@ -0,0 +1,192 @@ +>F_1 +GGTAGGCGCTCTGTGTGCAGC +>R_1 +AGAGTACTACATATGAGATGG +>F_2 +GGTAGTCATGAGTCGACACTA +>R_2 +CGTGTGCATAGATCGCGATGG +>F_3 +GGTAGTATCTATCGTATACGC +>R_3 +ATGTATCTCGACTGCAGATGG +>F_4 +GGTAGATCACACTGCATCTGA +>R_4 +GACTCGACGCAGAGTCGATGG +>F_5 +GGTAGACGTACGCTCGTCATA +>R_5 +CGATGACGTCGCTGTAGATGG +>F_6 +GGTAGTGTGAGTCAGTACGCG +>R_6 +CACACGTAGTCTGCGCGATGG +>F_7 +GGTAGAGAGACACGATACTCA +>R_7 +GCTGTATCGCAGAGACGATGG +>F_8 +GGTAGCTGCTAGAGTCTACAG +>R_8 +CGAGCTATCTCATACTGATGG +>F_9 +GGTAGAGCACTCGCGTCAGTG +>R_9 +CATGAGTACTCGTCGCGATGG +>F_10 +GGTAGTCATGCACGTCTCGCT +>R_10 +CAGCGACTGTGATACTGATGG +>F_11 +GGTAGAGAGCATCTCTGTACT +>R_11 +TGTCGCATCATATGATGATGG +>F_12 
+GGTAGCGCATCGACTACGCTA +>R_12 +GCTGTGATCTACGTCTGATGG +>F_13 +GGTAGCGTAGCGTGCTATCAC +>R_13 +TGAGTAGCATGACACGGATGG +>F_14 +GGTAGATGCTGATGACTGCGA +>R_14 +GACATGCAGTCTCACAGATGG +>F_15 +GGTAGTGCGTGAGCTGTACAT +>R_15 +CAGTAGCGCACTGAGCGATGG +>F_16 +GGTAGCGATCATCTATAGACA +>R_16 +CTGCGTGCGCGATAGTGATGG +>F_17 +GGTAGCGACGTATCTGACAGT +>R_17 +CGCGTGCAGAGTGTCAGATGG +>F_18 +GGTAGCACGTCACTAGAGCGA +>R_18 +ATATCAGTCACGTCTGGATGG +>F_19 +GGTAGTGTCGCAGCTACTAGT +>R_19 +ACGATCACTACAGTGCGATGG +>F_20 +GGTAGCATACGCTGTGTAGCA +>R_20 +CTACTGTCTCGCACTGGATGG +>F_21 +GGTAGAGTCGCATGACTGTGT +>R_21 +GTCGAGTAGCACTACTGATGG +>F_22 +GGTAGCAGTACTGCACGATCG +>R_22 +TGAGCAGATCTCGCATGATGG +>F_23 +GGTAGGTGCTGAGCATCAGAC +>R_23 +CACTCAGTATGTCTCAGATGG +>F_24 +GGTAGCACTGATCGATATGCA +>R_24 +GTACACTAGTGCACATGATGG +>F_25 +GGTAGTACAGTGTCTGCTGCG +>R_25 +TCACATCGTCAGCTGAGATGG +>F_26 +GGTAGTACAGATAGTGTAGCG +>R_26 +ACATGTGCGCATCAGTGATGG +>F_27 +GGTAGTCGTAGAGCTCGAGAC +>R_27 +TCACTGCTGAGAGTAGGATGG +>F_28 +GGTAGGAGCTGCGCACTCGAT +>R_28 +GAGTCGTGATGTAGATGATGG +>F_29 +GGTAGGCGATGTCGCTATGTG +>R_29 +AGACGCTGTACTATATGATGG +>F_30 +GGTAGCGAGAGTCAGCGCATA +>R_30 +GCGATCTAGTCGTGTCGATGG +>F_31 +GGTAGTCACGATGAGCACGTA +>R_31 +GTATGACAGACTCGTAGATGG +>F_32 +GGTAGGACTGAGATCATGATC +>R_32 +CACTATGTAGCTGAGTGATGG +>F_33 +GGTAGACGACATGATACTGCT +>R_33 +TCTCACTATGATACGTGATGG +>F_34 +GGTAGATACAGCACAGATGTG +>R_34 +ATGAGCGATACGACTCGATGG +>F_35 +GGTAGACAGTCGATATCTCTC +>R_35 +GTCTACTCGTGATCGCGATGG +>F_36 +GGTAGGCTCGATCACATGACG +>R_36 +CGACATGTACGTCTAGGATGG +>F_37 +GGTAGGTCGTACACGTGCGAC +>R_37 +CGCACAGTGACTGCTAGATGG +>F_38 +GGTAGACTCATATCTAGAGTG +>R_38 +TAGCTATCGCATGAGCGATGG +>F_39 +GGTAGACTGATCTGTCGCGCT +>R_39 +ATACAGACGACTGCGCGATGG +>F_40 +GGTAGCACTAGCTCTGACTAC +>R_40 +TCTGTACGTAGCTCATGATGG +>F_41 +GGTAGGCTGTCATGTACTAGC +>R_41 +CTGATAGACTCGCGACGATGG +>F_42 +GGTAGTATACATACACGCACT +>R_42 +TAGTGCAGATCGATGTGATGG +>F_43 +GGTAGTGTGACGACGCGTCTC +>R_43 +GCGTCTATGCTATACTGATGG +>F_44 +GGTAGGACGTGAGCATGCACT +>R_44 +AGAGTCACGCTCTCACGATGG +>F_45 
+GGTAGCTCGATACGTGTAGCT +>R_45 +GAGTCATCTACTGACAGATGG +>F_46 +GGTAGGTGTCTAGACAGCTGT +>R_46 +TGTCGATCTCGTACGAGATGG +>F_47 +GGTAGGATGCATGCGTACGCA +>R_47 +CTCGAGTCACATGTAGGATGG +>F_48 +GGTAGTATCAGAGCAGCGATG +>R_48 +ACGAGCACTATAGCGCGATGG diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..5a017e3 --- /dev/null +++ b/setup.py @@ -0,0 +1,37 @@ +from setuptools import setup, Extension, find_packages +import os +import sys + +vFile = 'src/python/pbbarcode/_version.py' + +if os.path.exists(vFile): + lines = open(vFile, 'r').read().splitlines() + for line in lines: + elts = line.split('=') + elts = [e.strip() for e in elts] + if len(elts) == 2 and elts[0] == '__version__': + _ReadVersion = elts[1].replace('\'', '').replace('\"', '') + break +else: + _ReadVersion = '0.0.0' + +setup( + name = 'pbbarcode', + version=_ReadVersion, + author='pbiDevNet', + author_email='[email protected]', + license='LICENSE.txt', + packages = find_packages('src/python'), + package_dir = {'':'src/python'}, + ext_modules=[Extension('pbbarcode/sw', ['src/C/sw.c'], extra_compile_args=["-O3","-shared"])], + zip_safe = False, + entry_points={ + 'console_scripts': [ + 'pbbarcode = pbbarcode.main:main'] + }, + install_requires=[ + 'pbcore >= 0.6.3', + 'numpy >= 1.6.0', + 'h5py >= 1.3.0' + ] + ) diff --git a/src/C/Makefile b/src/C/Makefile new file mode 100644 index 0000000..4913cf3 --- /dev/null +++ b/src/C/Makefile @@ -0,0 +1,11 @@ +.PHONY: clean all +SHELL = /bin/bash -e + +all: build/sw.so + +build/sw.so: sw.c + mkdir -p ./build;\ + gcc -O4 -DGETPROB -shared -fPIC sw.c -o build/sw.so +clean: + rm -rf build + diff --git a/src/C/sw.c b/src/C/sw.c new file mode 100644 index 0000000..4cdffdb --- /dev/null +++ b/src/C/sw.c @@ -0,0 +1,56 @@ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#define M 64 +#define N 64 +#define MAX(x,y) (((x) > (y)) ? 
(x) : (y)) + +int* allocate_dp_mat() { + return (int*) calloc(N*M, sizeof(int)); +} + +int compute_align_score(int* dp_mat, char* tSeq, char* qSeq) { + int ipenalty = -1; + int dpenalty = -1; + int match = 2; + int mpenalty = -2; + int best_score = 0; + int iscore = 0; + int dscore = 0; + int mscore = 0; + int i,j; + + memset(dp_mat, 0, M*N*sizeof(int)); + + for (i = 1; i < strlen(tSeq) + 1; i++) { + for (j = 1; j < strlen(qSeq) + 1; j++) { + iscore = dp_mat[i*M + j-1] + ipenalty; + dscore = dp_mat[(i-1)*M + j] + dpenalty; + mscore = dp_mat[(i-1)*M + j-1] + ((tSeq[i-1] == qSeq[j-1]) ? match : mpenalty); + dp_mat[i*M + j] = MAX(MAX(0, iscore), MAX(dscore, mscore)); + if (dp_mat[i*M + j] >= best_score) + best_score = dp_mat[i*M + j]; + } + } + return best_score; +} + +void compute_align_scores(int* scores, int n, int* dp_mat, char* tSeq, + char** qSeqs) { + int i = 0; + for (i; i < n; i++) { + scores[i] = compute_align_score(dp_mat, tSeq, qSeqs[i]); + } +} + + +void print_dp_mat(int* dp_mat, char* tSeq, char* qSeq) { + int i,j; + for (j = 0; j < strlen(qSeq) + 1; j++) { + for (i = 0; i < strlen(tSeq) + 1; i++) { + printf("%d ", dp_mat[i*M + j]); + } + printf("\n"); + } +} diff --git a/src/python/pbbarcode/BarcodeLabeler.py b/src/python/pbbarcode/BarcodeLabeler.py new file mode 100755 index 0000000..354338b --- /dev/null +++ b/src/python/pbbarcode/BarcodeLabeler.py @@ -0,0 +1,225 @@ +#################################################################################$$ +# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * Neither the name of Pacific Biosciences nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#################################################################################$$ +import logging + +from pbcore.io import BasH5Reader, BaxH5Reader +from pbcore.io.FastaIO import * +import pbbarcode.SWaligner as Aligner +import numpy as n + +from pbcore.io.BarcodeH5Reader import LabeledZmw, \ + BARCODE_DELIMITER + +__RC_MAP__ = dict(zip('ACGTacgt-N','TGCAtgca-N')) + +class BarcodeScorer(object): + def __init__(self, basH5, barcodeFasta, + adapterSidePad = 0, insertSidePad = 4, + scoreMode = 'symmetric', maxHits = 10, + scoreFirst = False, startTimeCutoff = 1): + """A BarcodeScorer object scores ZMWs and produces summaries + of the scores. 
Various parameters control the behavior of the + object, specifically the padding allows the user to add a + little extra on each side of the adapter find for safety. The + most relevant parameter is the scoreMode which dictates how + the barcodes are scored, either paired or symmetric.""" + + self.basH5 = basH5 + self.barcodeFasta = list(barcodeFasta) + self.aligner = Aligner.SWaligner() + self.barcodeLength = n.unique(map(lambda x : len(x.sequence), + self.barcodeFasta)) + if len(self.barcodeLength) > 1: + raise Exception("Currently, all barcodes must be the same length.") + else: + self.barcodeLength = int(self.barcodeLength) + + self.barcodeSeqs = [(barcode.sequence.upper(), + self._rc(barcode.sequence.upper())) + for barcode in self.barcodeFasta] + + self.adapterSidePad = adapterSidePad + self.insertSidePad = insertSidePad + self.maxHits = maxHits + + if scoreMode not in ['symmetric', 'paired']: + raise Exception("scoreMode must either be symmetric or paired") + self._scoreMode = scoreMode + + self.scoreFirst = scoreFirst + self.startTimeCutoff = startTimeCutoff + + self.forwardScorer = self.aligner.makeScorer([x[0] for x in self.barcodeSeqs]) + self.reverseScorer = self.aligner.makeScorer([x[1] for x in self.barcodeSeqs]) + + logging.debug(("Constructed BarcodeScorer with scoreMode: %s," + \ + "adapterSidePad: %d, insertSidePad: %d, and scoreFirst: %r") \ + % (scoreMode, adapterSidePad, insertSidePad, scoreFirst)) + + @property + def movieName(self): + return self.basH5.movieName + + def makeBCLabel(self, s1, s2): + return BARCODE_DELIMITER.join((s1, s2)) + + @property + def barcodeLabels(self): + """The barcode labels are function of the barcodeNames and the + scoreMode, they represent the user-visible names.""" + if self.scoreMode == 'paired': + return n.array([self.makeBCLabel(self.barcodeFasta[i].name, + self.barcodeFasta[i+1].name) for i + in xrange(0, len(self.barcodeSeqs), 2)]) + else: + return n.array([self.makeBCLabel(x.name, x.name) for x in 
self.barcodeFasta]) + + @property + def barcodeNames(self): + """The barcode names are the FASTA names""" + return n.array([x.name for x in self.barcodeFasta]) + + @property + def scoreMode(self): + return self._scoreMode + + def _rc(self, s): + return "".join([__RC_MAP__[c] for c in s[::-1]]) + + def _flankingSeqs(self, zmw): + def fromRange(rStart, rEnd): + try: + qSeqLeft = zmw.read(rStart - (self.barcodeLength + self.insertSidePad), + rStart + self.adapterSidePad).basecalls() + except IndexError: + qSeqLeft = None + try: + qSeqRight = zmw.read(rEnd - self.adapterSidePad, + rEnd + self.barcodeLength + + self.insertSidePad).basecalls() + except IndexError: + qSeqRight = None + + return (qSeqLeft, qSeqRight) + + adapterRegions = zmw.adapterRegions + if len(adapterRegions) > self.maxHits: + adapterRegions = adapterRegions[0:self.maxHits] + + seqs = [fromRange(start, end) for (start, end) in adapterRegions] + + # We only score the first barcode if we don't find any adapters + # *and* the start time is less than the threshold. + scoredFirst = False + if self.scoreFirst and not len(seqs): + s = zmw.zmwMetric('HQRegionStartTime') + e = zmw.zmwMetric('HQRegionEndTime') + # s<e => has HQ. 
+ if s < e and s <= self.startTimeCutoff: + l = self.barcodeLength + self.insertSidePad + l = l if zmw.hqRegion[1] > l else zmw.hqRegion[1] + try: + bc = zmw.read(0, l).basecalls() + if len(bc) >= self.barcodeLength: + seqs.insert(0, (bc, None)) + scoredFirst = True + except IndexError: + pass + + return (seqs, scoredFirst) + + def labelZmws(self, holeNumbers): + """Return a list of LabeledZmws for input holeNumbers""" + def scoreZmw(zmw): + adapters, scoredFirst = self._flankingSeqs(zmw) + adapterScores = [[]]*len(adapters) + barcodeScores = n.zeros(len(self.barcodeSeqs)) + + for i,adapter in enumerate(adapters): + fscores = self.forwardScorer(adapter[0]) + rscores = self.reverseScorer(adapter[0]) + ffscores = self.forwardScorer(adapter[1]) + rrscores = self.reverseScorer(adapter[1]) + + scored = 2.0 if adapter[0] and adapter[1] else \ + 1.0 if adapter[0] or adapter[1] else 0 + + # An adapter score is the average barcode score for + # each barcode -- that way, you can compare across + # adapters even if the different adapters have + # different numbers of flanking sequence. + if scored == 0: + adapterScores[i] = barcodeScores + else: + adapterScores[i] = n.maximum((fscores + rrscores)/scored, + (rscores + ffscores)/scored) + + barcodeScores = reduce(lambda x, y: x + y, adapterScores) if adapterScores \ + else n.zeros(len(self.barcodeSeqs)) + + return (zmw.holeNumber, len(adapters), barcodeScores, adapterScores, + scoredFirst) + + # o here is the record immediately above. + def chooseSymmetric(o): + p = n.argsort(-o[2]) + return LabeledZmw(o[0], o[1], p[0], o[2][p[0]], p[1], o[2][p[1]], o[3]) + def choosePaired(o): + if o[1] == 1: + s = n.array([max(o[2][i], o[2][i + 1]) for i in \ + xrange(0, len(self.barcodeSeqs), 2)]) + p = n.argsort(-s) + s = s[p] + else: + # score the pairs by scoring the two alternate + # ways they could have been put on the molecule. A + # missed adapter will confuse this computation. 
+ scores = o[3] + results = n.zeros(len(self.barcodeSeqs)/2) + for i in xrange(0, len(self.barcodeSeqs), 2): + pths = [0,0] + for j in xrange(0, len(scores)): + pths[j % 2] += scores[j][i] + pths[1 - j % 2] += scores[j][i + 1] + results[i/2] = max(pths) + + p = n.argsort(-results) + s = results[p] + + return LabeledZmw(o[0], o[1], p[0], s[0], p[1], s[1], o[3]) + + if self.scoreMode == 'symmetric': + choose = chooseSymmetric + elif self.scoreMode == 'paired': + choose = choosePaired + else: + raise Exception("Unsupported scoring mode in BarcodeLabeler.py") + + scored = [scoreZmw(self.basH5[zmw]) for zmw in holeNumbers] + return [choose(scoreTup) for scoreTup in scored if scoreTup[1]] diff --git a/src/python/pbbarcode/SWaligner.py b/src/python/pbbarcode/SWaligner.py new file mode 100755 index 0000000..d6dae46 --- /dev/null +++ b/src/python/pbbarcode/SWaligner.py @@ -0,0 +1,69 @@ +#################################################################################$$ +# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * Neither the name of Pacific Biosciences nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#################################################################################$$ +from ctypes import * +import os +import numpy +import pkg_resources + +class SWaligner(object): + def __init__(self): + # setup.py should put sw.so in the following path. 
+ self.SW_DLL_PATH = os.path.dirname(os.path.abspath(__file__)) + os.path.sep + "sw.so" + self._dll = CDLL(self.SW_DLL_PATH) + self.dpMat = self._dll.allocate_dp_mat() + + def score(self, tSeq, qSeq): + return self._dll.compute_align_score(self.dpMat, tSeq, qSeq) + + def makeScorer(self, targets): + ScoreType = c_int * len(targets) + scores = ScoreType() + for i in range(0, len(scores)): + scores[i] = 0 + + TargetType = c_char_p * len(targets) + targetSeqs = TargetType() + for i in range(0, len(targetSeqs)): + targetSeqs[i] = targets[i] + + targetLen = len(targets) + + def scorer(query): + if not query: + return numpy.zeros(len(targets)) + + self._dll.compute_align_scores(scores, + targetLen, + self.dpMat, + query, + targetSeqs) + return numpy.array([scores[i] for i in xrange(0, len(scores))]) + return scorer + + diff --git a/src/python/pbbarcode/__init__.py b/src/python/pbbarcode/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/src/python/pbbarcode/_version.py b/src/python/pbbarcode/_version.py new file mode 100755 index 0000000..4f0196d --- /dev/null +++ b/src/python/pbbarcode/_version.py @@ -0,0 +1 @@ +__version__='0.8.0' diff --git a/src/python/pbbarcode/main.py b/src/python/pbbarcode/main.py new file mode 100755 index 0000000..0b6ddd0 --- /dev/null +++ b/src/python/pbbarcode/main.py @@ -0,0 +1,751 @@ +#!/usr/bin/env python +#################################################################################$$ +# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * Neither the name of Pacific Biosciences nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#################################################################################$$ +import os +import sys +import argparse +import logging +import tempfile +import shutil +import pkg_resources +import re +import subprocess +import random +import shutil + +from multiprocessing import Pool + +import h5py as h5 +import numpy as n + +from pbcore.util.ToolRunner import PBMultiToolRunner +from pbcore.io import BaxH5Reader, BasH5Reader +from pbcore.io import CmpH5Reader, CmpH5Alignment +from pbcore.io.BarcodeH5Reader import * +from pbcore.io import FastaReader, FastqWriter, FastqRecord, \ + FastaWriter, FastaRecord + +from pbbarcode.BarcodeLabeler import * +from pbbarcode._version import __version__ + +from pbh5tools.CmpH5Utils import copyAttributes + +# Paths to the Barcode Datasets in the cmp.h5 file. +BC_ALN_INFO_DS = "AlnInfo/Barcode" +BC_INFO_NAME = "BarcodeInfo/Name" +BC_INFO_ID = "BarcodeInfo/ID" + +SCORE_MODES = ['symmetric', 'paired'] + +BAS_PLS_REGEX = r'\.ba[x|s]\.h5$|\.pl[x|s]\.h5$|\.cc[x|s]\.h5$' +BARCODE_EXT = '.bc.h5' +BC_REGEX = r'\.bc\.h5' + +def movieNameFromFile(fn): + return re.sub('|'.join((BC_REGEX, BAS_PLS_REGEX)) , '', + os.path.basename(fn)) + +def makeBarcodeH5FromBasH5(basH5): + """The workhorse function for creating a barcode H5 file from a + base H5 file.""" + labeler = BarcodeScorer(basH5, FastaReader(runner.args.barcodeFile), + runner.args.adapterSidePad, runner.args.insertSidePad, + scoreMode = runner.args.scoreMode, + maxHits = runner.args.maxAdapters, + scoreFirst = runner.args.scoreFirst, + startTimeCutoff = runner.args.startTimeCutoff) + if runner.args.nZmws < 0: + zmws = basH5.sequencingZmws + else: + zmws = basH5.sequencingZmws[0:runner.args.nZmws] + + logging.debug("Labeling %d ZMWs from: %s" % (len(zmws), basH5.filename)) + labeledZmws = labeler.labelZmws(zmws) + logging.debug("Labeled %d ZMWs" % len(labeledZmws)) + + outBase = re.sub(BAS_PLS_REGEX, BARCODE_EXT, + os.path.basename(basH5.filename)) + outFile = 
'/'.join((runner.args.outDir, outBase)) + logging.debug("Writing to: %s" % outFile) + + writeBarcodeH5(labeledZmws, labeler, outFile, + runner.args.saveExtendedInfo) + return outFile + +def mpWrapper(f): + return makeBarcodeH5FromBasH5(BasH5Reader(f)) + +def makeBarcodeFofnFromBasFofn(): + inputFofn = runner.args.inputFile + inFiles = open(inputFofn).read().splitlines() + + if not all(map(os.path.exists, inFiles)): + raise IOError("All files in input.fofn must exist.") + + logging.debug("Using %d processes." % runner.args.nProcs) + if runner.args.nProcs <= 1: + newFiles = map(mpWrapper, inFiles) + else: + pool = Pool(runner.args.nProcs) + newFiles = pool.map(mpWrapper, inFiles) + + oFile = open(runner.args.outFofn, 'w') + for nF in newFiles: + oFile.write(nF + "\n") + oFile.close() + +def labelAlignments(): + logging.info("Labeling alignments using: %s" % runner.args.inputFofn) + bcFofn = BarcodeH5Fofn(runner.args.inputFofn) + + with CmpH5Reader(runner.args.cmpH5) as cmpH5: + bcDS = n.zeros((len(cmpH5), 5), dtype = "int32") + + for (i, aln) in enumerate(cmpH5): + bcReader = bcFofn.readerForMovie(aln.movieInfo.Name) + try: + lZmw = bcReader.labeledZmwFromHoleNumber(aln.HoleNumber) + if lZmw.nScored < runner.args.minNumBarcodes or \ + lZmw.averageScore < runner.args.minAvgBarcodeScore or \ + lZmw.scoreRatio < runner.args.minScoreRatio: + lZmw = None + except KeyError: + lZmw = None + + if lZmw: + bcDS[i,:] = n.array([lZmw.nScored, lZmw.bestIdx, lZmw.bestScore, + lZmw.secondBestIdx, lZmw.secondBestScore]) + else: + # either no barcode was found for this guy or they got + # filtered, hence the NULL_BARCODE + bcDS[i,:] = n.array([0, + len(bcReader.barcodeLabels), 0, + len(bcReader.barcodeLabels), 0]) + + # write to the cmp.h5 file. 
+ H5 = h5.File(runner.args.cmpH5, 'r+') + if BC_INFO_ID in H5: + del H5[BC_INFO_ID] + if BC_INFO_NAME in H5: + del H5[BC_INFO_NAME] + + # we use the first one to get the labels, if somehow they + # don't have all of the same stuff that will be an issue. + bcLabels = n.concatenate((bcFofn.barcodeLabels, n.array([BARCODE_DELIMITER]))) + H5.create_dataset(BC_INFO_ID, data = n.array(range(0, len(bcLabels))), + dtype = 'int32') + H5.create_dataset(BC_INFO_NAME, data = bcLabels, dtype = h5.new_vlen(str)) + if BC_ALN_INFO_DS in H5: + del H5[BC_ALN_INFO_DS] + bcDS = H5.create_dataset(BC_ALN_INFO_DS, data = bcDS, dtype = 'int32') + bcDS.attrs['ColumnNames'] = n.array(['count', 'index1', 'score1', 'index2', + 'score2']) + #force BarcodeMode to have numpy dtype for CmpH5Sort 'extra datasets' routine + bcDS.attrs['BarcodeMode'] = n.array( bcFofn.scoreMode ) + H5.close() + +def zipFofns(*inFofns): + """Take inputFofns and return n tuples of length len(inFofns) + where n is the number of entries in each FOFN.""" + def readAndSort(inFile): + lines = n.array(open(inFile).read().splitlines()) + lines = lines[n.array(n.argsort([movieNameFromFile(fofnLine) for + fofnLine in lines]))] + return lines + + sortedFofns = [readAndSort(inFofn) for inFofn in inFofns] + l = map(len, sortedFofns) + if len(n.unique(l)) != 1: + raise Exception("Fofns don't match, unequal number of inputs.") + else: + for i in xrange(0, n.unique(l)): + if len(n.unique([movieNameFromFile(sortedFofn[i]) for + sortedFofn in sortedFofns])) != 1: + raise Exception("Fofn elements don't match, movies differ.") + + # need to un-arrayify these guys + return zip(*map(list, sortedFofns)) + +def filterZmws(zmwsForBCs): + """Apply various filterings passed by the user. 
There are somewhat + different semantics for CCS filtering and subread filtering in + terms of the raw primary metrics available, e.g., + HQRegionStartTime is unavailable for the CCS data and somewhat + irrelevant.""" + def getHQStart(zmw): + try: + return zmw.zmwMetric('HQRegionStartTime') + except: + return 0 + + def getReadScore(zmw): + return zmw.zmwMetric("ReadScore") + + def molLenGuess(zmw): + if zmw.baxH5.hasRawBasecalls: + return max(map(len, zmw.subreads)) if zmw.subreads else 0 + else: + return len(zmw.ccsRead) if zmw.ccsRead else 0 + + def zmwFilterFx(tup): + zmw, lZmw = tup + + mlGuess = molLenGuess(zmw) + if not mlGuess: + return False + + avgScore = lZmw.averageScore + numScored = lZmw.nScored + scoreRatio = lZmw.scoreRatio + hqStart = getHQStart(zmw) + readScore = getReadScore(zmw) + + ## XXX : still need to detect the chimeras + if mlGuess < runner.args.minMaxInsertLength or \ + hqStart > runner.args.hqStartTime or \ + readScore < runner.args.minReadScore or \ + avgScore < runner.args.minAvgBarcodeScore or \ + numScored < runner.args.minNumBarcodes or \ + scoreRatio < runner.args.minScoreRatio: + return False + else: + return True + + return { k:filter(zmwFilterFx, v) for k,v in zmwsForBCs.items() } + +def _warnOnce(): + var = [] + def warnOnce(msg): + if not var: + logging.warn(msg) + var.append(1) + return warnOnce +warnOnce = _warnOnce() + +def getFastqRecords(zmw, lZmw = None): + if zmw.baxH5.hasRawBasecalls and zmw.baxH5.hasConsensusBasecalls: + # Only examine this parameter when passed both. 
+ if runner.args.subreads: + reads = zmw.subreads + else: + reads = [zmw.ccsRead] + elif zmw.baxH5.hasRawBasecalls: + if runner.args.subreads: + warnOnce("`subreads` argument is ignored when using >= 2.1" + + "bas.h5 data as input.") + reads = zmw.subreads + else: + if runner.args.subreads: + warnOnce("`subreads` argument is ignored when using >= 2.1" + + "ccs.h5 data as input.") + reads = [zmw.ccsRead] + + extra = (" %g %g" % (round(zmw.zmwMetric("ReadScore"), 2), + round(lZmw.averageScore, 2))) if lZmw else "" + + return [FastqRecord(read.readName + extra, + read.basecalls(), + read.QualityValue()) for read in reads if read] + +def getFastqs(): + zmwsByBarcode = getZmwsForBarcodes() + logging.debug("Pre-filter: Average number of ZMWs per barcode: %d" % + n.mean([len(zmwsByBarcode[k]) for k in zmwsByBarcode.keys()])) + + zmwsByBarcode = filterZmws(zmwsByBarcode) + logging.debug("Post-filter: Average number of ZMWs per barcode: %d" % + n.mean([len(zmwsByBarcode[k]) for k in zmwsByBarcode.keys()])) + + def getReadData(zmws): + recs = [getFastqRecords(zmw,lZmw) for zmw,lZmw in zmws] + recs = filter(lambda x : x, recs) + return [elt for sublst in recs for elt in sublst] + + return {k:getReadData(zmws) for k, zmws in zmwsByBarcode.iteritems()} + +def emitFastqs(): + outFiles = getFastqs() + outDir = runner.args.outDir + fasta = runner.args.fasta + + if runner.args.unlabeledZmws: + outFiles['UNLABELED'] = getUnlabeledZmws() + + if not os.path.exists(runner.args.outDir): + os.makedirs(runner.args.outDir) + + if fasta: + writer = FastaWriter + def record(n, s, qv): + return FastaRecord(n, s) + else: + writer = FastqWriter + record = FastqRecord + + l = 'a' if runner.args.fasta else 'q' + for k in outFiles.keys(): + if outFiles[k]: + with writer("%s/%s.fast%s" % (runner.args.outDir, k, l)) as w: + for e in outFiles[k]: + tlen = len(e.sequence)-runner.args.trim + r = record(e.name, e.sequence[runner.args.trim:tlen], + e.quality[runner.args.trim:tlen]) + if r: + 
w.writeRecord(r) + +def getUnlabeledZmws(): + """Return FASTQ records for ZMWs which do not have a barcode label""" + unlabeledZmws = [] + + for basFile, barcodeFile in zipFofns(runner.args.inputFofn, + runner.args.barcodeFofn): + basH5 = BasH5Reader(basFile) + bcH5 = BarcodeH5Reader(barcodeFile) + sdiff = basH5.sequencingZmws[~n.in1d(basH5.sequencingZmws, + bcH5.labeledZmws.keys())] + for hn in sdiff: + unlabeledZmws.append(basH5[hn]) + + return reduce(lambda x,y : x+y, [getFastqRecords(unlabeledZmw) for + unlabeledZmw in unlabeledZmws]) + +def getZmwsForBarcodes(labels = None): + """dictionary of pbcore.io.Zmw and LabeledZmw indexed by barcode + label""" + zmwsForBCs = {} + for basFile, barcodeFile in zipFofns(runner.args.inputFofn, + runner.args.barcodeFofn): + basH5 = BasH5Reader(basFile) + bcH5 = BarcodeH5Reader(barcodeFile) + allLabs = bcH5.barcodeLabels + if labels: + allLabs = [x for x in allLabs if x in labels] + logging.info("Processing only: %s" % ",".join(allLabs)) + for label in allLabs: + lZmws = bcH5.labeledZmwsFromBarcodeLabel(label) + for lZmw in lZmws: + zmw = basH5[lZmw.holeNumber] + if not label in zmwsForBCs.keys(): + zmwsForBCs[label] = [] + zmwsForBCs[label].append((zmw, lZmw)) + + return zmwsForBCs + +def gconFunc(tp): + # called bcause multiprocess + rootDir, barcode = tp + bcdir = "/".join((rootDir, barcode)) + + ## call gcon + logging.info("In gconFunc for: %s" % barcode) + + cmd = "gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" % \ + (bcdir, bcdir, bcdir) + subprocess.call(cmd, shell = True) + + ## check to see if the file is empty + r = FastaReader("%s/g_consensus.fa" % bcdir) + + if not list(r)[0].sequence: + return None + + ## check to see if we are going to run quiver + if not runner.args.noQuiver: + # setup the blasr / sam / quiver stuff. 
+ logging.info("Setup regions file, now running blasr through quiver.") + + cmd = ('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out ' + \ + '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir) + logging.debug(cmd) + subprocess.call(cmd, shell = True) + + cmd = 'samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' % \ + (bcdir, bcdir, bcdir) + logging.debug(cmd) + subprocess.call(cmd, shell = True) + + cmd = ('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics ' + \ + 'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,' + \ + 'SubstitutionQV') % (runner.args.inputFofn, bcdir) + logging.debug(cmd) + subprocess.call(cmd, shell = True) + + cmd = 'cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir + logging.debug(cmd) + subprocess.call(cmd, shell = True) + + cmd = ('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel ' \ + '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta ' + \ + '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir) + logging.debug(cmd) + subprocess.call(cmd, shell = True) + cFilename = 'q_consensus.fasta' + else: + cFilename = 'g_consensus.fa' + + ## append results to output file. 
+ bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename) + if os.path.exists(bcCons): + return FastaRecord(barcode, list(FastaReader(bcCons))[0].sequence) + else: + return None + +def subsampleReads(e): + logging.debug("starting with %d zmws" % len(e)) + if runner.args.nZmws > 0: + k = runner.args.nZmws if runner.args.nZmws < len(e) else len(e) + elif runner.args.subsample < 1: + k = int(len(e)*runner.args.subsample) + else: + k = len(e) + i = n.array(random.sample(range(0, len(e)), k), dtype = int) + logging.debug("subsampled down to: %d" % len(i)) + return [e[j] for j in i] + +def callConsensus(): + def makeReadAndReads(zmwsForBC): + ccsData = filter(lambda x:x, [zmw.ccsRead for _,_,zmw in zmwsForBC if zmw]) + srData = reduce(lambda x,y : x+y, [zmw.subreads for zmw,_,_ in + zmwsForBC if zmw], []) + if not srData and not ccsData: + return (None,None) + + def getSeedRead(reads, lq = 80, uq = 90, + sLambda = lambda x : -x.zmw.readScore): + lens = map(len, reads) + candidateRange = (n.percentile(lens, lq), + n.percentile(lens, uq)) + pfReads = [read for read,l in zip(reads, lens) if + l >= candidateRange[0] and l <= candidateRange[1]] + pfReads.sort(key = sLambda) + return pfReads[0] if len(pfReads) else None + + if ccsData: + ## all CCS reads should be the *same* length for an + ## amplicon. Let's take the middle ones + seedRead = getSeedRead(ccsData, lq = 30, uq = 70, + sLambda = lambda x: -x.zmw.numPasses) + if not seedRead: + seedRead = getSeedRead(srData) + logging.info("Unable to use a CCS read for the seed read.") + else: + logging.info("Using a CCS read for the seed read.") + else: + logging.info("Using a raw read for the seed read") + seedRead = getSeedRead(srData) + + return (seedRead, srData) + + # check to make sure that you have the necessary dependencies, + # i.e., hgap script, blasr, etc. 
+ try: + import pbtools.pbdagcon + except ImportError: + raise ImportError("Unable to find dependency `pbdagcon` - please install.") + + # retrieve ZMWs by barcode + if runner.args.barcode: + zmwsForBCs = getZmwsForBarcodes(runner.args.barcode) + else: + zmwsForBCs = getZmwsForBarcodes() + + # subsample + zmwsForBCs = {k:subsampleReads(v) for k,v in zmwsForBCs.items()} + + logging.info("unfiltered average zmws per barcode: %g" % + n.round(n.mean(map(len, zmwsForBCs.values())))) + + # filter ZMWs + zmwsForBCs = filterZmws(zmwsForBCs) + + logging.info("filtered average zmws per barcode: %g" % + n.round(n.mean(map(len, zmwsForBCs.values())))) + + # now choose the best subread to seed the assembly + if runner.args.ccsFofn: + # XXX: This part depends on the filenames of the ccs and input + # fofns, this is essentially a workaround to the fact the the + # part isn't part of the API + ccsReaders = {movieNameFromFile(l):BasH5Reader(l) for l in + open(runner.args.ccsFofn).read().splitlines()} + + # fill in the CCS spot. + for k,v in zmwsForBCs.items(): + l = [] + for zmw,lZmw in v: + r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)] + l.append((zmw,lZmw,r[zmw.holeNumber])) + zmwsForBCs[k] = l + else: + # add none to the CCS spot. + zmwsForBCs = {k:[(zmw,lZmw,None) for zmw,lZmw in v] + for k,v in zmwsForBCs.iteritems()} + + readAndReads = { k:makeReadAndReads(v) for k,v in zmwsForBCs.items() } + + # remove barcodes that don't have a seed read and a set of useable reads. 
+ readAndReads = { k:v for k,v in readAndReads.items() if v[0] and v[1] } + + # generate FASTA files + outDir = runner.args.outDir + + for barcode, reads in readAndReads.items(): + bcdir = '/'.join((outDir, barcode)) + if not os.path.exists(bcdir): + os.makedirs(bcdir) + + # emit the seeds to separte files + with FastaWriter("%s/seed_read.fasta" % bcdir) as w: + w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls())) + + subreads = reads[1] + + # emit the subreads to a single file + with FastaWriter("%s/subreads.fasta" % bcdir) as w: + for r in subreads: + w.writeRecord(FastaRecord(r.readName, r.basecalls())) + + # construct the region file by subsetting the ZMWs that you + # are interested in. + nfofn = [] + for inFof, in zipFofns(runner.args.inputFofn): + bh5 = BaxH5Reader(inFof) + reg = bh5.file['/PulseData/Regions'] + inMovie = filter(lambda z : z.baxH5.movieName == bh5.movieName, + subreads) + holes = n.in1d(reg[:,0], n.array([a.holeNumber for a in inMovie])) + if any(holes): + nreg = reg[holes,:] + else: + nreg = n.empty(shape = (0, reg.shape[1]), dtype = 'int32') + + fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof)) + nfile = h5.File(fname, 'w') + ndset = nfile.create_dataset('/PulseData/Regions', data = nreg, + maxshape = (None, None)) + copyAttributes(reg, ndset) + nfile.close() + nfofn.append(fname) + + ofile = open('%s/region.fofn' % bcdir, 'w') + ofile.writelines("\n".join(nfofn)) + ofile.close() + + ## call gcon + outDirs = [ (outDir, k) for k in readAndReads.keys() ] + if runner.args.nProcs == 1: + outFasta = filter(lambda z: z, map(gconFunc, outDirs)) + else: + pool = Pool(runner.args.nProcs) + outFasta = filter(lambda z : z, pool.map(gconFunc, outDirs)) + + ## write the results + with FastaWriter('/'.join((outDir, "consensus.fa"))) as w: + for r in outFasta: + w.writeRecord(r) + + ## optionally cleanup + if not runner.args.keepTmpDir: + for barcode, reads in readAndReads.items(): + bcdir = '/'.join((outDir, barcode)) + 
shutil.rmtree(bcdir) + + +class Pbbarcode(PBMultiToolRunner): + def __init__(self): + desc = ['Utilities for labeling and annoting reads with barcode information.'] + super(Pbbarcode, self).__init__('\n'.join(desc)) + subparsers = self.subParsers + + desc = ['Creates a barcode.h5 file from base h5 files.'] + parser_m = subparsers.add_parser('labelZmws', description = "\n".join(desc), + help = 'Label zmws with barcode annotation', + formatter_class = \ + argparse.ArgumentDefaultsHelpFormatter) + parser_m.add_argument('--outDir', + help = 'Where to write the newly created barcode.h5 files.', + default = os.getcwd()) + parser_m.add_argument('--outFofn', help = 'Write to outFofn', + default = 'barcode.fofn') + parser_m.add_argument('--adapterSidePad', help = 'Pad with adapterSidePad bases', + default = 4, type = int) + parser_m.add_argument('--insertSidePad', help = 'Pad with insertSidePad bases', + default = 4, type = int) + parser_m.add_argument('--scoreMode', + help = 'The mode in which the barcodes should be scored.', + choices = SCORE_MODES, default = 'symmetric', type = str) + parser_m.add_argument('--maxAdapters', type = int, default = 20, + help = 'Only score the first maxAdapters') + parser_m.add_argument('--scoreFirst', action = 'store_true', default = False, + help = 'Whether to try to score the leftmost barcode in a trace.') + parser_m.add_argument('--startTimeCutoff', + help = 'Reads must start before this value in order to be ' + \ + 'included when scoreFirst is set.', type = float, + default = 10.0) + parser_m.add_argument('--nZmws', type = int, default = -1, + help = 'Use the first n ZMWs for testing') + parser_m.add_argument('--nProcs', type = int, default = 8, + help = 'How many processes to use') + parser_m.add_argument('--saveExtendedInfo', action = 'store_true', default = False,\ + help = 'Whether to save extended information to' + \ + 'the barcode.h5 files; this information is useful for ' + \ + 'debugging and chimera detection') + 
parser_m.add_argument('barcodeFile', metavar = 'barcode.fasta', + help = 'Input barcode fasta file') + parser_m.add_argument('inputFile', metavar = 'input.fofn', + help = 'Input base fofn') + + def addFilteringOpts(parser, justBarcode = False): + ## These are independent of the barcode scoring + if not justBarcode: + parser.add_argument('--minMaxInsertLength', default = 0, type = int, + help = "ZMW Filter: exclude ZMW if the longest subread" + \ + "is less than this amount") + parser.add_argument('--hqStartTime', default = float("inf"), type = float, + help = "ZMW Filter: exclude ZMW if start time of HQ region" + \ + "greater than this value (seconds)") + parser.add_argument('--minReadScore', default = 0, type = float, + help = "ZMW Filter: exclude ZMW if readScore is less than" + \ + "this value") + + ## These obviously need the barcode score + parser.add_argument('--minAvgBarcodeScore', default = 0.0, type = float, + help = "ZMW Filter: exclude ZMW if average barcode score " + \ + "is less than this value") + parser.add_argument('--minNumBarcodes', default = 1, type = int, + help = "ZMW Filter: exclude ZMW if number of barcodes observed " + \ + "is less than this value") + parser.add_argument('--minScoreRatio', default = 1.0, type = float, + help = "ZMW Filter: exclude ZMWs whose best score divided by " + \ + "the 2nd best score is less than this ratio") + + # Not yet implemented + # parser.add_argument('--filterChimeras', default = False, action = 'store_true', + # help = "ZMW Filter: exclude ZMWs that appear to be chimeric") + + + desc = ['Adds information about barcode alignments to a cmp.h5 file', + 'from a previous call to "labelZmws".'] + parser_s = subparsers.add_parser('labelAlignments', description = "\n".join(desc), + help = "Label reads from a barcode or region h5 file", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + addFilteringOpts(parser_s, justBarcode = True) + parser_s.add_argument('inputFofn', metavar = 'barcode.fofn', + help = 
'input barcode fofn file') + parser_s.add_argument('cmpH5', metavar = 'aligned_reads.cmp.h5', + help = 'cmp.h5 file to add barcode labels') + + desc = ['Takes a bas.h5 fofn and a barcode.h5 fofn and produces', + 'a fast[a|q] file for each barcode.'] + parser_s = subparsers.add_parser('emitFastqs', description = "\n".join(desc), + help = "Write fastq files", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser_s.add_argument('--outDir', metavar = 'output.dir', + help = 'output directory to write fastq files', + default = os.getcwd()) + + parser_s.add_argument('--subreads', + help = 'whether to produce fastq files for the subreads;' + \ + 'the default is to use the CCS reads. This option only' + \ + 'applies when input.fofn has both consensus and raw reads,' + \ + 'otherwise the read type from input.fofn will be returned.', + action = 'store_true', + default = False) + parser_s.add_argument('--unlabeledZmws', + help = 'whether to emit a fastq file for the unlabeled ZMWs.' + \ + ' These are the ZMWs where no adapters are found typically', + action = 'store_true', + default = False) + + parser_s.add_argument('--trim', help = 'trim off barcodes and any excess constant sequence', + default = 20, type = int) + parser_s.add_argument('--fasta', help = ('whether the files produced should be FASTA files as' + + 'opposed to FASTQ'), + action = 'store_true', + default = False) + addFilteringOpts(parser_s) + parser_s.add_argument('inputFofn', metavar = 'input.fofn', + help = 'input base or CCS fofn file') + parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn', + help = 'input barcode.h5 fofn file') + + desc = ['Compute consensus sequences for each barcode.'] + parser_s = subparsers.add_parser('consensus', description = "\n".join(desc), + help = "Compute a consensus sequence for each barcode." 
+ \ + "This command relies on the presence of pbdagcon", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser_s.add_argument('--subsample', default = 1, type = float, + help = "Subsample ZMWs") + parser_s.add_argument('--nZmws', default = -1, type = int, + help = "Take n ZMWs") + parser_s.add_argument('--outDir', default = '.', type = str, + help = "Use this directory to output results") + parser_s.add_argument('--keepTmpDir', action = 'store_true', default = False) + parser_s.add_argument('--ccsFofn', default = '', type = str, + help = 'Obtain CCS data from ccsFofn instead of input.fofn') + parser_s.add_argument('--nProcs', default = 16, type = int, + help = 'Use nProcs to execute.') + parser_s.add_argument('--noQuiver', action = 'store_true', + default = False) + addFilteringOpts(parser_s) + + parser_s.add_argument('inputFofn', metavar = 'input.fofn', + help = 'input bas.h5 fofn file') + parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn', + help = 'input bc.h5 fofn file') + + parser_s.add_argument('--barcode', default = None, type = str, nargs = "+", + help = "Use this to extract consensus for just one barcode.") + + def getVersion(self): + return __version__ + + def run(self): + logging.debug("Arguments" + str(self.args)) + + if self.args.subCommand == 'labelZmws': + makeBarcodeFofnFromBasFofn() + elif self.args.subCommand == 'labelAlignments': + labelAlignments() + elif self.args.subCommand == 'emitFastqs': + emitFastqs() + elif self.args.subCommand == 'consensus': + callConsensus() + else: + sys.exit(1) + +runner = Pbbarcode() + +def main(): + """The entry point for pbbarcode""" + sys.exit(runner.start()) + +#if __name__ == '__main__': +# runner = Pbbarcode() +# sys.exit(runner.start()) diff --git a/tests/cram/consensus.t.disabled b/tests/cram/consensus.t.disabled new file mode 100644 index 0000000..5bd9285 --- /dev/null +++ b/tests/cram/consensus.t.disabled @@ -0,0 +1,88 @@ + $ export INH5=`python -c "from pbcore import data ; print 
data.getCmpH5()"` + $ export INBH51=`python -c "from pbcore import data ; print data.geBasH5s[0]"` + $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s[1]"` + $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta + $ echo $INBH51 > bas.fofn + $ echo $INBH52 >> bas.fofn + $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn + $ pbbarcode consensus bas.fofn barcode.fofn + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46[INFO] [blasr] started. + 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. 
+ [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] [INFO] 2013-08-02T00:28:462013-08-02T00:28:46 [blasr] started. [blasr] started. + + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. + [INFO] 2013-08-02T00:28:46 [blasr] ended. + [INFO] 2013-08-02T00:28:46 [blasr] started. 
+ [INFO] 2013-08-02T00:28:46 [blasr] ended. diff --git a/tests/cram/sanity.t b/tests/cram/sanity.t new file mode 100644 index 0000000..310c4db --- /dev/null +++ b/tests/cram/sanity.t @@ -0,0 +1,55 @@ + $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"` + $ export INBH51=`python -c "from pbcore import data ; print data.getBasH5s()[0]"` + $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s()[1]"` + $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta + $ echo $INBH51 > bas.fofn + $ echo $INBH52 >> bas.fofn + $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn + $ pbbarcode labelZmws --scoreMode paired $BARCODE_FASTA bas.fofn + $ pbbarcode labelZmws --scoreMode paired --scoreFirst $BARCODE_FASTA bas.fofn + $ pbbarcode labelZmws --scoreMode paired --scoreFirst --adapterSidePad 0 --insertSidePad 0 $BARCODE_FASTA bas.fofn + $ pbbarcode emitFastqs --fasta bas.fofn barcode.fofn + $ pbbarcode emitFastqs --trim 20 bas.fofn barcode.fofn + $ pbbarcode emitFastqs --subreads --trim 20 bas.fofn barcode.fofn + $ cp $INH5 ./aligned_reads.cmp.h5 + $ chmod 766 ./aligned_reads.cmp.h5 + $ pbbarcode labelAlignments barcode.fofn aligned_reads.cmp.h5 +Check that same holes get the same barcode (consistent scoring) + $ cmph5tools.py stats --what "(Movie,HoleNumber,Barcode,AverageBarcodeScore)" aligned_reads.cmp.h5 | uniq + Movie Barcode AverageBarcodeScore HoleNumber + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.00 3008 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.50 2001 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.00 4009 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.57 2008 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 14.33 3006 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.00 1000 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.00 
4004 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 14.50 1006 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.00 4006 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.33 2006 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 3002 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.33 2006 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 1009 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 3002 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 1009 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 13.33 1000 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.33 1007 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.50 9 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 13.00 1004 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.00 2002 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.80 2004 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.00 4007 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.80 2004 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.00 3008 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.33 2009 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 14.50 2007 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.57 2008 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 16.00 1002 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 13.33 1008 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.50 9 + 
m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.00 2000 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 11.67 9 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.00 2000 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 11.67 9 + m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.33 8 + m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 14.33 2003 diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100755 index 0000000..064bf71 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,32 @@ +import logging +import unittest + +# this is purely for the coverage to not fail when it's generated +import pbbarcode + +log = logging.getLogger(__name__) + + +class TestBasic(unittest.TestCase): + def test_01(self): + """Place holder so jenkins will generate a coverage report""" + self.assertTrue(True) + + + + + + + + + + + + + + + + + + + -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbbarcode.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
