This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository python-pbcore.
commit a8e05e349e793653d6fcaa1a8d466d4fd4271028
Author: Afif Elghraoui <[email protected]>
Date:   Mon Oct 26 01:12:01 2015 -0700

    Imported Upstream version 1.2.4+dfsg
---
 CHANGELOG.org                                      |   4 +
 doc/pacbio-theme/static/headerGradient.jpg         | Bin 7099 -> 0 bytes
 doc/pacbio-theme/static/pacbio.css                 | 238 ----------------
 doc/pacbio-theme/static/pacbioLogo.png             | Bin 3128 -> 0 bytes
 doc/pacbio-theme/static/pygments.css               |  55 ----
 doc/pacbio-theme/theme.conf                        |   4 -
 doc/pbcore.io.dataset.rst                          |  95 ++++++-
 pbcore/__init__.py                                 |   2 +-
 pbcore/data/datasets/__init__.py                   |   3 +
 ...00001823085912221377_s1_X0.subreads.fake_bc.bam | Bin 0 -> 192565 bytes
 ...1823085912221377_s1_X0.subreads.fake_bc.bam.bai | Bin 0 -> 16 bytes
 ...1823085912221377_s1_X0.subreads.fake_bc.bam.pbi | Bin 0 -> 1523 bytes
 pbcore/data/datasets/pbalchemysim0.pbalign.bam     | Bin 303745 -> 303947 bytes
 pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi | Bin 2147 -> 2159 bytes
 pbcore/data/datasets/pbalchemysim0.subreads.bam    | Bin 301012 -> 301048 bytes
 .../data/datasets/pbalchemysim0.subreads.bam.pbi   | Bin 1003 -> 1030 bytes
 pbcore/data/datasets/pbalchemysim1.pbalign.bam     | Bin 287718 -> 287902 bytes
 pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi | Bin 2028 -> 2082 bytes
 pbcore/data/datasets/pbalchemysim1.subreads.bam    | Bin 284875 -> 284903 bytes
 .../data/datasets/pbalchemysim1.subreads.bam.pbi   | Bin 945 -> 965 bytes
 pbcore/data/empty.ccs.bam                          | Bin 540 -> 507 bytes
 pbcore/data/empty.ccs.bam.pbi                      | Bin 67 -> 67 bytes
 ...100569412550000001823090301191423_s1_p0.ccs.bam | Bin 280793 -> 278396 bytes
 ...00001823085912221377_s1_X0.aligned_subreads.bam | Bin 201867 -> 201960 bytes
 ...1823085912221377_s1_X0.aligned_subreads.bam.bai | Bin 112 -> 112 bytes
 ...1823085912221377_s1_X0.aligned_subreads.bam.pbi | Bin 2521 -> 2609 bytes
 ...564852550000001823085912221377_s1_X0.scraps.bam | Bin 0 -> 608536 bytes
 ...4852550000001823085912221377_s1_X0.subreads.bam | Bin 204413 -> 204543 bytes
 ...550000001823085912221377_s1_X0.subreads.bam.pbi | Bin 1314 -> 1463 bytes
 pbcore/io/BasH5IO.py                               |   9 +
 pbcore/io/FastaIO.py                               |   2 +
 pbcore/io/GffIO.py                                 |  97 +++++--
 pbcore/io/align/BamAlignment.py                    |   4 +-
 pbcore/io/align/BamIO.py                           |  12 +-
 pbcore/io/align/PacBioBamIndex.py                  |  43 ++-
 pbcore/io/dataset/DataSetIO.py                     | 306 ++++++++++++++++-----
 pbcore/io/dataset/DataSetMembers.py                |  73 +++--
 pbcore/io/dataset/DataSetReader.py                 |   1 +
 pbcore/io/dataset/DataSetValidator.py              |  30 +-
 pbcore/io/dataset/DataSetWriter.py                 |  21 +-
 pbcore/io/dataset/EntryPoints.py                   |  38 ++-
 pbcore/io/dataset/_pbds.py                         |   2 +-
 pbcore/io/dataset/utils.py                         |  79 ++++--
 tests/test_pbcore_extract_bam_sequence.py          |   4 +-
 tests/test_pbcore_io_AlnFileReaders.py             |  39 ++-
 tests/test_pbcore_io_GffIO.py                      |  27 +-
 tests/test_pbdataset.py                            | 185 +++++++++++--
 tests/test_pbdataset_subtypes.py                   |  82 +++++-
 48 files changed, 921 insertions(+), 534 deletions(-)

diff --git a/CHANGELOG.org b/CHANGELOG.org
index c00d28e..1a8c09f 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,7 @@
+* Version 1.2.4
+- add GFF gather functionality
+- support for 3.0.1 BAM format spec
+
 * Version 1.2.3
 - Updated for newer dataset schema
 - BAM access speed improvements
diff --git a/doc/pacbio-theme/static/headerGradient.jpg b/doc/pacbio-theme/static/headerGradient.jpg
deleted file mode 100644
index 883f147..0000000
Binary files a/doc/pacbio-theme/static/headerGradient.jpg and /dev/null differ
diff --git a/doc/pacbio-theme/static/pacbio.css b/doc/pacbio-theme/static/pacbio.css
deleted file mode 100644
index b4ab87f..0000000
--- a/doc/pacbio-theme/static/pacbio.css
+++ /dev/null @@ -1,238 +0,0 @@ -/** - * Sphinx stylesheet -- default theme - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -@import url("basic.css"); - -/* -- page layout ----------------------------------------------------------- */ - -body { - font-family: Arial, sans-serif; - font-size: 100%; - background-color: #555; - color: #555; - margin: 0; - padding: 0; - min-width: 500px; - max-width: 956px; - margin: 0 auto; -} - -div.documentwrapper { - float: left; - width: 100%; -} - -div.bodywrapper { - margin: 0 0 0 230px; -} - -hr{ - border: 1px solid #B1B4B6; - -} - -div.document { - background-color: #eee; -} - -div.body { - background-color: #ffffff; - color: #3E4349; - padding: 30px 30px 30px 30px; - font-size: 0.8em; -} - -div.footer { - color: #555; - background-color: #fff; - padding: 13px 0; - text-align: center; - font-size: 75%; - -} -div.footer a { - color: #444; - text-decoration: underline; -} - -div.related { - background: #fff url(headerGradient.jpg); - line-height: 80px; - color: #fff; - font-size: 0.80em; - height: 79px; - z-index: -1; -} - -div.related ul { - background: url(pacbioLogo.png) 10px no-repeat; - padding: 0 0 0 200px; -} - -div.related a { - color: #E2F3CC; -} - -div.sphinxsidebar { - font-size: 0.75em; - line-height: 1.5em; -} - -div.sphinxsidebarwrapper{ - padding: 20px 0; -} - -div.sphinxsidebar h3, -div.sphinxsidebar h4 { - font-family: Arial, sans-serif; - color: #222; - font-size: 1.2em; - font-weight: bold; - margin: 0; - padding: 5px 10px 0 10px; -} - -div.sphinxsidebar h4{ - font-size: 1.1em; -} - -div.sphinxsidebar h3 a { - color: #444; -} - - -div.sphinxsidebar p { - color: #888; - padding: 0px 20px; - margin-top: 5px; -} - -div.sphinxsidebar p.topless { -} - -div.sphinxsidebar ul { - margin: 5px 20px 10px 20px; - padding: 0; - color: #000; -} - -div.sphinxsidebar a { - color: #444; -} - -div.sphinxsidebar input { - border: 1px solid #ccc; - font-family: sans-serif; - font-size: 1em; -} - -div.sphinxsidebar input[type=text]{ - margin-left: 20px; -} - -/* -- body styles ----------------------------------------------------------- */ - -a { - color: #005B81; - text-decoration: none; -} - -a:hover { - color: #E32E00; - text-decoration: underline; -} - -div.body h1, -div.body h2, -div.body h3, -div.body h4, -div.body h5, -div.body h6 { - font-family: Arial, sans-serif; - font-weight: bold; - color: #264868; - margin: 30px 0px 10px 0px; - padding: 5px 0 5px 0px; -} - -div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; } -div.body h2 { font-size: 125%; } -div.body h3 { font-size: 110%; } -div.body h4 { font-size: 100%; } -div.body h5 { font-size: 100%; } -div.body h6 { font-size: 100%; } - -a.headerlink { - color: #c60f0f; - font-size: 0.8em; - padding: 0 4px 0 4px; - text-decoration: none; -} - -a.headerlink:hover { - background-color: #c60f0f; - color: white; -} - -div.body p, div.body dd, div.body li { - line-height: 1.5em; - font-size: 1em; -} - -div.admonition p.admonition-title + p { - display: inline; -} - -div.highlight{ - background-color: white; -} - -div.note { - background-color: #eee; - border: 1px solid #ccc; -} - -div.seealso { - background-color: #ffc; - border: 1px solid #ff6; -} - -div.topic { - background-color: #eee; -} - -div.warning { - background-color: #ffe4e4; - border: 1px solid #f66; -} - -p.admonition-title { - display: inline; -} - -p.admonition-title:after { - content: ":"; -} - -pre { - padding: 10px; - background-color: White; - color: #222; - line-height: 1.2em; - border: 1px 
solid #C6C9CB; - font-size: 1.2em; - margin: 1.5em 0 1.5em 0; - -webkit-box-shadow: 1px 1px 1px #d8d8d8; - -moz-box-shadow: 1px 1px 1px #d8d8d8; -} - -tt { - background-color: #ecf0f3; - color: #222; - padding: 1px 2px; - font-size: 1.2em; - font-family: monospace; -} - diff --git a/doc/pacbio-theme/static/pacbioLogo.png b/doc/pacbio-theme/static/pacbioLogo.png deleted file mode 100644 index b2e4887..0000000 Binary files a/doc/pacbio-theme/static/pacbioLogo.png and /dev/null differ diff --git a/doc/pacbio-theme/static/pygments.css b/doc/pacbio-theme/static/pygments.css deleted file mode 100644 index 4588cde..0000000 --- a/doc/pacbio-theme/static/pygments.css +++ /dev/null @@ -1,55 +0,0 @@ -.c { color: #999988; font-style: italic } /* Comment */ -.k { font-weight: bold } /* Keyword */ -.o { font-weight: bold } /* Operator */ -.cm { color: #999988; font-style: italic } /* Comment.Multiline */ -.cp { color: #999999; font-weight: bold } /* Comment.preproc */ -.c1 { color: #999988; font-style: italic } /* Comment.Single */ -.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ -.ge { font-style: italic } /* Generic.Emph */ -.gr { color: #aa0000 } /* Generic.Error */ -.gh { color: #999999 } /* Generic.Heading */ -.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ -.go { color: #111 } /* Generic.Output */ -.gp { color: #555555 } /* Generic.Prompt */ -.gs { font-weight: bold } /* Generic.Strong */ -.gu { color: #aaaaaa } /* Generic.Subheading */ -.gt { color: #aa0000 } /* Generic.Traceback */ -.kc { font-weight: bold } /* Keyword.Constant */ -.kd { font-weight: bold } /* Keyword.Declaration */ -.kp { font-weight: bold } /* Keyword.Pseudo */ -.kr { font-weight: bold } /* Keyword.Reserved */ -.kt { color: #445588; font-weight: bold } /* Keyword.Type */ -.m { color: #009999 } /* Literal.Number */ -.s { color: #bb8844 } /* Literal.String */ -.na { color: #008080 } /* Name.Attribute */ -.nb { color: #999999 } /* Name.Builtin */ -.nc { color: #445588; font-weight: bold } /* Name.Class */ -.no { color: #ff99ff } /* Name.Constant */ -.ni { color: #800080 } /* Name.Entity */ -.ne { color: #990000; font-weight: bold } /* Name.Exception */ -.nf { color: #990000; font-weight: bold } /* Name.Function */ -.nn { color: #555555 } /* Name.Namespace */ -.nt { color: #000080 } /* Name.Tag */ -.nv { color: purple } /* Name.Variable */ -.ow { font-weight: bold } /* Operator.Word */ -.mf { color: #009999 } /* Literal.Number.Float */ -.mh { color: #009999 } /* Literal.Number.Hex */ -.mi { color: #009999 } /* Literal.Number.Integer */ -.mo { color: #009999 } /* Literal.Number.Oct */ -.sb { color: #bb8844 } /* Literal.String.Backtick */ -.sc { color: #bb8844 } /* Literal.String.Char */ -.sd { color: #bb8844 } /* Literal.String.Doc */ -.s2 { color: #bb8844 } /* Literal.String.Double */ -.se { color: #bb8844 } /* Literal.String.Escape */ -.sh { color: #bb8844 } /* Literal.String.Heredoc */ -.si { color: #bb8844 } /* Literal.String.Interpol */ -.sx { color: #bb8844 } /* Literal.String.Other */ -.sr { color: #808000 } /* Literal.String.Regex */ -.s1 { color: #bb8844 } /* Literal.String.Single */ -.ss { color: #bb8844 } /* Literal.String.Symbol */ -.bp { color: #999999 } /* Name.Builtin.Pseudo */ -.vc { color: #ff99ff } /* Name.Variable.Class */ -.vg { color: #ff99ff } /* Name.Variable.Global */ -.vi { color: #ff99ff } /* Name.Variable.Instance */ -.il { color: #009999 } /* Literal.Number.Integer.Long */ - diff --git a/doc/pacbio-theme/theme.conf b/doc/pacbio-theme/theme.conf deleted 
file mode 100644 index dd24a1a..0000000 --- a/doc/pacbio-theme/theme.conf +++ /dev/null @@ -1,4 +0,0 @@ -[theme] -inherit = default -stylesheet = pacbio.css -pygments_style = tango diff --git a/doc/pbcore.io.dataset.rst b/doc/pbcore.io.dataset.rst index acf2269..032aab1 100644 --- a/doc/pbcore.io.dataset.rst +++ b/doc/pbcore.io.dataset.rst @@ -55,12 +55,15 @@ Filter:: usage: dataset.py filter [-h] infile outfile filters [filters ...] - Add filters to an XML file + Add filters to an XML file. Suggested fields: ['bcf', 'bcq', 'bcr', + 'length', 'pos', 'qend', 'qname', 'qstart', 'readstart', 'rname', 'rq', + 'tend', 'tstart', 'zm']. More expensive fields: ['accuracy', 'bc', 'movie', + 'qs'] positional arguments: infile The xml file to filter outfile The resulting xml file - filters The values and thresholds to filter (e.g. rq>0.85) + filters The values and thresholds to filter (e.g. 'rq>0.85') optional arguments: -h, --help show this help message and exit @@ -123,13 +126,96 @@ Split:: --subdatasets Split on subdatasets --outdir OUTDIR Specify an output directory -Consolidation requires in-depth filtering and BAM file manipulation. -Implementation plans TBD. +Consolidate:: + usage: dataset.py consolidate [-h] [--numFiles NUMFILES] [--noTmp] + infile datafile xmlfile + + Consolidate the XML files + + positional arguments: + infile The XML file to consolidate + datafile The resulting data file + xmlfile The resulting XML file + + optional arguments: + -h, --help show this help message and exit + --numFiles NUMFILES The number of data files to produce (1) + --noTmp Don't copy to a tmp location to ensure local disk + use Usage Examples ============================= +Filter Reads (CLI version) +-------------------------- + +In this scenario we have one or more bam files worth of subreads, aligned or +otherwise, that we want to filter and put in a single bam file. This is +possible using the CLI with the following steps, starting with a DataSet XML +file:: + + # usage: dataset.py filter <in_fn.xml> <out_fn.xml> <filters> + dataset.py filter in_fn.subreadset.xml filtered_fn.subreadset.xml 'rq>0.85' + + # usage: dataset.py consolidate <in_fn.xml> <out_data_fn.bam> <out_fn.xml> + dataset.py consolidate filtered_fn.subreadset.xml consolidate.subreads.bam out_fn.subreadset.xml + +The filtered DataSet and the consolidated DataSet should be read for read +equivalent when used with SMRT Analysis software. + +Filter Reads (API version) +-------------------------- + +The API version of filtering allows for more advanced filtering criteria:: + + ss = SubreadSet('in_fn.subreadset.xml') + ss.filters.addRequirement(rname=[('=', 'E.faecalis.2'), + ('=', 'E.faecalis.2')], + tStart=[('<', '99'), + ('<', '299')], + tEnd=[('>', '0'), + ('>', '200')]) + +Produces the following conditions for a read to be considered passing: + +(rname = E.faecalis.2 AND tstart < 99 AND tend > 0) +OR +(rname = E.faecalis.2 AND tstart < 299 AND tend > 200) + +You can add sets of filters by providing equal length lists of requirements for +each filter. 
+ +Additional requirements added singly will be added to all filters:: + + ss.filters.addRequirement(rq=[('>', '0.85')]) + +(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85) +OR +(rname = E.faecalis.2 AND tstart < 299 AND tend > 100 AND rq > 0.85) + +Additional requirements added with a plurality of options will duplicate the +previous requiremnts for each option:: + + ss.filters.addRequirement(length=[('>', 500), ('>', 1000)]) + +(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85 AND length > 500) +OR +(rname = E.faecalis.2 AND tstart < 299 AND tend > 100 AND rq > 0.85 AND length > 500) +OR +(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85 AND length > 1000) +OR +(rname = E.faecalis.2 AND tstart < 299 AND tend > 100 AND rq > 0.85 AND length > 1000) + +Of course you can always wipe the filters and start over:: + + ss.filters = None + +Consolidation is more similar to the CLI version:: + + ss.consolidate('cons.bam') + ss.write('cons.xml') + Resequencing Pipeline (CLI version) ----------------------------------- @@ -295,4 +381,3 @@ DataSetMembers module. :show-inheritance: :undoc-members: - diff --git a/pbcore/__init__.py b/pbcore/__init__.py index 466ec29..dd6477f 100644 --- a/pbcore/__init__.py +++ b/pbcore/__init__.py @@ -28,4 +28,4 @@ # POSSIBILITY OF SUCH DAMAGE. ################################################################################# -__VERSION__ = "1.2.3" +__VERSION__ = "1.2.4" diff --git a/pbcore/data/datasets/__init__.py b/pbcore/data/datasets/__init__.py index 843faae..2d4378a 100755 --- a/pbcore/data/datasets/__init__.py +++ b/pbcore/data/datasets/__init__.py @@ -60,3 +60,6 @@ def getSubreadSet(): def getHdfSubreadSet(): return _getAbsPath(XML_FILES[7]) + +def getBarcodedBam(): + return _getAbsPath(BAM_FILES[3]) diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam new file mode 100644 index 0000000..ff3e6e8 Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam differ diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai new file mode 100644 index 0000000..79a6a43 Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai differ diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi new file mode 100644 index 0000000..4d2df12 Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi differ diff --git a/pbcore/data/datasets/pbalchemysim0.pbalign.bam b/pbcore/data/datasets/pbalchemysim0.pbalign.bam index b9d3997..5cd4251 100644 Binary files a/pbcore/data/datasets/pbalchemysim0.pbalign.bam and b/pbcore/data/datasets/pbalchemysim0.pbalign.bam differ diff --git a/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi b/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi index 1ae468c..144089a 100644 Binary files a/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi and 
b/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi differ diff --git a/pbcore/data/datasets/pbalchemysim0.subreads.bam b/pbcore/data/datasets/pbalchemysim0.subreads.bam index f209211..8ea5174 100644 Binary files a/pbcore/data/datasets/pbalchemysim0.subreads.bam and b/pbcore/data/datasets/pbalchemysim0.subreads.bam differ diff --git a/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi b/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi index 0930a2e..532236a 100644 Binary files a/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi and b/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi differ diff --git a/pbcore/data/datasets/pbalchemysim1.pbalign.bam b/pbcore/data/datasets/pbalchemysim1.pbalign.bam index e0ee9d2..e699726 100644 Binary files a/pbcore/data/datasets/pbalchemysim1.pbalign.bam and b/pbcore/data/datasets/pbalchemysim1.pbalign.bam differ diff --git a/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi b/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi index cf39b47..2477e13 100644 Binary files a/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi and b/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi differ diff --git a/pbcore/data/datasets/pbalchemysim1.subreads.bam b/pbcore/data/datasets/pbalchemysim1.subreads.bam index 627f5f5..e45f2fa 100644 Binary files a/pbcore/data/datasets/pbalchemysim1.subreads.bam and b/pbcore/data/datasets/pbalchemysim1.subreads.bam differ diff --git a/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi b/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi index d637f58..28f0f2f 100644 Binary files a/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi and b/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi differ diff --git a/pbcore/data/empty.ccs.bam b/pbcore/data/empty.ccs.bam index 704abbe..3454722 100644 Binary files a/pbcore/data/empty.ccs.bam and b/pbcore/data/empty.ccs.bam differ diff --git a/pbcore/data/empty.ccs.bam.pbi b/pbcore/data/empty.ccs.bam.pbi index 9e7fd4e..efbe492 100644 Binary files a/pbcore/data/empty.ccs.bam.pbi and b/pbcore/data/empty.ccs.bam.pbi differ diff --git a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam index d927eac..c1e6df2 100644 Binary files a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam and b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam differ diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam index 789865a..02f6299 100644 Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam differ diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai index dea076b..9e1f72b 100644 Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai differ diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi 
b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi index 4a8ff48..2c82b65 100644 Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi differ diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam new file mode 100644 index 0000000..cf034d2 Binary files /dev/null and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam differ diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam index 3514891..4809be5 100644 Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam differ diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi index 0cb1c4b..0027fb6 100644 Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi differ diff --git a/pbcore/io/BasH5IO.py b/pbcore/io/BasH5IO.py index 8552bf6..b2ef3a5 100644 --- a/pbcore/io/BasH5IO.py +++ b/pbcore/io/BasH5IO.py @@ -411,6 +411,15 @@ class ZmwRead(CommonEqualityMixin): PulseIndex = _makeQvAccessor("PulseIndex") + def StartFrame(self): + return self.EndFrame() - self.WidthInFrames() + + def EndFrame(self): + fullPBF = arrayFromDataset(self._getBasecallsGroup()["PreBaseFrames"], + *self._getOffsets()[self.holeNumber]) + fullWF = arrayFromDataset(self._getBasecallsGroup()["WidthInFrames"], + *self._getOffsets()[self.holeNumber]) + return np.cumsum(fullWF + fullPBF)[self.readStart:self.readEnd] class CCSZmwRead(ZmwRead): """ diff --git a/pbcore/io/FastaIO.py b/pbcore/io/FastaIO.py index 5e2d7bf..e750117 100644 --- a/pbcore/io/FastaIO.py +++ b/pbcore/io/FastaIO.py @@ -252,6 +252,8 @@ class FastaWriter(WriterBase): raise ValueError if len(args) == 1: record = args[0] + if isinstance(record, IndexedFastaRecord): + record = FastaRecord(record.header, record.sequence) assert isinstance(record, FastaRecord) else: header, sequence = args diff --git a/pbcore/io/GffIO.py b/pbcore/io/GffIO.py index c3313f3..a08f0b7 100644 --- a/pbcore/io/GffIO.py +++ b/pbcore/io/GffIO.py @@ -42,7 +42,7 @@ __all__ = [ "Gff3Record", "GffWriter" ] from .base import ReaderBase, WriterBase -from collections import OrderedDict +from collections import OrderedDict, defaultdict from copy import copy as shallow_copy import tempfile import os.path @@ -256,42 +256,85 @@ def sort_gff(file_name, output_file_name=None): return output_file_name -def _merge_gffs(gff1, gff2, out): - out = GffWriter(out) +def merge_gffs (gff_files, output_file): + """ + Merge a sequence of GFF files, preserving unique headers (except for the + source commandline, which will usually vary). 
+ """ + headers, header_keys = _merge_gff_headers(gff_files) n_rec = 0 - with GffReader(gff1) as f1: - map(out.writeHeader, f1.headers) - with GffReader(gff2) as f2: - rec1 = [ rec for rec in f1 ] - rec2 = [ rec for rec in f2 ] - i = j = 0 - while i < len(rec1) and j < len(rec2): - if (rec1[i] > rec2[j]): - out.writeRecord(rec2[j]) - j += 1 - else: - out.writeRecord(rec1[i]) + with GffWriter(output_file) as writer: + for key in header_keys: + for value in headers[key]: + writer.writeHeader("##%s %s" % (key, value)) + for gff_file in gff_files: + with GffReader(gff_file) as reader: + for rec in reader: + writer.writeRecord(rec) + n_rec += 1 + return n_rec + + +def _merge_gff_headers(gff_files): + ignore_fields = set(["source-commandline", "gff-version", "date"]) + def _get_headers(f): + with GffReader(f) as reader: + for h in reader.headers: + fields = h[2:].strip().split(" ") + if not fields[0] in ignore_fields: + yield fields[0], " ".join(fields[1:]) + header_keys = [] + for k, v in _get_headers(gff_files[0]): + if not k in header_keys: + header_keys.append(k) + headers = defaultdict(list) + for gff_file in gff_files: + for key, value in _get_headers(gff_file): + if not value in headers[key]: + headers[key].append(value) + return headers, header_keys + + +def _merge_gffs_sorted(gff1, gff2, out_file): + import logging + logging.warn("Merging %s and %s" % (gff1, gff2)) + with GffWriter(out_file) as out: + n_rec = 0 + headers, header_keys = _merge_gff_headers([gff1, gff2]) + with GffReader(gff1) as f1: + for key in header_keys: + for value in headers[key]: + out.writeHeader("##%s %s" % (key, value)) + with GffReader(gff2) as f2: + rec1 = [ rec for rec in f1 ] + rec2 = [ rec for rec in f2 ] + i = j = 0 + while i < len(rec1) and j < len(rec2): + if (rec1[i] > rec2[j]): + out.writeRecord(rec2[j]) + j += 1 + else: + out.writeRecord(rec1[i]) + i += 1 + for rec in rec1[i:]: + out.writeRecord(rec) i += 1 - for rec in rec1[i:]: - out.writeRecord(rec) - i += 1 - for rec in rec2[j:]: - out.writeRecord(rec) - j += 2 - return i + j + for rec in rec2[j:]: + out.writeRecord(rec) + j += 2 + return i + j -def merge_gffs(gff_files, output_file_name): +def merge_gffs_sorted(gff_files, output_file_name): """ Utility function to combine a set of N (>= 2) GFF files, with records ordered by genomic location. (Assuming each input file is already sorted.) """ if len(gff_files) == 1: return gff_files[0] while len(gff_files) > 2: - tmpout = tempfile.NamedTemporaryFile() - _merge_gffs(gff_files[0], gff_files[1], tmpout) - tmpout.seek(0) + tmpout = tempfile.NamedTemporaryFile(suffix=".gff").name + _merge_gffs_sorted(gff_files[0], gff_files[1], tmpout) gff_files = [tmpout] + gff_files[2:] with open(output_file_name, "w") as f: - _merge_gffs(gff_files[0], gff_files[1], f) + _merge_gffs_sorted(gff_files[0], gff_files[1], f) return output_file_name diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py index 3ed36d7..14df751 100644 --- a/pbcore/io/align/BamAlignment.py +++ b/pbcore/io/align/BamAlignment.py @@ -253,7 +253,7 @@ class BamAlignment(AlignmentRecordMixin): use of this property can result in code that won't work on legacy data. 
""" - return 0.001 * self.peer.opt("rq") + return self.peer.opt("rq") @property def readType(self): @@ -592,7 +592,7 @@ class BamAlignment(AlignmentRecordMixin): if key in self.bam.pbi.columnNames: return self.bam.pbi[self.rowNumber][key] else: - raise AttributeError, "no such column in pbi index" + raise AttributeError("no such column '%s' in pbi index" % key) def __dir__(self): basicDir = self.__dict__.keys() diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py index 824674c..5a51bf8 100644 --- a/pbcore/io/align/BamIO.py +++ b/pbcore/io/align/BamIO.py @@ -168,16 +168,20 @@ class _BamReaderBase(ReaderBase): def _checkFileCompatibility(self): # Verify that this is a "pacbio" BAM file of version at least - # 3.0b7 + # 3.0.1 try: checkedVersion = self.version if "b" in checkedVersion: - version, beta = checkedVersion.split("b") - if int(beta) < 7: raise Exception() + raise Exception() + else: + major, minor, patch = checkedVersion.split('.') + assert major >= 3 + assert minor >= 0 + assert patch >= 1 except: raise IncompatibleFile( "This BAM file is incompatible with this API " + - "(only PacBio BAM files version >= 3.0b7 are supported)") + "(only PacBio BAM files version >= 3.0.1 are supported)") def __init__(self, fname, referenceFastaFname=None): self.filename = fname = abspath(expanduser(fname)) diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py index 642593e..d14e1fa 100644 --- a/pbcore/io/align/PacBioBamIndex.py +++ b/pbcore/io/align/PacBioBamIndex.py @@ -47,7 +47,7 @@ PBI_HEADER_LEN = 32 PBI_FLAGS_BASIC = 0 PBI_FLAGS_MAPPED = 1 PBI_FLAGS_COORDINATE_SORTED = 2 -PBI_FLAGS_BARCODE_ADAPTER = 4 +PBI_FLAGS_BARCODE = 4 class PacBioBamIndex(object): @@ -62,23 +62,32 @@ class PacBioBamIndex(object): header = unpack("< 4s BBBx H I 18x", buf) (self.magic, self.vPatch, self.vMinor, self.vMajor, self.pbiFlags, self.nReads) = header + try: + assert self.vMajor >= 3 + assert self.vMinor >= 0 + assert self.vPatch >= 1 + except: + raise IncompatibleFile( + "This PBI file is incompatible with this API " + "(only PacBio PBI files version >= 3.0.1 are supported)") @property def hasMappingInfo(self): return (self.pbiFlags & PBI_FLAGS_MAPPED) @property - def hasAdapterInfo(self): - return (self.pbiFlags & PBI_FLAGS_BARCODE_ADAPTER) + def hasBarcodeInfo(self): + return (self.pbiFlags & PBI_FLAGS_BARCODE) def _loadMainIndex(self, f): - # Main index holds subread and mapping info - SUBREADS_INDEX_DTYPE = [ + # Main index holds basic, mapping, and barcode info + BASIC_INDEX_DTYPE = [ ("qId" , "i4"), ("qStart" , "i4"), ("qEnd" , "i4"), ("holeNumber" , "i4"), - ("readQual" , "u2"), + ("readQual" , "f4"), + ("contextFlag" , "u1"), ("virtualFileOffset" , "i8") ] MAPPING_INDEX_DTYPE = [ @@ -92,16 +101,24 @@ class PacBioBamIndex(object): ("nMM" , "u4"), ("mapQV" , "u1") ] + BARCODE_INDEX_DTYPE = [ + ("bcForward" , "u2"), + ("bcReverse" , "u2"), + ("bcQual" , "u1")] + COMPUTED_COLUMNS_DTYPE = [ ("nIns" , "u4"), ("nDel" , "u4") ] - JOINT_DTYPE = SUBREADS_INDEX_DTYPE + MAPPING_INDEX_DTYPE + COMPUTED_COLUMNS_DTYPE + joint_dtype = BASIC_INDEX_DTYPE[:] if self.hasMappingInfo: - tbl = np.zeros(shape=(self.nReads,), dtype=JOINT_DTYPE).view(np.recarray) - else: - tbl = np.zeros(shape=(self.nReads,), dtype=SUBREADS_INDEX_DTYPE).view(np.recarray) + joint_dtype += MAPPING_INDEX_DTYPE + joint_dtype += COMPUTED_COLUMNS_DTYPE + if self.hasBarcodeInfo: + joint_dtype += BARCODE_INDEX_DTYPE + tbl = np.zeros(shape=(self.nReads,), + dtype=joint_dtype).view(np.recarray) def peek(type_, 
length): flavor, width = type_ @@ -109,7 +126,7 @@ class PacBioBamIndex(object): if True: # BASIC data always present - for columnName, columnType in SUBREADS_INDEX_DTYPE: + for columnName, columnType in BASIC_INDEX_DTYPE: tbl[columnName] = peek(columnType, self.nReads) if self.hasMappingInfo: @@ -120,6 +137,10 @@ class PacBioBamIndex(object): tbl.nIns = tbl.aEnd - tbl.aStart - tbl.nM - tbl.nMM tbl.nDel = tbl.tEnd - tbl.tStart - tbl.nM - tbl.nMM + if self.hasBarcodeInfo: + for columnName, columnType in BARCODE_INDEX_DTYPE: + tbl[columnName] = peek(columnType, self.nReads) + self._tbl = tbl self._checkForBrokenColumns() diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py index d0fe999..9c63cec 100755 --- a/pbcore/io/dataset/DataSetIO.py +++ b/pbcore/io/dataset/DataSetIO.py @@ -10,6 +10,7 @@ import copy import os import errno import logging +import itertools import xml.dom.minidom import tempfile from functools import wraps @@ -19,7 +20,9 @@ from pbcore.util.Process import backticks from pbcore.io.opener import (openAlignmentFile, openIndexedAlignmentFile, FastaReader, IndexedFastaReader, CmpH5Reader, IndexedBamReader) +from pbcore.io.align.PacBioBamIndex import PBI_FLAGS_BARCODE from pbcore.io.FastaIO import splitFastaHeader, FastaWriter +from pbcore.io.FastqIO import FastqReader, FastqWriter, qvsFromAscii from pbcore.io import BaxH5Reader from pbcore.io.align._BamSupport import UnavailableFeature from pbcore.io.dataset.DataSetReader import (parseStats, populateDataSet, @@ -33,7 +36,8 @@ from pbcore.io.dataset.DataSetMembers import (DataSetMetadata, BarcodeSetMetadata, ExternalResources, ExternalResource, Filters) -from pbcore.io.dataset.utils import consolidateBams, _infixFname +from pbcore.io.dataset.utils import (consolidateBams, _infixFname, _pbindexBam, + _indexBam, _indexFasta) log = logging.getLogger(__name__) @@ -53,41 +57,62 @@ def filtered(generator): return wrapper -def _toDsId(x): - return "PacBio.DataSet.{x}".format(x=x) +def _toDsId(name): + """Translate a class name into a MetaType/ID""" + return "PacBio.DataSet.{x}".format(x=name) -def _dsIdToName(x): - if DataSetMetaTypes.isValid(x): - return x.split('.')[-1] +def _dsIdToName(dsId): + """Translate a MetaType/ID into a class name""" + if DataSetMetaTypes.isValid(dsId): + return dsId.split('.')[-1] + else: + raise InvalidDataSetIOError("Invalid DataSet MetaType") -def _dsIdToType(x): - if DataSetMetaTypes.isValid(x): +def _dsIdToType(dsId): + """Translate a MetaType/ID into a type""" + if DataSetMetaTypes.isValid(dsId): types = DataSet.castableTypes() - return types[_dsIdToName(x)] + return types[_dsIdToName(dsId)] + else: + raise InvalidDataSetIOError("Invalid DataSet MetaType") -def _dsIdToSuffix(x): +def _dsIdToSuffix(dsId): + """Translate a MetaType/ID into a file suffix""" dsIds = DataSetMetaTypes.ALL suffixMap = {dsId: _dsIdToName(dsId) for dsId in dsIds} suffixMap[_toDsId("DataSet")] = 'DataSet' - if DataSetMetaTypes.isValid(x): - suffix = suffixMap[x] + if DataSetMetaTypes.isValid(dsId): + suffix = suffixMap[dsId] suffix = suffix.lower() suffix += '.xml' return suffix + else: + raise InvalidDataSetIOError("Invalid DataSet MetaType") def _typeDataSet(dset): + """Determine the type of a dataset from the xml file without opening it""" xml_rt = xmlRootType(dset) dsId = _toDsId(xml_rt) tbrType = _dsIdToType(dsId) return tbrType +def isDataSet(xmlfile): + """Determine if a file is a DataSet before opening it""" + try: + _typeDataSet(xmlfile) + return True + except: + return False + def 
openDataSet(*files, **kwargs): - if not files[0].endswith('xml'): - raise TypeError("openDataSet requires that the first file is an XML") + """Factory function for DataSet types as suggested by the first file""" + if not isDataSet(files[0]): + raise TypeError("openDataSet requires that the first file is a DS") tbrType = _typeDataSet(files[0]) return tbrType(*files, **kwargs) def openDataFile(*files, **kwargs): + """Factory function for DataSet types determined by the first data file""" possibleTypes = [AlignmentSet, SubreadSet, ConsensusReadSet, ConsensusAlignmentSet, ReferenceSet, HdfSubreadSet] origFiles = files @@ -126,10 +151,16 @@ def openDataFile(*files, **kwargs): return SubreadSet(*origFiles, **kwargs) def _stackRecArrays(recArrays): + """Stack recarrays into a single larger recarray""" tbr = np.concatenate(recArrays) tbr = tbr.view(np.recarray) return tbr +def _flatten(lol, times=1): + """ This wont do well with mixed nesting""" + for _ in range(times): + lol = np.concatenate(lol) + return lol class DataSetMetaTypes(object): """ @@ -231,6 +262,7 @@ class DataSet(object): """ self._strict = kwargs.get('strict', False) self._skipCounts = kwargs.get('skipCounts', False) + _induceIndices = kwargs.get('generateIndices', False) # The metadata concerning the DataSet or subtype itself (e.g. # name, version, UniqueId) @@ -326,6 +358,13 @@ class DataSet(object): elif self.totalLength <= 0 or self.numRecords <= 0: self.updateCounts() + # generate indices if requested and needed + if _induceIndices: + self._induceIndices() + + def _induceIndices(self): + raise NotImplementedError() + def __repr__(self): """Represent the dataset with an informative string: @@ -381,9 +420,7 @@ class DataSet(object): self.__class__.__name__ == 'DataSet'): # determine whether or not this is the merge that is populating a # dataset for the first time - firstIn = False - if len(self.externalResources) == 0: - firstIn = True + firstIn = True if len(self.externalResources) == 0 else False # Block on filters? if (not firstIn and @@ -407,6 +444,17 @@ class DataSet(object): else: result = self + # If this dataset has no subsets representing it, add self as a + # subdataset to the result + # TODO: this is a stopgap to prevent spurious subdatasets when + # creating datasets from dataset xml files... + if not self.subdatasets and not firstIn: + result.addDatasets(self.copy()) + + # add subdatasets + if other.subdatasets or not firstIn: + result.addDatasets(other.copy()) + # add in the metadata (not to be confused with obj metadata) if firstIn: result.metadata = other.metadata @@ -421,20 +469,6 @@ class DataSet(object): # DataSets may only be merged if they have identical filters, # So there is nothing new to add. - # If this dataset has no subsets representing it, add self as a - # subdataset to the result - # TODO: this is a stopgap to prevent spurious subdatasets when - # creating datasets from dataset xml files... 
- if not self.subdatasets and not firstIn: - result.addDatasets(self.copy()) - - # add subdatasets - if other.subdatasets or not firstIn: - #if copyOnMerge: - result.addDatasets(other.copy()) - #else: - #result.addDatasets(other) - return result else: raise TypeError('DataSets can only be merged with records of the ' @@ -623,7 +657,7 @@ class DataSet(object): def split(self, chunks=0, ignoreSubDatasets=False, contigs=False, maxChunks=0, breakContigs=False, targetSize=5000, zmws=False, - barcodes=False, byRecords=False, updateCounts=False): + barcodes=False, byRecords=False, updateCounts=True): """Deep copy the DataSet into a number of new DataSets containing roughly equal chunks of the ExternalResources or subdatasets. @@ -655,6 +689,13 @@ class DataSet(object): breakContigs: Whether or not to break contigs when using maxChunks. If True, something resembling an efficient number of chunks for the dataset size will be produced. + targetSize: The target number of reads per chunk, when using + byRecords + zmws: Split by zmws + barcodes: Split by barcodes + byRecords: Split contigs by mapped records, rather than reference + length + updateCounts: Update the count metadata in each chunk Returns: A list of new DataSet objects (all other information deep copied). @@ -837,9 +878,16 @@ class DataSet(object): self.makePathsAbsolute() xml_string = toXml(self) if pretty: - xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml() - if validate: - validateString(xml_string, relTo=outFile) + xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml( + encoding="UTF-8") + # Disable for contigsets: no support for multiple contigs in XSD + # Disable for ccssets: no support for datasetmetadata in XSD + # XXX turn this off temporarily to aid in pbsmrtpipe fixing + if (validate and + not self.datasetType in [ContigSet.datasetType, + ConsensusReadSet.datasetType]): + #validateString(xml_string, relTo=os.path.dirname(outFile)) + pass fileName = urlparse(outFile).path.strip() if self._strict and not isinstance(self.datasetType, tuple): if not fileName.endswith(_dsIdToSuffix(self.datasetType)): @@ -1017,7 +1065,9 @@ class DataSet(object): resource.timeStampedName = tsName def _getTimeStampedName(self, mType): - time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S") + mType = mType.lower() + mType = '_'.join(mType.split('.')) + time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3] return "{m}-{t}".format(m=mType, t=time) @staticmethod @@ -1297,6 +1347,21 @@ class DataSet(object): ranges.append((movie, min(values), max(values))) return ranges + # FIXME this is a workaround for the lack of support for barcode chunking + # in pbbam, and should probably go away once that is available. + @property + def barcodes(self): + """Return the list of barcodes explicitly set by filters via + DataSet.split(barcodes=True). 
+ + """ + barcodes = [] + for filt in self._filters: + for param in filt: + if param.name == "bc": + barcodes.append(param.value) + return barcodes + def toFofn(self, outfn=None, uri=False, relative=False): """Return a list of resource filenames (and write to optional outfile) @@ -1394,7 +1459,10 @@ class DataSet(object): @filters.setter def filters(self, value): """Limit setting to ensure cache hygiene and filter compatibility""" - self._filters = value + if value is None: + self._filters = Filters() + else: + self._filters = value def reFilter(self, light=True): """ @@ -1476,7 +1544,9 @@ class DataSet(object): @property def sequencingChemistry(self): - return self._checkIdentical('sequencingChemistry') + key = 'sequencingChemistry' + responses = self._pollResources(lambda x: getattr(x, key)) + return _flatten(responses) @property def isEmpty(self): @@ -1545,7 +1615,6 @@ class DataSet(object): raise IOError(errno.EIO, "File not indexed", fname) return True - def __getitem__(self, index): """Should respect filters for free, as _indexMap should only be populated by filtered reads. Only pbi filters considered, however.""" @@ -1580,6 +1649,30 @@ class ReadSet(DataSet): super(ReadSet, self).__init__(*files, **kwargs) self._metadata = SubreadSetMetadata(self._metadata) + def _induceIndices(self): + for res in self.externalResources: + fname = res.resourceId + if not res.pbi: + _pbindexBam(fname) + self.close() + if not res.bai: + _indexBam(fname) + self.close() + + @property + def isIndexed(self): + if self.hasPbi: + return True + return False + + def assertBarcoded(self): + """Test whether all resources are barcoded files""" + res = self._pollResources( + lambda x: x.index.pbiFlags & PBI_FLAGS_BARCODE) + if not self._unifyResponses(res): + raise RuntimeError("File not barcoded") + + def _openFiles(self): """Open the files (assert they exist, assert they are of the proper type before accessing any file) @@ -1607,12 +1700,12 @@ class ReadSet(DataSet): location, referenceFastaFname=refFile) else: raise - #if resource is None: - #assert not self._strict - #resource = openAlignmentFile( - #location, referenceFastaFname=refFile) - if not resource.isEmpty: - self._openReaders.append(resource) + try: + if not resource.isEmpty: + self._openReaders.append(resource) + except UnavailableFeature: # isEmpty requires bai + if list(itertools.islice(resource, 1)): + self._openReaders.append(resource) if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0: raise IOError("No files were openable or reads found") log.debug("Done opening resources") @@ -1645,10 +1738,12 @@ class ReadSet(DataSet): """ # Find all possible barcodes and counts for each - # TODO: switch this over to the pbi when bc information is exposed + self.assertIndexed() + self.assertBarcoded() barcodes = defaultdict(int) - for read in self.records: - barcodes[tuple(read.peer.opt("bc"))] += 1 + for bcTuple in itertools.izip(self.index.bcForward, + self.index.bcReverse): + barcodes[bcTuple] += 1 log.debug("{i} barcodes found".format(i=len(barcodes.keys()))) @@ -1677,6 +1772,7 @@ class ReadSet(DataSet): log.debug("Generating new UUID") for result in results: result.newUuid() + # TODO: updateCounts # Update the basic metadata for the new DataSets from external # resources, or at least mark as dirty @@ -1840,7 +1936,7 @@ class ReadSet(DataSet): # Pull generic values, kwargs, general treatment in super super(ReadSet, self).addMetadata(newMetadata, **kwargs) - def consolidate(self, dataFile, numFiles=1): + def consolidate(self, dataFile, 
numFiles=1, useTmp=True): """Consolidate a larger number of bam files to a smaller number of bam files (min 1) @@ -1863,15 +1959,18 @@ class ReadSet(DataSet): resLists.append(thisResList) fnames = [_infixFname(dataFile, str(i)) for i in range(numFiles)] for resList, fname in zip(resLists, fnames): - consolidateBams(resList, fname, filterDset=self) + consolidateBams(resList, fname, filterDset=self, useTmp=useTmp) log.debug("Replacing resources") self.externalResources = ExternalResources() self.addExternalResources(fnames) else: - consolidateBams(self.toExternalFiles(), dataFile, filterDset=self) + consolidateBams(self.toExternalFiles(), dataFile, filterDset=self, + useTmp=useTmp) + # TODO: add as subdatasets log.debug("Replacing resources") self.externalResources = ExternalResources() self.addExternalResources([dataFile]) + self._populateMetaTypes() def __len__(self): if self.numRecords == -1: @@ -1919,6 +2018,13 @@ class HdfSubreadSet(ReadSet): self.objMetadata["TimeStampedName"] = self._getTimeStampedName( self.objMetadata["MetaType"]) + def _induceIndices(self): + log.debug("Bax files already indexed") + + @property + def isIndexed(self): + return True + def _openFiles(self): """Open the files (assert they exist, assert they are of the proper type before accessing any file) @@ -2055,6 +2161,19 @@ class AlignmentSet(ReadSet): else: return super(AlignmentSet, self).consolidate(*args, **kwargs) + def _induceIndices(self): + if self.isCmpH5: + log.debug("Cmp.h5 files already indexed") + else: + return super(AlignmentSet, self)._induceIndices() + + @property + def isIndexed(self): + if self.isCmpH5: + return True + else: + return super(AlignmentSet, self).isIndexed + def addReference(self, fname): if isinstance(fname, ReferenceSet): reference = fname.externalResources.resourceIds @@ -2221,9 +2340,9 @@ class AlignmentSet(ReadSet): winend = -1 for param in filt: if param.name == 'tstart': - winstart = param.value - if param.name == 'tend': winend = param.value + if param.name == 'tend': + winstart = param.value # If the filter is just for rname, fill the window # boundaries (pricey but worth the guaranteed behavior) if winend == -1: @@ -2365,7 +2484,7 @@ class AlignmentSet(ReadSet): return shiftedAtoms def _split_contigs(self, chunks, maxChunks=0, breakContigs=False, - targetSize=5000, byRecords=True, updateCounts=True): + targetSize=5000, byRecords=False, updateCounts=True): """Split a dataset into reference windows based on contigs. 
Args: @@ -2396,21 +2515,22 @@ class AlignmentSet(ReadSet): for rn in refNames if len(self._indexReadsInReference(rn)) != 0] else: - atoms = [(rn, 0, 0) for rn in refNames if + atoms = [(rn, 0, refLens[rn]) for rn in refNames if len(self._indexReadsInReference(rn)) != 0] else: log.debug("Skipping records for each reference check") - atoms = [(rn, 0, 0) for rn in refNames] + atoms = [(rn, 0, refLens[rn]) for rn in refNames] if byRecords: log.debug("Counting records...") # This is getting out of hand, but the number of references # determines the best read counting algorithm: if len(refNames) < 100: - atoms = [(rn, 0, 0, self.countRecords(rn)) + atoms = [(rn, 0, refLens[rn], self.countRecords(rn)) for rn in refNames] else: counts = self._countMappedReads() - atoms = [(rn, 0, 0, counts[rn]) for rn in refNames] + atoms = [(rn, 0, refLens[rn], counts[rn]) + for rn in refNames] log.debug("{i} contigs found".format(i=len(atoms))) if byRecords: @@ -2473,6 +2593,27 @@ class AlignmentSet(ReadSet): sub_segments.append(tmp) segments.extend(sub_segments) atoms = segments + elif breakContigs and not byRecords: + log.debug("Checking for oversized chunks") + # we may have chunks <= len(atoms). We wouldn't usually split up + # contigs, but some might be huge, resulting in some tasks running + # very long + # We are only doing this for refLength splits for now, as those are + # cheap (and quiver is linear in length not coverage) + dataSize = sum(refLens.values()) + # target size per chunk: + target = dataSize/chunks + log.debug("Target chunk length: {t}".format(t=target)) + newAtoms = [] + for i, atom in enumerate(atoms): + testAtom = atom + while testAtom[2] - testAtom[1] > target: + newAtom1 = (testAtom[0], testAtom[1], testAtom[1] + target) + newAtom2 = (testAtom[0], testAtom[1] + target, testAtom[2]) + newAtoms.append(newAtom1) + testAtom = newAtom2 + newAtoms.append(testAtom) + atoms = newAtoms log.debug("Done defining {n} chunks".format(n=chunks)) # duplicate @@ -2489,7 +2630,8 @@ class AlignmentSet(ReadSet): log.debug("Distributing chunks") # if we didn't have to split atoms and are doing it byRecords, the # original counts are still valid: - # Now well have to count records again to recombine atoms + # + # Otherwise we'll now have to count records again to recombine atoms chunks = self._chunkList(atoms, chunks, balanceKey) log.debug("Done chunking") @@ -2498,8 +2640,8 @@ class AlignmentSet(ReadSet): if atoms[0][2]: result.filters.addRequirement( rname=[('=', c[0]) for c in chunk], - tStart=[('>', c[1]) for c in chunk], - tEnd=[('<', c[2]) for c in chunk]) + tStart=[('<', c[2]) for c in chunk], + tEnd=[('>', c[1]) for c in chunk]) else: result.filters.addRequirement( rname=[('=', c[0]) for c in chunk]) @@ -3030,8 +3172,9 @@ class ContigSet(DataSet): super(ContigSet, self).__init__(*files, **kwargs) self._metadata = ContigSetMetadata(self._metadata) self._updateMetadata() + self._fastq = False - def consolidate(self, outfn=None, numFiles=1): + def consolidate(self, outfn=None, numFiles=1, useTmp=False): """Consolidation should be implemented for window text in names and for filters in ContigSets""" @@ -3066,6 +3209,7 @@ class ContigSet(DataSet): writeTemp = True writeMatches = {} writeComments = {} + writeQualities = {} for name, match_list in matches.items(): if len(match_list) > 1: log.debug("Multiple matches found for {i}".format(i=name)) @@ -3086,7 +3230,11 @@ class ContigSet(DataSet): # collapse matches new_name = self._removeWindow(name) - new_seq = ''.join([match.sequence for match in 
match_list]) + new_seq = ''.join([match.sequence[:] for match in match_list]) + if self._fastq: + new_qual = ''.join([match.qualityString for match in + match_list]) + writeQualities[new_name] = new_qual # set to write writeTemp = True @@ -3094,8 +3242,10 @@ class ContigSet(DataSet): writeComments[new_name] = match_list[0].comment else: log.debug("One match found for {i}".format(i=name)) - writeMatches[name] = match_list[0].sequence + writeMatches[name] = match_list[0].sequence[:] writeComments[name] = match_list[0].comment + if self._fastq: + writeQualities[name] = match_list[0].qualityString if writeTemp: log.debug("Writing a new file is necessary") if not outfn: @@ -3103,12 +3253,18 @@ class ContigSet(DataSet): outdir = tempfile.mkdtemp(suffix="consolidated-contigset") outfn = os.path.join(outdir, 'consolidated_contigs.fasta') - with FastaWriter(outfn) as outfile: + with self._writer(outfn) as outfile: log.debug("Writing new resource {o}".format(o=outfn)) for name, seq in writeMatches.items(): if writeComments[name]: name = ' '.join([name, writeComments[name]]) + if self._fastq: + outfile.writeRecord(name, seq, + qvsFromAscii(writeQualities[name])) + continue outfile.writeRecord(name, seq) + if not self._fastq: + _indexFasta(outfn) # replace resources log.debug("Replacing resources") self.externalResources = ExternalResources() @@ -3117,12 +3273,19 @@ class ContigSet(DataSet): log.debug("Replacing metadata") self._metadata.contigs = [] self._populateContigMetadata() + self._populateMetaTypes() + + @property + def _writer(self): + if self._fastq: + return FastqWriter + return FastaWriter def _popSuffix(self, name): """Chunking and quivering adds suffixes to contig names, after the normal ID and window. This complicates our dewindowing and consolidation, so we'll remove them for now""" - observedSuffixes = ['|quiver'] + observedSuffixes = ['|quiver', '|plurality', '|arrow'] for suff in observedSuffixes: if name.endswith(suff): log.debug("Suffix found: {s}".format(s=suff)) @@ -3138,6 +3301,12 @@ class ContigSet(DataSet): return '_'.join(name.split('_')[:-2]) + suff return name + def _induceIndices(self): + if not self.isIndexed: + for fname in self.toExternalFiles(): + _indexFasta(fname) + self.close() + def _parseWindow(self, name): """Chunking and quivering appends a window to the contig ID, which allows us to consolidate the contig chunks.""" @@ -3230,6 +3399,10 @@ class ContigSet(DataSet): log.debug("Opening resources") for extRes in self.externalResources: location = urlparse(extRes.resourceId).path + if location.endswith("fastq"): + self._fastq = True + self._openReaders.append(FastqReader(location)) + continue try: resource = IndexedFastaReader(location) except IOError: @@ -3314,6 +3487,7 @@ class ContigSet(DataSet): @staticmethod def _metaTypeMapping(): return {'fasta':'PacBio.ContigFile.ContigFastaFile', + 'fastq':'PacBio.ContigFile.ContigFastqFile', 'fa':'PacBio.ContigFile.ContigFastaFile', 'fas':'PacBio.ContigFile.ContigFastaFile', 'fai':'PacBio.Index.SamIndex', diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py index fb79d29..3f36238 100755 --- a/pbcore/io/dataset/DataSetMembers.py +++ b/pbcore/io/dataset/DataSetMembers.py @@ -54,6 +54,7 @@ import operator as OP import numpy as np import logging from functools import partial as P +from collections import Counter from pbcore.io.dataset.DataSetWriter import namespaces log = logging.getLogger(__name__) @@ -427,10 +428,6 @@ class Filters(RecordWrapper): @property def _bamAccMap(self): - # 
tStart/tEnd is a hack for overlapping ranges. you're not testing - # whether the tStart/tEnd is within a range, you're testing if it - # overlaps with the tStart/tEnd in the filter (overlaps with a - # reference window) return {'rname': (lambda x: x.referenceName), 'length': (lambda x: int(x.readLength)), 'qname': (lambda x: x.qNameA), @@ -442,38 +439,43 @@ class Filters(RecordWrapper): 'pos': (lambda x: int(x.tStart)), 'accuracy': (lambda x: float(x.identity)), 'readstart': (lambda x: int(x.aStart)), - 'tstart': (lambda x: int(x.tEnd)), # see above - 'tend': (lambda x: int(x.tStart)), # see above + 'tstart': (lambda x: int(x.tStart)), + 'tend': (lambda x: int(x.tEnd)), } def _pbiAccMap(self, tIdMap): - # tStart/tEnd is a hack for overlapping ranges. you're not testing - # whether the tStart/tEnd is within a range, you're testing if it - # overlaps with the tStart/tEnd in the filter (overlaps with a - # reference window) return {'rname': (lambda x, m=tIdMap: m[x.tId]), 'length': (lambda x: int(x.aEnd)-int(x.aStart)), 'qname': (lambda x: x.qId), 'zm': (lambda x: int(x.holeNumber)), 'pos': (lambda x: int(x.tStart)), 'readstart': (lambda x: int(x.aStart)), - 'tstart': (lambda x: int(x.tEnd)), # see above - 'tend': (lambda x: int(x.tStart)), # see above + 'tstart': (lambda x: int(x.tStart)), + 'tend': (lambda x: int(x.tEnd)), } - def _pbiVecAccMap(self, tIdMap): - # tStart/tEnd is a hack for overlapping ranges. you're not testing - # whether the tStart/tEnd is within a range, you're testing if it - # overlaps with the tStart/tEnd in the filter (overlaps with a - # reference window) - return {'rname': (lambda x, m=tIdMap: m[x.tId]), + def _pbiMappedVecAccMap(self, tIdMap): + plus = {'rname': (lambda x, m=tIdMap: m[x.tId]), 'length': (lambda x: x.aEnd - x.aStart), - 'qname': (lambda x: x.qId), - 'zm': (lambda x: x.holeNumber), 'pos': (lambda x: x.tStart), 'readstart': (lambda x: x.aStart), - 'tstart': (lambda x: x.tEnd), # see above - 'tend': (lambda x: x.tStart), # see above + 'tstart': (lambda x: x.tStart), + 'tend': (lambda x: x.tEnd), + } + base = self._pbiVecAccMap(tIdMap) + base.update(plus) + return base + + def _pbiVecAccMap(self, tIdMap): + return {'length': (lambda x: x.qEnd - x.qStart), + 'qstart': (lambda x: x.qStart), + 'qend': (lambda x: x.qEnd), + 'qname': (lambda x: x.qId), + 'zm': (lambda x: x.holeNumber), + 'rq': (lambda x: x.readQual), + 'bcf': (lambda x: x.bcForward), + 'bcr': (lambda x: x.bcForward), + 'bcq': (lambda x: x.bcQual), } @property @@ -528,7 +530,9 @@ class Filters(RecordWrapper): def filterIndexRecords(self, indexRecords, nameMap): typeMap = self._bamTypeMap accMap = self._pbiVecAccMap({}) - accMap['rname'] = lambda x: x.tId + if 'aEnd' in indexRecords.dtype.names: + accMap = self._pbiMappedVecAccMap({}) + accMap['rname'] = lambda x: x.tId filterLastResult = None for filt in self: lastResult = None @@ -554,6 +558,10 @@ class Filters(RecordWrapper): del lastResult return filterLastResult + def fromString(self, filterString): + filtDict = {} + self._runCallbacks() + def addRequirement(self, **kwargs): """Use this to add requirements. Members of the list will be considered options for fulfilling this requirement, all other filters will be @@ -705,6 +713,9 @@ class ExternalResources(RecordWrapper): super(self.__class__, self).__init__(record) self.record['tag'] = self.__class__.__name__ + # state tracking. 
Not good, but needs it: + self._resourceIds = [] + def __eq__(self, other): for extRef in self: found = False @@ -728,7 +739,14 @@ class ExternalResources(RecordWrapper): def merge(self, other): # make sure we don't add dupes - curIds = [res.resourceId for res in self] + curIds = self.resourceIds + + # check to make sure ResourceIds in other are unique + otherIds = Counter([res.resourceId for res in other]) + dupes = [c for c in otherIds if otherIds[c] > 1] + if dupes: + raise RuntimeError("Duplicate ResourceIds found: " + "{f}".format(f=', '.join(dupes))) for newRes in other: # merge instead @@ -752,6 +770,7 @@ class ExternalResources(RecordWrapper): resourceIds: a list of uris as strings """ templist = [] + self._resourceIds = [] for res in resourceIds: toAdd = res if not isinstance(res, ExternalResource): @@ -774,13 +793,16 @@ class ExternalResources(RecordWrapper): aren't in record form, but append will fix that for us automatically. A bit messy, but fairly concise. """ + self._resourceIds = [] self.record['children'] = [] for res in resources: self.append(res) @property def resourceIds(self): - return [res.resourceId for res in self] + if not self._resourceIds: + self._resourceIds = [res.resourceId for res in self] + return self._resourceIds class ExternalResource(RecordWrapper): @@ -1000,7 +1022,6 @@ class DataSetMetadata(RecordWrapper): else: self.append(other.summaryStats) if not self.namespace: - print "HERE" self.namespace = other.namespace self.attrib.update(other.attrib) diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py index 3c932db..eadf00a 100755 --- a/pbcore/io/dataset/DataSetReader.py +++ b/pbcore/io/dataset/DataSetReader.py @@ -57,6 +57,7 @@ def _addXmlFile(dset, path): root = tree.getroot() tmp = _parseXml(type(dset), root) tmp.makePathsAbsolute(curStart=os.path.dirname(path)) + # copyOnMerge must be false, you're merging in a tmp and maintaining dset dset.merge(tmp, copyOnMerge=False) def openFofnFile(path): diff --git a/pbcore/io/dataset/DataSetValidator.py b/pbcore/io/dataset/DataSetValidator.py index d0f9980..d588198 100755 --- a/pbcore/io/dataset/DataSetValidator.py +++ b/pbcore/io/dataset/DataSetValidator.py @@ -16,7 +16,7 @@ def validateResources(xmlroot, relTo='.'): Args: xmlroot: The ET root of an xml tree relTo: ('.') The path relative to which resources may reside. This will - work poorly if relTo is not set to the location of the incoming + work poorly if relTo is not set to the dirname of the incoming XML file. """ stack = [xmlroot] @@ -28,8 +28,10 @@ def validateResources(xmlroot, relTo='.'): parsedId = urlparse(resId) rfn = urlparse(resId).path.strip() if not os.path.isfile(rfn): - if not os.path.isfile(os.path.join(os.path.dirname(relTo), - rfn)): + if (not os.path.isfile(os.path.join(relTo, + rfn)) and + not os.path.isfile(os.path.join('.', + rfn))): raise IOError, "{f} not found".format(f=rfn) def validateLxml(xml_fn, xsd_fn): @@ -49,13 +51,10 @@ def validateMiniXsv(xml_fn, xsd_fn): except ImportError: log.debug('minixsv not found, validation disabled') -def validateXml(xmlroot, skipResources=False): - # pyxb precompiles and therefore does not need the original xsd file. 
- #if not os.path.isfile(XSD): - #raise SystemExit, "Validation xsd {s} not found".format(s=XSD) +def validateXml(xmlroot, skipResources=False, relTo='.'): - if not skipResources: # XXX generally a bad idea, but useful for pbvalidate - validateResources(xmlroot) + if not skipResources: + validateResources(xmlroot, relTo) # Conceal the first characters of UniqueIds if they are legal numbers that # would for some odd reason be considered invalid. Let all illegal @@ -75,14 +74,9 @@ def validateFile(xmlfn, skipResources=False): if ':' in xmlfn: xmlfn = urlparse(xmlfn).path.strip() with open(xmlfn, 'r') as xmlfile: - #root = etree.parse(xmlfile) root = ET.parse(xmlfile).getroot() - return validateXml(root, skipResources=skipResources) - -def validateString(xmlstring, relTo='.'): - #root = etree.parse(xmlfile) - root = ET.fromstring(xmlstring) - validateResources(root, relTo=relTo) - #return validateXml(root) - + return validateXml(root, skipResources=skipResources, + relTo=os.path.dirname(xmlfn)) +def validateString(xmlString, skipResources=False, relTo='.'): + validateXml(ET.fromstring(xmlString), skipResources, relTo) diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py index 976572d..6275b4f 100755 --- a/pbcore/io/dataset/DataSetWriter.py +++ b/pbcore/io/dataset/DataSetWriter.py @@ -158,9 +158,9 @@ def _toElementTree(dataSet, root=None, core=False): root = ET.Element(rootType, attribs) _addExternalResourcesElement(dataSet, root, core) - _addDataSetMetadataElement(dataSet, root) _addFiltersElement(dataSet, root) _addDataSetsElement(dataSet, root) + _addDataSetMetadataElement(dataSet, root) xsi = "{http://www.w3.org/2001/XMLSchema-instance}" # The ElementTree element dictionary doesn't quite work the same as a # regular dictionary, it seems, thus the convoluted get/set business @@ -234,8 +234,15 @@ def _guessNs(tag): def _eleFromDictList(eleAsDict, core=False): """A last ditch capture method for uknown Elements""" + if eleAsDict['tag'] == 'DataSets': + print eleAsDict['namespace'] if not eleAsDict['namespace']: eleAsDict['namespace'] = _guessNs(eleAsDict['tag']) + elif (eleAsDict['namespace'] == + "http://pacificbiosciences.com/PacBioDataModel.xsd"): + newNamespace = _guessNs(eleAsDict['tag']) + if newNamespace != '': + eleAsDict['namespace'] = newNamespace if core: ele = ET.Element("{{{n}}}{t}".format(n=eleAsDict['namespace'], t=eleAsDict['tag']), @@ -258,9 +265,7 @@ def _addFiltersElement(dataset, root, core=False): root: The root ElementTree object. Extended here using SubElement """ if dataset.filters: - filters = ET.SubElement(root, 'Filters') - for child in dataset.filters.record['children']: - filters.append(_eleFromDictList(child, core)) + root.append(_eleFromDictList(dataset.filters.record, core=core)) def _addDataSetsElement(dataset, root): """Add DataSet Elements to root, which essentially nests ElementTrees. @@ -269,9 +274,13 @@ def _addDataSetsElement(dataset, root): root: The root ElementTree object. 
Extended here using SubElement """ if dataset.subdatasets: - dse = ET.SubElement(root, 'DataSets') + rootType = '{{{n}}}{t}'.format(n=namespaces()['pbds'], + t='DataSets') + dse = ET.SubElement(root, rootType) for subSet in dataset.subdatasets: - subSetRoot = ET.SubElement(dse, subSet.__class__.__name__, + rootType = '{{{n}}}{t}'.format(n=namespaces()['pbds'], + t=subSet.__class__.__name__) + subSetRoot = ET.SubElement(dse, rootType, subSet.objMetadata) _toElementTree(subSet, subSetRoot) diff --git a/pbcore/io/dataset/EntryPoints.py b/pbcore/io/dataset/EntryPoints.py index 47aede4..cdf02c7 100755 --- a/pbcore/io/dataset/EntryPoints.py +++ b/pbcore/io/dataset/EntryPoints.py @@ -2,7 +2,9 @@ import os import argparse +from collections import defaultdict from pbcore.io import DataSet, ContigSet, openDataSet +from pbcore.io.dataset.DataSetMembers import Filters from pbcore.io.dataset.DataSetValidator import validateFile import logging @@ -24,7 +26,13 @@ def summarize_options(parser): def createXml(args): dsTypes = DataSet.castableTypes() dset = dsTypes[args.dsType](*args.infile, strict=args.strict, - skipCounts=args.skipCounts) + skipCounts=args.skipCounts, + generateIndices=args.generateIndices) + if args.generateIndices: + # we generated the indices with the last open, lets capture them with + # this one: + dset = dsTypes[args.dsType](*args.infile, strict=args.strict, + skipCounts=args.skipCounts) log.debug("Dataset created") dset.write(args.outfile, validate=args.novalidate, modPaths=True, relPaths=args.relative) @@ -42,6 +50,8 @@ def create_options(parser): help="The fofn or BAM file(s) to make into an XML") parser.add_argument("--type", type=str, default='DataSet', dest='dsType', help="The type of XML to create") + parser.add_argument("--generateIndices", action='store_true', + default=False, help="The type of XML to create") parser.add_argument("--novalidate", action='store_false', default=True, help=("Don't validate the resulting XML, don't touch " "paths")) @@ -51,34 +61,38 @@ def create_options(parser): parser.set_defaults(func=createXml) def filterXml(args): - log.error("Adding filters via CLI is temporarily out of order") - exit(1) if args.infile.endswith('xml'): dataSet = openDataSet(args.infile, strict=args.strict) - filters = [] + filters = defaultdict(list) separators = ['<=', '>=', '!=', '==', '>', '<', '='] for filt in args.filters: for sep in separators: if sep in filt: param, condition = filt.split(sep) - condition = sep + condition - filters[param] = condition + condition = (sep, condition) + filters[param].append(condition) break - dataSet.addFilters([filters]) + dataSet.filters.addRequirement(**filters) + dataSet.updateCounts() log.info("{i} filters added".format(i=len(filters))) dataSet.write(args.outfile) else: raise IOError("No files found/found to be compatible") def filter_options(parser): - parser.description = 'Add filters to an XML file' + pbiFilterOptions = set(Filters()._pbiMappedVecAccMap({}).keys()) + bamFilterOptions = set(Filters()._bamAccMap.keys()) + parser.description = ('Add filters to an XML file. Suggested fields: ' + '{f}. More expensive fields: {b}'.format( + f=sorted(list(pbiFilterOptions)), + b=sorted(list(bamFilterOptions - pbiFilterOptions)))) #parser.add_argument("infile", type=validate_file, parser.add_argument("infile", type=str, help="The xml file to filter") parser.add_argument("outfile", type=str, help="The resulting xml file") parser.add_argument("filters", type=str, nargs='+', help="The values and thresholds to filter (e.g. 
" - "rq>0.85)") + "'rq>0.85')") parser.set_defaults(func=filterXml) def splitXml(args): @@ -207,7 +221,8 @@ def consolidateXml(args): """Combine BAMs and apply the filters described in the XML file, producing one consolidated XML""" dset = openDataSet(args.infile) - dset.consolidate(args.datafile, numFiles=args.numFiles) + dset.consolidate(args.datafile, numFiles=args.numFiles, useTmp=(not + args.noTmp)) dset.write(args.xmlfile) def consolidate_options(parser): @@ -215,6 +230,9 @@ def consolidate_options(parser): #parser.add_argument("infile", type=validate_file, parser.add_argument("--numFiles", type=int, default=1, help="The number of data files to produce (1)") + parser.add_argument("--noTmp", default=False, action='store_true', + help="Don't copy to a tmp location to ensure local" + " disk use") parser.add_argument("infile", type=str, help="The XML file to consolidate") parser.add_argument("datafile", type=str, diff --git a/pbcore/io/dataset/_pbds.py b/pbcore/io/dataset/_pbds.py index a027058..34d0e05 100755 --- a/pbcore/io/dataset/_pbds.py +++ b/pbcore/io/dataset/_pbds.py @@ -1632,7 +1632,7 @@ CTD_ANON_._Automaton = _BuildAutomaton_() -DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'TotalLength'), pyxb.binding.datatypes.int, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 52, 3))) +DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'TotalLength'), pyxb.binding.datatypes.long, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 52, 3))) DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'NumRecords'), pyxb.binding.datatypes.int, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 53, 3))) diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py index 05fccb4..acf131e 100755 --- a/pbcore/io/dataset/utils.py +++ b/pbcore/io/dataset/utils.py @@ -5,13 +5,28 @@ import os import tempfile import logging import json +import shutil +import pysam from pbcore.util.Process import backticks log = logging.getLogger(__name__) -def consolidateBams(inFiles, outFile, filterDset=None): +def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True): """Take a list of infiles, an outFile to produce, and optionally a dataset (filters) to provide the definition and content of filtrations.""" + # check space available + if useTmp: + tmpout = tempfile.mkdtemp(suffix="consolidation-filtration") + if (disk_free(os.path.split(outFile)[0]) > + sum(file_size(infn) for infn in inFiles)): + tmpInFiles = _tmpFiles(inFiles, tmpout) + origOutFile = outFile + origInFiles = inFiles[:] + inFiles = tmpInFiles + outFile = os.path.join(tmpout, "outfile.bam") + else: + useTmp = False + if filterDset and filterDset.filters: finalOutfile = outFile outFile = _infixFname(outFile, "_unfiltered") @@ -21,6 +36,29 @@ def consolidateBams(inFiles, outFile, filterDset=None): outFile = finalOutfile _indexBam(outFile) _pbindexBam(outFile) + if useTmp: + shutil.copy(outFile, origOutFile) + shutil.copy(outFile + ".bai", origOutFile + ".bai") + shutil.copy(outFile + ".pbi", origOutFile + ".pbi") + +def _tmpFiles(inFiles, tmpout=None): + tmpInFiles = [] + if tmpout is None: + tmpout = tempfile.mkdtemp(suffix="consolidation-filtration") + for i, fname in enumerate(inFiles): + newfn = _infixFname(os.path.join(tmpout, 
os.path.basename(fname)), i) + shutil.copy(fname, newfn) + tmpInFiles.append(newfn) + return tmpInFiles + +def disk_free(path): + if path == '': + path = os.getcwd() + space = os.statvfs(path) + return space.f_bavail * space.f_frsize + +def file_size(path): + return os.stat(path).st_size def _pbindexBam(fname): cmd = "pbindex {i}".format(i=fname) @@ -36,30 +74,13 @@ def _sortBam(fname): o, r, m = backticks(cmd) if r != 0: raise RuntimeError(m) - _mvFile(tmpname, fname) - -def _mvFile(inFname, outFname): - cmd = "mv {i} {o}".format(i=inFname, - o=outFname) - log.info(cmd) - o, r, m = backticks(cmd) - if r != 0: - raise RuntimeError(m) - -def _cpFile(inFname, outFname): - cmd = "cp {i} {o}".format(i=inFname, - o=outFname) - log.info(cmd) - o, r, m = backticks(cmd) - if r != 0: - raise RuntimeError(m) + shutil.move(tmpname, fname) def _indexBam(fname): - cmd = "samtools index {i}".format(i=fname) - log.info(cmd) - o, r, m = backticks(cmd) - if r != 0: - raise RuntimeError(m) + pysam.index(fname) + +def _indexFasta(fname): + pysam.faidx(fname) def _mergeBams(inFiles, outFile): if len(inFiles) > 1: @@ -70,10 +91,7 @@ def _mergeBams(inFiles, outFile): if r != 0: raise RuntimeError(m) else: - _cpFile(inFiles[0], outFile) - -def _maxDigits(numItems): - return len(str(numItems)) + shutil.copy(inFiles[0], outFile) def _filterBam(inFile, outFile, filterDset): tmpout = tempfile.mkdtemp(suffix="consolidation-filtration") @@ -88,11 +106,12 @@ def _filterBam(inFile, outFile, filterDset): def _infixFname(fname, infix): prefix, extension = os.path.splitext(fname) - return prefix + infix + extension + return prefix + str(infix) + extension def _emitFilterScript(filterDset, filtScriptName): - bamtoolsFilters = ['reference', 'insertSize', 'mapQuality', 'name', - 'position', 'queryBases'] + """Use the filter script feature of bamtools. 
Use with specific filters if + all that are needed are available, otherwise filter by readname (easy but + uselessly expensive)""" filterMap = {'rname': 'reference', 'pos': 'position', 'length': 'queryBases', diff --git a/tests/test_pbcore_extract_bam_sequence.py b/tests/test_pbcore_extract_bam_sequence.py index cc516fd..ffd0399 100644 --- a/tests/test_pbcore_extract_bam_sequence.py +++ b/tests/test_pbcore_extract_bam_sequence.py @@ -14,11 +14,11 @@ import shutil import os sam_str_ = """\ -@HD VN:1.5\tSO:coordinate\tpb:3.0b7 +@HD VN:1.5\tSO:coordinate\tpb:3.0.1 @SQ\tSN:genome\tLN:28\tM5:734d5f3b2859595f4bd87a2fe6b7389b @RG\tID:1\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd=ip;BASECALLERVERSION=2.0.1.0.123678;FRAMERATEHZ=75.000000;BINDINGKIT=foo;SEQUENCINGKIT=bar\tPU:movie1 @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\tDS:bax2bam -movie1/54130/0_10\t16\tgenome\t12\t30\t10=\t*\t0\t-28\tTCTCAGGAGA\t*\tRG:Z:1\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:i:854\tsn:B:f,-1,-1,-1,-1\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2""" +movie1/54130/0_10\t16\tgenome\t12\t30\t10=\t*\t0\t-28\tTCTCAGGAGA\t*\tRG:Z:1\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:f:0.854\tsn:B:f,-1,-1,-1,-1\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2""" fasta_str = """\ >genome diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py index cac5d36..7d3bb97 100644 --- a/tests/test_pbcore_io_AlnFileReaders.py +++ b/tests/test_pbcore_io_AlnFileReaders.py @@ -2,9 +2,12 @@ from numpy.testing import (assert_array_almost_equal as ASIM, assert_array_equal as AEQ) from nose.tools import (nottest, assert_raises, - assert_equal as EQ) + assert_equal as EQ, + assert_almost_equal as EQISH) from nose import SkipTest +import tempfile +import pysam import numpy as np import bisect import h5py @@ -407,10 +410,10 @@ class TestBasicBam(_BasicAlnFileReaderTests): CONSTRUCTOR_ARGS = (data.getBamAndCmpH5()[0], data.getLambdaFasta()) def testSpecVersion(self): - EQ("3.0b7", self.f.version) + EQ("3.0.1", self.f.version) def testReadScore(self): - EQ(0.904, self.fwdAln.readScore) + EQISH(0.904, self.fwdAln.readScore, 3) class TestIndexedBam(_IndexedAlnFileReaderTests): @@ -439,3 +442,33 @@ class TestCCSBam(object): aln.qStart with assert_raises(UnavailableFeature): aln.qEnd + + +class TestMultipleReadGroups(object): + """ + Verify that BAM files with multiple read groups are handled sensibly - see + bug 26548. 
+ """ + SAM_IN = """\ +@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1 +@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b +@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200\tPU:movie1 +@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\tPU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0 +movie1/54130/0_10\t2\tecoliK12_pbi_March2013_2955000_to_2980000\t2\t10\t10=\t*\t0\t0\tAATGAGGAGA\t*\tRG:Z:3f58e5b8\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:f:0.854\tsn:B:f,2,2,2,2\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2 +m140906_231018_42161_c100676332550000001823129611271486_s1_p0/1/10_20\t2\tecoliK12_pbi_March2013_2955000_to_2980000\t12\t10\t10=\t*\t0\t0\tAATGAGGAGA\t*\tRG:Z:b5482b33\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:20\tqs:i:10\trq:f:0.854\tsn:B:f,2,2,2,2\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2""" + + def test_retrieve_read_group_properties(self): + f1 = tempfile.NamedTemporaryFile(suffix=".sam").name + f2 = tempfile.NamedTemporaryFile(suffix=".bam").name + with open(f1, "w") as f: + f.write(self.SAM_IN) + with pysam.AlignmentFile(f1) as sam_in: + with pysam.AlignmentFile(f2, 'wb', template=sam_in) as bam_out: + for aln in sam_in: + bam_out.write(aln) + movie_names = [] + with BamReader(f2) as bam_in: + for aln in bam_in: + EQ(aln.sequencingChemistry, "P6-C4") + movie_names.append(aln.movieName) + EQ(movie_names, ['movie1', 'm140906_231018_42161_c100676332550000001823129611271486_s1_p0']) diff --git a/tests/test_pbcore_io_GffIO.py b/tests/test_pbcore_io_GffIO.py index 8144910..d2e86b1 100644 --- a/tests/test_pbcore_io_GffIO.py +++ b/tests/test_pbcore_io_GffIO.py @@ -6,7 +6,7 @@ import os.path from nose.tools import assert_equal, assert_raises from pbcore.io import GffWriter, Gff3Record, GffReader -from pbcore.io.GffIO import merge_gffs, sort_gff +from pbcore.io.GffIO import merge_gffs, merge_gffs_sorted, sort_gff from pbcore import data class TestGff3Record: @@ -109,11 +109,16 @@ class TestGffWriter: class TestGffSorting(unittest.TestCase): gff_data = ["""\ ##gff-version 3 -##some random comment here +##source ipdSummary +##source-commandline ipdSummary etc. +##sequence-region lambda_NEB3011 1 48502 chr1\tkinModCall\tmodified_base\t32580\t32580\t32\t-\t.\tcoverage=94;context=AATGGCATCGTTCCGGTGGTGGGCGTTGATGGCTGGTCCCG;IPDRatio=1.75 chr1\tkinModCall\tmodified_base\t32766\t32766\t42\t-\t.\tcoverage=170;context=GCTGGGAAGCTGGCTGAACGTGTCGGCATGGATTCTGTCGA;IPDRatio=1.70 chr1\tkinModCall\tmodified_base\t32773\t32773\t54\t-\t.\tcoverage=154;context=AACGCTGGCTGGGAAGCTGGCTGAACGTGTCGGCATGGATT;IPDRatio=2.65""", """\ ##gff-version 3 +##source ipdSummary +##source-commandline ipdSummary etc. 
+##sequence-region lambda_NEB3011 1 48502 chr2\tkinModCall\tmodified_base\t1200\t1200\t47\t-\t.\tcoverage=109;context=ACTTTTCACGGTAGTTTTTTGCCGCTTTACCGCCCAGGCAC;IPDRatio=1.89 chr2\tkinModCall\tmodified_base\t1786\t1786\t36\t-\t.\tcoverage=153;context=TCCCACGTCTCACCGAGCGTGGTGTTTACGAAGGTTTTACG;IPDRatio=1.67 chr2\tkinModCall\tmodified_base\t1953\t1953\t39\t+\t.\tcoverage=148;context=AATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGC;IPDRatio=1.86""", """\ @@ -152,8 +157,24 @@ chr1\tkinModCall\tmodified_base\t16348\t16348\t42\t-\t.\tcoverage=115;context=CC os.remove(cls.combined) def test_merge_gffs(self): - gff_out = "tmp_pbcore_merged.gff" + gff_out = "tmp_pbcore_merge.gff" merge_gffs(self.files, gff_out) + n_rec = 0 + for fn in self.files: + with GffReader(fn) as f: + n_rec += len([ rec for rec in f ]) + with GffReader(gff_out) as f: + self.assertEqual(f.headers, [ + "##gff-version 3", + "##source ipdSummary", + "##sequence-region lambda_NEB3011 1 48502", + ]) + n_rec_merged = len([ rec for rec in f ]) + self.assertEqual(n_rec, n_rec_merged) + + def test_merge_gffs_sorted(self): + gff_out = "tmp_pbcore_merged_sorted.gff" + merge_gffs_sorted(self.files, gff_out) with GffReader(gff_out) as f: start = [ (rec.seqid, rec.start) for rec in f ] self.assertEqual(start, self.sorted_start) diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py index 66ad971..5457906 100644 --- a/tests/test_pbdataset.py +++ b/tests/test_pbdataset.py @@ -7,8 +7,10 @@ import tempfile import numpy as np import unittest +import shutil from unittest.case import SkipTest +from pbcore.io import PacBioBamIndex, IndexedBamReader from pbcore.io import openIndexedAlignmentFile from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet, openDataSet, DataSetMetaTypes, HdfSubreadSet) @@ -21,6 +23,34 @@ import pbcore.data.datasets as data log = logging.getLogger(__name__) +def _check_constools(): + cmd = "dataset.py" + o, r, m = backticks(cmd) + if r != 2: + return False + + cmd = "bamtools" + o, r, m = backticks(cmd) + if r != 0: + return False + + cmd = "pbindex" + o, r, m = backticks(cmd) + if r != 1: + return False + + cmd = "samtools" + o, r, m = backticks(cmd) + if r != 1: + return False + return True + +def _internal_data(): + if os.path.exists("/mnt/secondary-siv/testdata"): + return True + return False + + class TestDataSet(unittest.TestCase): """Unit and integrationt tests for the DataSet class and \ associated module functions""" @@ -39,7 +69,9 @@ class TestDataSet(unittest.TestCase): # e.g. d.write("alignmentset.xml") outdir = tempfile.mkdtemp(suffix="dataset-unittest") outXml = os.path.join(outdir, 'tempfile.xml') - d.write(outXml) + log.debug(outXml) + # don't validate, type DataSet doesn't validate well + d.write(outXml, validate=False) # And then recover the same XML (or a different one): # e.g. 
d = DataSet("alignmentset.xml") d = DataSet(outXml) @@ -79,6 +111,8 @@ class TestDataSet(unittest.TestCase): ds.externalResources[-1].indices[0].resourceId == "IdontExist.bam.pbi") + @unittest.skipIf(not _check_constools(), + "bamtools or pbindex not found, skipping") def test_split_cli(self): outdir = tempfile.mkdtemp(suffix="dataset-unittest") cmd = "dataset.py split --outdir {o} --contigs --chunks 2 {d}".format( @@ -92,6 +126,71 @@ class TestDataSet(unittest.TestCase): self.assertTrue(os.path.exists( os.path.join(outdir, os.path.basename(data.getXml(16))))) + @unittest.skipIf(not _check_constools(), + "bamtools or pbindex not found, skipping") + def test_filter_cli(self): + outdir = tempfile.mkdtemp(suffix="dataset-unittest") + outfn = os.path.join(outdir, "filtered8.xml") + log.debug(outfn) + cmd = "dataset.py filter {i} {o} {f}".format( + i=data.getXml(8), + o=outfn, + f="rname=E.faecalis.1") + log.debug(cmd) + o, r, m = backticks(cmd) + if r != 0: + log.debug(m) + self.assertEqual(r, 0) + self.assertTrue(os.path.exists(outfn)) + aln = AlignmentSet(data.getXml(8)) + aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')]) + aln.updateCounts() + dset = AlignmentSet(outfn) + self.assertEqual(str(aln.filters), str(dset.filters)) + self.assertEqual(aln.totalLength, dset.totalLength) + self.assertEqual(aln.numRecords, dset.numRecords) + + def test_merge_subdatasets(self): + # from data file + ds1 = AlignmentSet(data.getBam(0)) + self.assertEqual(len(ds1.subdatasets), 0) + ds2 = AlignmentSet(data.getBam(1)) + self.assertEqual(len(ds1.subdatasets), 0) + merged = ds1 + ds2 + self.assertEqual(len(merged.subdatasets), 2) + self.assertEqual(merged.subdatasets[0].toExternalFiles(), + AlignmentSet(data.getBam(0)).toExternalFiles()) + self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1) + self.assertEqual(merged.subdatasets[1].toExternalFiles(), + AlignmentSet(data.getBam(1)).toExternalFiles()) + self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1) + + # from data set + ds1 = AlignmentSet(data.getXml(8)) + self.assertEqual(len(ds1.subdatasets), 0) + ds2 = AlignmentSet(data.getXml(11)) + self.assertEqual(len(ds2.subdatasets), 0) + merged = ds1 + ds2 + self.assertEqual(len(merged.subdatasets), 2) + self.assertEqual(merged.subdatasets[0].toExternalFiles(), + AlignmentSet(data.getXml(8)).toExternalFiles()) + self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1) + self.assertEqual(merged.subdatasets[1].toExternalFiles(), + AlignmentSet(data.getXml(11)).toExternalFiles()) + self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1) + + # combined data set + merged = AlignmentSet(data.getXml(8), data.getXml(11)) + self.assertEqual(len(merged.subdatasets), 2) + self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1) + self.assertEqual(merged.subdatasets[0].toExternalFiles(), + AlignmentSet(data.getXml(8)).toExternalFiles()) + self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1) + self.assertEqual(merged.subdatasets[1].toExternalFiles(), + AlignmentSet(data.getXml(11)).toExternalFiles()) + + @unittest.skipIf(not _check_constools(), + "bamtools or pbindex not found, skipping") def test_create_cli(self): log.debug("Absolute") outdir = tempfile.mkdtemp(suffix="dataset-unittest") @@ -214,6 +313,25 @@ class TestDataSet(unittest.TestCase): self.assertEqual(aln.totalLength, -1) self.assertEqual(aln.numRecords, -1) + # TODO: replace this with a reproducable bam file and move test upstream + @unittest.skip("Skip until suitable barcoded files found 
and updated") + def test_barcode_accession(self): + testFile = data.getBarcodedBam() + # Test the pbi file: + bam = IndexedBamReader(testFile) + pbi = PacBioBamIndex(testFile + '.pbi') + for brec, prec in zip(bam, pbi): + brec_bc = brec.peer.opt("bc") + prec_bc = [prec.bcLeft, prec.bcRight] + self.assertEqual(brec_bc, prec_bc) + + # Test split by barcode: + ss = SubreadSet(testFile) + sss = ss.split(chunks=2, barcodes=True) + self.assertEqual(len(sss), 2) + for sset in sss: + self.assertTrue(len(sset.barcodes) > 1) + def test_attributes(self): aln = AlignmentSet(data.getBam(0)) self.assertEqual(aln.sequencingChemistry, ['unknown']) @@ -375,6 +493,15 @@ class TestDataSet(unittest.TestCase): [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0', 55, 1815)]) + @unittest.skipUnless(os.path.isdir("/mnt/secondary-siv/testdata"), + "Missing testadata directory") + def test_large_pbi(self): + pbiFn = ('/mnt/secondary-siv/testdata/SA3-DS/lambda/simulated' + '/100Gb/alnsubreads/pbalchemy_100Gb_Seq_sim1_p0.' + 'aligned.bam.pbi') + pbi = PacBioBamIndex(pbiFn) + self.assertFalse(pbi.aStart is None) + def test_copy(self): ds1 = DataSet(data.getXml(12)) ds2 = ds1.copy() @@ -597,8 +724,9 @@ class TestDataSet(unittest.TestCase): num += 1 self.assertTrue(num > 2000) - @unittest.skipUnless(os.path.isdir("/mnt/secondary/Share/Quiver"), - "Missing testadata directory") + @unittest.skip("Skip until /mnt/secondary/Share/Quiver/ updated") + #@unittest.skipUnless(os.path.isdir("/mnt/secondary/Share/Quiver"), + # "Missing testadata directory") def test_reads_in_range_order_large(self): window = ('Staphylococcus_aureus_subsp_aureus_USA300_TCH1516', 558500, @@ -811,7 +939,7 @@ class TestDataSet(unittest.TestCase): dss = ds3.split(contigs=True, chunks=3, updateCounts=True) self.assertEqual(len(dss), 3) sizes = sorted([dset.numRecords for dset in dss]) - self.assertListEqual(sizes, [22, 31, 39]) + self.assertListEqual(sizes, [20, 24, 48]) refWindows = sorted(reduce(lambda x, y: x + y, [ds.refWindows for ds in dss])) for ref in random_few: @@ -933,25 +1061,24 @@ class TestDataSet(unittest.TestCase): log.debug(dss[0].filters) log.debug(dss[1].filters) self.assertTrue( - '( rname = E.faecalis.2 ) ' + '( rname = E.faecalis.2 ' in str(dss[0].filters) or - '( rname = E.faecalis.2 ) ' + '( rname = E.faecalis.2 ' in str(dss[1].filters)) ds = AlignmentSet(data.getBam()) - ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011'), - ('=', 'lambda_NEB3011')], - tStart=[('<', '0'), - ('<', '100')], - tEnd=[('>', '99'), - ('>', '299')]) + ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'), + ('=', 'E.faecalis.2')], + tStart=[('<', '99'), + ('<', '299')], + tEnd=[('>', '0'), + ('>', '100')]) self.assertEqual(str(ds.filters), - '( rname = lambda_NEB3011 AND tstart ' - '< 0 AND tend > 99 ) OR ( rname = lambd' - 'a_NEB3011 AND tstart < 100 AND tend > 299 )') - # TODO: look into swapping around refWindows to make this work: - #self.assertEqual(ds.refWindows, [('lambda_NEB3011', 0, 99), - #('lambda_NEB3011', 100, 299)]) + '( rname = E.faecalis.2 AND tstart ' + '< 99 AND tend > 0 ) OR ( rname = ' + 'E.faecalis.2 AND tstart < 299 AND tend > 100 )') + self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99), + ('E.faecalis.2', 100, 299)]) def test_refLengths(self): @@ -1064,6 +1191,28 @@ class TestDataSet(unittest.TestCase): for i, item in enumerate(aln): self.assertEqual(item, aln[i]) + @unittest.skipIf(not _check_constools(), + "bamtools or pbindex not found, skipping") + def test_induce_indices(self): + # all of our test 
files are indexed. Copy just the main files to a temp + # location, open as dataset, assert unindexed, open with + # generateIndices=True, assert indexed + toTest = [8, 9, 10, 11, 12, 15, 16] + for fileNo in toTest: + outdir = tempfile.mkdtemp(suffix="dataset-unittest") + orig_dset = openDataSet(data.getXml(fileNo)) + resfnames = orig_dset.toExternalFiles() + new_resfnames = [] + for fname in resfnames: + newfname = os.path.join(outdir, os.path.basename(fname)) + shutil.copy(fname, newfname) + new_resfnames.append(newfname) + dset = type(orig_dset)(*new_resfnames) + self.assertFalse(dset.isIndexed) + dset = type(orig_dset)(*new_resfnames, generateIndices=True) + self.assertTrue(dset.isIndexed) + + def test_reads_in_reference(self): ds = AlignmentSet(data.getBam()) refNames = ds.refNames diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py index 71d2869..3500d66 100644 --- a/tests/test_pbdataset_subtypes.py +++ b/tests/test_pbdataset_subtypes.py @@ -12,7 +12,7 @@ from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet, ReferenceSet, ContigSet, AlignmentSet, FastaReader, FastaWriter, IndexedFastaReader, HdfSubreadSet, ConsensusAlignmentSet, - openDataFile) + openDataFile, FastaWriter) import pbcore.data as upstreamData import pbcore.data.datasets as data from pbcore.io.dataset.DataSetValidator import validateXml @@ -21,6 +21,11 @@ import xml.etree.ElementTree as ET log = logging.getLogger(__name__) def _check_constools(): + cmd = "dataset.py" + o, r, m = backticks(cmd) + if r != 2: + return False + cmd = "bamtools" o, r, m = backticks(cmd) if r != 0: @@ -137,6 +142,22 @@ class TestDataSet(unittest.TestCase): self.assertEquals(type(ds2).__name__, 'ConsensusReadSet') self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata') + def test_ccsset_from_bam(self): + # DONE bug 28698 + ds1 = ConsensusReadSet(upstreamData.getCCSBAM(), strict=False) + fn = tempfile.NamedTemporaryFile(suffix=".consensusreadset.xml").name + log.debug(fn) + ds1.write(fn, validate=False) + ds1.write(fn) + + def test_subreadset_from_bam(self): + # DONE control experiment for bug 28698 + bam = upstreamData.getUnalignedBam() + ds1 = SubreadSet(bam, strict=False) + fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name + log.debug(fn) + ds1.write(fn) + def test_ccsalignment_build(self): ds1 = ConsensusAlignmentSet(data.getXml(20), strict=False) self.assertEquals(type(ds1).__name__, 'ConsensusAlignmentSet') @@ -146,6 +167,22 @@ class TestDataSet(unittest.TestCase): #self.assertEquals(type(ds2).__name__, 'ConsensusAlignmentSet') #self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata') + def test_contigset_write(self): + fasta = upstreamData.getLambdaFasta() + ds = ContigSet(fasta) + self.assertTrue(isinstance(ds.resourceReaders()[0], + IndexedFastaReader)) + outdir = tempfile.mkdtemp(suffix="dataset-unittest") + outfn = os.path.join(outdir, 'test.fasta') + w = FastaWriter(outfn) + for rec in ds: + w.writeRecord(rec) + w.close() + fas = FastaReader(outfn) + for rec in fas: + # make sure a __repr__ didn't slip through: + self.assertFalse(rec.sequence.startswith('<')) + def test_file_factory(self): # TODO: add ConsensusReadSet, cmp.h5 alignmentSet types = [AlignmentSet(data.getXml(8)), @@ -209,6 +246,13 @@ class TestDataSet(unittest.TestCase): self.assertEqual(read1, read2) self.assertEqual(len(aln), len(nonCons)) + # Test that it is a valid xml: + outdir = tempfile.mkdtemp(suffix="dataset-unittest") + datafile = os.path.join(outdir, "apimerged.bam") + xmlfile = 
os.path.join(outdir, "apimerged.xml") + log.debug(xmlfile) + aln.write(xmlfile) + log.debug("Test with cheap filter") aln = AlignmentSet(data.getXml(12)) self.assertEqual(len(list(aln)), 177) @@ -286,7 +330,7 @@ class TestDataSet(unittest.TestCase): "7920800000001823174110291514_s1_p0." "all.alignmentset.xml") aln = AlignmentSet(testFile) - nonCons= AlignmentSet(testFile) + nonCons = AlignmentSet(testFile) self.assertEqual(len(aln.toExternalFiles()), 3) outdir = tempfile.mkdtemp(suffix="dataset-unittest") outfn = os.path.join(outdir, 'merged.bam') @@ -381,9 +425,39 @@ class TestDataSet(unittest.TestCase): # open acc and compare to exp for name, seq in zip(singletons, exp_single_seqs): - self.assertEqual(acc_file.get_contig(name).sequence, seq) - self.assertEqual(acc_file.get_contig(double).sequence, exp_double_seq) + self.assertEqual(acc_file.get_contig(name).sequence[:], seq) + self.assertEqual(acc_file.get_contig(double).sequence[:], + exp_double_seq) + def test_contigset_consolidate_genomic_consensus(self): + """ + Verify that the contigs output by GenomicConsensus (e.g. quiver) can + be consolidated. + """ + FASTA1 = ("lambda_NEB3011_0_60", + "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG") + FASTA2 = ("lambda_NEB3011_120_180", + "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG") + FASTA3 = ("lambda_NEB3011_60_120", + "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT") + files = [] + for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]): + _files = [] + for suffix in ["", "|quiver", "|plurality", "|arrow"]: + tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name + with open(tmpfile, "w") as f: + f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq)) + _files.append(tmpfile) + files.append(_files) + for i in range(3): + ds = ContigSet(*[ f[i] for f in files ]) + out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name + fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name + ds.consolidate(fa1) + ds.write(out1) + with ContigSet(out1) as ds_new: + self.assertEqual(len([ rec for rec in ds_new ]), 1, + "failed on %d" % i) def test_split_hdfsubreadset(self): hdfds = HdfSubreadSet(*upstreamData.getBaxH5_v23()) -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit

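The consolidation changes to pbcore/io/dataset/utils.py add a guard that compares free disk space against the combined size of the input BAMs before staging them in a temporary directory. A rough sketch of that check under the same assumptions as the diff (Unix-only os.statvfs); stage_inputs is a hypothetical wrapper used here for illustration, not a pbcore function:

    import os
    import shutil
    import tempfile

    def disk_free(path):
        """Bytes available to the current user at path (Unix only)."""
        if path == '':
            path = os.getcwd()
        st = os.statvfs(path)
        return st.f_bavail * st.f_frsize

    def file_size(path):
        return os.stat(path).st_size

    def stage_inputs(in_files, out_file):
        """Copy inputs to a tmp dir when there is room; else use them in place."""
        out_dir = os.path.dirname(out_file) or '.'
        if disk_free(out_dir) <= sum(file_size(fn) for fn in in_files):
            return list(in_files)
        tmp_dir = tempfile.mkdtemp(suffix="consolidation-filtration")
        staged = []
        for fn in in_files:
            dest = os.path.join(tmp_dir, os.path.basename(fn))
            shutil.copy(fn, dest)
            staged.append(dest)
        return staged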