This is an automated email from the git hooks/post-receive script. afif pushed a commit to branch master in repository pbgenomicconsensus.
commit db3cde2805ef6dbe0e35728aece4924f583061b0 Author: Afif Elghraoui <[email protected]> Date: Sat Jul 23 15:54:41 2016 -0700 Imported Upstream version 2.1.0 --- CHANGELOG | 6 + GenomicConsensus/__init__.py | 2 +- GenomicConsensus/arrow/arrow.py | 52 ++++++-- GenomicConsensus/arrow/evidence.py | 161 +++++++++++++----------- GenomicConsensus/arrow/model.py | 49 ++++---- GenomicConsensus/arrow/utils.py | 77 ++++++++---- GenomicConsensus/main.py | 11 +- GenomicConsensus/options.py | 10 +- circle.yml | 4 + doc/HowTo.rst | 8 +- tests/cram/bad_input.t | 2 +- tests/cram/extra/arrow-evidence.t | 28 +++++ tests/cram/internal/alignment_summary_scaling.t | 4 +- tests/cram/internal/arrow-staph.t | 14 +-- tests/cram/version.t | 2 +- tests/unit/test_algorithm_selection.py | 4 +- tests/unit/test_summarize_consensus.py | 2 +- tests/unit/test_tool_contract.py | 72 +++++------ 18 files changed, 315 insertions(+), 193 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 1e0cfea..eea0fd8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,9 @@ +Version 2.1.0 + * Major fixes for arrow + * Documentation for arrow + * Update for substantially-refactored ConsensusCore2 + * --dumpEvidence support for Arrow + Version 2.0.0 * Working support for Arrow and POA-only consensus models diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/__init__.py index 0e66fb7..68ac61b 100644 --- a/GenomicConsensus/__init__.py +++ b/GenomicConsensus/__init__.py @@ -30,4 +30,4 @@ # Author: David Alexander -__VERSION__ = "2.0.0" +__VERSION__ = "2.1.0" diff --git a/GenomicConsensus/arrow/arrow.py b/GenomicConsensus/arrow/arrow.py index ced2e25..5016ddf 100755 --- a/GenomicConsensus/arrow/arrow.py +++ b/GenomicConsensus/arrow/arrow.py @@ -30,7 +30,7 @@ # Authors: David Alexander, Lance Hepler -import logging +import logging, os.path import ConsensusCore2 as cc, numpy as np from .. import reference @@ -41,8 +41,9 @@ from ..ResultCollector import ResultCollectorProcess, ResultCollectorThread from GenomicConsensus.consensus import Consensus, ArrowConsensus, join from GenomicConsensus.windows import kSpannedIntervals, holes, subWindow from GenomicConsensus.variants import filterVariants, annotateVariants -from GenomicConsensus.arrow.evidence import dumpEvidence +from GenomicConsensus.arrow.evidence import ArrowEvidence from GenomicConsensus.arrow import diploid +from GenomicConsensus.utils import die import GenomicConsensus.arrow.model as M import GenomicConsensus.arrow.utils as U @@ -68,7 +69,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig, alnHits = U.readsInWindow(alnFile, refWindow, depthLimit=20000, minMapQV=arrowConfig.minMapQV, - strategy="longest", + strategy="long-and-strand-balanced", stratum=options.readStratum, barcode=options.barcode) starts = np.fromiter((hit.tStart for hit in alnHits), np.int) @@ -98,7 +99,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig, alns = U.readsInWindow(alnFile, subWin, depthLimit=depthLimit, minMapQV=arrowConfig.minMapQV, - strategy="longest", + strategy="long-and-strand-balanced", stratum=options.readStratum, barcode=options.barcode) clippedAlns_ = [ aln.clippedTo(*interval) for aln in alns ] @@ -133,14 +134,26 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig, variants += filteredVars # Dump? - shouldDumpEvidence = \ + maybeDumpEvidence = \ ((options.dumpEvidence == "all") or + (options.dumpEvidence == "outliers") or (options.dumpEvidence == "variants") and (len(variants) > 0)) - if shouldDumpEvidence: - logging.info("Arrow does not yet support --dumpEvidence") -# dumpEvidence(options.evidenceDirectory, -# subWin, windowRefSeq, -# clippedAlns, css) + if maybeDumpEvidence: + refId, refStart, refEnd = subWin + refName = reference.idToName(refId) + windowDirectory = os.path.join( + options.evidenceDirectory, + refName, + "%d-%d" % (refStart, refEnd)) + ev = ArrowEvidence.fromConsensus(css) + if options.dumpEvidence != "outliers": + ev.save(windowDirectory) + elif (np.max(np.abs(ev.delta)) > 20): + # Mathematically I don't think we should be seeing + # deltas > 6 in magnitude, but let's just restrict + # attention to truly bonkers outliers. + ev.save(windowDirectory) + else: css = ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus, subWin, intRefSeq) @@ -238,12 +251,29 @@ def configure(options, alnFile): if options.diploid: logging.warn("Diploid analysis not yet supported under Arrow model.") + # test available chemistries + supp = set(cc.SupportedChemistries()) + logging.info("Found consensus models for: ({0})".format(", ".join(sorted(supp)))) + + used = set(alnFile.sequencingChemistry) + if options.parametersSpec != "auto": + used = set([options.parametersSpec]) + + unsupp = used - supp + if unsupp: + die("Arrow: unsupported chemistries found: ({0})".format(", ".join(sorted(unsupp)))) + + logging.info("Using consensus models for: ({0})".format(", ".join(sorted(used)))) + return M.ArrowConfig(minMapQV=options.minMapQV, noEvidenceConsensus=options.noEvidenceConsensusCall, computeConfidence=(not options.fastMode), minReadScore=options.minReadScore, minHqRegionSnr=options.minHqRegionSnr, - minZScore=options.minZScore) + minZScore=options.minZScore, + minAccuracy=options.minAccuracy, + chemistryOverride=(None if options.parametersSpec == "auto" + else options.parametersSpec)) def slaveFactories(threaded): # By default we use slave processes. The tuple ordering is important. diff --git a/GenomicConsensus/arrow/evidence.py b/GenomicConsensus/arrow/evidence.py index 450f9c5..03d008f 100755 --- a/GenomicConsensus/arrow/evidence.py +++ b/GenomicConsensus/arrow/evidence.py @@ -30,8 +30,7 @@ # Authors: David Alexander, Lance Hepler -__all__ = [ "dumpEvidence", - "ArrowEvidence" ] +__all__ = [ "ArrowEvidence" ] import h5py, logging, os.path, numpy as np from collections import namedtuple @@ -41,60 +40,7 @@ from pbcore.io import FastaReader, FastaWriter from .utils import scoreMatrix from .. import reference -def dumpEvidence(evidenceDumpBaseDirectory, - refWindow, refSequence, alns, - arrowConsensus): - # Format of evidence dump: - # evidence_dump/ - # ref000001/ - # 0-1005/ - # reference.fa - # reads.fa - # consensus.fa - # arrow-scores.h5 - # 995-2005/ - # ... - join = os.path.join - refId, refStart, refEnd = refWindow - refName = reference.idToName(refId) - windowDirectory = join(evidenceDumpBaseDirectory, - refName, - "%d-%d" % (refStart, refEnd)) - logging.info("Dumping evidence to %s" % (windowDirectory,)) - - if os.path.exists(windowDirectory): - raise Exception, "Evidence dump does not expect directory %s to exist." % windowDirectory - os.makedirs(windowDirectory) - refFasta = FastaWriter(join(windowDirectory, "reference.fa")) - readsFasta = FastaWriter(join(windowDirectory, "reads.fa")) - consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa")) - - windowName = refName + (":%d-%d" % (refStart, refEnd)) - refFasta.writeRecord(windowName, refSequence) - refFasta.close() - - consensusFasta.writeRecord(windowName + "|arrow", arrowConsensus.sequence) - consensusFasta.close() - - rowNames, columnNames, baselineScores, scores = scoreMatrix(arrowConsensus.ai) - arrowScoreFile = h5py.File(join(windowDirectory, "arrow-scores.h5")) - arrowScoreFile.create_dataset("Scores", data=scores) - vlen_str = h5py.special_dtype(vlen=str) - arrowScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str) - arrowScoreFile.create_dataset("ColumnNames", data=columnNames, dtype=vlen_str) - arrowScoreFile.create_dataset("BaselineScores", data=baselineScores) - arrowScoreFile.close() - for aln in alns: - readsFasta.writeRecord(str(aln.rowNumber), - aln.read(orientation="genomic", aligned=False)) - readsFasta.close() - - class ArrowEvidence(object): - """ - An experimental reader class for arrow evidence dumps produced by - arrow --dumpEvidence - """ Mutation = namedtuple("Mutation", ("Position", "Type", "FromBase", "ToBase")) @@ -105,9 +51,9 @@ class ArrowEvidence(object): type, fromBase, _, toBase = fields[1:] return ArrowEvidence.Mutation(pos, type, fromBase, toBase) - def __init__(self, path, refStart, consensus, rowNames, colNames, baselineScores, scores): - self.path = path - self.refStart = refStart + def __init__(self, refWindow, consensus, rowNames, colNames, baselineScores, scores): + assert isinstance(consensus, str) + self.refWindow = refWindow # tuple(str, int, int) self.consensus = consensus self.rowNames = rowNames self.colNames = colNames @@ -115,6 +61,27 @@ class ArrowEvidence(object): self.scores = scores self.muts = map(ArrowEvidence._parseMutName, self.colNames) + @staticmethod + def fromConsensus(css): + rowNames, colNames, baselineScores, scores = scoreMatrix(css.ai) + return ArrowEvidence(css.refWindow, + css.sequence, + rowNames, + colNames, + baselineScores, + scores) + @property + def refName(self): + return self.refWindow[0] + + @property + def refStart(self): + return self.refWindow[1] + + @property + def refEnd(self): + return self.refWindow[2] + @property def positions(self): return [ mut.Position for mut in self.muts ] @@ -124,33 +91,75 @@ class ArrowEvidence(object): return sorted(list(set(self.positions))) @property - def totalScores(self): - return self.baselineScores[:, np.newaxis] + self.scores + def delta(self): + return self.scores - self.baselineScores[:, np.newaxis] @staticmethod - def load(path): - if path.endswith("/"): path = path[:-1] - - refWin_ = path.split("/")[-1].split("-") - refStart = int(refWin_[0]) - - with FastaReader(path + "/consensus.fa") as fr: + def load(dir): + """ + Load an ArrowEvidence from a directory + """ + if dir.endswith("/"): dir = dir[:-1] + refStart, refEnd = map(int, dir.split("/")[-1].split("-")) + refName = dir.split("/")[-2] + refWindow = (refName, refStart, refEnd) + with FastaReader(dir + "/consensus.fa") as fr: consensus = next(iter(fr)).sequence - - with h5py.File(path + "/arrow-scores.h5", "r") as f: + with h5py.File(dir + "/arrow-scores.h5", "r") as f: scores = f["Scores"].value baselineScores = f["BaselineScores"].value colNames = f["ColumnNames"].value rowNames = f["RowNames"].value - return ArrowEvidence(path, refStart, consensus, + return ArrowEvidence(refWindow, consensus, rowNames, colNames, baselineScores, scores) + def save(self, dir): + """ + Save this ArrowEvidence to a directory. The directory will be + *created* by this method. + + Format of evidence dump: + evidence_dump/ + ref000001/ + 0-1005/ + consensus.fa + arrow-scores.h5 + 995-2005/ + ... + """ + logging.info("Dumping evidence to %s" % (dir,)) + join = os.path.join + if os.path.exists(dir): + raise Exception, "Evidence dump does not expect directory %s to exist." % dir + os.makedirs(dir) + #refFasta = FastaWriter(join(dir, "reference.fa")) + #readsFasta = FastaWriter(join(dir, "reads.fa")) + consensusFasta = FastaWriter(join(dir, "consensus.fa")) + windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd)) + #refFasta.writeRecord(windowName, self.refSequence) + #refFasta.close() + + consensusFasta.writeRecord(windowName + "|arrow", self.consensus) + consensusFasta.close() + + arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5")) + arrowScoreFile.create_dataset("Scores", data=self.scores) + vlen_str = h5py.special_dtype(vlen=str) + arrowScoreFile.create_dataset("RowNames", data=self.rowNames, dtype=vlen_str) + arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str) + arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores) + arrowScoreFile.close() + # for aln in alns: + # readsFasta.writeRecord(str(aln.rowNumber), + # aln.read(orientation="genomic", aligned=False)) + # readsFasta.close() + + def forPosition(self, pos): posStart = bisect_left(self.positions, pos) posEnd = bisect_right(self.positions, pos) - return ArrowEvidence(self.path, - self.refStart, + return ArrowEvidence(self.refStart, self.consensus, self.rowNames, self.colNames[posStart:posEnd], @@ -160,8 +169,7 @@ class ArrowEvidence(object): def justSubstitutions(self): colMask = np.array(map(lambda s: ("Sub" in s), self.colNames)) - return ArrowEvidence(self.path, - self.refStart, + return ArrowEvidence(self.refStart, self.consensus, self.rowNames, self.colNames[colMask], @@ -169,5 +177,6 @@ class ArrowEvidence(object): self.scores[:, colMask]) def rowNumbers(self): - with FastaReader(self.path + "/reads.fa") as fr: - return [ int(ctg.name) for ctg in fr ] + # with FastaReader(self.dir + "/reads.fa") as fr: + # return [ int(ctg.name) for ctg in fr ] + raise NotImplementedError diff --git a/GenomicConsensus/arrow/model.py b/GenomicConsensus/arrow/model.py index 2138c65..a1c8f1d 100755 --- a/GenomicConsensus/arrow/model.py +++ b/GenomicConsensus/arrow/model.py @@ -64,7 +64,9 @@ class ArrowConfig(object): readStumpinessThreshold=0.1, minReadScore=0.75, minHqRegionSnr=3.75, - minZScore=-3.5): + minZScore=-3.5, + minAccuracy=0.82, + chemistryOverride=None): self.minMapQV = minMapQV self.minPoaCoverage = minPoaCoverage @@ -78,32 +80,37 @@ class ArrowConfig(object): self.minReadScore = minReadScore self.minHqRegionSnr = minHqRegionSnr self.minZScore = minZScore + self.minAccuracy = minAccuracy + self.chemistryOverride = chemistryOverride - @staticmethod - def extractFeatures(aln): - """ - Extract the data in a cmp.h5 alignment record into a - native-orientation gapless string. - """ - if isinstance(aln, CmpH5Alignment): - die("Arrow does not support CmpH5 files!") - else: - return aln.read(aligned=False, orientation="native") - - @staticmethod - def extractMappedRead(aln, windowStart): + def extractMappedRead(self, aln, windowStart): """ Given a clipped alignment, convert its coordinates into template space (starts with 0), bundle it up with its features as a MappedRead. """ + if isinstance(aln, CmpH5Alignment): + die("Arrow does not support CmpH5 files!") + assert aln.referenceSpan > 0 + + def baseFeature(featureName): + if aln.reader.hasBaseFeature(featureName): + rawFeature = aln.baseFeature(featureName, aligned=False, orientation="native") + return rawFeature.clip(0,255).astype(np.uint8) + else: + return np.zeros((aln.qLen,), dtype=np.uint8) + name = aln.readName chemistry = aln.sequencingChemistry - strand = cc.StrandEnum_REVERSE if aln.isReverseStrand else cc.StrandEnum_FORWARD - read = cc.Read(name, ArrowConfig.extractFeatures(aln), chemistry) - return (cc.MappedRead(read, - strand, - int(aln.referenceStart - windowStart), - int(aln.referenceEnd - windowStart)), - cc.SNR(aln.hqRegionSnr)) + strand = cc.StrandType_REVERSE if aln.isReverseStrand else cc.StrandType_FORWARD + read = cc.Read(name, + aln.read(aligned=False, orientation="native"), + cc.Uint8Vector(baseFeature("Ipd").tolist()), + cc.Uint8Vector(baseFeature("PulseWidth").tolist()), + cc.SNR(aln.hqRegionSnr), + chemistry if self.chemistryOverride is None else self.chemistryOverride) + return cc.MappedRead(read, + strand, + int(aln.referenceStart - windowStart), + int(aln.referenceEnd - windowStart)) diff --git a/GenomicConsensus/arrow/utils.py b/GenomicConsensus/arrow/utils.py index c8bbe8e..8277fd9 100755 --- a/GenomicConsensus/arrow/utils.py +++ b/GenomicConsensus/arrow/utils.py @@ -132,8 +132,8 @@ def refineConsensus(ai, arrowConfig): cfg = cc.PolishConfig(arrowConfig.maxIterations, arrowConfig.mutationSeparation, arrowConfig.mutationNeighborhood) - isConverged, nTested, nApplied = cc.Polish(ai, cfg) - return str(ai), isConverged + polishResult = cc.Polish(ai, cfg) + return str(ai), polishResult.hasConverged def consensusConfidence(ai, positions=None): """ @@ -142,7 +142,7 @@ def consensusConfidence(ai, positions=None): omitted, confidence values are returned for all positions in the consensus (str(ai)). """ - return np.array(np.clip(cc.ConsensusQVs(ai), 0, 93), dtype=np.uint8) + return np.array(np.clip(cc.ConsensusQualities(ai), 0, 93), dtype=np.uint8) def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None): """ @@ -233,9 +233,9 @@ def _shortMutationDescription(mut, tpl): """ _type = _typeMap[mut.Type] _pos = mut.Start() - _oldBase = "." if mut.Type() == cc.MutationType_INSERTION \ + _oldBase = "." if mut.Type == cc.MutationType_INSERTION \ else tpl[_pos] - _newBase = "." if mut.Type() == cc.MutationType_DELETION \ + _newBase = "." if mut.Type == cc.MutationType_DELETION \ else mut.Base return "%d %s %s > %s" % (_pos, _type, _oldBase, _newBase) @@ -253,16 +253,17 @@ def scoreMatrix(ai): """ css = str(ai) allMutations = sorted(allSingleBaseMutations(css)) - shape = (ai.NumReads(), len(allMutations)) + readNames = list(ai.ReadNames()) + numReads = len(readNames) + shape = (numReads, len(allMutations)) scoreMatrix = np.zeros(shape) for j, mut in enumerate(allMutations): - mutScores = ai.ReadLLs(mut) + mutScores = ai.LLs(mut) scoreMatrix[:, j] = mutScores - baselineScores = np.array(ai.ReadLLs()) - rowNames = [ ai.Read(i).Name - for i in xrange(ai.NumReads()) ] + baselineScores = np.array(ai.LLs()) columnNames = [ _shortMutationDescription(mut, css) for mut in allMutations ] + rowNames = readNames return (rowNames, columnNames, baselineScores, scoreMatrix) @@ -309,6 +310,24 @@ def filterAlns(refWindow, alns, arrowConfig): a.readScore >= arrowConfig.minReadScore ] +def sufficientlyAccurate(mappedRead, poaCss, minAccuracy): + if minAccuracy <= 0.0: + return True + s, e = mappedRead.TemplateStart, mappedRead.TemplateEnd + tpl = poaCss[s:e] + if mappedRead.Strand == cc.StrandType_FORWARD: + pass + elif mappedRead.Strand == cc.StrandType_REVERSE: + tpl = reverseComplement(tpl) + else: + return False + aln = cc.AlignLinear(tpl, mappedRead.Seq) + nErrors = sum(1 for t in aln.Transcript() if t != 'M') + tlen = len(tpl) + acc = 1.0 - 1.0 * min(nErrors, tlen) / tlen + return acc >= minAccuracy + + def consensusForAlignments(refWindow, refSequence, alns, arrowConfig): """ Call consensus on this interval---without subdividing the interval @@ -341,29 +360,38 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig): # coordinates relative to the POA consensus mappedReads = [ arrowConfig.extractMappedRead(aln, refStart) for aln in alns ] queryPositions = cc.TargetToQueryPositions(ga) - mappedReads = [ (lifted(queryPositions, mr), snr) for (mr, snr) in mappedReads ] + mappedReads = [ lifted(queryPositions, mr) for mr in mappedReads ] # Load the mapped reads into the mutation scorer, and iterate # until convergence. ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig(arrowConfig.minZScore)) coverage = 0 - for (mr, snr) in mappedReads: + for mr in mappedReads: if (mr.TemplateEnd <= mr.TemplateStart or mr.TemplateEnd - mr.TemplateStart < 2 or mr.Length() < 2): continue - coverage += 1 if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS else 0 - - # TODO(lhepler, dalexander): propagate coverage around somehow + if not sufficientlyAccurate(mr, poaCss, arrowConfig.minAccuracy): + tpl = poaCss[mr.TemplateStart:mr.TemplateEnd] + if mr.Strand == cc.StrandType_FORWARD: + pass + elif mr.Strand == cc.StrandType_REVERSE: + tpl = reverseComplement(tpl) + else: + tpl = "INACTIVE/UNMAPPED" + logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq)) + continue + if ai.AddRead(mr) == cc.State_VALID: + coverage += 1 # Iterate until covergence - try: - if coverage < arrowConfig.minPoaCoverage: - logging.info("%s: Inadequate coverage to call consensus" % (refWindow,)) - return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus, - refWindow, refSequence) - _, converged = refineConsensus(ai, arrowConfig) - assert converged, "Arrow did not converge to MLE" + if coverage < arrowConfig.minPoaCoverage: + logging.info("%s: Inadequate coverage to call consensus" % (refWindow,)) + return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus, + refWindow, refSequence) + _, converged = refineConsensus(ai, arrowConfig) + + if converged: arrowCss = str(ai) if arrowConfig.computeConfidence: confidence = consensusConfidence(ai) @@ -373,9 +401,8 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig): arrowCss, confidence, ai) - except: - traceback = ''.join(format_exception(*sys.exc_info())) - logging.info("%s: %s" % (refWindow, traceback)) + else: + logging.info("%s: Arrow did not converge to MLE" % (refWindow,)) return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus, refWindow, refSequence) diff --git a/GenomicConsensus/main.py b/GenomicConsensus/main.py index 4c5ca14..0bbf436 100644 --- a/GenomicConsensus/main.py +++ b/GenomicConsensus/main.py @@ -78,7 +78,7 @@ class ToolRunner(object): options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp") logging.info("Created temporary directory %s" % (options.temporaryDirectory,) ) - def _algorithmByName(self, name, cmpH5): + def _algorithmByName(self, name, peekFile): if name == "plurality": from GenomicConsensus.plurality import plurality algo = plurality @@ -88,14 +88,19 @@ class ToolRunner(object): elif name == "arrow": from GenomicConsensus.arrow import arrow algo = arrow + # All arrow models require PW except P6 and the first S/P1-C1 + if set(peekFile.sequencingChemistry) - set(["P6-C4", "S/P1-C1/beta"]): + if (not peekFile.hasBaseFeature("Ipd") or + not peekFile.hasBaseFeature("PulseWidth")): + die("Model requires missing base feature: IPD or PulseWidth") elif name == "poa": from GenomicConsensus.poa import poa algo = poa elif name == "best": logging.info("Identifying best algorithm based on input data") from GenomicConsensus import algorithmSelection - algoName = algorithmSelection.bestAlgorithm(cmpH5.sequencingChemistry) - return self._algorithmByName(algoName, cmpH5) + algoName = algorithmSelection.bestAlgorithm(peekFile.sequencingChemistry) + return self._algorithmByName(algoName, peekFile) else: die("Failure: unrecognized algorithm %s" % name) isOK, msg = algo.availability diff --git a/GenomicConsensus/options.py b/GenomicConsensus/options.py index 7303e89..127d1c9 100644 --- a/GenomicConsensus/options.py +++ b/GenomicConsensus/options.py @@ -90,6 +90,7 @@ class Constants(object): DEFAULT_MIN_READSCORE = 0.65 DEFAULT_MIN_HQREGIONSNR = 3.75 DEFAULT_MIN_ZSCORE = -3.5 + DEFAULT_MIN_ACCURACY = 0.82 def get_parser(): """ @@ -300,6 +301,13 @@ def add_options_to_argument_parser(parser): type=float, default=Constants.DEFAULT_MIN_ZSCORE, help="The minimum acceptable z-score for reads that will be used for analysis (arrow-only).") + readSelection.add_argument( + "--minAccuracy", + action="store", + dest="minAccuracy", + type=float, + default=Constants.DEFAULT_MIN_ACCURACY, + help="The minimum acceptable window-global alignment accuracy for reads that will be used for the analysis (arrow-only).") algorithm = parser.add_argument_group("Algorithm and parameter settings") algorithm.add_argument( @@ -349,7 +357,7 @@ def add_options_to_argument_parser(parser): nargs="?", default=None, const="variants", - choices=["variants", "all"]) + choices=["variants", "all", "outliers"]) debugging.add_argument( "--evidenceDirectory", default="evidence_dump") diff --git a/circle.yml b/circle.yml index b1db9e5..f1e3793 100644 --- a/circle.yml +++ b/circle.yml @@ -7,12 +7,15 @@ dependencies: - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get update - sudo apt-get install g++-4.8 + - curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash + - sudo apt-get install git-lfs=1.1.0 - if [ ! -d _deps ] ; then mkdir _deps ; fi # Create a directory for dependencies, These are static, cache them. - pushd _deps ; if [ ! -d cmake-3.3.0-Linux-x86_64 ] ; then wget --no-check-certificate http://www.cmake.org/files/v3.3/cmake-3.3.0-Linux-x86_64.tar.gz ; tar xzf cmake-3.3.0-Linux-x86_64.tar.gz ; fi - pushd _deps ; if [ ! -d boost_1_58_0 ] ; then wget http://downloads.sourceforge.net/project/boost/boost/1.58.0/boost_1_58_0.tar.bz2 ; tar xjf boost_1_58_0.tar.bz2 ; fi - pushd _deps ; if [ ! -f swig-3.0.8/bin/swig ] ; then rm -fr swig-3.0.8* ; mkdir dl ; pushd dl ; wget http://downloads.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz ; tar xzf swig-3.0.8.tar.gz ; pushd swig-3.0.8 ; ./configure --prefix $(readlink -f ../../swig-3.0.8) ; make ; make install ; fi - pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore.git - pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore2.git + - pushd _deps ; git clone https://github.com/PacificBiosciences/PacBioTestData.git - pip install --upgrade pip - pip install numpy - pip install cython @@ -24,6 +27,7 @@ dependencies: override: - pushd _deps/ConsensusCore ; python setup.py install --boost=$(readlink -f ../boost_1_58_0) --swig=$(readlink -f ../swig-3.0.8/bin/swig) - pushd _deps/ConsensusCore2 ; CC=gcc-4.8 CXX=g++-4.8 CMAKE_COMMAND=$(readlink -f ../cmake-3.3.0-Linux-x86_64/bin/cmake) Boost_INCLUDE_DIRS=$(readlink -f ../boost_1_58_0) SWIG_COMMAND=$(readlink -f ../swig-3.0.8/bin/swig) pip install --verbose --upgrade --no-deps . + - pushd _deps/PacBioTestData ; git lfs pull && make python test: pre: - pip install --verbose . diff --git a/doc/HowTo.rst b/doc/HowTo.rst index 538f707..0466d52 100644 --- a/doc/HowTo.rst +++ b/doc/HowTo.rst @@ -36,9 +36,14 @@ release of SMRTanalysis 3.0 or build from GitHub sources.* Running a large-scale resequencing/polishing job in SMRTanalysis 2.3 -------------------------------------------------------------------- +We do not recommend attempting to construct a single giant cmp.h5 file and +then processing it on a single node. This is inefficient and users attempting to do this +have run into many problems with the instability of the HDF5 library (which PacBio is +moving away from, in favor of BAM_.) + To run a large-scale resequencing job (>50 megabase genome @ 50x coverage,nominally), you want to spread the computation load across -multiple nodes in your computing cluster. +multiple nodes in your computing cluster. The `smrtpipe` workflow engine in SMRTanalysis 2.3 provides a convenient workflow automating this---it will automatically spread the @@ -126,3 +131,4 @@ Please consult the `FAQ document`_. .. _pitchfork : https://github.com/PacificBiosciences/pitchfork .. _`params.xml template`: ./params-template.xml .. _`SMRTpipe reference manual`: http://www.pacb.com/wp-content/uploads/2015/09/SMRT-Pipe-Reference-Guide.pdf +.. _`BAM`: http://pacbiofileformats.readthedocs.io/en/3.0/BAM.html diff --git a/tests/cram/bad_input.t b/tests/cram/bad_input.t index 30d59e9..74e49bb 100644 --- a/tests/cram/bad_input.t +++ b/tests/cram/bad_input.t @@ -7,5 +7,5 @@ Test for sane behavior in the presence of bogus arguments. Test that it doesn't crash when one BAM file in an otherwise valid AlignmentSet is empty. $ DATA=$TESTDIR/../data/sanity - $ REF="`python -c 'import pbcore.data ; print pbcore.data.getLambdaFasta()'`" + $ REF="`pbdata get lambda-fasta`" $ arrow --reference $REF -o contig.fasta $DATA/mixed.alignmentset.xml diff --git a/tests/cram/extra/arrow-evidence.t b/tests/cram/extra/arrow-evidence.t new file mode 100644 index 0000000..3c42cd1 --- /dev/null +++ b/tests/cram/extra/arrow-evidence.t @@ -0,0 +1,28 @@ +Get the arrow evidence dump and make sure it can be grokked. + + $ export DATA=$TESTDIR/../../data + $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam + $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa + +Run arrow w/ evidence dump + + $ arrow --dumpEvidence=all $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq + +Inspect the output... + + $ find evidence_dump | sort + evidence_dump + evidence_dump/All4mer.V2.01_Insert + evidence_dump/All4mer.V2.01_Insert/0-260 + evidence_dump/All4mer.V2.01_Insert/0-260/arrow-scores.h5 + evidence_dump/All4mer.V2.01_Insert/0-260/consensus.fa + + +Try to load it up using the API... + + $ python << EOF + > from GenomicConsensus.arrow.evidence import ArrowEvidence + > ev = ArrowEvidence.load("evidence_dump/All4mer.V2.01_Insert/0-260") + > assert 8*len(ev.consensus)==len(ev.colNames)==2080 + > assert ev.delta.shape == (len(ev.rowNames), len(ev.colNames))==(95,2080) + > EOF diff --git a/tests/cram/internal/alignment_summary_scaling.t b/tests/cram/internal/alignment_summary_scaling.t index e20e454..cbc3a1a 100644 --- a/tests/cram/internal/alignment_summary_scaling.t +++ b/tests/cram/internal/alignment_summary_scaling.t @@ -12,7 +12,7 @@ unless an O(N^2) loop is used. $ GFFREF=$TESTDATA/alignment_summary_variants.gff $ OUTPUT=alignment_summary_variants_test.gff $ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY - $ diff -I "##source-commandline" $OUTPUT $GFFREF + $ diff -I "##source" $OUTPUT $GFFREF Second test has 20000 regions in a single contig, and 10000 variants. This will also take several minutes. @@ -22,4 +22,4 @@ will also take several minutes. $ GFFREF=$TESTDATA/alignment_summary_variants_big_chr.gff $ OUTPUT=alignment_summary_variants_big_chr_test.gff $ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY - $ diff -I "##source-commandline" $OUTPUT $GFFREF + $ diff -I "##source" $OUTPUT $GFFREF diff --git a/tests/cram/internal/arrow-staph.t b/tests/cram/internal/arrow-staph.t index 99eb11c..433675d 100644 --- a/tests/cram/internal/arrow-staph.t +++ b/tests/cram/internal/arrow-staph.t @@ -14,19 +14,9 @@ Quiver does a good job here---no errors. $ fastacomposition quiver-css.fasta quiver-css.fasta A 960233 C 470725 G 470271 T 971458 -Arrow, on the other hand, makes a number of errors. +Arrow, since the SNR capping fix, also gets no errors. $ gffsubtract.pl arrow-variants.gff $MASK | grep -v "#" | sed 's/\t/ /g' - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 42450 42450 . . . reference=A;variantSeq=.;coverage=100;confidence=68 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 442861 442861 . . . reference=T;variantSeq=.;coverage=100;confidence=64 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 562107 562107 . . . reference=T;variantSeq=.;coverage=100;confidence=52 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 690814 690814 . . . reference=T;variantSeq=.;coverage=100;confidence=46 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1041877 1041877 . . . reference=A;variantSeq=.;coverage=100;confidence=45 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1801427 1801427 . . . reference=T;variantSeq=.;coverage=100;confidence=51 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1840635 1840635 . . . reference=A;variantSeq=.;coverage=100;confidence=88 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1850379 1850379 . . . reference=T;variantSeq=.;coverage=100;confidence=40 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2147449 2147449 . . . reference=T;variantSeq=.;coverage=100;confidence=93 - Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2231043 2231043 . . . reference=A;variantSeq=.;coverage=100;confidence=80 $ fastacomposition arrow-css.fasta - arrow-css.fasta A 960033 C 470641 G 470190 T 971286 a 178 c 84 g 80 t 158 + arrow-css.fasta A 960232 C 470725 G 470271 T 971457 diff --git a/tests/cram/version.t b/tests/cram/version.t index 7df81c8..2ec1942 100644 --- a/tests/cram/version.t +++ b/tests/cram/version.t @@ -2,7 +2,7 @@ This actually failed once because of a missing import, so we might as well test it. $ variantCaller --version - 2.0.0 + 2.1.0 This will break if the parser setup is messed up. diff --git a/tests/unit/test_algorithm_selection.py b/tests/unit/test_algorithm_selection.py index 65e8e89..8ffaf6b 100644 --- a/tests/unit/test_algorithm_selection.py +++ b/tests/unit/test_algorithm_selection.py @@ -10,5 +10,5 @@ def test_algorithm_selection(): EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1/beta"])) EQ(None, bestAlgorithm_(["P6-C4", "unknown"])) EQ("arrow", bestAlgorithm_(["S/P1-C1"])) - EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1"])) - EQ("arrow", bestAlgorithm_(["P5-C3", "S/P1-C1"])) # (Arrow pres. no training for P5. But it will tell us that) + EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1.1"])) + EQ("arrow", bestAlgorithm_(["P5-C3", "S/P1-C1.1"])) # (Arrow pres. no training for P5. But it will tell us that) diff --git a/tests/unit/test_summarize_consensus.py b/tests/unit/test_summarize_consensus.py index b1d2718..e335120 100644 --- a/tests/unit/test_summarize_consensus.py +++ b/tests/unit/test_summarize_consensus.py @@ -31,7 +31,7 @@ chr2\t.\tregion\t10000\t23469\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1 chr3\t.\tregion\t1\t7000\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1,24""" EXPECTED = """\ -##source GenomicConsensus 2.0.0 +##source GenomicConsensus 2.1.0 ##pacbio-alignment-summary-version 0.6 ##source-commandline this line will be skipped in the comparison chr1\t.\tregion\t1\t5000\t0.00\t+\t.\tcov=4,23,28;cov2=20.162,5.851;gaps=0,0;cQv=20,20,20;del=0;ins=0;sub=1 diff --git a/tests/unit/test_tool_contract.py b/tests/unit/test_tool_contract.py index 357ada3..33de349 100644 --- a/tests/unit/test_tool_contract.py +++ b/tests/unit/test_tool_contract.py @@ -3,43 +3,46 @@ import unittest import os.path from pbcore.io import openDataSet, ContigSet -import pbcore.data import pbcommand.testkit -# XXX local data directory, absolutely required +import pbtestdata + DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data") assert os.path.isdir(DATA_DIR) -# optional (but required for TestSummarizeConsensus) -DATA_DIR_2 = "/mnt/secondary/Share/Quiver/TestData/tinyLambda/" - -class TestVariantCaller(pbcommand.testkit.PbTestApp): - DRIVER_BASE = "variantCaller " - DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract " - DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract " - REQUIRES_PBCORE = True - INPUT_FILES = [ - pbcore.data.getBamAndCmpH5()[0], -# "/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/2372215/0007_tiny/Alignment_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.alignmentset.xml", - pbcore.data.getLambdaFasta() - ] - TASK_OPTIONS = { - "genomic_consensus.task_options.min_coverage": 0, - "genomic_consensus.task_options.min_confidence": 0, - "genomic_consensus.task_options.algorithm": "quiver", - "genomic_consensus.task_options.diploid": False, - } - - def run_after(self, rtc, output_dir): - contigs_file = rtc.task.output_files[1] - with openDataSet(contigs_file, strict=True) as ds: - self.assertTrue(isinstance(ds, ContigSet)) - - -class TestVariantCallerArrow(TestVariantCaller): - TASK_OPTIONS = { - "genomic_consensus.task_options.algorithm": "arrow", - } +# These tests seem to cause some logging failure at shutdown; +# disabling them pending upstream fix. See: +# https://bugzilla.nanofluidics.com/show_bug.cgi?id=33699 + +# class TestVariantCaller(pbcommand.testkit.PbTestApp): +# DRIVER_BASE = "variantCaller " +# DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract " +# DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract " +# REQUIRES_PBCORE = True +# INPUT_FILES = [ +# pbtestdata.get_file("aligned-xml"), pbtestdata.get_file("lambdaNEB") +# ] +# TASK_OPTIONS = { +# "genomic_consensus.task_options.min_coverage": 0, +# "genomic_consensus.task_options.min_confidence": 0, +# "genomic_consensus.task_options.algorithm": "quiver", +# "genomic_consensus.task_options.diploid": False, +# } + +# def test_run_e2e(self): +# import ipdb; ipdb.set_trace() +# super(TestVariantCaller, self).test_run_e2e() + +# def run_after(self, rtc, output_dir): +# contigs_file = rtc.task.output_files[1] +# with openDataSet(contigs_file, strict=True) as ds: +# self.assertTrue(isinstance(ds, ContigSet)) + + +# class TestVariantCallerArrow(TestVariantCaller): +# TASK_OPTIONS = { +# "genomic_consensus.task_options.algorithm": "arrow", +# } class TestGffToBed(pbcommand.testkit.PbTestApp): @@ -70,15 +73,14 @@ class TestGffToVcf(pbcommand.testkit.PbTestApp): } [email protected](os.path.isdir(DATA_DIR_2), "Missing %s" % DATA_DIR_2) class TestSummarizeConsensus(pbcommand.testkit.PbTestApp): DRIVER_BASE = "summarizeConsensus" DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract " DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract " REQUIRES_PBCORE = True INPUT_FILES = [ - os.path.join(DATA_DIR_2, "alignment_summary.gff"), - os.path.join(DATA_DIR_2, "variants.gff.gz") + pbtestdata.get_file("alignment-summary-gff"), + pbtestdata.get_file("variants-gff") ] TASK_OPTIONS = {} -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbgenomicconsensus.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
