comparing with http://toolshed.g2.bx.psu.edu/repos/devteam/picard
searching for changes
changeset:   5:e195791371fd
user:        Brad Langhorst <langhorst@neb.com>
date:        Tue Feb 25 11:20:58 2014 -0500
summary:     adds support for RNA seq metrics tool

diff -r ab1f60c26526 -r e195791371fd picard_wrapper.py
--- a/picard_wrapper.py	Fri Feb 21 12:07:49 2014 -0500
+++ b/picard_wrapper.py	Tue Feb 25 11:20:58 2014 -0500
@@ -405,6 +405,12 @@
     # CollectGcBiasMetrics
     op.add_option('', '--windowsize', default='100')
     op.add_option('', '--mingenomefrac', default='0.00001')    
+    # CollectRnaSeqMetrics - command line options consumed by the 'CollectRnaSeqMetrics' branch of this wrapper
+    op.add_option('', '--refflat', default=None)  # forwarded to Picard as REF_FLAT
+    op.add_option('', '--ribosomalintervals', default=None)  # forwarded to Picard as RIBOSOMAL_INTERVALS
+    op.add_option('', '--strandspecificity', default='FIRST_READ_TRANSCRIPTION_STRAND')  # forwarded to Picard as STRAND_SPECIFICITY
+    op.add_option('', '--rrnaoverlappercent', default="0.8")  # NOTE(review): parsed but not forwarded to Picard in the visible CollectRnaSeqMetrics branch - confirm
+    op.add_option('', '--mintranscriptlength', default="500")  # NOTE(review): parsed but not forwarded to Picard in the visible CollectRnaSeqMetrics branch - confirm
     # AddOrReplaceReadGroups
     op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
     op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
@@ -614,7 +620,32 @@
         lf.write(s)
         lf.write('\n')
         lf.close()
-        
+
+    elif pic.picname == 'CollectRnaSeqMetrics':  # build the Picard CollectRnaSeqMetrics argument list in cl
+        assert os.path.isfile(ref_file_name),'Picard needs a reference sequence - cannot read %s' % ref_file_name
+        # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
+        # why? Dunno. So the reference is exposed under a fresh name inside opts.outdir (symlink if possible, copy otherwise).
+        fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
+        try:
+            os.symlink(ref_file_name,fakefasta)
+        except:  # deliberately broad best-effort: any symlink failure falls back to a full copy below
+            s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy' % (ref_file_name,fakefasta)  # fix: placeholders were never filled in
+            info = s
+            shutil.copy(ref_file_name,fakefasta)
+        pic.delme.append(fakefasta)  # queue the fake reference on pic.delme (presumably removed during cleanup - confirm)
+        x = 'rgPicardRnaSeqMetrics'
+        pdfname = '%s.pdf' % x
+        jpgname = '%s.jpg' % x  # NOTE(review): jpgname is not used inside this branch - confirm whether later code needs it
+        outname = '%s.out' % x
+        tempout = os.path.join(opts.outdir,outname)  # NOTE(review): not used inside this branch; OUTPUT below goes to pic.metricsOut instead
+        temppdf = os.path.join(opts.outdir,pdfname)  # NOTE(review): not used inside this branch; no CHART_OUTPUT is passed to Picard
+        cl.append('R=%s' % fakefasta)
+        cl.append('REF_FLAT=%s' % opts.refflat)  # --refflat defaults to None; the tool XML always supplies it
+        cl.append('RIBOSOMAL_INTERVALS=%s' % opts.ribosomalintervals )  # --ribosomalintervals defaults to None; the tool XML always supplies it
+        cl.append('STRAND_SPECIFICITY=%s' % opts.strandspecificity )
+        cl.append('INPUT=%s' % opts.input)
+        cl.append('OUTPUT=%s' % pic.metricsOut )  # NOTE(review): unlike the DownsampleSam branch, no pic.runPic(opts.jar, cl) call is visible here - confirm Picard is actually invoked
+
     elif pic.picname == 'CollectInsertSizeMetrics':
         """ <command interpreter="python">
    picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
diff -r ab1f60c26526 -r e195791371fd rgPicardRnaSeqMetrics.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rgPicardRnaSeqMetrics.xml	Tue Feb 25 11:20:58 2014 -0500
@@ -0,0 +1,156 @@
+<tool name="SAM/BAM RNA Seq Metrics" id="PicardRnaSeqMetrics" version="1.64.0">
+  <command interpreter="python">
+    picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file"
+    --refflat "$ref_flat" --ribosomalintervals "$ribosomal_intervals" --strandspecificity "$strand_specificity" -n "$out_prefix" --tmpdir "${__new_file_path__}"
+    -j ${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectRnaSeqMetrics.jar
+#if $genomeSource.refGenomeSource == "history":
+ --ref-file "${genomeSource.ownFile}"
+#else:
+ --ref "${genomeSource.index.fields.path}"
+#end if
+  </command>
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <inputs>
+    <param format="sam,bam" name="input_file" type="data" label="SAM/BAM dataset to generate metrics about"
+      help="If empty, upload or import a SAM/BAM dataset."/>
+    <param name="out_prefix" value="RNA Seq Metrics" type="text"
+      label="Title for the output file" help="Use this to remind you what the job was for." size="80" />
+
+    <conditional name="genomeSource">
+      <param name="refGenomeSource" type="select" label="Select Reference Genome">
+        <option value="default" selected="true">Use the assigned data genome/build</option>
+        <option value="indexed">Select a different built-in genome</option>
+        <option value="history">Use a genome (fasta format) from my history</option>
+      </param>
+      <when value="default">
+        <param name="index" type="select" label="Check the assigned reference genome" help="Galaxy thinks that the reads in your dataset were aligned against this reference. If this is not correct, use the 'Select a different built-in genome' option of the 'Select Reference Genome' dropdown to select the appropriate reference.">
+          <options from_data_table="all_fasta">
+          <filter type="data_meta" ref="input_file" key="dbkey" column="dbkey" multiple="True" separator=","/>
+          <validator type="no_options" message="No reference build available for the selected input data" />
+          </options>
+        </param>
+      </when>
+      <when value="indexed">
+        <param name="index" type="select" label="Select a built-in reference genome" help="This list contains genomes cached at this Galaxy instance. If your genome of interest is not present here request it by using 'Help' link at the top of Galaxy interface or use the 'Use a genome (fasta format) from my history' option of the 'Select Reference Genome' dropdown.">
+          <options from_data_table="all_fasta"/>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference genome from history" help="This option works best for relatively small genomes. If you are working with large human-sized genomes, send request to Galaxy team for adding your reference to this Galaxy instance by using 'Help' link at the top of Galaxy interface."/>
+      </when>
+    </conditional>
+    <param name="strand_specificity" type="select" label="Measure strand specificity">
+	<option value="NONE">Don't measure strand specificity</option>
+	<option value="FIRST_READ_TRANSCRIPTION_STRAND" selected="true">Expected on First Strand</option>
+	<option value="SECOND_READ_TRANSCRIPTION_STRAND">Expected on Second Strand</option>
+    </param>
+    <param name="ribosomal_intervals" format="picard_interval_list" type="data" label="List of ribosomal RNA intervals" />
+    <param name="ref_flat" format="tabular" type="data" label="Gene annotations in RefFlat format (from UCSC)"/>
+  </inputs>
+
+  <outputs>
+    <data format="html" name="html_file"  label="${out_prefix}.html"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="out_prefix" value="CollectRnaSeq" />
+      <param name="strand_specificity" value="NONE" />
+      <param name="ribosomal_intervals" file="picard_input_ribosomal_intervals.interval" />
+      <param name="ref_flat" file="picard_input_gene_model.refflat" />
+      <param name="refGenomeSource" value="history" />
+      <param name="ownFile" value="picard_input_hg18.trimmed.fasta" dbkey="hg18" />
+      <param name="input_file" value="picard_input_summary_alignment_stats.sam" ftype="sam" dbkey="hg18"/>
+      <output name="html_file" file="picard_output_GcBias_uploaded_hg18_summary_alignment_stats.html" ftype="html" lines_diff="50"/>
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**Summary**
+
+This Galaxy tool uses Picard to report detailed metrics about read positions across genes and various classes of sequence (rRNA, mRNA, UTR, etc.).
+
+**Picard documentation**
+
+This is a Galaxy wrapper for CollectRnaSeqMetrics, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+-----
+
+.. class:: infomark
+
+**Syntax**
+
+- **Input** - SAM/BAM format aligned short read data in your current history
+- **Title** - the title to use for all output files from this job - use it for high level metadata
+- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices:
+
+  - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome, it will be displayed below this dropdown. If it does not, use one of the following two options.
+  - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy.
+  - *Select a reference genome from history* - alternatively you can upload your own version of the reference genome into your history and use it with this option. This is however not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of the locally cached genomic references.
+
+- **Ribosomal Intervals** - a file in Interval format containing the list of ribosomal RNA regions
+- **Gene List** - a file in ref_flat format containing the list of genes and their positions (for normalized gene coverage by position)
+- **Strand specificity** - whether to calculate strand specificity, and which strand reads are expected on
+
+-----
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+The Picard documentation (reformatted for Galaxy) says:
+
+.. csv-table::
+   :header-rows: 1
+
+    Option,Description
+    "REFERENCE_SEQUENCE=File","The reference sequence fasta file. Required."
+    "REF_FLAT=File","Gene annotations in refFlat format(see http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat ). Required."
+    "INPUT=File","The BAM or SAM file containing aligned reads. Required."
+    "OUTPUT=File","The text file to write the metrics table to. Required."
+    "CHART_OUTPUT=File","The PDF file to render the chart to. Default value: null."
+    "SUMMARY_OUTPUT=File","The text file to write summary metrics to. Default value: null."
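+    "STRAND_SPECIFICITY=StrandSpecificity","For strand-specific library prep. Possible values: NONE, FIRST_READ_TRANSCRIPTION_STRAND, SECOND_READ_TRANSCRIPTION_STRAND. Default value: FIRST_READ_TRANSCRIPTION_STRAND"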
+    "RIBOSOMAL_INTERVALS=File","Specify this file to calculate the number of reads on known rRNA regions (a contaminant when mRNA is desired) Default value: null."
+
+
+The output produced by the tool has the following columns::
+
+  1. PF_BASES: Total bases in the input file
+  2. PF_ALIGNED_BASES: Total bases aligned to the reference
+  3. RIBOSOMAL_BASES: # of bases in the reference sequence annotated as ribosomal RNA
+  4. CODING_BASES: # of bases aligned to known genes
+  5. UTR_BASES:
+  6. INTRONIC_BASES: # of bases between exons
+  7. INTERGENIC_BASES: # of bases between genes
+  8. IGNORED_READS: # of reads that were ignored
+  9. CORRECT_STRAND_READS: # of reads on the expected strand
+ 10. INCORRECT_STRAND_READS: # of reads on the opposite strand
+ 11. PCT_RIBOSOMAL_BASES:
+ 12. PCT_CODING_BASES:
+ 13. PCT_UTR_BASES:
+ 14. PCT_INTRONIC_BASES:
+ 15. PCT_INTERGENIC_BASES:
+ 16. PCT_MRNA_BASES:
+ 17. PCT_USABLE_BASES:
+ 18. PCT_CORRECT_STRAND_READS:
+ 19. MEDIAN_CV_COVERAGE:
+ 20. MEDIAN_5PRIME_BIAS:
+ 21. MEDIAN_3PRIME_BIAS:
+ 22. MEDIAN_5PRIME_TO_3PRIME_BIAS:
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+  </help>
+</tool>
+

changeset:   6:00673f4996b7
tag:         tip
user:        Brad Langhorst <langhorst@neb.com>
date:        Tue Feb 25 11:38:15 2014 -0500
summary:     adds support for downsample sam tool

diff -r e195791371fd -r 00673f4996b7 picard_DownsampleSam.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_DownsampleSam.xml	Tue Feb 25 11:38:15 2014 -0500
@@ -0,0 +1,67 @@
+<tool name="Downsample SAM" id="picard_DownsampleSam" version="1.64.0">
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <command interpreter="python">
+    picard_wrapper.py
+      --input=$inputFile
+      --output=$outFile
+      --output-format=$outputFormat
+      --probability=$probability
+      --seed=$seed
+      -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/DownsampleSam.jar"
+  </command>
+  <inputs>
+    <param format="sam,bam" name="inputFile" type="data" label="SAM/BAM dataset to be downsampled"
+           help="If empty, upload or import a SAM/BAM dataset." />
+    <param name="probability" type="float" size="5" label="Probability (between 0 and 1) that any given read will be kept" help="specify 1 to keep all reads, 0.1 to keep 10% of the reads" value="1" />
+    <param name="seed" type="integer" size="5" label="Random seed value" help="(same seed + same probability = same set of reads kept)" value="1" />
+    <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output BAM instead of SAM" help="Uncheck for SAM output" />
+  </inputs>
+  <outputs>
+    <data name="outFile" format="bam" label="${tool.name} on ${on_string}: downsampled ${outputFormat}">
+      <change_format>
+        <when input="outputFormat" value="sam" format="sam" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <!-- Commands:
+      cp test-data/phiX.fasta .
+      samtools faidx phiX.fasta
+      java -jar CreateSequenceDictionary.jar R=phiX.fasta O=phiX.dict URI=phiX.fasta TRUNCATE_NAMES_AT_WHITESPACE=false SPECIES=phiX174
+      java -jar ReorderSam.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_RS_input1.bam O=picard_RS_output1.bam REFERENCE=phiX.fasta ALLOW_INCOMPLETE_DICT_CONCORDANCE=false ALLOW_CONTIG_LENGTH_DISCORDANCE=false
+    -->
+      <param name="inputFile" value="picard_RS_input1.sam" />
+      <param name="probability" value="0.1" />
+      <param name="seed" value="2" />
+      <output name="outFile" file="picard_RS_output1.sam" ftype="sam" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**Purpose**
+
+Randomly down-sample a SAM or BAM file to retain a random subset of the reads. Mate-pairs are either both kept or both discarded. Reads marked as not primary alignments are all discarded. Each read is given a probability P of being retained - results with the exact same input in the same order and with the same value for RANDOM_SEED will produce the same results.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for DownsampleSam, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+------
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+
+  </help>
+</tool>
+
diff -r e195791371fd -r 00673f4996b7 picard_wrapper.py
--- a/picard_wrapper.py	Tue Feb 25 11:20:58 2014 -0500
+++ b/picard_wrapper.py	Tue Feb 25 11:38:15 2014 -0500
@@ -790,6 +790,20 @@
         pic.delme.append(sortedfile) # not wanted
         stf.close()
         pic.cleanup()
+
+    elif pic.picname == "DownsampleSam":  # build and run the Picard DownsampleSam command
+        cl.append('I=%s' % opts.input)
+        cl.append('O=%s' % opts.output)
+
+        if float(opts.probability) > 0:  # NOTE(review): no add_option for --probability/--seed is visible in this changeset - confirm they are declared
+            cl.append('PROBABILITY=%s' % opts.probability)  # only forwarded when positive; otherwise the argument is omitted
+        if float(opts.seed) > 0:
+            cl.append('RANDOM_SEED=%s' % opts.seed)  # only forwarded when positive; otherwise the argument is omitted
+
+        stdouts,rval = pic.runPic(opts.jar, cl)
+        if opts.output_format == 'sam':  # fix: was indented 7 spaces, which is an IndentationError for the whole module
+            haveTempout = True  # fix: was lowercase 'true' (NameError); NOTE(review): haveTempout is consumed outside this hunk - confirm its effect
+
     else:
         print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
         sys.exit(1)

