This is an automated email from the git hooks/post-receive script. misterc-guest pushed a commit to annotated tag upstream/3.0.0+dfsg in repository transdecoder.
commit f9ac657db54bd4d1eb376b80ea796e2e4300cfc3 Author: Michael R. Crusoe <[email protected]> Date: Wed May 18 02:30:59 2016 -0700 Imported Upstream version 3.0.0+dfsg --- .gitmodules | 3 + PerlLib/Gene_obj.pm | 20 +++ TransDecoder.LongOrfs | 81 +++++++-- TransDecoder.Predict | 99 ++++++++--- notes | 3 + sample_data/Makefile | 14 ++ sample_data/cufflinks_example/Makefile | 6 + sample_data/cufflinks_example/blastp.outfmt6.gz | Bin 0 -> 1957 bytes .../cufflinks_example/blastp.results.outfmt6.gz | Bin 1808 -> 0 bytes sample_data/cufflinks_example/cleanme.pl | 7 +- sample_data/cufflinks_example/pfam.domtblout.gz | Bin 12838 -> 12460 bytes sample_data/cufflinks_example/runMe.sh | 41 +++-- sample_data/pasa_example/Makefile | 6 + sample_data/pasa_example/cleanme.pl | 2 + sample_data/pasa_example/pasa_assemblies.fasta.gz | Bin 304896 -> 304706 bytes sample_data/pasa_example/pasa_assemblies.gff3.gz | Bin 55455 -> 55467 bytes .../pasa_example/pasa_assemblies_described.txt.gz | Bin 0 -> 231156 bytes sample_data/pasa_example/runMe.sh | 17 +- sample_data/simple_transcriptome_target/Makefile | 6 + sample_data/simple_transcriptome_target/cleanme.pl | 1 + sample_data/simple_transcriptome_target/runMe.sh | 4 +- util/cdna_alignment_orf_to_genome_orf.pl | 62 ++++--- util/remove_eclipsed_ORFs.pl | 4 +- util/single_best_ORF_per_transcript.pl | 187 +++++++++++++++++++++ 24 files changed, 472 insertions(+), 91 deletions(-) diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..70b3ae8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "TransDecoder.lrgTests"] + path = TransDecoder.lrgTests + url = https://github.com/TransDecoder/TransDecoder.lrgTests.git diff --git a/PerlLib/Gene_obj.pm b/PerlLib/Gene_obj.pm index 990011c..86ea2e4 100755 --- a/PerlLib/Gene_obj.pm +++ b/PerlLib/Gene_obj.pm @@ -4450,6 +4450,26 @@ sub has_CDS { } +#### +sub set_CDS_phases_from_init_phase { + my ($self, $init_phase) = @_; + + my @exons = $self->get_exons(); + + my $curr_cds_len = $init_phase; + + foreach my $exon (@exons) { + if (my $cds = $exon->get_CDS_obj()) { + $cds->set_phase($curr_cds_len % 3); + my $cds_len = $cds->length(); + $curr_cds_len += $cds_len; + } + } + + return; +} + + sub set_CDS_phases { my ($self, $genomic_seq_ref) = @_; diff --git a/TransDecoder.LongOrfs b/TransDecoder.LongOrfs index 68b30a4..8935687 100755 --- a/TransDecoder.LongOrfs +++ b/TransDecoder.LongOrfs @@ -16,6 +16,8 @@ Required: Optional: + --gene_trans_map <string> gene-to-transcript identifier mapping file (tab-delimited, gene_id<tab>trans_id<return> ) + -m <int> minimum protein length (default: 100) -G <string> genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia) @@ -83,7 +85,7 @@ my $workdir; my $verbose; my $search_pfam = ""; my ($reuse,$pfam_out); - +my $gene_trans_map_file; my $MPI_DEBUG = 1; @@ -93,8 +95,9 @@ my $MPI_DEBUG = 1; 'h' => \$help, 'v' => \$verbose, 'S' => \$TOP_STRAND_ONLY, - 'p=i' => \$preserve_full_lengths - ); + 'p=i' => \$preserve_full_lengths, + 'gene_trans_map=s' => \$gene_trans_map_file, + ); @@ -116,6 +119,20 @@ if ($genetic_code ne 'universal') { main: { + + my %gene_trans_map; + if ($gene_trans_map_file) { + open(my $fh, $gene_trans_map_file) or die "Error, cannot open file $gene_trans_map_file"; + while (<$fh>) { + chomp; + my ($gene_id, $trans_id) = split(/\t/); + $gene_trans_map{$trans_id} = $gene_id; + } + close $fh; + } + + + my $workdir = basename($transcripts_file) . ".transdecoder_dir"; unless (-d $workdir) { mkdir($workdir) or die "Error, cannot mkdir $workdir."; @@ -175,9 +192,11 @@ main: { my @orf_structs = $longest_orf_finder->capture_all_ORFs($sequence); @orf_structs = reverse sort {$a->{length}<=>$b->{length}} @orf_structs; - print "checking for fp in $acc \n" if ($SEE); - $longest_orf_finder->check_for_fp_5prime_partials($MIN_PROT_LENGTH,\@orf_structs,$preserve_full_lengths) - if(defined($preserve_full_lengths)); + print "checking for fp in $acc \n" if ($SEE); + + if (defined($preserve_full_lengths)) { + $longest_orf_finder->check_for_fp_5prime_partials($MIN_PROT_LENGTH,\@orf_structs,$preserve_full_lengths) + } while (@orf_structs) { my $orf = shift @orf_structs; @@ -229,19 +248,32 @@ main: { $model_counter++; - my $model_id = "$acc|m.$model_counter"; - my $gene_id = "$acc|g.$model_counter"; + + my $gene_id; + if (%gene_trans_map) { + $gene_id = $gene_trans_map{$acc} or die "Error, cannot locate gene identifier for transcript acc: [$acc]"; + } + elsif (my $parsed_gene_id = &try_parse_gene_id_from_acc($acc)) { + $gene_id = $parsed_gene_id; + } + else { + $gene_id = "Gene.$model_counter"; + } + ## gene ID and model ID must be unique for each entry, but also decipherable for later on. + $gene_id = "${gene_id}::${acc}::g.$model_counter"; + my $model_id = "${gene_id}::m.$model_counter"; + $gene_obj->{TU_feat_name} = $gene_id; $gene_obj->{Model_feat_name} = $model_id; - - + my $cds = $gene_obj->create_CDS_sequence(\$sequence); - + $gene_obj->set_CDS_phases(\$sequence); + unless ($cds) { die "Error, no CDS for gene: " . Dumper($cds_coords_href) . Dumper($exon_coords_href); } - + my $got_start = 0; my $got_stop = 0; if ($protein =~ /^M/) { @@ -262,22 +294,24 @@ main: { $prot_type = "internal"; } - $gene_obj->{com_name} = "ORF $gene_id $model_id type:$prot_type len:$length ($orient)"; + $gene_obj->{com_name} = "ORF type:$prot_type len:$length ($orient)"; # this header is identical between CDS and PEP (since PEP is just a direct translation of CDS for a specific translation table) # we are currently not printing this out at the final data but it would be nice to. - my $pep_header = ">$model_id $gene_id type:$prot_type len:$length gc:$genetic_code $acc:$start-$stop($orient)\n"; - my $cds_header = ">$model_id $gene_id type:$prot_type len:$length $acc:$start-$stop($orient)\n"; + my $pep_header = ">$model_id type:$prot_type len:$length gc:$genetic_code $acc:$start-$stop($orient)\n"; + my $cds_header = ">$model_id type:$prot_type len:$length $acc:$start-$stop($orient)\n"; print PEP $pep_header."$protein\n"; print CDS $cds_header."$cds\n"; + + print GFF $gene_obj->to_GFF3_format(source => "transdecoder") . "\n"; } } - + close PEP; close CDS; close GFF; @@ -313,3 +347,18 @@ sub process_cmd { } +#### +sub try_parse_gene_id_from_acc { + my ($acc) = @_; + + my $gene_id; + + if ($acc =~ /^(\S+_c\d+_g\d+)_i\d+$/) { + $gene_id = $1; + } + elsif ($acc =~ /^(\S+_c\d+)_seq\d+$/) { + $gene_id = $1; + } + + return($gene_id); +} diff --git a/TransDecoder.Predict b/TransDecoder.Predict index a94080a..838aea4 100755 --- a/TransDecoder.Predict +++ b/TransDecoder.Predict @@ -17,10 +17,14 @@ Common options: --retain_long_orfs <int> retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence marks it as coding (default: 900 bp => 300aa) - --retain_pfam_hits <string> /path/to/pfam_db.hmm to search - using hmmscan (which should be accessible via your PATH setting) + --retain_pfam_hits <string> domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info) + Any ORF with a pfam domain hit will be retained in the final output. - --retain_blastp_hits <string> + --retain_blastp_hits <string> blastp output in '-outfmt 6' format. + Any ORF with a blast match will be retained in the final output. + + --single_best_orf Retain only the single best ORF per transcript. + (Best is defined as having (optionally pfam and/or blast support) and longest orf) --cpu <int> Use multipe cores for cd-hit-est. (default=1) @@ -78,6 +82,7 @@ my $retain_pfam_hits_file; my $retain_blastp_hits_file; my $cpu = 1; my $MPI_DEBUG = 1; +my $single_best_orf_flag = 0; &GetOptions( 't=s' => \$transcripts_file, 'train:s' => \$train_file, @@ -97,6 +102,8 @@ my $MPI_DEBUG = 1; 'retain_pfam_hits=s' => \$retain_pfam_hits_file, 'retain_blastp_hits=s' => \$retain_blastp_hits_file, 'cpu=i' => \$cpu, + + 'single_best_orf' => \$single_best_orf_flag, ); @@ -192,35 +199,59 @@ main: { # get accs for best entries my $acc_file = "$cds_file.scores.selected"; { + + my %att_counter; + open (my $ofh, ">$acc_file") or die "Error, cannot write to $acc_file"; open (my $ifh, "$cds_file.scores") or die "Error, cannot open file $cds_file.scores"; while (<$ifh>) { chomp; my ($acc, $orf_length, @scores) = split(/\t/); + + my @ATTS; my $score_1 = shift @scores; my $max_score_other_frame = max(@scores); - if ($has_pfam_hit{$acc} - || - $has_blastp_hit{$acc} - || - $orf_length >= $RETAIN_LONG_ORFS - || - ($score_1 > 0 && $score_1 > $max_score_other_frame) - ) { + + my $keep_flag = 0; + + if ($has_pfam_hit{$acc}) { + $keep_flag = 1; + push (@ATTS, "PFAM"); + print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose; + } + if ($has_blastp_hit{$acc}) { + $keep_flag = 1; + push (@ATTS, "BLASTP"); + print STDERR "-$acc flagged as having a blastp match.\n" if $verbose; + } + if ($orf_length >= $RETAIN_LONG_ORFS) { + $keep_flag = 1; + push (@ATTS, "LONGORF"); + } + if ($score_1 > 0 && $score_1 > $max_score_other_frame) { + $keep_flag = 1; + push (@ATTS, "FRAMESCORE"); + } + + if ($keep_flag) { print $ofh "$acc\n"; - - if ($has_pfam_hit{$acc}) { - print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose; - } - if ($has_blastp_hit{$acc}) { - print STDERR "-$acc flagged as having a blastp match.\n" if $verbose; - } - + + my $att_string = join("|", sort @ATTS); + $att_counter{$att_string}++; } } close $ifh; close $ofh; + + + # report on the categories of the selected ORFs + print STDERR "\n#####################\nCounts of kept entries according to attributes:\n"; + foreach my $att (reverse sort {$att_counter{$a}<=>$att_counter{$b}} keys %att_counter) { + my $count = $att_counter{$att}; + print STDERR join("\t", $att, $count) . "\n"; + } + print STDERR "########################\n\n\n"; } # index the current gff file: @@ -241,8 +272,30 @@ main: { { # exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs) - $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $final_output_prefix.gff3"; + $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $cds_file.eclipsed_removed.gff3"; + + my $target_final_file = "$cds_file.eclipsed_removed.gff3"; + &process_cmd($cmd); + + + if ($single_best_orf_flag) { + $cmd = "$UTIL_DIR/single_best_ORF_per_transcript.pl "; + if ($retain_blastp_hits_file) { + $cmd .= " --blast_hits $retain_blastp_hits_file "; + } + if ($retain_pfam_hits_file) { + $cmd .= " --pfam_hits $retain_pfam_hits_file "; + } + $cmd .= " --gff3_file $cds_file.eclipsed_removed.gff3 > $cds_file.single_best_orf.gff3"; + + &process_cmd($cmd); + + $target_final_file = "$cds_file.single_best_orf.gff3"; + } + + + &process_cmd("cp $target_final_file $final_output_prefix.gff3"); ## write final outputs: @@ -268,12 +321,6 @@ main: { $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file CDS > $best_cds_file"; &process_cmd($cmd); - # make a CDS file: - my $best_cdna_file = $best_pep_file; - $best_cdna_file =~ s/\.pep$/\.mRNA/; - $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file cDNA > $best_cdna_file"; - &process_cmd($cmd); - } print STDERR "transdecoder is finished. See output files $final_output_prefix.\*\n\n\n"; diff --git a/notes b/notes index 7806c64..8cd2de1 100644 --- a/notes +++ b/notes @@ -1 +1,4 @@ git clone https://github.com/TransDecoder/TransDecoder.git + +GeneID paper: +http://genome.cshlp.org/content/10/4/511.long diff --git a/sample_data/Makefile b/sample_data/Makefile new file mode 100644 index 0000000..451b9a5 --- /dev/null +++ b/sample_data/Makefile @@ -0,0 +1,14 @@ + +DIRS=cufflinks_example pasa_example simple_transcriptome_target + +test: + @for i in $(DIRS); do \ + echo "Running example in $$$i..."; \ + (cd $$i; $(MAKE) test) || exit $$?; done + + +clean: + @for i in $(DIRS); do \ + echo "Running example in $$i..."; \ + (cd $$i; $(MAKE) clean) || exit $$?; done + diff --git a/sample_data/cufflinks_example/Makefile b/sample_data/cufflinks_example/Makefile new file mode 100644 index 0000000..4ab2c6d --- /dev/null +++ b/sample_data/cufflinks_example/Makefile @@ -0,0 +1,6 @@ +test: + ./runMe.sh + +clean: + ./cleanme.pl + diff --git a/sample_data/cufflinks_example/blastp.outfmt6.gz b/sample_data/cufflinks_example/blastp.outfmt6.gz new file mode 100644 index 0000000..e3824ef Binary files /dev/null and b/sample_data/cufflinks_example/blastp.outfmt6.gz differ diff --git a/sample_data/cufflinks_example/blastp.results.outfmt6.gz b/sample_data/cufflinks_example/blastp.results.outfmt6.gz deleted file mode 100644 index 09e79bf..0000000 Binary files a/sample_data/cufflinks_example/blastp.results.outfmt6.gz and /dev/null differ diff --git a/sample_data/cufflinks_example/cleanme.pl b/sample_data/cufflinks_example/cleanme.pl index f5f72ad..1dfa3d9 100755 --- a/sample_data/cufflinks_example/cleanme.pl +++ b/sample_data/cufflinks_example/cleanme.pl @@ -17,11 +17,12 @@ my @files_to_keep = qw (cleanme.pl test.tophat.sam.gz transcripts.gtf.gz - +blastp.outfmt6.gz pfam.domtblout.gz -blastp.results.outfmt6.gz - ); +Makefile + + ); my %keep = map { + $_ => 1 } @files_to_keep; diff --git a/sample_data/cufflinks_example/pfam.domtblout.gz b/sample_data/cufflinks_example/pfam.domtblout.gz index 9b7af55..7a22fa6 100644 Binary files a/sample_data/cufflinks_example/pfam.domtblout.gz and b/sample_data/cufflinks_example/pfam.domtblout.gz differ diff --git a/sample_data/cufflinks_example/runMe.sh b/sample_data/cufflinks_example/runMe.sh index fc27d70..450faa2 100755 --- a/sample_data/cufflinks_example/runMe.sh +++ b/sample_data/cufflinks_example/runMe.sh @@ -1,22 +1,21 @@ -#!/bin/bash -e +#!/bin/bash -ve -if [ -e test.genome.fasta.gz ] && [ ! -e test.genome.fasta ]; then +export PERL_HASH_SEED=0 + +if [ ! -e test.genome.fasta ]; then gunzip -c test.genome.fasta.gz > test.genome.fasta fi -if [ -e test.tophat.sam.gz ] && [ ! -e test.tophat.sam ]; then - gunzip -c test.tophat.sam.gz > test.tophat.sam -fi -if [ -e transcripts.gtf.gz ] && [ ! -e transcripts.gtf ]; then +if [ ! -e transcripts.gtf ]; then gunzip -c transcripts.gtf.gz > transcripts.gtf fi -if [ -e blastp.results.outfmt6.gz ] && [ ! -e blastp.results.outfmt6 ]; then - gunzip -c blastp.results.outfmt6.gz > blastp.results.outfmt6 +if [ ! -e blastp.outfmt6 ]; then + gunzip -c blastp.outfmt6.gz > blastp.outfmt6 fi -if [ -e pfam.domtblout.gz ] && [ ! -e pfam.domtblout ]; then +if [ ! -e pfam.domtblout ]; then gunzip -c pfam.domtblout.gz > pfam.domtblout fi @@ -32,9 +31,21 @@ fi ## Predict likely ORFs -if [ $1 ]; then +if [ 1 ]; then # always doing this now. + + # this is how I would have run blast and pfam but I'm using precomputed results for ease of demonstration. + #BLASTDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/uniprot_sprot.pep + #PFAMDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/Pfam-A.hmm + # + ## run blast + #blastp -query transcripts.fasta.transdecoder_dir/longest_orfs.pep -db $BLASTDB -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6 + # + ## run pfam + #hmmscan --domtblout pfam.domtblout $PFAMDB transcripts.fasta.transdecoder_dir/longest_orfs.pep > pfam.log + + ## use pfam and blast results: - ../../TransDecoder.Predict -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.results.outfmt6 -v + ../../TransDecoder.Predict -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6 --single_best_orf -v else # just coding metrics ../../TransDecoder.Predict -t transcripts.fasta @@ -52,10 +63,8 @@ fi # convert the genome-based gene-gff3 file to bed ../../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed -echo -echo -echo Done! Coding region genome annotations provided as: best_candidates.eclipsed_orfs_removed.genome.gff3 -echo -echo + +# Done! Coding region genome annotations provided as: transcripts.fasta.transdecoder.genome.\* + exit 0 diff --git a/sample_data/pasa_example/Makefile b/sample_data/pasa_example/Makefile new file mode 100644 index 0000000..4ab2c6d --- /dev/null +++ b/sample_data/pasa_example/Makefile @@ -0,0 +1,6 @@ +test: + ./runMe.sh + +clean: + ./cleanme.pl + diff --git a/sample_data/pasa_example/cleanme.pl b/sample_data/pasa_example/cleanme.pl index 88f5233..2278c9a 100755 --- a/sample_data/pasa_example/cleanme.pl +++ b/sample_data/pasa_example/cleanme.pl @@ -17,6 +17,8 @@ genome.fasta.gz pasa_assemblies.fasta.gz pasa_assemblies.gff3.gz runMe.sh +pasa_assemblies_described.txt.gz +Makefile ); diff --git a/sample_data/pasa_example/pasa_assemblies.fasta.gz b/sample_data/pasa_example/pasa_assemblies.fasta.gz index 3eca927..0da5588 100644 Binary files a/sample_data/pasa_example/pasa_assemblies.fasta.gz and b/sample_data/pasa_example/pasa_assemblies.fasta.gz differ diff --git a/sample_data/pasa_example/pasa_assemblies.gff3.gz b/sample_data/pasa_example/pasa_assemblies.gff3.gz index bdb2aac..c635200 100644 Binary files a/sample_data/pasa_example/pasa_assemblies.gff3.gz and b/sample_data/pasa_example/pasa_assemblies.gff3.gz differ diff --git a/sample_data/pasa_example/pasa_assemblies_described.txt.gz b/sample_data/pasa_example/pasa_assemblies_described.txt.gz new file mode 100644 index 0000000..4e29025 Binary files /dev/null and b/sample_data/pasa_example/pasa_assemblies_described.txt.gz differ diff --git a/sample_data/pasa_example/runMe.sh b/sample_data/pasa_example/runMe.sh index 6470d43..4a625c1 100755 --- a/sample_data/pasa_example/runMe.sh +++ b/sample_data/pasa_example/runMe.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -ve if [ ! -e genome.fasta ]; then gunzip -c genome.fasta.gz > genome.fasta @@ -12,10 +12,21 @@ if [ ! -e pasa_assemblies.gff3 ]; then gunzip -c pasa_assemblies.gff3.gz > pasa_assemblies.gff3 fi -../../TransDecoder.LongOrfs -t pasa_assemblies.fasta +if [ ! -e pasa_assemblies_described.txt ]; then + gunzip -c pasa_assemblies_described.txt.gz > pasa_assemblies_described.txt +fi + + +# get the gene-to-transcript relationships +cut -f2,3 pasa_assemblies_described.txt > pasa.gene_trans_map.txt + +../../TransDecoder.LongOrfs -t pasa_assemblies.fasta --gene_trans_map pasa.gene_trans_map.txt ../../TransDecoder.Predict -t pasa_assemblies.fasta -../../util/cdna_alignment_orf_to_genome_orf.pl pasa_assemblies.fasta.transdecoder.gff3 pasa_assemblies.gff3 pasa_assemblies.fasta | tee pasa_assemblies.fasta.transdecoder.genome.gff3 +../../util/cdna_alignment_orf_to_genome_orf.pl pasa_assemblies.fasta.transdecoder.gff3 pasa_assemblies.gff3 pasa_assemblies.fasta > pasa_assemblies.fasta.transdecoder.genome.gff3 + +echo "Done. See pasa_assemblies.fasta.transdecoder.\*" + exit 0 diff --git a/sample_data/simple_transcriptome_target/Makefile b/sample_data/simple_transcriptome_target/Makefile new file mode 100644 index 0000000..4ab2c6d --- /dev/null +++ b/sample_data/simple_transcriptome_target/Makefile @@ -0,0 +1,6 @@ +test: + ./runMe.sh + +clean: + ./cleanme.pl + diff --git a/sample_data/simple_transcriptome_target/cleanme.pl b/sample_data/simple_transcriptome_target/cleanme.pl index bbe0088..de8482d 100755 --- a/sample_data/simple_transcriptome_target/cleanme.pl +++ b/sample_data/simple_transcriptome_target/cleanme.pl @@ -15,6 +15,7 @@ my @files_to_keep = qw ( cleanme.pl runMe.sh Trinity.fasta.gz +Makefile ); diff --git a/sample_data/simple_transcriptome_target/runMe.sh b/sample_data/simple_transcriptome_target/runMe.sh index c42569a..3ad9343 100755 --- a/sample_data/simple_transcriptome_target/runMe.sh +++ b/sample_data/simple_transcriptome_target/runMe.sh @@ -4,8 +4,8 @@ if [ ! -e Trinity.fasta ]; then gunzip -c Trinity.fasta.gz > Trinity.fasta fi -../../TransDecoder.LongOrfs -t Trinity.fasta +../../TransDecoder.LongOrfs -t Trinity.fasta -../../TransDecoder.Predict -t Trinity.fasta +../../TransDecoder.Predict -t Trinity.fasta --single_best_orf exit 0 diff --git a/util/cdna_alignment_orf_to_genome_orf.pl b/util/cdna_alignment_orf_to_genome_orf.pl index 4a3b1ee..84d0d55 100755 --- a/util/cdna_alignment_orf_to_genome_orf.pl +++ b/util/cdna_alignment_orf_to_genome_orf.pl @@ -41,28 +41,20 @@ main: { ## associate gene identifiers with contig id's. my $contig_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($cdna_orfs_gff3, $gene_obj_indexer_href); - foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) { + + my %isolated_gene_id_to_new_genes; + foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) { + ## $asmbl_id is the actual Transcript identifier from which ORFs were predicted. + my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}}; foreach my $gene_id (@gene_ids) { # gene identifiers as given by transdecoder on the transcript sequences my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id}; - my $asmbl_id = $gene_obj_ref->{asmbl_id}; - - - ## pasa stuff - - if ($asmbl_id =~ /(S\d+)_(asmbl_\d+)/) { - - my $subcluster = $1; - $asmbl_id = $2; - } - my $transcript_struct = $cdna_acc_to_transcript_structure{$asmbl_id} or die "Error, no cdna struct for $asmbl_id"; - #print Dumper($transcript_struct) . $gene_obj_ref->toString(); - + eval { my $new_orf_gene = &place_orf_in_cdna_alignment_context($transcript_struct, $gene_obj_ref, \%cdna_seq_lengths); @@ -74,14 +66,17 @@ main: { #$new_orf_gene->{Model_feat_name} = "m.$asmbl_id.$orf_count"; $new_orf_gene->{com_name} = "ORF"; - my $use_gene_id = $transcript_struct->{gene_id} || $gene_id; - - + my $use_gene_id = $transcript_struct->{gene_id}; + unless ($use_gene_id) { + ## extract the orig gene id from the incoming gene obj + my ($isolated_gene_id, @rest) = split(/::/, $gene_id); + $use_gene_id = $isolated_gene_id; + } + $new_orf_gene->{TU_feat_name} = $use_gene_id; $new_orf_gene->{Model_feat_name} = $gene_obj_ref->{Model_feat_name}; - - - print $new_orf_gene->to_GFF3_format(source => "transdecoder") . "\n"; + + push (@{$isolated_gene_id_to_new_genes{$use_gene_id}}, $new_orf_gene); } }; @@ -99,6 +94,25 @@ main: { } } + ## output results: + foreach my $gene_id (sort keys %isolated_gene_id_to_new_genes) { + + my @gene_objs = @{$isolated_gene_id_to_new_genes{$gene_id}}; + + foreach my $gene_obj (@gene_objs) { + $gene_obj->set_CDS_phases_from_init_phase(0); + } + + my $parent_gene_obj = shift @gene_objs; + foreach my $gene_obj (@gene_objs) { + $parent_gene_obj->add_isoform($gene_obj); + } + + $parent_gene_obj->refine_gene_object(); + + print $parent_gene_obj->to_GFF3_format(source => "transdecoder") . "\n"; + } + exit(0); } @@ -126,7 +140,7 @@ sub parse_transcript_alignment_info { my $trans_id = ""; my $gene_id = ""; - + ## trick for retaining gene and trans identifier information from cufflinks data if ($asmbl =~ /^GENE\^(\S+),TRANS\^(\S+)/) { $gene_id = $1; $trans_id = $2; @@ -147,12 +161,12 @@ sub parse_transcript_alignment_info { [$lend, $rend] ], - orient => $orient, + orient => $orient, trans_id => $trans_id, gene_id => $gene_id, }; - + $cdna_alignments{$asmbl} = $struct; } @@ -230,7 +244,7 @@ sub place_orf_in_cdna_alignment_context { if (scalar(@exon_coords) > 1) { # any correct ORF should be in the '+' orientation here.... must be a false positive orf or transcript structure is wrong $WARNING_COUNT++; - print STDERR "Warning [$WARNING_COUNT], shouldn't have a minus-strand ORF on a spliced transcript structure. Skipping entry.\n"; + print STDERR "Warning [$WARNING_COUNT], shouldn't have a minus-strand ORF on a spliced transcript structure. Skipping entry $orf_gene_obj->{Model_feat_name}.\n"; return undef; } diff --git a/util/remove_eclipsed_ORFs.pl b/util/remove_eclipsed_ORFs.pl index 92e7fd1..4cc3e8d 100755 --- a/util/remove_eclipsed_ORFs.pl +++ b/util/remove_eclipsed_ORFs.pl @@ -33,7 +33,7 @@ main: { my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id}; - my ($lend, $rend) = sort {$a<=>$b} $gene_obj_ref->get_coords(); + my ($lend, $rend) = sort {$a<=>$b} $gene_obj_ref->get_model_span(); my $struct = { gene_obj => $gene_obj_ref, lend => $lend, @@ -64,6 +64,8 @@ main: { if ($next_lend > $lend && $next_rend < $rend) { ## eclipsed + my $model_feat_name = $gene->{gene_obj}->{Model_feat_name}; + print STDERR "\tECLIPSE: $model_feat_name is eclipsed by longer ORF, removing it.\n"; $found_eclipsed = 1; last; } diff --git a/util/single_best_ORF_per_transcript.pl b/util/single_best_ORF_per_transcript.pl new file mode 100755 index 0000000..f34f3a3 --- /dev/null +++ b/util/single_best_ORF_per_transcript.pl @@ -0,0 +1,187 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use FindBin; +use lib ("$FindBin::Bin/../PerlLib"); +use Gene_obj; +use Gene_obj_indexer; +use GFF3_utils; +use Carp; +use Data::Dumper; +use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through); + + + +my $usage = <<__EOUSAGE__; + +################################################################################ +# +# Required: +# +# --gff3_file <string> gff3 file w/ predicted ORFs +# +# Optional: +# +# --blast_hits <string> blast hits file +# +# --pfam_hits <string> pfam hits file +# +################################################################################ + + +__EOUSAGE__ + + ; + +my $gff3_file; +my $blast_hits_file; +my $pfam_hits_file; + + +&GetOptions('gff3_file=s' => \$gff3_file, + 'blast_hits=s' => \$blast_hits_file, + 'pfam_hits=s' => \$pfam_hits_file, + ); + +unless ($gff3_file) { + die $usage; +} + + +main: { + + my %blast_hits; + if ($blast_hits_file) { + %blast_hits = &parse_blastp_hits_file($blast_hits_file); + } + + my %pfam_hits; + if ($pfam_hits_file) { + %pfam_hits = &parse_pfam_hits_file($pfam_hits_file); + } + + my $gene_obj_indexer_href = {}; + + my $asmbl_id_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href); + + foreach my $asmbl_id (sort keys %$asmbl_id_to_gene_list_href) { + + my @gene_ids = @{$asmbl_id_to_gene_list_href->{$asmbl_id}}; + + #print "ASMBL: $asmbl_id, gene_ids: @gene_ids\n"; + my @gene_entries; + + foreach my $gene_id (@gene_ids) { + + my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id}; + + my $model_id = $gene_obj_ref->{Model_feat_name}; + + my $homology_count = 0; + if ($blast_hits{$model_id}) { + $homology_count++; + } + if ($pfam_hits{$model_id}) { + $homology_count++; + } + + + my $struct = { gene_obj => $gene_obj_ref, + length => $gene_obj_ref->get_CDS_length(), + homology_count => $homology_count, + }; + + push (@gene_entries, $struct); + + + } + + @gene_entries = sort { + + $b->{homology_count} <=> $a->{homology_count} + || + $b->{length} <=> $a->{length} + + } @gene_entries; + + if (scalar @gene_entries > 1) { + print STDERR "ORFs prioritized as:\n"; + foreach my $entry (@gene_entries) { + print STDERR "\t" . join("\t", $entry->{gene_obj}->{Model_feat_name}, + "homology_count: " . $entry->{homology_count}, + "len: " . $entry->{length}) . "\n"; + } + + } + + my $best_gene_entry = shift @gene_entries; + + my $gene_obj = $best_gene_entry->{gene_obj}; + + print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n"; + + foreach my $remaining_gene (@gene_entries) { + print STDERR "-discarding " . $remaining_gene->{gene_obj}->{Model_feat_name} . " as not single best orf on trans\n"; + } + + } + + + exit(0); + +} + + + +## note borrowed code below from TransDecoder.predict and should consolidate. ## //FIXME + +sub parse_pfam_hits_file { + my ($pfam_hits_file) = @_; + + my %has_pfam_hit; + + if (! -e $pfam_hits_file) { + die "Error, cannot find pfam hits file: $pfam_hits_file"; + } + + print "PFAM output found and processing...\n"; + # capture those proteins having pfam hits + open (my $fh, $pfam_hits_file) or die "Error, cannot open file: $pfam_hits_file"; + while (my $ln=<$fh>) { + next if $ln=~/^\#/; + my @x = split(/\s+/,$ln); + next unless $x[3]; # domtbl + my $orf_acc = $x[3]; + $has_pfam_hit{$orf_acc} = 1; + } + close $fh; + + + return(%has_pfam_hit); +} + +#### +sub parse_blastp_hits_file { + my ($blastp_file) = @_; + + unless (-e $blastp_file) { + die "Error, cannot find file $blastp_file"; + } + + my %blastp_hits; + + open (my $fh, $blastp_file) or die "Error, cannot open file $blastp_file"; + while (<$fh>) { + chomp; + my @x = split(/\t/); + my $id = $x[0]; + + $blastp_hits{$id} = 1; + } + close $fh; + + return(%blastp_hits); +} + + -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/transdecoder.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
