Hello,
I've gotten a start on this but am running into a few difficulties.
I've attached my code so far.
I'd like to implement a function "translate" that uses the standard
translation table by default, but can optionally use a different
translation table if it is passed as an argument. Does it make sense to
write this with the type signature:
translate :: SeqData -> Offset -> Maybe TransTable -> SeqData
I'd like to be able to call the function with only the first two
argument and get the hard-coded standard translation table, but if I
pass a third argument with type TransTable, I get the customized
translation.
As I've mentioned before on this list I'm still quite new to haskell, so
any tips are appreciated.
Dan
On 16/08/11 02:16 PM, Ketil Malde wrote:
Dan Fornika<dforn...@gmail.com> writes:
I've been working on a mitochondrial genetics project, and I'd like to
see a human mitochondrial-specific translation function integrated
into BioHaskell.
Hm, yes, support for the non-canonical translations would be nice.
I see that Bio.Sequence.Phd has a standard translation table,
Bio.Sequence.SeqData, actually, but it's reexported in a lot of places.
so I've
made a version called trans_tbl_mito with the appropriate changes
integrated.
Currently, there's just a 'translate' that hardwires this table. I
concede this is poor design, the table ought to be a parameter, but it
was what I needed at the time.
I know that there is a lot of re-factoring going on, and
I'm not sure what is the most efficient/elegant way to introduce this
feature.
How about a module Bio.Sequence.Translation containing:
import Bio.Core.Sequence
data Triple = ... -- e.g. String, or perhaps just SeqData
data Amino = ...
data TransTable = [(Triple,Amino)]
transtable_std, transtbl_mito, ... :: TransTable
translate :: TransTable -> SeqData -> SeqData
-- and perhaps versions for all frames, or some specific frame?
What does everybody think?
Dan: would you like to write this? Feel free to rip what you want from
the old SeqData module.
-k
import Bio.Core.Sequence
import Data.List (unfoldr, intercalate, isPrefixOf)
import Data.Char (toUpper, isNumber)
import Data.Maybe
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
data Codon = String -- SeqData?
data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly
| His | Ile | Leu | Lys | Met | Phe | Pro | Ser
| Thr | Tyr | Trp | Val
| STP | Asx | Glx | Xle | Xaa -- unknowns
deriving (Show,Eq)
data TransTable = TTable [(Codon, Amino)]
data InverseTransTable = ITTable [(Amino, Codon)]
{-# INLINE (!) #-}
(!) :: SeqData -> Offset -> Char
(!) (SeqData bs) = B.index bs
translate :: SeqData -> Offset -> Maybe TransTable -> SeqData
translate s' o' Nothing = unfoldr codons (s', o')
where codons (s,o) =
if o > seqlength s - 3 then Nothing
else Just (trans1 (map (s!) [o, o+1, o+2]), (s, o+3) trans_table_std)
translate s' o' trans_table = unfoldr codons (s', o', trans_table)
where codons (s, o, trans_table) =
if o > seqlength s - 3 then Nothing
else Just (trans1 (map (s!) [o, o+1, o+2]), (s, o+3) trans_table)
trans1 :: String -> Maybe TransTable-> Amino
trans1 s Nothing = (flip lookup trans_table_std . map (repUT . toUpper)) s
where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T
trans1 s transl_table = (flip lookup transl_table . map (repUT . toUpper)) s
where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T
-- amino2iupac :: [Amino] -> SeqData
-- amino2iupac =
-- reverse_translate :: SeqData -> TransTable -> SeqData
-- reverse_translate =
-- rev_trans1 :: Amino -> TransTable -> String
-- rev_trans1 =
-- todo: extend with more IUPAC nucleotide wildcards?
-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData
toIUPAC = B.pack . map (fromJust . flip lookup iupac)
-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]
fromIUPAC = map (maybe Xaa id . flip lookup iupac' . toUpper) . B.unpack
iupac :: [(Amino, Char)]
iupac = [(Ala, 'A'), (Arg, 'R'), (Asn, 'N')
,(Asp, 'D'), (Cys, 'C'), (Gln, 'Q')
,(Glu, 'E'), (Gly, 'G'), (His, 'H')
,(Ile, 'I'), (Leu, 'L'), (Lys, 'K')
,(Met, 'M'), (Phe, 'F'), (Pro, 'P')
,(Ser, 'S'), (Thr, 'T'), (Tyr, 'Y')
,(Trp, 'W'), (Val,'V')
,(Asx, 'B') -- Asn or Asp
,(Glx, 'Z') -- Gln or Glu
,(Xle, 'J') -- Ile or Leu
,(STP, '*')
,(Xaa, 'X')
]
iupac' :: [(Char,Amino)]
iupac' = map (\(a,b)->(b,a)) iupac
trans_table_std :: TransTable
trans_table_std = TTable [("AAA", Lys), ("AAC", Asn), ("AAG", Lys), ("AAT", Asn)
,("ACA", Thr), ("ACC", Thr), ("ACG", Thr), ("ACT", Thr)
,("AGA", Arg), ("AGC", Ser), ("AGG", Arg), ("AGT", Ser)
,("ATA", Ile), ("ATC", Ile), ("ATG", Met), ("ATT", Ile)
,("CAA", Gln), ("CAC", His), ("CAG", Gln), ("CAT", His)
,("CCA", Pro), ("CCC", Pro), ("CCG", Pro), ("CCT", Pro)
,("CGA", Arg), ("CGC", Arg), ("CGG", Arg), ("CGT", Arg),
,("CTA", Leu), ("CTC", Leu), ("CTG", Leu), ("CTT", Leu),
,("GAA", Glu), ("GAC", Asp), ("GAG", Glu), ("GAT", Asp),
,("GCA", Ala), ("GCC", Ala), ("GCG", Ala), ("GCT", Ala),
,("GGA", Gly), ("GGC", Gly), ("GGG", Gly), ("GGT", Gly),
,("GTA", Val), ("GTC", Val), ("GTG", Val), ("GTT", Val),
,("TAA", STP), ("TAC", Tyr), ("TAG", STP), ("TAT", Tyr),
,("TCA", Ser), ("TCC", Ser), ("TCG", Ser), ("TCT", Ser),
,("TGA", STP), ("TGC", Cys), ("TGG", Trp), ("TGT", Cys),
,("TTA", Leu), ("TTC", Phe), ("TTG", Leu), ("TTT", Phe)
-- ("RAT", Asx), ("RAC", Asx), ("SAA", Glx), ("SAG", Glx)
-- IUPAC R is (A or G), S is (C or G)
]
trans_table_vertebrate_mito :: TransTable
trans_table_vertebrate_mito = TTable [
, ("AAA", Lys), ("AAC", Asn), ("AAG", Lys), ("AAT", Asn),
("ACA", Thr), ("ACC", Thr), ("ACG", Thr), ("ACT", Thr),
("AGA", STP), ("AGC", Ser), ("AGG", STP), ("AGT", Ser),
("ATA", Met), ("ATC", Ile), ("ATG", Met), ("ATT", Ile),
("CAA", Gln), ("CAC", His), ("CAG", Gln), ("CAT", His),
("CCA", Pro), ("CCC", Pro), ("CCG", Pro), ("CCT", Pro),
("CGA", Arg), ("CGC", Arg), ("CGG", Arg), ("CGT", Arg),
("CTA", Leu), ("CTC", Leu), ("CTG", Leu), ("CTT", Leu),
("GAA", Glu), ("GAC", Asp), ("GAG", Glu), ("GAT", Asp),
("GCA", Ala), ("GCC", Ala), ("GCG", Ala), ("GCT", Ala),
("GGA", Gly), ("GGC", Gly), ("GGG", Gly), ("GGT", Gly),
("GTA", Val), ("GTC", Val), ("GTG", Val), ("GTT", Val),
("TAA", STP), ("TAC", Tyr), ("TAG", STP), ("TAT", Tyr),
("TCA", Ser), ("TCC", Ser), ("TCG", Ser), ("TCT", Ser),
("TGA", Trp), ("TGC", Cys), ("TGG", Trp), ("TGT", Cys),
("TTA", Leu), ("TTC", Phe), ("TTG", Leu), ("TTT", Phe)
--("RAT", Asx), ("RAC", Asx), ("SAA", Glx), ("SAG", Glx) --IUPAC R is (A or G), S is (C or G)
]
trans_table_yeast_nuc :: TransTable
trans_table_yeast_nuc = TTable [
("AAA", Lys), ("AAC", Asn), ("AAG", Lys), ("AAT", Asn),
("ACA", Thr), ("ACC", Thr), ("ACG", Thr), ("ACT", Thr),
("AGA", Arg), ("AGC", Ser), ("AGG", Arg), ("AGT", Ser),
("ATA", Met), ("ATC", Ile), ("ATG", Met), ("ATT", Ile),
("CAA", Gln), ("CAC", His), ("CAG", Gln), ("CAT", His),
("CCA", Pro), ("CCC", Pro), ("CCG", Pro), ("CCT", Pro),
("CGA", Arg), ("CGC", Arg), ("CGG", Arg), ("CGT", Arg),
("CTA", Leu), ("CTC", Thr), ("CTG", Thr), ("CTT", Thr),
("GAA", Glu), ("GAC", Asp), ("GAG", Glu), ("GAT", Asp),
("GCA", Ala), ("GCC", Ala), ("GCG", Ala), ("GCT", Ala),
("GGA", Gly), ("GGC", Gly), ("GGG", Gly), ("GGT", Gly),
("GTA", Val), ("GTC", Val), ("GTG", Val), ("GTT", Val),
("TAA", STP), ("TAC", Tyr), ("TAG", STP), ("TAT", Tyr),
("TCA", Ser), ("TCC", Ser), ("TCG", Ser), ("TCT", Ser),
("TGA", Trp), ("TGC", Cys), ("TGG", Trp), ("TGT", Cys),
("TTA", Leu), ("TTC", Phe), ("TTG", Leu), ("TTT", Phe)
--("RAT", Asx), ("RAC", Asx), ("SAA", Glx), ("SAG", Glx) --IUPAC R is (A or G), S is (C or G)
]
inverse_trans_table_std :: InverseTransTable
inverse_trans_table_std = ITTable [
(Ala, "GCN"), (Leu, "YTN"), (Arg, "MGN"), (Lys, "AAR"),
(Asn, "AAY"), (Met, "ATG"), (Asp, "GAY"), (Phe, "TTY"),
(Cys, "TGY"), (Pro, "CCN"), (Gln, "CAR"), (Ser, "WSN"),
(Glu, "GAR"), (Thr, "ACN"), (Gly, "GGN"), (Trp, "TGG"),
(His, "CAY"), (Tyr, "TAY"), (Ile, "AUH"), (Val, "GUN"),
(STP, "TRR"), (Asx, "RAY"), (Glx, "SAR"), (Xaa, "NNN")
]
_______________________________________________
Biohaskell mailing list
Biohaskell@biohaskell.org
http://malde.org/cgi-bin/mailman/listinfo/biohaskell