Hello,

I've gotten a start on this but am running into a few difficulties. I've attached my code so far.

I'd like to implement a function "translate" that uses the standard translation table by default, but can optionally use a different translation table if it is passed as an argument. Does it make sense to write this with the type signature:

translate :: SeqData -> Offset -> Maybe TransTable -> SeqData

I'd like to be able to call the function with only the first two argument and get the hard-coded standard translation table, but if I pass a third argument with type TransTable, I get the customized translation.

As I've mentioned before on this list I'm still quite new to haskell, so any tips are appreciated.

Dan

On 16/08/11 02:16 PM, Ketil Malde wrote:
Dan Fornika<dforn...@gmail.com>  writes:

I've been working on a mitochondrial genetics project, and I'd like to
see a human mitochondrial-specific translation function integrated
into BioHaskell.

Hm, yes, support for the non-canonical translations would be nice.

I see that Bio.Sequence.Phd has a standard translation table,

Bio.Sequence.SeqData, actually, but it's reexported in a lot of places.

so I've
made a version called trans_tbl_mito with the appropriate changes
integrated.

Currently, there's just a 'translate' that hardwires this table.  I
concede this is poor design, the table ought to be a parameter, but it
was what I needed at the time.

I know that there is a lot of re-factoring going on, and
I'm not sure what is the most efficient/elegant way to introduce this
feature.

How about a module Bio.Sequence.Translation containing:

   import Bio.Core.Sequence

   data Triple = ... -- e.g. String, or perhaps just SeqData
   data Amino  = ...
   data TransTable = [(Triple,Amino)]

   transtable_std, transtbl_mito, ... :: TransTable

   translate :: TransTable ->  SeqData ->  SeqData

   -- and perhaps versions for all frames, or some specific frame?

What does everybody think?

Dan: would you like to write this?  Feel free to rip what you want from
the old SeqData module.

-k
import Bio.Core.Sequence

import Data.List (unfoldr, intercalate, isPrefixOf)
import Data.Char (toUpper, isNumber)
import Data.Maybe
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB

data Codon = String                      -- SeqData?

data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly
           | His | Ile | Leu | Lys | Met | Phe | Pro | Ser
           | Thr | Tyr | Trp | Val 
           | STP | Asx | Glx | Xle | Xaa -- unknowns
     deriving (Show,Eq)

data TransTable = TTable [(Codon, Amino)]

data InverseTransTable = ITTable [(Amino, Codon)]

{-# INLINE (!) #-}
(!) :: SeqData -> Offset -> Char
(!) (SeqData bs) = B.index bs

translate ::  SeqData -> Offset -> Maybe TransTable -> SeqData
translate s' o' Nothing = unfoldr codons (s', o')
   where codons (s,o) = 
             if o > seqlength s - 3 then Nothing
             else Just (trans1 (map (s!) [o, o+1, o+2]), (s, o+3) trans_table_std)
translate s' o' trans_table = unfoldr codons (s', o', trans_table)
   where codons (s, o, trans_table) = 
             if o > seqlength s - 3 then Nothing
             else Just (trans1 (map (s!) [o, o+1, o+2]), (s, o+3) trans_table)

trans1 :: String -> Maybe TransTable-> Amino
trans1 s Nothing = (flip lookup trans_table_std . map (repUT . toUpper)) s
    where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T 
trans1 s transl_table = (flip lookup transl_table . map (repUT . toUpper)) s
    where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T

-- amino2iupac :: [Amino] -> SeqData
-- amino2iupac =

-- reverse_translate :: SeqData -> TransTable -> SeqData
-- reverse_translate =

-- rev_trans1 :: Amino -> TransTable -> String
-- rev_trans1 = 

-- todo: extend with more IUPAC nucleotide wildcards?
-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData
toIUPAC = B.pack . map (fromJust . flip lookup iupac)

-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]
fromIUPAC = map (maybe Xaa id . flip lookup iupac' . toUpper) . B.unpack

iupac :: [(Amino, Char)]
iupac = [(Ala, 'A'), (Arg, 'R'), (Asn, 'N')
        ,(Asp, 'D'), (Cys, 'C'), (Gln, 'Q')
        ,(Glu, 'E'), (Gly, 'G'), (His, 'H')
        ,(Ile, 'I'), (Leu, 'L'), (Lys, 'K')
        ,(Met, 'M'), (Phe, 'F'), (Pro, 'P')
        ,(Ser, 'S'), (Thr, 'T'), (Tyr, 'Y')
        ,(Trp, 'W'), (Val,'V')
        ,(Asx, 'B') -- Asn or Asp
        ,(Glx, 'Z') -- Gln or Glu
        ,(Xle, 'J') -- Ile or Leu
        ,(STP, '*')        
        ,(Xaa, 'X')
        ]

iupac' :: [(Char,Amino)]
iupac' = map (\(a,b)->(b,a)) iupac

trans_table_std :: TransTable
trans_table_std = TTable [("AAA", Lys), ("AAC", Asn), ("AAG", Lys), ("AAT", Asn)
                         ,("ACA", Thr), ("ACC", Thr), ("ACG", Thr), ("ACT", Thr)
                         ,("AGA", Arg), ("AGC", Ser), ("AGG", Arg), ("AGT", Ser)
                         ,("ATA", Ile), ("ATC", Ile), ("ATG", Met), ("ATT", Ile)
                         ,("CAA", Gln), ("CAC", His), ("CAG", Gln), ("CAT", His)
                         ,("CCA", Pro), ("CCC", Pro), ("CCG", Pro), ("CCT", Pro)
                         ,("CGA", Arg), ("CGC", Arg), ("CGG", Arg), ("CGT", Arg),
                         ,("CTA", Leu), ("CTC", Leu), ("CTG", Leu), ("CTT", Leu),
                         ,("GAA", Glu), ("GAC", Asp), ("GAG", Glu), ("GAT", Asp),
                         ,("GCA", Ala), ("GCC", Ala), ("GCG", Ala), ("GCT", Ala),
                         ,("GGA", Gly), ("GGC", Gly), ("GGG", Gly), ("GGT", Gly),
                         ,("GTA", Val), ("GTC", Val), ("GTG", Val), ("GTT", Val),
                         ,("TAA", STP), ("TAC", Tyr), ("TAG", STP), ("TAT", Tyr),
                         ,("TCA", Ser), ("TCC", Ser), ("TCG", Ser), ("TCT", Ser),
                         ,("TGA", STP), ("TGC", Cys), ("TGG", Trp), ("TGT", Cys),
                         ,("TTA", Leu), ("TTC", Phe), ("TTG", Leu), ("TTT", Phe)
--                        ("RAT", Asx), ("RAC", Asx), ("SAA", Glx), ("SAG", Glx)  
-- IUPAC R is (A or G), S is (C or G) 
                         ]

trans_table_vertebrate_mito :: TransTable
trans_table_vertebrate_mito = TTable [
                                     , ("AAA", Lys), ("AAC", Asn), ("AAG", Lys), ("AAT", Asn),
  ("ACA", Thr), ("ACC", Thr), ("ACG", Thr), ("ACT", Thr),
  ("AGA", STP), ("AGC", Ser), ("AGG", STP), ("AGT", Ser),
  ("ATA", Met), ("ATC", Ile), ("ATG", Met), ("ATT", Ile),
  ("CAA", Gln), ("CAC", His), ("CAG", Gln), ("CAT", His),
  ("CCA", Pro), ("CCC", Pro), ("CCG", Pro), ("CCT", Pro),
  ("CGA", Arg), ("CGC", Arg), ("CGG", Arg), ("CGT", Arg),
  ("CTA", Leu), ("CTC", Leu), ("CTG", Leu), ("CTT", Leu),
  ("GAA", Glu), ("GAC", Asp), ("GAG", Glu), ("GAT", Asp),
  ("GCA", Ala), ("GCC", Ala), ("GCG", Ala), ("GCT", Ala),
  ("GGA", Gly), ("GGC", Gly), ("GGG", Gly), ("GGT", Gly),
  ("GTA", Val), ("GTC", Val), ("GTG", Val), ("GTT", Val),
  ("TAA", STP), ("TAC", Tyr), ("TAG", STP), ("TAT", Tyr),
  ("TCA", Ser), ("TCC", Ser), ("TCG", Ser), ("TCT", Ser),
  ("TGA", Trp), ("TGC", Cys), ("TGG", Trp), ("TGT", Cys),
  ("TTA", Leu), ("TTC", Phe), ("TTG", Leu), ("TTT", Phe)
--("RAT", Asx), ("RAC", Asx), ("SAA", Glx), ("SAG", Glx)  --IUPAC R is (A or G), S is (C or G) 
  ]

trans_table_yeast_nuc :: TransTable
trans_table_yeast_nuc = TTable [
  ("AAA", Lys), ("AAC", Asn), ("AAG", Lys), ("AAT", Asn),
  ("ACA", Thr), ("ACC", Thr), ("ACG", Thr), ("ACT", Thr),
  ("AGA", Arg), ("AGC", Ser), ("AGG", Arg), ("AGT", Ser),
  ("ATA", Met), ("ATC", Ile), ("ATG", Met), ("ATT", Ile),
  ("CAA", Gln), ("CAC", His), ("CAG", Gln), ("CAT", His),
  ("CCA", Pro), ("CCC", Pro), ("CCG", Pro), ("CCT", Pro),
  ("CGA", Arg), ("CGC", Arg), ("CGG", Arg), ("CGT", Arg),
  ("CTA", Leu), ("CTC", Thr), ("CTG", Thr), ("CTT", Thr),
  ("GAA", Glu), ("GAC", Asp), ("GAG", Glu), ("GAT", Asp),
  ("GCA", Ala), ("GCC", Ala), ("GCG", Ala), ("GCT", Ala),
  ("GGA", Gly), ("GGC", Gly), ("GGG", Gly), ("GGT", Gly),
  ("GTA", Val), ("GTC", Val), ("GTG", Val), ("GTT", Val),
  ("TAA", STP), ("TAC", Tyr), ("TAG", STP), ("TAT", Tyr),
  ("TCA", Ser), ("TCC", Ser), ("TCG", Ser), ("TCT", Ser),
  ("TGA", Trp), ("TGC", Cys), ("TGG", Trp), ("TGT", Cys),
  ("TTA", Leu), ("TTC", Phe), ("TTG", Leu), ("TTT", Phe)
--("RAT", Asx), ("RAC", Asx), ("SAA", Glx), ("SAG", Glx)  --IUPAC R is (A or G), S is (C or G) 
  ]

inverse_trans_table_std :: InverseTransTable
inverse_trans_table_std = ITTable [
  (Ala, "GCN"), (Leu, "YTN"), (Arg, "MGN"), (Lys, "AAR"),
  (Asn, "AAY"), (Met, "ATG"), (Asp, "GAY"), (Phe, "TTY"),
  (Cys, "TGY"), (Pro, "CCN"), (Gln, "CAR"), (Ser, "WSN"),
  (Glu, "GAR"), (Thr, "ACN"), (Gly, "GGN"), (Trp, "TGG"),
  (His, "CAY"), (Tyr, "TAY"), (Ile, "AUH"), (Val, "GUN"),
  (STP, "TRR"), (Asx, "RAY"), (Glx, "SAR"), (Xaa, "NNN")
  ]
_______________________________________________
Biohaskell mailing list
Biohaskell@biohaskell.org
http://malde.org/cgi-bin/mailman/listinfo/biohaskell

Reply via email to