This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository giira.
commit 54ea89068c995776063bbfac8dd31e9fe157d1ea Author: Andreas Tille <[email protected]> Date: Mon Jan 9 11:55:59 2017 +0100 New upstream version 0.0.20140625 --- src/geneFinder/ExtractGeneCandidates.java | 3 + src/geneFinder/FrameSearch.java | 236 ++++++++++++++++++++++++ src/geneFinder/GeneFinder.java | 10 +- src/geneFinder/Giira.java | 5 +- src/geneFinder/ProkaryoteExtraction.java | 3 + src/geneFinder/Prokaryote_Specials.java | 211 +++++++++++++++++++++ src/geneFinder/ReadInParameters_GeneFinder.java | 88 ++++++++- src/geneFinder/SamParser.java | 76 ++++++-- src/types/Rna.java | 2 +- 9 files changed, 612 insertions(+), 22 deletions(-) diff --git a/src/geneFinder/ExtractGeneCandidates.java b/src/geneFinder/ExtractGeneCandidates.java index a663c7c..3bc9bb2 100755 --- a/src/geneFinder/ExtractGeneCandidates.java +++ b/src/geneFinder/ExtractGeneCandidates.java @@ -57,6 +57,9 @@ public class ExtractGeneCandidates { if(line.startsWith(">")){ // test if correct contig if(line.substring(1).startsWith(contigName)){ + if(!((line.substring(1).startsWith(contigName+" ")) || (line.substring(1).length() == contigName.length()))){ + continue; // as an additional check to avoid picking the wrong contig because of name sub-similarities + } // found right one, now extract sequence while(((line = br.readLine()) != null) && (line.length() != 0) && (!(line.startsWith(">")))){ String line2 = ""; diff --git a/src/geneFinder/FrameSearch.java b/src/geneFinder/FrameSearch.java index d469eaa..76fdd1f 100755 --- a/src/geneFinder/FrameSearch.java +++ b/src/geneFinder/FrameSearch.java @@ -26,6 +26,12 @@ public class FrameSearch { public static int findPossibleStarts_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){ + if(!GeneFinder.alternativeCodons.isEmpty()){ + if(GeneFinder.alternativeCodons.containsKey("START FO")){ + return FrameSearch.findPossibleStarts_Forward_AlternativeStarts(cluster, contigSeq, posAr, tempStop, GeneFinder.alternativeCodons.get("START FO")); + 
} + } + if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){ return -1; } @@ -69,6 +75,12 @@ public class FrameSearch { public static int findPossibleStarts_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){ + if(!GeneFinder.alternativeCodons.isEmpty()){ + if(GeneFinder.alternativeCodons.containsKey("START RE")){ + return FrameSearch.findPossibleStarts_Reverse_AlternativeStarts(cluster, contigSeq, posAr, tempStop, GeneFinder.alternativeCodons.get("START RE")); + } + } + int start_RE = -1; if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){ @@ -113,6 +125,112 @@ public class FrameSearch { } /* + * if alternative start and stop codons are given, perform a more general search also respecting those codons + * + */ + + public static int findPossibleStarts_Forward_AlternativeStarts(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop,String[] alternativeStarts){ + + int start1 = -1; + + if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){ + return -1; + } + + String startPart = contigSeq.substring((int)Math.max(0,(cluster.startPos-GeneFinder.readLength)), tempStop); + + int startSub_alt[] = new int[alternativeStarts.length]; + + for(int i = 0; i<alternativeStarts.length;++i){ + startSub_alt[i] = startPart.lastIndexOf(alternativeStarts[i]); + } + + java.util.Arrays.sort(startSub_alt); + + for(int i = startSub_alt.length -1; i>= 0;i--){ + if(startSub_alt[i] > -1){ + start1 = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength)) + startSub_alt[i]; + break; + } + } + + if(start1 == -1){ + return start1; + } + + boolean foundSameFrame = false; + + for(int i = 0; i<posAr;++i){ + if((cluster.possibleStarts_Forward[i] - start1) % 3 == 0){ + foundSameFrame = true; + break; + } + } + + if(!foundSameFrame){ + cluster.possibleStarts_Forward[posAr++] = start1; + findPossibleStarts_Forward_AlternativeStarts(cluster,contigSeq,posAr,start1,alternativeStarts); + }else{ + 
findPossibleStarts_Forward_AlternativeStarts(cluster,contigSeq,posAr,start1,alternativeStarts); + } + + return start1; + } + + /* + * if alternative start and stop codons are given, perform a more general search also respecting those codons + * + */ + + public static int findPossibleStarts_Reverse_AlternativeStarts(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop,String[] alternativeStops){ + + int start_RE = -1; + + if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){ + return -1; + } + + String startPart = contigSeq.substring((int)Math.max(0,(cluster.startPos-GeneFinder.readLength)), tempStop); + + int startSub_alt[] = new int[alternativeStops.length]; + + for(int i = 0; i<alternativeStops.length;++i){ + startSub_alt[i] = startPart.lastIndexOf(alternativeStops[i]); + } + + java.util.Arrays.sort(startSub_alt); + + for(int i = startSub_alt.length -1; i>= 0;i--){ + if(startSub_alt[i] > -1){ + start_RE = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength)) + startSub_alt[i]; + break; + } + } + + if(start_RE == -1){ + return start_RE; + } + + boolean foundSameFrame = false; + + for(int i = 0; i<posAr;++i){ + if((cluster.possibleStarts_Reverse[i] - start_RE) % 3 == 0){ + foundSameFrame = true; + break; + } + } + + if(!foundSameFrame){ + cluster.possibleStarts_Reverse[posAr++] = start_RE; + findPossibleStarts_Reverse_AlternativeStarts(cluster,contigSeq,posAr,start_RE,alternativeStops); + }else{ + findPossibleStarts_Reverse_AlternativeStarts(cluster,contigSeq,posAr,start_RE,alternativeStops); + } + + return start_RE; +} + + /* * new way of gene extraction by remembering all starts that are not in the same frame (maxNum = 3) * after that, starts and stops are checked if we find a combination that defines the frame of the cluster * @@ -122,6 +240,12 @@ public class FrameSearch { public static int findPossibleStops_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){ + if(!GeneFinder.alternativeCodons.isEmpty()){ + 
if(GeneFinder.alternativeCodons.containsKey("STOP FO")){ + return FrameSearch.findPossibleStops_Forward_AlternativeStops(cluster, contigSeq, posAr, tempStart, GeneFinder.alternativeCodons.get("STOP FO")); + } + } + int stop_FO = -1; if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){ @@ -174,6 +298,12 @@ public class FrameSearch { public static int findPossibleStops_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){ + if(!GeneFinder.alternativeCodons.isEmpty()){ + if(GeneFinder.alternativeCodons.containsKey("STOP RE")){ + return FrameSearch.findPossibleStops_Reverse_AlternativeStop(cluster, contigSeq, posAr, tempStart, GeneFinder.alternativeCodons.get("STOP RE")); + } + } + if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){ return -1; } @@ -206,6 +336,112 @@ public class FrameSearch { } /* + * if alternative start and stop codons are given, perform a more general search also respecting those codons + * + */ + + public static int findPossibleStops_Forward_AlternativeStops(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart, String[] alternativeStops){ + + int stop_FO = -1; + + if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){ + return -1; + } + + String stopPart = contigSeq.substring(tempStart, (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)); + + int stopSub_alt[] = new int[alternativeStops.length]; + + for(int i = 0; i<alternativeStops.length;++i){ + stopSub_alt[i] = stopPart.indexOf(alternativeStops[i]); + } + + java.util.Arrays.sort(stopSub_alt); + + for(int i = 0; i < stopSub_alt.length;++i){ + if(stopSub_alt[i] > -1){ + stop_FO = tempStart + stopSub_alt[i]; + break; + } + } + + if(stop_FO == -1){ + return stop_FO; + } + + boolean foundSameFrame = false; + + for(int i = 0; i<posAr;++i){ + if((cluster.possibleStops_Forward[i] - stop_FO) % 3 == 0){ + 
foundSameFrame = true; + break; + } + } + + if(!foundSameFrame){ + cluster.possibleStops_Forward[posAr++] = stop_FO; + findPossibleStops_Forward_AlternativeStops(cluster,contigSeq,posAr,stop_FO+3,alternativeStops); + }else{ + findPossibleStops_Forward_AlternativeStops(cluster,contigSeq,posAr,stop_FO+3,alternativeStops); + } + + return stop_FO; + } + + /* + * if alternative start and stop codons are given, perform a more general search also respecting those codons + * + */ + + public static int findPossibleStops_Reverse_AlternativeStop(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart, String[] alternativeStarts){ + + int start1 = -1; + + if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){ + return -1; + } + + String stopPart = contigSeq.substring(tempStart, (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)); + + int stopSub_alt[] = new int[alternativeStarts.length]; + + for(int i = 0; i<alternativeStarts.length;++i){ + stopSub_alt[i] = stopPart.indexOf(alternativeStarts[i]); + } + + java.util.Arrays.sort(stopSub_alt); + + for(int i = 0; i < stopSub_alt.length;++i){ + if(stopSub_alt[i] > -1){ + start1 = tempStart + stopSub_alt[i]; + break; + } + } + + if(start1 == -1){ + return start1; + } + + boolean foundSameFrame = false; + + for(int i = 0; i<posAr;++i){ + if((cluster.possibleStops_Reverse[i] - start1) % 3 == 0){ + foundSameFrame = true; + break; + } + } + + if(!foundSameFrame){ + cluster.possibleStops_Reverse[posAr++] = start1; + findPossibleStops_Reverse_AlternativeStop(cluster,contigSeq,posAr,start1+3,alternativeStarts); + }else{ + findPossibleStops_Reverse_AlternativeStop(cluster,contigSeq,posAr,start1+3,alternativeStarts); + } + + return start1; + } + + /* * test if there is one of the possible start-stop codon pairs which is in frame * take the smallest interval possible */ diff --git a/src/geneFinder/GeneFinder.java b/src/geneFinder/GeneFinder.java index 
5dba17a..6a81249 100755 --- a/src/geneFinder/GeneFinder.java +++ b/src/geneFinder/GeneFinder.java @@ -30,6 +30,8 @@ public class GeneFinder { public static Map<File,String> genomeFilesWithNames = new HashMap<File,String>(); public static Map<File,String> rnaFilesWithNames = new HashMap<File,String>(); + + public static Map<String,String[]> alternativeCodons = new HashMap<String,String[]>(); public static boolean useTopHat; // indicator for mapping tool public static String settingMapper; // setting for the mapping tool, differs slightly depending on which tool was chosen @@ -80,13 +82,7 @@ public class GeneFinder { public static Object[] manager(String[] args){ - ReadInParameters_GeneFinder.readIn_GF(args); - - /*Gene gene = new Gene(); - gene.startPos = 0; - String seq = readInFasta(); - Prokaryote_Specials.define_OrfsInOperon(seq,gene); - System.exit(0);*/ + ReadInParameters_GeneFinder.readIn_GF(args); long timeBef = System.currentTimeMillis(); diff --git a/src/geneFinder/Giira.java b/src/geneFinder/Giira.java index 4f264b5..f669535 100755 --- a/src/geneFinder/Giira.java +++ b/src/geneFinder/Giira.java @@ -43,7 +43,10 @@ public class Giira { try { String decodedPath = URLDecoder.decode(path, "UTF-8"); - String scriptPath = decodedPath.substring(0,decodedPath.length()-9); + String[] pathArr = decodedPath.split("/"); + int lengthName = pathArr[pathArr.length-1].length(); + String scriptPath = decodedPath.substring(0,decodedPath.length()-lengthName); + //String scriptPath = decodedPath.substring(0,decodedPath.length()-9); //System.out.println("Path of Giira: " + decodedPath); classPath = ""; diff --git a/src/geneFinder/ProkaryoteExtraction.java b/src/geneFinder/ProkaryoteExtraction.java index 65c12a4..9e19847 100755 --- a/src/geneFinder/ProkaryoteExtraction.java +++ b/src/geneFinder/ProkaryoteExtraction.java @@ -56,6 +56,9 @@ public class ProkaryoteExtraction { if(line.startsWith(">")){ // test if correct contig if(line.substring(1).startsWith(contigName)){ + 
if(!((line.substring(1).startsWith(contigName+" ")) || (line.substring(1).length() == contigName.length()))){ + continue; // as an additional check to avoid picking the wrong contig because of name sub-similarities + } // found right one, now extract sequence while(((line = br.readLine()) != null) && (line.length() != 0) && (!(line.startsWith(">")))){ String line2 = ""; diff --git a/src/geneFinder/Prokaryote_Specials.java b/src/geneFinder/Prokaryote_Specials.java index ed886ee..bbb3cfc 100755 --- a/src/geneFinder/Prokaryote_Specials.java +++ b/src/geneFinder/Prokaryote_Specials.java @@ -234,6 +234,12 @@ public class Prokaryote_Specials { public static Vector<int[]> searchFO_orfs(String inputSeq){ + if(!GeneFinder.alternativeCodons.isEmpty()){ + if(GeneFinder.alternativeCodons.containsKey("START FO")){ + return searchFO_orfs_alternativeCodons(inputSeq, GeneFinder.alternativeCodons.get("START FO"), GeneFinder.alternativeCodons.get("STOP FO")); + } + } + Vector<int[]> allORFs_FO = new Vector<int[]>(); int foundNewATG = 1; @@ -315,12 +321,118 @@ public class Prokaryote_Specials { } /* + * if alternative start and stop codons are specified, respect this in a more general orf search + * + */ + + public static Vector<int[]> searchFO_orfs_alternativeCodons(String inputSeq, String[] alternativeStarts_FO, String[] alternativeStops_FO){ + + Vector<int[]> allORFs_FO = new Vector<int[]>(); + + int foundNewATG = 1; + int posLastATG = 0; + + do{ + + int startPos = -1; + + String startPart_alt = inputSeq.substring(posLastATG); + + int startSub_alt[] = new int[alternativeStarts_FO.length]; + + for(int i = 0; i<alternativeStarts_FO.length;++i){ + startSub_alt[i] = startPart_alt.indexOf(alternativeStarts_FO[i]); + } + + java.util.Arrays.sort(startSub_alt); + + for(int i = 0; i < startSub_alt.length;++i){ + if(startSub_alt[i] > -1){ + startPos = startSub_alt[i]; + break; + } + } + + int stopPos = -1; + + int posLastStart = -1; + + if(startPos == -1){ + foundNewATG = 0; + break; + 
}else{ + startPos = startPos + posLastATG; + posLastATG = startPos + 3; + posLastStart = startPos + 3; + } + + int goOn = 0; + + do{ + goOn = 0; + + String stopPart = inputSeq.substring(posLastStart); + + int stopSub[] = new int[alternativeStops_FO.length]; + + for(int i = 0; i<alternativeStops_FO.length;++i){ + stopSub[i] = stopPart.indexOf(alternativeStops_FO[i]); + } + + java.util.Arrays.sort(stopSub); + + for(int i = 0; i < stopSub.length;++i){ + if(stopSub[i] > -1){ + if(((((posLastStart + stopSub[i])-startPos) % 3) == 0)){ + stopPos = posLastStart + stopSub[i]; + }else{ + posLastStart = posLastStart + stopSub[i]+1; + goOn = 1; + } + break; + } + } + + if(stopPos != -1){ + + if(stopPos-startPos > 30){ + if(!checkIfORFcovered(allORFs_FO,new int[]{startPos,(stopPos+2)})){ + allORFs_FO.add(new int[]{startPos,(stopPos+2)}); + for(int i=startPos;i<=stopPos+2;++i){ + cov[i]++; + } + }else{ + alreadyCovered++; + } + }else{ + notCounted++; + } + + break; + } + + }while(goOn == 1); + + + }while(foundNewATG == 1); + + + return allORFs_FO; + } + + /* * searches all ORFs assuming reverse direction * note: no length limit is set, ORFs too short should be penalized in the BIC scoring */ public static Vector<int[]> searchRE_orfs(String inputSeq){ + if(!GeneFinder.alternativeCodons.isEmpty()){ + if(GeneFinder.alternativeCodons.containsKey("START RE")){ + return searchRE_orfs_alternativeCodons(inputSeq, GeneFinder.alternativeCodons.get("STOP RE"), GeneFinder.alternativeCodons.get("START RE")); // are stored the other way around so start is stop and vice versa + } + } + Vector<int[]> allORFs_RE= new Vector<int[]>(); int foundNewCAT = 1; @@ -402,6 +514,105 @@ public class Prokaryote_Specials { } /* + * if alternative start and stop codons are specified, respect this in a more general orf search + * + */ + + public static Vector<int[]> searchRE_orfs_alternativeCodons(String inputSeq, String[] alternativeStarts_RE, String[] alternativeStops_RE){ + + Vector<int[]> allORFs_RE= new 
Vector<int[]>(); + + int foundNewCAT = 1; + int posLastCAT = inputSeq.length(); + + do{ + int startPos = -1; + + String startPart_alt = inputSeq.substring(0,posLastCAT); + + int startSub_alt[] = new int[alternativeStarts_RE.length]; + + for(int i = 0; i<alternativeStarts_RE.length;++i){ + startSub_alt[i] = startPart_alt.lastIndexOf(alternativeStarts_RE[i]); + } + + java.util.Arrays.sort(startSub_alt); + + for(int i = startSub_alt.length -1; i>= 0;i--){ + if(startSub_alt[i] > -1){ + startPos = startSub_alt[i]; + break; + } + } + + int stopPos = -1; + + int posLastStop = -1; + + if(startPos == -1){ + foundNewCAT = 0; + break; + }else{ + posLastCAT = startPos; + posLastStop = startPos; + } + + int goOn = 0; + + do{ + goOn = 0; + + String stopPart = inputSeq.substring(0,posLastStop); + + int stopSub[] = new int[alternativeStops_RE.length]; + + for(int i = 0; i<alternativeStops_RE.length;++i){ + stopSub[i] = stopPart.lastIndexOf(alternativeStops_RE[i]); + } + + java.util.Arrays.sort(stopSub); + + for(int i = stopSub.length -1; i>= 0;i--){ + if(stopSub[i] > -1){ + if(((startPos-stopSub[i]) % 3) == 0){ + stopPos = stopSub[i]; + }else{ + posLastStop = stopSub[i]+2; + goOn = 1; + } + break; + } + } + + if(stopPos != -1){ + + if(startPos-stopPos > 30){ + if(!checkIfORFcovered(allORFs_RE,new int[]{stopPos,(startPos+2)})){ + allORFs_RE.add(new int[]{stopPos,(startPos+2)}); + for(int i=stopPos;i<=startPos+2;++i){ + cov[i]++; + } + }else{ + alreadyCovered++; + } + }else{ + notCounted++; + } + + + break; + } + + }while(goOn == 1); + + + }while(foundNewCAT == 1); + + + return allORFs_RE; + } + + /* * filter out all orfs that are completely included in bigger ones */ diff --git a/src/geneFinder/ReadInParameters_GeneFinder.java b/src/geneFinder/ReadInParameters_GeneFinder.java index 9313070..3d0fd75 100755 --- a/src/geneFinder/ReadInParameters_GeneFinder.java +++ b/src/geneFinder/ReadInParameters_GeneFinder.java @@ -58,6 +58,7 @@ public class ReadInParameters_GeneFinder { boolean 
foundProkaryote = false; boolean foundSequential = false; boolean foundInprogea = false; + boolean foundAlternativeCodons = false; if(!parameter.isEmpty() && args.length > 0){ @@ -230,6 +231,12 @@ public class ReadInParameters_GeneFinder { inputText += "minimal interval length: " + GeneFinder.interval + "\n"; } + } else if(arg.equals("-altCodon")){ // alternative start and stop codons + String pathToAlternative = args[i+1]; + readInAlternativeStartsStops(pathToAlternative); + foundAlternativeCodons = true; + inputText += "Alternative Starts and stops provided \n"; + } else if(arg.equals("-noAmbiOpti")){ // turn on or off the optimization of ambiguous reads foundAmbiOpti = true; GeneFinder.noAmbiOpti = true; @@ -321,6 +328,21 @@ public class ReadInParameters_GeneFinder { if(!havePathOut){ GeneFinder.pathOut = ""; + }else{ + // check if directory exists, if not, create it + File f = new File(GeneFinder.pathOut); + if(!f.exists()){ + Runtime rtAlign = Runtime.getRuntime(); + try { + String exe = "mkdir " + GeneFinder.pathOut; + Process pc = rtAlign.exec(exe); + pc.waitFor(); + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } } if(!foundGenome){ System.out.println("No genome file specified. Use \"-h\" to print usage options. 
"); @@ -410,6 +432,7 @@ public class ReadInParameters_GeneFinder { GeneFinder.inprogeaCall = false; } + GeneFinder.logFile = new File(GeneFinder.pathOut+"log_it" + GeneFinder.iteration + ".txt"); if(!GeneFinder.secondPart){ System.out.println(inputText); @@ -438,6 +461,66 @@ public class ReadInParameters_GeneFinder { } /* + * reads in the alternative start and stop codons from a given input file + * one line per codon type, with codons tab separated + */ + + public static void readInAlternativeStartsStops(String altFile) { + + Map<String,String[]> altCodons = new HashMap<String,String[]>(); + + try { + + BufferedReader br = new BufferedReader(new FileReader(altFile)); + + String line = ""; + + while((line = br.readLine()) != null){ + + String[] lineArr = line.split("\t"); + String[] temp = new String[lineArr.length-1]; + + if(line.startsWith("START FO")){ + + for(int i = 1; i<lineArr.length;++i){ + temp[i-1] = lineArr[i]; + } + + altCodons.put("START FO",temp); + } + if(line.startsWith("START RE")){ + for(int i = 1; i<lineArr.length;++i){ + temp[i-1] = lineArr[i]; + } + + altCodons.put("STOP RE",temp); // for GIIRA start and stops are switched for the reverse direction + } + if(line.startsWith("STOP FO")){ + for(int i = 1; i<lineArr.length;++i){ + temp[i-1] = lineArr[i]; + } + + altCodons.put("STOP FO",temp); + } + if(line.startsWith("STOP RE")){ + for(int i = 1; i<lineArr.length;++i){ + temp[i-1] = lineArr[i]; + } + + altCodons.put("START RE",temp); // for GIIRA start and stops are switched for the reverse direction + } + } + + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + GeneFinder.alternativeCodons = altCodons; + } + + /* * print the help text to screen */ @@ -457,7 +540,7 @@ public class ReadInParameters_GeneFinder { " \n -iG [pathToGenomes] : specify path to directory with genome files in fasta format \n" + " \n -iR [pathToRna] : specify path to directory with rna read files in fastq 
format \n" + " \n -scripts [absolutePath] : specify the absolute path to the directory containing the required helper scripts, DEFAULT: directory of GIIRA.jar \n" + - " \n -out [pathToResults] : specify the directory that shall contain the results files \n" + + " \n -out [pathToResults] : specify the absolute pyth to the directory that shall contain the results files \n" + " \n -outName [outputName] : specify desired name for output files, DEFAULT: genes \n" + " \n -haveSam [samfileName]: if a sam file already exists, provide the name, else a mapping is performed. NOTE: the sam file has to be sorted according to read names! \n" + " \n -nT [numberThreads] : specify the maximal number of threads that are allowed to be used, DEFAULT: 1 \n" + @@ -468,7 +551,7 @@ public class ReadInParameters_GeneFinder { //" \n -splitRunAndOpti [y/n] : indicates if the optimization and giira shall be run separately, to reduce the memory consumption (y), DEFAULT: n" + " \n -mem [int] : specify the amount of memory that cplex is allowed to use \n" + " \n -maxReportedHits [int] : if using BWA as mapping tool, specify the maximal number of reported hits, DEFAULT: 2 \n" + - " \n -prokaryote : if specified, genome is treated as prokaryotic, no spliced reads are accepted, and structural genes are resolved. DEFAULT: n \n" + + " \n -prokaryote : if specified, genome is treated as prokaryotic, no spliced reads are accepted, and structural genes are resolved. DEFAULT: False \n" + " \n -minCov [double] : specify the minimum required coverage of the gene candidate extraction, DEFAULT: -1 (is estimated from mapping) \n" + " \n -maxCov [double] : optional maximal coverage threshold, can also be estimated from mapping (DEFAULT) \n" + " \n -endCov [double] : if the coverage falls below this value, the currently open candidate gene is closed. 
This value can be estimated from the minimum coverage (-1); DEFAULT: -1 \n" + @@ -476,6 +559,7 @@ public class ReadInParameters_GeneFinder { " \n -interval [int] : specify the minimal size of an interval between near candidate genes, if \"-1\" it equals the read length. DEFAULT: -1 \n " + " \n -splLim [double] : specify the minimal coverage that is required to accept a splice site, if (-1) the threshold is equal to minCov, DEFAULT: -1 \n" + " \n -rL [int] : specify read length, otherwise this information is extracted from SAM file (DEFAULT) \n" + + " \n -altCodon [pathToAlternativeCodons] : specify path to txt file with alternative start and stop codons, see example file in scripts folder \n" + " \n -samForSequential [pathToSamFile] : if it is desired to analyse chromosomes in a sequential manner, provide a chromosome sorted sam file in addition to the one sorted by read names, DEFAULT: noSequential \n" + " \n -noAmbiOpti : if specified, ambiguous hits are not included in the analysis \n" + " \n -settingMapper [(list of parameters)] : A comma-separated list of the desired parameters for TopHat or BWA. 
Please provide \n" + diff --git a/src/geneFinder/SamParser.java b/src/geneFinder/SamParser.java index 291befe..76377be 100755 --- a/src/geneFinder/SamParser.java +++ b/src/geneFinder/SamParser.java @@ -138,7 +138,21 @@ public class SamParser { totalHitCount++; - if(!parts[0].equals(currentReadID)){ // now we have proceeded to a new read + String adaptedName = ""; + + if(parts[0].contains(":")){ + String[] nameParts = parts[0].split(":"); + for(int i=0;i<nameParts.length;++i){ + adaptedName += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors + } + + adaptedName = adaptedName.substring(0,(adaptedName.length()-3)); + }else{ + adaptedName = parts[0]; + } + + + if(!adaptedName.equals(currentReadID)){ // now we have proceeded to a new read if(GeneFinder.iteration == 2 && currentRead != null && currentRead.isMulti == 1){ @@ -147,7 +161,20 @@ public class SamParser { do{ String[] partsReaSam = lineReaSam.split(" "); - if(currentRead.rnaID.equals(partsReaSam[0])){ + String adaptedNameReaSam = ""; + + if(partsReaSam[0].contains(":")){ + String[] nameParts = partsReaSam[0].split(":"); + for(int i=0;i<nameParts.length;++i){ + adaptedNameReaSam += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors + } + + adaptedNameReaSam = adaptedNameReaSam.substring(0,(adaptedNameReaSam.length()-3)); + }else{ + adaptedNameReaSam = partsReaSam[0]; + } + + if(currentRead.rnaID.equals(adaptedNameReaSam)){ allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]); }else{ break; @@ -165,12 +192,12 @@ public class SamParser { } - currentReadID = parts[0]; + currentReadID = adaptedName; // set up new rna node Rna newRna = new Rna(); - newRna.rnaID = parts[0]; + newRna.rnaID = adaptedName; newRna.isMulti = 0; newRna.hitNum = 1; newRna.assignedNum = 0; @@ -479,9 +506,22 @@ public class SamParser { do{ String[] partsReaSam = lineReaSam.split(" "); - if(partsReaSam[0].compareTo(currentRead.rnaID) > 0){ + String adaptedNameReaSam = ""; + + 
if(partsReaSam[0].contains(":")){ + String[] nameParts = partsReaSam[0].split(":"); + for(int i=0;i<nameParts.length;++i){ + adaptedNameReaSam += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors + } + + adaptedNameReaSam = adaptedNameReaSam.substring(0,(adaptedNameReaSam.length()-3)); + }else{ + adaptedNameReaSam = partsReaSam[0]; + } + + if(adaptedNameReaSam.compareTo(currentRead.rnaID) > 0){ break; // we exceeded this read, so stop - }else if(currentRead.rnaID.equals(partsReaSam[0])){ + }else if(currentRead.rnaID.equals(adaptedNameReaSam)){ allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]); } }while((lineReaSam = br.readLine()) != null); @@ -953,9 +993,23 @@ public class SamParser { Rna read; - if(seenReads.keySet().contains(parts[0])){ + String adaptedName = ""; + + if(parts[0].contains(":")){ + String[] nameParts = parts[0].split(":"); + for(int i=0;i<nameParts.length;++i){ + adaptedName += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors + } + + adaptedName = adaptedName.substring(0,(adaptedName.length()-3)); + }else{ + adaptedName = parts[0]; + } + + + if(seenReads.keySet().contains(adaptedName)){ - Vector<Object> temp = seenReads.get(parts[0]); + Vector<Object> temp = seenReads.get(adaptedName); if(((Integer)temp.get(0)) != 0){ @@ -973,7 +1027,7 @@ public class SamParser { temp.clear(); temp.add(0); - seenReads.put(parts[0],temp); + seenReads.put(adaptedName,temp); if(totalHitCount % 100000 == 0){ @@ -1005,7 +1059,7 @@ public class SamParser { interChromoTotalCount++; Vector<Object> temp = new Vector<Object>(); temp.add(0); - seenReads.put(parts[0],temp); + seenReads.put(adaptedName,temp); interChromoTotalCount++; break; } @@ -1049,7 +1103,7 @@ public class SamParser { Vector<Object> temp = new Vector<Object>(); temp.add(1); temp.add(read); - seenReads.put(parts[0],temp); + seenReads.put(adaptedName,temp); } } diff --git a/src/types/Rna.java b/src/types/Rna.java index 590f31d..8e8e123 100755 --- 
a/src/types/Rna.java +++ b/src/types/Rna.java @@ -17,7 +17,7 @@ public class Rna { public double quality; - public Vector<Object[]> contigsMappedOn = new Vector<Object[]>(); // contains several Arrays á: [contig, alignPos, cigarString, mapQual,spliceInfo,mismatchInfo,direcInfo] (one for each hit) + public Vector<Object[]> contigsMappedOn = new Vector<Object[]>(); // contains several Arrays ala: [contig, alignPos, cigarString, mapQual,spliceInfo,mismatchInfo,direcInfo] (one for each hit) public int isMulti; // indicator if this read is an ambiguous read -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/giira.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
