This is an automated email from the git hooks/post-receive script. malex-guest pushed a commit to branch master in repository gmap.
commit e3408d5a15b373f1241d461a0ab57b5a631fcf7d Author: Alexandre Mestiashvili <[email protected]> Date: Fri Mar 18 09:47:27 2016 +0100 Imported Upstream version 2015-12-31.v9 --- ChangeLog | 15 ++ src/gsnap.c | 20 +- src/pair.c | 60 ++++-- src/sam_sort.c | 5 +- src/samread.c | 457 +++++++++++++++++++++++++------------------- src/stage1hr.c | 18 +- src/stage1hr.h | 4 +- src/stage2.c | 37 +++- src/uniqscan.c | 4 +- util/gff3_genes.pl.in | 57 ++++-- util/gff3_introns.pl.in | 112 ++++++++--- util/gff3_splicesites.pl.in | 145 +++++++++----- 12 files changed, 615 insertions(+), 319 deletions(-) diff --git a/ChangeLog b/ChangeLog index 37cee49..af65f81 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2016-03-17 twu + + * gff3_genes.pl.in, gff3_introns.pl.in, gff3_splicesites.pl.in: Merged + revision 186095 from trunk to handle parsing of latest NCBI gff3 files + + * pair.c, stage2.c: Changed occurrences of abs() to explicit conditional + statements, since abs() can give large integers with -m64 compiler flag + + * gsnap.c, stage1hr.c, stage1hr.h, uniqscan.c: Added option --max-anchors + + * samread.c: Made version consistent with trunk, adding breaks after cases + and handling EOF. Added debugging statements. + + * sam_sort.c: Added todo comment + 2016-02-18 twu * stage3hr.c: Applied revision 184528 from trunk to check for stage2pairs diff --git a/src/gsnap.c b/src/gsnap.c index bf91e19..add5511 100644 --- a/src/gsnap.c +++ b/src/gsnap.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: gsnap.c 181923 2016-01-08 00:43:56Z twu $"; +static char rcsid[] = "$Id: gsnap.c 186090 2016-03-17 22:21:17Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -330,6 +330,9 @@ static Width_T required_index1interval = 0; static Width_T spansize; static int indexdb_size_threshold; +/* Option for GSNAP complete set algorithm. Affects speed significantly. */ +static int max_anchors = 10; + /* Genes IIT */ static char *genes_file = (char *) NULL; @@ -555,6 +558,7 @@ static struct option long_options[] = { {"runlengthdir", required_argument, 0, 0}, /* user_runlengthdir */ {"use-runlength", required_argument, 0, 0}, /* runlength_root */ + {"max-anchors", required_argument, 0, 0}, /* max_anchors */ {"gmap-mode", required_argument, 0, 0}, /* gmap_mode */ {"trigger-score-for-gmap", required_argument, 0, 0}, /* trigger_score_for_gmap */ {"gmap-min-match-length", required_argument, 0, 0}, /* gmap_min_nconsecutive */ @@ -1728,6 +1732,9 @@ parse_command_line (int argc, char *argv[], int optind) { } else if (!strcmp(long_name,"use-runlength")) { runlength_root = optarg; + } else if (!strcmp(long_name,"max-anchors")) { + max_anchors = atoi(check_valid_int(optarg)); + } else if (!strcmp(long_name,"gmap-mode")) { gmap_mode = 0; /* Initialize */ string = strtok(optarg,","); @@ -3230,7 +3237,7 @@ worker_setup (char *genomesubdir, char *fileroot) { /*snpp*/snps_iit ? true : false,amb_closest_p,amb_clip_p,min_shortend); Splice_setup(min_shortend); Indel_setup(min_indel_end_matches,indel_penalty_middle); - Stage1hr_setup(use_sarray_p,use_only_sarray_p,index1part,index1interval,spansize,chromosome_iit,nchromosomes, + Stage1hr_setup(use_sarray_p,use_only_sarray_p,index1part,index1interval,spansize,max_anchors,chromosome_iit,nchromosomes, genomecomp,genomecomp_alt,mode,maxpaths_search, splicesites,splicetypes,splicedists,nsplicesites, novelsplicingp,knownsplicingp,find_dna_chimeras_p,distances_observed_p, @@ -4120,6 +4127,15 @@ is still designed to be fast.\n\ "); #endif + fprintf(stdout,"\ + --max-anchors=INT Controls number of candidate segments returned by the complete set algorithm\n\ + Default is 10. Can be increased to higher values to solve alignments with\n\ + evenly spaced mismatches at close distances. However, higher values will\n\ + cause GSNAP to run more slowly. A value of 1000, for example, slows down\n\ + the program by a factor of 10 or so. Therefore, change this value only if\n\ + absolutely necessary.\n\ +"); + fprintf(stdout,"\n"); fprintf(stdout,"Options for GMAP alignment within GSNAP\n"); fprintf(stdout,"\ diff --git a/src/pair.c b/src/pair.c index 3fd1649..56e3894 100644 --- a/src/pair.c +++ b/src/pair.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: pair.c 179365 2015-11-20 22:15:56Z twu $"; +static char rcsid[] = "$Id: pair.c 186092 2016-03-17 22:28:37Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -853,7 +853,13 @@ add_intronlengths (struct T *pairs, int npairs) { } if (comp == DUALBREAK_COMP || comp == EXTRAEXON_COMP) { - sprintf(cdnabreak,"%d",abs(this->querypos - last_querypos)-1); + /* abs() gives a large value when flag -m64 is specified */ + /* sprintf(cdnabreak,"%d",abs(this->querypos - last_querypos)-1); */ + if (this->querypos > last_querypos) { + sprintf(cdnabreak,"%d",(this->querypos - last_querypos) - 1); + } else { + sprintf(cdnabreak,"%d",(last_querypos - this->querypos) - 1); + } if (this->genomepos < last_genomepos) { sprintf(genomicbreak,"%d",last_genomepos - this->genomepos - 1); } else { @@ -2561,7 +2567,7 @@ print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int path int pctidentity, num = 0, den = 0, exonno = 0, i; int Mlength = 0, Ilength = 0, Dlength = 0; List_T tokens = NULL; - char token[10]; + char token[11]; #if 0 int intronno = 0; #endif @@ -2631,7 +2637,14 @@ print_gff3_exons_forward (Filestring_T fp, struct T *pairs, int npairs, int path } if (gff_estmatch_format_p == true && i > 0) { - sprintf(token,"N%u",abs(intron_end - intron_start) + 1); + /* abs() gives a large value when flag -m64 is specified */ + /* sprintf(token,"N%u",abs(intron_end - intron_start) + 1); */ + if (intron_end > intron_start) { + sprintf(token,"N%u",(intron_end - intron_start) + 1); + } else { + sprintf(token,"N%u",(intron_start - intron_end) + 1); + } + tokens = push_token(tokens,token); } else if (gff_introns_p == true) { if (i > 0) { @@ -5137,7 +5150,7 @@ hardclip_pairs (int *clipped_npairs, int hardclip_start, int hardclip_end, List_T Pair_clean_cigar (List_T tokens, bool watsonp) { List_T clean, unique = NULL, p; - char token[10], *curr_token, *last_token; + char token[11], *curr_token, *last_token; int length = 0; char type, last_type = ' '; bool duplicatep = false; @@ -5222,7 +5235,7 @@ List_T Pair_compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struct T *pairs, int npairs, int querylength_given, bool watsonp, int sensedir, int chimera_part) { List_T tokens = NULL; - char token[10]; + char token[11]; int Mlength = 0, Ilength = 0, Dlength = 0; bool in_exon = false, deletionp; struct T *ptr, *prev, *this = NULL; @@ -5322,7 +5335,13 @@ Pair_compute_cigar (bool *intronp, int *hardclip_start, int *hardclip_end, struc if (prev != NULL) { /* Gap */ - genome_gap = abs(intron_end - intron_start) + 1; + /* abs() gives a large value when flag -m64 is specified */ + /* genome_gap = abs(intron_end - intron_start) + 1; */ + if (intron_end > intron_start) { + genome_gap = (intron_end - intron_start) + 1; + } else { + genome_gap = (intron_start - intron_end) + 1; + } deletionp = false; #ifdef CONVERT_INTRONS_TO_DELETIONS @@ -5843,7 +5862,7 @@ state_print (MD_state_T state) { static List_T compute_md_string_old (int *nmismatches, struct T *pairs, int npairs, bool watsonp) { List_T tokens = NULL; - char token[10], *first_token; + char token[11], *first_token; int nmatches = 0; struct T *ptr, *prev, *this = NULL; MD_state_T state = IN_MISMATCHES; @@ -6016,7 +6035,7 @@ static List_T compute_md_string (int *nmismatches_refdiff, int *nmismatches_bothdiff, int *nindels, struct T *pairs, int npairs, bool watsonp, List_T cigar_tokens) { List_T md_tokens = NULL, p; - char *cigar_token, token[10], *first_token, type; + char *cigar_token, token[11], *first_token, type; Pair_T this; int nmatches = 0, length; MD_state_T state = IN_MISMATCHES; @@ -6508,7 +6527,12 @@ count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tSta nblocks++; block_queryend = last_querypos; debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1)); - *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); + /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */ + if (block_queryend > block_querystart) { + *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1); + } else { + *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1); + } in_block = false; } } else if (this->comp == INTRONGAP_COMP) { @@ -6519,7 +6543,12 @@ count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tSta nblocks++; block_queryend = last_querypos; debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1)); - *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); + /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */ + if (block_queryend > block_querystart) { + *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1); + } else { + *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1); + } in_block = false; } @@ -6555,7 +6584,12 @@ count_psl_blocks_nt (Intlist_T *blockSizes, Intlist_T *qStarts, Uintlist_T *tSta nblocks++; block_queryend = last_querypos; debug2(FPRINTF(fp,"Block size: %d\n",abs(block_queryend-block_querystart)+1)); - *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); + /* *blockSizes = Intlist_push(*blockSizes,abs(block_queryend-block_querystart)+1); */ + if (block_queryend > block_querystart) { + *blockSizes = Intlist_push(*blockSizes,(block_queryend-block_querystart)+1); + } else { + *blockSizes = Intlist_push(*blockSizes,(block_querystart-block_queryend)+1); + } } *blockSizes = Intlist_reverse(*blockSizes); @@ -8140,7 +8174,7 @@ Pair_print_compressed (Filestring_T fp, int pathnum, int npaths, T start, T end, Chrpos_T exon_genomestart = -1, exon_genomeend, intron_start, intron_end; int num = 0, den = 0, runlength = 0, i; int print_dinucleotide_p; - char token[10], donor[3], acceptor[3], *chr; + char token[11], donor[3], acceptor[3], *chr; double coverage; /* double trimmed_coverage; */ int last_querypos = -1; diff --git a/src/sam_sort.c b/src/sam_sort.c index 9438ad7..a49097b 100644 --- a/src/sam_sort.c +++ b/src/sam_sort.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: sam_sort.c 155408 2014-12-16 07:00:44Z twu $"; +static char rcsid[] = "$Id: sam_sort.c 186087 2016-03-17 22:12:51Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -27,6 +27,9 @@ static char rcsid[] = "$Id: sam_sort.c 155408 2014-12-16 07:00:44Z twu $"; #include "getopt.h" +/* To do: Check of reverse complements of paired-reads work properly. + May want to use low/high instead of first/second read */ + /************************************************************************ * * Check for correctness: diff --git a/src/samread.c b/src/samread.c index 440a380..a3253b0 100644 --- a/src/samread.c +++ b/src/samread.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: samread.c 154570 2014-12-03 22:06:33Z twu $"; +static char rcsid[] = "$Id: samread.c 186088 2016-03-17 22:13:46Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -25,6 +25,13 @@ static char rcsid[] = "$Id: samread.c 154570 2014-12-03 22:06:33Z twu $"; #define debug(x) #endif +/* Prints location of parsing error for Unexpected output type */ +#ifdef DEBUG1 +#define debug1(x) x +#else +#define debug1(x) +#endif + static int cigar_string_readlength (int *hardclip_low, int *hardclip_high, char *cigar) { @@ -149,10 +156,10 @@ Samread_get_acc (unsigned int *flag, char *line) { /* Takes advantage of the fact that sam_sort knows the linelength */ char * Samread_get_acc_fromfile (int *acclength, FILE *fp, int linelength) { - char *acc, *p; + char *acc, *p, c; p = acc = MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* terminating char */ *acclength = (p - acc)/sizeof(char); @@ -179,9 +186,11 @@ parse_XO_fromfile (FILE *fp) { if (abbrev1 == 'M') { return OUTPUT_NM; } else { + debug1(fprintf(stderr,"parse_XO_from_file 1: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; } + break; case 'C': switch (abbrev1) { case 'U': return OUTPUT_CU; @@ -189,8 +198,11 @@ parse_XO_fromfile (FILE *fp) { case 'T': return OUTPUT_CT; case 'M': return OUTPUT_CM; case 'X': return OUTPUT_CX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + default: + debug1(fprintf(stderr,"parse_XO_from_file 2: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; } + break; case 'H': switch (abbrev1) { case 'U': return OUTPUT_HU; @@ -198,8 +210,11 @@ parse_XO_fromfile (FILE *fp) { case 'T': return OUTPUT_HT; case 'M': return OUTPUT_HM; case 'X': return OUTPUT_HX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + default: + debug1(fprintf(stderr,"parse_XO_from_file 3: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; } + break; case 'U': switch (abbrev1) { case 'U': return OUTPUT_UU; @@ -207,8 +222,11 @@ parse_XO_fromfile (FILE *fp) { case 'T': return OUTPUT_UT; case 'M': return OUTPUT_UM; case 'X': return OUTPUT_UX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + default: + debug1(fprintf(stderr,"parse_XO_from_file 4: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; } + break; case 'P': switch (abbrev1) { case 'C': return OUTPUT_PC; @@ -217,72 +235,91 @@ parse_XO_fromfile (FILE *fp) { case 'L': return OUTPUT_PL; case 'M': return OUTPUT_PM; case 'X': return OUTPUT_PX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + default: + debug1(fprintf(stderr,"parse_XO_from_file 5: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; } - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + break; + default: + debug1(fprintf(stderr,"parse_XO_from_file 6: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; } } - while (c != '\0') { - while ((c = fgetc(fp)) != '\0' && c != '\t') ; - if (c == '\0') { - return OUTPUT_NONE; - } else { - c0 = fgetc(fp); - c1 = fgetc(fp); - if (c0 == 'X' && c1 == 'O') { - fgetc(fp); /* : */ - fgetc(fp); /* type */ - fgetc(fp); /* : */ - abbrev0 = fgetc(fp); - abbrev1 = fgetc(fp); - switch (abbrev0) { - case 'N': - if (abbrev1 == 'M') { - return OUTPUT_NM; - } else { - fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); - return OUTPUT_NONE; - } - case 'C': - switch (abbrev1) { - case 'U': return OUTPUT_CU; - case 'C': return OUTPUT_CC; - case 'T': return OUTPUT_CT; - case 'M': return OUTPUT_CM; - case 'X': return OUTPUT_CX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; - } - case 'H': - switch (abbrev1) { - case 'U': return OUTPUT_HU; - case 'C': return OUTPUT_HC; - case 'T': return OUTPUT_HT; - case 'M': return OUTPUT_HM; - case 'X': return OUTPUT_HX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; - } - case 'U': - switch (abbrev1) { - case 'U': return OUTPUT_UU; - case 'C': return OUTPUT_UC; - case 'T': return OUTPUT_UT; - case 'M': return OUTPUT_UM; - case 'X': return OUTPUT_UX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; - } - case 'P': - switch (abbrev1) { - case 'C': return OUTPUT_PC; - case 'I': return OUTPUT_PI; - case 'S': return OUTPUT_PS; - case 'L': return OUTPUT_PL; - case 'M': return OUTPUT_PM; - case 'X': return OUTPUT_PX; - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; - } - default: fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ; + if (c == EOF || c == '\0') { + return OUTPUT_NONE; + } else { + c0 = fgetc(fp); + c1 = fgetc(fp); + if (c0 == 'X' && c1 == 'O') { + fgetc(fp); /* : */ + fgetc(fp); /* type */ + fgetc(fp); /* : */ + abbrev0 = fgetc(fp); + abbrev1 = fgetc(fp); + switch (abbrev0) { + case 'N': + if (abbrev1 == 'M') { + return OUTPUT_NM; + } else { + debug1(fprintf(stderr,"parse_XO_from_file 7: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); + return OUTPUT_NONE; } + break; + case 'C': + switch (abbrev1) { + case 'U': return OUTPUT_CU; + case 'C': return OUTPUT_CC; + case 'T': return OUTPUT_CT; + case 'M': return OUTPUT_CM; + case 'X': return OUTPUT_CX; + default: + debug1(fprintf(stderr,"parse_XO_from_file 8: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + } + break; + case 'H': + switch (abbrev1) { + case 'U': return OUTPUT_HU; + case 'C': return OUTPUT_HC; + case 'T': return OUTPUT_HT; + case 'M': return OUTPUT_HM; + case 'X': return OUTPUT_HX; + default: + debug1(fprintf(stderr,"parse_XO_from_file 9: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + } + break; + case 'U': + switch (abbrev1) { + case 'U': return OUTPUT_UU; + case 'C': return OUTPUT_UC; + case 'T': return OUTPUT_UT; + case 'M': return OUTPUT_UM; + case 'X': return OUTPUT_UX; + default: + debug1(fprintf(stderr,"parse_XO_from_file 10: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + } + break; + case 'P': + switch (abbrev1) { + case 'C': return OUTPUT_PC; + case 'I': return OUTPUT_PI; + case 'S': return OUTPUT_PS; + case 'L': return OUTPUT_PL; + case 'M': return OUTPUT_PM; + case 'X': return OUTPUT_PX; + default: + debug1(fprintf(stderr,"parse_XO_from_file 11: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; + } + break; + default: + debug1(fprintf(stderr,"parse_XO_from_file 12: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); return OUTPUT_NONE; } } } @@ -310,7 +347,7 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) { fgetc(fp); /* : */ p = *hiti; - while ((c = *p++ = fgetc(fp)) != '\0' && c != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ; *--p = '\0'; /* terminating char */ } else if (c0 == 'X' && c1 == 'O') { @@ -324,9 +361,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) { if (abbrev1 == 'M') { split_output = OUTPUT_NM; } else { + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 1: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); split_output = OUTPUT_NONE; } + break; case 'C': switch (abbrev1) { case 'U': split_output = OUTPUT_CU; break; @@ -335,9 +374,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) { case 'M': split_output = OUTPUT_CM; break; case 'X': split_output = OUTPUT_CX; break; default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 2: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); split_output = OUTPUT_NONE; } + break; case 'H': switch (abbrev1) { case 'U': split_output = OUTPUT_HU; break; @@ -346,9 +387,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) { case 'M': split_output = OUTPUT_HM; break; case 'X': split_output = OUTPUT_HX; break; default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 3: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); split_output = OUTPUT_NONE; } + break; case 'U': switch (abbrev1) { case 'U': split_output = OUTPUT_UU; break; @@ -357,9 +400,11 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) { case 'M': split_output = OUTPUT_UM; break; case 'X': split_output = OUTPUT_UX; break; default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 4: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); split_output = OUTPUT_NONE; } + break; case 'P': switch (abbrev1) { case 'C': split_output = OUTPUT_PC; break; @@ -369,94 +414,106 @@ parse_XO_and_HI_fromfile (char **hiti, FILE *fp) { case 'M': split_output = OUTPUT_PM; break; case 'X': split_output = OUTPUT_PX; break; default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 5: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); split_output = OUTPUT_NONE; } + break; default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 6: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); split_output = OUTPUT_NONE; } } - while (c != '\0') { - while ((c = fgetc(fp)) != '\0' && c != '\t') ; - if (c == '\0') { - return split_output; - } else { - c0 = fgetc(fp); - c1 = fgetc(fp); - if (c0 == 'H' && c1 == 'I') { - fgetc(fp); /* : */ - fgetc(fp); /* type */ - fgetc(fp); /* : */ - - p = *hiti; - while ((c = *p++ = fgetc(fp)) != '\0' && c != '\t') ; - *--p = '\0'; /* terminating char */ - - } else if (c0 == 'X' && c1 == 'O') { - fgetc(fp); /* : */ - fgetc(fp); /* type */ - fgetc(fp); /* : */ - abbrev0 = fgetc(fp); - abbrev1 = fgetc(fp); - switch (abbrev0) { - case 'N': - if (abbrev1 == 'M') { - split_output = OUTPUT_NM; - } else { - fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); - split_output = OUTPUT_NONE; - } - case 'C': - switch (abbrev1) { - case 'U': split_output = OUTPUT_CU; break; - case 'C': split_output = OUTPUT_CC; break; - case 'T': split_output = OUTPUT_CT; break; - case 'M': split_output = OUTPUT_CM; break; - case 'X': split_output = OUTPUT_CX; break; - default: - fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); - split_output = OUTPUT_NONE; - } - case 'H': - switch (abbrev1) { - case 'U': split_output = OUTPUT_HU; break; - case 'C': split_output = OUTPUT_HC; break; - case 'T': split_output = OUTPUT_HT; break; - case 'M': split_output = OUTPUT_HM; break; - case 'X': split_output = OUTPUT_HX; break; - default: - fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); - split_output = OUTPUT_NONE; - } - case 'U': - switch (abbrev1) { - case 'U': split_output = OUTPUT_UU; break; - case 'C': split_output = OUTPUT_UC; break; - case 'T': split_output = OUTPUT_UT; break; - case 'M': split_output = OUTPUT_UM; break; - case 'X': split_output = OUTPUT_UX; break; - default: - fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); - split_output = OUTPUT_NONE; - } - case 'P': - switch (abbrev1) { - case 'C': split_output = OUTPUT_PC; break; - case 'I': split_output = OUTPUT_PI; break; - case 'S': split_output = OUTPUT_PS; break; - case 'L': split_output = OUTPUT_PL; break; - case 'M': split_output = OUTPUT_PM; break; - case 'X': split_output = OUTPUT_PX; break; - default: - fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); - split_output = OUTPUT_NONE; - } + while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ; + if (c == EOF || c == '\0') { + return split_output; + } else { + c0 = fgetc(fp); + c1 = fgetc(fp); + if (c0 == 'H' && c1 == 'I') { + fgetc(fp); /* : */ + fgetc(fp); /* type */ + fgetc(fp); /* : */ + + p = *hiti; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ; + *--p = '\0'; /* terminating char */ + + } else if (c0 == 'X' && c1 == 'O') { + fgetc(fp); /* : */ + fgetc(fp); /* type */ + fgetc(fp); /* : */ + abbrev0 = fgetc(fp); + abbrev1 = fgetc(fp); + switch (abbrev0) { + case 'N': + if (abbrev1 == 'M') { + split_output = OUTPUT_NM; + } else { + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 7: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); + split_output = OUTPUT_NONE; + } + break; + case 'C': + switch (abbrev1) { + case 'U': split_output = OUTPUT_CU; break; + case 'C': split_output = OUTPUT_CC; break; + case 'T': split_output = OUTPUT_CT; break; + case 'M': split_output = OUTPUT_CM; break; + case 'X': split_output = OUTPUT_CX; break; + default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 8: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); + split_output = OUTPUT_NONE; + } + break; + case 'H': + switch (abbrev1) { + case 'U': split_output = OUTPUT_HU; break; + case 'C': split_output = OUTPUT_HC; break; + case 'T': split_output = OUTPUT_HT; break; + case 'M': split_output = OUTPUT_HM; break; + case 'X': split_output = OUTPUT_HX; break; default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 9: ")); fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); split_output = OUTPUT_NONE; } + break; + case 'U': + switch (abbrev1) { + case 'U': split_output = OUTPUT_UU; break; + case 'C': split_output = OUTPUT_UC; break; + case 'T': split_output = OUTPUT_UT; break; + case 'M': split_output = OUTPUT_UM; break; + case 'X': split_output = OUTPUT_UX; break; + default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 10: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); + split_output = OUTPUT_NONE; + } + break; + case 'P': + switch (abbrev1) { + case 'C': split_output = OUTPUT_PC; break; + case 'I': split_output = OUTPUT_PI; break; + case 'S': split_output = OUTPUT_PS; break; + case 'L': split_output = OUTPUT_PL; break; + case 'M': split_output = OUTPUT_PM; break; + case 'X': split_output = OUTPUT_PX; break; + default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 11: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); + split_output = OUTPUT_NONE; + } + break; + default: + debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 12: ")); + fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1); + split_output = OUTPUT_NONE; } } } @@ -471,7 +528,7 @@ char * Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM_split_output_type *split_output, char **hiti, Univcoord_T *genomicpos, int *initial_softclip, bool *query_lowp, FILE *fp, Univ_IIT_T chromosome_iit, Univcoord_T *chroffsets, int linelength) { - char *acc, *p; + char *acc, *p, c; char *substring; Chrnum_T chrnum, mate_chrnum; Chrpos_T chrpos, mate_chrpos; @@ -481,13 +538,13 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM /* 1. QNAME */ p = acc = MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* terminating char */ *acclength = (p - acc)/sizeof(char); /* 2. FLAG. Skip */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&(*flag)) != 1) { @@ -499,13 +556,15 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM /* 3. RNAME: chr. Skip */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (!strcmp(substring,"*")) { *genomicpos = 0; } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) { fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring); + fprintf(stderr,"Available chromosomes in the genome: "); + Univ_IIT_dump_labels(stderr,chromosome_iit); exit(9); } else { *genomicpos = chroffsets[chrnum - 1]; @@ -513,7 +572,7 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM /* 4. POS: chrpos */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&chrpos) != 1) { @@ -524,11 +583,11 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM } /* 5. MAPQ: Mapping quality. Skip */ - while (fgetc(fp) != '\t') ; + while ((c = fgetc(fp)) != EOF && c != '\t') ; /* 6. CIGAR. Parse for initial_softclip */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; *initial_softclip = cigar_string_initial_softclip(substring); @@ -536,7 +595,7 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM /* 7. MRNM: Mate chr */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (!strcmp(substring,"*")) { @@ -546,6 +605,8 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM mate_genomicpos = chroffsets[mate_chrnum - 1]; } else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) { fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring); + fprintf(stderr,"Available chromosomes in the genome: "); + Univ_IIT_dump_labels(stderr,chromosome_iit); exit(9); } else { mate_genomicpos = chroffsets[mate_chrnum - 1]; @@ -553,7 +614,7 @@ Samread_parse_acc_and_softclip_fromfile (int *acclength, unsigned int *flag, SAM /* 8. MPOS: Mate chrpos */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%d",&mate_chrpos) != 1) { @@ -779,7 +840,7 @@ void Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq, char **chr, Chrpos_T *chrpos, char **cigar, char **mate_chr, Chrpos_T *mate_chrpos_low, int *readlength, char **read, char **quality_string, int linelength) { - char *p; + char *p, c; int length, i; char *substring; @@ -789,12 +850,12 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq /* 1. QNAME */ p = *acc = (char *) MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* 2. FLAG */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&(*flag)) != 1) { @@ -806,12 +867,12 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq /* 3. RNAME: chr */ p = *chr = (char *) MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* 4. POS: chrpos */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&(*chrpos)) != 1) { @@ -823,7 +884,7 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq /* 5. MAPQ: Mapping quality */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%d",&(*mapq)) != 1) { @@ -835,17 +896,17 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq /* 5. CIGAR */ p = *cigar = (char *) MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* 7. MRNM: Mate chr */ p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* 8. MPOS: Mate chrpos */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%d",&(*mate_chrpos_low)) != 1) { @@ -860,13 +921,13 @@ Samread_parse_line_fromfile (FILE *fp, char **acc, unsigned int *flag, int *mapq /* 10. SEQ: queryseq */ p = *read = (char *) MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; *readlength = (p - *read)/sizeof(char); /* 11. QUAL: quality scores */ p = *quality_string = (char *) MALLOC((*readlength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; length = (p - *quality_string)/sizeof(char); @@ -914,7 +975,7 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu Univcoord_T genomicpos; Chrnum_T chrnum; Chrpos_T chrpos; - char *substring, *p; + char *substring, *p, c; substring = MALLOCA((linelength + 1) * sizeof(char)); @@ -923,7 +984,7 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu /* 2. FLAG */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&(*flag)) != 1) { @@ -935,13 +996,15 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu /* 3. RNAME: chr */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (!strcmp(substring,"*")) { genomicpos = 0; } else if ((chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) { fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring); + fprintf(stderr,"Available chromosomes in the genome: "); + Univ_IIT_dump_labels(stderr,chromosome_iit); exit(9); } else { genomicpos = chroffsets[chrnum - 1]; @@ -949,7 +1012,7 @@ Samread_parse_genomicpos_fromfile (FILE *fp, unsigned int *flag, SAM_split_outpu /* 4. POS: chrpos */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&chrpos) != 1) { @@ -992,22 +1055,20 @@ char * Samread_parse_aux_fromfile (FILE *fp, char *auxfield, int linelength) { char *value, *p, c = 1, c0, c1; - while (c != '\0') { - while ((c = fgetc(fp)) != '\0' && c != '\t') ; - if (c == '\0') { - return (char *) NULL; - } else { - c0 = fgetc(fp); - c1 = fgetc(fp); - if (c0 == auxfield[0] && c1 == auxfield[1]) { - fgetc(fp); /* : */ - fgetc(fp); /* type */ - fgetc(fp); /* : */ - p = value = MALLOC((linelength+1) * sizeof(char)); - while ((c = *p++ = fgetc(fp)) != '\0' && c != '\t') ; - *--p = '\0'; /* terminating char */ - return value; - } + while ((c = fgetc(fp)) != EOF && c != '\0' && c != '\t') ; + if (c == '\0') { + return (char *) NULL; + } else { + c0 = fgetc(fp); + c1 = fgetc(fp); + if (c0 == auxfield[0] && c1 == auxfield[1]) { + fgetc(fp); /* : */ + fgetc(fp); /* type */ + fgetc(fp); /* : */ + p = value = MALLOC((linelength+1) * sizeof(char)); + while ((c = *p++ = fgetc(fp)) != EOF && c != '\0' && c != '\t') ; + *--p = '\0'; /* terminating char */ + return value; } } @@ -1032,7 +1093,7 @@ Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **m /* 2. FLAG */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&(*flag)) != 1) { @@ -1053,7 +1114,7 @@ Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **m /* 6. CIGAR. Parse for cigar_readlength. */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* For a nomapper, this readlength is incorrect */ @@ -1062,12 +1123,12 @@ Samread_parse_read_and_mateinfo_fromfile (FILE *fp, unsigned int *flag, char **m /* 7. MRNM: Mate chr */ p = *mate_chr = (char *) MALLOC((linelength + 1) * sizeof(char)); - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* 8. MPOS: Mate chrpos */ w p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%d",&(*mate_chrpos)) != 1) { @@ -1085,7 +1146,7 @@ w p = substring; p = &((*read)[hardclip_low]); subseq_length = 0; - while ((*p++ = fgetc(fp)) != '\t') { + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') { subseq_length++; } @@ -1143,7 +1204,7 @@ Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char /* 2. FLAG. Skip */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%u",&(*flag)) != 1) { @@ -1164,7 +1225,7 @@ Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char /* 6. CIGAR. Parse for cigar_readlength. */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; /* For a nomapper, this readlength is incorrect */ @@ -1185,7 +1246,7 @@ Samread_parse_read_fromfile (FILE *fp, unsigned int *flag, int *readlength, char p = &((*read)[hardclip_low]); subseq_length = 0; - while ((*p++ = fgetc(fp)) != '\t') { + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') { subseq_length++; } @@ -1235,7 +1296,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni Univcoord_T mate_genomicpos; Chrpos_T mate_chrpos; Chrnum_T chrnum, mate_chrnum; - char *p; + char *p, c; char *substring; substring = MALLOCA((linelength + 1) * sizeof(char)); @@ -1248,7 +1309,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni /* 3. RNAME: chr */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (!strcmp(substring,"*")) { @@ -1268,7 +1329,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni /* 7. MRNM: Mate chr */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (!strcmp(substring,"*")) { @@ -1278,6 +1339,8 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni mate_genomicpos = chroffsets[mate_chrnum - 1]; } else if ((mate_chrnum = Univ_IIT_find_one(chromosome_iit,substring)) < 0) { fprintf(stderr,"Cannot find chromosome %s in chromosome IIT file\n",substring); + fprintf(stderr,"Available chromosomes in the genome: "); + Univ_IIT_dump_labels(stderr,chromosome_iit); exit(9); } else { mate_genomicpos = chroffsets[mate_chrnum - 1]; @@ -1285,7 +1348,7 @@ Samread_parse_mate_genomicpos_fromfile (FILE *fp, Univ_IIT_T chromosome_iit, Uni /* 8. MPOS: Mate chrpos */ p = substring; - while ((*p++ = fgetc(fp)) != '\t') ; + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') ; *--p = '\0'; if (sscanf(substring,"%d",&mate_chrpos) != 1) { @@ -1406,7 +1469,7 @@ Samread_print_as_duplicate_fromfile (FILE *fp, int linelength) { unsigned int flag; /* 1. QNAME */ - while ((c = fgetc(fp)) != '\t') { + while ((c = fgetc(fp)) != EOF && c != '\t') { putchar(c); nread++; } @@ -1415,7 +1478,7 @@ Samread_print_as_duplicate_fromfile (FILE *fp, int linelength) { /* 2. FLAG */ p = buffer; - while ((*p++ = fgetc(fp)) != '\t') { + while ((c = *p++ = fgetc(fp)) != EOF && c != '\t') { nread++; } nread++; diff --git a/src/stage1hr.c b/src/stage1hr.c index bf6a34f..c5dfbeb 100644 --- a/src/stage1hr.c +++ b/src/stage1hr.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: stage1hr.c 182407 2016-01-15 17:41:06Z twu $"; +static char rcsid[] = "$Id: stage1hr.c 186090 2016-03-17 22:21:17Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -96,7 +96,6 @@ static char rcsid[] = "$Id: stage1hr.c 182407 2016-01-15 17:41:06Z twu $"; #define MAX_NTERMINALS 100 #define MAX_ALLOCATION 200 -#define MAX_ANCHORS 1000 static bool use_sarray_p = true; static bool use_only_sarray_p = true; @@ -188,6 +187,8 @@ static int minendexon = 9; static int index1part; static int index1interval; static int spansize; +static int max_anchors; + static int two_index1intervals; static int min_kmer_readlength; static Univ_IIT_T chromosome_iit; @@ -4487,12 +4488,12 @@ identify_all_segments (int *nsegments, List_T *anchor_segments, Segment_T **spli debug(printf("%d all segments\n",List_length(all_segments))); debug(printf("%d anchor segments\n",List_length(*anchor_segments))); - if (List_length(all_segments) <= MAX_ANCHORS) { + if (List_length(all_segments) <= max_anchors) { /* Might as well use all segments */ List_free(&(*anchor_segments)); *anchor_segments = List_reverse(all_segments); - } else if (List_length(*anchor_segments) <= MAX_ANCHORS) { + } else if (List_length(*anchor_segments) <= max_anchors) { /* Use only the good anchor segments */ List_free(&all_segments); *anchor_segments = List_reverse(*anchor_segments); @@ -4506,9 +4507,9 @@ identify_all_segments (int *nsegments, List_T *anchor_segments, Segment_T **spli List_free(&(*anchor_segments)); *anchor_segments = (List_T) NULL; - length_threshold = array[MAX_ANCHORS]->querypos3 - array[MAX_ANCHORS]->querypos5; - n = MAX_ANCHORS; - while (n < nanchors && n < MAX_ANCHORS + /*ties*/100 && array[n]->querypos3 - array[n]->querypos5 == length_threshold) { + length_threshold = array[max_anchors]->querypos3 - array[max_anchors]->querypos5; + n = max_anchors; + while (n < nanchors && n < max_anchors + /*ties*/100 && array[n]->querypos3 - array[n]->querypos5 == length_threshold) { n++; } @@ -21081,7 +21082,7 @@ Stage1hr_cleanup () { void Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_in, int index1interval_in, - int spansize_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in, + int spansize_in, int max_anchors_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in, Genome_T genome_in, Genome_T genomealt, Mode_T mode_in, int maxpaths_search_in, Univcoord_T *splicesites_in, Splicetype_T *splicetypes_in, @@ -21109,6 +21110,7 @@ Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_ index1interval = index1interval_in; two_index1intervals = index1interval_in + index1interval_in; spansize = spansize_in; + max_anchors = max_anchors_in; min_kmer_readlength = index1part_in + index1interval_in - 1; chromosome_iit = chromosome_iit_in; diff --git a/src/stage1hr.h b/src/stage1hr.h index 84ba163..5bd43f6 100644 --- a/src/stage1hr.h +++ b/src/stage1hr.h @@ -1,4 +1,4 @@ -/* $Id: stage1hr.h 173896 2015-09-12 00:11:40Z twu $ */ +/* $Id: stage1hr.h 186090 2016-03-17 22:21:17Z twu $ */ #ifndef STAGE1HR_INCLUDED #define STAGE1HR_INCLUDED @@ -89,7 +89,7 @@ Stage1hr_cleanup (); extern void Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_in, int index1interval_in, - int spansize_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in, + int spansize_in, int max_anchors_in, Univ_IIT_T chromosome_iit_in, int nchromosomes_in, Genome_T genome_in, Genome_T genomealt, Mode_T mode_in, int maxpaths_search_in, Univcoord_T *splicesites_in, Splicetype_T *splicetypes_in, diff --git a/src/stage2.c b/src/stage2.c index c0e0e63..5298112 100644 --- a/src/stage2.c +++ b/src/stage2.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: stage2.c 183727 2016-02-04 00:40:29Z twu $"; +static char rcsid[] = "$Id: stage2.c 186092 2016-03-17 22:28:37Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -684,7 +684,12 @@ while (prevhit != -1 && (prevposition = mappings[prev_querypos][prevhit]) + inde prevlink = &(links[prev_querypos][prevhit]); gendistance = position - prevposition - indexsize_nt; - diffdistance = abs(gendistance - querydistance); + /* diffdistance = abs(gendistance - querydistance); */ + if (gendistance > querydistance) { + diffdistance = gendistance - querydistance; + } else { + diffdistance = querydistance - gendistance; + } if (diffdistance < maxintronlen) { if (diffdistance <= EQUAL_DISTANCE_NOT_SPLICING) { @@ -1054,7 +1059,12 @@ score_querypos_lookback_one ( prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]); gendistance = position - prevposition; - diffdistance = abs(gendistance - querydistance); + /* diffdistance = abs(gendistance - querydistance); */ + if (gendistance > querydistance) { + diffdistance = gendistance - querydistance; + } else { + diffdistance = querydistance - gendistance; + } #ifdef BAD_GMAX fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/; @@ -1557,7 +1567,12 @@ score_querypos_lookback_mult ( prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]); gendistance = position - prevposition; - diffdistance = abs(gendistance - querydistance); + /* diffdistance = abs(gendistance - querydistance); */ + if (gendistance > querydistance) { + diffdistance = gendistance - querydistance; + } else { + diffdistance = querydistance - gendistance; + } #ifdef BAD_GMAX fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/; @@ -1921,7 +1936,12 @@ score_querypos_lookforward_one ( prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]); gendistance = prevposition - position; - diffdistance = abs(gendistance - querydistance); + /* diffdistance = abs(gendistance - querydistance); */ + if (gendistance > querydistance) { + diffdistance = gendistance - querydistance; + } else { + diffdistance = querydistance - gendistance; + } #ifdef BAD_GMAX fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/; @@ -2419,7 +2439,12 @@ score_querypos_lookforward_mult ( prevlink = &(/*links[prev_querypos]*/prev_links[prevhit]); gendistance = prevposition - position; - diffdistance = abs(gendistance - querydistance); + /* diffdistance = abs(gendistance - querydistance); */ + if (gendistance > querydistance) { + diffdistance = gendistance - querydistance; + } else { + diffdistance = querydistance - gendistance; + } #ifdef BAD_GMAX fwd_score = prevlink->fwd_score + querydist_credit - (diffdistance/ONE + 1) /*- querydist_penalty*/; diff --git a/src/uniqscan.c b/src/uniqscan.c index 58c9cc6..77699cc 100644 --- a/src/uniqscan.c +++ b/src/uniqscan.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: uniqscan.c 173896 2015-09-12 00:11:40Z twu $"; +static char rcsid[] = "$Id: uniqscan.c 186090 2016-03-17 22:21:17Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -1260,7 +1260,7 @@ main (int argc, char *argv[]) { spansize = Spanningelt_setup(index1part,index1interval); Indel_setup(min_indel_end_matches,indel_penalty_middle); Stage1hr_setup(/*use_sarray_p*/false,/*use_only_sarray_p*/false,index1part,index1interval, - spansize,chromosome_iit,nchromosomes, + spansize,/*max_anchors*/10,chromosome_iit,nchromosomes, genome,genomealt,mode,/*maxpaths_search*/10, splicesites,splicetypes,splicedists,nsplicesites, novelsplicingp,knownsplicingp,/*find_dna_chimeras_p*/false,distances_observed_p, diff --git a/util/gff3_genes.pl.in b/util/gff3_genes.pl.in index eb0fe73..017c010 100644 --- a/util/gff3_genes.pl.in +++ b/util/gff3_genes.pl.in @@ -5,6 +5,8 @@ use warnings; $gene_name = ""; $last_transcript_id = ""; +$last_cds_id = ""; +$cds_transcript_id = ""; @exons = (); @CDS_regions = (); while (defined($line = <>)) { @@ -32,7 +34,10 @@ while (defined($line = <>)) { print "$gene_name\n"; print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand); - } elsif ($#CDS_regions >= 0) { + } + @exons = (); + + if ($#CDS_regions >= 0) { # Handle last mRNA of previous gene if ($strand eq "+") { ($start,$end) = get_gene_bounds_plus(\@CDS_regions); @@ -44,15 +49,14 @@ while (defined($line = <>)) { die "strand $strand"; } print "$gene_name\n"; - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand); + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand); } + @CDS_regions = (); ($gene_name) = $fields[8] =~ /ID=([^;]+)/; $chr = $fields[0]; - @exons = (); - @CDS_regions = (); - } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") { + } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "ncRNA") { if ($#exons >= 0) { if ($strand eq "+") { ($start,$end) = get_gene_bounds_plus(\@exons); @@ -66,7 +70,10 @@ while (defined($line = <>)) { print "$gene_name\n"; print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand); - } elsif ($#CDS_regions >= 0) { + } + @exons = (); + + if ($#CDS_regions >= 0) { if ($strand eq "+") { ($start,$end) = get_gene_bounds_plus(\@CDS_regions); printf ">$last_transcript_id $chr:%u..%u\n",$start,$end; @@ -77,8 +84,9 @@ while (defined($line = <>)) { die "strand $strand"; } print "$gene_name\n"; - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand); + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand); } + @CDS_regions = (); ($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/; $strand = $fields[6]; @@ -87,14 +95,34 @@ while (defined($line = <>)) { if (!defined($gene_name) || $gene_name !~ /\S/) { $gene_name = $last_transcript_id; } - $chr = $fields[0]; - @exons = (); - @CDS_regions = (); + if (!defined($chr) || $chr !~ /\S/) { + $chr = $fields[0]; + } } elsif ($fields[2] eq "exon") { - push @exons,"$fields[3] $fields[4]"; + ($parent) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($parent) || $parent eq $last_transcript_id) { + push @exons,"$fields[3] $fields[4]"; + } + } elsif ($fields[2] eq "CDS") { - push @CDS_regions,"$fields[3] $fields[4]"; + ($cds_id) = $fields[8] =~ /ID=([^;]+)/; + if (!defined($cds_id)) { + push @CDS_regions,"$fields[3] $fields[4]"; + } else { + if ($cds_id ne $last_cds_id) { + if ($#CDS_regions > 0) { + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand); + } + @CDS_regions = (); # Need to clear single CDS regions + } + push @CDS_regions,"$fields[3] $fields[4]"; + $last_cds_id = $cds_id; + } + ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($cds_transcript_id)) { + $cds_transcript_id = $last_transcript_id; + } } } } @@ -112,7 +140,9 @@ if ($#exons >= 0) { print "$gene_name\n"; print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand); -} elsif ($#CDS_regions >= 0) { +} +@exons = (); +if ($#CDS_regions >= 0) { if ($strand eq "+") { ($start,$end) = get_gene_bounds_plus(\@CDS_regions); printf ">$last_transcript_id $chr:%u..%u\n",$start,$end; @@ -125,6 +155,7 @@ if ($#exons >= 0) { print "$gene_name\n"; print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand); } +@CDS_regions = (); exit; diff --git a/util/gff3_introns.pl.in b/util/gff3_introns.pl.in index 73699e3..77acd71 100755 --- a/util/gff3_introns.pl.in +++ b/util/gff3_introns.pl.in @@ -28,6 +28,7 @@ if (defined($opt_d)) { $gene_name = ""; $last_transcript_id = ""; + $last_cds_id = ""; @exons = (); @CDS_regions = (); while (defined($line = <>)) { @@ -46,25 +47,29 @@ if (defined($opt_d)) { # Handle last mRNA of previous gene query_dinucleotides(\@exons,$chr,$strand,$FP); #print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { # Handle last mRNA of previous gene query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); } + @CDS_regions = (); # Need to clear single CDS regions ($gene_name) = $fields[8] =~ /ID=([^;]+)/; $chr = $fields[0]; - @exons = (); - @CDS_regions = (); - } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") { + } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") { if ($#exons > 0) { query_dinucleotides(\@exons,$chr,$strand,$FP); #print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); } + @CDS_regions = (); # Need to clear single CDS regions ($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/; $strand = $fields[6]; @@ -73,14 +78,34 @@ if (defined($opt_d)) { if (!defined($gene_name) || $gene_name !~ /\S/) { $gene_name = $last_transcript_id; } - $chr = $fields[0]; - @exons = (); - @CDS_regions = (); + if (!defined($chr) || $chr !~ /\S/) { + $chr = $fields[0]; + } } elsif ($fields[2] eq "exon") { - push @exons,"$fields[3] $fields[4]"; + ($parent) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($parent) || $parent eq $last_transcript_id) { + push @exons,"$fields[3] $fields[4]"; + } + } elsif ($fields[2] eq "CDS") { - push @CDS_regions,"$fields[3] $fields[4]"; + ($cds_id) = $fields[8] =~ /ID=([^;]+)/; + if (!defined($cds_id)) { + push @CDS_regions,"$fields[3] $fields[4]"; + } else { + if ($cds_id ne $last_cds_id) { + if ($#CDS_regions > 0) { + query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); + } + @CDS_regions = (); # Need to clear single CDS regions + } + push @CDS_regions,"$fields[3] $fields[4]"; + $last_cds_id = $cds_id; + } + ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($cds_transcript_id)) { + $cds_transcript_id = $last_transcript_id; + } } } } @@ -88,9 +113,12 @@ if (defined($opt_d)) { if ($#exons > 0) { query_dinucleotides(\@exons,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); } + @CDS_regions = (); # Need to clear single CDS regions close($FP); @@ -115,6 +143,8 @@ if (defined($opt_d)) { $gene_name = ""; $last_transcript_id = ""; +$last_cds_id = ""; +$cds_transcript_id = ""; @exons = (); @CDS_regions = (); while (defined($line = get_line())) { @@ -130,23 +160,27 @@ while (defined($line = get_line())) { if ($fields[2] eq "gene") { if ($#exons > 0) { # Handle last mRNA of previous gene - print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon"); + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { # Handle last mRNA of previous gene - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS"); } + @CDS_regions = (); # Need to clear single CDS regions ($gene_name) = $fields[8] =~ /ID=([^;]+)/; $chr = $fields[0]; - @exons = (); - @CDS_regions = (); - } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") { + } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") { if ($#exons > 0) { - print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); + print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon"); } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS"); + } + @CDS_regions = (); # Need to clear single CDS regions ($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/; $strand = $fields[6]; @@ -159,22 +193,42 @@ while (defined($line = get_line())) { $chr = $fields[0]; } - @exons = (); - @CDS_regions = (); - } elsif ($fields[2] eq "exon") { - push @exons,"$fields[3] $fields[4]"; + ($parent) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($parent) || $parent eq $last_transcript_id) { + push @exons,"$fields[3] $fields[4]"; + } + } elsif ($fields[2] eq "CDS") { - push @CDS_regions,"$fields[3] $fields[4]"; + ($cds_id) = $fields[8] =~ /ID=([^;]+)/; + if (!defined($cds_id)) { + push @CDS_regions,"$fields[3] $fields[4]"; + } else { + if ($cds_id ne $last_cds_id) { + if ($#CDS_regions > 0) { + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS"); + } + @CDS_regions = (); # Need to clear single CDS regions + } + push @CDS_regions,"$fields[3] $fields[4]"; + $last_cds_id = $cds_id; + } + ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($cds_transcript_id)) { + $cds_transcript_id = $last_transcript_id; + } } } } if ($#exons > 0) { - print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); -} elsif ($#CDS_regions > 0) { - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); + print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon"); } +@exons = (); # Need to clear single exons +if ($#CDS_regions > 0) { + print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP,"CDS"); +} +@CDS_regions = (); # Need to clear single CDS regions if (defined($opt_d)) { close($FP); @@ -336,7 +390,7 @@ sub acceptor_okay_p { sub print_exons { - my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP) = @_; + my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP, $transcript_type) = @_; $nexons = $#{$exons} + 1; if ($strand eq "+") { diff --git a/util/gff3_splicesites.pl.in b/util/gff3_splicesites.pl.in index 93ba453..592075b 100755 --- a/util/gff3_splicesites.pl.in +++ b/util/gff3_splicesites.pl.in @@ -28,6 +28,7 @@ if (defined($opt_d)) { $gene_name = ""; $last_transcript_id = ""; + $last_cds_id = ""; @exons = (); @CDS_regions = (); while (defined($line = <>)) { @@ -46,25 +47,29 @@ if (defined($opt_d)) { # Handle last mRNA of previous gene query_dinucleotides(\@exons,$chr,$strand,$FP); #print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { # Handle last mRNA of previous gene query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); } + @CDS_regions = (); # Need to clear single CDS regions ($gene_name) = $fields[8] =~ /ID=([^;]+)/; $chr = $fields[0]; - @exons = (); - @CDS_regions = (); - } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") { + } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") { if ($#exons > 0) { query_dinucleotides(\@exons,$chr,$strand,$FP); #print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); } + @CDS_regions = (); # Need to clear single CDS regions ($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/; $strand = $fields[6]; @@ -73,16 +78,35 @@ if (defined($opt_d)) { if (!defined($gene_name) || $gene_name !~ /\S/) { $gene_name = $last_transcript_id; } - $chr = $fields[0]; - @exons = (); - @CDS_regions = (); + if (!defined($chr) || $chr !~ /\S/) { + $chr = $fields[0]; + } } elsif ($fields[2] eq "exon") { - push @exons,"$fields[3] $fields[4]"; + ($parent) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($parent) || $parent eq $last_transcript_id) { + push @exons,"$fields[3] $fields[4]"; + } } elsif ($fields[2] eq "CDS") { - push @CDS_regions,"$fields[3] $fields[4]"; - + ($cds_id) = $fields[8] =~ /ID=([^;]+)/; + if (!defined($cds_id)) { + push @CDS_regions,"$fields[3] $fields[4]"; + } else { + if ($cds_id ne $last_cds_id) { + if ($#CDS_regions > 0) { + query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); + #print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); + } + @CDS_regions = (); # Need to clear single CDS regions + } + push @CDS_regions,"$fields[3] $fields[4]"; + $last_cds_id = $cds_id; + } + ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($cds_transcript_id)) { + $cds_transcript_id = $last_transcript_id; + } } } } @@ -90,9 +114,12 @@ if (defined($opt_d)) { if ($#exons > 0) { query_dinucleotides(\@exons,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { query_dinucleotides(\@CDS_regions,$chr,$strand,$FP); } + @CDS_regions = (); # Need to clear single CDS regions close($FP); @@ -117,6 +144,8 @@ if (defined($opt_d)) { $gene_name = ""; $last_transcript_id = ""; +$last_cds_id = ""; +$cds_transcript_id = ""; @exons = (); @CDS_regions = (); while (defined($line = get_line())) { @@ -132,23 +161,27 @@ while (defined($line = get_line())) { if ($fields[2] eq "gene") { if ($#exons > 0) { # Handle last mRNA of previous gene - print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { + print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon"); + } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { # Handle last mRNA of previous gene - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS"); } + @CDS_regions = (); # Need to clear single CDS regions ($gene_name) = $fields[8] =~ /ID=([^;]+)/; $chr = $fields[0]; - @exons = (); - @CDS_regions = (); - } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript") { + } elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "primary_transcript" || $fields[2] eq "ncRNA") { if ($#exons > 0) { - print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); - } elsif ($#CDS_regions > 0) { - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); + print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon"); } + @exons = (); # Need to clear single exons + if ($#CDS_regions > 0) { + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS"); + } + @CDS_regions = (); # Need to clear single CDS regions ($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/; $strand = $fields[6]; @@ -161,22 +194,42 @@ while (defined($line = get_line())) { $chr = $fields[0]; } - @exons = (); - @CDS_regions = (); - } elsif ($fields[2] eq "exon") { - push @exons,"$fields[3] $fields[4]"; + ($parent) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($parent) || $parent eq $last_transcript_id) { + push @exons,"$fields[3] $fields[4]"; + } + } elsif ($fields[2] eq "CDS") { - push @CDS_regions,"$fields[3] $fields[4]"; + ($cds_id) = $fields[8] =~ /ID=([^;]+)/; + if (!defined($cds_id)) { + push @CDS_regions,"$fields[3] $fields[4]"; + } else { + if ($cds_id ne $last_cds_id) { + if ($#CDS_regions > 0) { + print_exons(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand,$FP,"CDS"); + } + @CDS_regions = (); # Need to clear single CDS regions + } + push @CDS_regions,"$fields[3] $fields[4]"; + $last_cds_id = $cds_id; + } + ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/; + if (!defined($cds_transcript_id)) { + $cds_transcript_id = $last_transcript_id; + } } } } if ($#exons > 0) { - print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP); -} elsif ($#CDS_regions > 0) { - print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP); + print_exons(\@exons,$gene_name,$last_transcript_id,$chr,$strand,$FP,"exon"); +} +@exons = (); # Need to clear single exons +if ($#CDS_regions > 0) { + print_exons(\@CDS_regions,$gene_name,$last_transcript_id,$chr,$strand,$FP,"CDS"); } +@CDS_regions = (); # Need to clear single CDS regions if (defined($opt_d)) { close($FP); @@ -338,7 +391,7 @@ sub acceptor_okay_p { sub print_exons { - my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP) = @_; + my ($exons, $gene_name, $transcript_id, $chr, $strand, $FP, $transcript_type) = @_; $nexons = $#{$exons} + 1; if ($strand eq "+") { @@ -347,8 +400,8 @@ sub print_exons { if (($intron_length = $ {$starts}[$i] - $ {$ends}[$i] - 1) < 0) { printf STDERR "gene %s, transcript %s, intron %d with donor at %s:%u and acceptor at %s:%u implies a negative intron length of %d...skipping\n",$gene_name,$transcript_id,$i+1,$chr,$ {$ends}[$i],$chr,$ {$starts}[$i],$intron_length; } elsif (!defined($opt_d)) { - printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1; - printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i]; + printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1; + printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i]; } else { $query = sprintf("%s:%u..%u",$chr,$ends[$i]+1,$ends[$i]+2); $donor_dinucl = get_dinucleotide($query,$FP); @@ -358,17 +411,17 @@ sub print_exons { if (defined($opt_C) && donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { if (!defined($opt_Q)) { - printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.exon%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$i+1,$nexons; + printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.%s%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+1,$nexons; } } elsif (defined($opt_R)) { if (donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { - printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1; + printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1; print " $donor_dinucl"; print "\n"; } } else { - printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1; + printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$ends}[$i],$ {$ends}[$i]+1; if (defined($opt_2)) { print " $donor_dinucl"; } @@ -377,17 +430,17 @@ sub print_exons { if (defined($opt_C) && acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { if (!defined($opt_Q)) { - printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.exon%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$i+2,$nexons; + printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.%s%d/%d on plus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+2,$nexons; } } elsif (defined($opt_R)) { if (acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { - printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i]; + printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i]; print " $acceptor_dinucl"; print "\n"; } } else { - printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i]; + printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$starts}[$i]-1,$ {$starts}[$i]; if (defined($opt_2)) { print " $acceptor_dinucl"; } @@ -402,8 +455,8 @@ sub print_exons { if (($intron_length = $ {$starts}[$i] - $ {$ends}[$i] - 1) < 0) { printf STDERR "gene %s, transcript %s, intron %d with donor at %s:%u and acceptor at %s:%u implies a negative intron length of %d...skipping\n",$gene_name,$transcript_id,$i+1,$chr,$ {$starts}[$i],$chr,$ {$ends}[$i],$intron_length; } elsif (!defined($opt_d)) { - printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1; - printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i]; + printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1; + printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length\n",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i]; } else { $query = sprintf("%s:%u..%u",$chr,$starts[$i]-1,$starts[$i]-2); $donor_dinucl = get_dinucleotide($query,$FP); @@ -413,17 +466,17 @@ sub print_exons { if (defined($opt_C) && donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { if (!defined($opt_Q)) { - printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.exon%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$i+1,$nexons; + printf STDERR "Skipping non-canonical donor $donor_dinucl, intron length %d for %s.%s.%s%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+1,$nexons; } } elsif (defined($opt_R)) { if (donor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { - printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1; + printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1; print " $donor_dinucl"; print "\n"; } } else { - printf ">%s.%s.exon%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1; + printf ">%s.%s.%s%d/%d %s:%u..%u donor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+1,$nexons,$chr,$ {$starts}[$i],$ {$starts}[$i]-1; if (defined($opt_2)) { print " $donor_dinucl"; } @@ -432,17 +485,17 @@ sub print_exons { if (defined($opt_C) && acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { if (!defined($opt_Q)) { - printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.exon%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$i+2,$nexons; + printf STDERR "Skipping non-canonical acceptor $acceptor_dinucl, intron length %d for %s.%s.%s%d/%d on minus strand\n",$intron_length,$gene_name,$transcript_id,$transcript_type,$i+2,$nexons; } } elsif (defined($opt_R)) { if (acceptor_okay_p($donor_dinucl,$acceptor_dinucl) == 0) { - printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i]; + printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i]; print " $acceptor_dinucl"; print "\n"; } } else { - printf ">%s.%s.exon%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i]; + printf ">%s.%s.%s%d/%d %s:%u..%u acceptor $intron_length",$gene_name,$transcript_id,$transcript_type,$i+2,$nexons,$chr,$ {$ends}[$i]+1,$ {$ends}[$i]; if (defined($opt_2)) { print " $acceptor_dinucl"; } -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
