This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository gmap.
commit 11ae4fff84668a7aa865d20e99c205b3cb5e82e2 Author: Andreas Tille <[email protected]> Date: Sun Oct 26 08:20:27 2014 +0100 Imported Upstream version 2014-10-22 --- ChangeLog | 33 ++++++++++ VERSION | 2 +- configure | 24 ++++---- src/bitpack64-write.c | 20 +++--- src/genome128_hr.c | 6 +- src/gregion.c | 167 ++++++++++++++++++++++++++++++++++++++++---------- src/indexdb-write.c | 3 +- src/sarray-read.c | 31 +++++++--- src/sarray-write.c | 3 +- src/shortread.c | 92 +++++++++++++++------------ src/stage1hr.c | 3 +- src/stage2.c | 134 +++++++++++++++++++--------------------- src/stage3hr.c | 57 ++++++++++++++--- 13 files changed, 383 insertions(+), 192 deletions(-) diff --git a/ChangeLog b/ChangeLog index e24d61d..178c15e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,36 @@ +2014-10-22 twu + + * gregion.c: Checking size before deciding to use alloca or malloc. + +2014-10-16 twu + + * stage2.c: Fixed an uninitialized variable in grand_fwd and grand_rev + procedures, plus the checks on maxintronlen in computing + grand_fwd_lookforward and grand_rev_lookforward. + + * shortread.c: Allowing queryseq1 to be equal to SKIPPED. Removed unused + parameter acc from input_oneline routines. + + * VERSION, index.html: Updated version number + + * stage1hr.c: Added debugging statement + + * sarray-read.c: Don't limit filling of best elt based on nmatches being + more than half of the read length + + * stage3hr.c: Restored previous behavior where soft clips avoid + circularization + + * indexdb-write.c, sarray-write.c: Removed unnecessary includes of + popcount.h + + * bitpack64-write.c, genome128_hr.c: In lookups of clz_table, removing the + intermediate variable "top". + + * stage3hr.c: Not allowing soft-clipping at ends to avoid circularization. + Added pre-processor macro SOFT_CLIPS_AVOID_CIRCULARIZATION to preserve + previous code. + 2014-10-15 twu * VERSION, config.site.rescomp.prd, config.site.rescomp.tst, index.html: diff --git a/VERSION b/VERSION index 9594e69..0fab9d5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2014-10-15 \ No newline at end of file +2014-10-22 \ No newline at end of file diff --git a/configure b/configure index b77f154..290e333 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.63 for gmap 2014-10-15. +# Generated by GNU Autoconf 2.63 for gmap 2014-10-22. # # Report bugs to <Thomas Wu <[email protected]>>. # @@ -745,8 +745,8 @@ SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. PACKAGE_NAME='gmap' PACKAGE_TARNAME='gmap' -PACKAGE_VERSION='2014-10-15' -PACKAGE_STRING='gmap 2014-10-15' +PACKAGE_VERSION='2014-10-22' +PACKAGE_STRING='gmap 2014-10-22' PACKAGE_BUGREPORT='Thomas Wu <[email protected]>' ac_unique_file="src/gmap.c" @@ -1508,7 +1508,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures gmap 2014-10-15 to adapt to many kinds of systems. +\`configure' configures gmap 2014-10-22 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1579,7 +1579,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of gmap 2014-10-15:";; + short | recursive ) echo "Configuration of gmap 2014-10-22:";; esac cat <<\_ACEOF @@ -1711,7 +1711,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -gmap configure 2014-10-15 +gmap configure 2014-10-22 generated by GNU Autoconf 2.63 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, @@ -1725,7 +1725,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by gmap $as_me 2014-10-15, which was +It was created by gmap $as_me 2014-10-22, which was generated by GNU Autoconf 2.63. Invocation command line was $ $0 $@ @@ -2095,8 +2095,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu { $as_echo "$as_me:$LINENO: checking package version" >&5 $as_echo_n "checking package version... " >&6; } -{ $as_echo "$as_me:$LINENO: result: 2014-10-15" >&5 -$as_echo "2014-10-15" >&6; } +{ $as_echo "$as_me:$LINENO: result: 2014-10-22" >&5 +$as_echo "2014-10-22" >&6; } ### Read defaults @@ -4147,7 +4147,7 @@ fi # Define the identity of the package. PACKAGE=gmap - VERSION=2014-10-15 + VERSION=2014-10-22 cat >>confdefs.h <<_ACEOF @@ -25945,7 +25945,7 @@ exec 6>&1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by gmap $as_me 2014-10-15, which was +This file was extended by gmap $as_me 2014-10-22, which was generated by GNU Autoconf 2.63. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -26008,7 +26008,7 @@ Report bugs to <[email protected]>." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_version="\\ -gmap config.status 2014-10-15 +gmap config.status 2014-10-22 configured by $0, generated by GNU Autoconf 2.63, with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" diff --git a/src/bitpack64-write.c b/src/bitpack64-write.c index 24ef7a1..6a98ec1 100644 --- a/src/bitpack64-write.c +++ b/src/bitpack64-write.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: bitpack64-write.c 132144 2014-04-02 16:02:28Z twu $"; +static char rcsid[] = "$Id: bitpack64-write.c 151045 2014-10-16 19:08:17Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -17,11 +17,13 @@ static char rcsid[] = "$Id: bitpack64-write.c 132144 2014-04-02 16:02:28Z twu $" #include "mem.h" #include "assert.h" #include "fopen.h" +#include "popcount.h" #ifdef HAVE_SSE2 #include <emmintrin.h> #endif + /* #define ALLOW_ODD_PACKSIZES 1 */ /* #define USE_ONE_FILE_FOR_FIXED 1 */ @@ -4424,7 +4426,7 @@ static int compute_q4_diffs_bidir (UINT4 *diffs, UINT4 *values) { UINT4 packsize; int i; - UINT4 maxdiff = 0, top; + UINT4 maxdiff = 0; int firstbit, msb; #if 0 @@ -4460,7 +4462,7 @@ compute_q4_diffs_bidir (UINT4 *diffs, UINT4 *values) { asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff)); packsize = msb + 1; #else - firstbit = ((top = maxdiff >> 16) ? clz_table[top] : 16 + clz_table[maxdiff]); + firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]); packsize = 32 - firstbit; #endif @@ -4477,7 +4479,7 @@ static int compute_q1_diffs (UINT4 *diffs, UINT4 *values) { UINT4 packsize; int i; - UINT4 maxdiff = 0, top; + UINT4 maxdiff = 0; int firstbit, msb; #if 0 @@ -4502,7 +4504,7 @@ compute_q1_diffs (UINT4 *diffs, UINT4 *values) { asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff)); packsize = msb + 1; #else - firstbit = ((top = maxdiff >> 16) ? clz_table[top] : 16 + clz_table[maxdiff]); + firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]); packsize = 32 - firstbit; #endif @@ -4521,7 +4523,7 @@ static int compute_q4_diffs_bidir_huge (UINT4 *diffs, UINT8 *values) { UINT4 packsize; int i; - UINT4 maxdiff = 0, top; + UINT4 maxdiff = 0; int firstbit, msb; @@ -4552,7 +4554,7 @@ compute_q4_diffs_bidir_huge (UINT4 *diffs, UINT8 *values) { asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff)); packsize = msb + 1; #else - firstbit = ((top = maxdiff >> 16) ? clz_table[top] : 16 + clz_table[maxdiff]); + firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]); packsize = 32 - firstbit; #endif @@ -5314,7 +5316,7 @@ Bitpack64_write_fixed10_huge (char *pagesfile, char *ptrsfile, char *compfile, static int compute_packsize (UINT4 *values) { UINT4 packsize; - UINT4 maxvalue = 0, top; + UINT4 maxvalue = 0; int i; int firstbit, msb; @@ -5334,7 +5336,7 @@ compute_packsize (UINT4 *values) { asm("bsr %1,%0" : "=r"(msb) : "r"(maxvalue)); packsize = msb + 1; #else - firstbit = ((top = maxvalue >> 16) ? clz_table[top] : 16 + clz_table[maxvalue]); + firstbit = ((maxvalue >> 16) ? clz_table[maxvalue >> 16] : 16 + clz_table[maxvalue]); packsize = 32 - firstbit; #endif diff --git a/src/genome128_hr.c b/src/genome128_hr.c index ffc91f9..d455af1 100644 --- a/src/genome128_hr.c +++ b/src/genome128_hr.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: genome128_hr.c 148186 2014-09-18 16:35:33Z twu $"; +static char rcsid[] = "$Id: genome128_hr.c 151045 2014-10-16 19:08:17Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -18649,7 +18649,7 @@ print_diff_leading_zeroes (__m128i _diff, int offset) { #elif defined(HAVE_BUILTIN_CLZ) #define count_leading_zeroes(diff) __builtin_clz(diff) #else -#define count_leading_zeroes(diff) ((top = diff >> 16) ? clz_table[top] : 16 + clz_table[diff]) +#define count_leading_zeroes(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff]) #endif #ifdef HAVE_BMI1 @@ -18714,7 +18714,7 @@ print_diff_leading_zeroes (UINT4 diff, int offset) { #elif defined(HAVE_BUILTIN_CLZ) #define count_leading_zeroes_32(diff) __builtin_clz(diff) #else -#define count_leading_zeroes_32(diff) ((top = diff >> 16) ? clz_table[top] : 16 + clz_table[diff]) +#define count_leading_zeroes_32(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff]) #endif #ifdef HAVE_BMI1 diff --git a/src/gregion.c b/src/gregion.c index b0d7f98..c197605 100644 --- a/src/gregion.c +++ b/src/gregion.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: gregion.c 145990 2014-08-25 21:47:32Z twu $"; +static char rcsid[] = "$Id: gregion.c 151507 2014-10-22 19:37:48Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -15,6 +15,9 @@ static char rcsid[] = "$Id: gregion.c 145990 2014-08-25 21:47:32Z twu $"; #define MAX_GENOMICLENGTH 2000000 +#define MAX_NCHRS_FOR_ALLOCA 100 +#define MAX_GREGIONS_FOR_ALLOCA 100 + #define EXTRA_SHORTEND 30000 #define EXTRA_LONGEND 100000 @@ -516,10 +519,16 @@ Gregion_filter_unique (List_T gregionlist) { } ); - eliminate = (bool *) CALLOCA(n,sizeof(bool)); - array = (T *) MALLOCA(n * sizeof(T)); + if (n < MAX_GREGIONS_FOR_ALLOCA) { + eliminate = (bool *) CALLOCA(n,sizeof(bool)); + array = (T *) MALLOCA(n * sizeof(T)); + List_fill_array_and_free((void **) array,&gregionlist); + } else { + eliminate = (bool *) CALLOC(n,sizeof(bool)); + array = (T *) List_to_array(gregionlist,NULL); + List_free(&gregionlist); + } - List_fill_array_and_free((void **) array,&gregionlist); qsort(array,n,sizeof(T),weight_cmp); for (i = 0; i < n; i++) { @@ -559,8 +568,13 @@ Gregion_filter_unique (List_T gregionlist) { } } - FREEA(eliminate); - FREEA(array); + if (n < MAX_GREGIONS_FOR_ALLOCA) { + FREEA(eliminate); + FREEA(array); + } else { + FREE(eliminate); + FREE(array); + } debug( for (p = unique, i = 0; p != NULL; p = p->rest, i++) { @@ -831,16 +845,29 @@ Gregion_sort_low_descending (List_T gregions) { if ((n = List_length(gregions)) == 0) { return (List_T) NULL; - } else { + + } else if (n < MAX_GREGIONS_FOR_ALLOCA) { array = (Gregion_T *) MALLOCA(n * sizeof(Gregion_T)); List_fill_array((void **) array,gregions); + qsort(array,n,sizeof(Gregion_T),Gregion_low_descending_cmp); - for (i = n-1; i >= 0; i--) { sorted = List_push(sorted,array[i]); } + FREEA(array); return sorted; + + } else { + array = (Gregion_T *) List_to_array(gregions,NULL); + + qsort(array,n,sizeof(Gregion_T),Gregion_low_descending_cmp); + for (i = n-1; i >= 0; i--) { + sorted = List_push(sorted,array[i]); + } + + FREE(array); + return sorted; } } @@ -854,16 +881,29 @@ Gregion_sort_high_ascending (List_T gregions) { if ((n = List_length(gregions)) == 0) { return (List_T) NULL; - } else { + + } else if (n < MAX_GREGIONS_FOR_ALLOCA) { array = (Gregion_T *) MALLOCA(n * sizeof(Gregion_T)); List_fill_array((void **) array,gregions); + qsort(array,n,sizeof(Gregion_T),Gregion_high_ascending_cmp); - for (i = n-1; i >= 0; i--) { sorted = List_push(sorted,array[i]); } + FREEA(array); return sorted; + + } else { + array = (Gregion_T *) List_to_array(gregions,NULL); + + qsort(array,n,sizeof(Gregion_T),Gregion_high_ascending_cmp); + for (i = n-1; i >= 0; i--) { + sorted = List_push(sorted,array[i]); + } + + FREE(array); + return sorted; } } @@ -879,6 +919,7 @@ struct Base_T { bool usedp; }; +#if 0 static void Base_free (Base_T *old) { if ((*old)->gregions != NULL) { @@ -887,6 +928,7 @@ Base_free (Base_T *old) { FREE(*old); return; } +#endif static Base_T @@ -953,8 +995,12 @@ compute_ceilings (Uinttable_T low_basetable) { int n, i; n = Uinttable_length(low_basetable); - keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); - Uinttable_fill_keys(keys,low_basetable,/*sortp*/true); + if (n < MAX_NCHRS_FOR_ALLOCA) { + keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); + Uinttable_fill_keys(keys,low_basetable,/*sortp*/true); + } else { + keys = Uinttable_keys(low_basetable,/*sortp*/true); + } debug2(printf("low_basetable has %d entries\n",n)); prevpos = 0U; @@ -1045,7 +1091,11 @@ compute_ceilings (Uinttable_T low_basetable) { } } - FREEA(keys); + if (n < MAX_NCHRS_FOR_ALLOCA) { + FREEA(keys); + } else { + FREE(keys); + } return bestprevpos; } @@ -1067,8 +1117,12 @@ compute_floors (Uinttable_T high_basetable) { int n, i; n = Uinttable_length(high_basetable); - keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); - Uinttable_fill_keys(keys,high_basetable,/*sortp*/true); + if (n < MAX_NCHRS_FOR_ALLOCA) { + keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); + Uinttable_fill_keys(keys,high_basetable,/*sortp*/true); + } else { + keys = Uinttable_keys(high_basetable,/*sortp*/true); + } debug2(printf("high_basetable has %d entries\n",n)); prevpos = (Chrpos_T) -1U; @@ -1158,7 +1212,11 @@ compute_floors (Uinttable_T high_basetable) { } } - FREEA(keys); + if (n < MAX_NCHRS_FOR_ALLOCA) { + FREEA(keys); + } else { + FREE(keys); + } return bestprevpos; } @@ -1173,8 +1231,12 @@ traceback_ceilings (Uinttable_T low_basetable, Chrpos_T prevpos) { int n, i; n = Uinttable_length(low_basetable); - keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); - Uinttable_fill_keys(keys,low_basetable,/*sortp*/true); + if (n < MAX_NCHRS_FOR_ALLOCA) { + keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); + Uinttable_fill_keys(keys,low_basetable,/*sortp*/true); + } else { + keys = Uinttable_keys(low_basetable,/*sortp*/true); + } ceiling = (Chrpos_T) -1U; @@ -1205,7 +1267,11 @@ traceback_ceilings (Uinttable_T low_basetable, Chrpos_T prevpos) { i--; } - FREEA(keys); + if (n < MAX_NCHRS_FOR_ALLOCA) { + FREEA(keys); + } else { + FREE(keys); + } return; } @@ -1220,8 +1286,12 @@ traceback_floors (Uinttable_T high_basetable, Chrpos_T prevpos) { int n, i; n = Uinttable_length(high_basetable); - keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); - Uinttable_fill_keys(keys,high_basetable,/*sortp*/true); + if (n < MAX_NCHRS_FOR_ALLOCA) { + keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); + Uinttable_fill_keys(keys,high_basetable,/*sortp*/true); + } else { + keys = Uinttable_keys(high_basetable,/*sortp*/true); + } floor = 0U; @@ -1252,7 +1322,11 @@ traceback_floors (Uinttable_T high_basetable, Chrpos_T prevpos) { i++; } - FREEA(keys); + if (n < MAX_NCHRS_FOR_ALLOCA) { + FREEA(keys); + } else { + FREE(keys); + } return; } @@ -1268,8 +1342,12 @@ bound_gregions (Uinttable_T low_basetable, Uinttable_T high_basetable) { int n, i; n = Uinttable_length(low_basetable); - keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); - Uinttable_fill_keys(keys,low_basetable,/*sortp*/true); + if (n < MAX_NCHRS_FOR_ALLOCA) { + keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); + Uinttable_fill_keys(keys,low_basetable,/*sortp*/true); + } else { + keys = Uinttable_keys(low_basetable,/*sortp*/true); + } for (i = 0; i < n; i++) { base_low = (Base_T) Uinttable_get(low_basetable,keys[i]); @@ -1288,12 +1366,20 @@ bound_gregions (Uinttable_T low_basetable, Uinttable_T high_basetable) { } } - FREEA(keys); + if (n < MAX_NCHRS_FOR_ALLOCA) { + FREEA(keys); + } else { + FREE(keys); + } n = Uinttable_length(high_basetable); - keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); - Uinttable_fill_keys(keys,high_basetable,/*sortp*/true); + if (n < MAX_NCHRS_FOR_ALLOCA) { + keys = (Chrpos_T *) MALLOCA(n * sizeof(Chrpos_T)); + Uinttable_fill_keys(keys,high_basetable,/*sortp*/true); + } else { + keys = Uinttable_keys(high_basetable,/*sortp*/true); + } for (i = n-1; i >= 0; i--) { base_high = (Base_T) Uinttable_get(high_basetable,keys[i]); @@ -1312,7 +1398,11 @@ bound_gregions (Uinttable_T low_basetable, Uinttable_T high_basetable) { } } - FREEA(keys); + if (n < MAX_NCHRS_FOR_ALLOCA) { + FREEA(keys); + } else { + FREE(keys); + } return; } @@ -1328,10 +1418,11 @@ Gregion_filter_clean (List_T gregionlist, int nchrs) { Chrpos_T prevpos; Chrnum_T chrnum; - List_T unique = NULL, p; + List_T p; T gregion; int n; #if 0 + List_T unique = NULL; T x, y, *array; int i, j; bool *eliminate; @@ -1353,8 +1444,13 @@ Gregion_filter_clean (List_T gregionlist, int nchrs) { } ); - low_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T)); - high_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T)); + if (nchrs < MAX_NCHRS_FOR_ALLOCA) { + low_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T)); + high_basetables = (Uinttable_T *) CALLOCA(nchrs+1,sizeof(Uinttable_T)); + } else { + low_basetables = (Uinttable_T *) CALLOC(nchrs+1,sizeof(Uinttable_T)); + high_basetables = (Uinttable_T *) CALLOC(nchrs+1,sizeof(Uinttable_T)); + } for (p = gregionlist; p != NULL; p = List_next(p)) { gregion = (T) List_head(p); @@ -1395,8 +1491,13 @@ Gregion_filter_clean (List_T gregionlist, int nchrs) { /* Todo: Free each table */ - FREEA(high_basetables); - FREEA(low_basetables); + if (nchrs < MAX_NCHRS_FOR_ALLOCA) { + FREEA(high_basetables); + FREEA(low_basetables); + } else { + FREE(high_basetables); + FREE(low_basetables); + } #if 0 eliminate = (bool *) CALLOC(n,sizeof(bool)); diff --git a/src/indexdb-write.c b/src/indexdb-write.c index f9fe333..4c3792e 100644 --- a/src/indexdb-write.c +++ b/src/indexdb-write.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: indexdb-write.c 132144 2014-04-02 16:02:28Z twu $"; +static char rcsid[] = "$Id: indexdb-write.c 151046 2014-10-16 19:08:41Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -60,7 +60,6 @@ static char rcsid[] = "$Id: indexdb-write.c 132144 2014-04-02 16:02:28Z twu $"; #include "indexdbdef.h" #include "iit-read-univ.h" #include "indexdb.h" -#include "popcount.h" #include "bitpack64-read.h" #include "bitpack64-write.h" diff --git a/src/sarray-read.c b/src/sarray-read.c index aed9a11..69638d0 100644 --- a/src/sarray-read.c +++ b/src/sarray-read.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: sarray-read.c 150216 2014-10-07 23:53:59Z twu $"; +static char rcsid[] = "$Id: sarray-read.c 151053 2014-10-16 19:57:23Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -1499,8 +1499,14 @@ Elt_fill_positions_all (Elt_T this, T sarray) { Univcoord_T pos; int i; - if (this->positions_allocated == NULL) { + debug(printf("Entering Elt_fill_positions_all on %p\n",this)); + if (this->positions_allocated != NULL) { + debug(printf(" positions_allocated is already non-NULL, so skipping\n")); + + } else { this->npositions = this->finalptr - this->initptr + 1; + debug(printf(" filling %d positions\n",this->npositions)); + if (this->nmatches == 0 || this->npositions > EXCESS_SARRAY_HITS) { this->positions_allocated = this->positions = (Univcoord_T *) NULL; this->npositions = 0; @@ -2394,8 +2400,9 @@ static void Elt_dump (Elt_T elt) { int k; + printf("Elt with %d positions:\n",elt->npositions); for (k = 0; k < elt->npositions; k++) { - printf("%d..%d:%u\n",elt->querystart,elt->queryend,elt->positions[k]); + printf(" %d..%d:%u\n",elt->querystart,elt->queryend,elt->positions[k]); } printf("\n"); @@ -3836,7 +3843,8 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am sarray_search(&initptr,&finalptr,&successp,&nmatches,&(queryuc_ptr[halfwaypos]), querylength - halfwaypos,/*queryoffset*/halfwaypos, query_compress_fwd,plus_sarray,/*plusp*/true,genestrand,first_read_p,plus_conversion); - if (nmatches >= querylength - halfwaypos) { + /* Don't want to limit based on nmatches */ + if (1 || nmatches >= querylength - halfwaypos) { elt = Elt_new(halfwaypos,nmatches,initptr,finalptr); Elt_fill_positions_all(elt,plus_sarray); for (i = 0; i < elt->npositions; i++) { @@ -3894,7 +3902,8 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am sarray_search(&initptr,&finalptr,&successp,&nmatches,&(queryrc[halfwaypos]), querylength - halfwaypos,/*queryoffset*/halfwaypos, query_compress_rev,minus_sarray,/*plusp*/false,genestrand,first_read_p,minus_conversion); - if (nmatches >= querylength - halfwaypos) { + /* Don't want to limit based on nmatches */ + if (1 || nmatches >= querylength - halfwaypos) { elt = Elt_new(halfwaypos,nmatches,initptr,finalptr); Elt_fill_positions_all(elt,minus_sarray); for (i = 0; i < elt->npositions; i++) { @@ -3946,11 +3955,12 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am debug(printf("plus_querypos %d vs querylength %d\n",plus_querypos,querylength)); if (nmatches <= best_plus_nmatches) { /* Initial (left) elt was best */ - debug(printf("Initial elt was best\n")); + debug(printf("Initial elt %p was best:\n",best_plus_elt)); plus_set = List_push(NULL,elt); if (plus_querypos >= querylength) { chrhigh = 0U; Elt_fill_positions_all(best_plus_elt,plus_sarray); + debug(Elt_dump(best_plus_elt)); for (i = 0; i < best_plus_elt->npositions; i++) { left = best_plus_elt->positions[i]; if (left > chrhigh) { @@ -3972,13 +3982,14 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am } } else { /* Second (right) elt is best */ - debug(printf("Second elt is best\n")); + debug(printf("Second elt %p is best:\n",elt)); plus_set = List_push(NULL,best_plus_elt); best_plus_elt = elt; best_plus_nmatches = nmatches; if (plus_querypos >= querylength) { chrhigh = 0U; Elt_fill_positions_all(best_plus_elt,plus_sarray); + debug(Elt_dump(best_plus_elt)); for (i = 0; i < best_plus_elt->npositions; i++) { left = best_plus_elt->positions[i]; if (left > chrhigh) { @@ -4022,11 +4033,12 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am debug(printf("minus_querypos %d vs querylength %d\n",minus_querypos,querylength)); if (nmatches <= best_minus_nmatches) { /* Initial (left) elt was best */ - debug(printf("Initial elt was best\n")); + debug(printf("Initial elt %p was best:\n",best_minus_elt)); minus_set = List_push(NULL,elt); if (minus_querypos >= querylength) { chrhigh = 0U; Elt_fill_positions_all(best_minus_elt,minus_sarray); + debug(Elt_dump(best_minus_elt)); for (i = 0; i < best_minus_elt->npositions; i++) { left = best_minus_elt->positions[i]; if (left > chrhigh) { @@ -4048,13 +4060,14 @@ Sarray_search_greedy (int *found_score, List_T *subs, List_T *indels, List_T *am } } else { /* Second (right) elt is best */ - debug(printf("Second elt is best\n")); + debug(printf("Second elt %p is best:\n",elt)); minus_set = List_push(NULL,best_minus_elt); best_minus_elt = elt; best_minus_nmatches = nmatches; if (minus_querypos >= querylength) { chrhigh = 0U; Elt_fill_positions_all(best_minus_elt,minus_sarray); + debug(Elt_dump(best_minus_elt)); for (i = 0; i < best_minus_elt->npositions; i++) { left = best_minus_elt->positions[i]; if (left > chrhigh) { diff --git a/src/sarray-write.c b/src/sarray-write.c index 5c2aa6e..68c6d20 100644 --- a/src/sarray-write.c +++ b/src/sarray-write.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: sarray-write.c 140591 2014-07-03 16:08:23Z twu $"; +static char rcsid[] = "$Id: sarray-write.c 151046 2014-10-16 19:08:41Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -24,7 +24,6 @@ static char rcsid[] = "$Id: sarray-write.c 140591 2014-07-03 16:08:23Z twu $"; #include "genome128_hr.h" #include "uintlist.h" #include "intlist.h" -#include "popcount.h" #ifdef WORDS_BIGENDIAN diff --git a/src/shortread.c b/src/shortread.c index 110996a..c7251de 100644 --- a/src/shortread.c +++ b/src/shortread.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: shortread.c 150681 2014-10-14 01:47:18Z twu $"; +static char rcsid[] = "$Id: shortread.c 151067 2014-10-16 21:09:00Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -903,10 +903,10 @@ skip_header_bzip2 (Bzip2_T fp) { -#if 1 #define CONTROLM 13 /* From PC */ #define SPACE 32 +#if 0 static char * find_bad_char (char *line) { char *first, *p1, *p2; @@ -946,6 +946,7 @@ find_bad_char (char *line) { return first; } } +#endif static char * find_spaces (int *nspaces, char *line) { @@ -965,12 +966,11 @@ find_spaces (int *nspaces, char *line) { return first; } } -#endif static int -input_oneline (int *nextchar, char **longstring, char *Start, FILE *fp, char *acc, bool possible_fasta_header_p) { +input_oneline (int *nextchar, char **longstring, char *Start, FILE *fp, bool possible_fasta_header_p) { int remainder; char *ptr, *p = NULL; int strlenp, nspaces; @@ -1060,7 +1060,7 @@ input_oneline (int *nextchar, char **longstring, char *Start, FILE *fp, char *ac #ifdef HAVE_ZLIB static int -input_oneline_gzip (int *nextchar, char **longstring, char *Start, gzFile fp, char *acc, bool possible_fasta_header_p) { +input_oneline_gzip (int *nextchar, char **longstring, char *Start, gzFile fp, bool possible_fasta_header_p) { int remainder; char *ptr, *p = NULL; int strlenp, nspaces; @@ -1148,7 +1148,7 @@ input_oneline_gzip (int *nextchar, char **longstring, char *Start, gzFile fp, ch #ifdef HAVE_BZLIB static int -input_oneline_bzip2 (int *nextchar, char **longstring, char *Start, Bzip2_T fp, char *acc, bool possible_fasta_header_p) { +input_oneline_bzip2 (int *nextchar, char **longstring, char *Start, Bzip2_T fp, bool possible_fasta_header_p) { int remainder; char *ptr, *p = NULL; int strlenp, nspaces; @@ -2189,7 +2189,7 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL /* Process blank lines and loop again */ while (*nextchar != EOF && ((*nextchar = fgetc(*input1)) != '>')) { } - } else if ((fulllength1 = input_oneline(&(*nextchar),&long_read_1,&(Read1[0]),*input1,acc, + } else if ((fulllength1 = input_oneline(&(*nextchar),&long_read_1,&(Read1[0]),*input1, /*possible_fasta_header_p*/true)) == 0) { /* fprintf(stderr,"length is zero\n"); */ /* No sequence1. Don't process, but loop again */ @@ -2197,14 +2197,14 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL } else { /* queryseq1 is in Read1 */ /* See what is in next line */ - if ((fulllength2 = input_oneline(&(*nextchar),&long_read_2,&(Read2[0]),*input1,acc, + if ((fulllength2 = input_oneline(&(*nextchar),&long_read_2,&(Read2[0]),*input1, /*possible_fasta_header_p*/true)) > 0) { /* Paired-end, single file. queryseq1 is in Read1 and queryseq2 is in Read2 */ if (*nextchar == '+') { /* Paired-end with quality strings */ skip_header(*input1); *nextchar = fgetc(*input1); - quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1,acc, + quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1, /*possible_fasta_header_p*/false); if (quality_length != fulllength1) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2215,7 +2215,7 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL queryseq1 = Shortread_new(acc,restofheader,filterp,Read1,long_read_1,fulllength1, Quality,long_quality,quality_length,barcode_length, invert_first_p,/*copy_acc_p*/false,skipp); - quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1,acc, + quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1, /*possible_fasta_header_p*/false); if (quality_length != fulllength2) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2255,7 +2255,7 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL while (nextchar2 != EOF && ((nextchar2 = fgetc(*input2)) != '>')) { } (*queryseq2) = (T) NULL; - } else if ((fulllength2 = input_oneline(&nextchar2,&long_read_2,&(Read2[0]),*input2,acc2, + } else if ((fulllength2 = input_oneline(&nextchar2,&long_read_2,&(Read2[0]),*input2, /*possible_fasta_header_p*/true)) == 0) { /* fprintf(stderr,"length is zero\n"); */ /* No sequence1. Don't process, but loop again */ @@ -2266,7 +2266,7 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL /* End 1 with a quality string */ skip_header(*input1); *nextchar = fgetc(*input1); - quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1,acc, + quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1, /*possible_fasta_header_p*/false); if (quality_length != fulllength1) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2288,7 +2288,7 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL /* End 2 with a quality string */ skip_header(*input2); nextchar2 = fgetc(*input2); - quality_length = input_oneline(&nextchar2,&long_quality,&(Quality[0]),*input2,acc2, + quality_length = input_oneline(&nextchar2,&long_quality,&(Quality[0]),*input2, /*possible_fasta_header_p*/false); if (quality_length != fulllength2) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2318,7 +2318,7 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL /* Single-end with a quality string */ skip_header(*input1); *nextchar = fgetc(*input1); - quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1,acc, + quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1, /*possible_fasta_header_p*/false); if (quality_length != fulllength1) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2341,7 +2341,9 @@ Shortread_read_fasta_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL debug(printf("Returning queryseq with contents %s\n",queryseq1->contents)); - if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { + if (queryseq1 == (T) SKIPPED) { + return (T) SKIPPED; + } else if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { return queryseq1; } } @@ -2358,7 +2360,7 @@ Shortread_read_fasta_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input int barcode_length, bool invert_first_p, bool invert_second_p) { T queryseq1; char *acc, *restofheader, *acc2, *restofheader2; - char *long_read_1, *long_read_2, *long_quality; + char *long_read_1, *long_read_2; int nextchar2; int fulllength1, fulllength2; bool filterp; @@ -2429,7 +2431,7 @@ Shortread_read_fasta_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input /* Process blank lines and loop again */ while (*nextchar != EOF && ((*nextchar = gzgetc(*input1)) != '>')) { } - } else if ((fulllength1 = input_oneline_gzip(&(*nextchar),&long_read_1,&(Read1[0]),*input1,acc, + } else if ((fulllength1 = input_oneline_gzip(&(*nextchar),&long_read_1,&(Read1[0]),*input1, /*possible_fasta_header_p*/true)) == 0) { /* fprintf(stderr,"length is zero\n"); */ /* No sequence1. Don't process, but loop again */ @@ -2437,7 +2439,7 @@ Shortread_read_fasta_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input } else { /* queryseq1 is in Read1 */ /* See what is in next line */ - if ((fulllength2 = input_oneline_gzip(&(*nextchar),&long_read_2,&(Read2[0]),*input1,acc, + if ((fulllength2 = input_oneline_gzip(&(*nextchar),&long_read_2,&(Read2[0]),*input1, /*possible_fasta_header_p*/true)) > 0) { /* Paired-end, single file. queryseq1 is in Read1 and queryseq2 is in Read2 */ queryseq1 = Shortread_new(acc,restofheader,filterp,Read1,long_read_1,fulllength1, @@ -2470,7 +2472,7 @@ Shortread_read_fasta_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input while (nextchar2 != EOF && ((nextchar2 = gzgetc(*input2)) != '>')) { } (*queryseq2) = (T) NULL; - } else if ((fulllength2 = input_oneline_gzip(&nextchar2,&long_read_2,&(Read2[0]),*input2,acc2, + } else if ((fulllength2 = input_oneline_gzip(&nextchar2,&long_read_2,&(Read2[0]),*input2, /*possible_fasta_header_p*/true)) == 0) { /* fprintf(stderr,"length is zero\n"); */ /* No sequence1. Don't process, but loop again */ @@ -2498,7 +2500,9 @@ Shortread_read_fasta_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input debug(printf("Returning queryseq with contents %s\n",queryseq1->contents)); - if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { + if (queryseq1 == (T) SKIPPED) { + return (T) SKIPPED; + } else if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { return queryseq1; } } @@ -2515,7 +2519,7 @@ Shortread_read_fasta_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp int barcode_length, bool invert_first_p, bool invert_second_p) { T queryseq1; char *acc, *restofheader, *acc2, *restofheader2; - char *long_read_1, *long_read_2, *long_quality; + char *long_read_1, *long_read_2; int nextchar2; int fulllength1, fulllength2; bool filterp; @@ -2583,7 +2587,7 @@ Shortread_read_fasta_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp /* Process blank lines and loop again */ while (*nextchar != EOF && ((*nextchar = bzgetc(*input1)) != '>')) { } - } else if ((fulllength1 = input_oneline_bzip2(&(*nextchar),&long_read_1,&(Read1[0]),*input1,acc, + } else if ((fulllength1 = input_oneline_bzip2(&(*nextchar),&long_read_1,&(Read1[0]),*input1, /*possible_fasta_header_p*/true)) == 0) { /* fprintf(stderr,"length is zero\n"); */ /* No sequence1. Don't process, but loop again */ @@ -2591,7 +2595,7 @@ Shortread_read_fasta_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp } else { /* queryseq1 is in Read1 */ /* See what is in next line */ - if ((fulllength2 = input_oneline_bzip2(&(*nextchar),&long_read_2,&(Read2[0]),*input1,acc, + if ((fulllength2 = input_oneline_bzip2(&(*nextchar),&long_read_2,&(Read2[0]),*input1, /*possible_fasta_header_p*/true)) > 0) { /* Paired-end, single file. queryseq1 is in Read1 and queryseq2 is in Read2 */ queryseq1 = Shortread_new(acc,restofheader,filterp,Read1,long_read_1,fulllength1, @@ -2620,7 +2624,7 @@ Shortread_read_fasta_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp while (nextchar2 != EOF && ((nextchar2 = bzgetc(*input2)) != '>')) { } (*queryseq2) = (T) NULL; - } else if ((fulllength2 = input_oneline_bzip2(&nextchar2,&long_read_2,&(Read2[0]),*input2,acc2, + } else if ((fulllength2 = input_oneline_bzip2(&nextchar2,&long_read_2,&(Read2[0]),*input2, /*possible_fasta_header_p*/true)) == 0) { /* fprintf(stderr,"length is zero\n"); */ /* No sequence1. Don't process, but loop again */ @@ -2648,7 +2652,9 @@ Shortread_read_fasta_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp debug(printf("Returning queryseq with contents %s\n",queryseq1->contents)); - if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { + if (queryseq1 == (T) SKIPPED) { + return (T) SKIPPED; + } else if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { return queryseq1; } } @@ -2726,7 +2732,7 @@ Shortread_read_fastq_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL *nextchar = EOF; } else { *nextchar = fgetc(*input1); - if ((fulllength = input_oneline(&(*nextchar),&long_read_1,&(Read1[0]),*input1,acc, + if ((fulllength = input_oneline(&(*nextchar),&long_read_1,&(Read1[0]),*input1, /*possible_fasta_header_p*/true)) == 0) { FREE_IN(acc); FREE_IN(restofheader); @@ -2742,7 +2748,7 @@ Shortread_read_fastq_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL } else { skip_header(*input1); *nextchar = fgetc(*input1); - quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1,acc, + quality_length = input_oneline(&(*nextchar),&long_quality,&(Quality[0]),*input1, /*possible_fasta_header_p*/false); if (quality_length != fulllength) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2784,7 +2790,7 @@ Shortread_read_fastq_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL } } nextchar2 = fgetc(*input2); - if ((fulllength = input_oneline(&nextchar2,&long_read_2,&(Read2[0]),*input2,acc, + if ((fulllength = input_oneline(&nextchar2,&long_read_2,&(Read2[0]),*input2, /*possible_fasta_header_p*/true)) == 0) { FREE_IN(acc); FREE_IN(restofheader); @@ -2800,7 +2806,7 @@ Shortread_read_fastq_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL } else { skip_header(*input2); nextchar2 = fgetc(*input2); - quality_length = input_oneline(&nextchar2,&long_quality,&(Quality[0]),*input2,acc, + quality_length = input_oneline(&nextchar2,&long_quality,&(Quality[0]),*input2, /*possible_fasta_header_p*/false); if (quality_length != fulllength) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2817,7 +2823,9 @@ Shortread_read_fastq_shortreads (int *nextchar, T *queryseq2, FILE **input1, FIL } } - if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { + if (queryseq1 == (T) SKIPPED) { + return (T) SKIPPED; + } else if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { return queryseq1; } } @@ -2898,7 +2906,7 @@ Shortread_read_fastq_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input *nextchar = EOF; } else { *nextchar = gzgetc(*input1); - if ((fulllength = input_oneline_gzip(&(*nextchar),&long_read_1,&(Read1[0]),*input1,acc, + if ((fulllength = input_oneline_gzip(&(*nextchar),&long_read_1,&(Read1[0]),*input1, /*possible_fasta_header_p*/true)) == 0) { FREE_IN(acc); FREE_IN(restofheader); @@ -2914,7 +2922,7 @@ Shortread_read_fastq_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input } else { skip_header_gzip(*input1); *nextchar = gzgetc(*input1); - quality_length = input_oneline_gzip(&(*nextchar),&long_quality,&(Quality[0]),*input1,acc, + quality_length = input_oneline_gzip(&(*nextchar),&long_quality,&(Quality[0]),*input1, /*possible_fasta_header_p*/false); if (quality_length != fulllength) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2956,7 +2964,7 @@ Shortread_read_fastq_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input } } nextchar2 = gzgetc(*input2); - if ((fulllength = input_oneline_gzip(&nextchar2,&long_read_2,&(Read2[0]),*input2,acc, + if ((fulllength = input_oneline_gzip(&nextchar2,&long_read_2,&(Read2[0]),*input2, /*possible_fasta_header_p*/true)) == 0) { FREE_IN(acc); FREE_IN(restofheader); @@ -2972,7 +2980,7 @@ Shortread_read_fastq_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input } else { skip_header_gzip(*input2); nextchar2 = gzgetc(*input2); - quality_length = input_oneline_gzip(&nextchar2,&long_quality,&(Quality[0]),*input2,acc, + quality_length = input_oneline_gzip(&nextchar2,&long_quality,&(Quality[0]),*input2, /*possible_fasta_header_p*/false); if (quality_length != fulllength) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -2988,7 +2996,9 @@ Shortread_read_fastq_shortreads_gzip (int *nextchar, T *queryseq2, gzFile *input } } - if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { + if (queryseq1 == (T) SKIPPED) { + return (T) SKIPPED; + } else if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { return queryseq1; } } @@ -3058,7 +3068,7 @@ Shortread_read_fastq_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp *nextchar = EOF; } else { *nextchar = bzgetc(*input1); - if ((fulllength = input_oneline_bzip2(&(*nextchar),&long_read_1,&(Read1[0]),*input1,acc, + if ((fulllength = input_oneline_bzip2(&(*nextchar),&long_read_1,&(Read1[0]),*input1, /*possible_fasta_header_p*/true)) == 0) { FREE_IN(acc); FREE_IN(restofheader); @@ -3074,7 +3084,7 @@ Shortread_read_fastq_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp } else { skip_header_bzip2(*input1); *nextchar = bzgetc(*input1); - quality_length = input_oneline_bzip2(&(*nextchar),&long_quality,&(Quality[0]),*input1,acc, + quality_length = input_oneline_bzip2(&(*nextchar),&long_quality,&(Quality[0]),*input1, /*possible_fasta_header_p*/false); if (quality_length != fulllength) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -3116,7 +3126,7 @@ Shortread_read_fastq_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp } } nextchar2 = bzgetc(*input2); - if ((fulllength = input_oneline_bzip2(&nextchar2,&long_read_2,&(Read2[0]),*input2,acc, + if ((fulllength = input_oneline_bzip2(&nextchar2,&long_read_2,&(Read2[0]),*input2, /*possible_fasta_header_p*/true)) == 0) { FREE_IN(acc); FREE_IN(restofheader); @@ -3132,7 +3142,7 @@ Shortread_read_fastq_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp } else { skip_header_bzip2(*input2); nextchar2 = bzgetc(*input2); - quality_length = input_oneline_bzip2(&nextchar2,&long_quality,&(Quality[0]),*input2,acc, + quality_length = input_oneline_bzip2(&nextchar2,&long_quality,&(Quality[0]),*input2, /*possible_fasta_header_p*/false); if (quality_length != fulllength) { fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n", @@ -3148,7 +3158,9 @@ Shortread_read_fastq_shortreads_bzip2 (int *nextchar, T *queryseq2, Bzip2_T *inp } } - if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { + if (queryseq1 == (T) SKIPPED) { + return (T) SKIPPED; + } else if (queryseq1 != NULL && queryseq1->acc != NULL && queryseq1->fulllength > 0) { return queryseq1; } } diff --git a/src/stage1hr.c b/src/stage1hr.c index b09904b..f4e4209 100644 --- a/src/stage1hr.c +++ b/src/stage1hr.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: stage1hr.c 148544 2014-09-22 20:42:34Z twu $"; +static char rcsid[] = "$Id: stage1hr.c 151054 2014-10-16 19:58:21Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -12873,6 +12873,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this, if ((done_level = opt_level + subopt_levels) > user_maxlevel) { done_level = user_maxlevel; } + debug(printf("SA> opt_level %d, done_level %d\n",opt_level,done_level)); } else { #endif diff --git a/src/stage2.c b/src/stage2.c index 20e2896..98060a8 100644 --- a/src/stage2.c +++ b/src/stage2.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: stage2.c 146634 2014-09-02 22:33:39Z twu $"; +static char rcsid[] = "$Id: stage2.c 151086 2014-10-16 23:52:16Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -3411,28 +3411,26 @@ align_compute_scores_lookback (int *ncells, struct Link_T **links, Chrpos_T **ma if ((best_fwd_score = prevlink->fwd_score - (querypos - grand_fwd_querypos)) > 0) { prevposition = mappings[grand_fwd_querypos][grand_fwd_hit]; debug12(printf("Considering prevposition %u to position %u as a grand fwd lookback\n",prevposition,position)); - if (position > prevposition + maxintronlen) { - debug12(printf(" => Too long\n")); - } else { - for (hit = low_hit; hit < high_hit; hit++) { + for (hit = low_hit; hit < high_hit; hit++) { + if ((position = mappings[querypos][hit]) > prevposition + maxintronlen) { + debug12(printf(" => Too long\n")); + } else if (position >= prevposition + indexsize_nt) { currlink = &(links[querypos][hit]); - if ((position = mappings[querypos][hit]) >= prevposition + indexsize_nt) { - currlink->fwd_consecutive = indexsize_nt; - /* currlink->fwd_rootnlinks = 1; */ - currlink->fwd_pos = grand_fwd_querypos; - currlink->fwd_hit = grand_fwd_hit; - currlink->fwd_score = best_fwd_score; + currlink->fwd_consecutive = indexsize_nt; + /* currlink->fwd_rootnlinks = 1; */ + currlink->fwd_pos = grand_fwd_querypos; + currlink->fwd_hit = grand_fwd_hit; + currlink->fwd_score = best_fwd_score; #ifdef DEBUG9 - currlink->fwd_tracei = ++fwd_tracei; - currlink->fwd_intronnfwd = prevlink->fwd_intronnfwd; - currlink->fwd_intronnrev = prevlink->fwd_intronnrev; - currlink->fwd_intronnunk = prevlink->fwd_intronnunk + 1; + currlink->fwd_tracei = ++fwd_tracei; + currlink->fwd_intronnfwd = prevlink->fwd_intronnfwd; + currlink->fwd_intronnrev = prevlink->fwd_intronnrev; + currlink->fwd_intronnunk = prevlink->fwd_intronnunk + 1; #endif - } } - debug12(printf("At querypos %d, setting all fwd hits to point back to grand_fwd %d,%d with a score of %d\n", - querypos,grand_fwd_querypos,grand_fwd_hit,prevlink->fwd_score)); } + debug12(printf("At querypos %d, setting all fwd hits to point back to grand_fwd %d,%d with a score of %d\n", + querypos,grand_fwd_querypos,grand_fwd_hit,prevlink->fwd_score)); } } @@ -3461,28 +3459,26 @@ align_compute_scores_lookback (int *ncells, struct Link_T **links, Chrpos_T **ma if ((best_rev_score = prevlink->rev_score - (querypos - grand_rev_querypos)) > 0) { prevposition = mappings[grand_rev_querypos][grand_rev_hit]; debug12(printf("Considering prevposition %u to position %u as a grand rev lookback\n",prevposition,position)); - if (position > prevposition + maxintronlen) { - debug12(printf(" => Too long\n")); - } else { - for (hit = low_hit; hit < high_hit; hit++) { + for (hit = low_hit; hit < high_hit; hit++) { + if ((position = mappings[querypos][hit]) > prevposition + maxintronlen) { + debug12(printf(" => Too long\n")); + } else if (position >= prevposition + indexsize_nt) { currlink = &(links[querypos][hit]); - if ((position = mappings[querypos][hit]) >= prevposition + indexsize_nt) { - currlink->rev_consecutive = indexsize_nt; - /* currlink->rev_rootnlinks = 1; */ - currlink->rev_pos = grand_rev_querypos; - currlink->rev_hit = grand_rev_hit; - currlink->rev_score = best_rev_score; + currlink->rev_consecutive = indexsize_nt; + /* currlink->rev_rootnlinks = 1; */ + currlink->rev_pos = grand_rev_querypos; + currlink->rev_hit = grand_rev_hit; + currlink->rev_score = best_rev_score; #ifdef DEBUG9 - currlink->rev_tracei = ++rev_tracei; - currlink->rev_intronnrev = prevlink->rev_intronnfwd; - currlink->rev_intronnrev = prevlink->rev_intronnrev; - currlink->rev_intronnunk = prevlink->rev_intronnunk + 1; + currlink->rev_tracei = ++rev_tracei; + currlink->rev_intronnrev = prevlink->rev_intronnfwd; + currlink->rev_intronnrev = prevlink->rev_intronnrev; + currlink->rev_intronnunk = prevlink->rev_intronnunk + 1; #endif - } } - debug12(printf("At querypos %d, setting all rev hits to point back to grand_rev %d,%d with a score of %d\n", - querypos,grand_rev_querypos,grand_rev_hit,prevlink->rev_score)); } + debug12(printf("At querypos %d, setting all rev hits to point back to grand_rev %d,%d with a score of %d\n", + querypos,grand_rev_querypos,grand_rev_hit,prevlink->rev_score)); } } @@ -4179,29 +4175,27 @@ align_compute_scores_lookforward (int *ncells, struct Link_T **links, Chrpos_T * prevlink = &(links[grand_fwd_querypos][grand_fwd_hit]); if ((best_fwd_score = prevlink->fwd_score - (grand_fwd_querypos - querypos)) > 0) { prevposition = mappings[grand_fwd_querypos][grand_fwd_hit]; - debug12(printf("Considering prevposition %u to position %u as a grand fwd lookback\n",prevposition,position)); - if (position > prevposition + maxintronlen) { - debug12(printf(" => Too long\n")); - } else { - for (hit = high_hit - 1; hit >= low_hit; --hit) { + debug12(printf("Considering prevposition %u to position %u as a grand fwd lookforward\n",prevposition,position)); + for (hit = high_hit - 1; hit >= low_hit; --hit) { + if ((position = mappings[querypos][hit]) + maxintronlen < prevposition) { + debug12(printf(" => Too long\n")); + } else if (position + indexsize_nt <= prevposition) { currlink = &(links[querypos][hit]); - if ((position = mappings[querypos][hit]) + indexsize_nt <= prevposition) { - currlink->fwd_consecutive = indexsize_nt; - /* currlink->fwd_rootnlinks = 1; */ - currlink->fwd_pos = grand_fwd_querypos; - currlink->fwd_hit = grand_fwd_hit; - currlink->fwd_score = best_fwd_score; + currlink->fwd_consecutive = indexsize_nt; + /* currlink->fwd_rootnlinks = 1; */ + currlink->fwd_pos = grand_fwd_querypos; + currlink->fwd_hit = grand_fwd_hit; + currlink->fwd_score = best_fwd_score; #ifdef DEBUG9 - currlink->fwd_tracei = ++fwd_tracei; - currlink->fwd_intronnfwd = prevlink->fwd_intronnfwd; - currlink->fwd_intronnrev = prevlink->fwd_intronnrev; - currlink->fwd_intronnunk = prevlink->fwd_intronnunk + 1; + currlink->fwd_tracei = ++fwd_tracei; + currlink->fwd_intronnfwd = prevlink->fwd_intronnfwd; + currlink->fwd_intronnrev = prevlink->fwd_intronnrev; + currlink->fwd_intronnunk = prevlink->fwd_intronnunk + 1; #endif - } } - debug12(printf("At querypos %d, setting all fwd hits to point back to grand_fwd %d,%d with a score of %d\n", - querypos,grand_fwd_querypos,grand_fwd_hit,prevlink->fwd_score)); } + debug12(printf("At querypos %d, setting all fwd hits to point back to grand_fwd %d,%d with a score of %d\n", + querypos,grand_fwd_querypos,grand_fwd_hit,prevlink->fwd_score)); } } @@ -4229,29 +4223,27 @@ align_compute_scores_lookforward (int *ncells, struct Link_T **links, Chrpos_T * prevlink = &(links[grand_rev_querypos][grand_rev_hit]); if ((best_rev_score = prevlink->rev_score - (grand_rev_querypos - querypos)) > 0) { prevposition = mappings[grand_rev_querypos][grand_rev_hit]; - debug12(printf("Considering prevposition %u to position %u as a grand rev lookback\n",prevposition,position)); - if (position > prevposition + maxintronlen) { - debug12(printf(" => Too long\n")); - } else { - for (hit = high_hit - 1; hit >= low_hit; --hit) { + debug12(printf("Considering prevposition %u to position %u as a grand rev lookforward\n",prevposition,position)); + for (hit = high_hit - 1; hit >= low_hit; --hit) { + if ((position = mappings[querypos][hit]) + maxintronlen < prevposition) { + debug12(printf(" => Too long\n")); + } else if (position + indexsize_nt <= prevposition) { currlink = &(links[querypos][hit]); - if ((position = mappings[querypos][hit]) + indexsize_nt <= prevposition) { - currlink->rev_consecutive = indexsize_nt; - /* currlink->rev_rootnlinks = 1; */ - currlink->rev_pos = grand_rev_querypos; - currlink->rev_hit = grand_rev_hit; - currlink->rev_score = best_rev_score; + currlink->rev_consecutive = indexsize_nt; + /* currlink->rev_rootnlinks = 1; */ + currlink->rev_pos = grand_rev_querypos; + currlink->rev_hit = grand_rev_hit; + currlink->rev_score = best_rev_score; #ifdef DEBUG9 - currlink->rev_tracei = ++rev_tracei; - currlink->rev_intronnrev = prevlink->rev_intronnfwd; - currlink->rev_intronnrev = prevlink->rev_intronnrev; - currlink->rev_intronnunk = prevlink->rev_intronnunk + 1; + currlink->rev_tracei = ++rev_tracei; + currlink->rev_intronnrev = prevlink->rev_intronnfwd; + currlink->rev_intronnrev = prevlink->rev_intronnrev; + currlink->rev_intronnunk = prevlink->rev_intronnunk + 1; #endif - } } - debug12(printf("At querypos %d, setting all rev hits to point back to grand_rev %d,%d with a score of %d\n", - querypos,grand_rev_querypos,grand_rev_hit,prevlink->rev_score)); } + debug12(printf("At querypos %d, setting all rev hits to point back to grand_rev %d,%d with a score of %d\n", + querypos,grand_rev_querypos,grand_rev_hit,prevlink->rev_score)); } } diff --git a/src/stage3hr.c b/src/stage3hr.c index b507de0..44e4a80 100644 --- a/src/stage3hr.c +++ b/src/stage3hr.c @@ -1,4 +1,4 @@ -static char rcsid[] = "$Id: stage3hr.c 149163 2014-09-27 03:11:12Z twu $"; +static char rcsid[] = "$Id: stage3hr.c 151052 2014-10-16 19:56:35Z twu $"; #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -24,6 +24,7 @@ static char rcsid[] = "$Id: stage3hr.c 149163 2014-09-27 03:11:12Z twu $"; #include "fastlog.h" +#define SOFT_CLIPS_AVOID_CIRCULARIZATION 1 /* Needed to avoid CIGAR strings like 1S99H */ #define MAX_HITS 100000 @@ -3104,12 +3105,24 @@ compute_circularpos (int *alias, T hit) { hit->plusp,hit->querylength_adj); } else if (hit->plusp == true) { - if (hit->low + hit->trim_left >= hit->chroffset + hit->chrlength) { + if ( +#ifdef SOFT_CLIPS_AVOID_CIRCULARIZATION + hit->low + hit->trim_left >= hit->chroffset + hit->chrlength +#else + hit->low >= hit->chroffset + hit->chrlength +#endif + ) { /* All of read after trimming is in circular alias */ *alias = +1; return -1; - } else if (hit->high - hit->trim_right <= hit->chroffset + hit->chrlength) { + } else if ( +#ifdef SOFT_CLIPS_AVOID_CIRCULARIZATION + hit->high - hit->trim_right <= hit->chroffset + hit->chrlength +#else + hit->high <= hit->chroffset + hit->chrlength +#endif + ) { /* All of read after trimming is in circular proper */ *alias = -1; return -1; @@ -3131,26 +3144,40 @@ compute_circularpos (int *alias, T hit) { } } else { - if (hit->low + hit->trim_right >= hit->chroffset + hit->chrlength) { + if ( +#ifdef SOFT_CLIPS_AVOID_CIRCULARIZATION + hit->low + hit->trim_right >= hit->chroffset + hit->chrlength +#else + hit->low >= hit->chroffset + hit->chrlength +#endif + ) { /* All of read after trimming is in circular alias */ + debug12(printf("All of read after trimming is in circular alias\n")); *alias = +1; return -1; - } else if (hit->high - hit->trim_left <= hit->chroffset + hit->chrlength) { + } else if ( +#ifdef SOFT_CLIPS_AVOID_CIRCULARIZATION + hit->high - hit->trim_left <= hit->chroffset + hit->chrlength +#else + hit->high <= hit->chroffset + hit->chrlength +#endif + ) { /* All of read after trimming is in circular proper */ + debug12(printf("All of read after trimming is in circular proper\n")); *alias = -1; return -1; } else { *alias = 0; if ((circularpos = Substring_circularpos(hit->substring2)) > 0) { - debug12(printf("Returning circularpos %d from substring0 (minus)\n",circularpos)); + debug12(printf("Returning circularpos %d from substring2 (minus)\n",circularpos)); return circularpos; } else if ((circularpos = Substring_circularpos(hit->substring1)) > 0) { debug12(printf("Returning circularpos %d from substring1 (minus)\n",circularpos)); return circularpos; } else if ((circularpos = Substring_circularpos(hit->substring0)) > 0) { - debug12(printf("Returning circularpos %d from substring2 (minus)\n",circularpos)); + debug12(printf("Returning circularpos %d from substring0 (minus)\n",circularpos)); return circularpos; } else { return -1; @@ -6298,9 +6325,11 @@ List_T Stage3end_remove_circular_alias (List_T hitlist) { List_T newlist = NULL, p; T hit; +#ifdef SOFT_CLIPS_AVOID_CIRCULARIZATION int trim; +#endif - debug12(printf("Calling Stage3end_remove_circular_alias\n")); + debug12(printf("Calling Stage3end_remove_circular_alias on %d hits\n",List_length(hitlist))); for (p = hitlist; p != NULL; p = p->rest) { hit = (T) p->first; @@ -6315,14 +6344,24 @@ Stage3end_remove_circular_alias (List_T hitlist) { newlist = List_push(newlist,(void *) hit); } else { +#ifdef SOFT_CLIPS_AVOID_CIRCULARIZATION + /* This allows soft clips to avoid circularization */ if (hit->plusp == true) { trim = hit->trim_left; } else { trim = hit->trim_right; } +#endif - if (hit->low + trim >= hit->chroffset + hit->chrlength) { + if ( +#ifdef SOFT_CLIPS_AVOID_CIRCULARIZATION + hit->low + trim >= hit->chroffset + hit->chrlength +#else + hit->low >= hit->chroffset + hit->chrlength +#endif + ) { /* All in circular alias */ + debug12(printf("Freeing hit because all is in circular alias\n")); Stage3end_free(&hit); } else { -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
