This is an automated email from the git hooks/post-receive script. afif pushed a commit to branch master in repository daligner.
commit 297c59fbf22871b2231b605692a7ed6b8af20951 Author: Afif Elghraoui <[email protected]> Date: Wed Jan 18 22:05:50 2017 -0800 Imported Upstream version 1.0+20161119 --- DB.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ DB.h | 14 ++++++++- LAdump.c | 25 ++++++---------- LAmerge.c | 2 +- QV.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ QV.h | 3 ++ align.c | 17 ++++++----- align.h | 4 +-- filter.c | 10 +++++-- 9 files changed, 230 insertions(+), 38 deletions(-) diff --git a/DB.c b/DB.c index 46046b7..3afff14 100644 --- a/DB.c +++ b/DB.c @@ -314,6 +314,14 @@ void Upper_Read(char *s) *s = '\0'; } +void Letter_Arrow(char *s) +{ static char letter[4] = { '1', '2', '3', '4' }; + + for ( ; *s != 4; s++) + *s = letter[(int) *s]; + *s = '\0'; +} + // Convert read in ascii representation to [0-3] representation (end with 4) void Number_Read(char *s) @@ -341,6 +349,31 @@ void Number_Read(char *s) *s = 4; } +void Number_Arrow(char *s) +{ static char arrow[128] = + { 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 0, 1, 2, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + }; + + for ( ; *s != '\0'; s++) + *s = arrow[(int) *s]; + *s = 4; +} + /******************************************************************************************* * @@ -426,7 +459,7 @@ int Open_DB(char* path, HITS_DB *db) if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; - all = 1; + all = DB_ALL; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", @@ -466,7 +499,7 @@ int Open_DB(char* path, HITS_DB *db) db->tracks = NULL; db->part = part; db->cutoff = cutoff; - db->all = all; + db->allarr |= all; db->ufirst = ufirst; db->tfirst = tfirst; @@ -478,7 +511,7 @@ int Open_DB(char* path, HITS_DB *db) db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); - free(db->reads); + free(db->reads-1); goto error2; } } @@ -495,7 +528,7 @@ int Open_DB(char* path, HITS_DB *db) fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); - free(reads); + free(reads-1); goto error2; } @@ -557,10 +590,10 @@ void Trim_DB(HITS_DB *db) if (db->trimmed) return; - if (db->cutoff <= 0 && db->all) return; + if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return; cutoff = db->cutoff; - if (db->all) + if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; @@ -660,10 +693,10 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) if (!db->trimmed) return (0); - if (db->cutoff <= 0 && db->all) return (0); + if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0); cutoff = db->cutoff; - if (db->all) + if ((db->allarr & DB_ALL) != 0) allflag = 0; else allflag = DB_BEST; @@ -1435,6 +1468,56 @@ int Load_Read(HITS_DB *db, int i, char *read, int ascii) return (0); } +// Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1, +// and as a numeric string otherwise. +// + +HITS_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" +FILE *Arrow_File = NULL; // Becomes invalid after closing + +int Load_Arrow(HITS_DB *db, int i, char *read, int ascii) +{ FILE *arrow; + int64 off; + int len, clen; + HITS_READ *r = db->reads; + + if (i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + if (Arrow_DB != db) + { fclose(Arrow_File); + arrow = Fopen(Catenate(db->path,"","",".arw"),"r"); + if (arrow == NULL) + EXIT(1); + Arrow_File = arrow; + Arrow_DB = db; + } + else + arrow = Arrow_File; + + off = r[i].boff; + len = r[i].rlen; + + if (ftello(arrow) != off) + fseeko(arrow,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(read,clen,1,arrow) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + } + Uncompress_Read(len,read); + if (ascii == 1) + { Letter_Arrow(read); + read[-1] = '\0'; + } + else + read[-1] = 4; + return (0); +} + char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; diff --git a/DB.h b/DB.h index 983bee7..dc281de 100644 --- a/DB.h +++ b/DB.h @@ -164,6 +164,9 @@ void Lower_Read(char *s); // Convert read from numbers to lowercase letters void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT) void Number_Read(char *s); // Convert read from letters to numbers +void Letter_Arrow(char *s); // Convert arrow pw's from numbers to uppercase letters (0-3 to 1234) +void Number_Arrow(char *s); // Convert arrow pw string from letters to numbers + /******************************************************************************************* * @@ -175,6 +178,9 @@ void Number_Read(char *s); // Convert read from letters to numbers #define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert #define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1) +#define DB_ARROW 0x2 // DB is an arrow DB +#define DB_ALL 0x1 // all wells are in the trimmed DB + // Fields have different interpretations if a .db versus a .dam typedef struct @@ -185,6 +191,7 @@ typedef struct // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB), // Offset (in bytes) of scaffold header string in '.hdr' file (DAM) + // 4 compressed shorts containing snr info if an arrow DB. int flags; // QV of read + flags above (DB only) } HITS_READ; @@ -226,7 +233,7 @@ typedef struct { int ureads; // Total number of reads in untrimmed DB int treads; // Total number of reads in trimmed DB int cutoff; // Minimum read length in block (-1 if not yet set) - int all; // Consider multiple reads from a given well + int allarr; // DB_ALL | DB_ARROW float freq[4]; // frequency of A, C, G, T, respectively // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) @@ -368,6 +375,11 @@ char *New_Read_Buffer(HITS_DB *db); int Load_Read(HITS_DB *db, int i, char *read, int ascii); + // Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence, + // and there is only a choice between numeric (0) or ascii (1); + +int Load_Arrow(HITS_DB *db, int i, char *read, int ascii); + // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string diff --git a/LAdump.c b/LAdump.c index 070c3c2..716e873 100644 --- a/LAdump.c +++ b/LAdump.c @@ -40,6 +40,7 @@ int main(int argc, char *argv[]) FILE *input; int64 novl; int tspace, tbytes, small; + int tmax; int reps, *pts; int input_pts; @@ -279,7 +280,7 @@ int main(int argc, char *argv[]) { int j, al, tlen; int in, npt, idx, ar; - int64 novls, odeg, omax, sdeg, smax, ttot, tmax; + int64 novls, odeg, omax, sdeg, smax, ttot; in = 0; npt = pts[0]; @@ -359,28 +360,24 @@ int main(int argc, char *argv[]) printf("+ P %lld\n",novls); printf("%% P %lld\n",omax); - printf("+ T %lld\n",ttot); - printf("%% T %lld\n",smax); - printf("@ T %lld\n",tmax); + if (DOTRACE) + { printf("+ T %lld\n",ttot); + printf("%% T %lld\n",smax); + printf("@ T %lld\n",tmax); + } } // Read the file and display selected records { int j; uint16 *trace; - int tmax; int in, npt, idx, ar; int64 verse; rewind(input); - fread(&verse,sizeof(int64),1,input); + fread(&novl,sizeof(int64),1,input); fread(&tspace,sizeof(int),1,input); - if (verse < 0) - { for (j = 0; j < 5; j++) - fread(&verse,sizeof(int64),1,input); - } - tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); @@ -396,12 +393,6 @@ int main(int argc, char *argv[]) // Read it in { Read_Overlap(input,ovl); - if (ovl->path.tlen > tmax) - { tmax = ((int) 1.2*ovl->path.tlen) + 100; - trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); - if (trace == NULL) - exit (1); - } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); diff --git a/LAmerge.c b/LAmerge.c index 985a5f6..6a768fd 100644 --- a/LAmerge.c +++ b/LAmerge.c @@ -168,7 +168,7 @@ static void ovl_reload(IO_block *in, int64 bsize) remains = in->top - in->ptr; if (remains > 0) - memcpy(in->block, in->ptr, remains); + memmove(in->block, in->ptr, remains); in->ptr = in->block; in->top = in->block + remains; in->top += fread(in->top,1,bsize-remains,in->stream); diff --git a/QV.c b/QV.c index cdb6a63..d7d7263 100644 --- a/QV.c +++ b/QV.c @@ -863,6 +863,62 @@ static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding +void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV) +{ + if (rlen == 0) // Initialization call + { int i; + + // Zero histograms + + bzero(delHist,sizeof(uint64)*256); + bzero(mrgHist,sizeof(uint64)*256); + bzero(insHist,sizeof(uint64)*256); + bzero(subHist,sizeof(uint64)*256); + + for (i = 0; i < 256; i++) + delRun[i] = subRun[i] = 1; + + totChar = 0; + delChar = -1; + subChar = -1; + return; + } + + // Add streams to accumulating histograms and figure out the run chars + // for the deletion and substition streams + + Histogram_Seqs(delHist,(uint8 *) delQV,rlen); + Histogram_Seqs(insHist,(uint8 *) insQV,rlen); + Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen); + Histogram_Seqs(subHist,(uint8 *) subQV,rlen); + + if (delChar < 0) + { int k; + + for (k = 0; k < rlen; k++) + if (delTag[k] == 'n' || delTag[k] == 'N') + { delChar = delQV[k]; + break; + } + } + if (delChar >= 0) + Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar); + totChar += rlen; + if (subChar < 0) + { if (totChar >= 100000) + { int k; + + subChar = 0; + for (k = 1; k < 256; k++) + if (subHist[k] > subHist[subChar]) + subChar = k; + } + } + if (subChar >= 0) + Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar); + return; +} + int QVcoding_Scan(FILE *input, int num, FILE *temp) { char *slash; int rlen; @@ -1284,6 +1340,44 @@ void Free_QVcoding(QVcoding *coding) * ********************************************************************************************/ +void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, + FILE *output, QVcoding *coding, int lossy) +{ int clen; + + if (coding->delChar < 0) + { Encode(coding->delScheme, output, (uint8 *) del, rlen); + clen = rlen; + } + else + { Encode_Run(coding->delScheme, coding->dRunScheme, output, + (uint8 *) del, rlen, coding->delChar); + clen = Pack_Tag(tag,del,rlen,coding->delChar); + } + Number_Read(tag); + Compress_Read(clen,tag); + fwrite(tag,1,COMPRESSED_LEN(clen),output); + + if (lossy) + { uint8 *insert = (uint8 *) ins; + uint8 *merge = (uint8 *) mrg; + int k; + + for (k = 0; k < rlen; k++) + { insert[k] = (uint8) ((insert[k] >> 1) << 1); + merge[k] = (uint8) (( merge[k] >> 2) << 2); + } + } + + Encode(coding->insScheme, output, (uint8 *) ins, rlen); + Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen); + if (coding->subChar < 0) + Encode(coding->subScheme, output, (uint8 *) sub, rlen); + else + Encode_Run(coding->subScheme, coding->sRunScheme, output, + (uint8 *) sub, rlen, coding->subChar); + return; +} + int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; diff --git a/QV.h b/QV.h index 532b2f4..e5c9485 100644 --- a/QV.h +++ b/QV.h @@ -58,6 +58,7 @@ int Get_QV_Line(); // If there is an error then -1 is returned, otherwise the number of entries read. int QVcoding_Scan(FILE *input, int num, FILE *temp); +void QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub); // Given QVcoding_Scan has been called at least once, create an encoding scheme based on // the accumulated statistics and return a pointer to it. The returned encoding object @@ -83,6 +84,8 @@ void Free_QVcoding(QVcoding *coding); // occurred, and the sequence length otherwise. int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); +void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, + FILE *output, QVcoding *coding, int lossy); // Assuming the input is position just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, diff --git a/align.c b/align.c index eb877d4..449b833 100644 --- a/align.c +++ b/align.c @@ -2983,8 +2983,8 @@ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec, if (prefix) { if (reverse_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); - apath->aepos = (anti-diag)/2; - apath->bepos = (anti+diag)/2; + apath->aepos = (anti+diag)/2; + apath->bepos = (anti-diag)/2; #ifdef DEBUG_PASSES printf("E1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs); @@ -2993,8 +2993,8 @@ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec, else { if (forward_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); - apath->abpos = (anti-diag)/2; - apath->bbpos = (anti+diag)/2; + apath->abpos = (anti+diag)/2; + apath->bbpos = (anti-diag)/2; #ifdef DEBUG_PASSES printf("F1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs); @@ -3044,10 +3044,13 @@ int Read_Trace(FILE *input, Overlap *ovl, int tbytes) return (0); } -void Write_Overlap(FILE *output, Overlap *ovl, int tbytes) -{ fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output); +int Write_Overlap(FILE *output, Overlap *ovl, int tbytes) +{ if (fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output) != 1) + return (1); if (ovl->path.trace != NULL) - fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output); + if (fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output) != (size_t) ovl->path.tlen) + return (1); + return (0); } void Compress_TraceTo8(Overlap *ovl) diff --git a/align.h b/align.h index c3b31ae..daa9151 100644 --- a/align.h +++ b/align.h @@ -325,7 +325,7 @@ typedef struct { accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16). Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that - occupies 'tbytes' bytes per value. + occupies 'tbytes' bytes per value. It returns non-zero if there was an error writing. Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output' where the trace occupes 'tbytes' per value and the print out is indented from the left @@ -343,7 +343,7 @@ typedef struct { int Read_Overlap(FILE *input, Overlap *ovl); int Read_Trace(FILE *innput, Overlap *ovl, int tbytes); - void Write_Overlap(FILE *output, Overlap *ovl, int tbytes); + int Write_Overlap(FILE *output, Overlap *ovl, int tbytes); void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent); void Compress_TraceTo8(Overlap *ovl); diff --git a/filter.c b/filter.c index a9c5083..90ca7c3 100644 --- a/filter.c +++ b/filter.c @@ -1842,14 +1842,20 @@ static void *report_thread(void *arg) ovla->path.trace = tbuf->trace + (uint64) (ovla->path.trace); if (small) Compress_TraceTo8(ovla); - Write_Overlap(ofile1,ovla,tbytes); + if (Write_Overlap(ofile1,ovla,tbytes)) + { fprintf(stderr,"%s: Cannot write to /tmp, too small?\n",Prog_Name); + exit (1); + } } for (i = 0; i < novlb; i++) { ovlb->path = bmatch[i]; ovlb->path.trace = tbuf->trace + (uint64) (ovlb->path.trace); if (small) Compress_TraceTo8(ovlb); - Write_Overlap(ofile2,ovlb,tbytes); + if (Write_Overlap(ofile2,ovlb,tbytes)) + { fprintf(stderr,"%s: Cannot write to /tmp, too small?\n",Prog_Name); + exit (1); + } } ahits += novla; bhits += novlb; -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/daligner.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
