Changeset: 28e6713b8d2d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=28e6713b8d2d
Modified Files:
	clients/Tests/MAL-signatures.stable.out
	clients/Tests/MAL-signatures.stable.out.int128
	clients/Tests/exports.stable.out
	gdk/ChangeLog
	gdk/gdk.h
	gdk/gdk_align.c
	gdk/gdk_bat.c
	gdk/gdk_batop.c
	gdk/gdk_logger.c
	monetdb5/modules/kernel/bat5.c
	monetdb5/modules/kernel/bat5.h
	monetdb5/modules/kernel/bat5.mal
	monetdb5/modules/mal/Tests/inspect05.stable.out.int128
	monetdb5/modules/mal/mat.c
	sql/backends/monet5/sql.c
	sql/storage/bat/bat_storage.c
	sql/storage/bat/bat_table.c
	sql/test/BugTracker-2016/Tests/storagemodel.stable.out
	sql/test/BugTracker-2016/Tests/storagemodel.stable.out.int128
Branch: default
Log Message:
Implemented a candidate list for BATappend. diffs (truncated from 1252 to 300 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -537,6 +537,8 @@ Ready. [ "bam", "sam_export", "pattern bam.sam_export(output_path:str):void ", "sam_exportf;", "Export results in the bam.export table to a SAM file" ] [ "bam", "seq_char", "command bam.seq_char(ref_pos:int,alg_seq:str,alg_pos:int,alg_cigar:str):str ", "seq_char;", "Calculate the character in the alignment string (alg_str) that is aligned to position 'ref_pos', conforming to the given cigar string" ] [ "bam", "seq_length", "command bam.seq_length(cigar:str):int ", "seq_length;", "Calculate the real length of a DNA sequence, given its CIGAR string." ] +[ "bat", "append", "command bat.append(i:bat[:any_1],u:bat[:any_1],s:bat[:oid],force:bit):bat[:any_1] ", "BKCappend_cand_force_wrap;", "append the content of u with candidate list s to i" ] +[ "bat", "append", "command bat.append(i:bat[:any_1],u:bat[:any_1],s:bat[:oid]):bat[:any_1] ", "BKCappend_cand_wrap;", "append the content of u with candidate list s to i" ] [ "bat", "append", "command bat.append(i:bat[:any_1],u:bat[:any_1],force:bit):bat[:any_1] ", "BKCappend_force_wrap;", "append the content of u to i" ] [ "bat", "append", "command bat.append(i:bat[:any_1],u:any_1,force:bit):bat[:any_1] ", "BKCappend_val_force_wrap;", "append the value u to i" ] [ "bat", "append", "command bat.append(i:bat[:any_1],u:any_1):bat[:any_1] ", "BKCappend_val_wrap;", "append the value u to i" ] diff --git a/clients/Tests/MAL-signatures.stable.out.int128 b/clients/Tests/MAL-signatures.stable.out.int128 --- a/clients/Tests/MAL-signatures.stable.out.int128 +++ b/clients/Tests/MAL-signatures.stable.out.int128 @@ -641,6 +641,8 @@ Ready. 
[ "bam", "sam_export", "pattern bam.sam_export(output_path:str):void ", "sam_exportf;", "Export results in the bam.export table to a SAM file" ] [ "bam", "seq_char", "command bam.seq_char(ref_pos:int,alg_seq:str,alg_pos:int,alg_cigar:str):str ", "seq_char;", "Calculate the character in the alignment string (alg_str) that is aligned to position 'ref_pos', conforming to the given cigar string" ] [ "bam", "seq_length", "command bam.seq_length(cigar:str):int ", "seq_length;", "Calculate the real length of a DNA sequence, given its CIGAR string." ] +[ "bat", "append", "command bat.append(i:bat[:any_1],u:bat[:any_1],s:bat[:oid],force:bit):bat[:any_1] ", "BKCappend_cand_force_wrap;", "append the content of u with candidate list s to i" ] +[ "bat", "append", "command bat.append(i:bat[:any_1],u:bat[:any_1],s:bat[:oid]):bat[:any_1] ", "BKCappend_cand_wrap;", "append the content of u with candidate list s to i" ] [ "bat", "append", "command bat.append(i:bat[:any_1],u:bat[:any_1],force:bit):bat[:any_1] ", "BKCappend_force_wrap;", "append the content of u to i" ] [ "bat", "append", "command bat.append(i:bat[:any_1],u:any_1,force:bit):bat[:any_1] ", "BKCappend_val_force_wrap;", "append the value u to i" ] [ "bat", "append", "command bat.append(i:bat[:any_1],u:any_1):bat[:any_1] ", "BKCappend_val_wrap;", "append the value u to i" ] diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -19,7 +19,7 @@ int ATOMlen(int id, const void *v); str ATOMname(int id); ptr ATOMnil(int id); int ATOMprint(int id, const void *val, stream *fd); -gdk_return BATappend(BAT *b, BAT *c, bit force); +gdk_return BATappend(BAT *b, BAT *n, BAT *s, bit force); void BATassertProps(BAT *b); atomDesc BATatoms[]; BAT *BATattach(int tt, const char *heapfile, int role); @@ -779,6 +779,8 @@ str BATXMLstr2xml(bat *ret, const bat *b str BATXMLxml2str(bat *ret, const bat *bid); str BATXMLxmltext(bat *ret, 
const bat *bid); str BATXMLxquery(bat *ret, const bat *bid, const char *const *expr); +str BKCappend_cand_force_wrap(bat *r, const bat *bid, const bat *uid, const bat *sid, const bit *force); +str BKCappend_cand_wrap(bat *r, const bat *bid, const bat *uid, const bat *sid); str BKCappend_force_wrap(bat *r, const bat *bid, const bat *uid, const bit *force); str BKCappend_val_force_wrap(bat *r, const bat *bid, const void *u, const bit *force); str BKCappend_val_wrap(bat *r, const bat *bid, const void *u); diff --git a/gdk/ChangeLog b/gdk/ChangeLog --- a/gdk/ChangeLog +++ b/gdk/ChangeLog @@ -1,6 +1,10 @@ # ChangeLog file for MonetDB # This file is updated with Maddlog +* Mon Dec 5 2016 Sjoerd Mullender <[email protected]> +- BATappend now takes an optional (NULL if not used) candidate list for + the to-be-appended BAT. + * Thu Dec 1 2016 Sjoerd Mullender <[email protected]> - New function BATkeyed(BAT *b) that determines (possibly using a hash table) whether all values in b are distinct. diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -1030,7 +1030,7 @@ gdk_export bte ATOMelmshift(int sz); * @- BUN manipulation * @multitable @columnfractions 0.08 0.7 * @item BAT* - * @tab BATappend (BAT *b, BAT *c, bit force) + * @tab BATappend (BAT *b, BAT *n, BAT *s, bit force) * @item BAT* * @tab BUNappend (BAT *b, ptr right, bit force) * @item BAT* @@ -1262,7 +1262,7 @@ gdk_export bte ATOMelmshift(int sz); gdk_export gdk_return GDKupgradevarheap(BAT *b, var_t v, int copyall, int mayshare); gdk_export gdk_return BUNappend(BAT *b, const void *right, bit force); -gdk_export gdk_return BATappend(BAT *b, BAT *c, bit force); +gdk_export gdk_return BATappend(BAT *b, BAT *n, BAT *s, bit force); gdk_export gdk_return BUNdelete(BAT *b, oid o); gdk_export gdk_return BATdel(BAT *b, BAT *d); diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c --- a/gdk/gdk_align.c +++ b/gdk/gdk_align.c @@ -451,7 +451,7 @@ VIEWreset(BAT *b) b->batCapacity = cnt; /* insert all of v in b, and quit 
*/ - BATappend(b, v, FALSE); + BATappend(b, v, NULL, FALSE); BBPreclaim(v); } return GDK_SUCCEED; diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -2002,6 +2002,12 @@ BATmode(BAT *b, int mode) * then all values are equal. * revsorted The column is reversely sorted (descending). If * also sorted, then all values are equal. + * nosorted BUN position which proofs not sorted (given position + * and one before are not ordered correctly). + * norevsorted BUN position which proofs not revsorted (given position + * and one before are not ordered correctly). + * nokey Pair of BUN positions that proof not all values are + * distinct (i.e. values at given locations are equal). * * In addition there is a property "unique" that, when set, indicates * that values must be kept unique (and hence that the "key" property diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -17,6 +17,7 @@ #include "monetdb_config.h" #include "gdk.h" #include "gdk_private.h" +#include "gdk_cand.h" gdk_return unshare_string_heap(BAT *b) @@ -55,11 +56,11 @@ unshare_string_heap(BAT *b) * of inserting individual strings. See the comments in the code for * more information. 
*/ static gdk_return -insert_string_bat(BAT *b, BAT *n, int force) +insert_string_bat(BAT *b, BAT *n, BAT *s, int force) { BATiter ni; /* iterator */ size_t toff = ~(size_t) 0; /* tail offset */ - BUN p, q; /* loop variables */ + BUN p, r; /* loop variables */ const void *tp; /* tail value pointer */ unsigned char tbv; /* tail value-as-bte */ unsigned short tsv; /* tail value-as-sht */ @@ -68,15 +69,21 @@ insert_string_bat(BAT *b, BAT *n, int fo #endif var_t v; /* value */ size_t off; /* offset within n's string heap */ + BUN start, end, cnt; + const oid *restrict cand = NULL, *candend = NULL; assert(b->ttype == TYPE_str); /* only transient bats can use some other bat's string heap */ assert(b->batRole == TRANSIENT || b->tvheap->parentid == abs(b->batCacheid)); - if (n->batCount == 0) + if (n->batCount == 0 || (s && s->batCount == 0)) return GDK_SUCCEED; ni = bat_iterator(n); tp = NULL; + CANDINIT(n, s, start, end, cnt, cand, candend); + cnt = cand ? (BUN) (candend - cand) : end - start; + if (cnt == 0) + return GDK_SUCCEED; if ((!GDK_ELIMDOUBLES(b->tvheap) || b->batCount == 0) && !GDK_ELIMDOUBLES(n->tvheap) && b->tvheap->hashash == n->tvheap->hashash) { @@ -97,7 +104,12 @@ insert_string_bat(BAT *b, BAT *n, int fo */ bat bid = b->batCacheid; - if (b->batCount == 0 && b->tvheap != n->tvheap) { + /* if cand != NULL, there is no wholesale + * copying of n's offset heap, but we may + * still be able to share the string heap */ + if (b->batCount == 0 && + b->tvheap != n->tvheap && + cand == NULL) { if (b->tvheap->parentid != bid) { BBPunshare(b->tvheap->parentid); } else { @@ -107,14 +119,15 @@ insert_string_bat(BAT *b, BAT *n, int fo BBPshare(n->tvheap->parentid); b->tvheap = n->tvheap; toff = 0; - } else if (b->tvheap->parentid == n->tvheap->parentid) { + } else if (b->tvheap->parentid == n->tvheap->parentid && + cand == NULL) { toff = 0; } else if (b->tvheap->parentid != bid && unshare_string_heap(b) != GDK_SUCCEED) { return GDK_FAIL; } } - if (toff == ~(size_t) 0 && 
n->batCount > 1024) { + if (toff == ~(size_t) 0 && cnt > 1024) { /* If b and n aren't sharing their string * heaps, we try to determine whether to copy * n's whole string heap to the end of b's, or @@ -136,7 +149,11 @@ insert_string_bat(BAT *b, BAT *n, int fo int match = 0, i; size_t len = b->tvheap->hashash ? 1024 * EXTRALEN : 0; for (i = 0; i < 1024; i++) { - p = (BUN) (((double) rand() / RAND_MAX) * (BATcount(n) - 1)); + p = (BUN) (((double) rand() / RAND_MAX) * (cnt - 1)); + if (cand) + p = cand[p] - n->hseqbase; + else + p += start; off = BUNtvaroff(ni, p); if (off < b->tvheap->free && strcmp(b->tvheap->base + off, n->tvheap->base + off) == 0 && @@ -214,9 +231,9 @@ insert_string_bat(BAT *b, BAT *n, int fo if (toff == 0 && n->twidth == b->twidth) { /* we don't need to do any translation of offset * values, so we can use fast memcpy */ - memcpy(Tloc(b, BUNlast(b)), Tloc(n, 0), - BATcount(n) * n->twidth); - BATsetcount(b, BATcount(b) + BATcount(n)); + memcpy(Tloc(b, BUNlast(b)), Tloc(n, start), + cnt * n->twidth); + BATsetcount(b, BATcount(b) + cnt); } else if (toff != ~(size_t) 0) { /* we don't need to insert any actual strings since we * have already made sure that they are all in b's @@ -235,21 +252,30 @@ insert_string_bat(BAT *b, BAT *n, int fo #endif const var_t *restrict tvp = (const var_t *) Tloc(n, 0); - BATloop(n, p, q) { + for (;;) { + if (cand) { + if (cand == candend) + break; + p = *cand++ - n->hseqbase; + } else { + p = start++; + } + if (p >= end) + break; switch (n->twidth) { case 1: - v = (var_t) *tbp++ + GDK_VAROFFSET; + v = (var_t) tbp[p] + GDK_VAROFFSET; break; case 2: - v = (var_t) *tsp++ + GDK_VAROFFSET; + v = (var_t) tsp[p] + GDK_VAROFFSET; break; #if SIZEOF_VAR_T == 8 case 4: - v = (var_t) *tip++; + v = (var_t) tip[p]; break; #endif default: - v = *tvp++; + v = tvp[p]; break; } v = (var_t) ((((size_t) v << GDK_VARSHIFT) + toff) >> GDK_VARSHIFT); @@ -279,8 +305,24 @@ insert_string_bat(BAT *b, BAT *n, int fo /* if b's string heap is much 
smaller than n's string * heap, don't bother checking whether n's string * values occur in b's string heap */ - BATloop(n, p, q) { - bunfastapp(b, BUNtvar(ni, p)); + r = BUNlast(b); + if (cand) { + oid hseq = n->hseqbase; + while (cand < candend) { + tp = BUNtvar(ni, *cand - hseq); + bunfastapp(b, tp); + HASHins(b, r, tp); + r++; + cand++; + } + } else { + while (start < end) { + tp = BUNtvar(ni, start); + bunfastapp(b, tp); + HASHins(b, r, tp); + r++; + start++; + } } } else { /* Insert values from n individually into b; however, @@ -289,7 +331,17 @@ insert_string_bat(BAT *b, BAT *n, int fo * string heap (in case b's string heap is a copy of * n's). If this is the case, we just copy the * offset, otherwise we insert normally. */ - BATloop(n, p, q) { + r = BUNlast(b); + for (;;) { + if (cand) { + if (cand == candend) + break; + p = *cand++ - n->hseqbase; + } else { + p = start++; + } + if (p >= end) + break; off = BUNtvaroff(ni, p); /* the offset */ tp = n->tvheap->base + off; /* the string */ if (off < b->tvheap->free && @@ -337,6 +389,8 @@ insert_string_bat(BAT *b, BAT *n, int fo } else { bunfastapp(b, tp); } _______________________________________________ checkin-list mailing list [email protected] https://www.monetdb.org/mailman/listinfo/checkin-list
