Changeset: ce0943817915 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ce0943817915
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:
Rewrite strimp construction as a single threaded process
diffs (truncated from 352 to 300 lines):
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -115,21 +115,21 @@ pair_equal(CharPair *p1, CharPair *p2) {
#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
#define pairToIndex(b1, b2) (size_t)(((uint16_t)b2)<<8 | ((uint16_t)b1))
-static bool
+inline static bool
pair_equal(CharPair *p1, CharPair *p2) {
return p1->pbytes[0] == p2->pbytes[0] &&
p1->pbytes[1] == p2->pbytes[1];
}
-static size_t
+inline static size_t
histogram_index(PairHistogramElem *hist, size_t hsize, CharPair *p) {
(void) hist;
(void) hsize;
return pairToIndex(p->pbytes[0], p->pbytes[1]);
}
-static bool
+inline static bool
pair_at(PairIterator *pi, CharPair *p) {
if (pi->pos >= pi->lim)
return false;
@@ -138,7 +138,7 @@ pair_at(PairIterator *pi, CharPair *p) {
return true;
}
-static bool
+inline static bool
next_pair(PairIterator *pi) {
if (pi->pos >= pi->lim)
return false;
@@ -148,7 +148,7 @@ next_pair(PairIterator *pi) {
/* Returns true if the specified char is ignored.
*/
-static bool
+inline static bool
ignored(CharPair *p, uint8_t elm) {
assert(elm == 0 || elm == 1);
return isIgnored(p->pbytes[elm]);
@@ -236,7 +236,8 @@ STRMPmakebitstring(const str s, Strimps
*
* TODO: Explore if a priority queue (heap construction and 64 extract
* maximums) is worth it. The tradeoff here is that it will complicate
- * the code but might improve performance.
+ * the code but might improve performance. In a debug build the current
+ * implementation takes ~200 μs.
*/
static void
STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp)
@@ -377,75 +378,6 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
return values >= STRIMP_HEADER_SIZE;
}
-/* Creates the heap for a string imprint. Returns NULL on failure. This
- * follows closely the Heap creation for the order index.
- */
-static Strimps *
-STRMPcreateStrimpHeap(BAT *b, BAT *s)
-{
- uint8_t *h1, *h2;
- Strimps *r = NULL;
- uint64_t descriptor;
- size_t i;
- uint16_t sz;
- CharPair hpairs[STRIMP_HEADER_SIZE];
- const char *nme;
-
- if (b->tstrimps == NULL) {
- MT_lock_set(&b->batIdxLock);
- /* Make sure no other thread got here first */
- if (b->tstrimps == NULL &&
- STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs,
put the result in hpairs */
- sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the
descriptor and the pair sizes */
- for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
- sz += hpairs[i].psize;
- }
-
- nme = GDKinmemory(b->theap->farmid) ? ":memory:" :
BBP_physical(b->batCacheid);
- /* Allocate the strimps heap */
- if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
- (r->strimps.farmid = BBPselectfarm(b->batRole,
b->ttype, strimpheap)) < 0 ||
- strconcat_len(r->strimps.filename,
sizeof(r->strimps.filename), nme,
- ".tstrimps", NULL) >=
sizeof(r->strimps.filename) ||
- HEAPalloc(&r->strimps, BATcount(b) *
sizeof(uint64_t) + sz, sizeof(uint8_t), 0) != GDK_SUCCEED) {
- GDKfree(r);
- MT_lock_unset(&b->batIdxLock);
- return NULL;
- }
-
- descriptor = STRIMP_VERSION |
((uint64_t)STRIMP_HEADER_SIZE) << 8 | ((uint64_t)sz) << 16;
-
- ((uint64_t *)r->strimps.base)[0] = descriptor;
- r->sizes_base = h1 = (uint8_t *)r->strimps.base + 8;
- r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE;
-
- for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
- uint8_t psize = hpairs[i].psize;
- h1[i] = psize;
- memcpy(h2, hpairs[i].pbytes, psize);
- h2 += psize;
- }
- r->bitstrings_base = h2;
- r->strimps.free = sz;
-
- b->tstrimps = r;
- b->batDirtydesc = true;
- }
- MT_lock_unset(&b->batIdxLock);
- }
- return b->tstrimps;
-}
-
-/* This macro takes a bat and checks if the strimp construction has been
- * completed. It is completed when the strimp pointer is not null and it
- * is either 1 (i.e. it exists on disk) or the number of bitstrings
- * computed is the same as the number of elements in the BAT.
- */
-#define STRIMP_COMPLETE(b) \
- b->tstrimps != NULL && \
- (b->tstrimps == (Strimps *)1 || \
- (b->tstrimps->strimps.free - ((char
*)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base)) ==
b->batCount*sizeof(uint64_t))
-
static bool
BATcheckstrimps(BAT *b)
{
@@ -529,7 +461,7 @@ BATcheckstrimps(BAT *b)
* not null and the number of bitstrings is equal to the bat
* count.
*/
- ret = STRIMP_COMPLETE(b);
+ ret = b->tstrimps != NULL;
if (ret) {
TRC_DEBUG(ACCELERATOR,
"BATcheckstrimps(" ALGOBATFMT "): already has
strimps, waited " LLFMT " usec\n",
@@ -676,6 +608,205 @@ persistStrimp(BAT *b)
TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT
persisting strimp\n", ALGOBATPAR(b));
}
+static Strimps *
+STRMPcreateStrimpHeap(BAT *b, BAT *s)
+{
+ uint8_t *h1, *h2;
+ Strimps *r = NULL;
+ uint64_t descriptor;
+ size_t i;
+ uint16_t sz;
+ CharPair hpairs[STRIMP_HEADER_SIZE];
+ const char *nme;
+
+ if ((r = b->tstrimps) == NULL &&
+ STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs, put
+ the result in hpairs */
+ sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor
and
+ the pair sizes */
+ for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
+ sz += hpairs[i].psize;
+ }
+
+ nme = GDKinmemory(b->theap->farmid) ? ":memory:"
+ : BBP_physical(b->batCacheid);
+ /* Allocate the strimps heap */
+ if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
+ (r->strimps.farmid =
+ BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 ||
+ strconcat_len(r->strimps.filename,
sizeof(r->strimps.filename),
+ nme, ".tstrimps",
+ NULL) >= sizeof(r->strimps.filename) ||
+ HEAPalloc(&r->strimps, BATcount(b) * sizeof(uint64_t) + sz,
+ sizeof(uint8_t), 0) != GDK_SUCCEED) {
+ GDKfree(r);
+ return NULL;
+ }
+
+ descriptor = STRIMP_VERSION | ((uint64_t)STRIMP_HEADER_SIZE) <<
8 |
+ ((uint64_t)sz) << 16;
+
+ ((uint64_t *)r->strimps.base)[0] = descriptor;
+ r->sizes_base = h1 = (uint8_t *)r->strimps.base + 8;
+ r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE;
+
+ for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
+ uint8_t psize = hpairs[i].psize;
+ h1[i] = psize;
+ memcpy(h2, hpairs[i].pbytes, psize);
+ h2 += psize;
+ }
+ r->bitstrings_base = h2;
+ r->strimps.free = sz;
+
+ }
+ return r;
+}
+
+gdk_return
+STRMPcreate(BAT *b, BAT *s) {
+ lng t0 = 0;
+ BAT *pb;
+
+ TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
+ if (ATOMstorage(b->ttype) != TYPE_str) {
+ GDKerror("Cannot create strimps index for non string bats\n");
+ return GDK_FAIL;
+ }
+
+ if (VIEWtparent(b)) {
+ pb = BBP_cache(VIEWtparent(b));
+ assert(pb);
+ } else {
+ pb = b;
+ }
+
+ if (BATcheckstrimps(pb)) {
+ return GDK_SUCCEED;
+ }
+
+ if (pb->tstrimps == NULL) {
+ MT_lock_set(&pb->batIdxLock);
+ if (pb->tstrimps == NULL) {
+ Strimps *r;
+ BATiter bi;
+ BUN i, ncand;
+ oid x;
+ struct canditer ci;
+ str cs;
+ uint64_t *dh;
+
+ if ((r = STRMPcreateStrimpHeap(pb, s)) == NULL) {
+ MT_lock_unset(&b->batIdxLock);
+ return GDK_FAIL;
+ }
+ HEAPincref(&r->strimps);
+ dh = (uint64_t *)r->bitstrings_base;
+
+ /* Compute bitstrings */
+ ncand = canditer_init(&ci, pb, NULL);
+ bi = bat_iterator(pb);
+ for (i = 0; i < ncand; i++) {
+ x = canditer_next(&ci);
+ cs = (str)BUNtvar(bi, x);
+ if (!strNil(cs))
+ *dh++ = STRMPmakebitstring(cs, r);
+ else
+ *dh++ = 0;
+ }
+ bat_iterator_end(&bi);
+
+ pb->tstrimps = r;
+ pb->batDirtydesc = true;
+ persistStrimp(pb);
+ }
+ MT_lock_unset(&pb->batIdxLock);
+ }
+ TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n",
GDKusec()-t0);
+ return GDK_SUCCEED;
+}
+
+/* Parallel creation */
+#if 0
+/* Creates the heap for a string imprint. Returns NULL on failure. This
+ * follows closely the Heap creation for the order index.
+ */
+static Strimps *
+STRMPcreateStrimpHeap(BAT *b, BAT *s)
+{
+ uint8_t *h1, *h2;
+ Strimps *r = NULL;
+ uint64_t descriptor;
+ size_t i;
+ uint16_t sz;
+ CharPair hpairs[STRIMP_HEADER_SIZE];
+ const char *nme;
+
+ MT_lock_set(&b->batIdxLock);
+ /* Make sure no other thread got here first */
+ if ((r = b->tstrimps) == NULL) {
+ if (STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs,
put
+ the result in hpairs */
+ sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the
descriptor and
+ the pair sizes */
+ for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
+ sz += hpairs[i].psize;
+ }
+
+ nme = GDKinmemory(b->theap->farmid) ? ":memory:"
+ : BBP_physical(b->batCacheid);
+ /* Allocate the strimps heap */
+ if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
+ (r->strimps.farmid =
+ BBPselectfarm(b->batRole, b->ttype, strimpheap)) <
0 ||
+ strconcat_len(r->strimps.filename,
sizeof(r->strimps.filename),
+ nme, ".tstrimps",
+ NULL) >= sizeof(r->strimps.filename)
||
+ HEAPalloc(&r->strimps, BATcount(b) *
sizeof(uint64_t) + sz,
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list