Changeset: 6ab7ac7f1321 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6ab7ac7f1321
Modified Files:
        gdk/gdk.h
        gdk/gdk_private.h
        gdk/gdk_strimps.c
        gdk/gdk_strimps.h
Branch: string_imprints
Log Message:

Functions to construct the string imprint for a given BAT


diffs (153 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -701,6 +701,7 @@ typedef struct {
        Hash *hash;             /* hash table */
        Imprints *imprints;     /* column imprints index */
        Heap *orderidx;         /* order oid index */
+       Heap *strimps;          /* string imprint index  */
 
        PROPrec *props;         /* list of dynamic properties stored in the bat 
descriptor */
 } COLrec;
@@ -772,6 +773,7 @@ typedef struct BATiter {
 #define thash          T.hash
 #define timprints      T.imprints
 #define tprops         T.props
+#define tstrimps       T.strimps
 
 
 
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -25,7 +25,8 @@ enum heaptype {
        varheap,
        hashheap,
        imprintsheap,
-       orderidxheap
+       orderidxheap,
+       strimpheap
 };
 
 #ifdef GDKLIBRARY_OLDDATE
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -267,3 +267,85 @@ GDKstrimp_make_bitstring(const str s, St
 
        return ret;
 }
+
+/* Create the heap for a string imprint. Returns NULL on failure.
+ *
+ * The first STRIMP_OFFSET uint64_t words of the heap are filled in
+ * here: word 0 receives STRIMP_VERSION and the words after it receive
+ * the byte pairs of the header h, packed four to a word with the
+ * lowest-indexed pair of each group in the most significant 16 bits.
+ * On return r->free is the byte offset just past that prefix.
+ * HEAPalloc is asked for BATcount(b) further uint64_t items behind the
+ * prefix; those are left for the caller to fill.
+ */
+static Heap *
+createStrimpheap(BAT *b, StrimpHeader *h)
+{
+       Heap *r = NULL;
+       uint64_t *d;
+       size_t i,j;
+       const char *nme;
+
+       nme = GDKinmemory(b->theap.farmid) ? ":memory:"
+                                          : BBP_physical(b->batCacheid);
+       /* STRIMP_OFFSET is wrapped in parentheses at every use so the
+        * arithmetic is right even if the macro's replacement list is
+        * itself an unparenthesized sum.
+        */
+       if ((r = GDKzalloc(sizeof(Heap))) == NULL ||
+           (r->farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 ||
+           strconcat_len(r->filename, sizeof(r->filename),
+                         nme, ".strimp", NULL) >= sizeof(r->filename) ||
+           HEAPalloc(r, BATcount(b) + (STRIMP_OFFSET),
+                     sizeof(uint64_t)) != GDK_SUCCEED) {
+               GDKfree(r);
+               return NULL;
+       }
+       r->free = (STRIMP_OFFSET) * sizeof(uint64_t);
+
+       d = (uint64_t *)r->base;
+       /* Word 0 is the version slot counted by STRIMP_OFFSET. */
+       *d++ = STRIMP_VERSION;
+       /* This loop assumes that we are working with byte pairs
+        * (i.e. the type of the header is uint16_t). TODO: generalize.
+        */
+       for (i = 0; i < STRIMP_HEADER_SIZE; i += 4) {
+               uint64_t w = 0;
+
+               for (j = 0; j < 4; j++) {
+                       w <<= 16;
+                       w |= h->bytepairs[i + j];
+               }
+               /* Advance after each packed word; otherwise every group
+                * of four pairs would overwrite the same slot.
+                */
+               *d++ = w;
+       }
+       return r;
+}
+
+/* Build the string imprint for the string BAT b and attach it to b.
+ *
+ * A header is obtained from create_header(), a heap is set up with
+ * createStrimpheap(), and then one uint64_t bitstring per value of b
+ * is written behind the heap's STRIMP_OFFSET-word prefix (nil strings
+ * get an all-zero bitstring). The finished heap is stored in
+ * b->tstrimps while holding b->batIdxLock.
+ *
+ * Returns GDK_SUCCEED, or GDK_FAIL when the header or the heap could
+ * not be created.
+ */
+gdk_return
+GDKstrimp_create_strimp(BAT *b)
+{
+       lng t0 = 0;
+       BATiter bi;
+       BUN i;
+       str s;
+       StrimpHeader *head;
+       Heap *h;
+       uint64_t *dh;
+
+       assert(b->ttype == TYPE_str);
+       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+
+       if ((head = create_header(b)) == NULL) {
+               return GDK_FAIL;
+       }
+
+       if ((h = createStrimpheap(b, head)) == NULL) {
+               GDKfree(head);
+               return GDK_FAIL;
+       }
+       /* Position the cursor in uint64_t words. h->free is a byte
+        * count, so adding it directly to a uint64_t pointer would
+        * overshoot by a factor of sizeof(uint64_t).
+        */
+       dh = (uint64_t *)h->base + (STRIMP_OFFSET);
+
+       bi = bat_iterator(b);
+       for (i = 0; i < b->batCount; i++) {
+               s = (str)BUNtvar(bi, i);
+               if (!strNil(s))
+                       *dh++ = GDKstrimp_make_bitstring(s, head);
+               else
+                       *dh++ = 0; /* no pairs in nil values */
+
+       }
+       /* Record how much of the heap is now in use: prefix + imprints. */
+       h->free = ((STRIMP_OFFSET) + (size_t)b->batCount) * sizeof(uint64_t);
+
+       /* The header is no longer needed once every bitstring has been
+        * computed; release it so it does not leak on the success path.
+        */
+       GDKfree(head);
+
+       /* After we have computed the strimp, attempt to write it back
+        * to the BAT.
+        */
+       MT_lock_set(&b->batIdxLock);
+       b->tstrimps = h;
+       b->batDirtydesc = true;
+       /* persistStrimp(b) */
+       MT_lock_unset(&b->batIdxLock);
+
+       TRC_DEBUG(ALGO, "strimp creation took " LLFMT " usec\n", GDKusec()-t0);
+       return GDK_SUCCEED;
+}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -11,19 +11,25 @@
 
 #include <stdint.h>
 
+
+#define STRIMP_VERSION (uint64_t)1
 /* Count the occurences of pairs of bytes. This is a compromise between
  * just handling ASCII and full UTF-8 support.
  */
 #define STRIMP_HISTSIZE 256*256
-#define STRIMP_SIZE 64
+#define STRIMP_HEADER_SIZE 64
+/* Number of uint64_t words that precede the imprints in the heap:
+ * one version word + the header's byte pairs. The expansion is fully
+ * parenthesized so it can be used safely inside larger expressions
+ * such as STRIMP_OFFSET * sizeof(uint64_t).
+ */
+#define STRIMP_OFFSET \
+       (1 + STRIMP_HEADER_SIZE * sizeof(DataPair) / sizeof(uint64_t))
 
+
+typedef uint16_t DataPair;
 typedef struct {
        // TODO: find a better name for this
-       uint16_t bytepairs[STRIMP_SIZE];
+       DataPair bytepairs[STRIMP_HEADER_SIZE];
 } StrimpHeader;
 
 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); // Remove?
 gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins); // make static
 // gdk_export gdk_return GDKstrimp_make_header(StrimpHeader *h, uint64_t 
*hist, size_t hist_size); // make static
-gdk_export gdk_return GDKstrimp_make_header(BAT *b);
+//gdk_export gdk_return GDKstrimp_make_header(BAT *b);
+gdk_export gdk_return GDKstrimp_create_strimp(BAT *b);
 #endif /* _GDK_STRIMPS_H_ */
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to