MonetDB: string_imprints - Start self contained refactoring

Panagiotis Koutsourakis Tue, 01 Jun 2021 09:28:52 -0700

Changeset: 1dd99390a261 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/1dd99390a261
Modified Files:
        gdk/gdk_private.h
        gdk/gdk_strimps.c
Branch: string_imprints
Log Message:


Start self contained refactoring

All the information should be contained in the BAT itself and not in
external data structures.


diffs (truncated from 384 to 300 lines):

diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -387,8 +387,7 @@ struct Imprints {
 
 struct Strimps {
        Heap strimps;
-       void *offsets_base;     /* pointer into strimps heap (pair offsets)  */
-       /* offsets_base is a pointer to either a uint8_t or a uint16_ */
+       uint8_t *sizes_base;    /* pointer into strimps heap (pair sizes)  */
        uint8_t *pairs_base;    /* pointer into strimps heap (pairs start)   */
        void *strimps_base;     /* pointer into strimps heap (strimps start) */
        /* strimps_base is a pointer to either a uint32_t or a uint64_t */
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -7,42 +7,47 @@
  */
 
 
-/* A string imprint is an index that can be used as a prefilter in LIKE
+/* Author: Panagiotis Koutsourakis
+ *
+ * A string imprint is an index that can be used as a prefilter in LIKE
  * queries. It has 2 components:
  *
- * - a header of 64 string element pairs (bytes in the current
- *   implementation but maybe unicode chars might make more sense).
+ * - a header of 32 or 64 string element pairs.
  *
- * - a 64 bit mask for each item in the BAT that encodes the presence or
- *   absence of each element of the header in the specific item.
+ * - a 32 or 64 bit mask for each string in the BAT that encodes the
+ *   presence or absence of each element of the header in the specific
+ *   item.
  *
  * A string imprint is stored in a new Heap in the BAT, aligned in 8
  * byte (64 bit) words.
  *
- * The first 64 bit word describes how the header of the strimp is
- * encoded. The least significant byte (v in the schematic below) is the
- * version number. The second (np) is the number of pairs in the
- * header. The third (b/p) is the number of bytes per pair if each pair
- * is encoded using a constant number of bytes or 0 if it is utf-8. The
- * next 2 bytes (hs) is the size of the header in bytes. The last 3
- * bytes needed to align to the 8 byte boundary should be zero, and are
- * reserved for future use.
+ * The first 64 bit word, the header descriptor, describes how the
+ * header of the strimp is encoded. The least significant byte (v in the
+ * schematic below) is the version number. The second (np) is the number
+ * of pairs in the header. The next 2 bytes (hs) is the size of the
+ * header in bytes. Finally the fifth byte is the persistence byte. The
+ * last 3 bytes needed to align to the 8 byte boundary should be zero,
+ * and are reserved for future use.
  *
- * In the current implementation we use 64 byte pairs for the header, so
+ * The following np bytes are the sizes of the pairs. These can have
+ * values from 2 to 8 and are the number of bytes that the corresponding
+ * pair takes up. Following that there are the bytes encoding the actual
+ * pairs.
  *
- * np  == 64
- * b/p == 2
- * hs  == 128
- *
- * The actual header follows. If it ends before an 8 byte boundary it
- * is padded with zeros.
+ * |   v   |  np   |      hs      |   p   |      reserved      |  8bytes
+ * |                                                           |             
---
+ *                         Strimp Header                                      |
+ * | psz_0 | psz_1 | ...                                       |              |
+ * |                                                           |  ---         |
+ * |                                                           |np bytes      |
+ * |                                               ... | psz_n |  ---      hs 
bytes
+ * |             pair_0          |           pair_1            |              |
+ * |                            ...                            |              |
+ * |                 pair_k-1                   |   pair_k     |              |
+ * |                          pair_n                           |              |
+ * |                                                           |             
---
  *
- * |  v   |  np   |  b/p |      hs      |     reserved         |  8bytes
- * |                                                           |        ---
- *                         Strimp Header                                 |
- * |                                                           |  hs bytes + 
padding
- * |                                                           |         |
- * |                                                           |        ---
+ *
  * The bitmasks for each string in the BAT follow after this.
  *
  * Strimp creation goes as follows:
@@ -50,10 +55,10 @@
  * - Construct a histogram of the element (byte or character) pairs for
  *   all the strings in the BAT.
  *
- * - Take the 64 most frequent pairs as the Strimp Header.
+ * - Take the 32/64 most frequent pairs as the Strimp Header.
  *
- * - For each string in the bat construct a 64 bit mask that encodes the
- *   presence or absence of each member of the header in the string.
+ * - For each string in the bat construct a 32/64 bit mask that encodes
+ *   the presence or absence of each member of the header in the string.
  */
 
 #include "monetdb_config.h"
@@ -61,36 +66,6 @@
 #include "gdk_private.h"
 
 #if 0
-/* This counts how many unicode codepoints the given string
- * contains.
- */
-static size_t
-STRMP_strlen(const uint8_t *s)
-{
-       size_t ret = 0;
-       size_t i;
-       int m,n;
-       uint8_t c;
-
-       i = 0;
-       while((c = *(s + i)) != 0) {
-               if (c < 0x80)
-                       i++;
-               else {
-                       for (n = 0, m=0x40; c & m; n++, m >>= 1)
-                               ;
-                       /* n is now the number of 10xxxxxx bytes that should
-                          follow. */
-                       if (n == 0 || n >= 4)
-                               /* TODO: handle invalid utf-8 */
-                               {}
-                       i += n+1;
-               }
-               ret++;
-       }
-
-       return ret;
-}
 
 /* Given a BAT return the number of digrams in it. The observation is
  * that the number of digrams is the number of characters - 1:
@@ -146,6 +121,53 @@ STRMPndigrams(BAT *b, size_t *n)
                ((TPE *) _a)[_j] = _t;                  \
        } while(0)
 
+/* This counts how many unicode codepoints the given string
+ * contains.
+ */
+static size_t
+STRMP_utf8_strlen(const uint8_t *s)
+{
+       size_t ret = 0;
+       size_t i;
+       int m,n;
+       uint8_t c;
+
+       i = 0;
+       while((c = *(s + i)) != 0) {
+               if (c < 0x80)
+                       i++;
+               else {
+                       for (n = 0, m=0x40; c & m; n++, m >>= 1)
+                               ;
+                       /* n is now the number of 10xxxxxx bytes that should
+                          follow. */
+                       if (n == 0 || n >= 4)
+                               /* TODO: handle invalid utf-8 */
+                               {}
+                       i += n+1;
+               }
+               ret++;
+       }
+
+       return ret;
+}
+
+static size_t
+STRMP_utf8_next_char_index(const str s, size_t cidx) {
+       assert(cidx < strlen(s)); /* !!! */
+       if (*(s+cidx+1) == 0)
+               return 0;
+       return cidx + 1;
+}
+
+#if 0
+static uint8_t*
+STRMP_utf8_char_at(const str s, size_t i, size_t *len) {
+       *len = 1;
+       return (uint8_t *)(s + i);
+}
+#endif
+
 /* Construct a histogram of pairs of bytes in the input BAT.
  *
  * Return the histogram in hist and the number of non-zero bins in
@@ -279,8 +301,12 @@ create_header(BAT *b)
        return header;
 }
 
+#define NPAIRS(d) (((d) & (0xff << 8)) >> 8)
+#define HSIZE(d) (((d) & (0xffff << 16)) >> 16)
+/* #define PSIZES(p) ((uint8_t *) ((p)+8)) */
+/* #define HPAIRS(p) ((uint8_t *) (PSIZES(p) + NPAIRS(*((uint64_t *)p)))) */
 
-/* Given a strimp h and a DataPair p, return the index i for which
+/* Given a strimp h and a pair p, return the index i for which
  *
  * h[i] == p
  *
@@ -289,16 +315,33 @@ create_header(BAT *b)
  * TODO: Should this be inlined somehow? (probably yes)
  */
 static int8_t
-lookup_index(StrimpHeader *h, DataPair n)
+lookup_index(BAT *b, uint8_t *pair, uint8_t psize)
 {
-       size_t i;
-       for(i = 0; i < STRIMP_HEADER_SIZE; i++)
-               if(h->bytepairs[i] == n)
-                       return i;
+       size_t i,j;
+       size_t idx = 0;
+       Heap strimp = b->tstrimps->strimps;
+       uint64_t desc = (uint64_t)strimp.base[0];
+       uint8_t npairs = NPAIRS(desc);
+       uint8_t *pair_sizes = b->tstrimps->sizes_base;
+       uint8_t *pairs = b->tstrimps->pairs_base;
+
+       for(i = 0; i < npairs; i++) {
+               if (psize == pair_sizes[i]) {
+                       uint8_t *h = pairs + idx;
+                       for (j = 0; j < psize; j++) {
+                               if(pair[j] != h[j])
+                                       break;
+                       }
+                       if (j == psize)
+                               return i;
+               }
+               idx += pair_sizes[i];
+       }
 
        return -1;
 }
 
+#define MAX_PAIR_SIZE 8
 
 /* Given a strimp header and a string compute the bitstring of which
  * digrams(byte pairs) are present in the string. The strimp header is a
@@ -307,19 +350,32 @@ lookup_index(StrimpHeader *h, DataPair n
  * This should probably be inlined.
  */
 static uint64_t
-STRMPmakebitstring(const str s, StrimpHeader *h)
+STRMPmakebitstring(const str s, BAT *b)
 {
        uint64_t ret = 0;
-       int8_t pair_idx;
-       char *it;
+       int8_t pair_idx = 0;
+       size_t idx = 0;
+       size_t nidx, nidx2;
+       size_t i;
+       uint8_t pair[MAX_PAIR_SIZE] = {0};
+       size_t psize = 0;
+       size_t ssize = strlen(s);
 
-       for(it = s; *it != 0 && *(it+1) != 0; it++) {
-               pair_idx = lookup_index(h, pairToIndex(*it, *(it+1)));
-               if (pair_idx >= 0)
+       for (idx = 0; idx < ssize - 1; idx++) { /* pairs start at positions < 
size-1 */
+               nidx = STRMP_utf8_next_char_index(s, idx);
+               assert(nidx > 0);
+               nidx2 = STRMP_utf8_next_char_index(s, nidx);
+               assert(nidx2 > 0);
+               psize = nidx2 - idx;
+               assert (psize < MAX_PAIR_SIZE);
+               for (i = idx; i < psize; i++)
+                       pair[idx + i] = *(s + idx + i);
+               pair_idx = lookup_index(b, pair, psize);
+               if (pair_idx >= 0) {
                        assert(pair_idx < STRIMP_HEADER_SIZE);
                        ret |= 0x1 << pair_idx;
+               }
        }
-
        return ret;
 }
 
@@ -372,7 +428,6 @@ create_strimp(BAT *b, StrimpHeader *h)
        return r;
 }
 
-
 static bool
 BATcheckstrimps(BAT *b)
 {
@@ -407,25 +462,23 @@ BATcheckstrimps(BAT *b)
                                         */
                                        if (read(fd, &desc, 8) == 8
                                            && (desc & 0xff) == STRIMP_VERSION
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: string_imprints - Start self contained refactoring

Reply via email to