Changeset: a418fcf02bae for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/a418fcf02bae
Modified Files:
        gdk/gdk.h
        gdk/gdk_strimps.c
        gdk/gdk_strimps.h
        monetdb5/modules/mal/pcre.c
        sql/test/strimps/Tests/strimps_not_like.SQL.py
Branch: strimps_update
Log Message:

Encode null values in the strimp structure

Using strimps for a NOT LIKE query (q) works by running the LIKE query (q') with
strimps and returning the complement of the result. The presence of NULL values
in the BAT complicates this strategy because the result of q' will not include
them and therefore its complement (the result of q) will.

In order to fix this issue we encode the NULL values in the strimp structure
itself so that the filtering process can be instructed to include them in the
resulting candidate list. We do that by using the most significant bit of the
bitstring: if it is 1 it means that the corresponding value in the BAT is NULL.
STRMPfilter now takes a new boolean argument that allows the inclusion of the
NULL entries in the resulting candidate list.

We also need to modify the relevant pcre functions and macros to also include
NULL values in the case that we want to take the complement of the result.


diffs (truncated from 337 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1844,7 +1844,7 @@ gdk_export lng IMPSimprintsize(BAT *b);
 
 /* Strimps exported functions */
 gdk_export gdk_return STRMPcreate(BAT *b, BAT *s);
-gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q);
+gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q, bool keep_nils);
 gdk_export void STRMPdestroy(BAT *b);
 gdk_export bool BAThasstrimps(BAT *b);
 
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -12,10 +12,11 @@
  * A string imprint is an index that can be used as a prefilter in LIKE
  * queries. It has 2 components:
  *
- * - a header of 64 string element pairs.
+ * - a header of 63 string element pairs.
  *
  * - a 64 bit mask for each string in the BAT that encodes the presence
- *   or absence of each element of the header in the specific item.
+ *   or absence of each element of the header in the specific item or if
+ *   the corresponding entry in the BAT is NULL.
  *
  * A string imprint is stored in a new Heap in the BAT, aligned in 8
  * byte (64 bit) words.
@@ -59,11 +60,11 @@
  * - Construct a histogram of all the element pairs for all the strings
  *   in the BAT.
  *
- * - Take the np most frequent pairs as the Strimp Header.
+ * - Take the np - 1 most frequent pairs as the Strimp Header.
  *
  * - For each string s in the BAT, construct an np-bit mask, m_s that
  *   encodes the presence or absence of each member of the header in the
- *   string.
+ *   string or if s is NULL.
  *
  * Filtering with a query string q goes as follows:
  *
@@ -241,11 +242,6 @@ STRMPmakebitstring(const char *s, Strimp
  * STRIMP_HEADER_SIZE largest elements. This depends on the size of the
  * histogram n. For some small n sorting might be more efficient, but
  * for such inputs the difference should not be noticeable.
- *
- * TODO: Explore if a priority queue (heap construction and 64 extract
- * maximums) is worth it. The tradeoff here is that it will complicate
- * the code but might improve performance. In a debug build the current
- * implementation takes ~200 μs.
  */
 static void
 STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp)
@@ -443,7 +439,7 @@ BATcheckstrimps(BAT *b)
                                         */
                                        if (read(fd, &desc, 8) == 8
                                            && (desc & 0xff) == STRIMP_VERSION
-                                           && ((npairs = NPAIRS(desc)) == 64)
+                                           && ((npairs = NPAIRS(desc)) == 
STRIMP_PAIRS)
                                            && (hsize = HSIZE(desc)) >= 200 && 
hsize <= 584
                                            && ((desc >> 32) & 0xff) == 1 /* 
check the persistence byte */
                                            && fstat(fd, &st) == 0
@@ -455,8 +451,8 @@ BATcheckstrimps(BAT *b)
                                                                      
BATcount(b)*sizeof(uint64_t))
                                            && HEAPload(&hp->strimps, nme, 
"tstrimps", false) == GDK_SUCCEED) {
                                                hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
-                                               hp->pairs_base = hp->sizes_base 
+ npairs;         /* pairs just after the offsets */
-                                               hp->bitstrings_base = 
hp->strimps.base + hsize;        /* bitmasks just after the pairs */
+                                               hp->pairs_base = hp->sizes_base 
+ STRIMP_HEADER_SIZE;   /* pairs just after the offsets. */
+                                               hp->bitstrings_base = 
hp->strimps.base + hsize;   /* bitmasks just after the pairs */
 
                                                close(fd);
                                                ATOMIC_INIT(&hp->strimps.refs, 
1);
@@ -496,7 +492,8 @@ BATcheckstrimps(BAT *b)
        do {                                                            \
                for (i = 0; i < ci.ncand; i++) {                        \
                        x = next(&ci);                                  \
-                       if ((bitstring_array[x] & qbmask) == qbmask) {  \
+                       if ((bitstring_array[x] & qbmask) == qbmask || \
+                           (keep_nils && (bitstring_array[x] & ((uint64_t)0x1 
<< (STRIMP_HEADER_SIZE - 1))))) { \
                                rvals[j++] = x;                         \
                        }                                               \
                }                                                       \
@@ -506,7 +503,7 @@ BATcheckstrimps(BAT *b)
  * list.
  */
 BAT *
-STRMPfilter(BAT *b, BAT *s, const char *q)
+STRMPfilter(BAT *b, BAT *s, const char *q, const bool keep_nils)
 {
        BAT *r = NULL;
        BUN i, j = 0;
@@ -546,6 +543,7 @@ STRMPfilter(BAT *b, BAT *s, const char *
        }
 
        qbmask = STRMPmakebitstring(q, strmps);
+       assert((qbmask & ((uint64_t)0x1 << 63)) == 0);
        bitstring_array = (uint64_t *)strmps->bitstrings_base;
        rvals = Tloc(r, 0);
 
@@ -658,6 +656,13 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
        if ((r = b->tstrimps) == NULL &&
                STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs, put
                                                 the result in hpairs */
+               /* The 64th bit in the bit string is used to indicate if
+                  the string is NULL. So the corresponding pair does
+                  not encode useful information. We need to keep it for
+                  alignment but we make sure that it will not match an
+                  actual pair of characters we encounter in strings.*/
+               for (i = 0; i < hpairs[STRIMP_HEADER_SIZE - 1].psize; i++)
+                       hpairs[STRIMP_HEADER_SIZE - 1].pbytes[i] = 0;
                sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor 
and
                                                the pair sizes */
                for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
@@ -679,7 +684,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
                        return NULL;
                }
 
-               descriptor = STRIMP_VERSION | ((uint64_t)STRIMP_HEADER_SIZE) << 
8 |
+               descriptor = STRIMP_VERSION | ((uint64_t)(STRIMP_PAIRS)) << 8 |
                        ((uint64_t)sz) << 16;
 
                ((uint64_t *)r->strimps.base)[0] = descriptor;
@@ -761,10 +766,12 @@ STRMPcreate(BAT *b, BAT *s)
                        for (i = 0; i < ci.ncand; i++) {
                                x = canditer_next(&ci);
                                const char *cs = BUNtvar(bi, x);
-                               if (!strNil(cs))
+                               if (!strNil(cs)) {
                                        *dh++ = STRMPmakebitstring(cs, r);
+                                       assert((*(dh - 1) & ((uint64_t)0x1 << 
(STRIMP_PAIRS))) == 0);
+                               }
                                else
-                                       *dh++ = 0;
+                                       *dh++ = (uint64_t)0x1 << 
(STRIMP_PAIRS); /* Encode NULL strings in the last bit */
                        }
                        bat_iterator_end(&bi);
 
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -12,10 +12,10 @@
 #include <stdint.h>
 
 
-#define STRIMP_VERSION (uint64_t)1
+#define STRIMP_VERSION (uint64_t)2
 #define STRIMP_HISTSIZE 256*256
 #define STRIMP_HEADER_SIZE 64
-#define STRIMP_CREATION_THRESHOLD 5000 /* do not create strimp for "small" 
BATs */
+#define STRIMP_PAIRS (STRIMP_HEADER_SIZE - 1)
 
 typedef struct {
        uint8_t *pbytes;
diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1726,7 +1726,7 @@ BATPCREnotlike(Client cntxt, MalBlkPtr m
 }
 
 /* scan select loop with or without candidates */
-#define pcrescanloop(TEST)             \
+#define pcrescanloop(TEST, KEEP_NULLS)                         \
        do {    \
                TRC_DEBUG(ALGO,                 \
                                  "PCREselect(b=%s#"BUNFMT",anti=%d): "         
\
@@ -1737,7 +1737,7 @@ BATPCREnotlike(Client cntxt, MalBlkPtr m
                 GDK_CHECK_TIMEOUT(timeoffset, counter,                         
                \
                         GOTO_LABEL_TIMEOUT_HANDLER(bailout));                  
        \
                                const char *restrict v = BUNtvar(bi, p - off);  
\
-                               if (TEST)       \
+                               if ((TEST) || ((KEEP_NULLS) && *v == '\200'))   
\
                                        vals[cnt++] = p;        \
                        }               \
                } else {                \
@@ -1746,7 +1746,7 @@ BATPCREnotlike(Client cntxt, MalBlkPtr m
                         GOTO_LABEL_TIMEOUT_HANDLER(bailout));                  
        \
                                oid o = canditer_next(ci);              \
                                const char *restrict v = BUNtvar(bi, o - off);  
\
-                               if (TEST)       \
+                               if ((TEST) || ((KEEP_NULLS) && *v == '\200'))   
\
                                        vals[cnt++] = o;        \
                        }               \
                }               \
@@ -1759,7 +1759,7 @@ BATPCREnotlike(Client cntxt, MalBlkPtr m
 #endif
 
 static str
-pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, 
BUN *rcnt, const char *pat, bool caseignore, bool anti)
+pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, 
BUN *rcnt, const char *pat, bool caseignore, bool anti, bool keep_nulls)
 {
 #ifdef HAVE_LIBPCRE
        pcre *re = NULL;
@@ -1784,9 +1784,9 @@ pcre_likeselect(BAT *bn, BAT *b, BAT *s,
                goto bailout;
 
        if (anti)
-               pcrescanloop(v && *v != '\200' && !PCRE_LIKESELECT_BODY);
+               pcrescanloop(v && *v != '\200' && !PCRE_LIKESELECT_BODY, 
keep_nulls);
        else
-               pcrescanloop(v && *v != '\200' && PCRE_LIKESELECT_BODY);
+               pcrescanloop(v && *v != '\200' && PCRE_LIKESELECT_BODY, 
keep_nulls);
 
 bailout:
        bat_iterator_end(&bi);
@@ -1796,7 +1796,7 @@ bailout:
 }
 
 static str
-re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, BUN 
*rcnt, const char *pat, bool caseignore, bool anti, bool use_strcmp, uint32_t 
esc)
+re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, BUN 
*rcnt, const char *pat, bool caseignore, bool anti, bool use_strcmp, uint32_t 
esc, bool keep_nulls)
 {
        BATiter bi = bat_iterator(b);
        BUN cnt = 0, ncands = ci->ncand;
@@ -1817,26 +1817,26 @@ re_likeselect(BAT *bn, BAT *b, BAT *s, s
        if (use_strcmp) {
                if (caseignore) {
                        if (anti)
-                               pcrescanloop(v && *v != '\200' && 
mywstrcasecmp(v, wpat) != 0);
+                               pcrescanloop(v && *v != '\200' && 
mywstrcasecmp(v, wpat) != 0, keep_nulls);
                        else
-                               pcrescanloop(v && *v != '\200' && 
mywstrcasecmp(v, wpat) == 0);
+                               pcrescanloop(v && *v != '\200' && 
mywstrcasecmp(v, wpat) == 0, keep_nulls);
                } else {
                        if (anti)
-                               pcrescanloop(v && *v != '\200' && strcmp(v, 
pat) != 0);
+                               pcrescanloop(v && *v != '\200' && strcmp(v, 
pat) != 0, keep_nulls);
                        else
-                               pcrescanloop(v && *v != '\200' && strcmp(v, 
pat) == 0);
+                               pcrescanloop(v && *v != '\200' && strcmp(v, 
pat) == 0, keep_nulls);
                }
        } else {
                if (caseignore) {
                        if (anti)
-                               pcrescanloop(v && *v != '\200' && 
!re_match_ignore(v, re));
+                               pcrescanloop(v && *v != '\200' && 
!re_match_ignore(v, re), keep_nulls);
                        else
-                               pcrescanloop(v && *v != '\200' && 
re_match_ignore(v, re));
+                               pcrescanloop(v && *v != '\200' && 
re_match_ignore(v, re), keep_nulls);
                } else {
                        if (anti)
-                               pcrescanloop(v && *v != '\200' && 
!re_match_no_ignore(v, re));
+                               pcrescanloop(v && *v != '\200' && 
!re_match_no_ignore(v, re), keep_nulls);
                        else
-                               pcrescanloop(v && *v != '\200' && 
re_match_no_ignore(v, re));
+                               pcrescanloop(v && *v != '\200' && 
re_match_no_ignore(v, re), keep_nulls);
                }
        }
 
@@ -1887,23 +1887,20 @@ PCRElikeselect(bat *ret, const bat *bid,
         * the BAT contains NULLs.
         */
        if (BAThasstrimps(b)) {
-               if (!*anti || (b->tnonil && *anti)) {
-                       BAT *tmp_s = STRMPfilter(b, s, *pat);
-                       if (tmp_s) {
-                               old_s = s;
-                               s = tmp_s;
-                               if (!*anti)
-                                       with_strimps = true;
-                               else
-                                       with_strimps_anti = true;
-                       } else { /* If we cannot filter with the strimp just 
continue normally */
-                               GDKclrerr();
-                       }
+               BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
+               if (tmp_s) {
+                       old_s = s;
+                       s = tmp_s;
+                       if (!*anti)
+                               with_strimps = true;
+                       else
+                               with_strimps_anti = true;
+               } else { /* If we cannot filter with the strimp just continue 
normally */
+                       GDKclrerr();
                }
        }
 
-       MT_thread_setalgorithm(// check this
-                                                  use_strcmp ? (with_strimps ? 
"pcrelike: pattern matching using strcmp with strimps" : (with_strimps_anti ? 
"pcrelike: pattern matching using strcmp with strimps anti" : "pcrelike: 
pattern matching using strcmp")) :
+       MT_thread_setalgorithm(use_strcmp ? (with_strimps ? "pcrelike: pattern 
matching using strcmp with strimps" : (with_strimps_anti ? "pcrelike: pattern 
matching using strcmp with strimps anti" : "pcrelike: pattern matching using 
strcmp")) :
                                                   use_re ? (with_strimps ? 
"pcrelike: pattern matching using RE with strimps" : (with_strimps_anti ? 
"pcrelike: patterm matching using RE with strimps anti" : "pcrelike: pattern 
matching using RE")) :
                                                   (with_strimps ? "pcrelike: 
pattern matching using pcre with strimps" : (with_strimps_anti ? "pcrelike: 
pattermatching using pcre with strimps anti" : "pcrelike: pattern matching 
using pcre")));
 
@@ -1929,9 +1926,9 @@ PCRElikeselect(bat *ret, const bat *bid,
        }
 
        if (use_re) {
-               msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, 
*caseignore, *anti && !with_strimps_anti, use_strcmp, (unsigned char) **esc);
+               msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, 
*caseignore, *anti && !with_strimps_anti, use_strcmp, (unsigned char) **esc,  
with_strimps_anti);
        } else {
-               msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, 
*caseignore, *anti && !with_strimps_anti);
+               msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, 
*caseignore, *anti && !with_strimps_anti, with_strimps_anti);
        }
 
        if (!msg) { /* set some properties */
diff --git a/sql/test/strimps/Tests/strimps_not_like.SQL.py 
b/sql/test/strimps/Tests/strimps_not_like.SQL.py
--- a/sql/test/strimps/Tests/strimps_not_like.SQL.py
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to