Changeset: 9820d4322aa5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/9820d4322aa5
Branch: default
Log Message:

Merge with strimps_v3 branch


diffs (truncated from 2388 to 300 lines):

diff --git a/monetdb5/modules/atoms/CMakeLists.txt b/monetdb5/modules/atoms/CMakeLists.txt
--- a/monetdb5/modules/atoms/CMakeLists.txt
+++ b/monetdb5/modules/atoms/CMakeLists.txt
@@ -16,7 +16,7 @@ target_sources(atoms
   PRIVATE
   streams.c streams.h
   blob.c
-  str.c str.h
+  str.c str.h bigram.h
   strptime.c
   url.c
   uuid.c
diff --git a/monetdb5/modules/atoms/bigram.h b/monetdb5/modules/atoms/bigram.h
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/atoms/bigram.h
@@ -0,0 +1,49 @@
+/*
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2024 MonetDB Foundation;
+ * Copyright August 2008 - 2023 MonetDB B.V.;
+ * Copyright 1997 - July 2008 CWI.
+ */
+
+#include "monetdb_config.h"
+#include "gdk.h"
+
+#ifdef HAVE_HGE
+#define NGRAM_TYPE hge
+#define NGRAM_TYPENIL hge_nil
+#define NGRAM_CST(v) ((hge)LL_CONSTANT(v))
+#define NGRAM_BITS 127
+#define CHARMAP(s) (s & NGRAM_BITS)
+#define SZ 128
+#else
+#define NGRAM_TYPE lng
+#define NGRAM_TYPEID TYPE_lng
+#define NGRAM_TYPENIL lng_nil
+#define NGRAM_CST(v) LL_CONSTANT(v)
+#define NGRAM_BITS 63
+#define CHARMAP(s) (s & NGRAM_BITS)
+#define SZ 64
+#endif
+
+#define BIGRAM_SZ (SZ * SZ)
+#define NGRAM_MULTIPLE 16
+#define TOKEN1(s) (*s)
+#define TOKEN2(s) (*(s + 1))
+#define BIGRAM(s) (TOKEN1(s) && TOKEN2(s))
+
+#define ENC_TOKEN1(t) CHARMAP(*t)
+#define ENC_TOKEN2(t) CHARMAP(*(t + 1))
+
+typedef struct {
+       NGRAM_TYPE *idx;
+       NGRAM_TYPE *sigs;
+       unsigned *histogram;
+       unsigned min, max;
+       unsigned *lists;
+       unsigned *rids;
+} Ngrams;
diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c
--- a/monetdb5/modules/atoms/str.c
+++ b/monetdb5/modules/atoms/str.c
@@ -67,6 +67,7 @@
 #include <string.h>
 #include "mal_interpreter.h"
 #include "mutf8.h"
+#include "bigram.h"
 
 #define UTF8_assert(s)         assert(checkUTF8(s))
 
@@ -1822,7 +1823,7 @@ STRasciify(str *r, const char *const *s)
 }
 
 static inline void
-BBPnreclaim(int nargs, ...)
+BBPreclaim_n(int nargs, ...)
 {
        va_list valist;
        va_start(valist, nargs);
@@ -1833,446 +1834,135 @@ BBPnreclaim(int nargs, ...)
        va_end(valist);
 }
 
-#define HANDLE_TIMEOUT(qc)                                                                     \
-       do {                                                                                                    \
-               TIMEOUT_ERROR(qc, __FILE__, __func__, __LINE__);        \
-               msg = createException(MAL, fname, GDK_EXCEPTION);       \
-       } while (0)
+#define VALUE(s, x)  (s##_vars + VarHeapVal(s##_vals, (x), s##i->width))
+#define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
 
-#define scanloop(TEST, canditer_next)                                          \
+#define SCAN_LOOP(STR_CMP)                                                                     \
        do {                                                                                                    \
-               const oid off = b->hseqbase;                                            \
-               TIMEOUT_LOOP(ci.ncand, qry_ctx) {                                       \
-                       oid o = canditer_next(&ci);                                             \
-                       const char *restrict v = BUNtvar(bi, o - off);  \
-                       assert(rcnt < BATcapacity(bn));                                 \
-                       if (TEST)                                                                               \
-                               vals[rcnt++] = o;                                                       \
+               TIMEOUT_LOOP(lci->ncand, qry_ctx) {                                     \
+                       oid lo = canditer_next(lci);                                    \
+                       const char *ls = VALUE(l, lo - l_base);                 \
+                       if (!strNil(ls) && (STR_CMP))                                   \
+                               APPEND(rl, lo);                                                         \
                }                                                                                                       \
        } while (0)
 
 static str
-STRselect(MalStkPtr stk, InstrPtr pci,
-                 int (*str_icmp)(const char *, const char *, int),
-                 int (*str_cmp)(const char *, const char *, int),
-                 const char *fname)
+scan_loop_strselect(BAT *rl, BATiter *li, struct canditer *lci, const char *r,
+                                       int (*str_cmp)(const char *, const char *, int),
+                                       bool anti, const char *fname, QryCtx *qry_ctx)
 {
-       str msg = MAL_SUCCEED;
+       oid l_base = li->b->hseqbase;
+       const char *l_vars = li->vh->base, *l_vals = li->base;
+       int r_len = str_strlen(r);
 
-       bat *r_id = getArgReference_bat(stk, pci, 0);
-       bat b_id = *getArgReference_bat(stk, pci, 1);
-       bat cb_id = *getArgReference_bat(stk, pci, 2);
-       const char *key = *getArgReference_str(stk, pci, 3);
-       bit icase = pci->argc != 5;
-       bit anti = pci->argc == 5 ? *getArgReference_bit(stk, pci, 4) :
-               *getArgReference_bit(stk, pci, 5);
-
-       BAT *b, *cb = NULL, *bn = NULL, *old_s = NULL;;
-       BUN rcnt = 0;
-       struct canditer ci;
-       bool with_strimps = false,
-               with_strimps_anti = false;
-
-       if (!(b = BATdescriptor(b_id)))
-               throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
+       lng t0 = 0;
+       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
 
-       if (!is_bat_nil(cb_id) && !(cb = BATdescriptor(cb_id))) {
-               BBPreclaim(b);
-               throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
-       }
-
-       assert(ATOMstorage(b->ttype) == TYPE_str);
+       if (anti)
+               SCAN_LOOP(str_cmp(ls, r, r_len) != 0);
+       else
+               SCAN_LOOP(str_cmp(ls, r, r_len) == 0);
 
-       if (BAThasstrimps(b)) {
-               BAT *tmp_s;
-               if (STRMPcreate(b, NULL) == GDK_SUCCEED && (tmp_s = STRMPfilter(b, cb, key, anti)) != NULL) {
-                       old_s = cb;
-                       cb = tmp_s;
-                       if (!anti)
-                               with_strimps = true;
-                       else
-                               with_strimps_anti = true;
-               } else {
-                       /* strimps failed, continue without */
-                       GDKclrerr();
-               }
-       }
-
-       MT_thread_setalgorithm(with_strimps ?
-                                                  "string_select: strcmp function using strimps" :
-                                                  (with_strimps_anti ?
-                                                       "string_select: strcmp function using strimps anti"
-                                                       : "string_select: strcmp function with no accelerator"));
-
-       canditer_init(&ci, b, cb);
-       if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
-               BBPnreclaim(2, b, cb);
-               throw(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL);
+       BATsetcount(rl, BATcount(rl));
+       if (BATcount(rl) > 0) {
+               BATnegateprops(rl);
+               rl->tnonil = true;
+               rl->tnil = false;
        }
 
-       if (!strNil(key)) {
-               BATiter bi = bat_iterator(b);
-               QryCtx *qry_ctx = MT_thread_get_qry_ctx();
-               if (icase)
-                       str_cmp = str_icmp;
-               oid *vals = Tloc(bn, 0);
-               const int klen = str_strlen(key);
-               if (ci.tpe == cand_dense) {
-                       if (with_strimps_anti)
-                               scanloop(strNil(v) || str_cmp(v, key, klen) == 0, canditer_next_dense);
-                       else if (anti)
-                               scanloop(!strNil(v) && str_cmp(v, key, klen) != 0, canditer_next_dense);
-                       else
-                               scanloop(!strNil(v) && str_cmp(v, key, klen) == 0, canditer_next_dense);
-               } else {
-                       if (with_strimps_anti)
-                               scanloop(strNil(v) || str_cmp(v, key, klen) == 0, canditer_next);
-                       else if (anti)
-                               scanloop(!strNil(v) && str_cmp(v, key, klen) != 0, canditer_next);
-                       else
-                               scanloop(!strNil(v) && str_cmp(v, key, klen) == 0, canditer_next);
-               }
-               bat_iterator_end(&bi);
-               TIMEOUT_CHECK(qry_ctx, HANDLE_TIMEOUT(qry_ctx));
+       TRC_DEBUG(ALGO, "(%s, %s, l=%s #%zu [%s], cl=%s #%zu, time="LLFMT"usecs)\n",
+                         fname, "scan_loop_strselect",
+                         BATgetId(li->b), li->count, ATOMname(li->b->ttype),
+                         lci ? BATgetId(lci->s) : "NULL", lci ? lci->ncand : 0,
+                         GDKusec() - t0);
+
+       return MAL_SUCCEED;
+}
 
-               if (!msg) {
-                       BATsetcount(bn, rcnt);
-                       bn->tsorted = true;
-                       bn->trevsorted = bn->batCount <= 1;
-                       bn->tkey = true;
-                       bn->tnil = false;
-                       bn->tnonil = true;
-                       bn->tseqbase = rcnt == 0 ?
-                               0 : rcnt == 1 ?
-                               *(const oid *) Tloc(bn, 0) : rcnt == ci.ncand && ci.tpe == cand_dense ? ci.seq : oid_nil;
+static str
+STRselect(MalStkPtr stk, InstrPtr pci, const str fname,
+                 int (*str_cmp)(const char *, const char *, int))
+{
+       str msg = MAL_SUCCEED;
+       QryCtx *qry_ctx = MT_thread_get_qry_ctx();
+       BAT *l = NULL, *cl = NULL, *rl = NULL;
 
-                       if (with_strimps_anti) {
-                               BAT *rev;
-                               if (old_s) {
-                                       rev = BATdiffcand(old_s, bn);
-#ifndef NDEBUG
-                                       BAT *is = BATintersectcand(old_s, bn);
-                                       if (is) {
-                                               assert(is->batCount == bn->batCount);
-                                               BBPreclaim(is);
-                                       }
-                                       assert(rev->batCount == old_s->batCount - bn->batCount);
-#endif
-                               } else
-                                       rev = BATnegcands(0, b->batCount, bn);
+       bat *RL = getArgReference_bat(stk, pci, 0);
+       bat *L = getArgReference_bat(stk, pci, 1);
+       bat *CL = getArgReference_bat(stk, pci, 2);
+       const char *r = *getArgReference_str(stk, pci, 3);
+       bool icase = pci->argc != 5;
+       bool anti = pci->argc == 5 ? *getArgReference_bit(stk, pci, 4) :
+               *getArgReference_bit(stk, pci, 5);
 
-                               BBPreclaim(bn);
-                               bn = rev;
-                               if (bn == NULL)
-                                       msg = createException(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL);
-                       }
-               }
+       if (!(l = BATdescriptor(*L)))
+               throw(MAL, fname, RUNTIME_OBJECT_MISSING);
+
+       if (CL && !is_bat_nil(*CL) && !(cl = BATdescriptor(*CL))) {
+               BBPreclaim(l);
+               throw(MAL, fname, RUNTIME_OBJECT_MISSING);
        }
 
-       if (bn && !msg) {
-               *r_id = bn->batCacheid;
-               BBPkeepref(bn);
-       } else {
-               BBPreclaim(bn);
+       BATiter li = bat_iterator(l);
+       struct canditer lci;
+       canditer_init(&lci, l, cl);
+       size_t l_cnt = lci.ncand;
+
+       rl = COLnew(0, TYPE_oid, l_cnt, TRANSIENT);
+       if (!rl) {
+               BBPreclaim_n(2, l, cl);
+               throw(MAL, fname, MAL_MALLOC_FAIL);
        }
 
-       BBPnreclaim(3, b, cb, old_s);
+       if (icase) {
+               if (str_cmp == str_is_prefix)
+                       str_cmp = str_is_iprefix;
+               else if (str_cmp == str_is_suffix)
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to