Changeset: 9820d4322aa5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/9820d4322aa5
Branch: default
Log Message:
Merge with strimps_v3 branch diffs (truncated from 2388 to 300 lines): diff --git a/monetdb5/modules/atoms/CMakeLists.txt b/monetdb5/modules/atoms/CMakeLists.txt --- a/monetdb5/modules/atoms/CMakeLists.txt +++ b/monetdb5/modules/atoms/CMakeLists.txt @@ -16,7 +16,7 @@ target_sources(atoms PRIVATE streams.c streams.h blob.c - str.c str.h + str.c str.h bigram.h strptime.c url.c uuid.c diff --git a/monetdb5/modules/atoms/bigram.h b/monetdb5/modules/atoms/bigram.h new file mode 100644 --- /dev/null +++ b/monetdb5/modules/atoms/bigram.h @@ -0,0 +1,49 @@ +/* + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 2024 MonetDB Foundation; + * Copyright August 2008 - 2023 MonetDB B.V.; + * Copyright 1997 - July 2008 CWI. + */ + +#include "monetdb_config.h" +#include "gdk.h" + +#ifdef HAVE_HGE +#define NGRAM_TYPE hge +#define NGRAM_TYPENIL hge_nil +#define NGRAM_CST(v) ((hge)LL_CONSTANT(v)) +#define NGRAM_BITS 127 +#define CHARMAP(s) (s & NGRAM_BITS) +#define SZ 128 +#else +#define NGRAM_TYPE lng +#define NGRAM_TYPEID TYPE_lng +#define NGRAM_TYPENIL lng_nil +#define NGRAM_CST(v) LL_CONSTANT(v) +#define NGRAM_BITS 63 +#define CHARMAP(s) (s & NGRAM_BITS) +#define SZ 64 +#endif + +#define BIGRAM_SZ (SZ * SZ) +#define NGRAM_MULTIPLE 16 +#define TOKEN1(s) (*s) +#define TOKEN2(s) (*(s + 1)) +#define BIGRAM(s) (TOKEN1(s) && TOKEN2(s)) + +#define ENC_TOKEN1(t) CHARMAP(*t) +#define ENC_TOKEN2(t) CHARMAP(*(t + 1)) + +typedef struct { + NGRAM_TYPE *idx; + NGRAM_TYPE *sigs; + unsigned *histogram; + unsigned min, max; + unsigned *lists; + unsigned *rids; +} Ngrams; diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c --- a/monetdb5/modules/atoms/str.c +++ b/monetdb5/modules/atoms/str.c @@ -67,6 +67,7 @@ #include <string.h> #include "mal_interpreter.h" #include 
"mutf8.h" +#include "bigram.h" #define UTF8_assert(s) assert(checkUTF8(s)) @@ -1822,7 +1823,7 @@ STRasciify(str *r, const char *const *s) } static inline void -BBPnreclaim(int nargs, ...) +BBPreclaim_n(int nargs, ...) { va_list valist; va_start(valist, nargs); @@ -1833,446 +1834,135 @@ BBPnreclaim(int nargs, ...) va_end(valist); } -#define HANDLE_TIMEOUT(qc) \ - do { \ - TIMEOUT_ERROR(qc, __FILE__, __func__, __LINE__); \ - msg = createException(MAL, fname, GDK_EXCEPTION); \ - } while (0) +#define VALUE(s, x) (s##_vars + VarHeapVal(s##_vals, (x), s##i->width)) +#define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o)) -#define scanloop(TEST, canditer_next) \ +#define SCAN_LOOP(STR_CMP) \ do { \ - const oid off = b->hseqbase; \ - TIMEOUT_LOOP(ci.ncand, qry_ctx) { \ - oid o = canditer_next(&ci); \ - const char *restrict v = BUNtvar(bi, o - off); \ - assert(rcnt < BATcapacity(bn)); \ - if (TEST) \ - vals[rcnt++] = o; \ + TIMEOUT_LOOP(lci->ncand, qry_ctx) { \ + oid lo = canditer_next(lci); \ + const char *ls = VALUE(l, lo - l_base); \ + if (!strNil(ls) && (STR_CMP)) \ + APPEND(rl, lo); \ } \ } while (0) static str -STRselect(MalStkPtr stk, InstrPtr pci, - int (*str_icmp)(const char *, const char *, int), - int (*str_cmp)(const char *, const char *, int), - const char *fname) +scan_loop_strselect(BAT *rl, BATiter *li, struct canditer *lci, const char *r, + int (*str_cmp)(const char *, const char *, int), + bool anti, const char *fname, QryCtx *qry_ctx) { - str msg = MAL_SUCCEED; + oid l_base = li->b->hseqbase; + const char *l_vars = li->vh->base, *l_vals = li->base; + int r_len = str_strlen(r); - bat *r_id = getArgReference_bat(stk, pci, 0); - bat b_id = *getArgReference_bat(stk, pci, 1); - bat cb_id = *getArgReference_bat(stk, pci, 2); - const char *key = *getArgReference_str(stk, pci, 3); - bit icase = pci->argc != 5; - bit anti = pci->argc == 5 ? 
*getArgReference_bit(stk, pci, 4) : - *getArgReference_bit(stk, pci, 5); - - BAT *b, *cb = NULL, *bn = NULL, *old_s = NULL;; - BUN rcnt = 0; - struct canditer ci; - bool with_strimps = false, - with_strimps_anti = false; - - if (!(b = BATdescriptor(b_id))) - throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + lng t0 = 0; + TRC_DEBUG_IF(ALGO) t0 = GDKusec(); - if (!is_bat_nil(cb_id) && !(cb = BATdescriptor(cb_id))) { - BBPreclaim(b); - throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - } - - assert(ATOMstorage(b->ttype) == TYPE_str); + if (anti) + SCAN_LOOP(str_cmp(ls, r, r_len) != 0); + else + SCAN_LOOP(str_cmp(ls, r, r_len) == 0); - if (BAThasstrimps(b)) { - BAT *tmp_s; - if (STRMPcreate(b, NULL) == GDK_SUCCEED && (tmp_s = STRMPfilter(b, cb, key, anti)) != NULL) { - old_s = cb; - cb = tmp_s; - if (!anti) - with_strimps = true; - else - with_strimps_anti = true; - } else { - /* strimps failed, continue without */ - GDKclrerr(); - } - } - - MT_thread_setalgorithm(with_strimps ? - "string_select: strcmp function using strimps" : - (with_strimps_anti ? 
- "string_select: strcmp function using strimps anti" - : "string_select: strcmp function with no accelerator")); - - canditer_init(&ci, b, cb); - if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) { - BBPnreclaim(2, b, cb); - throw(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL); + BATsetcount(rl, BATcount(rl)); + if (BATcount(rl) > 0) { + BATnegateprops(rl); + rl->tnonil = true; + rl->tnil = false; } - if (!strNil(key)) { - BATiter bi = bat_iterator(b); - QryCtx *qry_ctx = MT_thread_get_qry_ctx(); - if (icase) - str_cmp = str_icmp; - oid *vals = Tloc(bn, 0); - const int klen = str_strlen(key); - if (ci.tpe == cand_dense) { - if (with_strimps_anti) - scanloop(strNil(v) || str_cmp(v, key, klen) == 0, canditer_next_dense); - else if (anti) - scanloop(!strNil(v) && str_cmp(v, key, klen) != 0, canditer_next_dense); - else - scanloop(!strNil(v) && str_cmp(v, key, klen) == 0, canditer_next_dense); - } else { - if (with_strimps_anti) - scanloop(strNil(v) || str_cmp(v, key, klen) == 0, canditer_next); - else if (anti) - scanloop(!strNil(v) && str_cmp(v, key, klen) != 0, canditer_next); - else - scanloop(!strNil(v) && str_cmp(v, key, klen) == 0, canditer_next); - } - bat_iterator_end(&bi); - TIMEOUT_CHECK(qry_ctx, HANDLE_TIMEOUT(qry_ctx)); + TRC_DEBUG(ALGO, "(%s, %s, l=%s #%zu [%s], cl=%s #%zu, time="LLFMT"usecs)\n", + fname, "scan_loop_strselect", + BATgetId(li->b), li->count, ATOMname(li->b->ttype), + lci ? BATgetId(lci->s) : "NULL", lci ? lci->ncand : 0, + GDKusec() - t0); + + return MAL_SUCCEED; +} - if (!msg) { - BATsetcount(bn, rcnt); - bn->tsorted = true; - bn->trevsorted = bn->batCount <= 1; - bn->tkey = true; - bn->tnil = false; - bn->tnonil = true; - bn->tseqbase = rcnt == 0 ? - 0 : rcnt == 1 ? - *(const oid *) Tloc(bn, 0) : rcnt == ci.ncand && ci.tpe == cand_dense ? 
ci.seq : oid_nil; +static str +STRselect(MalStkPtr stk, InstrPtr pci, const str fname, + int (*str_cmp)(const char *, const char *, int)) +{ + str msg = MAL_SUCCEED; + QryCtx *qry_ctx = MT_thread_get_qry_ctx(); + BAT *l = NULL, *cl = NULL, *rl = NULL; - if (with_strimps_anti) { - BAT *rev; - if (old_s) { - rev = BATdiffcand(old_s, bn); -#ifndef NDEBUG - BAT *is = BATintersectcand(old_s, bn); - if (is) { - assert(is->batCount == bn->batCount); - BBPreclaim(is); - } - assert(rev->batCount == old_s->batCount - bn->batCount); -#endif - } else - rev = BATnegcands(0, b->batCount, bn); + bat *RL = getArgReference_bat(stk, pci, 0); + bat *L = getArgReference_bat(stk, pci, 1); + bat *CL = getArgReference_bat(stk, pci, 2); + const char *r = *getArgReference_str(stk, pci, 3); + bool icase = pci->argc != 5; + bool anti = pci->argc == 5 ? *getArgReference_bit(stk, pci, 4) : + *getArgReference_bit(stk, pci, 5); - BBPreclaim(bn); - bn = rev; - if (bn == NULL) - msg = createException(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL); - } - } + if (!(l = BATdescriptor(*L))) + throw(MAL, fname, RUNTIME_OBJECT_MISSING); + + if (CL && !is_bat_nil(*CL) && !(cl = BATdescriptor(*CL))) { + BBPreclaim(l); + throw(MAL, fname, RUNTIME_OBJECT_MISSING); } - if (bn && !msg) { - *r_id = bn->batCacheid; - BBPkeepref(bn); - } else { - BBPreclaim(bn); + BATiter li = bat_iterator(l); + struct canditer lci; + canditer_init(&lci, l, cl); + size_t l_cnt = lci.ncand; + + rl = COLnew(0, TYPE_oid, l_cnt, TRANSIENT); + if (!rl) { + BBPreclaim_n(2, l, cl); + throw(MAL, fname, MAL_MALLOC_FAIL); } - BBPnreclaim(3, b, cb, old_s); + if (icase) { + if (str_cmp == str_is_prefix) + str_cmp = str_is_iprefix; + else if (str_cmp == str_is_suffix) _______________________________________________ checkin-list mailing list -- [email protected] To unsubscribe send an email to [email protected]
