Changeset: fb03fc0d6ec6 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/fb03fc0d6ec6
Modified Files:
monetdb5/modules/mal/ngrams.c
sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:
ngrams sw, ew and contains working
diffs (truncated from 602 to 300 lines):
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -44,6 +44,28 @@
#define NGRAM_MULTIPLE 16
+/*
+ * Set the properties of BAT pointer B to the values this file uses for an
+ * empty result: no nils, key, sorted in both directions, sequence base 0.
+ * B is evaluated several times, so pass an expression without side effects.
+ */
+#define SET_EMPTY_BAT_PROPS(B) \
+	do { \
+		(B)->tnil = false; \
+		(B)->tnonil = true; \
+		(B)->tkey = true; \
+		(B)->tsorted = true; \
+		(B)->trevsorted = true; \
+		(B)->tseqbase = 0; \
+	} while (0)
+
+/*
+ * Call BBPreclaim() on each of the nargs BAT pointers passed as variadic
+ * arguments, in the order given. Nothing is done when nargs <= 0.
+ */
+static inline void
+BBPreclaim_n(int nargs, ...)
+{
+	va_list ap;
+
+	va_start(ap, nargs);
+	for (int left = nargs; left > 0; left--)
+		BBPreclaim(va_arg(ap, BAT *));
+	va_end(ap);
+}
+
typedef struct {
NGRAM_TYPE *idx;
NGRAM_TYPE *sigs;
@@ -53,51 +75,21 @@ typedef struct {
unsigned int *rid;
} Ngrams;
-static str
-NGcontains(Client c, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+static void
+ngrams_destroy(Ngrams *ng) /* Free the h, idx, pos, rid and sigs arrays of ng, then ng itself. */
{
- (void) c;
- (void) mb;
-
- bit *r = getArgReference_bit(stk, pci, 0);
- const char *s1 = *getArgReference_str(stk, pci, 1);
- const char *s2 = *getArgReference_str(stk, pci, 2);
-
- if (strNil(s1) || strNil(s2)) {
- *r = bit_nil;
- } else {
- int s2_len = str_strlen(s2);
- *r = str_contains(s1, s2, s2_len) == 0;
+ if (ng) { /* members are only touched when ng is non-NULL */
+ GDKfree(ng->h);
+ GDKfree(ng->idx);
+ GDKfree(ng->pos);
+ GDKfree(ng->rid);
+ GDKfree(ng->sigs);
}
- return MAL_SUCCEED;
-}
-
-static str
-NGselect(bat *R, bat *H, bat *C, str *Needle, bit *anti)
-{
- (void)R;
- (void)H;
- (void)C;
- (void)Needle;
- (void)anti;
- return MAL_SUCCEED;
-}
-
-static void
-ngrams_destroy(Ngrams *n)
-{
- if (n) {
- GDKfree(n->h);
- GDKfree(n->idx);
- GDKfree(n->pos);
- GDKfree(n->rid);
- GDKfree(n->sigs);
- }
- GDKfree(n);
+ GDKfree(ng); /* also reached with ng == NULL; presumably GDKfree(NULL) is a no-op - confirm */
}
static Ngrams *
-ngrams_create(BAT *b, size_t ngramsize)
+ngrams_create_old(BAT *b, size_t ngramsize)
{
Ngrams *n = NULL;
size_t sz = BATcount(b);
@@ -232,7 +224,7 @@ NGc1join_intern(bat *L, bat *R, bat *H,
printf("todo fall back to select \n");
}
- Ngrams *ngi = ngrams_create(h, UNIGRAM_SZ);
+ Ngrams *ngi = ngrams_create_old(h, UNIGRAM_SZ);
if (ngi && ngrams_init_1gram(ngi, h) == 0) { /* TODO add locks and only
create ngram once for full (parent bat) */
BUN cnt = BATcount(h);
/* create L/R */
@@ -338,6 +330,7 @@ NGc1join(bat *L, bat *R, bat *sigs, bat
{
return NGc1join_intern(L, R, sigs, needle, lc, rc, nil_matches,
estimate, anti);
}
+
static int
ngrams_init_2gram(Ngrams *n, BAT *b)
{
@@ -453,7 +446,7 @@ NGc2join_intern(bat *L, bat *R, bat *H,
if (BATcount(n) < 10) {
}
- Ngrams *ngi = ngrams_create(h, BIGRAM_SZ);
+ Ngrams *ngi = ngrams_create_old(h, BIGRAM_SZ);
if (ngi && ngrams_init_2gram(ngi, h) == 0) {
BUN cnt = BATcount(h);
/* create L/R */
@@ -673,7 +666,7 @@ NGc3join_intern(bat *L, bat *R, bat *H,
if (BATcount(n) < 10) {
}
- Ngrams *ngi = ngrams_create(h, TRIGRAM_SZ);
+ Ngrams *ngi = ngrams_create_old(h, TRIGRAM_SZ);
if (ngi && ngrams_init_3gram(ngi, h) == 0) { /* TODO add locks and only
create ngram once for full (parent bat) */
BUN cnt = BATcount(h);
/* create L/R */
@@ -774,6 +767,396 @@ NGc3join(bat *L, bat *R, bat *sigs, bat
return NGc3join_intern(L, R, sigs, needle, lc, rc, nil_matches,
estimate, anti);
}
+/*
+ * Allocate an Ngrams structure together with its five arrays:
+ *   idx, h, pos : ng_sz entries each (pos is zero-filled),
+ *   sigs        : b_sz entries,
+ *   rid         : NGRAM_MULTIPLE * b_sz entries.
+ * Returns the new structure, or NULL when any allocation failed; in that
+ * case everything obtained so far is released through ngrams_destroy().
+ */
+static Ngrams *
+ngrams_create(size_t b_sz, size_t ng_sz)
+{
+	Ngrams *ng = GDKmalloc(sizeof(Ngrams));
+	bool complete = false;
+
+	if (ng != NULL) {
+		/* assign every pointer member so ngrams_destroy() never sees garbage */
+		ng->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
+		ng->sigs = GDKmalloc(b_sz * sizeof(NGRAM_TYPE));
+		ng->h = GDKmalloc(ng_sz * sizeof(unsigned int));
+		ng->pos = GDKzalloc(ng_sz * sizeof(unsigned int));
+		ng->rid = GDKmalloc(NGRAM_MULTIPLE * b_sz * sizeof(unsigned int));
+		complete = ng->h != NULL && ng->idx != NULL && ng->pos != NULL &&
+			ng->rid != NULL && ng->sigs != NULL;
+	}
+	if (complete)
+		return ng;
+
+	ngrams_destroy(ng);
+	return NULL;
+}
+
+/*
+ * Fetch the single bte value stored in the BAT identified by *NG and store
+ * it in *ngram.
+ *
+ * fname is the function name used in the MAL exceptions raised here.
+ * Returns MAL_SUCCEED on success. Throws when the BAT descriptor cannot be
+ * obtained, when the BAT is empty, or when it holds more than one value;
+ * *ngram is left untouched in all error cases. The tail type of the BAT is
+ * not checked here: the first value is simply read as a bte.
+ */
+static str
+ngram_choice(const bat *NG, bte *ngram, const char *fname)
+{
+	BAT *ng = BATdescriptor(*NG);
+	if (ng == NULL)
+		throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
+
+	BATiter bi = bat_iterator(ng);
+	/* Take what we need before ending the iterator: bat_iterator_end() may
+	 * reset the iterator struct, so bi.count must not be inspected after
+	 * it (otherwise every wrong-sized BAT would look empty). */
+	BUN cnt = bi.count;
+	bte val = 0;
+	if (cnt == 1)
+		val = *(bte *) BUNtloc(bi, 0);
+	bat_iterator_end(&bi);
+	BBPreclaim(ng);
+
+	if (cnt < 1)
+		throw(MAL, fname, SQLSTATE(42000) "Empty bat\n");
+	if (cnt > 1)
+		throw(MAL, fname, SQLSTATE(42000) "Single value bat expected\n");
+
+	*ngram = val;
+	return MAL_SUCCEED;
+}
+
+/*
+ * Fill ng (idx, h, pos, rid, sigs, max, min) from the b_cnt strings read
+ * through bi.
+ *
+ * Steps:
+ *  1. count, per CHAR_MAP() bucket, how often it occurs in the non-nil,
+ *     non-empty strings;
+ *  2. sort the counts (hist[0] is used as the maximum below) and find the
+ *     threshold ng->min such that the buckets with a count <= ng->min
+ *     together stay below NGRAM_MULTIPLE * b_cnt - 1 row ids;
+ *  3. give every occurring bucket one signature bit (round robin over
+ *     NGRAM_BITS);
+ *  4. compute a signature per row in ng->sigs (NGRAM_TYPENIL for nil or
+ *     empty strings) and, for buckets at or below the threshold, record
+ *     the row numbers in ng->rid starting at ng->pos[bucket].
+ *
+ * Returns 0 on success, -1 when a temporary allocation fails (ng is not
+ * released here).
+ */
+static int
+init_unigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+{
+	NGRAM_TYPE *h = GDKzalloc(UNIGRAM_SZ * sizeof(NGRAM_TYPE)),
+		*hist = h, sum = 0;
+	NGRAM_TYPE *idx = ng->idx;
+	int *id = GDKmalloc(UNIGRAM_SZ * sizeof(int)), i;
+
+	if (!h || !id) {
+		GDKfree(h);
+		GDKfree(id);
+		return -1;
+	}
+
+	/* pass 1: histogram over all characters of all usable strings */
+	for (size_t j = 0; j < b_cnt; j++) {
+		const char *s = BUNtail(*bi, j);
+		if (!strNil(s) && *s) {
+			for (; *s; s++) {
+				h[CHAR_MAP(*s)]++;
+			}
+		}
+	}
+
+	int bc = 0;
+
+	/* keep the unsorted counts in ng->h; the sort below reorders h */
+	for (size_t j = 0; j < UNIGRAM_SZ; j++) {
+		id[j] = j;
+		idx[j] = 0;
+		ng->h[j] = (unsigned int)hist[j];
+	}
+	/* sort the counts, carrying the bucket numbers in id along */
+	GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), NGRAM_TYPEID, true, false);
+
+	/* walk from the tail of the sorted counts while they fit the budget */
+	for (i = UNIGRAM_SZ - 1; i >= 0; i--) {
+		if ((sum + hist[i]) >= (NGRAM_MULTIPLE * b_cnt) - 1)
+			break;
+		sum += hist[i];
+	}
+
+	NGRAM_TYPE max = hist[0], small;
+	if (i < 0) {
+		/* every bucket fits (e.g. empty or short input): do not read
+		 * hist[-1]; all buckets qualify, so the threshold is the maximum */
+		small = max;
+	} else {
+		/* skip the run of counts equal to the first one that did not fit,
+		 * without walking past the end of hist */
+		NGRAM_TYPE larger_cnt = hist[i];
+		for (; i < UNIGRAM_SZ && hist[i] == larger_cnt; i++)
+			;
+		/* nothing left after the run: no bucket qualifies */
+		small = i < UNIGRAM_SZ ? hist[i] : 0;
+	}
+	ng->max = max;
+	ng->min = small;
+
+	/* one signature bit per occurring bucket, reusing bits round robin */
+	for (size_t j = 0; j < UNIGRAM_SZ && hist[j] > 0; j++) {
+		unsigned int x = id[j];
+		idx[x] = NGRAM_CST(1) << bc;
+		assert(idx[x] > 0);
+		bc++;
+		bc %= NGRAM_BITS;
+	}
+
+	/* pass 2: row signatures and row-id lists for the selected buckets */
+	NGRAM_TYPE *sp = ng->sigs;
+	unsigned int pos = 1;	/* 0 in ng->pos means "no slot assigned yet" */
+	for (size_t j = 0; j < b_cnt; j++) {
+		const char *s = BUNtail(*bi, j);
+		NGRAM_TYPE sig = 0;
+		if (!strNil(s) && s[0]) {
+			for (; *s; s++) {
+				int k = CHAR_MAP(*s);
+				sig |= idx[k];
+				if (ng->h[k] <= ng->min) {
+					if (ng->pos[k] == 0) {
+						/* first hit: reserve h[k] slots, then reuse
+						 * h[k] as the fill counter */
+						ng->pos[k] = pos;
+						pos += ng->h[k];
+						ng->h[k] = 0;
+					}
+					/* deduplicate */
+					int done = (ng->h[k] > 0 && ng->rid[ng->pos[k] + ng->h[k]-1] == j);
+					if (!done) {
+						ng->rid[ng->pos[k] + ng->h[k]] = j;
+						ng->h[k]++;
+					}
+				}
+			}
+			*sp = sig;
+		} else {
+			*sp = NGRAM_TYPENIL;
+		}
+		sp++;
+	}
+
+	GDKfree(h);
+	GDKfree(id);
+	return 0;
+}
+
+static str
+join_unigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
+ size_t l_cnt, size_t r_cnt,
+ int (*str_cmp)(const char *, const char *, int))
+{
+ Ngrams *ng = ngrams_create(l_cnt, UNIGRAM_SZ);
+
+ if (!ng)
+ throw(MAL, "join_unigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+ if (init_unigram_idx(ng, li, l_cnt) != 0)
+ throw(MAL, "join_unigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+ NGRAM_TYPE nmax = 0;
+ oid *ol = Tloc(rl, 0), *el = ol + 10 * l_cnt;
+ oid *or = Tloc(rr, 0);
+
+ /* if needed grow */
+ for(size_t i = 0; i < r_cnt; i++) {
+ const char *s = BUNtail(*ri, i), *os = s;
+ NGRAM_TYPE sig = 0;
+
+ if ((ol+1000) > el)
+ break;
+ if (!strNil(s) && s[0]) {
+ NGRAM_TYPE min = ng->max;
+ unsigned int min_pos = 0;
+ for(; *s; s++) {
+ unsigned int k = CHAR_MAP(*s);
+ sig |= ng->idx[k];
+ if (ng->h[k] < min) {
+ min = ng->h[k];
+ min_pos = k; /* encoded min ngram */
+ }
+ }
+ if (min <= ng->min) {
+ unsigned int rr = ng->pos[min_pos];
+ int hcnt = ng->h[min_pos];
+ for(int k = 0; k<hcnt; k++, rr++) {
+ unsigned int hr = ng->rid[rr];
+ if (((ng->sigs[hr] & sig) == sig)) {
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]