Changeset: fb03fc0d6ec6 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/fb03fc0d6ec6
Modified Files:
        monetdb5/modules/mal/ngrams.c
        sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:

ngrams sw, ew and contains working


diffs (truncated from 602 to 300 lines):

diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -44,6 +44,28 @@
 
 #define NGRAM_MULTIPLE 16
 
+/* Set the tail properties of BAT B: tnil = false, tnonil = true, tkey,
+ * tsorted and trevsorted = true, tseqbase = 0.  Going by the macro name
+ * this is meant for empty (result) BATs.  The argument is parenthesized so
+ * that any pointer expression can be passed. */
+#define SET_EMPTY_BAT_PROPS(B)                                 \
+       do {                                                    \
+               (B)->tnil = false;                              \
+               (B)->tnonil = true;                             \
+               (B)->tkey = true;                               \
+               (B)->tsorted = true;                            \
+               (B)->trevsorted = true;                         \
+               (B)->tseqbase = 0;                              \
+       } while (0)
+
+/* Call BBPreclaim() on each of the nargs BAT pointers passed as variadic
+ * arguments, in the order in which they were given. */
+static inline void
+BBPreclaim_n(int nargs, ...)
+{
+       va_list ap;
+
+       va_start(ap, nargs);
+       for (int left = nargs; left > 0; left--)
+               BBPreclaim(va_arg(ap, BAT *));
+       va_end(ap);
+}
+
 typedef struct {
        NGRAM_TYPE *idx;
        NGRAM_TYPE *sigs;
@@ -53,51 +75,21 @@ typedef struct {
        unsigned int *rid;
 } Ngrams;
 
-static str
-NGcontains(Client c, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+static void
+ngrams_destroy(Ngrams *ng) /* release the arrays owned by ng, then ng itself */
 {
-       (void) c;
-       (void) mb;
-
-       bit *r = getArgReference_bit(stk, pci, 0);
-       const char *s1 = *getArgReference_str(stk, pci, 1);
-       const char *s2 = *getArgReference_str(stk, pci, 2);
-
-       if (strNil(s1) || strNil(s2)) {
-               *r = bit_nil;
-       } else {
-               int s2_len = str_strlen(s2);
-               *r = str_contains(s1, s2, s2_len) == 0;
+       if (ng) { /* the member arrays are only touched for a non-NULL ng */
+               GDKfree(ng->h);
+               GDKfree(ng->idx);
+               GDKfree(ng->pos);
+               GDKfree(ng->rid);
+               GDKfree(ng->sigs);
        }
-       return MAL_SUCCEED;
-}
-
-static str
-NGselect(bat *R, bat *H, bat *C, str *Needle, bit *anti)
-{
-       (void)R;
-       (void)H;
-       (void)C;
-       (void)Needle;
-       (void)anti;
-       return MAL_SUCCEED;
-}
-
-static void
-ngrams_destroy(Ngrams *n)
-{
-       if (n) {
-               GDKfree(n->h);
-               GDKfree(n->idx);
-               GDKfree(n->pos);
-               GDKfree(n->rid);
-               GDKfree(n->sigs);
-       }
-       GDKfree(n);
+       GDKfree(ng); /* also reached when ng is NULL; assumes GDKfree(NULL) is a no-op - TODO confirm */
 }
 
 static Ngrams *
-ngrams_create(BAT *b, size_t ngramsize)
+ngrams_create_old(BAT *b, size_t ngramsize)
 {
        Ngrams *n = NULL;
        size_t sz = BATcount(b);
@@ -232,7 +224,7 @@ NGc1join_intern(bat *L, bat *R, bat *H, 
                printf("todo fall back to select \n");
        }
 
-       Ngrams *ngi = ngrams_create(h, UNIGRAM_SZ);
+       Ngrams *ngi = ngrams_create_old(h, UNIGRAM_SZ);
        if (ngi && ngrams_init_1gram(ngi, h) == 0) { /* TODO add locks and only 
create ngram once for full (parent bat) */
                BUN cnt = BATcount(h);
                /* create L/R */
@@ -338,6 +330,7 @@ NGc1join(bat *L, bat *R, bat *sigs, bat 
 {
        return NGc1join_intern(L, R, sigs, needle, lc, rc, nil_matches, 
estimate, anti);
 }
+
 static int
 ngrams_init_2gram(Ngrams *n, BAT *b)
 {
@@ -453,7 +446,7 @@ NGc2join_intern(bat *L, bat *R, bat *H, 
        if (BATcount(n) < 10) {
        }
 
-       Ngrams *ngi = ngrams_create(h, BIGRAM_SZ);
+       Ngrams *ngi = ngrams_create_old(h, BIGRAM_SZ);
        if (ngi && ngrams_init_2gram(ngi, h) == 0) {
                BUN cnt = BATcount(h);
                /* create L/R */
@@ -673,7 +666,7 @@ NGc3join_intern(bat *L, bat *R, bat *H, 
        if (BATcount(n) < 10) {
        }
 
-       Ngrams *ngi = ngrams_create(h, TRIGRAM_SZ);
+       Ngrams *ngi = ngrams_create_old(h, TRIGRAM_SZ);
        if (ngi && ngrams_init_3gram(ngi, h) == 0) { /* TODO add locks and only 
create ngram once for full (parent bat) */
                BUN cnt = BATcount(h);
                /* create L/R */
@@ -774,6 +767,396 @@ NGc3join(bat *L, bat *R, bat *sigs, bat 
        return NGc3join_intern(L, R, sigs, needle, lc, rc, nil_matches, 
estimate, anti);
 }
 
+/* Allocate an Ngrams structure together with its arrays: idx, h and pos
+ * get ng_sz entries each (pos is zero-filled), sigs gets b_sz entries and
+ * rid gets NGRAM_MULTIPLE * b_sz entries.
+ * Returns the new structure, or NULL when any allocation failed; in that
+ * case everything that was allocated is handed to ngrams_destroy(). */
+static Ngrams *
+ngrams_create(size_t b_sz, size_t ng_sz)
+{
+       Ngrams *res = GDKmalloc(sizeof(Ngrams));
+
+       if (res != NULL) {
+               /* all five arrays are requested before any result is
+                * checked, so every member holds either memory or NULL */
+               res->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
+               res->sigs = GDKmalloc(b_sz * sizeof(NGRAM_TYPE));
+               res->h = GDKmalloc(ng_sz * sizeof(unsigned int));
+               res->pos = GDKzalloc(ng_sz * sizeof(unsigned int));
+               res->rid = GDKmalloc(NGRAM_MULTIPLE * b_sz * sizeof(unsigned int));
+
+               if (res->h != NULL && res->idx != NULL && res->pos != NULL &&
+                   res->rid != NULL && res->sigs != NULL)
+                       return res;
+       }
+       /* either the structure or one of its arrays could not be allocated */
+       ngrams_destroy(res);
+       return NULL;
+}
+
+/* Read the single bte value held by the BAT with id *NG into *ngram.
+ *
+ * NG     id of the BAT to read; it has to contain exactly one row
+ * ngram  receives the value of row 0 on success, untouched on failure
+ * fname  function name reported in the error messages
+ *
+ * Returns MAL_SUCCEED, or an error when the BAT descriptor cannot be
+ * obtained, when the BAT is empty, or when it holds more than one row.
+ * The descriptor and the iterator are released on every path. */
+static str
+ngram_choice(const bat *NG, bte *ngram, const char *fname)
+{
+       BAT *ng = BATdescriptor(*NG);
+
+       if (ng == NULL)
+               throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
+
+       BATiter bi = bat_iterator(ng);
+       /* Copy out everything that is needed while the iterator is still
+        * live: bi must not be consulted after bat_iterator_end(), otherwise
+        * the row count used to pick the error message is unreliable. */
+       const BUN cnt = bi.count;
+       bte val = 0;
+       if (cnt == 1)
+               val = *(bte *) BUNtloc(bi, 0);
+       bat_iterator_end(&bi);
+       BBPreclaim(ng);
+
+       if (cnt < 1)
+               throw(MAL, fname, SQLSTATE(42000) "Empty bat\n");
+       if (cnt > 1)
+               throw(MAL, fname, SQLSTATE(42000) "Single value bat expected\n");
+
+       *ngram = val;
+       return MAL_SUCCEED;
+}
+
+/*
+ * Build the unigram index for the strings in rows 0 .. b_cnt-1 of bi and
+ * store it in ng.
+ *
+ * Steps, as implemented below:
+ *  1. count, per CHAR_MAP() slot, how often it occurs in the non-nil,
+ *     non-empty strings and copy those counts into ng->h;
+ *  2. sort the counts together with their slot ids (GDKqsort with the
+ *     reverse flag set; the code treats hist[0] as the largest count);
+ *  3. choose ng->min, the count threshold up to which row ids are
+ *     recorded, such that the recorded ids stay within the budget of
+ *     NGRAM_MULTIPLE * b_cnt entries; ng->max is the largest count;
+ *  4. hand out signature bits, cycling over NGRAM_BITS bits, to every
+ *     slot with a non-zero count;
+ *  5. per row, store the signature in ng->sigs (NGRAM_TYPENIL for nil or
+ *     empty strings) and append the row id to the ng->rid range of each
+ *     slot whose count is <= ng->min.
+ *
+ * Returns 0 on success and -1 when the temporary arrays cannot be
+ * allocated.
+ */
+static int
+init_unigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+{
+       NGRAM_TYPE *h = GDKzalloc(UNIGRAM_SZ * sizeof(NGRAM_TYPE)),
+               *hist = h, sum = 0;
+       NGRAM_TYPE *idx = ng->idx;
+       int *id = GDKmalloc(UNIGRAM_SZ*sizeof(int)), i;
+
+       if (!h || !id) {
+               GDKfree(h);
+               GDKfree(id);
+               return -1;
+       }
+
+       /* 1. histogram of the mapped characters */
+       for(size_t j = 0; j < b_cnt; j++) {
+               const char *s = BUNtail(*bi, j);
+               if (!strNil(s) && *s) {
+                       for(; *s; s++) {
+                               h[CHAR_MAP(*s)]++;
+                       }
+               }
+       }
+
+       int bc = 0;
+
+       for(size_t j = 0; j < UNIGRAM_SZ; j++) {
+               id[j] = j;
+               idx[j] = 0;
+               ng->h[j] = (unsigned int)hist[j];
+       }
+       /* 2. sort the counts, carrying the slot ids along in id[] */
+       GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int),
+                        NGRAM_TYPEID, true, false);
+
+       /* 3. walk up from the smallest count and stop at the first count
+        * that would no longer fit into the row id budget */
+       for(i=UNIGRAM_SZ-1; i>=0; i--) {
+               if ((sum + hist[i]) >= (NGRAM_MULTIPLE*b_cnt)-1)
+                       break;
+               sum += hist[i];
+       }
+       NGRAM_TYPE max = hist[0], small;
+       if (i < 0) {
+               /* every count fits (always the case for small inputs): all
+                * slots are recorded; hist[-1] must not be read */
+               small = max;
+       } else {
+               /* skip all entries sharing the count that did not fit, but
+                * never step beyond the end of hist[] */
+               NGRAM_TYPE larger_cnt = hist[i];
+               while (i < (int) UNIGRAM_SZ && hist[i] == larger_cnt)
+                       i++;
+               /* when not even the smallest count fits, threshold 0 keeps
+                * every occurring slot out of rid */
+               small = i < (int) UNIGRAM_SZ ? hist[i] : 0;
+       }
+       ng->max = max;
+       ng->min = small;
+
+       /* 4. signature bits for all slots that occur at least once */
+       for(size_t j = 0; j < UNIGRAM_SZ && hist[j] > 0; j++) {
+               unsigned int x = id[j];
+               idx[x] = NGRAM_CST(1) << bc;
+               assert(idx[x] > 0);
+               bc++;
+               bc %= NGRAM_BITS;
+       }
+
+       /* 5. signatures and row id lists; pos starts at 1 because
+        * ng->pos[k] == 0 means that slot k has no range yet */
+       NGRAM_TYPE *sp = ng->sigs;
+       unsigned int pos = 1;
+       for(size_t j = 0; j < b_cnt; j++) {
+               const char *s = BUNtail(*bi, j);
+               NGRAM_TYPE sig = 0;
+               if (!strNil(s) && s[0]) {
+                       for(; *s; s++) {
+                               int k = CHAR_MAP(*s);
+                               sig |= idx[k];
+                               if (ng->h[k] <= ng->min) {
+                                       if (ng->pos[k] == 0) {
+                                               /* first use of slot k: reserve
+                                                * room for its full count and
+                                                * reuse ng->h[k] as the fill
+                                                * counter of that range */
+                                               ng->pos[k] = pos;
+                                               pos += ng->h[k];
+                                               ng->h[k] = 0;
+                                       }
+                                       /* deduplicate */
+                                       int done =  (ng->h[k] > 0 && ng->rid[ng->pos[k] + ng->h[k]-1] == j);
+                                       if (!done) {
+                                               ng->rid[ng->pos[k] + ng->h[k]] = j;
+                                               ng->h[k]++;
+                                       }
+                               }
+                       }
+                       *sp = sig;
+               } else {
+                       *sp = NGRAM_TYPENIL;
+               }
+               sp++;
+       }
+
+       GDKfree(h);
+       GDKfree(id);
+       return 0;
+}
+
+static str
+join_unigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
+                        size_t l_cnt, size_t r_cnt,
+                        int (*str_cmp)(const char *, const char *, int))
+{
+       Ngrams *ng = ngrams_create(l_cnt, UNIGRAM_SZ);
+
+       if (!ng)
+               throw(MAL, "join_unigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+       if (init_unigram_idx(ng, li, l_cnt) != 0)
+               throw(MAL, "join_unigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+       NGRAM_TYPE nmax = 0;
+       oid *ol = Tloc(rl, 0), *el = ol + 10 * l_cnt;
+       oid *or = Tloc(rr, 0);
+
+       /* if needed grow */
+       for(size_t i = 0; i < r_cnt; i++) {
+               const char *s = BUNtail(*ri, i), *os = s;
+               NGRAM_TYPE sig = 0;
+
+               if ((ol+1000) > el)
+                       break;
+               if (!strNil(s) && s[0]) {
+                       NGRAM_TYPE min = ng->max;
+                       unsigned int min_pos = 0;
+                       for(; *s; s++) {
+                               unsigned int k = CHAR_MAP(*s);
+                               sig |= ng->idx[k];
+                               if (ng->h[k] < min) {
+                                       min = ng->h[k];
+                                       min_pos = k; /* encoded min ngram */
+                               }
+                       }
+                       if (min <= ng->min) {
+                               unsigned int rr = ng->pos[min_pos];
+                               int hcnt = ng->h[min_pos];
+                               for(int k = 0; k<hcnt; k++, rr++) {
+                                       unsigned int hr = ng->rid[rr];
+                                       if (((ng->sigs[hr] & sig) == sig)) {
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to