Changeset: e9a266d358b6 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/e9a266d358b6
Modified Files:
        monetdb5/modules/mal/ngrams.c
        monetdb5/modules/mal/ngrams.h
Branch: strimps_v3
Log Message:

Joins and selects with candidates for uni-, bi- and trigrams


diffs (truncated from 1471 to 300 lines):

diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -18,19 +18,19 @@
 #include "str.h"
 
 static inline int
-is_prefix(const char *s1, const char *s2, int s2_len)
+ng_prefix(const char *s1, const char *s2, int s2_len)
 {
        return strncmp(s1, s2, s2_len);
 }
 
 static inline int
-is_suffix(const char *s1, const char *s2, int s2_len)
+ng_suffix(const char *s1, const char *s2, int s2_len)
 {
        return strcmp(s1 + strlen(s1) - s2_len, s2);
 }
 
 static inline int
-is_contains(const char *s1, const char *s2, int s2_len)
+ng_contains(const char *s1, const char *s2, int s2_len)
 {
        (void) s2_len;
        return strstr(s1, s2) == NULL;
@@ -62,15 +62,15 @@ ngrams_destroy(Ngrams *ng)
 }
 
 static Ngrams *
-ngrams_create(size_t b_cnt, size_t ng_sz)
+ngrams_create(size_t cnt, size_t ng_sz)
 {
        Ngrams *ng = GDKmalloc(sizeof(Ngrams));
        if (ng) {
                ng->idx  = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
-               ng->sigs = GDKmalloc(b_cnt * sizeof(NGRAM_TYPE));
+               ng->sigs = GDKmalloc(cnt * sizeof(NGRAM_TYPE));
                ng->histogram = GDKmalloc(ng_sz * sizeof(unsigned));
                ng->lists  = GDKmalloc(ng_sz * sizeof(unsigned));
-               ng->rids  = GDKmalloc(2 * NGRAM_MULTIPLE * b_cnt * 
sizeof(unsigned));
+               ng->rids  = GDKmalloc(2 * NGRAM_MULTIPLE * cnt * 
sizeof(unsigned));
        }
        if (!ng || !ng->idx || !ng->sigs || !ng->histogram || !ng->lists || 
!ng->rids) {
                ngrams_destroy(ng);
@@ -80,29 +80,7 @@ ngrams_create(size_t b_cnt, size_t ng_sz
 }
 
 static str
-ngram_choice(const bat *NG, bte *ngram, const char *fname)
-{
-       BAT *ng = NULL;
-       if ((ng = BATdescriptor(*NG)) == NULL)
-               throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
-
-       BATiter bi = bat_iterator(ng);
-       if (bi.count != 1) {
-               bat_iterator_end(&bi);
-               BBPreclaim(ng);
-               if (bi.count < 1)
-                       throw(MAL, fname, SQLSTATE(42000) "Empty bat\n");
-               else
-                       throw(MAL, fname, SQLSTATE(42000) "Single value bat 
expected\n");
-       }
-       *ngram = *(bte *) BUNtloc(bi, 0);
-       bat_iterator_end(&bi);
-       BBPreclaim(ng);
-       return MAL_SUCCEED;
-}
-
-static int
-init_unigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+init_unigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx 
*qry_ctx)
 {
        NGRAM_TYPE *idx = ng->idx;
        NGRAM_TYPE *sigs = ng->sigs;
@@ -111,21 +89,27 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
        unsigned *rids = ng->rids;
        unsigned *h_tmp = GDKzalloc(UNIGRAM_SZ * sizeof(unsigned));
        unsigned *map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned));
+       unsigned k = 1;
 
        if (!h_tmp || !map) {
                GDKfree(h_tmp);
                GDKfree(map);
-               return -1;
+               throw(MAL, "init_unigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
        }
 
-       for (size_t i = 0; i < b_cnt; i++) {
-               const char *s = BUNtail(*bi, i);
+       oid bbase = bi->b->hseqbase, ob;
+       const char *bvars = bi->vh->base, *bvals = bi->base;
+
+       canditer_reset(bci);
+       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+               ob = canditer_next(bci);
+               const char *s = VALUE(b, ob - bbase);
                if (!strNil(s))
-                       for (; UNIGRAM(s); s++)
+                       for ( ; UNIGRAM(s); s++)
                                h_tmp[ENC_TOKEN1(s)]++;
        }
 
-       for(size_t i = 0; i < UNIGRAM_SZ; i++) {
+       for (size_t i = 0; i < UNIGRAM_SZ; i++) {
                map[i] = i;
                idx[i] = lists[i] = 0;
                h[i] = h_tmp[i];
@@ -135,27 +119,27 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
                         sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
 
        unsigned j = UNIGRAM_SZ - 1, sum = 0;
-       for (; j; j--) {
+       for ( ; j; j--) {
                sum += h_tmp[j];
-               if (sum + h_tmp[j] >= NGRAM_MULTIPLE * b_cnt - 1)
+               if (sum + h_tmp[j] >= NGRAM_MULTIPLE * bci->ncand - 1)
                        break;
        }
        ng->max = h_tmp[0];
        ng->min = h_tmp[j];
 
        int n = 0;
-       for(size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) {
-               unsigned x = map[i];
-               idx[x] = NGRAM_CST(1) << n++;
+       for (size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) {
+               idx[map[i]] = NGRAM_CST(1) << n++;
                n %= NGRAM_BITS;
        }
 
-       unsigned k = 1;
-       for(size_t i = 0; i < b_cnt; i++) {
-               const char *s = BUNtail(*bi, i);
+       canditer_reset(bci);
+       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+               ob = canditer_next(bci);
+               const char *s = VALUE(b, ob - bbase);
                if (!strNil(s) && UNIGRAM(s)) {
                        NGRAM_TYPE sig = 0;
-                       for(; UNIGRAM(s); s++) {
+                       for ( ; UNIGRAM(s); s++) {
                                unsigned unigram = ENC_TOKEN1(s);
                                sig |= idx[unigram];
                                if (h[unigram] <= ng->min) {
@@ -164,16 +148,17 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
                                                k += h[unigram];
                                                h[unigram] = 0;
                                        }
-                                       int done = (h[unigram] > 0 && 
rids[lists[unigram] + h[unigram] - 1] == i);
+                                       bool done = (h[unigram] > 0 &&
+                                                                
rids[lists[unigram] + h[unigram] - 1] == ob - bbase);
                                        if (!done) {
-                                               rids[lists[unigram] + 
h[unigram]] = i;
+                                               rids[lists[unigram] + 
h[unigram]] = ob - bbase;
                                                h[unigram]++;
                                        }
                                }
                        }
                        *sigs = sig;
                } else if (!strNil(s)) {
-                       *sigs = 1;
+                       *sigs = ~0LL; /* TODO */
                } else {
                        *sigs = NGRAM_TYPENIL;
                }
@@ -182,11 +167,11 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
 
        GDKfree(h_tmp);
        GDKfree(map);
-       return 0;
+       return MAL_SUCCEED;
 }
 
-static int
-init_bigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+static str
+init_bigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx)
 {
        NGRAM_TYPE *idx = ng->idx;
        NGRAM_TYPE *sigs = ng->sigs;
@@ -196,17 +181,23 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
        unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
        unsigned *h_tmp_ptr = (unsigned *) h_tmp;
        unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
+       unsigned int k = 1;
 
        if (!h_tmp || !map) {
                GDKfree(h_tmp);
                GDKfree(map);
-               return -1;
+               throw(MAL, "init_bigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
        }
 
-       for (size_t i = 0; i < b_cnt; i++) {
-               const char *s = BUNtail(*bi, i);
+       oid bbase = bi->b->hseqbase, ob;
+       const char *bvars = bi->vh->base, *bvals = bi->base;
+
+       canditer_reset(bci);
+       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+               ob = canditer_next(bci);
+               const char *s = VALUE(b, ob - bbase);
                if (!strNil(s))
-                       for (; BIGRAM(s); s++)
+                       for ( ; BIGRAM(s); s++)
                                h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++;
        }
 
@@ -220,9 +211,9 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
                         sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
 
        unsigned j = BIGRAM_SZ - 1, sum = 0;
-       for (; j; j--) {
+       for ( ; j; j--) {
                sum += h_tmp_ptr[j];
-               if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * b_cnt - 1)
+               if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * bci->ncand - 1)
                        break;
        }
        ng->max = h_tmp_ptr[0];
@@ -230,21 +221,17 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
 
        int n = 0;
        for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
-               /* unsigned x = (map[i] / SZ) % SZ, y = map[i] % SZ; */
-               /* idx[x*SZ + y] = NGRAM_CST(1) << n; */
-               /* n++; */
-               /* n %= NGRAM_BITS; */
-               /* assert(x*SZ + y == map[i]); */
                idx[map[i]] = NGRAM_CST(1) << n++;
                n %= NGRAM_BITS;
        }
 
-       unsigned int k = 1;
-       for (size_t i = 0; i < b_cnt; i++) {
-               const char *s = BUNtail(*bi, i);
+       canditer_reset(bci);
+       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+               ob = canditer_next(bci);
+               const char *s = VALUE(b, ob - bbase);
                if (!strNil(s) && BIGRAM(s)) {
                        NGRAM_TYPE sig = 0;
-                       for (; BIGRAM(s); s++) {
+                       for ( ; BIGRAM(s); s++) {
                                unsigned bigram = ENC_TOKEN1(s)*SZ + 
ENC_TOKEN2(s);
                                sig |= idx[bigram];
                                if (h[bigram] <= ng->min) {
@@ -253,16 +240,16 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
                                                k += h[bigram];
                                                h[bigram] = 0;
                                        }
-                                       int done = (h[bigram] > 0 && 
rids[lists[bigram] + h[bigram] - 1] == i);
+                                       int done = (h[bigram] > 0 && 
rids[lists[bigram] + h[bigram] - 1] == ob - bbase);
                                        if (!done) {
-                                               rids[lists[bigram] + h[bigram]] 
= i;
+                                               rids[lists[bigram] + h[bigram]] 
= ob - bbase;
                                                h[bigram]++;
                                        }
                                }
                        }
                        *sigs = sig;
-               /* } else if (!strNil(s)) { */
-               /*      *sigs = 1; */
+               } else if (!strNil(s)) {
+                       *sigs = ~0LL; /* TODO */
                } else {
                        *sigs = NGRAM_TYPENIL;
                }
@@ -271,11 +258,11 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
 
        GDKfree(h_tmp);
        GDKfree(map);
-       return 0;
+       return MAL_SUCCEED;
 }
 
-static int
-init_trigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+static str
+init_trigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx 
*qry_ctx)
 {
        NGRAM_TYPE *idx = ng->idx;
        NGRAM_TYPE *sigs = ng->sigs;
@@ -285,17 +272,23 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
        unsigned (*h_tmp)[SZ][SZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned));
        unsigned *h_tmp_ptr = (unsigned *) h_tmp;
        unsigned *map = GDKmalloc(TRIGRAM_SZ * sizeof(unsigned));
+       unsigned k = 1;
 
        if (!h_tmp || !map) {
                GDKfree(h_tmp);
                GDKfree(map);
-               return -1;
+               throw(MAL, "init_trigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
        }
 
-       for (size_t i = 0; i < b_cnt; i++) {
-               const char *s = BUNtail(*bi, i);
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to