Changeset: 4632baa8c417 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/4632baa8c417
Modified Files:
        monetdb5/modules/atoms/str.c
Branch: strimps_v3
Log Message:

Tweak algo choice


diffs (truncated from 352 to 300 lines):

diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c
--- a/monetdb5/modules/atoms/str.c
+++ b/monetdb5/modules/atoms/str.c
@@ -2214,6 +2214,339 @@ nested_loop_strjoin(BAT *rl, BAT *rr, BA
        return MAL_SUCCEED;
 }
 
+static void
+ngrams_destroy(Ngrams *ng)
+{
+       if (ng) {
+               GDKfree(ng->idx);
+               GDKfree(ng->sigs);
+               GDKfree(ng->histogram);
+               GDKfree(ng->lists);
+               GDKfree(ng->rids);
+       }
+       GDKfree(ng);
+}
+
+static Ngrams *
+ngrams_create(size_t cnt, size_t ng_sz)
+{
+       Ngrams *ng = GDKmalloc(sizeof(Ngrams));
+       if (ng) {
+               ng->idx  = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
+               ng->sigs = GDKmalloc(cnt * sizeof(NGRAM_TYPE));
+               ng->histogram = GDKmalloc(ng_sz * sizeof(unsigned));
+               ng->lists  = GDKmalloc(ng_sz * sizeof(unsigned));
+               ng->rids  = GDKmalloc(2 * NGRAM_MULTIPLE * cnt * 
sizeof(unsigned));
+       }
+       if (!ng || !ng->idx || !ng->sigs || !ng->histogram || !ng->lists || 
!ng->rids) {
+               ngrams_destroy(ng);
+               return NULL;
+       }
+       return ng;
+}
+
+static str
+init_bigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx)
+{
+       NGRAM_TYPE *idx = ng->idx;
+       NGRAM_TYPE *sigs = ng->sigs;
+       unsigned *h = ng->histogram;
+       unsigned *lists = ng->lists;
+       unsigned *rids = ng->rids;
+       unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
+       unsigned *h_tmp_ptr = (unsigned *) h_tmp;
+       unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
+       unsigned int k = 1;
+
+       if (!h_tmp || !map) {
+               GDKfree(h_tmp);
+               GDKfree(map);
+               throw(MAL, "init_bigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+       }
+
+       oid bbase = bi->b->hseqbase, ob;
+       const char *b_vars = bi->vh->base, *b_vals = bi->base;
+
+       canditer_reset(bci);
+       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+               ob = canditer_next(bci);
+               const char *s = VALUE(b, ob - bbase);
+               if (!strNil(s))
+                       for ( ; BIGRAM(s); s++)
+                               h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++;
+       }
+
+       for (unsigned i = 0; i < BIGRAM_SZ; i++) {
+               map[i] = i;
+               idx[i] = lists[i] = 0;
+               h[i] = h_tmp_ptr[i];
+       }
+
+       GDKqsort(h_tmp, map, NULL, BIGRAM_SZ,
+                        sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
+
+       unsigned j = BIGRAM_SZ - 1, sum = 0;
+       for ( ; j; j--) {
+               sum += h_tmp_ptr[j];
+               if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * bci->ncand - 1)
+                       break;
+       }
+       ng->max = h_tmp_ptr[0];
+       ng->min = h_tmp_ptr[j];
+
+       int n = 0;
+       for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
+               idx[map[i]] = NGRAM_CST(1) << n++;
+               n %= NGRAM_BITS;
+       }
+
+       canditer_reset(bci);
+       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+               ob = canditer_next(bci);
+               const char *s = VALUE(b, ob - bbase);
+               if (!strNil(s) && BIGRAM(s)) {
+                       NGRAM_TYPE sig = 0;
+                       for ( ; BIGRAM(s); s++) {
+                               unsigned bigram = ENC_TOKEN1(s)*SZ + 
ENC_TOKEN2(s);
+                               sig |= idx[bigram];
+                               if (h[bigram] <= ng->min) {
+                                       if (lists[bigram] == 0) {
+                                               lists[bigram] = k;
+                                               k += h[bigram];
+                                               h[bigram] = 0;
+                                       }
+                                       int done = (h[bigram] > 0 &&
+                                                               
rids[lists[bigram] + h[bigram] - 1] == ob - bbase);
+                                       if (!done) {
+                                               rids[lists[bigram] + h[bigram]] 
= (unsigned)(ob - bbase);
+                                               h[bigram]++;
+                                       }
+                               }
+                       }
+                       *sigs = sig;
+               } else {
+                       *sigs = NGRAM_TYPENIL;
+               }
+               sigs++;
+       }
+
+       GDKfree(h_tmp);
+       GDKfree(map);
+       return MAL_SUCCEED;
+}
+
+/* static str */
+/* bigram_strselect(BAT *rl, BATiter *li, struct canditer *lci, const char *r, 
*/
+/*                              int (*str_cmp)(const char *, const char *, 
int), QryCtx *qry_ctx) */
+/* { */
+/*     str msg = MAL_SUCCEED; */
+/*     Ngrams *ng = ngrams_create(lci->ncand, BIGRAM_SZ); */
+/*     if (!ng) */
+/*             throw(MAL, "select_bigram", SQLSTATE(HY013) MAL_MALLOC_FAIL); */
+
+/*     NGRAM_TYPE *idx = ng->idx; */
+/*     NGRAM_TYPE *sigs = ng->sigs; */
+/*     unsigned *h = ng->histogram; */
+/*     unsigned *lists = ng->lists; */
+/*     unsigned *rids = ng->rids; */
+/*     oid lbase = li->b->hseqbase, ol; */
+/*     const char *lvars = li->vh->base, *lvals = li->base; */
+/*     const char *rs = r, *rs_iter = r; */
+
+/*     if (strNil(rs)) */
+/*             return msg; */
+
+/*     if (strlen(rs) < 2) { */
+/*             canditer_reset(lci); */
+/*             TIMEOUT_LOOP(lci->ncand, qry_ctx) { */
+/*                     ol = canditer_next(lci); */
+/*                     const char *ls = VALUE(l, ol - lbase); */
+/*                     if (!strNil(ls) && str_cmp(ls, rs, str_strlen(rs)) == 
0) */
+/*                             APPEND(rl, ol); */
+/*             } */
+/*     } */
+
+/*     msg = init_bigram_idx(ng, li, lci, qry_ctx); */
+/*     if (msg) { */
+/*             ngrams_destroy(ng); */
+/*             return msg; */
+/*     } */
+
+/*     NGRAM_TYPE sig = 0; */
+/*     unsigned min = ng->max, min_pos = 0; */
+/*     for ( ; BIGRAM(rs_iter); rs_iter++) { */
+/*             unsigned bigram = ENC_TOKEN1(rs_iter)*SZ + ENC_TOKEN2(rs_iter); 
*/
+/*             sig |= idx[bigram]; */
+/*             if (h[bigram] < min) { */
+/*                     min = h[bigram]; */
+/*                     min_pos = bigram; */
+/*             } */
+/*     } */
+
+/*     if (min <= ng->min) { */
+/*             unsigned list = lists[min_pos], list_cnt = h[min_pos]; */
+/*             for (size_t i = 0; i < list_cnt; i++, list++) { */
+/*                     unsigned ol = rids[list]; */
+/*                     if ((sigs[ol] & sig) == sig) { */
+/*                             const char *ls = VALUE(l, ol); */
+/*                             if (str_cmp(ls, rs, str_strlen(rs)) == 0) */
+/*                                     APPEND(rl, ol + lbase); */
+/*                     } */
+/*             } */
+/*     } else { */
+/*             canditer_reset(lci); */
+/*             TIMEOUT_LOOP(lci->ncand, qry_ctx) { */
+/*                     ol = canditer_next(lci); */
+/*                     if ((sigs[ol - lbase] & sig) == sig) { */
+/*                             const char *ls = VALUE(l, ol - lbase); */
+/*                             if (str_cmp(ls, rs, str_strlen(rs)) == 0) */
+/*                                     APPEND(rl, ol); */
+/*                     } */
+/*             } */
+/*     } */
+
+/*     BATsetcount(rl, BATcount(rl)); */
+/*     ngrams_destroy(ng); */
+/*     return msg; */
+/* } */
+
+static str
+bigram_strjoin(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
+                          struct canditer *lci, struct canditer *rci,
+                          int (*str_cmp)(const char *, const char *, int),
+                          const char *fname, QryCtx *qry_ctx)
+{
+       str msg = MAL_SUCCEED;
+
+       Ngrams *ng = ngrams_create(lci->ncand, BIGRAM_SZ);
+       if (!ng)
+               throw(MAL, fname, MAL_MALLOC_FAIL);
+
+       NGRAM_TYPE *idx = ng->idx;
+       NGRAM_TYPE *sigs = ng->sigs;
+       unsigned *h = ng->histogram;
+       unsigned *lists = ng->lists;
+       unsigned *rids = ng->rids;
+       size_t new_cap;
+
+       msg = init_bigram_idx(ng, li, lci, qry_ctx);
+       if (msg) {
+               ngrams_destroy(ng);
+               return msg;
+       }
+
+       BATprint(GDKstdout, li->b);
+       printf("%s\n", MT_thread_getname());
+
+       oid lbase = li->b->hseqbase, rbase = ri->b->hseqbase, or, ol;
+       const char *l_vars = li->vh->base, *r_vars = ri->vh->base,
+               *l_vals = li->base, *r_vals = ri->base;
+
+       lng t0 = 0;
+       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+
+       canditer_reset(lci);
+       TIMEOUT_LOOP(rci->ncand, qry_ctx) {
+               or = canditer_next(rci);
+               const char *rs = VALUE(r, or - rbase), *rs_iter = rs;
+               if (strNil(rs))
+                       continue;
+               if (strlen(rs) < 2) {
+                       canditer_reset(lci);
+                       TIMEOUT_LOOP(lci->ncand, qry_ctx) {
+                               ol = canditer_next(lci);
+                               const char *ls = VALUE(l, ol - lbase);
+                               if (!strNil(ls)) {
+                                       if (str_cmp(ls, rs, str_strlen(rs)) == 
0) {
+                                               APPEND(rl, ol);
+                                               if (rr) APPEND(rr, or);
+                                               if (BATcount(rl) == 
BATcapacity(rl)) {
+                                                       new_cap = BATgrows(rl);
+                                                       if (BATextend(rl, 
new_cap) != GDK_SUCCEED ||
+                                                               (rr && 
BATextend(rr, new_cap) != GDK_SUCCEED)) {
+                                                               
ngrams_destroy(ng);
+                                                               throw(MAL, 
fname, GDK_EXCEPTION);
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               } else  if (BIGRAM(rs)) {
+                       NGRAM_TYPE sig = 0;
+                       unsigned min = ng->max, min_pos = 0;
+                       for ( ; BIGRAM(rs_iter); rs_iter++) {
+                               unsigned bigram = ENC_TOKEN1(rs_iter)*SZ + 
ENC_TOKEN2(rs_iter);
+                               sig |= idx[bigram];
+                               if (h[bigram] < min) {
+                                       min = h[bigram];
+                                       min_pos = bigram;
+                               }
+                       }
+                       if (min <= ng->min) {
+                               unsigned list = lists[min_pos], list_cnt = 
h[min_pos];
+                               for (size_t i = 0; i < list_cnt; i++, list++) {
+                                       unsigned ol = rids[list];
+                                       if ((sigs[ol] & sig) == sig) {
+                                               const char *ls = VALUE(l, ol);
+                                               if (str_cmp(ls, rs, 
str_strlen(rs)) == 0) {
+                                                       APPEND(rl, ol + lbase);
+                                                       if (rr) APPEND(rr, or);
+                                                       if (BATcount(rl) == 
BATcapacity(rl)) {
+                                                               new_cap = 
BATgrows(rl);
+                                                               if 
(BATextend(rl, new_cap) != GDK_SUCCEED ||
+                                                                       (rr && 
BATextend(rr, new_cap) != GDK_SUCCEED)) {
+                                                                       
ngrams_destroy(ng);
+                                                                       
throw(MAL, fname, GDK_EXCEPTION);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       } else {
+                               canditer_reset(lci);
+                               TIMEOUT_LOOP(lci->ncand, qry_ctx) {
+                                       ol = canditer_next(lci);
+                                       if ((sigs[ol - lbase] & sig) == sig) {
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to