Changeset: 0d3b608a469e for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/0d3b608a469e
Modified Files:
        monetdb5/modules/atoms/bigram.h
Branch: strimps_v3
Log Message:

Move bigram impl to c file


diffs (truncated from 346 to 300 lines):

diff --git a/monetdb5/modules/atoms/bigram.h b/monetdb5/modules/atoms/bigram.h
--- a/monetdb5/modules/atoms/bigram.h
+++ b/monetdb5/modules/atoms/bigram.h
@@ -12,8 +12,6 @@
 
 #include "monetdb_config.h"
 #include "gdk.h"
-#include "mal_exception.h"
-#include "str.h"
 
 #ifdef HAVE_HGE
 #define NGRAM_TYPE hge
@@ -49,333 +47,3 @@ typedef struct {
        unsigned *lists;
        unsigned *rids;
 } Ngrams;
-
-static void
-ngrams_destroy(Ngrams *ng)
-{
-       if (ng) {
-               GDKfree(ng->idx);
-               GDKfree(ng->sigs);
-               GDKfree(ng->histogram);
-               GDKfree(ng->lists);
-               GDKfree(ng->rids);
-       }
-       GDKfree(ng);
-}
-
-static Ngrams *
-ngrams_create(size_t cnt, size_t ng_sz)
-{
-       Ngrams *ng = GDKmalloc(sizeof(Ngrams));
-       if (ng) {
-               ng->idx  = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
-               ng->sigs = GDKmalloc(cnt * sizeof(NGRAM_TYPE));
-               ng->histogram = GDKmalloc(ng_sz * sizeof(unsigned));
-               ng->lists  = GDKmalloc(ng_sz * sizeof(unsigned));
-               ng->rids  = GDKmalloc(2 * NGRAM_MULTIPLE * cnt * 
sizeof(unsigned));
-       }
-       if (!ng || !ng->idx || !ng->sigs || !ng->histogram || !ng->lists || 
!ng->rids) {
-               ngrams_destroy(ng);
-               return NULL;
-       }
-       return ng;
-}
-
-static str
-init_bigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx)
-{
-       NGRAM_TYPE *idx = ng->idx;
-       NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->histogram;
-       unsigned *lists = ng->lists;
-       unsigned *rids = ng->rids;
-       unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
-       unsigned *h_tmp_ptr = (unsigned *) h_tmp;
-       unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
-       unsigned int k = 1;
-
-       if (!h_tmp || !map) {
-               GDKfree(h_tmp);
-               GDKfree(map);
-               throw(MAL, "init_bigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
-       }
-
-       oid bbase = bi->b->hseqbase, ob;
-       const char *b_vars = bi->vh->base, *b_vals = bi->base;
-
-       canditer_reset(bci);
-       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
-               ob = canditer_next(bci);
-               const char *s = VALUE(b, ob - bbase);
-               if (!strNil(s))
-                       for ( ; BIGRAM(s); s++)
-                               h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++;
-       }
-
-       for (unsigned i = 0; i < BIGRAM_SZ; i++) {
-               map[i] = i;
-               idx[i] = lists[i] = 0;
-               h[i] = h_tmp_ptr[i];
-       }
-
-       GDKqsort(h_tmp, map, NULL, BIGRAM_SZ,
-                        sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
-
-       unsigned j = BIGRAM_SZ - 1, sum = 0;
-       for ( ; j; j--) {
-               sum += h_tmp_ptr[j];
-               if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * bci->ncand - 1)
-                       break;
-       }
-       ng->max = h_tmp_ptr[0];
-       ng->min = h_tmp_ptr[j];
-
-       int n = 0;
-       for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
-               idx[map[i]] = NGRAM_CST(1) << n++;
-               n %= NGRAM_BITS;
-       }
-
-       canditer_reset(bci);
-       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
-               ob = canditer_next(bci);
-               const char *s = VALUE(b, ob - bbase);
-               if (!strNil(s) && BIGRAM(s)) {
-                       NGRAM_TYPE sig = 0;
-                       for ( ; BIGRAM(s); s++) {
-                               unsigned bigram = ENC_TOKEN1(s)*SZ + 
ENC_TOKEN2(s);
-                               sig |= idx[bigram];
-                               if (h[bigram] <= ng->min) {
-                                       if (lists[bigram] == 0) {
-                                               lists[bigram] = k;
-                                               k += h[bigram];
-                                               h[bigram] = 0;
-                                       }
-                                       int done = (h[bigram] > 0 &&
-                                                               
rids[lists[bigram] + h[bigram] - 1] == ob - bbase);
-                                       if (!done) {
-                                               rids[lists[bigram] + h[bigram]] 
= (unsigned)(ob - bbase);
-                                               h[bigram]++;
-                                       }
-                               }
-                       }
-                       *sigs = sig;
-               } else {
-                       *sigs = NGRAM_TYPENIL;
-               }
-               sigs++;
-       }
-
-       GDKfree(h_tmp);
-       GDKfree(map);
-       return MAL_SUCCEED;
-}
-
-/* static str */
-/* bigram_strselect(BAT *rl, BATiter *li, struct canditer *lci, const char *r, 
*/
-/*                              int (*str_cmp)(const char *, const char *, 
int), QryCtx *qry_ctx) */
-/* { */
-/*     str msg = MAL_SUCCEED; */
-/*     Ngrams *ng = ngrams_create(lci->ncand, BIGRAM_SZ); */
-/*     if (!ng) */
-/*             throw(MAL, "select_bigram", SQLSTATE(HY013) MAL_MALLOC_FAIL); */
-
-/*     NGRAM_TYPE *idx = ng->idx; */
-/*     NGRAM_TYPE *sigs = ng->sigs; */
-/*     unsigned *h = ng->histogram; */
-/*     unsigned *lists = ng->lists; */
-/*     unsigned *rids = ng->rids; */
-/*     oid lbase = li->b->hseqbase, ol; */
-/*     const char *lvars = li->vh->base, *lvals = li->base; */
-/*     const char *rs = r, *rs_iter = r; */
-
-/*     if (strNil(rs)) */
-/*             return msg; */
-
-/*     if (strlen(rs) < 2) { */
-/*             canditer_reset(lci); */
-/*             TIMEOUT_LOOP(lci->ncand, qry_ctx) { */
-/*                     ol = canditer_next(lci); */
-/*                     const char *ls = VALUE(l, ol - lbase); */
-/*                     if (!strNil(ls) && str_cmp(ls, rs, str_strlen(rs)) == 
0) */
-/*                             APPEND(rl, ol); */
-/*             } */
-/*     } */
-
-/*     msg = init_bigram_idx(ng, li, lci, qry_ctx); */
-/*     if (msg) { */
-/*             ngrams_destroy(ng); */
-/*             return msg; */
-/*     } */
-
-/*     NGRAM_TYPE sig = 0; */
-/*     unsigned min = ng->max, min_pos = 0; */
-/*     for ( ; BIGRAM(rs_iter); rs_iter++) { */
-/*             unsigned bigram = ENC_TOKEN1(rs_iter)*SZ + ENC_TOKEN2(rs_iter); 
*/
-/*             sig |= idx[bigram]; */
-/*             if (h[bigram] < min) { */
-/*                     min = h[bigram]; */
-/*                     min_pos = bigram; */
-/*             } */
-/*     } */
-
-/*     if (min <= ng->min) { */
-/*             unsigned list = lists[min_pos], list_cnt = h[min_pos]; */
-/*             for (size_t i = 0; i < list_cnt; i++, list++) { */
-/*                     unsigned ol = rids[list]; */
-/*                     if ((sigs[ol] & sig) == sig) { */
-/*                             const char *ls = VALUE(l, ol); */
-/*                             if (str_cmp(ls, rs, str_strlen(rs)) == 0) */
-/*                                     APPEND(rl, ol + lbase); */
-/*                     } */
-/*             } */
-/*     } else { */
-/*             canditer_reset(lci); */
-/*             TIMEOUT_LOOP(lci->ncand, qry_ctx) { */
-/*                     ol = canditer_next(lci); */
-/*                     if ((sigs[ol - lbase] & sig) == sig) { */
-/*                             const char *ls = VALUE(l, ol - lbase); */
-/*                             if (str_cmp(ls, rs, str_strlen(rs)) == 0) */
-/*                                     APPEND(rl, ol); */
-/*                     } */
-/*             } */
-/*     } */
-
-/*     BATsetcount(rl, BATcount(rl)); */
-/*     ngrams_destroy(ng); */
-/*     return msg; */
-/* } */
-
-static str
-bigram_strjoin(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
-                          struct canditer *lci, struct canditer *rci,
-                          int (*str_cmp)(const char *, const char *, int),
-                          const char *fname, QryCtx *qry_ctx)
-{
-       str msg = MAL_SUCCEED;
-
-       Ngrams *ng = ngrams_create(lci->ncand, BIGRAM_SZ);
-       if (!ng)
-               throw(MAL, fname, MAL_MALLOC_FAIL);
-
-       NGRAM_TYPE *idx = ng->idx;
-       NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->histogram;
-       unsigned *lists = ng->lists;
-       unsigned *rids = ng->rids;
-       size_t new_cap;
-
-       msg = init_bigram_idx(ng, li, lci, qry_ctx);
-       if (msg) {
-               ngrams_destroy(ng);
-               return msg;
-       }
-
-       oid lbase = li->b->hseqbase, rbase = ri->b->hseqbase, or, ol;
-       const char *l_vars = li->vh->base, *r_vars = ri->vh->base,
-               *l_vals = li->base, *r_vals = ri->base;
-
-       lng t0 = 0;
-       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
-
-       canditer_reset(lci);
-       TIMEOUT_LOOP(rci->ncand, qry_ctx) {
-               or = canditer_next(rci);
-               const char *rs = VALUE(r, or - rbase), *rs_iter = rs;
-               if (strNil(rs))
-                       continue;
-               if (strlen(rs) < 2) {
-                       canditer_reset(lci);
-                       TIMEOUT_LOOP(lci->ncand, qry_ctx) {
-                               ol = canditer_next(lci);
-                               const char *ls = VALUE(l, ol - lbase);
-                               if (!strNil(ls)) {
-                                       if (str_cmp(ls, rs, str_strlen(rs)) == 
0) {
-                                               APPEND(rl, ol);
-                                               if (rr) APPEND(rr, or);
-                                               if (BATcount(rl) == 
BATcapacity(rl)) {
-                                                       new_cap = BATgrows(rl);
-                                                       if (BATextend(rl, 
new_cap) != GDK_SUCCEED ||
-                                                               (rr && 
BATextend(rr, new_cap) != GDK_SUCCEED)) {
-                                                               
ngrams_destroy(ng);
-                                                               throw(MAL, 
fname, GDK_EXCEPTION);
-                                                       }
-                                               }
-                                       }
-                               }
-                       }
-               } else  if (BIGRAM(rs)) {
-                       NGRAM_TYPE sig = 0;
-                       unsigned min = ng->max, min_pos = 0;
-                       for ( ; BIGRAM(rs_iter); rs_iter++) {
-                               unsigned bigram = ENC_TOKEN1(rs_iter)*SZ + 
ENC_TOKEN2(rs_iter);
-                               sig |= idx[bigram];
-                               if (h[bigram] < min) {
-                                       min = h[bigram];
-                                       min_pos = bigram;
-                               }
-                       }
-                       if (min <= ng->min) {
-                               unsigned list = lists[min_pos], list_cnt = 
h[min_pos];
-                               for (size_t i = 0; i < list_cnt; i++, list++) {
-                                       unsigned ol = rids[list];
-                                       if ((sigs[ol] & sig) == sig) {
-                                               const char *ls = VALUE(l, ol);
-                                               if (str_cmp(ls, rs, 
str_strlen(rs)) == 0) {
-                                                       APPEND(rl, ol + lbase);
-                                                       if (rr) APPEND(rr, or);
-                                                       if (BATcount(rl) == 
BATcapacity(rl)) {
-                                                               new_cap = 
BATgrows(rl);
-                                                               if 
(BATextend(rl, new_cap) != GDK_SUCCEED ||
-                                                                       (rr && 
BATextend(rr, new_cap) != GDK_SUCCEED)) {
-                                                                       
ngrams_destroy(ng);
-                                                                       
throw(MAL, fname, GDK_EXCEPTION);
-                                                               }
-                                                       }
-                                               }
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to