Changeset: 167d56a38ac5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/167d56a38ac5
Modified Files:
        monetdb5/modules/mal/ngrams.c
        sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:

bigrams and trigrams available


diffs (truncated from 470 to 300 lines):

diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -446,8 +446,8 @@ NGc2join_intern(bat *L, bat *R, bat *H, 
        if (BATcount(n) < 10) {
        }
 
-       Ngrams *ngi = ngrams_create_old(h, BIGRAM_SZ);
-       if (ngi && ngrams_init_2gram(ngi, h) == 0) {
+       Ngrams *ng = ngrams_create_old(h, BIGRAM_SZ);
+       if (ng && ngrams_init_2gram(ng, h) == 0) {
                BUN cnt = BATcount(h);
                /* create L/R */
                BAT *l = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
@@ -467,23 +467,23 @@ NGc2join_intern(bat *L, bat *R, bat *H, 
                        if ((ol+1000) > el)
                                break;
                        if (!strNil(s) && s[0] && s[1]) { /* skipped */
-                               NGRAM_TYPE min = ngi->max;
+                               NGRAM_TYPE min = ng->max;
                                unsigned int min_pos = 0;
                                unsigned char p = CHAR_MAP(*s++);
                                for(; *s; p=CHAR_MAP(*s), s++) {
                                        unsigned int k = p*GZ+CHAR_MAP(*s);
-                                       sig |= ngi->idx[k];
-                                       if (ngi->h[k] < min) {
-                                               min = ngi->h[k];
+                                       sig |= ng->idx[k];
+                                       if (ng->h[k] < min) {
+                                               min = ng->h[k];
                                                min_pos = k; /* encoded min 
ngram */
                                        }
                                }
-                               if (min <= ngi->min) {
-                                       unsigned int rr = ngi->pos[min_pos];
-                                       int hcnt = ngi->h[min_pos];
+                               if (min <= ng->min) {
+                                       unsigned int rr = ng->pos[min_pos];
+                                       int hcnt = ng->h[min_pos];
                                        for(int k = 0; k<hcnt; k++, rr++) {
-                                               unsigned int hr = ngi->rid[rr];
-                                               if (((ngi->sigs[hr] & sig) == 
sig)) {
+                                               unsigned int hr = ng->rid[rr];
+                                               if (((ng->sigs[hr] & sig) == 
sig)) {
                                                        char *hs = BUNtail(hi, 
hr);
                                                        if (strstr(hs, os) != 
NULL) {
                                                                *ol++ = hr;
@@ -494,7 +494,7 @@ NGc2join_intern(bat *L, bat *R, bat *H, 
                                } else {
                                        unsigned int hcnt = BATcount(h);
                                        for(size_t k = 0; k < hcnt; k++) {
-                                               if (((ngi->sigs[k] & sig) == 
sig)) {
+                                               if (((ng->sigs[k] & sig) == 
sig)) {
                                                        char *hs = BUNtail(hi, 
k);
                                                        if (strstr(hs, os) != 
NULL) {
                                                                *ol++ = k;
@@ -526,7 +526,7 @@ NGc2join_intern(bat *L, bat *R, bat *H, 
                *R = r->batCacheid;
                BBPkeepref(l);
                BBPkeepref(r);
-               ngrams_destroy(ngi);
+               ngrams_destroy(ng);
                return MAL_SUCCEED;
        }
        BBPreclaim(h);
@@ -838,8 +838,8 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
                ng->h[j] = (unsigned int)hist[j];
        }
        GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), 
NGRAM_TYPEID, true, false);
-       for(i=UNIGRAM_SZ-1; i>=0; i--) {
-               if ((sum + hist[i]) >= (NGRAM_MULTIPLE*b_cnt)-1)
+       for(i = UNIGRAM_SZ - 1; i >= 0; i--) {
+               if ((sum + hist[i]) >= (NGRAM_MULTIPLE * b_cnt) - 1)
                        break;
                sum += hist[i];
        }
@@ -893,6 +893,186 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
        return 0;
 }
 
+static int
+init_bigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+{
+       NGRAM_TYPE (*h)[GZ] = (NGRAM_TYPE (*)[GZ])GDKzalloc(BIGRAM_SZ * 
sizeof(NGRAM_TYPE)),
+               *hist = (NGRAM_TYPE*)h, sum = 0;
+       NGRAM_TYPE *idx = ng->idx;
+       int *id = GDKmalloc(BIGRAM_SZ * sizeof(int)), i;
+
+       if (!h || !id) {
+               GDKfree(h);
+               GDKfree(id);
+               return -1;
+       }
+
+       for (size_t j = 0; j < b_cnt; j++) {
+               const char *s = BUNtail(*bi, j);
+               if (!strNil(s) && *s) {
+                       unsigned char p = CHAR_MAP(*s++);
+                       for (; *s; p = CHAR_MAP(*s), s++) {
+                               h[p][CHAR_MAP(*s)]++;
+                       }
+               }
+       }
+
+       int bc = 0;
+
+       for(size_t j = 0; j < BIGRAM_SZ; j++) {
+               id[j] = j;
+               idx[j] = 0;
+               ng->h[j] = (unsigned int)hist[j];
+       }
+       GDKqsort(h, id, NULL, BIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), 
NGRAM_TYPEID, true, false);
+       for (i = BIGRAM_SZ - 1; i >= 0; i--) {
+               if ((sum + hist[i]) >= (NGRAM_MULTIPLE * b_cnt) - 1)
+                       break;
+               sum += hist[i];
+       }
+       NGRAM_TYPE larger_cnt = hist[i];
+       for(; hist[i] == larger_cnt; i++)
+               ;
+       NGRAM_TYPE max = hist[0], small = hist[i];
+       ng->max = max;
+       ng->min = small;
+
+       for (size_t j = 0; j < BIGRAM_SZ && hist[j] > 0; j++) {
+               int y = (id[j]/GZ) % GZ, z = id[j] % GZ;
+               idx[y*GZ+z] = NGRAM_CST(1) << bc;
+               assert(idx[y*GZ+z] > 0);
+               bc++;
+               bc %= NGRAM_BITS;
+       }
+
+       NGRAM_TYPE *sp = ng->sigs;
+       unsigned int pos = 1;
+       for(size_t j = 0; j < b_cnt; j++) {
+               const char *s = BUNtail(*bi, j);
+               NGRAM_TYPE sig = 0;
+               if (!strNil(s) && s[0] && s[1]) {
+                       unsigned char p = CHAR_MAP(*s++);
+                       for(; *s; p = CHAR_MAP(*s), s++) {
+                               int k = p*GZ+CHAR_MAP(*s);
+                               sig |= idx[k];
+                               if (ng->h[k] <= ng->min) {
+                                       if (ng->pos[k] == 0) {
+                                               ng->pos[k] = pos;
+                                               pos += ng->h[k];
+                                               ng->h[k] = 0;
+                                       }
+                                       /* deduplicate */
+                                       int done =  (ng->h[k] > 0 && 
ng->rid[ng->pos[k] + ng->h[k]-1] == j);
+                                       if (!done) {
+                                               ng->rid[ng->pos[k] + ng->h[k]] 
= j;
+                                               ng->h[k]++;
+                                       }
+                               }
+                       }
+                       *sp = sig;
+               } else {
+                       *sp = NGRAM_TYPENIL;
+               }
+               sp++;
+       }
+
+       GDKfree(h);
+       GDKfree(id);
+       return 0;
+}
+
+static int
+init_trigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+{
+       NGRAM_TYPE (*h)[GZ][GZ] = (NGRAM_TYPE (*)[GZ][GZ])GDKzalloc(TRIGRAM_SZ 
* sizeof(NGRAM_TYPE)),
+               *hist = (NGRAM_TYPE*)h, sum = 0;
+       NGRAM_TYPE *idx = ng->idx;
+       int *id = (int*)GDKmalloc(TRIGRAM_SZ*sizeof(int)), i;
+
+       if (!h || !id) {
+               GDKfree(h);
+               GDKfree(id);
+               return -1;
+       }
+
+       for (size_t i = 0; i < b_cnt; i++) {
+               const char *s = BUNtail(*bi,i);
+               if (!strNil(s) && *s) {
+                       unsigned char pp = CHAR_MAP(*s++);
+                       if (!*s)
+                               continue;
+                       unsigned char p = CHAR_MAP(*s++);
+                       for(; *s; pp = p, p = CHAR_MAP(*s), s++)
+                               h[pp][p][CHAR_MAP(*s)]++;
+               }
+       }
+
+       int bc = 0;
+
+       for (size_t j = 0; j < TRIGRAM_SZ; j++) {
+               id[j] = j;
+               idx[j] = 0;
+               ng->h[j] = (unsigned int)hist[j];  /* TODO check for overflow ? 
*/
+       }
+       GDKqsort(h, id, NULL, TRIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), 
NGRAM_TYPEID, true, false);
+       for (i = TRIGRAM_SZ - 1; i >= 0; i--) {
+               if ((sum + hist[i]) >= (NGRAM_MULTIPLE * b_cnt) - 1)
+                       break;
+               sum += hist[i];
+       }
+       NGRAM_TYPE larger_cnt = hist[i];
+       for (; hist[i] == larger_cnt; i++)
+               ;
+       NGRAM_TYPE max = hist[0], small = hist[i];
+       ng->max = max;
+       ng->min = small;
+
+       for (size_t j = 0; j < TRIGRAM_SZ && hist[j] > 0; j++) {
+               unsigned int x = id[j]/(GZ*GZ), y = (id[j]/GZ)%GZ,
+                       z = id[j]%GZ;
+               idx[x*GZ*GZ+y*GZ+z] = NGRAM_CST(1)<<bc;
+               assert(idx[x*GZ*GZ+y*GZ+z] > 0);
+               bc++;
+               bc %= NGRAM_BITS;
+       }
+
+       NGRAM_TYPE *sp = ng->sigs;
+       unsigned int pos = 1;
+       for (size_t j = 0; j < b_cnt; j++) {
+               const char *s = BUNtail(*bi, j);
+               NGRAM_TYPE sig = 0;
+               if (!strNil(s) && s[0] && s[1] && s[2]) {
+                       unsigned char pp = CHAR_MAP(*s++);
+                       unsigned char p = CHAR_MAP(*s++);
+                       for(; *s; pp = p, p = CHAR_MAP(*s), s++) {
+                               int k = pp*GZ*GZ+p*GZ+CHAR_MAP(*s);
+                               sig |= idx[k];
+                               if (ng->h[k] <= ng->min) {
+                                       if (ng->pos[k] == 0) {
+                                               ng->pos[k] = pos;
+                                               pos += ng->h[k];
+                                               ng->h[k] = 0;
+                                       }
+                                       /* deduplicate */
+                                       int done =  (ng->h[k] > 0 && 
ng->rid[ng->pos[k] + ng->h[k]-1] == j);
+                                       if (!done) {
+                                               ng->rid[ng->pos[k] + ng->h[k]] 
= j;
+                                               ng->h[k]++;
+                                       }
+                               }
+                       }
+                       *sp = sig;
+               } else {
+                       *sp = NGRAM_TYPENIL;
+               }
+               sp++;
+       }
+
+       GDKfree(h);
+       GDKfree(id);
+       return 0;
+}
+
 static str
 join_unigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
                         size_t l_cnt, size_t r_cnt,
@@ -915,7 +1095,7 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
                const char *s = BUNtail(*ri, i), *os = s;
                NGRAM_TYPE sig = 0;
 
-               if ((ol+1000) > el)
+               if ((ol + 1000) > el)
                        break;
                if (!strNil(s) && s[0]) {
                        NGRAM_TYPE min = ng->max;
@@ -972,6 +1152,165 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
 }
 
 static str
+join_bigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
+                        size_t l_cnt, size_t r_cnt,
+                        int (*str_cmp)(const char *, const char *, int))
+{
+       Ngrams *ng = ngrams_create(l_cnt, BIGRAM_SZ);
+
+       if (!ng)
+               throw(MAL, "join_bigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+       if (init_bigram_idx(ng, li, l_cnt) != 0)
+               throw(MAL, "join_bigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+       NGRAM_TYPE nmax = 0;
+       oid *ol = Tloc(rl, 0), *el = ol + 10 * l_cnt;
+       oid *or = Tloc(rr, 0);
+
+       /* if needed grow */
+       for(size_t i = 0; i < r_cnt; i++) {
+               const char *s = BUNtail(*ri, i), *os = s;
+               NGRAM_TYPE sig = 0;
+
+               if ((ol + 1000) > el)
+                       break;
+               if (!strNil(s) && s[0] && s[1]) { /* skipped */
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to