Changeset: aacd9407a636 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/aacd9407a636
Modified Files:
        monetdb5/modules/mal/CMakeLists.txt
        monetdb5/modules/mal/ngrams.c
        monetdb5/modules/mal/ngrams.h
        sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:

uni, bi and tri clean implementations


diffs (truncated from 908 to 300 lines):

diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -41,8 +41,7 @@ target_sources(malmodules
   projectionpath.c
   tablet.c tablet.h
   batcalc.c calc.c
-  ngrams.c ngrams.h
-  ngrams_old.c)
+  ngrams.c ngrams.h)
 
 target_include_directories(malmodules
   PRIVATE
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -54,9 +54,9 @@ ngrams_destroy(Ngrams *ng)
        if (ng) {
                GDKfree(ng->idx);
                GDKfree(ng->sigs);
-               GDKfree(ng->h);
-               GDKfree(ng->pos);
-               GDKfree(ng->rid);
+               GDKfree(ng->histogram);
+               GDKfree(ng->lists);
+               GDKfree(ng->rids);
        }
        GDKfree(ng);
 }
@@ -68,18 +68,17 @@ ngrams_create(size_t b_cnt, size_t ng_sz
        if (ng) {
                ng->idx  = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
                ng->sigs = GDKmalloc(b_cnt * sizeof(NGRAM_TYPE));
-               ng->h    = GDKmalloc(ng_sz * sizeof(unsigned));
-               ng->pos  = GDKmalloc(ng_sz * sizeof(unsigned));
-               ng->rid  = GDKmalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
+               ng->histogram = GDKmalloc(ng_sz * sizeof(unsigned));
+               ng->lists  = GDKmalloc(ng_sz * sizeof(unsigned));
+               ng->rids  = GDKmalloc(2 * NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
        }
-       if (!ng || !ng->idx || !ng->sigs || !ng->h || !ng->pos || !ng->rid) {
+       if (!ng || !ng->idx || !ng->sigs || !ng->histogram || !ng->lists || !ng->rids) {
                ngrams_destroy(ng);
                return NULL;
        }
        return ng;
 }
 
-
 static str
 ngram_choice(const bat *NG, bte *ngram, const char *fname)
 {
@@ -107,9 +106,9 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
 {
        NGRAM_TYPE *idx = ng->idx;
        NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->h;
-       unsigned *pos = ng->pos;
-       unsigned *rid = ng->rid;
+       unsigned *h = ng->histogram;
+       unsigned *lists = ng->lists;
+       unsigned *rids = ng->rids;
        unsigned *h_tmp = GDKzalloc(UNIGRAM_SZ * sizeof(unsigned));
        unsigned *map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned));
 
@@ -119,16 +118,16 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
                return -1;
        }
 
-       for(size_t i = 0; i < b_cnt; i++) {
+       for (size_t i = 0; i < b_cnt; i++) {
                const char *s = BUNtail(*bi, i);
-               if (!strNil(s) && *s)
-                       for(const char *c = s; *c; c++)
-                               h_tmp[CHAR_MAP(*c)]++;
+               if (!strNil(s))
+                       for (; UNIGRAM(s); s++)
+                               h_tmp[ENC_TOKEN1(s)]++;
        }
 
        for(size_t i = 0; i < UNIGRAM_SZ; i++) {
                map[i] = i;
-               idx[i] = pos[i] = 0;
+               idx[i] = lists[i] = 0;
                h[i] = h_tmp[i];
        }
 
@@ -144,36 +143,37 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
        ng->max = h_tmp[0];
        ng->min = h_tmp[j];
 
-       int n_shift = 0;
-       for(size_t i = 0; i < UNIGRAM_SZ && h[i] > 0; i++) {
+       int n = 0;
+       for(size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) {
                unsigned x = map[i];
-               idx[x] = NGRAM_CST(1) << n_shift;
-               n_shift++;
-               n_shift %= NGRAM_BITS;
+               idx[x] = NGRAM_CST(1) << n++;
+               n %= NGRAM_BITS;
        }
 
-       unsigned p = 1;
+       unsigned k = 1;
        for(size_t i = 0; i < b_cnt; i++) {
                const char *s = BUNtail(*bi, i);
-               if (!strNil(s) && s[0]) {
+               if (!strNil(s) && UNIGRAM(s)) {
                        NGRAM_TYPE sig = 0;
-                       for(; *s; s++) {
-                               unsigned c = CHAR_MAP(*s);
-                               sig |= idx[c];
-                               if (h[c] <= ng->min) {
-                                       if (pos[c] == 0) {
-                                               pos[c] = p;
-                                               p += h[c];
-                                               h[c] = 0;
+                       for(; UNIGRAM(s); s++) {
+                               unsigned unigram = ENC_TOKEN1(s);
+                               sig |= idx[unigram];
+                               if (h[unigram] <= ng->min) {
+                                       if (lists[unigram] == 0) {
+                                               lists[unigram] = k;
+                                               k += h[unigram];
+                                               h[unigram] = 0;
                                        }
-                                       int done =  (h[c] > 0 && rid[pos[c] + h[c] - 1] == i);
+                                       int done = (h[unigram] > 0 && rids[lists[unigram] + h[unigram] - 1] == i);
                                        if (!done) {
-                                               rid[pos[c] + h[c]] = i;
-                                               h[c]++;
+                                               rids[lists[unigram] + h[unigram]] = i;
+                                               h[unigram]++;
                                        }
                                }
                        }
                        *sigs = sig;
+               } else if (!strNil(s)) {
+                       *sigs = 1;
                } else {
                        *sigs = NGRAM_TYPENIL;
                }
@@ -190,10 +190,10 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
 {
        NGRAM_TYPE *idx = ng->idx;
        NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->h;
-       unsigned *pos = ng->pos;
-       unsigned *rid = ng->rid;
-       unsigned (*h_tmp)[GZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
+       unsigned *h = ng->histogram;
+       unsigned *lists = ng->lists;
+       unsigned *rids = ng->rids;
+       unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
        unsigned *h_tmp_ptr = (unsigned *) h_tmp;
        unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
 
@@ -205,17 +205,15 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
 
        for (size_t i = 0; i < b_cnt; i++) {
                const char *s = BUNtail(*bi, i);
-               if (!strNil(s) && *s) {
-                       unsigned char p = CHAR_MAP(*s++);
-                       for (; *s; p = CHAR_MAP(*s), s++)
-                               h_tmp[p][CHAR_MAP(*s)]++;
-               }
+               if (!strNil(s))
+                       for (; BIGRAM(s); s++)
+                               h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++;
        }
 
        for (size_t i = 0; i < BIGRAM_SZ; i++) {
                map[i] = i;
-               idx[i] = pos[i] = 0;
-               ng->h[i] = h_tmp_ptr[i];
+               idx[i] = lists[i] = 0;
+               h[i] = h_tmp_ptr[i];
        }
 
        GDKqsort(h_tmp, map, NULL, BIGRAM_SZ,
@@ -230,38 +228,41 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
        ng->max = h_tmp_ptr[0];
        ng->min = h_tmp_ptr[j];
 
-       int n_shift = 0;
+       int n = 0;
        for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
-               unsigned x = (map[i] / GZ) % GZ;
-               unsigned y = map[i] % GZ;
-               idx[x * GZ + y] = NGRAM_CST(1) << n_shift;
-               n_shift++;
-               n_shift %= NGRAM_BITS;
+               /* unsigned x = (map[i] / SZ) % SZ, y = map[i] % SZ; */
+               /* idx[x*SZ + y] = NGRAM_CST(1) << n; */
+               /* n++; */
+               /* n %= NGRAM_BITS; */
+               /* assert(x*SZ + y == map[i]); */
+               idx[map[i]] = NGRAM_CST(1) << n++;
+               n %= NGRAM_BITS;
        }
 
-       unsigned int p = 1;
+       unsigned int k = 1;
        for (size_t i = 0; i < b_cnt; i++) {
                const char *s = BUNtail(*bi, i);
-               if (!strNil(s) && s[0] && s[1]) {
+               if (!strNil(s) && BIGRAM(s)) {
                        NGRAM_TYPE sig = 0;
-                       unsigned c = CHAR_MAP(*s++);
-                       for (; *s; c = CHAR_MAP(*s), s++) {
-                               int k = c * GZ + CHAR_MAP(*s);
-                               sig |= idx[k];
-                               if (h[k] <= ng->min) {
-                                       if (pos[k] == 0) {
-                                               pos[k] = p;
-                                               p += h[k];
-                                               h[k] = 0;
+                       for (; BIGRAM(s); s++) {
+                               unsigned bigram = ENC_TOKEN1(s)*SZ + ENC_TOKEN2(s);
+                               sig |= idx[bigram];
+                               if (h[bigram] <= ng->min) {
+                                       if (lists[bigram] == 0) {
+                                               lists[bigram] = k;
+                                               k += h[bigram];
+                                               h[bigram] = 0;
                                        }
-                                       int done =  (h[k] > 0 && rid[pos[k] + h[k] -1] == i);
+                                       int done = (h[bigram] > 0 && rids[lists[bigram] + h[bigram] - 1] == i);
                                        if (!done) {
-                                               rid[pos[k] + h[k]] = i;
-                                               h[k]++;
+                                               rids[lists[bigram] + h[bigram]] = i;
+                                               h[bigram]++;
                                        }
                                }
                        }
                        *sigs = sig;
+               /* } else if (!strNil(s)) { */
+               /*      *sigs = 1; */
                } else {
                        *sigs = NGRAM_TYPENIL;
                }
@@ -278,10 +279,10 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
 {
        NGRAM_TYPE *idx = ng->idx;
        NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->h;
-       unsigned *pos = ng->pos;
-       unsigned *rid = ng->rid;
-       unsigned (*h_tmp)[GZ][GZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned));
+       unsigned *h = ng->histogram;
+       unsigned *lists = ng->lists;
+       unsigned *rids = ng->rids;
+       unsigned (*h_tmp)[SZ][SZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned));
        unsigned *h_tmp_ptr = (unsigned *) h_tmp;
        unsigned *map = GDKmalloc(TRIGRAM_SZ * sizeof(unsigned));
 
@@ -293,20 +294,15 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
 
        for (size_t i = 0; i < b_cnt; i++) {
                const char *s = BUNtail(*bi, i);
-               if (!strNil(s) && *s) {
-                       unsigned char pp = CHAR_MAP(*s++);
-                       if (!*s)
-                               continue;
-                       unsigned char p = CHAR_MAP(*s++);
-                       for(; *s; pp = p, p = CHAR_MAP(*s), s++)
-                               h_tmp[pp][p][CHAR_MAP(*s)]++;
-               }
+               if (!strNil(s))
+                       for (; TRIGRAM(s); s++)
+                               h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)][ENC_TOKEN3(s)]++;
        }
 
-       for (size_t j = 0; j < TRIGRAM_SZ; j++) {
-               map[j] = j;
-               idx[j] = pos[j] = 0;
-               ng->h[j] = h_tmp_ptr[j];
+       for (size_t i = 0; i < TRIGRAM_SZ; i++) {
+               map[i] = i;
+               idx[i] = lists[i] = 0;
+               h[i] = h_tmp_ptr[i];
        }
 
        GDKqsort(h_tmp, map, NULL, TRIGRAM_SZ,
@@ -321,41 +317,36 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
        ng->max = h_tmp_ptr[0];
        ng->min = h_tmp_ptr[j];
 
-       int n_shift = 0;
-       for (size_t j = 0; j < TRIGRAM_SZ && h_tmp_ptr[j] > 0; j++) {
-               unsigned x = map[j]/(GZ*GZ);
-               unsigned y = (map[j]/GZ)%GZ;
-               unsigned z = map[j]%GZ;
-               idx[x*GZ*GZ+y*GZ+z] = NGRAM_CST(1) << n_shift;
-               n_shift++;
-               n_shift %= NGRAM_BITS;
+       int n = 0;
+       for (size_t i = 0; i < TRIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to