Changeset: 3cfcf8ae683a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3cfcf8ae683a
Modified Files:
        monetdb5/modules/mal/ngrams.c
        monetdb5/modules/mal/ngrams.h
Branch: strimps_v3
Log Message:

minor stuff


diffs (truncated from 440 to 300 lines):

diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -17,6 +17,26 @@
 #include "string.h"
 #include "str.h"
 
+static inline int /* strncmp-style result: 0 when s1 and s2 agree on their first s2_len bytes (s2 is a prefix of s1 when s2_len is the length of s2), non-zero otherwise */
+is_prefix(const char *s1, const char *s2, int s2_len)
+{
+       return strncmp(s1, s2, s2_len); /* s2_len is converted to size_t here; NOTE(review): assumes callers never pass a negative length - confirm */
+}
+
+static inline int /* 0 when the tail of s1 (its last s2_len positions) compares equal to s2, non-zero otherwise */
+is_suffix(const char *s1, const char *s2, int s2_len)
+{
+       int sl = str_strlen(s1); /* length of s1 as reported by str_strlen (defined outside this hunk); NOTE(review): the pointer arithmetic below assumes a byte count - confirm */
+       return sl < s2_len ? -1 : strcmp(s1 + sl - s2_len, s2); /* s1 shorter than s2_len: report a mismatch (-1); otherwise compare from s2_len before the end of s1 */
+}
+
+static inline int /* 0 when s2 occurs somewhere in s1, 1 otherwise - same "0 means match" convention as strcmp */
+is_contains(const char *s1, const char *s2, int s2_len)
+{
+       (void) s2_len; /* unused; presumably kept so the signature matches the str_cmp callback type taken by the join_* functions - confirm */
+       return strstr(s1, s2) == NULL; /* strstr yields NULL when s2 is not found, which makes the result 1 */
+}
+
 static inline void
 BBPreclaim_n(int nargs, ...)
 {
@@ -47,10 +67,10 @@ ngrams_create(size_t b_cnt, size_t ng_sz
 {
        Ngrams *ng = GDKmalloc(sizeof(Ngrams));
        if (ng) {
-               ng->idx  = GDKzalloc(ng_sz * sizeof(NGRAM_TYPE));
-               ng->sigs = GDKzalloc(b_cnt * sizeof(NGRAM_TYPE));
-               ng->h    = GDKzalloc(ng_sz * sizeof(unsigned));
-               ng->pos  = GDKzalloc(ng_sz * sizeof(unsigned));
+               ng->idx  = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
+               ng->sigs = GDKmalloc(b_cnt * sizeof(NGRAM_TYPE));
+               ng->h    = GDKmalloc(ng_sz * sizeof(unsigned));
+               ng->pos  = GDKmalloc(ng_sz * sizeof(unsigned));
                ng->rid  = GDKzalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
        }
        if (!ng || !ng->idx || !ng->sigs || !ng->h || !ng->pos || !ng->rid) {
@@ -770,11 +790,11 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
        unsigned *pos = ng->pos;
        unsigned *rid = ng->rid;
        unsigned *h_tmp = GDKzalloc(UNIGRAM_SZ * sizeof(unsigned));
-       unsigned *ascii_map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned));
+       unsigned *map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned));
 
-       if (!h_tmp || !ascii_map) {
+       if (!h_tmp || !map) {
                GDKfree(h_tmp);
-               GDKfree(ascii_map);
+               GDKfree(map);
                return -1;
        }
 
@@ -786,12 +806,12 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
        }
 
        for(size_t i = 0; i < UNIGRAM_SZ; i++) {
-               ascii_map[i] = i;
+               map[i] = i;
                idx[i] = pos[i] = 0;
                h[i] = h_tmp[i];
        }
 
-       GDKqsort(h_tmp, ascii_map, NULL, UNIGRAM_SZ,
+       GDKqsort(h_tmp, map, NULL, UNIGRAM_SZ,
                         sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
 
        unsigned j = UNIGRAM_SZ - 1, sum = 0;
@@ -805,7 +825,7 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
 
        int n_shift = 0;
        for(size_t i = 0; i < UNIGRAM_SZ && h[i] > 0; i++) {
-               unsigned x = ascii_map[i];
+               unsigned x = map[i];
                idx[x] = NGRAM_CST(1) << n_shift;
                n_shift++;
                n_shift %= NGRAM_BITS;
@@ -814,7 +834,7 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
        unsigned p = 1;
        for(size_t i = 0; i < b_cnt; i++) {
                const char *s = BUNtail(*bi, i);
-               if (!strNil(s) && *s) {
+               if (!strNil(s) && s[0]) {
                        NGRAM_TYPE sig = 0;
                        for(; *s; s++) {
                                unsigned c = CHAR_MAP(*s);
@@ -840,7 +860,7 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
        }
 
        GDKfree(h_tmp);
-       GDKfree(ascii_map);
+       GDKfree(map);
        return 0;
 }
 
@@ -854,11 +874,11 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
        unsigned *rid = ng->rid;
        unsigned (*h_tmp)[GZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
        unsigned *h_tmp_ptr = (unsigned*) h_tmp;
-       unsigned *ascii_map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
+       unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
 
-       if (!h_tmp || !ascii_map) {
+       if (!h_tmp || !map) {
                GDKfree(h_tmp);
-               GDKfree(ascii_map);
+               GDKfree(map);
                return -1;
        }
 
@@ -872,12 +892,12 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
        }
 
        for(size_t i = 0; i < BIGRAM_SZ; i++) {
-               ascii_map[i] = i;
+               map[i] = i;
                idx[i] = 0;
                ng->h[i] = h_tmp_ptr[i];
        }
 
-       GDKqsort(h_tmp, ascii_map, NULL, BIGRAM_SZ,
+       GDKqsort(h_tmp, map, NULL, BIGRAM_SZ,
                         sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
 
        unsigned j = BIGRAM_SZ - 1, sum = 0;
@@ -891,8 +911,8 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
 
        int n_shift = 0;
        for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
-               unsigned x = (ascii_map[i]/GZ) % GZ;
-               unsigned y = ascii_map[i] % GZ;
+               unsigned x = (map[i]/GZ) % GZ;
+               unsigned y = map[i] % GZ;
                idx[x * GZ + y] = NGRAM_CST(1) << n_shift;
                n_shift++;
                n_shift %= NGRAM_BITS;
@@ -928,7 +948,7 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
        }
 
        GDKfree(h_tmp);
-       GDKfree(ascii_map);
+       GDKfree(map);
        return 0;
 }
 
@@ -1033,6 +1053,11 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
                         int (*str_cmp)(const char *, const char *, int))
 {
        Ngrams *ng = ngrams_create(l_cnt, UNIGRAM_SZ);
+       NGRAM_TYPE *idx = ng->idx;
+       NGRAM_TYPE *sigs = ng->sigs;
+       unsigned *h = ng->h;
+       unsigned *pos = ng->pos;
+       unsigned *rid = ng->rid;
 
        if (!ng)
                throw(MAL, "join_unigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
@@ -1044,52 +1069,52 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
        oid *o_rl = Tloc(rl, 0);
        oid *o_rr = Tloc(rr, 0);
 
-       for(size_t i = 0; i < r_cnt; i++) {
-               const char *s = BUNtail(*ri, i), *os = s;
+       for (size_t j = 0; j < r_cnt; j++) {
+               const char *rs = BUNtail(*ri, j), *rsc = rs;
                NGRAM_TYPE sig = 0;
-               if (!strNil(s) && s[0]) {
+               if (!strNil(rs) && rs[0]) {
                        unsigned min = ng->max;
                        unsigned min_pos = 0;
-                       for(; *s; s++) {
-                               unsigned int c = CHAR_MAP(*s);
-                               sig |= ng->idx[c];
-                               if (ng->h[c] < min) {
-                                       min = ng->h[c];
-                                       min_pos = c;
+                       for (; *rsc; rsc++) {
+                               unsigned d = CHAR_MAP(*rsc);
+                               sig |= idx[d];
+                               if (h[d] < min) {
+                                       min = h[d];
+                                       min_pos = d;
                                }
                        }
                        if (min <= ng->min) {
-                               unsigned int rrr = ng->pos[min_pos];
-                               int hcnt = ng->h[min_pos];
-                               for(int k = 0; k<hcnt; k++, rrr++) {
-                                       unsigned int hr = ng->rid[rrr];
-                                       if (((ng->sigs[hr] & sig) == sig)) {
-                                               char *hs = BUNtail(*li, hr);
-                                               if (str_cmp(hs, os, 
str_strlen(os)) == 0) {
+                               unsigned rrr = pos[min_pos];
+                               unsigned l_cnt = h[min_pos];
+                               for(size_t i = 0; i < l_cnt; i++, rrr++) {
+                                       unsigned hr = rid[rrr];
+                                       if (((sigs[hr] & sig) == sig)) {
+                                               char *ls = BUNtail(*li, hr);
+                                               if (str_cmp(ls, rs, 
str_strlen(rs)) == 0) {
                                                        *o_rl++ = hr;
-                                                       *o_rr++ = i;
+                                                       *o_rr++ = j;
                                                }
                                        }
                                }
                        } else {
-                               for(size_t k = 0; k < l_cnt; k++) {
-                                       if (((ng->sigs[k] & sig) == sig)) {
-                                               char *hs = BUNtail(*li, k);
-                                               if (str_cmp(hs, os, 
str_strlen(os)) == 0) {
-                                                       *o_rl++ = k;
-                                                       *o_rr++ = i;
+                               for (size_t i = 0; i < l_cnt; i++) {
+                                       if (((sigs[i] & sig) == sig)) {
+                                               char *ls = BUNtail(*li, i);
+                                               if (str_cmp(ls, rs, 
str_strlen(rs)) == 0) {
+                                                       *o_rl++ = i;
+                                                       *o_rr++ = j;
                                                }
                                        }
                                }
                        }
                        if (min > nmax)
                                nmax = min;
-               } else if (!strNil(s)) {
-                       for(size_t k = 0; k < l_cnt; k++) {
-                               char *hs = BUNtail(*li, k);
-                               if (str_cmp(hs, os, str_strlen(os)) == 0) {
-                                       *o_rl++ = k;
-                                       *o_rr++ = i;
+               } else if (!strNil(rs)) {
+                       for (size_t i = 0; i < l_cnt; i++) {
+                               const char *ls = BUNtail(*li, i);
+                               if (str_cmp(ls, rs, str_strlen(rs)) == 0) {
+                                       *o_rl++ = i;
+                                       *o_rr++ = j;
                                }
                        }
                }
@@ -1107,6 +1132,11 @@ join_bigram(BAT *rl, BAT *rr, BATiter *l
                         int (*str_cmp)(const char *, const char *, int))
 {
        Ngrams *ng = ngrams_create(l_cnt, BIGRAM_SZ);
+       NGRAM_TYPE *idx = ng->idx;
+       NGRAM_TYPE *sigs = ng->sigs;
+       unsigned *h = ng->h;
+       unsigned *pos = ng->pos;
+       unsigned *rid = ng->rid;
 
        if (!ng)
                throw(MAL, "join_bigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
@@ -1131,18 +1161,18 @@ join_bigram(BAT *rl, BAT *rr, BATiter *l
                        unsigned char p = CHAR_MAP(*s++);
                        for (; *s; p = CHAR_MAP(*s), s++) {
                                unsigned int k = p * GZ + CHAR_MAP(*s);
-                               sig |= ng->idx[k];
-                               if (ng->h[k] < min) {
-                                       min = ng->h[k];
+                               sig |= idx[k];
+                               if (h[k] < min) {
+                                       min = h[k];
                                        min_pos = k; /* encoded min ngram */
                                }
                        }
                        if (min <= ng->min) {
-                               unsigned int rrr = ng->pos[min_pos];
-                               int hcnt = ng->h[min_pos];
+                               unsigned int rrr = pos[min_pos];
+                               int hcnt = h[min_pos];
                                for (int k = 0; k < hcnt; k++, rrr++) {
-                                       unsigned int hr = ng->rid[rrr];
-                                       if (((ng->sigs[hr] & sig) == sig)) {
+                                       unsigned int hr = rid[rrr];
+                                       if (((sigs[hr] & sig) == sig)) {
                                                char *hs = BUNtail(*li, hr);
                                                if (str_cmp(hs, os, 
str_strlen(os)) == 0) {
                                                        *ol++ = hr;
@@ -1152,7 +1182,7 @@ join_bigram(BAT *rl, BAT *rr, BATiter *l
                                }
                        } else {
                                for (size_t k = 0; k < l_cnt; k++) {
-                                       if (((ng->sigs[k] & sig) == sig)) {
+                                       if (((sigs[k] & sig) == sig)) {
                                                char *hs = BUNtail(*li, k);
                                                if (str_cmp(hs, os, 
str_strlen(os)) == 0) {
                                                        *ol++ = k;
@@ -1186,6 +1216,11 @@ join_trigram(BAT *rl, BAT *rr, BATiter *
                         int (*str_cmp)(const char *, const char *, int))
 {
        Ngrams *ng = ngrams_create(l_cnt, TRIGRAM_SZ);
+       NGRAM_TYPE *idx = ng->idx;
+       NGRAM_TYPE *sigs = ng->sigs;
+       unsigned *h = ng->h;
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to