Changeset: aacd9407a636 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/aacd9407a636
Modified Files:
monetdb5/modules/mal/CMakeLists.txt
monetdb5/modules/mal/ngrams.c
monetdb5/modules/mal/ngrams.h
sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:
uni, bi and tri clean implementations
diffs (truncated from 908 to 300 lines):
diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -41,8 +41,7 @@ target_sources(malmodules
projectionpath.c
tablet.c tablet.h
batcalc.c calc.c
- ngrams.c ngrams.h
- ngrams_old.c)
+ ngrams.c ngrams.h)
target_include_directories(malmodules
PRIVATE
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -54,9 +54,9 @@ ngrams_destroy(Ngrams *ng)
if (ng) {
GDKfree(ng->idx);
GDKfree(ng->sigs);
- GDKfree(ng->h);
- GDKfree(ng->pos);
- GDKfree(ng->rid);
+ GDKfree(ng->histogram);
+ GDKfree(ng->lists);
+ GDKfree(ng->rids);
}
GDKfree(ng);
}
@@ -68,18 +68,17 @@ ngrams_create(size_t b_cnt, size_t ng_sz
if (ng) {
ng->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
ng->sigs = GDKmalloc(b_cnt * sizeof(NGRAM_TYPE));
- ng->h = GDKmalloc(ng_sz * sizeof(unsigned));
- ng->pos = GDKmalloc(ng_sz * sizeof(unsigned));
- ng->rid = GDKmalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
+ ng->histogram = GDKmalloc(ng_sz * sizeof(unsigned));
+ ng->lists = GDKmalloc(ng_sz * sizeof(unsigned));
+ ng->rids = GDKmalloc(2 * NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
}
- if (!ng || !ng->idx || !ng->sigs || !ng->h || !ng->pos || !ng->rid) {
+ if (!ng || !ng->idx || !ng->sigs || !ng->histogram || !ng->lists || !ng->rids) {
ngrams_destroy(ng);
return NULL;
}
return ng;
}
-
static str
ngram_choice(const bat *NG, bte *ngram, const char *fname)
{
@@ -107,9 +106,9 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
{
NGRAM_TYPE *idx = ng->idx;
NGRAM_TYPE *sigs = ng->sigs;
- unsigned *h = ng->h;
- unsigned *pos = ng->pos;
- unsigned *rid = ng->rid;
+ unsigned *h = ng->histogram;
+ unsigned *lists = ng->lists;
+ unsigned *rids = ng->rids;
unsigned *h_tmp = GDKzalloc(UNIGRAM_SZ * sizeof(unsigned));
unsigned *map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned));
@@ -119,16 +118,16 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
return -1;
}
- for(size_t i = 0; i < b_cnt; i++) {
+ for (size_t i = 0; i < b_cnt; i++) {
const char *s = BUNtail(*bi, i);
- if (!strNil(s) && *s)
- for(const char *c = s; *c; c++)
- h_tmp[CHAR_MAP(*c)]++;
+ if (!strNil(s))
+ for (; UNIGRAM(s); s++)
+ h_tmp[ENC_TOKEN1(s)]++;
}
for(size_t i = 0; i < UNIGRAM_SZ; i++) {
map[i] = i;
- idx[i] = pos[i] = 0;
+ idx[i] = lists[i] = 0;
h[i] = h_tmp[i];
}
@@ -144,36 +143,37 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
ng->max = h_tmp[0];
ng->min = h_tmp[j];
- int n_shift = 0;
- for(size_t i = 0; i < UNIGRAM_SZ && h[i] > 0; i++) {
+ int n = 0;
+ for(size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) {
unsigned x = map[i];
- idx[x] = NGRAM_CST(1) << n_shift;
- n_shift++;
- n_shift %= NGRAM_BITS;
+ idx[x] = NGRAM_CST(1) << n++;
+ n %= NGRAM_BITS;
}
- unsigned p = 1;
+ unsigned k = 1;
for(size_t i = 0; i < b_cnt; i++) {
const char *s = BUNtail(*bi, i);
- if (!strNil(s) && s[0]) {
+ if (!strNil(s) && UNIGRAM(s)) {
NGRAM_TYPE sig = 0;
- for(; *s; s++) {
- unsigned c = CHAR_MAP(*s);
- sig |= idx[c];
- if (h[c] <= ng->min) {
- if (pos[c] == 0) {
- pos[c] = p;
- p += h[c];
- h[c] = 0;
+ for(; UNIGRAM(s); s++) {
+ unsigned unigram = ENC_TOKEN1(s);
+ sig |= idx[unigram];
+ if (h[unigram] <= ng->min) {
+ if (lists[unigram] == 0) {
+ lists[unigram] = k;
+ k += h[unigram];
+ h[unigram] = 0;
}
- int done = (h[c] > 0 && rid[pos[c] + h[c] - 1] == i);
+ int done = (h[unigram] > 0 && rids[lists[unigram] + h[unigram] - 1] == i);
if (!done) {
- rid[pos[c] + h[c]] = i;
- h[c]++;
+ rids[lists[unigram] + h[unigram]] = i;
+ h[unigram]++;
}
}
}
*sigs = sig;
+ } else if (!strNil(s)) {
+ *sigs = 1;
} else {
*sigs = NGRAM_TYPENIL;
}
@@ -190,10 +190,10 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
{
NGRAM_TYPE *idx = ng->idx;
NGRAM_TYPE *sigs = ng->sigs;
- unsigned *h = ng->h;
- unsigned *pos = ng->pos;
- unsigned *rid = ng->rid;
- unsigned (*h_tmp)[GZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
+ unsigned *h = ng->histogram;
+ unsigned *lists = ng->lists;
+ unsigned *rids = ng->rids;
+ unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
unsigned *h_tmp_ptr = (unsigned *) h_tmp;
unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
@@ -205,17 +205,15 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
for (size_t i = 0; i < b_cnt; i++) {
const char *s = BUNtail(*bi, i);
- if (!strNil(s) && *s) {
- unsigned char p = CHAR_MAP(*s++);
- for (; *s; p = CHAR_MAP(*s), s++)
- h_tmp[p][CHAR_MAP(*s)]++;
- }
+ if (!strNil(s))
+ for (; BIGRAM(s); s++)
+ h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++;
}
for (size_t i = 0; i < BIGRAM_SZ; i++) {
map[i] = i;
- idx[i] = pos[i] = 0;
- ng->h[i] = h_tmp_ptr[i];
+ idx[i] = lists[i] = 0;
+ h[i] = h_tmp_ptr[i];
}
GDKqsort(h_tmp, map, NULL, BIGRAM_SZ,
@@ -230,38 +228,41 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
ng->max = h_tmp_ptr[0];
ng->min = h_tmp_ptr[j];
- int n_shift = 0;
+ int n = 0;
for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
- unsigned x = (map[i] / GZ) % GZ;
- unsigned y = map[i] % GZ;
- idx[x * GZ + y] = NGRAM_CST(1) << n_shift;
- n_shift++;
- n_shift %= NGRAM_BITS;
+ /* unsigned x = (map[i] / SZ) % SZ, y = map[i] % SZ; */
+ /* idx[x*SZ + y] = NGRAM_CST(1) << n; */
+ /* n++; */
+ /* n %= NGRAM_BITS; */
+ /* assert(x*SZ + y == map[i]); */
+ idx[map[i]] = NGRAM_CST(1) << n++;
+ n %= NGRAM_BITS;
}
- unsigned int p = 1;
+ unsigned int k = 1;
for (size_t i = 0; i < b_cnt; i++) {
const char *s = BUNtail(*bi, i);
- if (!strNil(s) && s[0] && s[1]) {
+ if (!strNil(s) && BIGRAM(s)) {
NGRAM_TYPE sig = 0;
- unsigned c = CHAR_MAP(*s++);
- for (; *s; c = CHAR_MAP(*s), s++) {
- int k = c * GZ + CHAR_MAP(*s);
- sig |= idx[k];
- if (h[k] <= ng->min) {
- if (pos[k] == 0) {
- pos[k] = p;
- p += h[k];
- h[k] = 0;
+ for (; BIGRAM(s); s++) {
+ unsigned bigram = ENC_TOKEN1(s)*SZ + ENC_TOKEN2(s);
+ sig |= idx[bigram];
+ if (h[bigram] <= ng->min) {
+ if (lists[bigram] == 0) {
+ lists[bigram] = k;
+ k += h[bigram];
+ h[bigram] = 0;
}
- int done = (h[k] > 0 && rid[pos[k] + h[k] -1] == i);
+ int done = (h[bigram] > 0 && rids[lists[bigram] + h[bigram] - 1] == i);
if (!done) {
- rid[pos[k] + h[k]] = i;
- h[k]++;
+ rids[lists[bigram] + h[bigram]] = i;
+ h[bigram]++;
}
}
}
*sigs = sig;
+ /* } else if (!strNil(s)) { */
+ /* *sigs = 1; */
} else {
*sigs = NGRAM_TYPENIL;
}
@@ -278,10 +279,10 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
{
NGRAM_TYPE *idx = ng->idx;
NGRAM_TYPE *sigs = ng->sigs;
- unsigned *h = ng->h;
- unsigned *pos = ng->pos;
- unsigned *rid = ng->rid;
- unsigned (*h_tmp)[GZ][GZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned));
+ unsigned *h = ng->histogram;
+ unsigned *lists = ng->lists;
+ unsigned *rids = ng->rids;
+ unsigned (*h_tmp)[SZ][SZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned));
unsigned *h_tmp_ptr = (unsigned *) h_tmp;
unsigned *map = GDKmalloc(TRIGRAM_SZ * sizeof(unsigned));
@@ -293,20 +294,15 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
for (size_t i = 0; i < b_cnt; i++) {
const char *s = BUNtail(*bi, i);
- if (!strNil(s) && *s) {
- unsigned char pp = CHAR_MAP(*s++);
- if (!*s)
- continue;
- unsigned char p = CHAR_MAP(*s++);
- for(; *s; pp = p, p = CHAR_MAP(*s), s++)
- h_tmp[pp][p][CHAR_MAP(*s)]++;
- }
+ if (!strNil(s))
+ for (; TRIGRAM(s); s++)
+ h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)][ENC_TOKEN3(s)]++;
}
- for (size_t j = 0; j < TRIGRAM_SZ; j++) {
- map[j] = j;
- idx[j] = pos[j] = 0;
- ng->h[j] = h_tmp_ptr[j];
+ for (size_t i = 0; i < TRIGRAM_SZ; i++) {
+ map[i] = i;
+ idx[i] = lists[i] = 0;
+ h[i] = h_tmp_ptr[i];
}
GDKqsort(h_tmp, map, NULL, TRIGRAM_SZ,
@@ -321,41 +317,36 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
ng->max = h_tmp_ptr[0];
ng->min = h_tmp_ptr[j];
- int n_shift = 0;
- for (size_t j = 0; j < TRIGRAM_SZ && h_tmp_ptr[j] > 0; j++) {
- unsigned x = map[j]/(GZ*GZ);
- unsigned y = (map[j]/GZ)%GZ;
- unsigned z = map[j]%GZ;
- idx[x*GZ*GZ+y*GZ+z] = NGRAM_CST(1) << n_shift;
- n_shift++;
- n_shift %= NGRAM_BITS;
+ int n = 0;
+ for (size_t i = 0; i < TRIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]