Changeset: 167d56a38ac5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/167d56a38ac5
Modified Files:
monetdb5/modules/mal/ngrams.c
sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:
bigrams and trigrams available
diffs (truncated from 470 to 300 lines):
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -446,8 +446,8 @@ NGc2join_intern(bat *L, bat *R, bat *H,
if (BATcount(n) < 10) {
}
- Ngrams *ngi = ngrams_create_old(h, BIGRAM_SZ);
- if (ngi && ngrams_init_2gram(ngi, h) == 0) {
+ Ngrams *ng = ngrams_create_old(h, BIGRAM_SZ);
+ if (ng && ngrams_init_2gram(ng, h) == 0) {
BUN cnt = BATcount(h);
/* create L/R */
BAT *l = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
@@ -467,23 +467,23 @@ NGc2join_intern(bat *L, bat *R, bat *H,
if ((ol+1000) > el)
break;
if (!strNil(s) && s[0] && s[1]) { /* skipped */
- NGRAM_TYPE min = ngi->max;
+ NGRAM_TYPE min = ng->max;
unsigned int min_pos = 0;
unsigned char p = CHAR_MAP(*s++);
for(; *s; p=CHAR_MAP(*s), s++) {
unsigned int k = p*GZ+CHAR_MAP(*s);
- sig |= ngi->idx[k];
- if (ngi->h[k] < min) {
- min = ngi->h[k];
+ sig |= ng->idx[k];
+ if (ng->h[k] < min) {
+ min = ng->h[k];
min_pos = k; /* encoded min
ngram */
}
}
- if (min <= ngi->min) {
- unsigned int rr = ngi->pos[min_pos];
- int hcnt = ngi->h[min_pos];
+ if (min <= ng->min) {
+ unsigned int rr = ng->pos[min_pos];
+ int hcnt = ng->h[min_pos];
for(int k = 0; k<hcnt; k++, rr++) {
- unsigned int hr = ngi->rid[rr];
- if (((ngi->sigs[hr] & sig) ==
sig)) {
+ unsigned int hr = ng->rid[rr];
+ if (((ng->sigs[hr] & sig) ==
sig)) {
char *hs = BUNtail(hi,
hr);
if (strstr(hs, os) !=
NULL) {
*ol++ = hr;
@@ -494,7 +494,7 @@ NGc2join_intern(bat *L, bat *R, bat *H,
} else {
unsigned int hcnt = BATcount(h);
for(size_t k = 0; k < hcnt; k++) {
- if (((ngi->sigs[k] & sig) ==
sig)) {
+ if (((ng->sigs[k] & sig) ==
sig)) {
char *hs = BUNtail(hi,
k);
if (strstr(hs, os) !=
NULL) {
*ol++ = k;
@@ -526,7 +526,7 @@ NGc2join_intern(bat *L, bat *R, bat *H,
*R = r->batCacheid;
BBPkeepref(l);
BBPkeepref(r);
- ngrams_destroy(ngi);
+ ngrams_destroy(ng);
return MAL_SUCCEED;
}
BBPreclaim(h);
@@ -838,8 +838,8 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
ng->h[j] = (unsigned int)hist[j];
}
GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int),
NGRAM_TYPEID, true, false);
- for(i=UNIGRAM_SZ-1; i>=0; i--) {
- if ((sum + hist[i]) >= (NGRAM_MULTIPLE*b_cnt)-1)
+ for(i = UNIGRAM_SZ - 1; i >= 0; i--) {
+ if ((sum + hist[i]) >= (NGRAM_MULTIPLE * b_cnt) - 1)
break;
sum += hist[i];
}
@@ -893,6 +893,186 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
return 0;
}
+static int
+init_bigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+{
+ NGRAM_TYPE (*h)[GZ] = (NGRAM_TYPE (*)[GZ])GDKzalloc(BIGRAM_SZ *
sizeof(NGRAM_TYPE)),
+ *hist = (NGRAM_TYPE*)h, sum = 0;
+ NGRAM_TYPE *idx = ng->idx;
+ int *id = GDKmalloc(BIGRAM_SZ * sizeof(int)), i;
+
+ if (!h || !id) {
+ GDKfree(h);
+ GDKfree(id);
+ return -1;
+ }
+
+ for (size_t j = 0; j < b_cnt; j++) {
+ const char *s = BUNtail(*bi, j);
+ if (!strNil(s) && *s) {
+ unsigned char p = CHAR_MAP(*s++);
+ for (; *s; p = CHAR_MAP(*s), s++) {
+ h[p][CHAR_MAP(*s)]++;
+ }
+ }
+ }
+
+ int bc = 0;
+
+ for(size_t j = 0; j < BIGRAM_SZ; j++) {
+ id[j] = j;
+ idx[j] = 0;
+ ng->h[j] = (unsigned int)hist[j];
+ }
+ GDKqsort(h, id, NULL, BIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int),
NGRAM_TYPEID, true, false);
+ for (i = BIGRAM_SZ - 1; i >= 0; i--) {
+ if ((sum + hist[i]) >= (NGRAM_MULTIPLE * b_cnt) - 1)
+ break;
+ sum += hist[i];
+ }
+ NGRAM_TYPE larger_cnt = hist[i];
+ for(; hist[i] == larger_cnt; i++)
+ ;
+ NGRAM_TYPE max = hist[0], small = hist[i];
+ ng->max = max;
+ ng->min = small;
+
+ for (size_t j = 0; j < BIGRAM_SZ && hist[j] > 0; j++) {
+ int y = (id[j]/GZ) % GZ, z = id[j] % GZ;
+ idx[y*GZ+z] = NGRAM_CST(1) << bc;
+ assert(idx[y*GZ+z] > 0);
+ bc++;
+ bc %= NGRAM_BITS;
+ }
+
+ NGRAM_TYPE *sp = ng->sigs;
+ unsigned int pos = 1;
+ for(size_t j = 0; j < b_cnt; j++) {
+ const char *s = BUNtail(*bi, j);
+ NGRAM_TYPE sig = 0;
+ if (!strNil(s) && s[0] && s[1]) {
+ unsigned char p = CHAR_MAP(*s++);
+ for(; *s; p = CHAR_MAP(*s), s++) {
+ int k = p*GZ+CHAR_MAP(*s);
+ sig |= idx[k];
+ if (ng->h[k] <= ng->min) {
+ if (ng->pos[k] == 0) {
+ ng->pos[k] = pos;
+ pos += ng->h[k];
+ ng->h[k] = 0;
+ }
+ /* deduplicate */
+ int done = (ng->h[k] > 0 &&
ng->rid[ng->pos[k] + ng->h[k]-1] == j);
+ if (!done) {
+ ng->rid[ng->pos[k] + ng->h[k]]
= j;
+ ng->h[k]++;
+ }
+ }
+ }
+ *sp = sig;
+ } else {
+ *sp = NGRAM_TYPENIL;
+ }
+ sp++;
+ }
+
+ GDKfree(h);
+ GDKfree(id);
+ return 0;
+}
+
+static int
+init_trigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+{
+ NGRAM_TYPE (*h)[GZ][GZ] = (NGRAM_TYPE (*)[GZ][GZ])GDKzalloc(TRIGRAM_SZ
* sizeof(NGRAM_TYPE)),
+ *hist = (NGRAM_TYPE*)h, sum = 0;
+ NGRAM_TYPE *idx = ng->idx;
+ int *id = (int*)GDKmalloc(TRIGRAM_SZ*sizeof(int)), i;
+
+ if (!h || !id) {
+ GDKfree(h);
+ GDKfree(id);
+ return -1;
+ }
+
+ for (size_t i = 0; i < b_cnt; i++) {
+ const char *s = BUNtail(*bi,i);
+ if (!strNil(s) && *s) {
+ unsigned char pp = CHAR_MAP(*s++);
+ if (!*s)
+ continue;
+ unsigned char p = CHAR_MAP(*s++);
+ for(; *s; pp = p, p = CHAR_MAP(*s), s++)
+ h[pp][p][CHAR_MAP(*s)]++;
+ }
+ }
+
+ int bc = 0;
+
+ for (size_t j = 0; j < TRIGRAM_SZ; j++) {
+ id[j] = j;
+ idx[j] = 0;
+ ng->h[j] = (unsigned int)hist[j]; /* TODO check for overflow ?
*/
+ }
+ GDKqsort(h, id, NULL, TRIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int),
NGRAM_TYPEID, true, false);
+ for (i = TRIGRAM_SZ - 1; i >= 0; i--) {
+ if ((sum + hist[i]) >= (NGRAM_MULTIPLE * b_cnt) - 1)
+ break;
+ sum += hist[i];
+ }
+ NGRAM_TYPE larger_cnt = hist[i];
+ for (; hist[i] == larger_cnt; i++)
+ ;
+ NGRAM_TYPE max = hist[0], small = hist[i];
+ ng->max = max;
+ ng->min = small;
+
+ for (size_t j = 0; j < TRIGRAM_SZ && hist[j] > 0; j++) {
+ unsigned int x = id[j]/(GZ*GZ), y = (id[j]/GZ)%GZ,
+ z = id[j]%GZ;
+ idx[x*GZ*GZ+y*GZ+z] = NGRAM_CST(1)<<bc;
+ assert(idx[x*GZ*GZ+y*GZ+z] > 0);
+ bc++;
+ bc %= NGRAM_BITS;
+ }
+
+ NGRAM_TYPE *sp = ng->sigs;
+ unsigned int pos = 1;
+ for (size_t j = 0; j < b_cnt; j++) {
+ const char *s = BUNtail(*bi, j);
+ NGRAM_TYPE sig = 0;
+ if (!strNil(s) && s[0] && s[1] && s[2]) {
+ unsigned char pp = CHAR_MAP(*s++);
+ unsigned char p = CHAR_MAP(*s++);
+ for(; *s; pp = p, p = CHAR_MAP(*s), s++) {
+ int k = pp*GZ*GZ+p*GZ+CHAR_MAP(*s);
+ sig |= idx[k];
+ if (ng->h[k] <= ng->min) {
+ if (ng->pos[k] == 0) {
+ ng->pos[k] = pos;
+ pos += ng->h[k];
+ ng->h[k] = 0;
+ }
+ /* deduplicate */
+ int done = (ng->h[k] > 0 &&
ng->rid[ng->pos[k] + ng->h[k]-1] == j);
+ if (!done) {
+ ng->rid[ng->pos[k] + ng->h[k]]
= j;
+ ng->h[k]++;
+ }
+ }
+ }
+ *sp = sig;
+ } else {
+ *sp = NGRAM_TYPENIL;
+ }
+ sp++;
+ }
+
+ GDKfree(h);
+ GDKfree(id);
+ return 0;
+}
+
static str
join_unigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
size_t l_cnt, size_t r_cnt,
@@ -915,7 +1095,7 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
const char *s = BUNtail(*ri, i), *os = s;
NGRAM_TYPE sig = 0;
- if ((ol+1000) > el)
+ if ((ol + 1000) > el)
break;
if (!strNil(s) && s[0]) {
NGRAM_TYPE min = ng->max;
@@ -972,6 +1152,165 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
}
static str
+join_bigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
+ size_t l_cnt, size_t r_cnt,
+ int (*str_cmp)(const char *, const char *, int))
+{
+ Ngrams *ng = ngrams_create(l_cnt, BIGRAM_SZ);
+
+ if (!ng)
+ throw(MAL, "join_bigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+ if (init_bigram_idx(ng, li, l_cnt) != 0)
+ throw(MAL, "join_bigram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+
+ NGRAM_TYPE nmax = 0;
+ oid *ol = Tloc(rl, 0), *el = ol + 10 * l_cnt;
+ oid *or = Tloc(rr, 0);
+
+ /* if needed grow */
+ for(size_t i = 0; i < r_cnt; i++) {
+ const char *s = BUNtail(*ri, i), *os = s;
+ NGRAM_TYPE sig = 0;
+
+ if ((ol + 1000) > el)
+ break;
+ if (!strNil(s) && s[0] && s[1]) { /* skipped */
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]