Changeset: e9a266d358b6 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/e9a266d358b6
Modified Files:
monetdb5/modules/mal/ngrams.c
monetdb5/modules/mal/ngrams.h
Branch: strimps_v3
Log Message:
Joins and select with candidates for uni, bi and trigrams
diffs (truncated from 1471 to 300 lines):
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -18,19 +18,19 @@
#include "str.h"
static inline int
-is_prefix(const char *s1, const char *s2, int s2_len)
+ng_prefix(const char *s1, const char *s2, int s2_len)
{
return strncmp(s1, s2, s2_len);
}
static inline int
-is_suffix(const char *s1, const char *s2, int s2_len)
+ng_suffix(const char *s1, const char *s2, int s2_len)
{
return strcmp(s1 + strlen(s1) - s2_len, s2);
}
static inline int
-is_contains(const char *s1, const char *s2, int s2_len)
+ng_contains(const char *s1, const char *s2, int s2_len)
{
(void) s2_len;
return strstr(s1, s2) == NULL;
@@ -62,15 +62,15 @@ ngrams_destroy(Ngrams *ng)
}
static Ngrams *
-ngrams_create(size_t b_cnt, size_t ng_sz)
+ngrams_create(size_t cnt, size_t ng_sz)
{
Ngrams *ng = GDKmalloc(sizeof(Ngrams));
if (ng) {
ng->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
- ng->sigs = GDKmalloc(b_cnt * sizeof(NGRAM_TYPE));
+ ng->sigs = GDKmalloc(cnt * sizeof(NGRAM_TYPE));
ng->histogram = GDKmalloc(ng_sz * sizeof(unsigned));
ng->lists = GDKmalloc(ng_sz * sizeof(unsigned));
- ng->rids = GDKmalloc(2 * NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
+ ng->rids = GDKmalloc(2 * NGRAM_MULTIPLE * cnt * sizeof(unsigned));
}
if (!ng || !ng->idx || !ng->sigs || !ng->histogram || !ng->lists ||
!ng->rids) {
ngrams_destroy(ng);
@@ -80,29 +80,7 @@ ngrams_create(size_t b_cnt, size_t ng_sz
}
static str
-ngram_choice(const bat *NG, bte *ngram, const char *fname)
-{
- BAT *ng = NULL;
- if ((ng = BATdescriptor(*NG)) == NULL)
- throw(MAL, fname, SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
-
- BATiter bi = bat_iterator(ng);
- if (bi.count != 1) {
- bat_iterator_end(&bi);
- BBPreclaim(ng);
- if (bi.count < 1)
- throw(MAL, fname, SQLSTATE(42000) "Empty bat\n");
- else
- throw(MAL, fname, SQLSTATE(42000) "Single value bat expected\n");
- }
- *ngram = *(bte *) BUNtloc(bi, 0);
- bat_iterator_end(&bi);
- BBPreclaim(ng);
- return MAL_SUCCEED;
-}
-
-static int
-init_unigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+init_unigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx)
{
NGRAM_TYPE *idx = ng->idx;
NGRAM_TYPE *sigs = ng->sigs;
@@ -111,21 +89,27 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
unsigned *rids = ng->rids;
unsigned *h_tmp = GDKzalloc(UNIGRAM_SZ * sizeof(unsigned));
unsigned *map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned));
+ unsigned k = 1;
if (!h_tmp || !map) {
GDKfree(h_tmp);
GDKfree(map);
- return -1;
+ throw(MAL, "init_unigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
}
- for (size_t i = 0; i < b_cnt; i++) {
- const char *s = BUNtail(*bi, i);
+ oid bbase = bi->b->hseqbase, ob;
+ const char *bvars = bi->vh->base, *bvals = bi->base;
+
+ canditer_reset(bci);
+ TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+ ob = canditer_next(bci);
+ const char *s = VALUE(b, ob - bbase);
if (!strNil(s))
- for (; UNIGRAM(s); s++)
+ for ( ; UNIGRAM(s); s++)
h_tmp[ENC_TOKEN1(s)]++;
}
- for(size_t i = 0; i < UNIGRAM_SZ; i++) {
+ for (size_t i = 0; i < UNIGRAM_SZ; i++) {
map[i] = i;
idx[i] = lists[i] = 0;
h[i] = h_tmp[i];
@@ -135,27 +119,27 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
sizeof(unsigned), sizeof(unsigned), TYPE_int, true,
false);
unsigned j = UNIGRAM_SZ - 1, sum = 0;
- for (; j; j--) {
+ for ( ; j; j--) {
sum += h_tmp[j];
- if (sum + h_tmp[j] >= NGRAM_MULTIPLE * b_cnt - 1)
+ if (sum + h_tmp[j] >= NGRAM_MULTIPLE * bci->ncand - 1)
break;
}
ng->max = h_tmp[0];
ng->min = h_tmp[j];
int n = 0;
- for(size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) {
- unsigned x = map[i];
- idx[x] = NGRAM_CST(1) << n++;
+ for (size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) {
+ idx[map[i]] = NGRAM_CST(1) << n++;
n %= NGRAM_BITS;
}
- unsigned k = 1;
- for(size_t i = 0; i < b_cnt; i++) {
- const char *s = BUNtail(*bi, i);
+ canditer_reset(bci);
+ TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+ ob = canditer_next(bci);
+ const char *s = VALUE(b, ob - bbase);
if (!strNil(s) && UNIGRAM(s)) {
NGRAM_TYPE sig = 0;
- for(; UNIGRAM(s); s++) {
+ for ( ; UNIGRAM(s); s++) {
unsigned unigram = ENC_TOKEN1(s);
sig |= idx[unigram];
if (h[unigram] <= ng->min) {
@@ -164,16 +148,17 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
k += h[unigram];
h[unigram] = 0;
}
- int done = (h[unigram] > 0 && rids[lists[unigram] + h[unigram] - 1] == i);
+ bool done = (h[unigram] > 0 &&
+ rids[lists[unigram] + h[unigram] - 1] == ob - bbase);
if (!done) {
- rids[lists[unigram] + h[unigram]] = i;
+ rids[lists[unigram] + h[unigram]] = ob - bbase;
h[unigram]++;
}
}
}
*sigs = sig;
} else if (!strNil(s)) {
- *sigs = 1;
+ *sigs = ~0LL; /* TODO */
} else {
*sigs = NGRAM_TYPENIL;
}
@@ -182,11 +167,11 @@ init_unigram_idx(Ngrams *ng, BATiter *bi
GDKfree(h_tmp);
GDKfree(map);
- return 0;
+ return MAL_SUCCEED;
}
-static int
-init_bigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+static str
+init_bigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx)
{
NGRAM_TYPE *idx = ng->idx;
NGRAM_TYPE *sigs = ng->sigs;
@@ -196,17 +181,23 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned));
unsigned *h_tmp_ptr = (unsigned *) h_tmp;
unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
+ unsigned int k = 1;
if (!h_tmp || !map) {
GDKfree(h_tmp);
GDKfree(map);
- return -1;
+ throw(MAL, "init_bigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
}
- for (size_t i = 0; i < b_cnt; i++) {
- const char *s = BUNtail(*bi, i);
+ oid bbase = bi->b->hseqbase, ob;
+ const char *bvars = bi->vh->base, *bvals = bi->base;
+
+ canditer_reset(bci);
+ TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+ ob = canditer_next(bci);
+ const char *s = VALUE(b, ob - bbase);
if (!strNil(s))
- for (; BIGRAM(s); s++)
+ for ( ; BIGRAM(s); s++)
h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++;
}
@@ -220,9 +211,9 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
sizeof(unsigned), sizeof(unsigned), TYPE_int, true,
false);
unsigned j = BIGRAM_SZ - 1, sum = 0;
- for (; j; j--) {
+ for ( ; j; j--) {
sum += h_tmp_ptr[j];
- if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * b_cnt - 1)
+ if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * bci->ncand - 1)
break;
}
ng->max = h_tmp_ptr[0];
@@ -230,21 +221,17 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
int n = 0;
for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
- /* unsigned x = (map[i] / SZ) % SZ, y = map[i] % SZ; */
- /* idx[x*SZ + y] = NGRAM_CST(1) << n; */
- /* n++; */
- /* n %= NGRAM_BITS; */
- /* assert(x*SZ + y == map[i]); */
idx[map[i]] = NGRAM_CST(1) << n++;
n %= NGRAM_BITS;
}
- unsigned int k = 1;
- for (size_t i = 0; i < b_cnt; i++) {
- const char *s = BUNtail(*bi, i);
+ canditer_reset(bci);
+ TIMEOUT_LOOP(bci->ncand, qry_ctx) {
+ ob = canditer_next(bci);
+ const char *s = VALUE(b, ob - bbase);
if (!strNil(s) && BIGRAM(s)) {
NGRAM_TYPE sig = 0;
- for (; BIGRAM(s); s++) {
+ for ( ; BIGRAM(s); s++) {
unsigned bigram = ENC_TOKEN1(s)*SZ +
ENC_TOKEN2(s);
sig |= idx[bigram];
if (h[bigram] <= ng->min) {
@@ -253,16 +240,16 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
k += h[bigram];
h[bigram] = 0;
}
- int done = (h[bigram] > 0 && rids[lists[bigram] + h[bigram] - 1] == i);
+ int done = (h[bigram] > 0 && rids[lists[bigram] + h[bigram] - 1] == ob - bbase);
if (!done) {
- rids[lists[bigram] + h[bigram]] = i;
+ rids[lists[bigram] + h[bigram]] = ob - bbase;
h[bigram]++;
}
}
}
*sigs = sig;
- /* } else if (!strNil(s)) { */
- /* *sigs = 1; */
+ } else if (!strNil(s)) {
+ *sigs = ~0LL; /* TODO */
} else {
*sigs = NGRAM_TYPENIL;
}
@@ -271,11 +258,11 @@ init_bigram_idx(Ngrams *ng, BATiter *bi,
GDKfree(h_tmp);
GDKfree(map);
- return 0;
+ return MAL_SUCCEED;
}
-static int
-init_trigram_idx(Ngrams *ng, BATiter *bi, size_t b_cnt)
+static str
+init_trigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx)
{
NGRAM_TYPE *idx = ng->idx;
NGRAM_TYPE *sigs = ng->sigs;
@@ -285,17 +272,23 @@ init_trigram_idx(Ngrams *ng, BATiter *bi
unsigned (*h_tmp)[SZ][SZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned));
unsigned *h_tmp_ptr = (unsigned *) h_tmp;
unsigned *map = GDKmalloc(TRIGRAM_SZ * sizeof(unsigned));
+ unsigned k = 1;
if (!h_tmp || !map) {
GDKfree(h_tmp);
GDKfree(map);
- return -1;
+ throw(MAL, "init_trigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
}
- for (size_t i = 0; i < b_cnt; i++) {
- const char *s = BUNtail(*bi, i);
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]