Changeset: ee99daf915d3 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ee99daf915d3
Modified Files:
monetdb5/modules/mal/ngrams.c
monetdb5/modules/mal/ngrams.h
Branch: strimps_v3
Log Message:
Added nested loop for small inputs. Threshold not set in stone yet.
diffs (135 lines):
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -643,6 +643,50 @@ NGselect(MalStkPtr stk, InstrPtr pci,
}
+/*
+ * Plain nested-loop join, used by the caller for small inputs.
+ *
+ * For every non-nil string of the right candidate list, walk the whole
+ * left candidate list and compare each non-nil left string against it
+ * with str_cmp(ls, rs, str_strlen(rs)).  When str_cmp returns 0 the left
+ * oid is appended to rl and, if rr is non-NULL, the right oid to rr.
+ *
+ * rl, rr   result BATs of oids; rr may be NULL
+ * li, ri   iterators over the left and right string BATs
+ * lci, rci candidate iterators for the left and right inputs
+ * str_cmp  comparison callback; 0 means "match"
+ * qry_ctx  query context handed to TIMEOUT_LOOP
+ *
+ * Returns MAL_SUCCEED, or a MAL exception when a result BAT cannot be
+ * extended.
+ *
+ * NOTE(review): no timeout check follows the TIMEOUT_LOOPs, so a loop
+ * that presumably stops early on a query timeout would still report
+ * success with partial results -- confirm against the sibling join_*
+ * functions.
+ */
static str
+join_nested_loop(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
+		 struct canditer *lci, struct canditer *rci,
+		 int (*str_cmp)(const char *, const char *, int),
+		 QryCtx *qry_ctx)
+{
+	str msg = MAL_SUCCEED;
+	size_t new_cap;
+	oid lbase = li->b->hseqbase, rbase = ri->b->hseqbase, or, ol;
+	/* the VALUE() macro relies on these exact names (lvars/lvals/li, rvars/rvals/ri) */
+	const char *lvars = li->vh->base, *rvars = ri->vh->base,
+		*lvals = li->base, *rvals = ri->base;
+
+	canditer_reset(lci);
+	TIMEOUT_LOOP(rci->ncand, qry_ctx) {
+		or = canditer_next(rci);
+		const char *rs = VALUE(r, or - rbase);
+		if (strNil(rs))
+			continue;
+		/* the length of rs is the same for every left candidate */
+		int rs_len = str_strlen(rs);
+		/* restart the left side for each right value */
+		canditer_reset(lci);
+		TIMEOUT_LOOP(lci->ncand, qry_ctx) {
+			ol = canditer_next(lci);
+			const char *ls = VALUE(l, ol - lbase);
+			if (strNil(ls) || str_cmp(ls, rs, rs_len) != 0)
+				continue;
+			APPEND(rl, ol);
+			if (rr)
+				APPEND(rr, or);
+			/* grow as soon as the BAT is full so the next APPEND has room */
+			if (BATcount(rl) == BATcapacity(rl)) {
+				new_cap = BATgrows(rl);
+				if (BATextend(rl, new_cap) != GDK_SUCCEED ||
+				    (rr && BATextend(rr, new_cap) != GDK_SUCCEED)) {
+					throw(MAL, "join_nested_loop", GDK_EXCEPTION);
+				}
+			}
+		}
+	}
+
+	BATsetcount(rl, BATcount(rl));
+	if (rr)
+		BATsetcount(rr, BATcount(rr));
+
+	return msg;
+}
+
+static str
join_unigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
struct canditer *lci, struct canditer *rci,
int (*str_cmp)(const char *, const char *, int),
@@ -755,7 +799,7 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
BATsetcount(rl, BATcount(rl));
if (rr) BATsetcount(rr, BATcount(rr));
ngrams_destroy(ng);
- return MAL_SUCCEED;
+ return msg;
}
static str
@@ -871,7 +915,8 @@ join_bigram(BAT *rl, BAT *rr, BATiter *l
BATsetcount(rl, BATcount(rl));
if (rr) BATsetcount(rr, BATcount(rr));
ngrams_destroy(ng);
- return MAL_SUCCEED;
+
+ return msg;
}
static str
@@ -987,7 +1032,7 @@ join_trigram(BAT *rl, BAT *rr, BATiter *
BATsetcount(rl, BATcount(rl));
if (rr) BATsetcount(rr, BATcount(rr));
ngrams_destroy(ng);
- return MAL_SUCCEED;
+ return msg;
}
static str
@@ -1061,21 +1106,21 @@ NGjoin(MalStkPtr stk, InstrPtr pci,
throw(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL);
}
- switch(ngram) {
- case 1:
- msg = join_unigram(rl, rr, &li, &ri, &lci, &rci, str_cmp,
qry_ctx);
- break;
- case 2:
- msg = join_bigram(rl, rr, &li, &ri, &lci, &rci, str_cmp,
qry_ctx);
- break;
- case 3:
- msg = join_trigram(rl, rr, &li, &ri, &lci, &rci, str_cmp,
qry_ctx);
- break;
- default:
- bat_iterator_end(&li);
- bat_iterator_end(&ri);
- BBPreclaim_n(6, rl, rr, l, r, cl, cr);
- throw(MAL, fname, SQLSTATE(42000) "Only uni, bi or trigrams
available.");
+ /* Keep the helper's status: join_nested_loop returns an exception when BATextend fails. */
+ if (lci.ncand < 1000 || rci.ncand < 5)
+ msg = join_nested_loop(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
+ else {
+ if (ngram == 1)
+ msg = join_unigram(rl, rr, &li, &ri, &lci, &rci,
str_cmp, qry_ctx);
+ else if (ngram == 2)
+ msg = join_bigram(rl, rr, &li, &ri, &lci, &rci,
str_cmp, qry_ctx);
+ else if (ngram == 3)
+ msg = join_trigram(rl, rr, &li, &ri, &lci, &rci,
str_cmp, qry_ctx);
+ else {
+ bat_iterator_end(&li);
+ bat_iterator_end(&ri);
+ BBPreclaim_n(6, rl, rr, l, r, cl, cr);
+ throw(MAL, fname, SQLSTATE(42000) "Only uni, bi or
trigrams available.");
+ }
}
bat_iterator_end(&li);
diff --git a/monetdb5/modules/mal/ngrams.h b/monetdb5/modules/mal/ngrams.h
--- a/monetdb5/modules/mal/ngrams.h
+++ b/monetdb5/modules/mal/ngrams.h
@@ -44,9 +44,9 @@
#define BIGRAM(s) (TOKEN1(s) && TOKEN2(s))
#define TRIGRAM(s) (TOKEN1(s) && TOKEN2(s) && TOKEN3(s))
-#define ENC_TOKEN1(t) CHARMAP(*t) /* encoded token #one */
-#define ENC_TOKEN2(t) CHARMAP(*(t + 1)) /* encoded token #two */
-#define ENC_TOKEN3(t) CHARMAP(*(t + 2)) /* encoded token #three
*/
+/* The argument is parenthesized so that an expression such as p + 1 expands correctly. */
+#define ENC_TOKEN1(t) CHARMAP(*(t)) /* encoded first token */
+#define ENC_TOKEN2(t) CHARMAP(*((t) + 1)) /* encoded second token */
+#define ENC_TOKEN3(t) CHARMAP(*((t) + 2)) /* encoded third token */
#define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##i->width))
#define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]