Changeset: ee99daf915d3 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ee99daf915d3
Modified Files:
        monetdb5/modules/mal/ngrams.c
        monetdb5/modules/mal/ngrams.h
Branch: strimps_v3
Log Message:

Added nested loop for small inputs. Threshold not set in stone yet.


diffs (135 lines):

diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -643,6 +643,50 @@ NGselect(MalStkPtr stk, InstrPtr pci,
 }
 
 static str
+join_nested_loop(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
+                                struct canditer *lci, struct canditer *rci,
+                                int (*str_cmp)(const char *, const char *, int),
+                                QryCtx *qry_ctx)
+{
+       str msg = MAL_SUCCEED;
+       size_t new_cap;
+       oid lbase = li->b->hseqbase, rbase = ri->b->hseqbase, or, ol;
+       const char *lvars = li->vh->base, *rvars = ri->vh->base,
+               *lvals = li->base, *rvals = ri->base;
+
+       canditer_reset(lci);
+       TIMEOUT_LOOP(rci->ncand, qry_ctx) {
+               or = canditer_next(rci);
+               const char *rs = VALUE(r, or - rbase);
+               if (strNil(rs))
+                       continue;
+               canditer_reset(lci);
+               TIMEOUT_LOOP(lci->ncand, qry_ctx) {
+                       ol = canditer_next(lci);
+                       const char *ls = VALUE(l, ol - lbase);
+                       if (!strNil(ls)) {
+                               if (str_cmp(ls, rs, str_strlen(rs)) == 0) {
+                                       APPEND(rl, ol);
+                                       if (rr) APPEND(rr, or);
+                                       if (BATcount(rl) == BATcapacity(rl)) {
+                                               new_cap = BATgrows(rl);
+                                               if (BATextend(rl, new_cap) != GDK_SUCCEED ||
+                                                       (rr && BATextend(rr, new_cap) != GDK_SUCCEED)) {
+                                                       throw(MAL, "join_unigram", GDK_EXCEPTION);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       BATsetcount(rl, BATcount(rl));
+       if (rr) BATsetcount(rr, BATcount(rr));
+
+       return msg;
+}
+
+static str
 join_unigram(BAT *rl, BAT *rr, BATiter *li, BATiter *ri,
                         struct canditer *lci, struct canditer *rci,
                         int (*str_cmp)(const char *, const char *, int),
@@ -755,7 +799,7 @@ join_unigram(BAT *rl, BAT *rr, BATiter *
        BATsetcount(rl, BATcount(rl));
        if (rr) BATsetcount(rr, BATcount(rr));
        ngrams_destroy(ng);
-       return MAL_SUCCEED;
+       return msg;
 }
 
 static str
@@ -871,7 +915,8 @@ join_bigram(BAT *rl, BAT *rr, BATiter *l
        BATsetcount(rl, BATcount(rl));
        if (rr) BATsetcount(rr, BATcount(rr));
        ngrams_destroy(ng);
-       return MAL_SUCCEED;
+
+       return msg;
 }
 
 static str
@@ -987,7 +1032,7 @@ join_trigram(BAT *rl, BAT *rr, BATiter *
        BATsetcount(rl, BATcount(rl));
        if (rr) BATsetcount(rr, BATcount(rr));
        ngrams_destroy(ng);
-       return MAL_SUCCEED;
+       return msg;
 }
 
 static str
@@ -1061,21 +1106,21 @@ NGjoin(MalStkPtr stk, InstrPtr pci,
                throw(MAL, fname, SQLSTATE(HY013) MAL_MALLOC_FAIL);
        }
 
-       switch(ngram) {
-       case 1:
-               msg = join_unigram(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
-               break;
-       case 2:
-               msg = join_bigram(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
-               break;
-       case 3:
-               msg = join_trigram(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
-               break;
-       default:
-               bat_iterator_end(&li);
-               bat_iterator_end(&ri);
-               BBPreclaim_n(6, rl, rr, l, r, cl, cr);
-               throw(MAL, fname, SQLSTATE(42000) "Only uni, bi or trigrams available.");
+       if (lci.ncand < 1000 || rci.ncand < 5)
+               join_nested_loop(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
+       else {
+               if (ngram == 1)
+                       msg = join_unigram(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
+               else if (ngram == 2)
+                       msg = join_bigram(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
+               else if (ngram == 3)
+                       msg = join_trigram(rl, rr, &li, &ri, &lci, &rci, str_cmp, qry_ctx);
+               else {
+                       bat_iterator_end(&li);
+                       bat_iterator_end(&ri);
+                       BBPreclaim_n(6, rl, rr, l, r, cl, cr);
+                       throw(MAL, fname, SQLSTATE(42000) "Only uni, bi or trigrams available.");
+               }
        }
 
        bat_iterator_end(&li);
diff --git a/monetdb5/modules/mal/ngrams.h b/monetdb5/modules/mal/ngrams.h
--- a/monetdb5/modules/mal/ngrams.h
+++ b/monetdb5/modules/mal/ngrams.h
@@ -44,9 +44,9 @@
 #define BIGRAM(s)      (TOKEN1(s) && TOKEN2(s))
 #define TRIGRAM(s)     (TOKEN1(s) && TOKEN2(s) && TOKEN3(s))
 
-#define ENC_TOKEN1(t) CHARMAP(*t)                      /* encoded token #one */
-#define ENC_TOKEN2(t) CHARMAP(*(t + 1))                /* encoded token #two */
-#define ENC_TOKEN3(t) CHARMAP(*(t + 2))                /* encoded token #three */
+#define ENC_TOKEN1(t) CHARMAP(*t)                      /* encoded first token */
+#define ENC_TOKEN2(t) CHARMAP(*(t + 1))                /* encoded second token */
+#define ENC_TOKEN3(t) CHARMAP(*(t + 2))                /* encoded third token */
 
 #define VALUE(s, x)  (s##vars + VarHeapVal(s##vals, (x), s##i->width))
 #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to