Changeset: cd3383e679c5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/cd3383e679c5
Added Files:
monetdb5/modules/mal/ngrams.h
Modified Files:
monetdb5/modules/mal/CMakeLists.txt
monetdb5/modules/mal/ngrams.c
sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:
Something is wrong
diffs (truncated from 1342 to 300 lines):
diff --git a/monetdb5/modules/mal/CMakeLists.txt
b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -41,7 +41,7 @@ target_sources(malmodules
projectionpath.c
tablet.c tablet.h
batcalc.c calc.c
- ngrams.c)
+ ngrams.c ngrams.h)
target_include_directories(malmodules
PRIVATE
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -11,49 +11,12 @@
*/
#include "monetdb_config.h"
+#include "ngrams.h"
#include "mal_interpreter.h"
#include "mal_exception.h"
#include "string.h"
#include "str.h"
-#if HAVE_HGE
-#define GZ 128
-#define CHAR_MAP(s) (s&127)
-#else
-#define GZ 64
-#define CHAR_MAP(s) (s&63)
-#endif
-
-#define UNIGRAM_SZ GZ
-#define BIGRAM_SZ (GZ*GZ)
-#define TRIGRAM_SZ (GZ*GZ*GZ)
-
-#if HAVE_HGE
-#define NGRAM_TYPE hge
-#define NGRAM_TYPEID TYPE_hge
-#define NGRAM_TYPENIL hge_nil
-#define NGRAM_CST(v) ((hge)LL_CONSTANT(v))
-#define NGRAM_BITS 127
-#else
-#define NGRAM_TYPE lng
-#define NGRAM_TYPEID TYPE_lng
-#define NGRAM_TYPENIL lng_nil
-#define NGRAM_CST(v) LL_CONSTANT(v)
-#define NGRAM_BITS 63
-#endif
-
-#define NGRAM_MULTIPLE 16
-
-#define SET_EMPTY_BAT_PROPS(B) \
- do {
\
- B->tnil = false;
\
- B->tnonil = true;
\
- B->tkey = true;
\
- B->tsorted = true;
\
- B->trevsorted = true; \
- B->tseqbase = 0;
\
- } while (0)
-
static inline void
BBPreclaim_n(int nargs, ...)
{
@@ -66,15 +29,6 @@ BBPreclaim_n(int nargs, ...)
va_end(valist);
}
-typedef struct {
- NGRAM_TYPE *idx;
- NGRAM_TYPE *sigs;
- NGRAM_TYPE max, min;
- unsigned int *h;
- unsigned int *pos;
- unsigned int *rid;
-} Ngrams;
-
static void
ngrams_destroy(Ngrams *ng)
{
@@ -89,23 +43,702 @@ ngrams_destroy(Ngrams *ng)
}
static Ngrams *
-ngrams_create(size_t b_sz, size_t ng_sz)
+ngrams_create(size_t b_cnt, size_t ng_sz)
{
Ngrams *ng = GDKmalloc(sizeof(Ngrams));
if (ng) {
- ng->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
- ng->sigs = GDKmalloc(b_sz * sizeof(NGRAM_TYPE));
- ng->h = GDKmalloc(ng_sz * sizeof(unsigned int));
- ng->pos = GDKzalloc(ng_sz * sizeof(unsigned int));
- ng->rid = GDKmalloc(NGRAM_MULTIPLE * b_sz * sizeof(unsigned
int));
+ ng->idx = GDKzalloc(ng_sz * sizeof(NGRAM_TYPE));
+ ng->sigs = GDKzalloc(b_cnt * sizeof(NGRAM_TYPE));
+ ng->h = GDKzalloc(ng_sz * sizeof(unsigned));
+ ng->pos = GDKzalloc(ng_sz * sizeof(unsigned));
+ ng->rid = GDKzalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
}
- if (!ng || !ng->h || !ng->idx || !ng->pos || !ng->rid || !ng->sigs) {
+ if (!ng || !ng->idx || !ng->sigs || !ng->h || !ng->pos || !ng->rid) {
ngrams_destroy(ng);
return NULL;
}
return ng;
}
+static Ngrams *
+ngrams_create_old(BAT *b, size_t ngramsize)
+{
+ Ngrams *n = NULL;
+ size_t sz = BATcount(b);
+
+ n = (Ngrams*)GDKmalloc(sizeof(Ngrams));
+ if (n) {
+ n->h = (unsigned int*)GDKmalloc(ngramsize*sizeof(int));
+ n->pos = (unsigned int*)GDKzalloc(ngramsize*sizeof(int));
+ n->rid = (unsigned int*)GDKmalloc(NGRAM_MULTIPLE* sz *
sizeof(int));
+ n->idx = (NGRAM_TYPE*)GDKmalloc(ngramsize*sizeof(NGRAM_TYPE));
+ n->sigs = (NGRAM_TYPE*)GDKmalloc(sz * sizeof(NGRAM_TYPE));
+ }
+ if (!n || !n->h || !n->idx || !n->pos || !n->rid || !n->sigs) {
+ ngrams_destroy(n);
+ return NULL;
+ }
+ return n;
+}
+
+static int
+ngrams_init_1gram(Ngrams *n, BAT *b)
+{
+ BUN cnt = BATcount(b);
+ NGRAM_TYPE *h = (NGRAM_TYPE *)GDKzalloc(UNIGRAM_SZ*sizeof(NGRAM_TYPE)),
*hist = (NGRAM_TYPE*)h, sum = 0;
+ int *id = (int*)GDKmalloc(UNIGRAM_SZ*sizeof(int)), i;
+ NGRAM_TYPE *idx = n->idx;
+
+ if (!h || !id) {
+ GDKfree(h);
+ GDKfree(id);
+ return -1;
+ }
+
+ BATiter bi = bat_iterator(b);
+ for(BUN i=0; i<cnt; i++) {
+ const char *s = BUNtail(bi,i);
+ if (!strNil(s) && *s) { /* skipped */
+ for(; *s; s++) {
+ h[CHAR_MAP(*s)]++;
+ }
+ }
+ }
+ bat_iterator_end(&bi);
+
+ int bc = 0;
+
+ for(int i=0; i<UNIGRAM_SZ; i++) {
+ id[i] = i;
+ idx[i] = 0;
+ n->h[i] = (unsigned int)hist[i];
+ }
+ GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int),
NGRAM_TYPEID, true, false);
+ for(i=UNIGRAM_SZ-1; i>=0; i--) {
+ if ((size_t)(sum + hist[i]) >= (NGRAM_MULTIPLE*cnt)-1)
+ break;
+ sum += hist[i];
+ }
+ NGRAM_TYPE larger_cnt = hist[i];
+ for(; hist[i] == larger_cnt; i++)
+ ;
+ NGRAM_TYPE max = hist[0], small = hist[i];
+ n->max = max;
+ n->min = small;
+
+ for(int i=0; i<UNIGRAM_SZ && hist[i] > 0; i++) {
+ unsigned int x=id[i];
+ idx[x] = NGRAM_CST(1)<<bc;
+ assert(idx[x] > 0);
+ bc++;
+ bc %= NGRAM_BITS;
+ }
+
+ bi = bat_iterator(b);
+ NGRAM_TYPE *sp = n->sigs;
+ unsigned int pos = 1;
+ for(BUN i=0; i<cnt; i++) {
+ const char *s = BUNtail(bi, i);
+ NGRAM_TYPE sig = 0;
+ if (!strNil(s) && s[0]) { /* too short skipped */
+ for(; *s; s++) {
+ int k = CHAR_MAP(*s);
+ sig |= idx[k];
+ if (n->h[k] <= n->min) {
+ if (n->pos[k] == 0) {
+ n->pos[k] = pos;
+ pos += n->h[k];
+ n->h[k] = 0;
+ }
+ /* deduplicate */
+ int done = (n->h[k] > 0 &&
n->rid[n->pos[k] + n->h[k]-1] == i);
+ if (!done) {
+ n->rid[n->pos[k] + n->h[k]] = i;
+ n->h[k]++;
+ }
+ }
+ }
+ *sp = sig;
+ } else {
+ *sp = NGRAM_TYPENIL;
+ }
+ sp++;
+ }
+ bat_iterator_end(&bi);
+
+ GDKfree(h);
+ GDKfree(id);
+ return 0;
+}
+
+static str
+NGc1join_intern(bat *L, bat *R, bat *H, bat *N, bat *lc, bat *rc, bit
*nil_matches, lng *estimate, bit *anti)
+{
+ (void)nil_matches;
+ (void)estimate;
+ BAT *h = BATdescriptor(*H);
+ BAT *n = BATdescriptor(*N);
+
+ if (lc && !is_bat_nil(*lc))
+ assert(0);
+ if (rc && !is_bat_nil(*rc))
+ assert(0);
+
+ if (*anti)
+ throw(MAL, "gram.c1", "No anti contains yet\n");
+ if (!h || !n) {
+ BBPreclaim(h);
+ BBPreclaim(n);
+ throw(MAL, "gram.c1", RUNTIME_OBJECT_MISSING);
+ }
+
+ if (BATcount(n) < 10) {
+ printf("todo fall back to select \n");
+ }
+
+ Ngrams *ngi = ngrams_create_old(h, UNIGRAM_SZ);
+ if (ngi && ngrams_init_1gram(ngi, h) == 0) { /* TODO add locks and only
create ngram once for full (parent bat) */
+ BUN cnt = BATcount(h);
+ /* create L/R */
+ BAT *l = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
+ BAT *r = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
+
+ int ncnt = 0, ncnt1 = 0, ncnt2 = 0, ncnt3 = 0, ncnt4 = 0, ncnt5
= 0;
+ BATiter ni = bat_iterator(n);
+ BATiter hi = bat_iterator(h);
+ NGRAM_TYPE nmax = 0;
+ oid *ol = Tloc(l, 0), *el = ol + 10*cnt;
+ oid *or = Tloc(r, 0);
+ cnt = BATcount(n);
+ /* if needed grow */
+ for(BUN i = 0; i<cnt; i++) {
+ const char *s = BUNtail(ni,i), *os = s;
+ NGRAM_TYPE sig = 0;
+
+ if ((ol+1000) > el)
+ break;
+ if (!strNil(s) && s[0]) {
+ NGRAM_TYPE min = ngi->max;
+ unsigned int min_pos = 0;
+ for(; *s; s++) {
+ unsigned int k = CHAR_MAP(*s);
+ sig |= ngi->idx[k];
+ if (ngi->h[k] < min) {
+ min = ngi->h[k];
+ min_pos = k; /* encoded min
ngram */
+ }
+ }
+ ncnt++;
+ if (min <= ngi->min) {
+ unsigned int rr = ngi->pos[min_pos];
+ int hcnt = ngi->h[min_pos];
+ ncnt1++;
+ for(int k = 0; k<hcnt; k++, rr++) {
+ unsigned int hr = ngi->rid[rr];
+ if (((ngi->sigs[hr] & sig) ==
sig)) {
+ char *hs = BUNtail(hi,
hr);
+ ncnt3++;
+ if (strstr(hs, os) !=
NULL) {
+ *ol++ = hr;
+ *or++ = (oid)i;
+ }
+ }
+ }
+ } else {
+ unsigned int hcnt = BATcount(h);
+ ncnt2++;
+ for(size_t k = 0; k < hcnt; k++) {
+ if (((ngi->sigs[k] & sig) ==
sig)) {
+ char *hs = BUNtail(hi,
k);
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]