Changeset: b9323c7b75de for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/b9323c7b75de
Modified Files:
monetdb5/modules/mal/ngrams.c
sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:
Remove old code
diffs (truncated from 778 to 300 lines):
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -89,685 +89,6 @@ ngrams_destroy(Ngrams *ng)
}
static Ngrams *
-ngrams_create_old(BAT *b, size_t ngramsize)
-{
- Ngrams *n = NULL;
- size_t sz = BATcount(b);
-
- n = (Ngrams*)GDKmalloc(sizeof(Ngrams));
- if (n) {
- n->h = (unsigned int*)GDKmalloc(ngramsize*sizeof(int));
- n->pos = (unsigned int*)GDKzalloc(ngramsize*sizeof(int));
- n->rid = (unsigned int*)GDKmalloc(NGRAM_MULTIPLE* sz * sizeof(int));
- n->idx = (NGRAM_TYPE*)GDKmalloc(ngramsize*sizeof(NGRAM_TYPE));
- n->sigs = (NGRAM_TYPE*)GDKmalloc(sz * sizeof(NGRAM_TYPE));
- }
- if (!n || !n->h || !n->idx || !n->pos || !n->rid || !n->sigs) {
- ngrams_destroy(n);
- return NULL;
- }
- return n;
-}
-
-static int
-ngrams_init_1gram(Ngrams *n, BAT *b)
-{
- BUN cnt = BATcount(b);
- NGRAM_TYPE *h = (NGRAM_TYPE *)GDKzalloc(UNIGRAM_SZ*sizeof(NGRAM_TYPE)), *hist = (NGRAM_TYPE*)h, sum = 0;
- int *id = (int*)GDKmalloc(UNIGRAM_SZ*sizeof(int)), i;
- NGRAM_TYPE *idx = n->idx;
-
- if (!h || !id) {
- GDKfree(h);
- GDKfree(id);
- return -1;
- }
-
- BATiter bi = bat_iterator(b);
- for(BUN i=0; i<cnt; i++) {
- const char *s = BUNtail(bi,i);
- if (!strNil(s) && *s) { /* skipped */
- for(; *s; s++) {
- h[CHAR_MAP(*s)]++;
- }
- }
- }
- bat_iterator_end(&bi);
-
- int bc = 0;
-
- for(int i=0; i<UNIGRAM_SZ; i++) {
- id[i] = i;
- idx[i] = 0;
- n->h[i] = (unsigned int)hist[i];
- }
- GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), NGRAM_TYPEID, true, false);
- for(i=UNIGRAM_SZ-1; i>=0; i--) {
- if ((sum + hist[i]) >= (NGRAM_MULTIPLE*cnt)-1)
- break;
- sum += hist[i];
- }
- NGRAM_TYPE larger_cnt = hist[i];
- for(; hist[i] == larger_cnt; i++)
- ;
- NGRAM_TYPE max = hist[0], small = hist[i];
- n->max = max;
- n->min = small;
-
- for(int i=0; i<UNIGRAM_SZ && hist[i] > 0; i++) {
- unsigned int x=id[i];
- idx[x] = NGRAM_CST(1)<<bc;
- assert(idx[x] > 0);
- bc++;
- bc %= NGRAM_BITS;
- }
-
- bi = bat_iterator(b);
- NGRAM_TYPE *sp = n->sigs;
- unsigned int pos = 1;
- for(BUN i=0; i<cnt; i++) {
- const char *s = BUNtail(bi, i);
- NGRAM_TYPE sig = 0;
- if (!strNil(s) && s[0]) { /* too short skipped */
- for(; *s; s++) {
- int k = CHAR_MAP(*s);
- sig |= idx[k];
- if (n->h[k] <= n->min) {
- if (n->pos[k] == 0) {
- n->pos[k] = pos;
- pos += n->h[k];
- n->h[k] = 0;
- }
- /* deduplicate */
- int done = (n->h[k] > 0 && n->rid[n->pos[k] + n->h[k]-1] == i);
- if (!done) {
- n->rid[n->pos[k] + n->h[k]] = i;
- n->h[k]++;
- }
- }
- }
- *sp = sig;
- } else {
- *sp = NGRAM_TYPENIL;
- }
- sp++;
- }
- bat_iterator_end(&bi);
-
- GDKfree(h);
- GDKfree(id);
- return 0;
-}
-
-static str
-NGc1join_intern(bat *L, bat *R, bat *H, bat *N, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
-{
- (void)nil_matches;
- (void)estimate;
- BAT *h = BATdescriptor(*H);
- BAT *n = BATdescriptor(*N);
-
- if (lc && !is_bat_nil(*lc))
- assert(0);
- if (rc && !is_bat_nil(*rc))
- assert(0);
-
- if (*anti)
- throw(MAL, "gram.c1", "No anti contains yet\n");
- if (!h || !n) {
- BBPreclaim(h);
- BBPreclaim(n);
- throw(MAL, "gram.c1", RUNTIME_OBJECT_MISSING);
- }
-
- if (BATcount(n) < 10) {
- printf("todo fall back to select \n");
- }
-
- Ngrams *ngi = ngrams_create_old(h, UNIGRAM_SZ);
- if (ngi && ngrams_init_1gram(ngi, h) == 0) { /* TODO add locks and only create ngram once for full (parent bat) */
- BUN cnt = BATcount(h);
- /* create L/R */
- BAT *l = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
- BAT *r = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
-
- int ncnt = 0, ncnt1 = 0, ncnt2 = 0, ncnt3 = 0, ncnt4 = 0, ncnt5 = 0;
- BATiter ni = bat_iterator(n);
- BATiter hi = bat_iterator(h);
- NGRAM_TYPE nmax = 0;
- oid *ol = Tloc(l, 0), *el = ol + 10*cnt;
- oid *or = Tloc(r, 0);
- cnt = BATcount(n);
- /* if needed grow */
- for(BUN i = 0; i<cnt; i++) {
- const char *s = BUNtail(ni,i), *os = s;
- NGRAM_TYPE sig = 0;
-
- if ((ol+1000) > el)
- break;
- if (!strNil(s) && s[0]) {
- NGRAM_TYPE min = ngi->max;
- unsigned int min_pos = 0;
- for(; *s; s++) {
- unsigned int k = CHAR_MAP(*s);
- sig |= ngi->idx[k];
- if (ngi->h[k] < min) {
- min = ngi->h[k];
- min_pos = k; /* encoded min ngram */
- }
- }
- ncnt++;
- if (min <= ngi->min) {
- unsigned int rr = ngi->pos[min_pos];
- int hcnt = ngi->h[min_pos];
- ncnt1++;
- for(int k = 0; k<hcnt; k++, rr++) {
- unsigned int hr = ngi->rid[rr];
- if (((ngi->sigs[hr] & sig) == sig)) {
- char *hs = BUNtail(hi, hr);
- ncnt3++;
- if (strstr(hs, os) != NULL) {
- *ol++ = hr;
- *or++ = (oid)i;
- }
- }
- }
- } else {
- unsigned int hcnt = BATcount(h);
- ncnt2++;
- for(size_t k = 0; k < hcnt; k++) {
- if (((ngi->sigs[k] & sig) == sig)) {
- char *hs = BUNtail(hi, k);
- ncnt4++;
- if (strstr(hs, os) != NULL) {
- *ol++ = k;
- *or++ = (oid)i;
- }
- }
- }
- }
- if (min > nmax)
- nmax = min;
- } else if (!strNil(s)) { /* skipped */
- unsigned int hcnt = BATcount(h);
- ncnt++;
- for(size_t k = 0; k < hcnt; k++) {
- char *hs = BUNtail(hi, k);
- ncnt5++;
- if (strstr(hs, os) != NULL) {
- *ol++ = k;
- *or++ = (oid)i;
- }
- }
- }
- }
- bat_iterator_end(&ni);
- bat_iterator_end(&hi);
- BBPreclaim(h);
- BBPreclaim(n);
- BATsetcount(l, ol - (oid*)Tloc(l, 0));
- BATsetcount(r, ol - (oid*)Tloc(l, 0));
- *L = l->batCacheid;
- *R = r->batCacheid;
- BBPkeepref(l);
- BBPkeepref(r);
- ngrams_destroy(ngi);
- return MAL_SUCCEED;
- }
- BBPreclaim(h);
- BBPreclaim(n);
- throw(MAL, "gram.c1", SQLSTATE(HY013) MAL_MALLOC_FAIL);
-}
-
-static str
-NGc1join1(bat *L, bat *sigs, bat *needle, bat *lc, bit *nil_matches, lng *estimate, bit *anti)
-{
- return NGc1join_intern(L, NULL, sigs, needle, lc, NULL, nil_matches, estimate, anti);
-}
-
-static str
-NGc1join(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
-{
- return NGc1join_intern(L, R, sigs, needle, lc, rc, nil_matches, estimate, anti);
-}
-
-static int
-ngrams_init_2gram(Ngrams *n, BAT *b)
-{
- BUN cnt = BATcount(b);
- NGRAM_TYPE (*h)[GZ] = (NGRAM_TYPE (*)[GZ])GDKzalloc(BIGRAM_SZ*sizeof(NGRAM_TYPE)), *hist = (NGRAM_TYPE*)h, sum = 0;
- int *id = (int*)GDKmalloc(BIGRAM_SZ*sizeof(int)), i;
- NGRAM_TYPE *idx = n->idx;
-
- if (!h || !id) {
- GDKfree(h);
- GDKfree(id);
- return -1;
- }
-
- BATiter bi = bat_iterator(b);
- for(BUN i=0; i<cnt; i++) {
- const char *s = BUNtail(bi,i);
- if (!strNil(s) && *s) { /* skipped */
- unsigned char p = CHAR_MAP(*s++);
- for(; *s; p=CHAR_MAP(*s), s++) {
- h[p][CHAR_MAP(*s)]++;
- }
- }
- }
- bat_iterator_end(&bi);
-
- int bc = 0;
-
- for(int i=0; i<BIGRAM_SZ; i++) {
- id[i] = i;
- idx[i] = 0;
- n->h[i] = (unsigned int)hist[i];
- }
- GDKqsort(h, id, NULL, BIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), NGRAM_TYPEID, true, false);
- for(i=BIGRAM_SZ-1; i>=0; i--) {
- if ((sum + hist[i]) >= (NGRAM_MULTIPLE*cnt)-1)
- break;
- sum += hist[i];
- }
- NGRAM_TYPE larger_cnt = hist[i];
- for(; hist[i] == larger_cnt; i++)
- ;
- NGRAM_TYPE max = hist[0], small = hist[i];
- n->max = max;
- n->min = small;
- for(int i=0; i<BIGRAM_SZ && hist[i] > 0; i++) {
- int y=(id[i]/GZ)%GZ, z=id[i]%GZ;
- idx[y*GZ+z] = NGRAM_CST(1)<<bc;
- assert(idx[y*GZ+z] > 0);
- bc++;
- bc %= NGRAM_BITS;
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]