Changeset: 38ad0c81dfb8 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/38ad0c81dfb8
Removed Files:
	monetdb5/modules/mal/ngrams.c
	monetdb5/modules/mal/ngrams.h
Branch: strimps_v3
Log Message:
Remove old files diffs (truncated from 1369 to 300 lines): diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c deleted file mode 100644 --- a/monetdb5/modules/mal/ngrams.c +++ /dev/null @@ -1,1298 +0,0 @@ -/* - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - * Copyright 2024 MonetDB Foundation; - * Copyright August 2008 - 2023 MonetDB B.V.; - * Copyright 1997 - July 2008 CWI. - */ - -#include "monetdb_config.h" -#include "ngrams.h" -#include "mal_interpreter.h" -#include "mal_exception.h" -#include "string.h" -#include "str.h" - -static inline int -ng_prefix(const char *s1, const char *s2, int s2_len) -{ - return strncmp(s1, s2, s2_len); -} - -static inline int -ng_suffix(const char *s1, const char *s2, int s2_len) -{ - return strcmp(s1 + strlen(s1) - s2_len, s2); -} - -static inline int -ng_contains(const char *s1, const char *s2, int s2_len) -{ - (void) s2_len; - return strstr(s1, s2) == NULL; -} - -static inline void -BBPreclaim_n(int nargs, ...) 
-{ - va_list valist; - va_start(valist, nargs); - for (int i = 0; i < nargs; i++) { - BAT *b = va_arg(valist, BAT *); - BBPreclaim(b); - } - va_end(valist); -} - -static void -ngrams_destroy(Ngrams *ng) -{ - if (ng) { - GDKfree(ng->idx); - GDKfree(ng->sigs); - GDKfree(ng->histogram); - GDKfree(ng->lists); - GDKfree(ng->rids); - } - GDKfree(ng); -} - -static Ngrams * -ngrams_create(size_t cnt, size_t ng_sz) -{ - Ngrams *ng = GDKmalloc(sizeof(Ngrams)); - if (ng) { - ng->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE)); - ng->sigs = GDKmalloc(cnt * sizeof(NGRAM_TYPE)); - ng->histogram = GDKmalloc(ng_sz * sizeof(unsigned)); - ng->lists = GDKmalloc(ng_sz * sizeof(unsigned)); - ng->rids = GDKmalloc(2 * NGRAM_MULTIPLE * cnt * sizeof(unsigned)); - } - if (!ng || !ng->idx || !ng->sigs || !ng->histogram || !ng->lists || !ng->rids) { - ngrams_destroy(ng); - return NULL; - } - return ng; -} - -static str -init_unigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx) -{ - NGRAM_TYPE *idx = ng->idx; - NGRAM_TYPE *sigs = ng->sigs; - unsigned *h = ng->histogram; - unsigned *lists = ng->lists; - unsigned *rids = ng->rids; - unsigned *h_tmp = GDKzalloc(UNIGRAM_SZ * sizeof(unsigned)); - unsigned *map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned)); - unsigned k = 1; - - if (!h_tmp || !map) { - GDKfree(h_tmp); - GDKfree(map); - throw(MAL, "init_unigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL); - } - - oid bbase = bi->b->hseqbase, ob; - const char *bvars = bi->vh->base, *bvals = bi->base; - - canditer_reset(bci); - TIMEOUT_LOOP(bci->ncand, qry_ctx) { - ob = canditer_next(bci); - const char *s = VALUE(b, ob - bbase); - if (!strNil(s)) - for ( ; UNIGRAM(s); s++) - h_tmp[ENC_TOKEN1(s)]++; - } - - for (size_t i = 0; i < UNIGRAM_SZ; i++) { - map[i] = i; - idx[i] = lists[i] = 0; - h[i] = h_tmp[i]; - } - - GDKqsort(h_tmp, map, NULL, UNIGRAM_SZ, - sizeof(unsigned), sizeof(unsigned), TYPE_int, true, false); - - unsigned j = UNIGRAM_SZ - 1, sum = 0; - for ( ; j; j--) { - sum += h_tmp[j]; 
- if (sum + h_tmp[j] >= NGRAM_MULTIPLE * bci->ncand - 1) - break; - } - ng->max = h_tmp[0]; - ng->min = h_tmp[j]; - - int n = 0; - for (size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) { - idx[map[i]] = NGRAM_CST(1) << n++; - n %= NGRAM_BITS; - } - - canditer_reset(bci); - TIMEOUT_LOOP(bci->ncand, qry_ctx) { - ob = canditer_next(bci); - const char *s = VALUE(b, ob - bbase); - if (!strNil(s) && UNIGRAM(s)) { - NGRAM_TYPE sig = 0; - for ( ; UNIGRAM(s); s++) { - unsigned unigram = ENC_TOKEN1(s); - sig |= idx[unigram]; - if (h[unigram] <= ng->min) { - if (lists[unigram] == 0) { - lists[unigram] = k; - k += h[unigram]; - h[unigram] = 0; - } - bool done = (h[unigram] > 0 && - rids[lists[unigram] + h[unigram] - 1] == ob - bbase); - if (!done) { - rids[lists[unigram] + h[unigram]] = ob - bbase; - h[unigram]++; - } - } - } - *sigs = sig; - } else { - *sigs = NGRAM_TYPENIL; - } - sigs++; - } - - GDKfree(h_tmp); - GDKfree(map); - return MAL_SUCCEED; -} - -static str -init_bigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx) -{ - NGRAM_TYPE *idx = ng->idx; - NGRAM_TYPE *sigs = ng->sigs; - unsigned *h = ng->histogram; - unsigned *lists = ng->lists; - unsigned *rids = ng->rids; - unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned)); - unsigned *h_tmp_ptr = (unsigned *) h_tmp; - unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned)); - unsigned int k = 1; - - if (!h_tmp || !map) { - GDKfree(h_tmp); - GDKfree(map); - throw(MAL, "init_bigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL); - } - - oid bbase = bi->b->hseqbase, ob; - const char *bvars = bi->vh->base, *bvals = bi->base; - - canditer_reset(bci); - TIMEOUT_LOOP(bci->ncand, qry_ctx) { - ob = canditer_next(bci); - const char *s = VALUE(b, ob - bbase); - if (!strNil(s)) - for ( ; BIGRAM(s); s++) - h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++; - } - - for (size_t i = 0; i < BIGRAM_SZ; i++) { - map[i] = i; - idx[i] = lists[i] = 0; - h[i] = h_tmp_ptr[i]; - } - - GDKqsort(h_tmp, map, NULL, BIGRAM_SZ, - 
sizeof(unsigned), sizeof(unsigned), TYPE_int, true, false); - - unsigned j = BIGRAM_SZ - 1, sum = 0; - for ( ; j; j--) { - sum += h_tmp_ptr[j]; - if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * bci->ncand - 1) - break; - } - ng->max = h_tmp_ptr[0]; - ng->min = h_tmp_ptr[j]; - - int n = 0; - for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) { - idx[map[i]] = NGRAM_CST(1) << n++; - n %= NGRAM_BITS; - } - - canditer_reset(bci); - TIMEOUT_LOOP(bci->ncand, qry_ctx) { - ob = canditer_next(bci); - const char *s = VALUE(b, ob - bbase); - if (!strNil(s) && BIGRAM(s)) { - NGRAM_TYPE sig = 0; - for ( ; BIGRAM(s); s++) { - unsigned bigram = ENC_TOKEN1(s)*SZ + ENC_TOKEN2(s); - sig |= idx[bigram]; - if (h[bigram] <= ng->min) { - if (lists[bigram] == 0) { - lists[bigram] = k; - k += h[bigram]; - h[bigram] = 0; - } - int done = (h[bigram] > 0 && - rids[lists[bigram] + h[bigram] - 1] == ob - bbase); - if (!done) { - rids[lists[bigram] + h[bigram]] = ob - bbase; - h[bigram]++; - } - } - } - *sigs = sig; - } else { - *sigs = NGRAM_TYPENIL; - } - sigs++; - } - - GDKfree(h_tmp); - GDKfree(map); - return MAL_SUCCEED; -} - -static str -init_trigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx) -{ - NGRAM_TYPE *idx = ng->idx; - NGRAM_TYPE *sigs = ng->sigs; - unsigned *h = ng->histogram; - unsigned *lists = ng->lists; - unsigned *rids = ng->rids; - unsigned (*h_tmp)[SZ][SZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned)); - unsigned *h_tmp_ptr = (unsigned *) h_tmp; - unsigned *map = GDKmalloc(TRIGRAM_SZ * sizeof(unsigned)); - unsigned k = 1; - - if (!h_tmp || !map) { - GDKfree(h_tmp); - GDKfree(map); - throw(MAL, "init_trigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL); - } - - oid bbase = bi->b->hseqbase, ob; - const char *bvars = bi->vh->base, *bvals = bi->base; - - canditer_reset(bci); - TIMEOUT_LOOP(bci->ncand, qry_ctx) { - ob = canditer_next(bci); - const char *s = VALUE(b, ob - bbase); - if (!strNil(s)) - for ( ; TRIGRAM(s); s++) - 
h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)][ENC_TOKEN3(s)]++; - } - - for (size_t i = 0; i < TRIGRAM_SZ; i++) { - map[i] = i; - idx[i] = lists[i] = 0; - h[i] = h_tmp_ptr[i]; _______________________________________________ checkin-list mailing list -- [email protected] To unsubscribe send an email to [email protected]
