Changeset: 38ad0c81dfb8 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/38ad0c81dfb8
Removed Files:
        monetdb5/modules/mal/ngrams.c
        monetdb5/modules/mal/ngrams.h
Branch: strimps_v3
Log Message:

Remove old files


diffs (truncated from 1369 to 300 lines):

diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
deleted file mode 100644
--- a/monetdb5/modules/mal/ngrams.c
+++ /dev/null
@@ -1,1298 +0,0 @@
-/*
- * SPDX-License-Identifier: MPL-2.0
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0.  If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- *
- * Copyright 2024 MonetDB Foundation;
- * Copyright August 2008 - 2023 MonetDB B.V.;
- * Copyright 1997 - July 2008 CWI.
- */
-
-#include "monetdb_config.h"
-#include "ngrams.h"
-#include "mal_interpreter.h"
-#include "mal_exception.h"
-#include "string.h"
-#include "str.h"
-
-/*
- * Compare at most s2_len leading bytes of s1 and s2.
- * Returns the strncmp() result: 0 when they agree, non-zero otherwise.
- */
-static inline int
-ng_prefix(const char *s1, const char *s2, int s2_len)
-{
-       const size_t limit = s2_len;
-       const int cmp = strncmp(s1, s2, limit);
-
-       return cmp;
-}
-
-/*
- * Test whether s1 ends with s2, where s2_len is the length of s2 in bytes.
- * Returns 0 on a match and non-zero otherwise (the strcmp() result whenever
- * the comparison is actually performed).
- *
- * A string shorter than s2_len can never end with s2.  That case (and a
- * negative s2_len) is reported as a mismatch up front, because
- * s1 + strlen(s1) - s2_len would otherwise point outside of s1.
- */
-static inline int
-ng_suffix(const char *s1, const char *s2, int s2_len)
-{
-       const size_t s1_len = strlen(s1);
-
-       if (s2_len < 0 || (size_t) s2_len > s1_len)
-               return 1;
-       return strcmp(s1 + (s1_len - (size_t) s2_len), s2);
-}
-
-/*
- * Substring test: returns 0 when s2 occurs somewhere in s1 and 1 when it
- * does not.  s2_len is accepted for signature parity with ng_prefix and
- * ng_suffix but is not used.
- */
-static inline int
-ng_contains(const char *s1, const char *s2, int s2_len)
-{
-       (void) s2_len;
-
-       if (strstr(s1, s2) != NULL)
-               return 0;
-       return 1;
-}
-
-/*
- * Call BBPreclaim() on each of the nargs BAT pointers passed as variadic
- * arguments, in the order given.
- */
-static inline void
-BBPreclaim_n(int nargs, ...)
-{
-       va_list ap;
-
-       va_start(ap, nargs);
-       for (int left = nargs; left > 0; left--)
-               BBPreclaim(va_arg(ap, BAT *));
-       va_end(ap);
-}
-
-/*
- * Free an Ngrams structure: first the five arrays it points to (idx, sigs,
- * histogram, lists, rids), then the structure itself.  A NULL argument is
- * accepted; only the final GDKfree() is issued in that case.
- */
-static void
-ngrams_destroy(Ngrams *ng)
-{
-       if (ng != NULL) {
-               void *const owned[] = {
-                       ng->idx, ng->sigs, ng->histogram, ng->lists, ng->rids,
-               };
-
-               for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); i++)
-                       GDKfree(owned[i]);
-       }
-       GDKfree(ng);
-}
-
-/*
- * Allocate an Ngrams structure and its arrays:
- *   idx, histogram, lists : ng_sz entries each
- *   sigs                  : cnt entries
- *   rids                  : 2 * NGRAM_MULTIPLE * cnt entries
- * Returns the new structure, or NULL when any allocation fails (whatever
- * was already allocated is released through ngrams_destroy()).
- */
-static Ngrams *
-ngrams_create(size_t cnt, size_t ng_sz)
-{
-       Ngrams *res = GDKmalloc(sizeof(*res));
-       bool complete = false;
-
-       if (res != NULL) {
-               res->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE));
-               res->sigs = GDKmalloc(cnt * sizeof(NGRAM_TYPE));
-               res->histogram = GDKmalloc(ng_sz * sizeof(unsigned));
-               res->lists = GDKmalloc(ng_sz * sizeof(unsigned));
-               res->rids = GDKmalloc(2 * NGRAM_MULTIPLE * cnt * sizeof(unsigned));
-               complete = res->idx != NULL && res->sigs != NULL &&
-                       res->histogram != NULL && res->lists != NULL &&
-                       res->rids != NULL;
-       }
-       if (complete)
-               return res;
-
-       ngrams_destroy(res);
-       return NULL;
-}
-/*
- * Fill the unigram index in `ng` from the strings reached through `bi`,
- * visiting the candidates of `bci`.
- *
- * Steps, as they appear in the body:
- *  1. count, per ENC_TOKEN1() code, the occurrences over all non-nil
- *     strings (h_tmp);
- *  2. copy the counts to ng->histogram, clear ng->idx and ng->lists, set
- *     map[] to the identity and sort h_tmp with GDKqsort(), passing map as
- *     the second array (presumably permuted alongside, so that map[i] is
- *     the token code whose count landed at position i -- TODO confirm);
- *  3. walk the sorted counts from the last position towards position 0,
- *     accumulating them, to choose ng->min; ng->max is taken from
- *     position 0;
- *  4. give each leading position with a non-zero count a single-bit mask
- *     in ng->idx, cycling through NGRAM_BITS bit positions;
- *  5. per candidate, OR the masks of its tokens into one signature written
- *     to ng->sigs, and for tokens whose count is <= ng->min append the row
- *     offset (ob - bbase) to that token's slice of ng->rids.
- *
- * Returns MAL_SUCCEED, or a MAL exception (SQLSTATE HY013) when the two
- * scratch arrays cannot be allocated.
- *
- * NOTE(review): nothing in this function checks whether a TIMEOUT_LOOP
- * stopped early -- confirm how timeouts are meant to be reported.
- * NOTE(review): storing h_tmp[0] as ng->max assumes the sort is
- * descending -- confirm against the `reverse` argument of GDKqsort().
- */
-static str
-init_unigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx 
*qry_ctx)
-{
-       NGRAM_TYPE *idx = ng->idx;
-       NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->histogram;
-       unsigned *lists = ng->lists;
-       unsigned *rids = ng->rids;
-       unsigned *h_tmp = GDKzalloc(UNIGRAM_SZ * sizeof(unsigned)); /* zeroed per-token counters; sorted in place further down */
-       unsigned *map = GDKmalloc(UNIGRAM_SZ * sizeof(unsigned));
-       unsigned k = 1; /* next unreserved slot in rids; starts at 1 because lists[x] == 0 is used as "no list yet" */
-
-       if (!h_tmp || !map) {
-               GDKfree(h_tmp);
-               GDKfree(map);
-               throw(MAL, "init_unigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
-       }
-
-       oid bbase = bi->b->hseqbase, ob;
-       const char *bvars = bi->vh->base, *bvals = bi->base; /* not referenced by name below; presumably consumed by the VALUE() macro -- TODO confirm */
-
-       canditer_reset(bci); /* pass 1: histogram of token codes */
-       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
-               ob = canditer_next(bci);
-               const char *s = VALUE(b, ob - bbase);
-               if (!strNil(s))
-                       for ( ; UNIGRAM(s); s++)
-                               h_tmp[ENC_TOKEN1(s)]++;
-       }
-
-       for (size_t i = 0; i < UNIGRAM_SZ; i++) {
-               map[i] = i;
-               idx[i] = lists[i] = 0;
-               h[i] = h_tmp[i]; /* h keeps the counts in token-code order; h_tmp is about to be sorted */
-       }
-
-       GDKqsort(h_tmp, map, NULL, UNIGRAM_SZ,
-                        sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
-
-       unsigned j = UNIGRAM_SZ - 1, sum = 0;
-       for ( ; j; j--) {
-               sum += h_tmp[j];
-               if (sum + h_tmp[j] >= NGRAM_MULTIPLE * bci->ncand - 1) /* NOTE(review): h_tmp[j] is already part of sum here, so it is counted twice -- confirm intended */
-                       break;
-       }
-       ng->max = h_tmp[0];
-       ng->min = h_tmp[j];
-
-       int n = 0;
-       for (size_t i = 0; i < UNIGRAM_SZ && h_tmp[i] > 0; i++) {
-               idx[map[i]] = NGRAM_CST(1) << n++;
-               n %= NGRAM_BITS; /* wrap around, so several tokens can share one bit */
-       }
-
-       canditer_reset(bci); /* pass 2: signatures and row-id lists */
-       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
-               ob = canditer_next(bci);
-               const char *s = VALUE(b, ob - bbase);
-               if (!strNil(s) && UNIGRAM(s)) {
-                       NGRAM_TYPE sig = 0;
-                       for ( ; UNIGRAM(s); s++) {
-                               unsigned unigram = ENC_TOKEN1(s);
-                               sig |= idx[unigram];
-                               if (h[unigram] <= ng->min) { /* only tokens whose h[] value is at or below ng->min get a row-id list */
-                                       if (lists[unigram] == 0) {
-                                               lists[unigram] = k; /* this token's list starts at slot k ... */
-                                               k += h[unigram]; /* ... and gets h[unigram] slots reserved */
-                                               h[unigram] = 0; /* from here on h[unigram] counts the entries written so far */
-                                       }
-                                       bool done = (h[unigram] > 0 &&
-                                                                
rids[lists[unigram] + h[unigram] - 1] == ob - bbase); /* true when this row is already the last entry of the list */
-                                       if (!done) {
-                                               rids[lists[unigram] + 
h[unigram]] = ob - bbase;
-                                               h[unigram]++;
-                                       }
-                               }
-                       }
-                       *sigs = sig;
-               } else {
-                       *sigs = NGRAM_TYPENIL; /* nil string, or UNIGRAM(s) already false at the start */
-               }
-               sigs++;
-       }
-
-       GDKfree(h_tmp);
-       GDKfree(map);
-       return MAL_SUCCEED;
-}
-/*
- * Fill the bigram index in `ng` from the strings reached through `bi`,
- * visiting the candidates of `bci`.
- *
- * Steps, as they appear in the body:
- *  1. count, per (ENC_TOKEN1, ENC_TOKEN2) pair, the occurrences over all
- *     non-nil strings; h_tmp is addressed as a [..][SZ] table and
- *     h_tmp_ptr is a flat view of the same memory;
- *  2. copy the counts to ng->histogram, clear ng->idx and ng->lists, set
- *     map[] to the identity and sort the counts with GDKqsort(), passing
- *     map as the second array (presumably permuted alongside -- TODO
- *     confirm);
- *  3. walk the sorted counts from the last position towards position 0,
- *     accumulating them, to choose ng->min; ng->max is taken from
- *     position 0;
- *  4. give each leading position with a non-zero count a single-bit mask
- *     in ng->idx, cycling through NGRAM_BITS bit positions;
- *  5. per candidate, OR the masks of its bigrams into one signature
- *     written to ng->sigs, and for bigrams whose count is <= ng->min
- *     append the row offset (ob - bbase) to that bigram's slice of
- *     ng->rids.
- *
- * Returns MAL_SUCCEED, or a MAL exception (SQLSTATE HY013) when the two
- * scratch arrays cannot be allocated.
- *
- * NOTE(review): nothing in this function checks whether a TIMEOUT_LOOP
- * stopped early -- confirm how timeouts are meant to be reported.
- * NOTE(review): the 2-D indexing of h_tmp assumes BIGRAM_SZ == SZ * SZ --
- * confirm in ngrams.h.
- */
-static str
-init_bigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx *qry_ctx)
-{
-       NGRAM_TYPE *idx = ng->idx;
-       NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->histogram;
-       unsigned *lists = ng->lists;
-       unsigned *rids = ng->rids;
-       unsigned (*h_tmp)[SZ] = GDKzalloc(BIGRAM_SZ * sizeof(unsigned)); /* zeroed counters, indexed [token1][token2] */
-       unsigned *h_tmp_ptr = (unsigned *) h_tmp; /* flat view of the same counters */
-       unsigned *map = GDKmalloc(BIGRAM_SZ * sizeof(unsigned));
-       unsigned int k = 1; /* next unreserved slot in rids; starts at 1 because lists[x] == 0 is used as "no list yet" */
-
-       if (!h_tmp || !map) {
-               GDKfree(h_tmp);
-               GDKfree(map);
-               throw(MAL, "init_bigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
-       }
-
-       oid bbase = bi->b->hseqbase, ob;
-       const char *bvars = bi->vh->base, *bvals = bi->base; /* not referenced by name below; presumably consumed by the VALUE() macro -- TODO confirm */
-
-       canditer_reset(bci); /* pass 1: histogram of bigrams */
-       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
-               ob = canditer_next(bci);
-               const char *s = VALUE(b, ob - bbase);
-               if (!strNil(s))
-                       for ( ; BIGRAM(s); s++)
-                               h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)]++;
-       }
-
-       for (size_t i = 0; i < BIGRAM_SZ; i++) {
-               map[i] = i;
-               idx[i] = lists[i] = 0;
-               h[i] = h_tmp_ptr[i]; /* h keeps the counts in bigram-code order; h_tmp is about to be sorted */
-       }
-
-       GDKqsort(h_tmp, map, NULL, BIGRAM_SZ,
-                        sizeof(unsigned), sizeof(unsigned), TYPE_int, true, 
false);
-
-       unsigned j = BIGRAM_SZ - 1, sum = 0;
-       for ( ; j; j--) {
-               sum += h_tmp_ptr[j];
-               if ((sum + h_tmp_ptr[j]) >= NGRAM_MULTIPLE * bci->ncand - 1) /* NOTE(review): h_tmp_ptr[j] is already part of sum here, so it is counted twice -- confirm intended */
-                       break;
-       }
-       ng->max = h_tmp_ptr[0];
-       ng->min = h_tmp_ptr[j];
-
-       int n = 0;
-       for (size_t i = 0; i < BIGRAM_SZ && h_tmp_ptr[i] > 0; i++) {
-               idx[map[i]] = NGRAM_CST(1) << n++;
-               n %= NGRAM_BITS; /* wrap around, so several bigrams can share one bit */
-       }
-
-       canditer_reset(bci); /* pass 2: signatures and row-id lists */
-       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
-               ob = canditer_next(bci);
-               const char *s = VALUE(b, ob - bbase);
-               if (!strNil(s) && BIGRAM(s)) {
-                       NGRAM_TYPE sig = 0;
-                       for ( ; BIGRAM(s); s++) {
-                               unsigned bigram = ENC_TOKEN1(s)*SZ + 
ENC_TOKEN2(s); /* flat code matching the [token1][token2] layout of h_tmp */
-                               sig |= idx[bigram];
-                               if (h[bigram] <= ng->min) { /* only bigrams whose h[] value is at or below ng->min get a row-id list */
-                                       if (lists[bigram] == 0) {
-                                               lists[bigram] = k; /* this bigram's list starts at slot k ... */
-                                               k += h[bigram]; /* ... and gets h[bigram] slots reserved */
-                                               h[bigram] = 0; /* from here on h[bigram] counts the entries written so far */
-                                       }
-                                       int done = (h[bigram] > 0 &&
-                                                               
rids[lists[bigram] + h[bigram] - 1] == ob - bbase); /* non-zero when this row is already the last entry of the list */
-                                       if (!done) {
-                                               rids[lists[bigram] + h[bigram]] 
= ob - bbase;
-                                               h[bigram]++;
-                                       }
-                               }
-                       }
-                       *sigs = sig;
-               } else {
-                       *sigs = NGRAM_TYPENIL; /* nil string, or BIGRAM(s) already false at the start */
-               }
-               sigs++;
-       }
-
-       GDKfree(h_tmp);
-       GDKfree(map);
-       return MAL_SUCCEED;
-}
-
-static str
-init_trigram_idx(Ngrams *ng, BATiter *bi, struct canditer *bci, QryCtx 
*qry_ctx)
-{
-       NGRAM_TYPE *idx = ng->idx;
-       NGRAM_TYPE *sigs = ng->sigs;
-       unsigned *h = ng->histogram;
-       unsigned *lists = ng->lists;
-       unsigned *rids = ng->rids;
-       unsigned (*h_tmp)[SZ][SZ] = GDKzalloc(TRIGRAM_SZ * sizeof(unsigned));
-       unsigned *h_tmp_ptr = (unsigned *) h_tmp;
-       unsigned *map = GDKmalloc(TRIGRAM_SZ * sizeof(unsigned));
-       unsigned k = 1;
-
-       if (!h_tmp || !map) {
-               GDKfree(h_tmp);
-               GDKfree(map);
-               throw(MAL, "init_trigram_idx", SQLSTATE(HY013) MAL_MALLOC_FAIL);
-       }
-
-       oid bbase = bi->b->hseqbase, ob;
-       const char *bvars = bi->vh->base, *bvals = bi->base;
-
-       canditer_reset(bci);
-       TIMEOUT_LOOP(bci->ncand, qry_ctx) {
-               ob = canditer_next(bci);
-               const char *s = VALUE(b, ob - bbase);
-               if (!strNil(s))
-                       for ( ; TRIGRAM(s); s++)
-                               
h_tmp[ENC_TOKEN1(s)][ENC_TOKEN2(s)][ENC_TOKEN3(s)]++;
-       }
-
-       for (size_t i = 0; i < TRIGRAM_SZ; i++) {
-               map[i] = i;
-               idx[i] = lists[i] = 0;
-               h[i] = h_tmp_ptr[i];
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to