Changeset: d18d809dbe73 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/d18d809dbe73 Added Files: monetdb5/modules/mal/ngrams.c Removed Files: monetdb5/modules/mal/ngram.c monetdb5/modules/mal/ngram.h Modified Files: monetdb5/modules/mal/CMakeLists.txt sql/scripts/48_txtsim.sql sql/scripts/49_strings.sql Branch: strimps_v3 Log Message:
Refactor ngrams contains (WIP) diffs (truncated from 2918 to 300 lines): diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt --- a/monetdb5/modules/mal/CMakeLists.txt +++ b/monetdb5/modules/mal/CMakeLists.txt @@ -41,7 +41,7 @@ target_sources(malmodules projectionpath.c tablet.c tablet.h batcalc.c calc.c - ngram.c ngram.h) + ngrams.c) target_include_directories(malmodules PRIVATE diff --git a/monetdb5/modules/mal/ngram.c b/monetdb5/modules/mal/ngram.c deleted file mode 100644 --- a/monetdb5/modules/mal/ngram.c +++ /dev/null @@ -1,1637 +0,0 @@ -/* - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - * Copyright 2024 MonetDB Foundation; - * Copyright August 2008 - 2023 MonetDB B.V.; - * Copyright 1997 - July 2008 CWI. - */ - -#include <monetdb_config.h> -#include <mal_exception.h> -#include <gdk_cand.h> -#include <gdk_atoms.h> -#include <string.h> - -#define M 1000000 -#if 0 -#define GZ 128 -#define CHAR_MAP(s) (s&127) -#else -#define GZ 64 -#define CHAR_MAP(s) (s&63) -#endif -#define SZ_1GRAM GZ -#define SZ_2GRAM (GZ*GZ) -#define SZ_3GRAM (GZ*GZ*GZ) -#define SZ_4GRAM ((size_t)GZ*GZ*GZ*GZ) - -#define hist_1gram sht_hist_1gram -#define hist_2gram sht_hist_2gram -#define hist_3gram sht_hist_3gram -#define NGsignature NGsignature_sht -#define NGand NGand_sht -#define NGandselect NGandselect_sht -#define NGRAM_TYPE sht -#define NGRAM_TYPEID TYPE_sht -#define NGRAM_TYPENIL sht_nil -#define NGRAM_CST -#define NGRAM_BITS 15 -#include "ngram.h" - -#undef hist_1gram -#undef hist_2gram -#undef hist_3gram -#undef NGsignature -#undef NGand -#undef NGandselect -#undef NGRAM_TYPE -#undef NGRAM_TYPEID -#undef NGRAM_TYPENIL -#undef NGRAM_CST -#undef NGRAM_BITS - -#define hist_1gram int_hist_1gram -#define hist_2gram int_hist_2gram -#define hist_3gram int_hist_3gram -#define NGsignature NGsignature_int -#define NGand NGand_int -#define NGandselect NGandselect_int -#define NGRAM_TYPE int -#define NGRAM_TYPEID TYPE_int -#define NGRAM_TYPENIL int_nil -#define NGRAM_CST -#define NGRAM_BITS 31 -#include "ngram.h" - -#undef hist_1gram -#undef hist_2gram -#undef hist_3gram -#undef NGsignature -#undef NGand -#undef NGandselect -#undef NGRAM_TYPE -#undef NGRAM_TYPEID -#undef NGRAM_TYPENIL -#undef NGRAM_CST -#undef NGRAM_BITS - -#define hist_1gram lng_hist_1gram -#define hist_2gram lng_hist_2gram -#define hist_3gram lng_hist_3gram -#define NGsignature NGsignature_lng -#define NGand NGand_lng -#define NGandselect NGandselect_lng -#define NGRAM_TYPE lng -#define NGRAM_TYPEID TYPE_lng -#define NGRAM_TYPENIL lng_nil -#define NGRAM_CST(v) LL_CONSTANT(v) -#define NGRAM_BITS 63 -#include "ngram.h" - -#undef hist_1gram -#undef hist_2gram -#undef hist_3gram -#undef NGsignature -#undef NGand -#undef NGandselect -#undef NGRAM_TYPE -#undef NGRAM_TYPEID -#undef NGRAM_TYPENIL -#undef NGRAM_CST -#undef NGRAM_BITS - -#define hist_1gram hge_hist_1gram -#define hist_2gram hge_hist_2gram -#define hist_3gram hge_hist_3gram -#define NGsignature NGsignature_hge -#define NGand NGand_hge -#define NGandselect NGandselect_hge -#define NGRAM_TYPE hge -#define NGRAM_TYPEID TYPE_hge -#define NGRAM_TYPENIL hge_nil -#define NGRAM_CST(v) ((hge)LL_CONSTANT(v)) -#define NGRAM_BITS 127 -#include "ngram.h" - -#undef hist_1gram -#undef hist_2gram -#undef hist_3gram -#undef NGsignature -#undef NGand -#undef NGandselect -#undef NGRAM_TYPE -#undef NGRAM_TYPEID -#undef NGRAM_TYPENIL -#undef NGRAM_CST -#undef NGRAM_BITS - -static str -NGandjoin_intern(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti) -{ - (void)L; - (void)R; - (void)sigs; - (void)needle; - (void)lc; - (void)rc; - (void)nil_matches; - (void)estimate; - (void)anti; - return MAL_SUCCEED; -} - -static str -NGandjoin1(bat *L, bat *sigs, bat *needle, bat *lc, bit *nil_matches, lng *estimate, bit *anti) -{ - return NGandjoin_intern(L, NULL, sigs, needle, lc, NULL, nil_matches, estimate, anti); -} - -static str -NGandjoin(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti) -{ - return NGandjoin_intern(L, R, sigs, needle, lc, rc, nil_matches, estimate, anti); -} - -static inline int -popcount64(uint64_t x) -{ -#if defined(__GNUC__) - return (uint32_t) __builtin_popcountll(x); -#elif defined(_MSC_VER) - return (uint32_t) __popcnt64(x); -#else - x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL); - x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); - x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL); - return (x * 0x0101010101010101ULL) >> 56; -#endif -} - -static str -NGpopcnt(int *cnt, lng *v) -{ - *cnt = popcount64(*v); - return MAL_SUCCEED; -} - -static str -NGsignature_dummy( str *sig, str *str, int *n) -{ - (void)sig; - (void)str; - (void)n; - throw(MAL, "ngram.signature", "no scalar version\n"); -} - -static char * -gor_lng(lng *res, const bat *bid) -{ - BAT *b; - lng val = 0; - BUN nval = 0; - - if ((b = BATdescriptor(*bid)) == NULL) - throw(MAL, "gram.gor", RUNTIME_OBJECT_MISSING); - - const lng *vals = (const lng *) Tloc(b, 0); - for (BUN i = 0, n = BATcount(b); i < n; i++) { - if (is_lng_nil(vals[i])) - continue; /* nils are ignored */ - if (vals[i] == 0) { - /* any value zero is easy: result is zero */ - BBPunfix(b->batCacheid); - *res = 0; - return MAL_SUCCEED; - } - if (vals[i] < 0) { - val |= -vals[i]; - } else { - val |= vals[i]; - } - nval++; /* count non-nil values */ - } - BBPunfix(b->batCacheid); - if (nval == 0) { - /* if there are no non-nil values, the result is nil */ - *res = lng_nil; - } else { - *res = val; - } - return MAL_SUCCEED; -} - -static char * -subgrouped_gor_cand_lng(bat *retval, const bat *bid, const bat *gid, - const bat *eid, const bat *sid, - const bit *skip_nils) -{ - BAT *b, *bn; /* these two are always assigned */ - BAT *g = NULL; /* these three are optional and may not ... */ - BAT *e = NULL; /* ... be assigned to below, ... */ - BAT *s = NULL; /* ... so we initialize them here */ - - /* we ignore these two inputs */ - (void) skip_nils; - - /* the bat we're supposed to be working on (bid) is not - * optional, but the others are, so we test whether the bat id - * is not nil, and if it isn't, whether we can find the BAT - * descriptor */ - if ((b = BATdescriptor(*bid)) == NULL || - (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) || - (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) || - (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) { - if (b) - BBPunfix(b->batCacheid); - if (g) - BBPunfix(g->batCacheid); - if (e) - BBPunfix(e->batCacheid); - if (s) - BBPunfix(s->batCacheid); - throw(MAL, "ngram.gor", RUNTIME_OBJECT_MISSING); - } - - oid min, max; /* min and max group id */ - BUN ngrp; /* number of groups, number of candidates */ - struct canditer ci; /* candidate list iterator */ - const char *err; /* error message */ - err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci); - if (err != NULL) { - BBPunfix(b->batCacheid); - if (g) - BBPunfix(g->batCacheid); - if (e) - BBPunfix(e->batCacheid); - if (s) - BBPunfix(s->batCacheid); - throw(MAL, "ngram.gor", "%s\n", err); - } - - /* create a result BAT and initialize it with all zeros */ - bn = BATconstant(min, TYPE_lng, &(lng){0}, ngrp, TRANSIENT); - if (bn == NULL) { - BBPunfix(b->batCacheid); - if (g) - BBPunfix(g->batCacheid); - if (e) - BBPunfix(e->batCacheid); - if (s) _______________________________________________ checkin-list mailing list -- [email protected] To unsubscribe send an email to [email protected]
