Changeset: d18d809dbe73 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d18d809dbe73
Added Files:
        monetdb5/modules/mal/ngrams.c
Removed Files:
        monetdb5/modules/mal/ngram.c
        monetdb5/modules/mal/ngram.h
Modified Files:
        monetdb5/modules/mal/CMakeLists.txt
        sql/scripts/48_txtsim.sql
        sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:

Refactor ngrams contains (WIP)


diffs (truncated from 2918 to 300 lines):

diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -41,7 +41,7 @@ target_sources(malmodules
   projectionpath.c
   tablet.c tablet.h
   batcalc.c calc.c
-  ngram.c ngram.h)
+  ngrams.c)
 
 target_include_directories(malmodules
   PRIVATE
diff --git a/monetdb5/modules/mal/ngram.c b/monetdb5/modules/mal/ngram.c
deleted file mode 100644
--- a/monetdb5/modules/mal/ngram.c
+++ /dev/null
@@ -1,1637 +0,0 @@
-/*
- * SPDX-License-Identifier: MPL-2.0
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0.  If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- *
- * Copyright 2024 MonetDB Foundation;
- * Copyright August 2008 - 2023 MonetDB B.V.;
- * Copyright 1997 - July 2008 CWI.
- */
-
-#include <monetdb_config.h>
-#include <mal_exception.h>
-#include <gdk_cand.h>
-#include <gdk_atoms.h>
-#include <string.h>
-
-#define M 1000000
-#if 0
-#define GZ 128
-#define CHAR_MAP(s) (s&127)
-#else
-#define GZ 64
-#define CHAR_MAP(s) (s&63)
-#endif
-#define SZ_1GRAM GZ
-#define SZ_2GRAM (GZ*GZ)
-#define SZ_3GRAM (GZ*GZ*GZ)
-#define SZ_4GRAM ((size_t)GZ*GZ*GZ*GZ)
-
-#define hist_1gram sht_hist_1gram
-#define hist_2gram sht_hist_2gram
-#define hist_3gram sht_hist_3gram
-#define NGsignature NGsignature_sht
-#define NGand NGand_sht
-#define NGandselect NGandselect_sht
-#define NGRAM_TYPE sht
-#define NGRAM_TYPEID TYPE_sht
-#define NGRAM_TYPENIL sht_nil
-#define NGRAM_CST
-#define NGRAM_BITS 15
-#include "ngram.h"
-
-#undef hist_1gram
-#undef hist_2gram
-#undef hist_3gram
-#undef NGsignature
-#undef NGand
-#undef NGandselect
-#undef NGRAM_TYPE
-#undef NGRAM_TYPEID
-#undef NGRAM_TYPENIL
-#undef NGRAM_CST
-#undef NGRAM_BITS
-
-#define hist_1gram int_hist_1gram
-#define hist_2gram int_hist_2gram
-#define hist_3gram int_hist_3gram
-#define NGsignature NGsignature_int
-#define NGand NGand_int
-#define NGandselect NGandselect_int
-#define NGRAM_TYPE int
-#define NGRAM_TYPEID TYPE_int
-#define NGRAM_TYPENIL int_nil
-#define NGRAM_CST
-#define NGRAM_BITS 31
-#include "ngram.h"
-
-#undef hist_1gram
-#undef hist_2gram
-#undef hist_3gram
-#undef NGsignature
-#undef NGand
-#undef NGandselect
-#undef NGRAM_TYPE
-#undef NGRAM_TYPEID
-#undef NGRAM_TYPENIL
-#undef NGRAM_CST
-#undef NGRAM_BITS
-
-#define hist_1gram lng_hist_1gram
-#define hist_2gram lng_hist_2gram
-#define hist_3gram lng_hist_3gram
-#define NGsignature NGsignature_lng
-#define NGand NGand_lng
-#define NGandselect NGandselect_lng
-#define NGRAM_TYPE lng
-#define NGRAM_TYPEID TYPE_lng
-#define NGRAM_TYPENIL lng_nil
-#define NGRAM_CST(v) LL_CONSTANT(v)
-#define NGRAM_BITS 63
-#include "ngram.h"
-
-#undef hist_1gram
-#undef hist_2gram
-#undef hist_3gram
-#undef NGsignature
-#undef NGand
-#undef NGandselect
-#undef NGRAM_TYPE
-#undef NGRAM_TYPEID
-#undef NGRAM_TYPENIL
-#undef NGRAM_CST
-#undef NGRAM_BITS
-
-#define hist_1gram hge_hist_1gram
-#define hist_2gram hge_hist_2gram
-#define hist_3gram hge_hist_3gram
-#define NGsignature NGsignature_hge
-#define NGand NGand_hge
-#define NGandselect NGandselect_hge
-#define NGRAM_TYPE hge
-#define NGRAM_TYPEID TYPE_hge
-#define NGRAM_TYPENIL hge_nil
-#define NGRAM_CST(v) ((hge)LL_CONSTANT(v))
-#define NGRAM_BITS 127
-#include "ngram.h"
-
-#undef hist_1gram
-#undef hist_2gram
-#undef hist_3gram
-#undef NGsignature
-#undef NGand
-#undef NGandselect
-#undef NGRAM_TYPE
-#undef NGRAM_TYPEID
-#undef NGRAM_TYPENIL
-#undef NGRAM_CST
-#undef NGRAM_BITS
-
-static str
-NGandjoin_intern(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
-{
-       (void)L;
-       (void)R;
-       (void)sigs;
-       (void)needle;
-       (void)lc;
-       (void)rc;
-       (void)nil_matches;
-       (void)estimate;
-       (void)anti;
-       return MAL_SUCCEED;
-}
-
-static str
-NGandjoin1(bat *L, bat *sigs, bat *needle, bat *lc, bit *nil_matches, lng *estimate, bit *anti)
-{
-       return NGandjoin_intern(L, NULL, sigs, needle, lc, NULL, nil_matches, estimate, anti);
-}
-
-static str
-NGandjoin(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
-{
-       return NGandjoin_intern(L, R, sigs, needle, lc, rc, nil_matches, estimate, anti);
-}
-
-static inline int
-popcount64(uint64_t x)
-{
-#if defined(__GNUC__)
-    return (uint32_t) __builtin_popcountll(x);
-#elif defined(_MSC_VER)
-    return (uint32_t) __popcnt64(x);
-#else
-    x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
-    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
-    x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
-    return (x * 0x0101010101010101ULL) >> 56;
-#endif
-}
-
-static str
-NGpopcnt(int *cnt, lng *v)
-{
-       *cnt = popcount64(*v);
-       return MAL_SUCCEED;
-}
-
-static str
-NGsignature_dummy( str *sig, str *str, int *n)
-{
-       (void)sig;
-       (void)str;
-       (void)n;
-       throw(MAL, "ngram.signature", "no scalar version\n");
-}
-
-static char *
-gor_lng(lng *res, const bat *bid)
-{
-       BAT *b;
-       lng val = 0;
-       BUN nval = 0;
-
-       if ((b = BATdescriptor(*bid)) == NULL)
-               throw(MAL, "gram.gor", RUNTIME_OBJECT_MISSING);
-
-       const lng *vals = (const lng *) Tloc(b, 0);
-       for (BUN i = 0, n = BATcount(b); i < n; i++) {
-               if (is_lng_nil(vals[i]))
-                       continue; /* nils are ignored */
-               if (vals[i] == 0) {
-                       /* any value zero is easy: result is zero */
-                       BBPunfix(b->batCacheid);
-                       *res = 0;
-                       return MAL_SUCCEED;
-               }
-               if (vals[i] < 0) {
-                       val |= -vals[i];
-               } else {
-                       val |= vals[i];
-               }
-               nval++;         /* count non-nil values */
-       }
-       BBPunfix(b->batCacheid);
-       if (nval == 0) {
-               /* if there are no non-nil values, the result is nil */
-               *res = lng_nil;
-       } else {
-               *res = val;
-       }
-       return MAL_SUCCEED;
-}
-
-static char *
-subgrouped_gor_cand_lng(bat *retval, const bat *bid, const bat *gid,
-                       const bat *eid, const bat *sid,
-                       const bit *skip_nils)
-{
-       BAT *b, *bn;            /* these two are always assigned */
-       BAT *g = NULL;          /* these three are optional and may not ... */
-       BAT *e = NULL;          /* ... be assigned to below, ... */
-       BAT *s = NULL;          /* ... so we initialize them here */
-
-       /* we ignore these two inputs */
-       (void) skip_nils;
-
-       /* the bat we're supposed to be working on (bid) is not
-        * optional, but the others are, so we test whether the bat id
-        * is not nil, and if it isn't, whether we can find the BAT
-        * descriptor */
-       if ((b = BATdescriptor(*bid)) == NULL ||
-           (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) ||
-           (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) ||
-           (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) {
-               if (b)
-                       BBPunfix(b->batCacheid);
-               if (g)
-                       BBPunfix(g->batCacheid);
-               if (e)
-                       BBPunfix(e->batCacheid);
-               if (s)
-                       BBPunfix(s->batCacheid);
-               throw(MAL, "ngram.gor", RUNTIME_OBJECT_MISSING);
-       }
-
-       oid min, max;           /* min and max group id */
-       BUN ngrp;       /* number of groups, number of candidates */
-       struct canditer ci;     /* candidate list iterator */
-       const char *err;        /* error message */
-       err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci);
-       if (err != NULL) {
-               BBPunfix(b->batCacheid);
-               if (g)
-                       BBPunfix(g->batCacheid);
-               if (e)
-                       BBPunfix(e->batCacheid);
-               if (s)
-                       BBPunfix(s->batCacheid);
-               throw(MAL, "ngram.gor", "%s\n", err);
-       }
-
-       /* create a result BAT and initialize it with all zeros */
-       bn = BATconstant(min, TYPE_lng, &(lng){0}, ngrp, TRANSIENT);
-       if (bn == NULL) {
-               BBPunfix(b->batCacheid);
-               if (g)
-                       BBPunfix(g->batCacheid);
-               if (e)
-                       BBPunfix(e->batCacheid);
-               if (s)
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to