Changeset: 0b1430f85d89 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/0b1430f85d89
Branch: nested
Log Message:

merge upstream


diffs (truncated from 796 to 300 lines):

diff --git a/monetdb5/modules/kernel/vss2.c b/monetdb5/modules/kernel/vss2.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/kernel/vss2.c
@@ -0,0 +1,791 @@
+/*
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * For copyright information, see the file debian/copyright.
+ */
+
+/*
+ * BOND: Branch-and-bound ON Decomposed data
+ *
+ * Vector columns are stored decomposed: each dimension is a separate BAT.
+ * BOND exploits this layout by computing partial L2 distances dimension
+ * by dimension, pruning candidates whose partial distance already exceeds
+ * the current k-th best after each batch of dimensions.
+ * Dimension ordering: dimensions are processed in decreasing order of
+ * |query[d] - mean[d]|, so that the most discriminating dimensions come
+ * first and pruning kicks in early.
+ */
+
+
+/* monetdb_config.h must be included as the first include file */
+#include "mal_errors.h"
+#include <monetdb_config.h>
+#include <gdk.h>
+
+/* mal_exception.h actually contains everything we need */
+#include <mal.h>
+#include <mal_exception.h>
+#include <mal_client.h>
+#include <mal_arguments.h>
+
+/* system include files */
+#include <string.h>
+#include <stdlib.h>
+
+
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define DIMENSION_MISMATCH_ERR "Dimensions mismatch error"
+
+typedef struct bond_collection {
+       BAT **dims;        /* array of dimension BATs (not owned, just 
referenced) */
+       int ndims;         /* number of dimensions */
+       BUN nvecs;         /* number of vectors (rows) */
+       dbl *dim_means;    /* mean value per dimension (for dimension ordering) 
*/
+       dbl *dim_max;           /* max value per dimension */
+       dbl *dim_min;           /* min value per dimension */
+       oid *candidates;
+       oid *tcand;
+       oid *tc;
+       dbl *dists;
+       dbl *tdist;
+       dbl *td;
+       dbl kth_upper;          /* kth upper bound */
+} bond_collection;
+
+/* Helper: comparison function for sorting dimensions by discriminating power 
*/
+typedef struct {
+       int dim;
+       dbl score;
+} dim_score;
+
+//static int dbl_cmp(const void *a, const void *b) {
+//    dbl fa = *(const dbl *)a;
+//    dbl fb = *(const dbl *)b;
+//    return (fa > fb) - (fa < fb);
+//}
+
+static int
+dim_score_cmp(const void *a, const void *b)
+{
+       dbl sa = ((const dim_score *)a)->score;
+       dbl sb = ((const dim_score *)b)->score;
+       /* descending order */
+       if (sa > sb)
+               return -1;
+       if (sa < sb)
+               return 1;
+       return 0;
+}
+
+/*
+ * bond_dim_order: return an array of dimension indices sorted by
+ * |query[d] - mean[d]| descending (most discriminating first).
+ */
+static int *
+bond_dim_order(allocator *ma, bond_collection *bc, const dbl *query_vals)
+{
+       dim_score *scores = ma_alloc(ma, bc->ndims * sizeof(dim_score));
+       int *order = ma_alloc(ma, bc->ndims * sizeof(int));
+
+       if (!scores || !order) {
+               TRC_ERROR(MAL_SERVER, "bond_dim_order: allocation failed\n");
+               return NULL;
+       }
+
+       for (int d = 0; d < bc->ndims; d++) {
+               scores[d].dim = d;
+               dbl diff = query_vals[d] - bc->dim_means[d];
+               scores[d].score = diff < 0 ? -diff : diff;
+       }
+
+       qsort(scores, bc->ndims, sizeof(dim_score), dim_score_cmp);
+
+       for (int i = 0; i < bc->ndims; i++)
+               order[i] = scores[i].dim;
+
+       return order;
+}
+
+#define VS 1024
+static bond_collection *
+bond_create(allocator *ma, BAT **dim_bats, int ndims)
+{
+       if (dim_bats && ndims > 0) {
+               bond_collection *bc = ma_alloc(ma, sizeof(bond_collection));
+               *bc = (bond_collection) {
+                       .ndims = ndims,
+                       .kth_upper = DBL_MAX,
+               };
+               if (bc) {
+                       bc->candidates = ma_alloc(ma, VS * sizeof(oid));
+                       bc->dists = ma_alloc(ma, VS * sizeof(dbl));
+
+                       bc->tcand = ma_alloc(ma, VS * sizeof(oid));
+                       bc->tdist = ma_alloc(ma, VS * sizeof(dbl));
+                       bc->tc = ma_alloc(ma, VS * sizeof(oid));
+                       bc->td = ma_alloc(ma, VS * sizeof(dbl));
+                       bc->dims = ma_alloc(ma, ndims * sizeof(BAT *));
+                       bc->dim_means = ma_zalloc(ma, ndims * sizeof(dbl));
+                       bc->dim_max = ma_zalloc(ma, ndims * sizeof(dbl));
+                       bc->dim_min = ma_zalloc(ma, ndims * sizeof(dbl));
+                       bc->nvecs = BATcount(dim_bats[0]);
+                       for (int d = 0; d < ndims; d++) {
+                               BAT *b = dim_bats[d];
+                               bc->dims[d] = b;
+                       }
+                       return bc;
+               }
+       }
+       return NULL;
+}
+
+static inline void
+heap_down(oid *hcand, dbl *hd, int p, int k)
+{
+       int l = p*2+1, r = p*2+2, q = p;
+       if (l < k && hd[q] < hd[l])
+               q = l;
+       if (r < k && hd[q] < hd[r])
+               q = r;
+       if (q != p) {
+               oid s = hcand[q];
+               hcand[q] = hcand[p];
+               hcand[p] = s;
+               dbl d = hd[q];
+               hd[q] = hd[p];
+               hd[p] = d;
+               heap_down(hcand, hd, q, k);
+       }
+}
+
+static inline void
+heap_del(oid *hcand, dbl *hd, int k)
+{
+       hcand[0] = hcand[k-1];
+       hd[0] = hd[k-1];
+       heap_down(hcand, hd, 0, k-1);
+}
+
+static inline void
+heap_up(oid *hcand, dbl *hd, int p)
+{
+       if (p == 0)
+               return;
+       size_t q = (p-1)/2;
+       if (hd[q] < hd[p]) { /* max heap */
+               oid s = hcand[q];
+               hcand[q] = hcand[p];
+               hcand[p] = s;
+               dbl d = hd[q];
+               hd[q] = hd[p];
+               hd[p] = d;
+               heap_up(hcand, hd, q);
+       }
+}
+
+/* max heap */
+static dbl
+topn_merge(oid *cands, dbl *d, oid *icands, dbl *id, BUN n, BUN k)
+{
+       for (BUN i = 0; i < n; i++) {
+               if (id[i] < d[0]) {
+                       heap_del(cands, d, k);
+                       cands[k-1] = icands[i];
+                       d[k-1] = id[i];
+                       heap_up(cands, d, k-1);
+               }
+       }
+       /*
+       for(i = 0; i < k; i++)
+               printf("%d %F\n", (int)hcand[i], hd[i]);
+               */
+       return d[0];
+}
+
+/* max heap */
+static dbl
+topn(oid *cands, dbl *d, bond_collection *bc, BUN n, BUN k)
+{
+       oid *hcand = bc->tcand;
+       dbl *hd = bc->tdist;
+       BUN i = 0;
+       for (; i < k && i < n; i++) {
+               hcand[i] = cands[i];
+               hd[i] = d[i];
+               heap_up(hcand, hd, i);
+       }
+
+       for (; i < n; i++) {
+               if (d[i] < hd[0]) {
+                       heap_del(hcand, hd, k);
+                       hcand[k-1] = cands[i];
+                       hd[k-1] = d[i];
+                       heap_up(hcand, hd, k-1);
+               }
+       }
+       /*
+       for(i = 0; i < k; i++)
+               printf("%d %F\n", (int)hcand[i], hd[i]);
+               */
+       return hd[0];
+}
+
+static dbl
+bond_upper_bound_sampled(bond_collection *bc, const dbl *query_vals, BUN k)
+{
+       oid *cands = bc->candidates;
+       dbl *dists = bc->dists;
+
+       for (BUN i = 0; i < VS; i++) {
+               cands[i] = i;
+               dists[i] = 0;
+       }
+       // calc avg over the sample instead
+       for (int d = 0; d < bc->ndims; d++) {
+               dbl *v = (dbl*)Tloc(bc->dims[d], 0);
+               bc->dim_min[d] = bc->dim_max[d] = bc->dim_means[d] = v[0];
+               for(BUN i = 1; i < VS; i++) {
+                       bc->dim_means[d] += v[i];
+                       if (bc->dim_min[d] > v[i])
+                               bc->dim_min[d] = v[i];
+                       if (bc->dim_max[d] < v[i])
+                               bc->dim_max[d] = v[i];
+               }
+               bc->dim_means[d] /= VS;
+       }
+
+       // calc full distance for the sample across all dimensions
+    for (int d = 0; d < bc->ndims; d++) {
+        const dbl *vals = (const dbl *) Tloc(bc->dims[d], 0);
+        dbl q = query_vals[d];
+
+        for (BUN i = 0; i < VS; i++) {
+            dbl diff = vals[cands[i]] - q;
+            dists[i] += diff * diff;
+        }
+    }
+       dbl res = topn(cands, dists, bc, VS, k);
+       for(BUN i = 0; i < k; i++) {
+               cands [i] = bc->tcand[i];
+               dists[i] = bc->tdist[i];
+       }
+       return res;
+}
+
+
+static char*
+bond_search_fast(allocator *ma, bond_collection *bc, const dbl *query_vals,
+                       BUN k, int *dim_order, BAT *cands,
+                       BAT **oid_result, BAT **dist_result)
+{
+       lng T0 = GDKusec();
+       if (cands || !bc || !query_vals || k == 0 || !oid_result || 
!dist_result || !dim_order)
+               throw(MAL, "vss.bond_search", "invalid arguments");
+
+    dbl *rem_per_dim = ma_zalloc(ma, sizeof(dbl) * bc->ndims);
+       if (!rem_per_dim)
+               throw(MAL, "vss.bond_search", MAL_MALLOC_FAIL);
+       // pre-calculate theoretical maximums
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to