Changeset: 0b1430f85d89 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/0b1430f85d89 Branch: nested Log Message:
merge upstream diffs (truncated from 796 to 300 lines): diff --git a/monetdb5/modules/kernel/vss2.c b/monetdb5/modules/kernel/vss2.c new file mode 100644 --- /dev/null +++ b/monetdb5/modules/kernel/vss2.c @@ -0,0 +1,791 @@ +/* + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * For copyright information, see the file debian/copyright. + */ + +/* + * BOND: Branch-and-bound ON Decomposed data + * + * Vector columns are stored decomposed: each dimension is a separate BAT. + * BOND exploits this layout by computing partial L2 distances dimension + * by dimension, pruning candidates whose partial distance already exceeds + * the current k-th best after each batch of dimensions. + * Dimension ordering: dimensions are processed in decreasing order of + * |query[d] - mean[d]|, so that the most discriminating dimensions come + * first and pruning kicks in early. + */ + + +/* monetdb_config.h must be included as the first include file */ +#include "mal_errors.h" +#include <monetdb_config.h> +#include <gdk.h> + +/* mal_exception.h actually contains everything we need */ +#include <mal.h> +#include <mal_exception.h> +#include <mal_client.h> +#include <mal_arguments.h> + +/* system include files */ +#include <string.h> +#include <stdlib.h> + + +#include <math.h> +#include <stddef.h> +#include <stdint.h> + +#define DIMENSION_MISMATCH_ERR "Dimensions mismatch error" + +typedef struct bond_collection { + BAT **dims; /* array of dimension BATs (not owned, just referenced) */ + int ndims; /* number of dimensions */ + BUN nvecs; /* number of vectors (rows) */ + dbl *dim_means; /* mean value per dimension (for dimension ordering) */ + dbl *dim_max; /* max value per dimension */ + dbl *dim_min; /* min value per dimension */ + oid *candidates; + oid *tcand; + oid *tc; + dbl *dists; + dbl *tdist; + dbl *td; + dbl kth_upper; /* kth upper bound */ +} bond_collection; + +/* Helper: comparison function for sorting dimensions by discriminating power */ +typedef struct { + int dim; + dbl score; +} dim_score; + +//static int dbl_cmp(const void *a, const void *b) { +// dbl fa = *(const dbl *)a; +// dbl fb = *(const dbl *)b; +// return (fa > fb) - (fa < fb); +//} + +static int +dim_score_cmp(const void *a, const void *b) +{ + dbl sa = ((const dim_score *)a)->score; + dbl sb = ((const dim_score *)b)->score; + /* descending order */ + if (sa > sb) + return -1; + if (sa < sb) + return 1; + return 0; +} + +/* + * bond_dim_order: return an array of dimension indices sorted by + * |query[d] - mean[d]| descending (most discriminating first). + */ +static int * +bond_dim_order(allocator *ma, bond_collection *bc, const dbl *query_vals) +{ + dim_score *scores = ma_alloc(ma, bc->ndims * sizeof(dim_score)); + int *order = ma_alloc(ma, bc->ndims * sizeof(int)); + + if (!scores || !order) { + TRC_ERROR(MAL_SERVER, "bond_dim_order: allocation failed\n"); + return NULL; + } + + for (int d = 0; d < bc->ndims; d++) { + scores[d].dim = d; + dbl diff = query_vals[d] - bc->dim_means[d]; + scores[d].score = diff < 0 ? -diff : diff; + } + + qsort(scores, bc->ndims, sizeof(dim_score), dim_score_cmp); + + for (int i = 0; i < bc->ndims; i++) + order[i] = scores[i].dim; + + return order; +} + +#define VS 1024 +static bond_collection * +bond_create(allocator *ma, BAT **dim_bats, int ndims) +{ + if (dim_bats && ndims > 0) { + bond_collection *bc = ma_alloc(ma, sizeof(bond_collection)); + *bc = (bond_collection) { + .ndims = ndims, + .kth_upper = DBL_MAX, + }; + if (bc) { + bc->candidates = ma_alloc(ma, VS * sizeof(oid)); + bc->dists = ma_alloc(ma, VS * sizeof(dbl)); + + bc->tcand = ma_alloc(ma, VS * sizeof(oid)); + bc->tdist = ma_alloc(ma, VS * sizeof(dbl)); + bc->tc = ma_alloc(ma, VS * sizeof(oid)); + bc->td = ma_alloc(ma, VS * sizeof(dbl)); + bc->dims = ma_alloc(ma, ndims * sizeof(BAT *)); + bc->dim_means = ma_zalloc(ma, ndims * sizeof(dbl)); + bc->dim_max = ma_zalloc(ma, ndims * sizeof(dbl)); + bc->dim_min = ma_zalloc(ma, ndims * sizeof(dbl)); + bc->nvecs = BATcount(dim_bats[0]); + for (int d = 0; d < ndims; d++) { + BAT *b = dim_bats[d]; + bc->dims[d] = b; + } + return bc; + } + } + return NULL; +} + +static inline void +heap_down(oid *hcand, dbl *hd, int p, int k) +{ + int l = p*2+1, r = p*2+2, q = p; + if (l < k && hd[q] < hd[l]) + q = l; + if (r < k && hd[q] < hd[r]) + q = r; + if (q != p) { + oid s = hcand[q]; + hcand[q] = hcand[p]; + hcand[p] = s; + dbl d = hd[q]; + hd[q] = hd[p]; + hd[p] = d; + heap_down(hcand, hd, q, k); + } +} + +static inline void +heap_del(oid *hcand, dbl *hd, int k) +{ + hcand[0] = hcand[k-1]; + hd[0] = hd[k-1]; + heap_down(hcand, hd, 0, k-1); +} + +static inline void +heap_up(oid *hcand, dbl *hd, int p) +{ + if (p == 0) + return; + size_t q = (p-1)/2; + if (hd[q] < hd[p]) { /* max heap */ + oid s = hcand[q]; + hcand[q] = hcand[p]; + hcand[p] = s; + dbl d = hd[q]; + hd[q] = hd[p]; + hd[p] = d; + heap_up(hcand, hd, q); + } +} + +/* max heap */ +static dbl +topn_merge(oid *cands, dbl *d, oid *icands, dbl *id, BUN n, BUN k) +{ + for (BUN i = 0; i < n; i++) { + if (id[i] < d[0]) { + heap_del(cands, d, k); + cands[k-1] = icands[i]; + d[k-1] = id[i]; + heap_up(cands, d, k-1); + } + } + /* + for(i = 0; i < k; i++) + printf("%d %F\n", (int)hcand[i], hd[i]); + */ + return d[0]; +} + +/* max heap */ +static dbl +topn(oid *cands, dbl *d, bond_collection *bc, BUN n, BUN k) +{ + oid *hcand = bc->tcand; + dbl *hd = bc->tdist; + BUN i = 0; + for (; i < k && i < n; i++) { + hcand[i] = cands[i]; + hd[i] = d[i]; + heap_up(hcand, hd, i); + } + + for (; i < n; i++) { + if (d[i] < hd[0]) { + heap_del(hcand, hd, k); + hcand[k-1] = cands[i]; + hd[k-1] = d[i]; + heap_up(hcand, hd, k-1); + } + } + /* + for(i = 0; i < k; i++) + printf("%d %F\n", (int)hcand[i], hd[i]); + */ + return hd[0]; +} + +static dbl +bond_upper_bound_sampled(bond_collection *bc, const dbl *query_vals, BUN k) +{ + oid *cands = bc->candidates; + dbl *dists = bc->dists; + + for (BUN i = 0; i < VS; i++) { + cands[i] = i; + dists[i] = 0; + } + // calc avg over the sample instead + for (int d = 0; d < bc->ndims; d++) { + dbl *v = (dbl*)Tloc(bc->dims[d], 0); + bc->dim_min[d] = bc->dim_max[d] = bc->dim_means[d] = v[0]; + for(BUN i = 1; i < VS; i++) { + bc->dim_means[d] += v[i]; + if (bc->dim_min[d] > v[i]) + bc->dim_min[d] = v[i]; + if (bc->dim_max[d] < v[i]) + bc->dim_max[d] = v[i]; + } + bc->dim_means[d] /= VS; + } + + // calc full distance for the sample across all dimensions + for (int d = 0; d < bc->ndims; d++) { + const dbl *vals = (const dbl *) Tloc(bc->dims[d], 0); + dbl q = query_vals[d]; + + for (BUN i = 0; i < VS; i++) { + dbl diff = vals[cands[i]] - q; + dists[i] += diff * diff; + } + } + dbl res = topn(cands, dists, bc, VS, k); + for(BUN i = 0; i < k; i++) { + cands [i] = bc->tcand[i]; + dists[i] = bc->tdist[i]; + } + return res; +} + + +static char* +bond_search_fast(allocator *ma, bond_collection *bc, const dbl *query_vals, + BUN k, int *dim_order, BAT *cands, + BAT **oid_result, BAT **dist_result) +{ + lng T0 = GDKusec(); + if (cands || !bc || !query_vals || k == 0 || !oid_result || !dist_result || !dim_order) + throw(MAL, "vss.bond_search", "invalid arguments"); + + dbl *rem_per_dim = ma_zalloc(ma, sizeof(dbl) * bc->ndims); + if (!rem_per_dim) + throw(MAL, "vss.bond_search", MAL_MALLOC_FAIL); + // pre-calculate theoretical maximums _______________________________________________ checkin-list mailing list -- [email protected] To unsubscribe send an email to [email protected]
