Changeset: 2c59920cd335 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2c59920cd335
Modified Files:
        gdk/gdk.h
        gdk/gdk_imprints.c
        gdk/gdk_join.c
Branch: imprints-join
Log Message:

support creating imprints according to another BAT's imprint bin borders


diffs (284 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1947,6 +1947,8 @@ gdk_export gdk_return BAThash_imps(BAT *
  */
 
 gdk_export gdk_return BATimprints(BAT *b);
+gdk_export gdk_return BATsubimprints(BAT *b, BAT *g); /* imprints on b using g's bin borders */
+gdk_export gdk_return IMPSinternal(BAT *b, BAT *g); /* shared worker behind both entry points; g may be NULL */
 gdk_export void IMPSdestroy(BAT *b);
 gdk_export lng IMPSimprintsize(BAT *b);
 
diff --git a/gdk/gdk_imprints.c b/gdk/gdk_imprints.c
--- a/gdk/gdk_imprints.c
+++ b/gdk/gdk_imprints.c
@@ -183,6 +183,16 @@ do {                                                       
                \
        }                                                               \
 } while (0)
 
+/* Copy the 64 bin borders of g's imprints into imprints->bins (caller's scope). */
+#define COPY_BINS(TYPE)                                                \
+do {                                                                   \
+       BUN k;                                                          \
+       TYPE *restrict h = imprints->bins;                              \
+       const TYPE *restrict src = g->timprints->bins;                  \
+       for (k = 0; k < 64; ++k)                                        \
+               h[k] = src[k];                                          \
+} while (0)
+
 /* Check whether we have imprints on b (and return true if we do).  It
  * may be that the imprints were made persistent, but we hadn't seen
  * that yet, so check the file system.  This also returns true if b is
@@ -275,11 +285,23 @@ BATcheckimprints(BAT *b)
 }
 
 gdk_return
-BATimprints(BAT *b)
+BATsubimprints(BAT *b, BAT *g) { /* build imprints on b with g's bin borders */
+        return IMPSinternal(b, g); /* a NULL g makes the same call as BATimprints */
+}
+
+gdk_return
+BATimprints(BAT *b) { /* build imprints on b without a reference BAT */
+        return IMPSinternal(b, NULL); /* NULL: no BAT to copy bin borders from */
+}
+
+
+gdk_return
+IMPSinternal(BAT *b, BAT *g)
 {
        BAT *o = NULL, *s1 = NULL, *s2 = NULL, *s3 = NULL, *s4 = NULL;
        Imprints *imprints;
        lng t0 = 0;
+       int copy_histo = 0;
 
        /* we only create imprints for types that look like types we know */
        switch (ATOMbasetype(b->ttype)) {
@@ -299,6 +321,21 @@ BATimprints(BAT *b)
                return GDK_FAIL;
        }
 
+       if (g) {
+               /* require that both BATs have the same base type */
+               if (ATOMbasetype(b->ttype) != ATOMbasetype(g->ttype)) {
+                       fprintf(stderr, "IMPSinternal: b and g are not the same type\n");
+                       return GDK_FAIL;
+               }
+               /* g's imprints (bits and bins) are read further down */
+               if (g->timprints == NULL) {
+                       fprintf(stderr, "IMPSinternal: g contains no imprints\n");
+                       return GDK_FAIL;
+               }
+
+               copy_histo = 1;
+       }
+
        BATcheck(b, "BATimprints", GDK_FAIL);
 
        if (BATcheckimprints(b))
@@ -348,46 +385,54 @@ BATimprints(BAT *b)
                                                           imprintsheap);
 
 #define SMP_SIZE 2048
-               s1 = BATsample(b, SMP_SIZE);
-               if (s1 == NULL) {
-                       MT_lock_unset(&GDKimprintsLock(b->batCacheid));
-                       GDKfree(imprints);
-                       return GDK_FAIL;
+
+               if (g) {
+
+                       /* determine the # of bits for b's imprints, directly 
use g's # of bits at this moment */
+                       imprints->bits = g->timprints->bits;
+
+               } else {
+                       s1 = BATsample(b, SMP_SIZE);
+                       if (s1 == NULL) {
+                               MT_lock_unset(&GDKimprintsLock(b->batCacheid));
+                               GDKfree(imprints);
+                               return GDK_FAIL;
+                       }
+                       s2 = BATunique(b, s1);
+                       if (s2 == NULL) {
+                               MT_lock_unset(&GDKimprintsLock(b->batCacheid));
+                               BBPunfix(s1->batCacheid);
+                               GDKfree(imprints);
+                               return GDK_FAIL;
+                       }
+                       s3 = BATproject(s2, b);
+                       if (s3 == NULL) {
+                               MT_lock_unset(&GDKimprintsLock(b->batCacheid));
+                               BBPunfix(s1->batCacheid);
+                               BBPunfix(s2->batCacheid);
+                               GDKfree(imprints);
+                               return GDK_FAIL;
+                       }
+                       s3->tkey = 1;   /* we know is unique on tail now */
+                       if (BATsort(&s4, NULL, NULL, s3, NULL, NULL, 0, 0) != 
GDK_SUCCEED) {
+                               MT_lock_unset(&GDKimprintsLock(b->batCacheid));
+                               BBPunfix(s1->batCacheid);
+                               BBPunfix(s2->batCacheid);
+                               BBPunfix(s3->batCacheid);
+                               GDKfree(imprints);
+                               return GDK_FAIL;
+                       }
+                       /* s4 now is ordered and unique on tail */
+                       assert(s4->tkey && s4->tsorted);
+                       cnt = BATcount(s4);
+                       imprints->bits = 64;
+                       if (cnt <= 32)
+                               imprints->bits = 32;
+                       if (cnt <= 16)
+                               imprints->bits = 16;
+                       if (cnt <= 8)
+                               imprints->bits = 8;
                }
-               s2 = BATunique(b, s1);
-               if (s2 == NULL) {
-                       MT_lock_unset(&GDKimprintsLock(b->batCacheid));
-                       BBPunfix(s1->batCacheid);
-                       GDKfree(imprints);
-                       return GDK_FAIL;
-               }
-               s3 = BATproject(s2, b);
-               if (s3 == NULL) {
-                       MT_lock_unset(&GDKimprintsLock(b->batCacheid));
-                       BBPunfix(s1->batCacheid);
-                       BBPunfix(s2->batCacheid);
-                       GDKfree(imprints);
-                       return GDK_FAIL;
-               }
-               s3->tkey = 1;   /* we know is unique on tail now */
-               if (BATsort(&s4, NULL, NULL, s3, NULL, NULL, 0, 0) != 
GDK_SUCCEED) {
-                       MT_lock_unset(&GDKimprintsLock(b->batCacheid));
-                       BBPunfix(s1->batCacheid);
-                       BBPunfix(s2->batCacheid);
-                       BBPunfix(s3->batCacheid);
-                       GDKfree(imprints);
-                       return GDK_FAIL;
-               }
-               /* s4 now is ordered and unique on tail */
-               assert(s4->tkey && s4->tsorted);
-               cnt = BATcount(s4);
-               imprints->bits = 64;
-               if (cnt <= 32)
-                       imprints->bits = 32;
-               if (cnt <= 16)
-                       imprints->bits = 16;
-               if (cnt <= 8)
-                       imprints->bits = 8;
 
                /* The heap we create here consists of four parts:
                 * bins, max 64 entries with bin boundaries, domain of b;
@@ -422,29 +467,59 @@ BATimprints(BAT *b)
                imprints->imps = (void *) (imprints->stats + 64 * 3);
                imprints->dict = (void *) ((uintptr_t) ((char *) imprints->imps 
+ pages * (imprints->bits / 8) + sizeof(uint64_t)) & ~(sizeof(uint64_t) - 1));
 
+
                switch (ATOMbasetype(b->ttype)) {
                case TYPE_bte:
-                       FILL_HISTOGRAM(bte);
+                       if (!copy_histo) {
+                               FILL_HISTOGRAM(bte);
+                       } else {
+                               COPY_BINS(bte);
+                       }
+                       //!copy_histo ? FILL_HISTOGRAM(bte) : COPY_BINS(bte);
                        break;
                case TYPE_sht:
-                       FILL_HISTOGRAM(sht);
+                       if (!copy_histo) {
+                               FILL_HISTOGRAM(sht);
+                       } else {
+                               COPY_BINS(sht);
+                       }
                        break;
                case TYPE_int:
-                       FILL_HISTOGRAM(int);
+                       if (!copy_histo) {
+                               FILL_HISTOGRAM(int);
+                       } else {
+                               COPY_BINS(int);
+                       }
                        break;
                case TYPE_lng:
-                       FILL_HISTOGRAM(lng);
+                       if (!copy_histo) {
+                               FILL_HISTOGRAM(lng);
+                       } else {
+                               COPY_BINS(lng);
+                       }
                        break;
 #ifdef HAVE_HGE
                case TYPE_hge:
-                       FILL_HISTOGRAM(hge);
+                       if (!copy_histo) {
+                               FILL_HISTOGRAM(hge);
+                       } else {
+                               COPY_BINS(hge);
+                       }
                        break;
 #endif
                case TYPE_flt:
-                       FILL_HISTOGRAM(flt);
+                       if (!copy_histo) {
+                               FILL_HISTOGRAM(flt);
+                       } else {
+                               COPY_BINS(flt);
+                       }
                        break;
                case TYPE_dbl:
-                       FILL_HISTOGRAM(dbl);
+                       if (!copy_histo) {
+                               FILL_HISTOGRAM(dbl);
+                       } else {
+                               COPY_BINS(dbl);
+                       }
                        break;
                default:
                        /* should never reach here */
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -2653,6 +2653,7 @@ imps_hashjoin(BAT *r1, BAT *r2, BAT *l, 
        int lskipped = 0;       /* whether we skipped values in l */
        const Hash *restrict hsh;
        int t;
+       gdk_return ret;
 
        ALGODEBUG fprintf(stderr, "#imps_hashjoin(l=%s#" BUNFMT "[%s]%s%s%s%s,"
                          "r=%s#" BUNFMT "[%s]%s%s%s%s,sl=%s#" BUNFMT "%s%s%s,"
@@ -2680,14 +2681,29 @@ imps_hashjoin(BAT *r1, BAT *r2, BAT *l, 
                          swapped ? " swapped" : "",
                          *reason ? " " : "", reason);
 
-       if (l->timprints == NULL || r->timprints == NULL) {
-               fprintf(stderr, "both columns require imprints\n");
+       /* Make sure the smaller side (r) has imprints, building them if
+        * needed; imprints on the larger side (l) are always rebuilt so
+        * that they follow r's bin borders. */
+       if (!BATcheckimprints(r) && BATimprints(r) != GDK_SUCCEED) {
+               fprintf(stderr, "smaller column fail in build imprints\n");
+               return GDK_FAIL;
+       }
+
+       /* existing imprints on l are not known to use r's bin borders,
+        * so discard them before rebuilding */
+       if (BATcheckimprints(l)) {
+               IMPSdestroy(l);
+       }
+       ret = BATsubimprints(l, r);
+
+       if (ret != GDK_SUCCEED) {
+               fprintf(stderr, "both columns require imprints with identical bin borders\n");
                return GDK_FAIL;
        }
 
        /* Ignore the candidate list at this moment */
        if (sl != NULL || sr != NULL) {
-               fprintf(stderr, "do not deal with candidate list first");
+               fprintf(stderr, "do not deal with candidate list first\n");
                return GDK_FAIL;
        }
 
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to