Changeset: 2cb1187ae44f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/2cb1187ae44f
Modified Files:
        gdk/gdk_histogram.c
        gdk/gdk_private.h
Branch: histograms
Log Message:

Cleanup, use smaller sizes for bucket entries


diffs (186 lines):

diff --git a/gdk/gdk_histogram.c b/gdk/gdk_histogram.c
--- a/gdk/gdk_histogram.c
+++ b/gdk/gdk_histogram.c
@@ -21,6 +21,10 @@
 #define NBUCKETS 64
 #define SAMPLE_SIZE 1024
 
+static_assert(NBUCKETS > 2, "The number of buckets must be > 2");
+static_assert(SAMPLE_SIZE > NBUCKETS, "The sample size must be > number of 
buckets");
+static_assert(SAMPLE_SIZE < INT_MAX, "The sample size must be on the integer 
range");
+
 #ifdef HAVE_HGE
 #define        VAR_UPCAST hge
 #define        VAR_TPE    TYPE_hge
@@ -29,6 +33,20 @@
 #define        VAR_TPE    TYPE_lng
 #endif
 
+#define histogram_entry(TPE)   \
+typedef struct HistogramEntry_##TPE {  \
+       TPE min, max;   /* The minimum and maximum value in the range */        
\
+       int count;      /* number of values */  \
+} HistogramEntry_##TPE;
+
+histogram_entry(bte)
+histogram_entry(sht)
+histogram_entry(int)
+histogram_entry(lng)
+#ifdef HAVE_HGE
+histogram_entry(hge)
+#endif
+
 static bool
 can_create_histogram(int tpe)
 {
@@ -116,34 +134,36 @@ min_and_max_values(BATiter *bi, ValPtr m
                BUN k = 0, l = BATcount(sam);   \
        \
                h->nbuckets = (int) (j - i + 1); \
-               if (!(h->histogram = GDKmalloc(sizeof(struct HistogramEntry) * 
h->nbuckets))) \
+               if (!(h->histogram = GDKmalloc(sizeof(HistogramEntry_##TPE) * 
h->nbuckets))) \
                        return NULL; \
        \
-               while (true) { \
-                       struct HistogramEntry *restrict e = 
&(h->histogram[k++]); \
-                       VALinit(&(e->min), tpe, &ii); /* TODO? maybe I don't 
need to materialize this for 'perfect' histograms */\
-                       VALinit(&(e->max), tpe, &ii);   \
+               while (true) { /* Initialize bucket ranges */ \
+                       HistogramEntry_##TPE *restrict e = 
&(((HistogramEntry_##TPE *)h->histogram)[k++]); \
+                       e->min = e->max = ii; /* TODO? maybe I don't need to 
materialize this for 'perfect' histograms */ \
                        e->count = 0; \
                        if (ii == j) \
                                break; \
                        ii++; \
                } \
        \
-               for (BUN k = 0 ; k < l ; k++) { \
+               HistogramEntry_##TPE *restrict hist = (HistogramEntry_##TPE 
*)h->histogram; \
+               for (BUN k = 0 ; k < l ; k++) { /* Now create the histogram */\
                        TPE next = v[k]; \
                        if (is_##TPE##_nil(next)) { \
-                               h->nulls++; \
+                               nulls++; \
                        } else { \
-                               int offset = (next - i); /* get the offset from 
min, then increment */ \
-                               h->histogram[offset].count++; \
+                               int offset = (int) (next - i); /* get the 
offset from min, then increment */ \
+                               hist[offset].count++; \
                        } \
                } \
+               h->nulls = nulls; \
        } while (0)
 
 static Histogram *
 create_perfect_histogram(BAT *sam, Histogram *h, ValPtr min, ValPtr max)
 {
-       int tpe = ATOMbasetype(sam->ttype);
+       int nulls = 0, tpe = ATOMbasetype(sam->ttype);
+
        switch (tpe) {
        case TYPE_bte:
                perfect_histogram_loop(bte);
@@ -168,6 +188,71 @@ create_perfect_histogram(BAT *sam, Histo
        return h;
 }
 
+#define generic_histogram_loop(TPE)    \
+       do { \
+               TPE i = *(TPE*)VALget(min), ii = i, j = *(TPE*)VALget(max), 
gap; \
+               const TPE *restrict v = Tloc(sam, 0); \
+               BUN l = BATcount(sam);  \
+       \
+               h->nbuckets = NBUCKETS; \
+               if (!(h->histogram = GDKmalloc(sizeof(HistogramEntry_##TPE) * 
h->nbuckets))) \
+                       return NULL; \
+               gap = (TPE) floor((double) (j - i) / (double) h->nbuckets); \
+       \
+               for (BUN k = 0 ; k < NBUCKETS ; k++) { \
+                       HistogramEntry_##TPE *restrict e = 
&(((HistogramEntry_##TPE *)h->histogram)[k]); \
+                       e->min = ii;    \
+                       ii += gap; \
+                       if (k == NBUCKETS - 1) { \
+                               e->max = ii; \
+                       } else { \
+                               e->max = j; /* The last bucket will contain the 
max value inclusive */ \
+                       } \
+                       e->count = 0; \
+               } \
+       \
+               HistogramEntry_##TPE *restrict hist = (HistogramEntry_##TPE 
*)h->histogram; \
+               for (BUN k = 0 ; k < l ; k++) { /* Now create the histogram */\
+                       TPE next = v[k]; \
+                       if (is_##TPE##_nil(next)) { \
+                               nulls++; \
+                       } else { \
+                               (void) hist; \
+                               /* ... */ \
+                       } \
+               } \
+               h->nulls = nulls; \
+       } while (0)
+
+static Histogram *
+create_generic_histogram(BAT *sam, Histogram *h, ValPtr min, ValPtr max)
+{
+       int nulls = 0, tpe = ATOMbasetype(sam->ttype);
+
+       switch (tpe) {
+       case TYPE_bte:
+               generic_histogram_loop(bte);
+               break;
+       case TYPE_sht:
+               generic_histogram_loop(sht);
+               break;
+       case TYPE_int:
+               generic_histogram_loop(int);
+               break;
+       case TYPE_lng:
+               generic_histogram_loop(lng);
+               break;
+#ifdef HAVE_HGE
+       case TYPE_hge:
+               generic_histogram_loop(hge);
+               break;
+#endif
+       default:
+               assert(0);
+       }
+       return h;
+}
+
 gdk_return
 HISTOGRAMcreate(BAT *b)
 {
@@ -238,9 +323,9 @@ HISTOGRAMcreate(BAT *b)
                        h->size = (int) BATcount(sam); /* it should fit */
                        if (perfect_histogram) {
                                h = create_perfect_histogram(sam, h, &min, 
&max);
-                       }/* else {
-                                do the generic way
-                       }*/
+                       } else {
+                               h = create_generic_histogram(sam, h, &min, 
&max);
+                       }
 
                        BBPreclaim(sam);
                        if (!h) {
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -456,14 +456,8 @@ struct Strimps {
 };
 
 /* The current sample size is 1024, so an int will fit for the bucket count */
-struct HistogramEntry {
-       ValRecord min;  /* The minimum value in the range */
-       ValRecord max;  /* The maximum value in the range */
-       int count;      /* number of values */
-};
-
 struct Histogram {
-       struct HistogramEntry *histogram;       /* The histogram itself (at the 
moment it's malloc'ed) */
+       void *histogram;        /* The histogram itself (at the moment it's 
malloc'ed). It contains min. max and count per bucket */
        int nbuckets;   /* The number of buckets excluding the one for NULL 
values */
        int nulls;      /* The number of null values */
        int size;       /* The total number of values in the histogram (if the 
table is smaller than the sample size)*/
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to