Changeset: da348dfdb6b8 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=da348dfdb6b8
Modified Files:
        monetdb5/modules/mosaic/mosaic.h
        monetdb5/modules/mosaic/mosaic_dictionary.c
Branch: mosaic
Log Message:

Add bte field for dictionaries
and add dictionary compression over the string offsets.


diffs (truncated from 517 to 300 lines):

diff --git a/monetdb5/modules/mosaic/mosaic.h b/monetdb5/modules/mosaic/mosaic.h
--- a/monetdb5/modules/mosaic/mosaic.h
+++ b/monetdb5/modules/mosaic/mosaic.h
@@ -82,6 +82,7 @@ typedef struct MOSAICHEADER{
        int dictsize;           // used by dictionary compression, it is a 
small table
        int framesize;          // used by frame compression, it is a small 
table
        union{
+               bte valbte[256];
                sht valsht[256];
                int valint[256];
                lng vallng[256];
@@ -94,6 +95,7 @@ typedef struct MOSAICHEADER{
        }dict;
        lng dictfreq[256];// keep track on their use
        union{
+               bte valbte[256];
                sht valsht[256];
                int valint[256];
                lng vallng[256];
diff --git a/monetdb5/modules/mosaic/mosaic_dictionary.c 
b/monetdb5/modules/mosaic/mosaic_dictionary.c
--- a/monetdb5/modules/mosaic/mosaic_dictionary.c
+++ b/monetdb5/modules/mosaic/mosaic_dictionary.c
@@ -53,6 +53,8 @@ MOSdump_dictionaryInternal(char *buf, si
 {
 
        switch(ATOMbasetype(task->type)){
+       case TYPE_bte:
+               snprintf(buf,len,"%d", task->hdr->dict.valbte[i]); break;
        case TYPE_sht:
                snprintf(buf,len,"%hd", task->hdr->dict.valsht[i]); break;
        case TYPE_int:
@@ -69,7 +71,14 @@ MOSdump_dictionaryInternal(char *buf, si
                snprintf(buf,len,"%f", task->hdr->dict.valflt[i]); break;
        case TYPE_dbl:
                snprintf(buf,len,"%g", task->hdr->dict.valdbl[i]); break;
-       }
+       case TYPE_str:
+               switch(task->bsrc->twidth){
+                       case 1: snprintf(buf,len,"%d", 
task->hdr->dict.valbte[i]); break;
+                       case 2: snprintf(buf,len,"%hd", 
task->hdr->dict.valsht[i]); break;
+                       case 4: snprintf(buf,len,"%d", 
task->hdr->dict.valint[i]); break;
+                       case 8: snprintf(buf,len,LLFMT,  
task->hdr->dict.vallng[i]); 
+               }
+}
 }
 
 void
@@ -85,6 +94,8 @@ MOSdump_dictionary(Client cntxt, MOStask
        }
        mnstr_printf(cntxt->fdout,"\n");
        switch(ATOMbasetype(task->type)){
+       case TYPE_bte:
+               snprintf(buf,len,"%d %d", 
task->hdr->checksum.sumbte,task->hdr->checksum2.sumbte); break;
        case TYPE_sht:
                snprintf(buf,len,"%hd %hd", 
task->hdr->checksum.sumsht,task->hdr->checksum2.sumsht); break;
        case TYPE_int:
@@ -101,6 +112,13 @@ MOSdump_dictionary(Client cntxt, MOStask
                snprintf(buf,len,"%f %f", 
task->hdr->checksum.sumflt,task->hdr->checksum2.sumflt); break;
        case TYPE_dbl:
                snprintf(buf,len,"%g %g", 
task->hdr->checksum.sumdbl,task->hdr->checksum2.sumdbl); break;
+       case TYPE_str:
+               switch(task->bsrc->twidth){
+                       case 1: snprintf(buf,len,"%d", 
task->hdr->dict.valbte[i]); break;
+                       case 2: snprintf(buf,len,"%hd", 
task->hdr->dict.valsht[i]); break;
+                       case 4: snprintf(buf,len,"%d", 
task->hdr->dict.valint[i]); break;
+                       case 8: snprintf(buf,len,LLFMT,  
task->hdr->dict.vallng[i]); 
+               }
        }
        mnstr_printf(cntxt->fdout,"#checksums %s\n",buf);
 }
@@ -165,11 +183,11 @@ MOSskip_dictionary(Client cntxt, MOStask
        if( task->range[MOSAIC_DICT] > task->start){\
                i = task->range[MOSAIC_DICT] - task->start;\
                if ( i > MOSlimit() ) i = MOSlimit();\
-               if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize + i,TPE))\
+               if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize + 
(i*hdr->bits)/8,TPE))\
                        return 0.0;\
-               if( task->dst +  wordaligned(MosaicBlkSize + i,sizeof(TPE)) >= 
task->bsrc->tmosaic->base + task->bsrc->tmosaic->size)\
+               if( task->dst +  wordaligned(MosaicBlkSize + 
(i*hdr->bits)/8,sizeof(TPE)) >= task->bsrc->tmosaic->base + 
task->bsrc->tmosaic->size)\
                        return 0.0;\
-               if(i) factor = ((flt) i * sizeof(TPE))/ 
wordaligned(MosaicBlkSize + sizeof(int) + i,TPE);\
+               if(i) factor = ((flt) i * sizeof(TPE))/ 
wordaligned(MosaicBlkSize + sizeof(int) + (i*hdr->bits)/8,TPE);\
                return factor;\
        }\
        for(i =0; i<limit; i++, val++){\
@@ -177,63 +195,74 @@ MOSskip_dictionary(Client cntxt, MOStask
                if( j == hdr->dictsize || hdr->dict.val##TPE[j] != *val )\
                        break;\
        }\
-       if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize + i,TPE))\
+       if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize + (i * hdr->bits)/8 
,TPE))\
                return 0.0;\
-       if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned( 
MosaicBlkSize + i,TPE);\
+       if(i) factor = (flt) ((int)i * sizeof(TPE)) / wordaligned( 
MosaicBlkSize + (i * hdr->bits)/8,TPE);\
 }
 
-// store it in the compressed heap header directly
-// filter out the most frequent ones
+// Create a larger dictionary buffer then we allow for in the mosaic header 
first
+// Store the most frequent ones in the compressed heap header directly based 
on estimated savings
+// Improve by using binary search rather then linear scan
+#define TMPDICT 256
+
 #define makeDict(TPE)\
-{      TPE v,*val = ((TPE*)task->src) + task->start;\
+{      TPE *val = ((TPE*)task->src) + task->start,v,w;\
        BUN limit = task->stop - task->start > MOSlimit()? MOSlimit(): 
task->stop - task->start;\
+       int cw,cv;\
        for(i = 0; i< limit; i++, val++){\
-               for(j= 0; j< hdr->dictsize; j++)\
-                       if( task->hdr->dict.val##TPE[j] == *val) break;\
-               if ( j == hdr->dictsize){\
-                       if ( hdr->dictsize == 256){\
-                               int min = 0;\
-                               for(j=1;j<256;j++)\
-                                       if( cnt[min] <cnt[j]) min = j;\
-                               j=min;\
-                               cnt[j]=0;\
-                               break;\
+               MOSfind(j,dict.val##TPE,*val,0,dictsize);\
+               if(j == dictsize && dictsize == 0 ){\
+                       dict.val##TPE[j]= *val;\
+                       cnt[j] = 1;\
+                       dictsize++;\
+               } else  \
+               if( dictsize < TMPDICT && dict.val##TPE[j] != *val){\
+                       w= *val; cw= 1;\
+                       for( ; j< dictsize; j++)\
+                       if (dict.val##TPE[j] > w){\
+                               v =dict.val##TPE[j];\
+                               dict.val##TPE[j]= w;\
+                               w = v;\
+                               cv = cnt[j];\
+                               cnt[j]= cw;\
+                               cw = cv;\
                        }\
-                       task->hdr->dict.val##TPE[j] = *val;\
-                       cnt[j]++;\
-                       hdr->dictsize++;\
-               } else\
-                       cnt[j]++;\
-       }\
-       for(k=0; k< hdr->dictsize; k++)\
-               for(j=k+1; j< hdr->dictsize; j++)\
-                       if(task->hdr->dict.val##TPE[k] 
>task->hdr->dict.val##TPE[j]){\
-                               v = task->hdr->dict.val##TPE[k];\
-                               task->hdr->dict.val##TPE[k] = 
task->hdr->dict.val##TPE[j];\
-                               task->hdr->dict.val##TPE[j] = v;\
-                       }\
-       hdr->bits = 1;\
-       hdr->mask =1;\
-       for( k=2 ; k < hdr->dictsize; k *=2){\
-               hdr->bits++;\
-               hdr->mask = (hdr->mask <<1) | 1;\
-       }\
-}
+                       dictsize++;\
+                       dict.val##TPE[j]= *val;\
+                       cnt[j] = 1;\
+               } else cnt[j]++;\
+} }
 
 
+/* take a larger sample before fixing the dictionary */
 void
 MOScreatedictionary(Client cntxt, MOStask task)
 {
        BUN i;
-       int j,k;
+       int j;
        MosaicHdr hdr = task->hdr;
-       lng cnt[256];
+    union{
+        bte valbte[TMPDICT];
+        sht valsht[TMPDICT];
+        int valint[TMPDICT];
+        lng vallng[TMPDICT];
+        oid valoid[TMPDICT];
+        flt valflt[TMPDICT];
+        dbl valdbl[TMPDICT];
+#ifdef HAVE_HGE
+        hge valhge[TMPDICT];
+#endif
+    }dict;
+       lng cnt[TMPDICT];
+       int dictsize = 0;
 
        (void) cntxt;
+       memset((char*) &dict, 0, TMPDICT * sizeof(lng));
        memset((char*) cnt, 0, sizeof(cnt));
-       hdr->dictsize = 0;
+
+       hdr->dictsize = dictsize;
        switch(ATOMbasetype(task->type)){
-       case TYPE_bte: break; // no compression achievable
+       case TYPE_bte: makeDict(bte); break;
        case TYPE_sht: makeDict(sht); break;
        case TYPE_int: makeDict(int); break;
        case TYPE_lng: makeDict(lng); break;
@@ -243,11 +272,60 @@ MOScreatedictionary(Client cntxt, MOStas
 #ifdef HAVE_HGE
        case TYPE_hge: makeDict(hge); break;
 #endif
+       case TYPE_str:
+               switch(task->bsrc->twidth){
+               case 1: //makeDict(bte); break;
+{      bte *val = ((bte*)task->src) + task->start,v,w;
+       BUN limit = task->stop - task->start > MOSlimit()? MOSlimit(): 
task->stop - task->start;
+       int cw,cv;
+       for(i = 0; i< limit; i++, val++){
+               MOSfind(j,dict.valbte,*val,0,dictsize);
+               if(j == dictsize && dictsize == 0 ){
+                       dict.valbte[j]= *val;
+                       cnt[j] = 1;
+                       dictsize++;
+               } else  
+               if(dictsize < TMPDICT &&  dict.valbte[j] != *val){
+                       w= *val; cw= 1;
+                       for( ; j< dictsize; j++)
+                       if (dict.valbte[j] > w){
+                               v =dict.valbte[j];
+                               dict.valbte[j]= w;
+                               w = v;
+                               cv = cnt[j];
+                               cnt[j]= cw;
+                               cw = cv;
+                       }
+                       dictsize++;
+                       dict.valbte[j]= w;
+                       cnt[j] = cw;
+               } else cnt[j]++;
+} }
+       break;
+               case 2: makeDict(sht); break;
+               case 4: makeDict(int); break;
+               case 8: makeDict(lng); break;
+               }
+               break;
 #ifdef _DEBUG_MOSAIC_
        default:
                mnstr_printf(cntxt->fdout,"#does not support dictionary type 
%d\n",ATOMbasetype(task->type));
 #endif
        }
+       /* find the 256 most frequent values and save them in the mosaic header 
*/
+       if( dictsize < 256){
+               memcpy((char*)&task->hdr->dict,  (char*)&dict, dictsize * 
sizeof(lng));
+               memcpy((char*)task->hdr->dictfreq,  (char*)&cnt, dictsize * 
sizeof(lng));
+               hdr->dictsize = dictsize;
+       } else {
+       }
+       /* calculate the bit-width */
+       hdr->bits = 1;
+       hdr->mask =1;
+       for( j=2 ; j < dictsize; j *=2){
+               hdr->bits++;
+               hdr->mask = (hdr->mask <<1) | 1;
+       }
 #ifdef _DEBUG_MOSAIC_
        MOSdump_dictionary(cntxt, task);
 #endif
@@ -264,39 +342,44 @@ MOSestimate_dictionary(Client cntxt, MOS
        (void) cntxt;
 
        switch(ATOMbasetype(task->type)){
-       case TYPE_bte: break; // no compression achievable
+       case TYPE_bte: estimateDict(bte); break; 
        case TYPE_sht: estimateDict(sht); break;
        case TYPE_int: estimateDict(int); break;
        case TYPE_oid: estimateDict(oid); break;
+       case TYPE_lng: estimateDict(lng); break;
        case TYPE_flt: estimateDict(flt); break;
        case TYPE_dbl: estimateDict(dbl); break;
 #ifdef HAVE_HGE
        case TYPE_hge: estimateDict(hge); break;
 #endif
-       case TYPE_lng:
-               {       lng *val = ((lng*)task->src) + task->start;
-                       // assume uniform compression statistics
-                       if( task->range[MOSAIC_DICT] > task->start){
-                               i = task->range[MOSAIC_DICT] - task->start;
-                               if ( i > MOSlimit() ) i = MOSlimit();
-                               if( i * sizeof(lng) <= wordaligned( 
MosaicBlkSize + i,lng))
-                                       return 0.0;
-                               if( task->dst +  wordaligned(MosaicBlkSize + 
i,sizeof(lng)) >= task->bsrc->tmosaic->base + task->bsrc->tmosaic->size)
-                                       return 0.0;
-                               if(i) factor = ((flt) i * sizeof(lng))/ 
wordaligned(MosaicBlkSize + sizeof(lng) + i,lng);
-                               return factor;
-                       }
-
-                       for(i =task->start; i<task->stop; i++, val++){
-                               
MOSfind(j,task->hdr->dict.vallng,*val,0,hdr->dictsize);
-                               if( j == hdr->dictsize || 
task->hdr->dict.vallng[j] != *val)
-                                       break;
-                       }
-                       i -= task->start;
-                       if ( i > MOSlimit() ) i = MOSlimit();
-                       if( i * sizeof(lng) < wordaligned( MosaicBlkSize + 
i,lng))
-                               return 0.0;
-                       if(i) factor = (flt) ((lng)i * sizeof(lng)) / 
wordaligned( MosaicBlkSize + i,lng);
+       case TYPE_str:
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to