Changeset: bbbfb9de755a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bbbfb9de755a
Modified Files:
monetdb5/modules/mal/mosaic.c
monetdb5/modules/mal/mosaic.h
monetdb5/modules/mal/mosaic_dictionary.c
monetdb5/modules/mal/mosaic_dictionary.h
Branch: mosaic
Log Message:
Use a global column dictionary
diffs (truncated from 617 to 300 lines):
diff --git a/monetdb5/modules/mal/mosaic.c b/monetdb5/modules/mal/mosaic.c
--- a/monetdb5/modules/mal/mosaic.c
+++ b/monetdb5/modules/mal/mosaic.c
@@ -284,7 +284,6 @@ MOScompressInternal(Client cntxt, int *r
task->elm = BATcount(bsrc);
task->size = bsrc->T->heap.free;
task->timer = GDKusec();
- task->dictsize = task->elm < 20? 2: DICTSIZE;
MOSinit(task,bcompress);
MOSinitHeader(task);
@@ -303,11 +302,11 @@ MOScompressInternal(Client cntxt, int *r
task->elm = BATcount(bcompress);
task->size = bcompress->T->heap.free;
task->timer = GDKusec();
- task->dictsize = task->elm < 20? 2: DICTSIZE;
MOSinit(task,bsrc);
MOSinitHeader(task);
}
+ MOScreatedictionary(cntxt,task);
// always start with an EOL block
MOSsetTag(task->blk,MOSAIC_EOL);
@@ -582,7 +581,7 @@ MOSdecompressInternal(Client cntxt, int
throw(MAL, "mosaic.decompress", "cannot decompress tail-VIEW");
}
- bsrc = BATnew(b->htype,b->ttype,BATcount(b),TRANSIENT);
+ bsrc = BATnew(b->htype,b->ttype,BATcount(b)+ MosaicHdrSize,TRANSIENT);
if ( bsrc == NULL) {
BBPreleaseref(b->batCacheid);
throw(MAL, "mosaic.decompress", MAL_MALLOC_FAIL);
@@ -614,13 +613,11 @@ MOSdecompressInternal(Client cntxt, int
MOSinit(task,bsrc);
task->src = Tloc(b, BUNfirst(b));
task->timer = GDKusec();
- task->dictsize = BATcount(b) < 20? 2: DICTSIZE;
} else {
// create a local decompressed copy
MOSinit(task,b);
task->src = Tloc(bsrc, BUNfirst(bsrc));
task->timer = GDKusec();
- task->dictsize = BATcount(b) < 20? 2: DICTSIZE;
}
while(task->blk){
diff --git a/monetdb5/modules/mal/mosaic.h b/monetdb5/modules/mal/mosaic.h
--- a/monetdb5/modules/mal/mosaic.h
+++ b/monetdb5/modules/mal/mosaic.h
@@ -39,7 +39,7 @@
#define MIN_INPUT_COUNT 1
/* The compressor kinds currently hardwired */
-#define MOSAIC_METHODS 8
+#define MOSAIC_METHODS 10
#define MOSAIC_NONE 0 // no compression at all
#define MOSAIC_RLE 1 // use run-length encoding
#define MOSAIC_DICT 2 // local dictionary encoding
@@ -47,7 +47,8 @@
#define MOSAIC_LINEAR 4 // use an encoding for a linear sequence
#define MOSAIC_VARIANCE 5 // adaptive dictionary over
deltas
#define MOSAIC_PREFIX 6 // prefix/postfix bitwise compression
-#define MOSAIC_ZONE 7 // adaptive zone map over
non-compressed data
+#define MOSAIC_INDEX 7 // columnwise full dictionary index
with bit indices
+#define MOSAIC_ZONE 8 // adaptive zone map over
non-compressed data
#define MOSAIC_EOL 9 // marker for the last block
//Compression should have a significant reduction to apply.
@@ -63,6 +64,12 @@ typedef struct MOSAICHEADER{
int top;
oid oidbase[MOSAICINDEX];
BUN offset[MOSAICINDEX];
+ int dictsize;
+#ifdef HAVE_HGE
+ hge dict[256];
+#else
+ lng dict[256];
+#endif
} * MosaicHdr;
// bit stuffed header block, currently 4 bytes wide
@@ -99,8 +106,9 @@ typedef struct MOSTASK{
BAT *b; // source column
BUN elm; // elements left to compress
char *src; // read pointer into source
+ BAT *index; // collection of unique elements
+ BAT *freq; // frequency of these elements
- int dictsize; // entries in a dictionary
lng xsize,size;// original and compressed size
lng timer; // compression time
void *min, *max;// space for zones indices
diff --git a/monetdb5/modules/mal/mosaic_dictionary.c
b/monetdb5/modules/mal/mosaic_dictionary.c
--- a/monetdb5/modules/mal/mosaic_dictionary.c
+++ b/monetdb5/modules/mal/mosaic_dictionary.c
@@ -38,15 +38,15 @@ MOSadvance_dictionary(Client cntxt, MOSt
task->start += MOSgetCnt(task->blk);
switch(ATOMstorage(task->type)){
//case TYPE_bte: CASE_bit: no compression achievable
- case TYPE_sht: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(sht)+ sizeof(bte) *
MOSgetCnt(task->blk),sht)); break;
- case TYPE_int: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(int)+ sizeof(bte) *
MOSgetCnt(task->blk),int)); break;
- case TYPE_lng: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(lng)+ sizeof(bte) *
MOSgetCnt(task->blk),lng)); break;
- case TYPE_oid: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(oid)+ sizeof(bte) *
MOSgetCnt(task->blk),oid)); break;
- case TYPE_wrd: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(wrd)+ sizeof(bte) *
MOSgetCnt(task->blk),wrd)); break;
- case TYPE_flt: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(flt)+ sizeof(bte) *
MOSgetCnt(task->blk),flt)); break;
- case TYPE_dbl: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(dbl)+ sizeof(bte) *
MOSgetCnt(task->blk),dbl)); break;
+ case TYPE_sht: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),sht)); break;
+ case TYPE_int: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),int)); break;
+ case TYPE_lng: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),lng)); break;
+ case TYPE_oid: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),oid)); break;
+ case TYPE_wrd: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),wrd)); break;
+ case TYPE_flt: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),flt)); break;
+ case TYPE_dbl: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),dbl)); break;
#ifdef HAVE_HGE
- case TYPE_hge: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(hge)+ sizeof(bte) *
MOSgetCnt(task->blk),hge)); break;
+ case TYPE_hge: task->blk = (MosaicBlk)( ((char*)task->blk) +
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),hge)); break;
#endif
}
}
@@ -55,40 +55,38 @@ MOSadvance_dictionary(Client cntxt, MOSt
void
MOSdump_dictionary(Client cntxt, MOStask task)
{
- MosaicBlk blk= task->blk;
+ MosaicHdr hdr= task->hdr;
int i;
- int *size;
- void *val = (void*)(((char*) blk) + MosaicBlkSize);
+ void *val = (void*)hdr->dict;
- size = (int*) (((char*)blk) + MosaicBlkSize);
- mnstr_printf(cntxt->fdout,"#dict " BUNFMT" ", MOSgetCnt(blk));
- switch(task->type){
+ mnstr_printf(cntxt->fdout,"#");
+ switch(ATOMstorage(task->type)){
case TYPE_sht:
- for(i=0; i< *size; i++)
- mnstr_printf(cntxt->fdout,"sht [%d] %hd",i, ((sht*) val)[i]);
break;
+ for(i=0; i< hdr->dictsize; i++)
+ mnstr_printf(cntxt->fdout,"sht [%d] %hd ",i, ((sht*) val)[i]);
break;
case TYPE_int:
- for(i=0; i< *size; i++)
- mnstr_printf(cntxt->fdout,"int [%d] %d",i, ((int*) val)[i]);
break;
+ for(i=0; i< hdr->dictsize; i++)
+ mnstr_printf(cntxt->fdout,"int [%d] %d ",i, ((int*) val)[i]);
break;
case TYPE_oid:
- for(i=0; i< *size; i++)
+ for(i=0; i< hdr->dictsize; i++)
mnstr_printf(cntxt->fdout,"oid [%d] "OIDFMT, i, ((oid*)
val)[i]); break;
case TYPE_lng:
- for(i=0; i< *size; i++)
+ for(i=0; i< hdr->dictsize; i++)
mnstr_printf(cntxt->fdout,"lng [%d] "LLFMT, i, ((lng*)
val)[i]); break;
#ifdef HAVE_HGE
case TYPE_hge:
- for(i=0; i< *size; i++)
- mnstr_printf(cntxt->fdout,"hge [%d] %.40g", i, (dbl) ((hge*)
val)[i]); break;
+ for(i=0; i< hdr->dictsize; i++)
+ mnstr_printf(cntxt->fdout,"hge [%d] %.40g ", i, (dbl) ((hge*)
val)[i]); break;
#endif
case TYPE_wrd:
- for(i=0; i< *size; i++)
+ for(i=0; i< hdr->dictsize; i++)
mnstr_printf(cntxt->fdout,"wrd [%d] "SZFMT, i, ((wrd*)
val)[i]); break;
case TYPE_flt:
- for(i=0; i< *size; i++)
- mnstr_printf(cntxt->fdout,"flt [%d] %f",i, ((flt*) val)[i]);
break;
+ for(i=0; i< hdr->dictsize; i++)
+ mnstr_printf(cntxt->fdout,"flt [%d] %f ",i, ((flt*) val)[i]);
break;
case TYPE_dbl:
- for(i=0; i< *size; i++)
- mnstr_printf(cntxt->fdout,"dbl [%d] %g",i, ((dbl*) val)[i]);
break;
+ for(i=0; i< hdr->dictsize; i++)
+ mnstr_printf(cntxt->fdout,"dbl [%d] %g ",i, ((dbl*) val)[i]);
break;
}
mnstr_printf(cntxt->fdout,"\n");
}
@@ -103,38 +101,116 @@ MOSskip_dictionary(Client cntxt, MOStask
#define estimateDict(TPE)\
{ TPE *val = (TPE*)task->src;\
- TPE *dict = (TPE*)GDKzalloc(sizeof(TPE) * task->dictsize);\
- if( dict== NULL)\
- return 1.0;\
for(i =0; i<task->elm; i++, val++){\
- for(j= 0; j< size; j++)\
- if( dict[j] == *val) {cnt++;break;}\
- if ( j == size){\
- if ( size == task->dictsize)\
- break;\
- dict[j] = *val;\
- size++;\
- cnt++;\
- }\
+ for(j= 0; j< hdr->dictsize; j++)\
+ if( hdr->dict[j] == *val) break;\
}\
- GDKfree(dict);\
if ( i > MOSlimit() ) i = MOSlimit();\
- if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned(2*
MosaicBlkSize + size * sizeof(TPE)+ sizeof(bte) * MOSgetCnt(task->blk),TPE);\
+ if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned(
MosaicBlkSize + i,TPE);\
}
+// store it in the compressed heap header directly
+// filter out the most frequent ones
+#define makeDict(TPE)\
+{ TPE *val = (TPE*)task->src;\
+ TPE *dict = (TPE*)hdr->dict,v;\
+ for(i =0; i< (int) task->elm; i++, val++){\
+ for(j= 0; j< hdr->dictsize; j++)\
+ if( dict[j] == *val) break;\
+ if ( j == hdr->dictsize){\
+ if ( hdr->dictsize == 256){\
+ int min = 0;\
+ for(j=1;j<256;j++)\
+ if( cnt[min] <cnt[j]) min = j;\
+ j=min;\
+ cnt[j]=0;\
+ break;\
+ }\
+ dict[j] = *val;\
+ cnt[j]++;\
+ hdr->dictsize++;\
+ } else\
+ cnt[j]++;\
+ }\
+ for(i=0; i< hdr->dictsize; i++)\
+ for(j=i+1; j< hdr->dictsize; j++)\
+ if(dict[i] >dict[j]){\
+ v= dict[i];\
+ dict[i] = dict[j];\
+ dict[j] = v;\
+ }\
+}
+
+void
+MOScreatedictionary(Client cntxt, MOStask task)
+{ int i, j;
+ MosaicHdr hdr = task->hdr;
+ lng cnt[256];
+
+ (void) cntxt;
+ for(j=0;j<256;j++)
+ cnt[j]=0;
+ hdr->dictsize = 0;
+ switch(ATOMstorage(task->type)){
+ case TYPE_sht: makeDict(sht); break;
+ case TYPE_lng: makeDict(lng); break;
+ case TYPE_oid: makeDict(oid); break;
+ case TYPE_wrd: makeDict(wrd); break;
+ case TYPE_flt: makeDict(flt); break;
+ case TYPE_dbl: makeDict(dbl); break;
+#ifdef HAVE_HGE
+ case TYPE_hge: makeDict(hge); break;
+#endif
+ case TYPE_int:
+ { int *val = (int*)task->src;
+ int *dict = (int*)hdr->dict,v;
+
+ for(i =0; i< (int) task->elm; i++, val++){
+ for(j= 0; j< hdr->dictsize; j++)
+ if( dict[j] == *val) break;
+ if ( j == hdr->dictsize){
+ if ( hdr->dictsize == 256){
+ int min = 0;
+ // select low frequent candidate
+ for(j=1;j<256;j++)
+ if( cnt[min] <cnt[j])
min = j;
+ j=min;
+ cnt[j]=0;
+ break;
+ }
+ dict[j] = *val;
+ cnt[j]++;
+ hdr->dictsize++;
+ } else
+ cnt[j]++;
+ }
+ // sort it
+ for(i=0; i< hdr->dictsize; i++)
+ for(j=i+1; j< hdr->dictsize; j++)
+ if(dict[i] >dict[j]){
+ v= dict[i];
+ dict[i] = dict[j];
+ dict[j] = v;
+ }
+ }
+ }
+#ifdef _DEBUG_MOSAIC_
+ MOSdump_dictionary(cntxt, task);
+#endif
+}
// calculate the expected reduction using DICT in terms of elements compressed
flt
MOSestimate_dictionary(Client cntxt, MOStask task)
{ BUN i = -1;
- int cnt =0,j;
- int size=0;
+ int j;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list