Changeset: da348dfdb6b8 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=da348dfdb6b8
Modified Files:
monetdb5/modules/mosaic/mosaic.h
monetdb5/modules/mosaic/mosaic_dictionary.c
Branch: mosaic
Log Message:
Add bte field for dictionaries
and add dictionary compression over the string offsets.
diffs (truncated from 517 to 300 lines):
diff --git a/monetdb5/modules/mosaic/mosaic.h b/monetdb5/modules/mosaic/mosaic.h
--- a/monetdb5/modules/mosaic/mosaic.h
+++ b/monetdb5/modules/mosaic/mosaic.h
@@ -82,6 +82,7 @@ typedef struct MOSAICHEADER{
int dictsize; // used by dictionary compression, it is a
small table
int framesize; // used by frame compression, it is a small
table
union{
+ bte valbte[256];
sht valsht[256];
int valint[256];
lng vallng[256];
@@ -94,6 +95,7 @@ typedef struct MOSAICHEADER{
}dict;
lng dictfreq[256];// keep track on their use
union{
+ bte valbte[256];
sht valsht[256];
int valint[256];
lng vallng[256];
diff --git a/monetdb5/modules/mosaic/mosaic_dictionary.c
b/monetdb5/modules/mosaic/mosaic_dictionary.c
--- a/monetdb5/modules/mosaic/mosaic_dictionary.c
+++ b/monetdb5/modules/mosaic/mosaic_dictionary.c
@@ -53,6 +53,8 @@ MOSdump_dictionaryInternal(char *buf, si
{
switch(ATOMbasetype(task->type)){
+ case TYPE_bte:
+ snprintf(buf,len,"%d", task->hdr->dict.valbte[i]); break;
case TYPE_sht:
snprintf(buf,len,"%hd", task->hdr->dict.valsht[i]); break;
case TYPE_int:
@@ -69,7 +71,14 @@ MOSdump_dictionaryInternal(char *buf, si
snprintf(buf,len,"%f", task->hdr->dict.valflt[i]); break;
case TYPE_dbl:
snprintf(buf,len,"%g", task->hdr->dict.valdbl[i]); break;
- }
+ case TYPE_str:
+ switch(task->bsrc->twidth){
+ case 1: snprintf(buf,len,"%d",
task->hdr->dict.valbte[i]); break;
+ case 2: snprintf(buf,len,"%hd",
task->hdr->dict.valsht[i]); break;
+ case 4: snprintf(buf,len,"%d",
task->hdr->dict.valint[i]); break;
+ case 8: snprintf(buf,len,LLFMT,
task->hdr->dict.vallng[i]);
+ }
+}
}
void
@@ -85,6 +94,8 @@ MOSdump_dictionary(Client cntxt, MOStask
}
mnstr_printf(cntxt->fdout,"\n");
switch(ATOMbasetype(task->type)){
+ case TYPE_bte:
+ snprintf(buf,len,"%d %d",
task->hdr->checksum.sumbte,task->hdr->checksum2.sumbte); break;
case TYPE_sht:
snprintf(buf,len,"%hd %hd",
task->hdr->checksum.sumsht,task->hdr->checksum2.sumsht); break;
case TYPE_int:
@@ -101,6 +112,13 @@ MOSdump_dictionary(Client cntxt, MOStask
snprintf(buf,len,"%f %f",
task->hdr->checksum.sumflt,task->hdr->checksum2.sumflt); break;
case TYPE_dbl:
snprintf(buf,len,"%g %g",
task->hdr->checksum.sumdbl,task->hdr->checksum2.sumdbl); break;
+ case TYPE_str:
+ switch(task->bsrc->twidth){
+ case 1: snprintf(buf,len,"%d",
task->hdr->dict.valbte[i]); break;
+ case 2: snprintf(buf,len,"%hd",
task->hdr->dict.valsht[i]); break;
+ case 4: snprintf(buf,len,"%d",
task->hdr->dict.valint[i]); break;
+ case 8: snprintf(buf,len,LLFMT,
task->hdr->dict.vallng[i]);
+ }
}
mnstr_printf(cntxt->fdout,"#checksums %s\n",buf);
}
@@ -165,11 +183,11 @@ MOSskip_dictionary(Client cntxt, MOStask
if( task->range[MOSAIC_DICT] > task->start){\
i = task->range[MOSAIC_DICT] - task->start;\
if ( i > MOSlimit() ) i = MOSlimit();\
- if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize + i,TPE))\
+ if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize +
(i*hdr->bits)/8,TPE))\
return 0.0;\
- if( task->dst + wordaligned(MosaicBlkSize + i,sizeof(TPE)) >=
task->bsrc->tmosaic->base + task->bsrc->tmosaic->size)\
+ if( task->dst + wordaligned(MosaicBlkSize +
(i*hdr->bits)/8,sizeof(TPE)) >= task->bsrc->tmosaic->base +
task->bsrc->tmosaic->size)\
return 0.0;\
- if(i) factor = ((flt) i * sizeof(TPE))/
wordaligned(MosaicBlkSize + sizeof(int) + i,TPE);\
+ if(i) factor = ((flt) i * sizeof(TPE))/
wordaligned(MosaicBlkSize + sizeof(int) + (i*hdr->bits)/8,TPE);\
return factor;\
}\
for(i =0; i<limit; i++, val++){\
@@ -177,63 +195,74 @@ MOSskip_dictionary(Client cntxt, MOStask
if( j == hdr->dictsize || hdr->dict.val##TPE[j] != *val )\
break;\
}\
- if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize + i,TPE))\
+ if( i * sizeof(TPE) <= wordaligned( MosaicBlkSize + (i * hdr->bits)/8
,TPE))\
return 0.0;\
- if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned(
MosaicBlkSize + i,TPE);\
+ if(i) factor = (flt) ((int)i * sizeof(TPE)) / wordaligned(
MosaicBlkSize + (i * hdr->bits)/8,TPE);\
}
-// store it in the compressed heap header directly
-// filter out the most frequent ones
+// Create a larger dictionary buffer then we allow for in the mosaic header
first
+// Store the most frequent ones in the compressed heap header directly based
on estimated savings
+// Improve by using binary search rather then linear scan
+#define TMPDICT 256
+
#define makeDict(TPE)\
-{ TPE v,*val = ((TPE*)task->src) + task->start;\
+{ TPE *val = ((TPE*)task->src) + task->start,v,w;\
BUN limit = task->stop - task->start > MOSlimit()? MOSlimit():
task->stop - task->start;\
+ int cw,cv;\
for(i = 0; i< limit; i++, val++){\
- for(j= 0; j< hdr->dictsize; j++)\
- if( task->hdr->dict.val##TPE[j] == *val) break;\
- if ( j == hdr->dictsize){\
- if ( hdr->dictsize == 256){\
- int min = 0;\
- for(j=1;j<256;j++)\
- if( cnt[min] <cnt[j]) min = j;\
- j=min;\
- cnt[j]=0;\
- break;\
+ MOSfind(j,dict.val##TPE,*val,0,dictsize);\
+ if(j == dictsize && dictsize == 0 ){\
+ dict.val##TPE[j]= *val;\
+ cnt[j] = 1;\
+ dictsize++;\
+ } else \
+ if( dictsize < TMPDICT && dict.val##TPE[j] != *val){\
+ w= *val; cw= 1;\
+ for( ; j< dictsize; j++)\
+ if (dict.val##TPE[j] > w){\
+ v =dict.val##TPE[j];\
+ dict.val##TPE[j]= w;\
+ w = v;\
+ cv = cnt[j];\
+ cnt[j]= cw;\
+ cw = cv;\
}\
- task->hdr->dict.val##TPE[j] = *val;\
- cnt[j]++;\
- hdr->dictsize++;\
- } else\
- cnt[j]++;\
- }\
- for(k=0; k< hdr->dictsize; k++)\
- for(j=k+1; j< hdr->dictsize; j++)\
- if(task->hdr->dict.val##TPE[k]
>task->hdr->dict.val##TPE[j]){\
- v = task->hdr->dict.val##TPE[k];\
- task->hdr->dict.val##TPE[k] =
task->hdr->dict.val##TPE[j];\
- task->hdr->dict.val##TPE[j] = v;\
- }\
- hdr->bits = 1;\
- hdr->mask =1;\
- for( k=2 ; k < hdr->dictsize; k *=2){\
- hdr->bits++;\
- hdr->mask = (hdr->mask <<1) | 1;\
- }\
-}
+ dictsize++;\
+ dict.val##TPE[j]= *val;\
+ cnt[j] = 1;\
+ } else cnt[j]++;\
+} }
+/* take a larger sample before fixing the dictionary */
void
MOScreatedictionary(Client cntxt, MOStask task)
{
BUN i;
- int j,k;
+ int j;
MosaicHdr hdr = task->hdr;
- lng cnt[256];
+ union{
+ bte valbte[TMPDICT];
+ sht valsht[TMPDICT];
+ int valint[TMPDICT];
+ lng vallng[TMPDICT];
+ oid valoid[TMPDICT];
+ flt valflt[TMPDICT];
+ dbl valdbl[TMPDICT];
+#ifdef HAVE_HGE
+ hge valhge[TMPDICT];
+#endif
+ }dict;
+ lng cnt[TMPDICT];
+ int dictsize = 0;
(void) cntxt;
+ memset((char*) &dict, 0, TMPDICT * sizeof(lng));
memset((char*) cnt, 0, sizeof(cnt));
- hdr->dictsize = 0;
+
+ hdr->dictsize = dictsize;
switch(ATOMbasetype(task->type)){
- case TYPE_bte: break; // no compression achievable
+ case TYPE_bte: makeDict(bte); break;
case TYPE_sht: makeDict(sht); break;
case TYPE_int: makeDict(int); break;
case TYPE_lng: makeDict(lng); break;
@@ -243,11 +272,60 @@ MOScreatedictionary(Client cntxt, MOStas
#ifdef HAVE_HGE
case TYPE_hge: makeDict(hge); break;
#endif
+ case TYPE_str:
+ switch(task->bsrc->twidth){
+ case 1: //makeDict(bte); break;
+{ bte *val = ((bte*)task->src) + task->start,v,w;
+ BUN limit = task->stop - task->start > MOSlimit()? MOSlimit():
task->stop - task->start;
+ int cw,cv;
+ for(i = 0; i< limit; i++, val++){
+ MOSfind(j,dict.valbte,*val,0,dictsize);
+ if(j == dictsize && dictsize == 0 ){
+ dict.valbte[j]= *val;
+ cnt[j] = 1;
+ dictsize++;
+ } else
+ if(dictsize < TMPDICT && dict.valbte[j] != *val){
+ w= *val; cw= 1;
+ for( ; j< dictsize; j++)
+ if (dict.valbte[j] > w){
+ v =dict.valbte[j];
+ dict.valbte[j]= w;
+ w = v;
+ cv = cnt[j];
+ cnt[j]= cw;
+ cw = cv;
+ }
+ dictsize++;
+ dict.valbte[j]= w;
+ cnt[j] = cw;
+ } else cnt[j]++;
+} }
+ break;
+ case 2: makeDict(sht); break;
+ case 4: makeDict(int); break;
+ case 8: makeDict(lng); break;
+ }
+ break;
#ifdef _DEBUG_MOSAIC_
default:
mnstr_printf(cntxt->fdout,"#does not support dictionary type
%d\n",ATOMbasetype(task->type));
#endif
}
+ /* find the 256 most frequent values and save them in the mosaic header
*/
+ if( dictsize < 256){
+ memcpy((char*)&task->hdr->dict, (char*)&dict, dictsize *
sizeof(lng));
+ memcpy((char*)task->hdr->dictfreq, (char*)&cnt, dictsize *
sizeof(lng));
+ hdr->dictsize = dictsize;
+ } else {
+ }
+ /* calculate the bit-width */
+ hdr->bits = 1;
+ hdr->mask =1;
+ for( j=2 ; j < dictsize; j *=2){
+ hdr->bits++;
+ hdr->mask = (hdr->mask <<1) | 1;
+ }
#ifdef _DEBUG_MOSAIC_
MOSdump_dictionary(cntxt, task);
#endif
@@ -264,39 +342,44 @@ MOSestimate_dictionary(Client cntxt, MOS
(void) cntxt;
switch(ATOMbasetype(task->type)){
- case TYPE_bte: break; // no compression achievable
+ case TYPE_bte: estimateDict(bte); break;
case TYPE_sht: estimateDict(sht); break;
case TYPE_int: estimateDict(int); break;
case TYPE_oid: estimateDict(oid); break;
+ case TYPE_lng: estimateDict(lng); break;
case TYPE_flt: estimateDict(flt); break;
case TYPE_dbl: estimateDict(dbl); break;
#ifdef HAVE_HGE
case TYPE_hge: estimateDict(hge); break;
#endif
- case TYPE_lng:
- { lng *val = ((lng*)task->src) + task->start;
- // assume uniform compression statistics
- if( task->range[MOSAIC_DICT] > task->start){
- i = task->range[MOSAIC_DICT] - task->start;
- if ( i > MOSlimit() ) i = MOSlimit();
- if( i * sizeof(lng) <= wordaligned(
MosaicBlkSize + i,lng))
- return 0.0;
- if( task->dst + wordaligned(MosaicBlkSize +
i,sizeof(lng)) >= task->bsrc->tmosaic->base + task->bsrc->tmosaic->size)
- return 0.0;
- if(i) factor = ((flt) i * sizeof(lng))/
wordaligned(MosaicBlkSize + sizeof(lng) + i,lng);
- return factor;
- }
-
- for(i =task->start; i<task->stop; i++, val++){
-
MOSfind(j,task->hdr->dict.vallng,*val,0,hdr->dictsize);
- if( j == hdr->dictsize ||
task->hdr->dict.vallng[j] != *val)
- break;
- }
- i -= task->start;
- if ( i > MOSlimit() ) i = MOSlimit();
- if( i * sizeof(lng) < wordaligned( MosaicBlkSize +
i,lng))
- return 0.0;
- if(i) factor = (flt) ((lng)i * sizeof(lng)) /
wordaligned( MosaicBlkSize + i,lng);
+ case TYPE_str:
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list