Changeset: 601c51fa9124 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=601c51fa9124
Modified Files:
monetdb5/modules/mosaic/mosaic_dictionary.c
Branch: mosaic
Log Message:
Select the high frequency values
At most 256 high frequent values are kept in the dictionary
diffs (77 lines):
diff --git a/monetdb5/modules/mosaic/mosaic_dictionary.c
b/monetdb5/modules/mosaic/mosaic_dictionary.c
--- a/monetdb5/modules/mosaic/mosaic_dictionary.c
+++ b/monetdb5/modules/mosaic/mosaic_dictionary.c
@@ -203,7 +203,7 @@ MOSskip_dictionary(Client cntxt, MOStask
// Create a larger dictionary buffer then we allow for in the mosaic header
first
// Store the most frequent ones in the compressed heap header directly based
on estimated savings
// Improve by using binary search rather then linear scan
-#define TMPDICT 256
+#define TMPDICT 16*256
#define makeDict(TPE)\
{ TPE *val = ((TPE*)task->src) + task->start,v,w;\
@@ -234,12 +234,12 @@ MOSskip_dictionary(Client cntxt, MOStask
} }
-/* take a larger sample before fixing the dictionary */
+/* Take a larger sample before fixing the dictionary */
void
MOScreatedictionary(Client cntxt, MOStask task)
{
BUN i;
- int j;
+ int j, k, max;
MosaicHdr hdr = task->hdr;
union{
bte valbte[TMPDICT];
@@ -254,11 +254,13 @@ MOScreatedictionary(Client cntxt, MOStas
#endif
}dict;
lng cnt[TMPDICT];
+ bte keep[TMPDICT];
int dictsize = 0;
(void) cntxt;
memset((char*) &dict, 0, TMPDICT * sizeof(lng));
memset((char*) cnt, 0, sizeof(cnt));
+ memset((char*) keep, 0, sizeof(keep));
hdr->dictsize = dictsize;
switch(ATOMbasetype(task->type)){
@@ -314,10 +316,35 @@ MOScreatedictionary(Client cntxt, MOStas
}
/* find the 256 most frequent values and save them in the mosaic header
*/
if( dictsize < 256){
+#ifdef HAVE_HGE
+ memcpy((char*)&task->hdr->dict, (char*)&dict, dictsize *
sizeof(hge));
+#else
memcpy((char*)&task->hdr->dict, (char*)&dict, dictsize *
sizeof(lng));
+#endif
memcpy((char*)task->hdr->dictfreq, (char*)&cnt, dictsize *
sizeof(lng));
hdr->dictsize = dictsize;
} else {
+ /* brute force search of the top-k */
+ for(j=0; j< 256; j++){
+ for(max = 0; max <dictsize && keep[max]; max++){}
+ for(k=0; k< dictsize; k++)
+ if( keep[k]==0){
+ if( cnt[k]> cnt[max]) max = k;
+ }
+ keep[max]=1;
+ }
+ /* keep the top-k, in order */
+ for( j=k=0; k<dictsize; k++)
+ if( keep[k]){
+#ifdef HAVE_HGE
+ task->hdr->dict.valhge[j] = dict.valhge[k];
+#else
+ task->hdr->dict.vallng[j] = dict.vallng[k];
+#endif
+ task->hdr->dictfreq[j] = cnt[k];
+ }
+ hdr->dictsize = j;
+ assert(j<256);
}
/* calculate the bit-width */
hdr->bits = 1;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list