Changeset: 3ef5f7ef38bb for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3ef5f7ef38bb
Modified Files:
monetdb5/modules/mosaic/mosaic_dictionary.c
Branch: mosaic
Log Message:
Fix MOScreatedictionary for large columns:
- We do a more uniformly distributed sample of the column.
- We make sure that the dictionary is correctly constructed.
diffs (95 lines):
diff --git a/monetdb5/modules/mosaic/mosaic_dictionary.c b/monetdb5/modules/mosaic/mosaic_dictionary.c
--- a/monetdb5/modules/mosaic/mosaic_dictionary.c
+++ b/monetdb5/modules/mosaic/mosaic_dictionary.c
@@ -146,10 +146,13 @@ MOSskip_dictionary(MOStask task)
#define TMPDICT 16*256
#define makeDict(TPE)\
-{ TPE *val = ((TPE*)task->src) + task->start,v,w;\
- BUN limit = task->stop - task->start > MOSAICMAXCNT? MOSAICMAXCNT: task->stop - task->start;\
- lng cw,cv;\
- for(i = 0; i< limit; i++, val++){\
+{ TPE *val,v,w;\
+ BUN limit = task->stop - task->start > TMPDICT ? TMPDICT: task->stop - task->start;\
+ BAT* bsample = BATsample_with_seed(task->bsrc, limit, (16-07-1985));\
+ lng cw,cv; \
+ for(i = 0; i< limit; i++){\
+ oid sample = BUNtoid(bsample,i);\
+ val = ((TPE*)task->src) + (sample - task->bsrc->hseqbase);\
MOSfind(j,dict.val##TPE,*val,0,dictsize);\
if(j == dictsize && dictsize == 0 ){\
dict.val##TPE[j]= *val;\
@@ -171,7 +174,33 @@ MOSskip_dictionary(MOStask task)
dict.val##TPE[j]= w;\
cnt[j] = 1;\
} else if (dictsize < TMPDICT) cnt[j]++;\
-} }
+ }\
+ BBPunfix(bsample->batCacheid);\
+ /* find the 256 most frequent values and save them in the mosaic header */ \
+ if( dictsize <= 256){ \
+ memcpy((char*)&task->hdr->dict, (char*)&dict, dictsize * sizeof(TPE)); \
+ memcpy((char*)task->hdr->dictfreq, (char*)&cnt, dictsize * sizeof(lng)); \
+ hdr->dictsize = dictsize; \
+ } else { \
+ /* brute force search of the top-k */ \
+ for(j=0; j< 256; j++){ \
+ for(k=0; k< dictsize; k++) \
+ if( keep[k]==0){ \
+ if( cnt[k]> cnt[max]) max = k; \
+ } \
+ keep[max]=1; \
+ } \
+ /* keep the top-k, in order */ \
+ for( j=k=0; k<dictsize; k++) \
+ if( keep[k]){ \
+ task->hdr->dict.val##TPE[j] = dict.val##TPE[k]; \
+ task->hdr->dictfreq[j] = cnt[k]; \
+ j++; \
+ } \
+ hdr->dictsize = j; \
+ assert(j<=256); \
+ } \
+}
/* Take a larger sample before fixing the dictionary */
@@ -222,38 +251,6 @@ MOScreatedictionary(MOStask task)
}
break;
}
- /* find the 256 most frequent values and save them in the mosaic header */
- if( dictsize < 256){
-#ifdef HAVE_HGE
- memcpy((char*)&task->hdr->dict, (char*)&dict, dictsize * sizeof(hge));
-#else
- memcpy((char*)&task->hdr->dict, (char*)&dict, dictsize * sizeof(lng));
-#endif
- memcpy((char*)task->hdr->dictfreq, (char*)&cnt, dictsize * sizeof(lng));
- hdr->dictsize = dictsize;
- } else {
- /* brute force search of the top-k */
- for(j=0; j< 256; j++){
- for(max = 0; max <dictsize && keep[max]; max++){}
- for(k=0; k< dictsize; k++)
- if( keep[k]==0){
- if( cnt[k]> cnt[max]) max = k;
- }
- keep[max]=1;
- }
- /* keep the top-k, in order */
- for( j=k=0; k<dictsize; k++)
- if( keep[k]){
-#ifdef HAVE_HGE
- task->hdr->dict.valhge[j] = dict.valhge[k];
-#else
- task->hdr->dict.vallng[j] = dict.vallng[k];
-#endif
- task->hdr->dictfreq[j] = cnt[k];
- }
- hdr->dictsize = j;
- assert(j<256);
- }
/* calculate the bit-width */
hdr->bits = 1;
hdr->mask =1;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list