Changeset: 3ef5f7ef38bb for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3ef5f7ef38bb
Modified Files:
        monetdb5/modules/mosaic/mosaic_dictionary.c
Branch: mosaic
Log Message:

Fix MOScreatedictionary for large columns:
- We do a more uniformly distributed sample of the column.
- We make sure that the dictionary is correctly constructed.


diffs (95 lines):

diff --git a/monetdb5/modules/mosaic/mosaic_dictionary.c b/monetdb5/modules/mosaic/mosaic_dictionary.c
--- a/monetdb5/modules/mosaic/mosaic_dictionary.c
+++ b/monetdb5/modules/mosaic/mosaic_dictionary.c
@@ -146,10 +146,13 @@ MOSskip_dictionary(MOStask task)
 #define TMPDICT 16*256
 
 #define makeDict(TPE)\
-{      TPE *val = ((TPE*)task->src) + task->start,v,w;\
-       BUN limit = task->stop - task->start > MOSAICMAXCNT? MOSAICMAXCNT: task->stop - task->start;\
-       lng cw,cv;\
-       for(i = 0; i< limit; i++, val++){\
+{      TPE *val,v,w;\
+       BUN limit = task->stop - task->start > TMPDICT ? TMPDICT:  task->stop - task->start;\
+       BAT* bsample = BATsample_with_seed(task->bsrc, limit, (16-07-1985));\
+       lng cw,cv; \
+       for(i = 0; i< limit; i++){\
+               oid sample = BUNtoid(bsample,i);\
+               val = ((TPE*)task->src) + (sample - task->bsrc->hseqbase);\
                MOSfind(j,dict.val##TPE,*val,0,dictsize);\
                if(j == dictsize && dictsize == 0 ){\
                        dict.val##TPE[j]= *val;\
@@ -171,7 +174,33 @@ MOSskip_dictionary(MOStask task)
                        dict.val##TPE[j]= w;\
                        cnt[j] = 1;\
                } else if (dictsize < TMPDICT) cnt[j]++;\
-} }
+       }\
+       BBPunfix(bsample->batCacheid);\
+       /* find the 256 most frequent values and save them in the mosaic header */ \
+       if( dictsize <= 256){ \
+               memcpy((char*)&task->hdr->dict,  (char*)&dict, dictsize * sizeof(TPE)); \
+               memcpy((char*)task->hdr->dictfreq,  (char*)&cnt, dictsize * sizeof(lng)); \
+               hdr->dictsize = dictsize; \
+       } else { \
+               /* brute force search of the top-k */ \
+               for(j=0; j< 256; j++){ \
+                       for(k=0; k< dictsize; k++) \
+                       if( keep[k]==0){ \
+                               if( cnt[k]> cnt[max]) max = k; \
+                       } \
+                       keep[max]=1; \
+               } \
+               /* keep the top-k, in order */ \
+               for( j=k=0; k<dictsize; k++) \
+               if( keep[k]){ \
+                       task->hdr->dict.val##TPE[j] = dict.val##TPE[k]; \
+                       task->hdr->dictfreq[j] = cnt[k]; \
+                       j++; \
+               } \
+               hdr->dictsize = j; \
+               assert(j<=256); \
+       } \
+}
 
 
 /* Take a larger sample before fixing the dictionary */
@@ -222,38 +251,6 @@ MOScreatedictionary(MOStask task)
                }
                break;
        }
-       /* find the 256 most frequent values and save them in the mosaic header */
-       if( dictsize < 256){
-#ifdef HAVE_HGE
-               memcpy((char*)&task->hdr->dict,  (char*)&dict, dictsize * sizeof(hge));
-#else
-               memcpy((char*)&task->hdr->dict,  (char*)&dict, dictsize * sizeof(lng));
-#endif
-               memcpy((char*)task->hdr->dictfreq,  (char*)&cnt, dictsize * sizeof(lng));
-               hdr->dictsize = dictsize;
-       } else {
-               /* brute force search of the top-k */
-               for(j=0; j< 256; j++){
-                       for(max = 0; max <dictsize && keep[max]; max++){}
-                       for(k=0; k< dictsize; k++)
-                       if( keep[k]==0){
-                               if( cnt[k]> cnt[max]) max = k;
-                       }
-                       keep[max]=1;
-               }
-               /* keep the top-k, in order */
-               for( j=k=0; k<dictsize; k++)
-               if( keep[k]){
-#ifdef HAVE_HGE
-                       task->hdr->dict.valhge[j] = dict.valhge[k];
-#else
-                       task->hdr->dict.vallng[j] = dict.vallng[k];
-#endif
-                       task->hdr->dictfreq[j] = cnt[k];
-               }
-               hdr->dictsize = j;
-               assert(j<256);
-       }
        /* calculate the bit-width */
        hdr->bits = 1;
        hdr->mask =1;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to