Changeset: d21f7ebcdca4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d21f7ebcdca4
Modified Files:
monetdb5/modules/mosaic/mosaic_prefix.c
Branch: mosaic
Log Message:
Find common prefix
Using a block and a binary search to find
the number of bits for the common prefix
diffs (truncated from 526 to 300 lines):
diff --git a/monetdb5/modules/mosaic/mosaic_prefix.c
b/monetdb5/modules/mosaic/mosaic_prefix.c
--- a/monetdb5/modules/mosaic/mosaic_prefix.c
+++ b/monetdb5/modules/mosaic/mosaic_prefix.c
@@ -20,8 +20,8 @@
/*
* 2014-2016 author Martin Kersten
* Bit_prefix compression
- * Factor out the leading bits from a series of values.
- * The prefix size is determined by the first two non-identical values.
+ * Factor out leading bits from a series of values.
+ * The prefix size is determined by looking ahead in a small block.
* To use the bitvector, we limit the extracted tail to at most 32bits
* The administration are 2 TPE values (mask,reference value)
* The size of the residu is stored in the reference value lower bits
@@ -40,6 +40,7 @@ MOSdump_prefix(Client cntxt, MOStask tas
void *val = (void*)(((char*) blk) + MosaicBlkSize);
mnstr_printf(cntxt->fdout,"#prefix "BUNFMT" ", MOSgetCnt(blk));
+
switch(task->type){
case TYPE_bte:
mnstr_printf(cntxt->fdout,"bte %hhd", *(bte*) val); break;
@@ -143,7 +144,7 @@ MOSadvance_prefix(Client cntxt, MOStask
bits = (int)(val & (~mask));
bytes = wordaligned(2 * sizeof(unsigned char),int);
bytes +=
wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int);
- task->blk = (MosaicBlk) (((char*) dst) +
wordaligned(bytes, int));
+ task->blk = (MosaicBlk) (((char*) dst) + bytes);
}
break;
case 2:
@@ -153,7 +154,7 @@ MOSadvance_prefix(Client cntxt, MOStask
bits = (int)(val & (~mask));
bytes = wordaligned(2 * sizeof(unsigned short),int);
bytes +=
wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int);
- task->blk = (MosaicBlk) (((char*) dst) +
wordaligned(bytes, int));
+ task->blk = (MosaicBlk) (((char*) dst) + bytes);
}
break;
case 4:
@@ -173,7 +174,7 @@ MOSadvance_prefix(Client cntxt, MOStask
bits = (int)(val & (~mask));
bytes = wordaligned(2 * sizeof(ulng),int);
bytes +=
wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int);
- task->blk = (MosaicBlk) (((char*) dst) +
wordaligned(bytes, int));
+ task->blk = (MosaicBlk) (((char*) dst) + bytes);
}
}
#ifdef _DEBUG_MOSAIC_
@@ -189,7 +190,137 @@ MOSskip_prefix(Client cntxt, MOStask tas
task->blk = 0; // ENDOFLIST
}
-// Find common prefix
+// logarithmic search for common prefix in a given block
+// use static prefix mask attempts
+
+static void
+findPrefixBit(Client cntxt, unsigned char *v, int limit, int *bits, unsigned
char *prefixmask)
+{
+ int i, step = 8, width = 0;
+ unsigned char prefix, mask;
+ do{
+ step /=2;
+ mask = 1 ;
+ for( i=0; i < 8 - 1 - (width +step); i++)
+ mask = (mask <<1) | 1;
+ mask = ~mask;
+ prefix = v[0] & mask;
+ for(i=0; i< limit; i++)
+ if( (v[i] & mask) != prefix)
+ break;
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o
%d\n", width, step, mask, limit-i);
+#endif
+ if( i == limit){
+ width += step;
+ *bits = width;
+ *prefixmask = mask;
+ }
+ } while (step > 1);
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+ (void) cntxt;
+#endif
+}
+
+static void
+findPrefixSht(Client cntxt, unsigned short *v, int limit, int *bits, unsigned
short *prefixmask)
+{
+ int i, step = 16, width = 0;
+ unsigned short prefix, mask;
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix start %u %d \n", *v, *bits);
+#endif
+ do{
+ step /=2;
+ mask = 1 ;
+ for( i=0; i < 16-1 - (width +step); i++)
+ mask = (mask <<1) | 1;
+ mask = ~mask;
+ prefix = v[0] & mask;
+ for(i=0; i< limit; i++)
+ if( (v[i] & mask) != prefix)
+ break;
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o
%d\n", width, step, mask, limit-i);
+#endif
+ if( i == limit){
+ width += step;
+ *bits = width;
+ *prefixmask = mask;
+ }
+ } while (step > 1);
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+ (void) cntxt;
+#endif
+}
+
+static void
+findPrefixInt(Client cntxt, unsigned int *v, int limit, int *bits, unsigned
int *prefixmask)
+{
+ int i, step = 32, width = 0;
+ unsigned int prefix, mask;
+ do{
+ step /=2;
+ mask = 1 ;
+ for( i=0; i < 32-1 - (width +step); i++)
+ mask = (mask <<1) | 1;
+ mask = ~mask;
+ prefix = v[0] & mask;
+ for(i=0; i< limit; i++)
+ if( (v[i] & mask) != prefix)
+ break;
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o
%d\n", width, step, mask, limit-i);
+#endif
+ if( i == limit){
+ width += step;
+ *bits = width;
+ *prefixmask = mask;
+ }
+ } while (step > 1);
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+ (void) cntxt;
+#endif
+}
+
+static void
+findPrefixLng(Client cntxt, ulng *v, int limit, int *bits, ulng *prefixmask)
+{
+ int i, step = 64, width = 0;
+ ulng prefix, mask;
+ do{
+ step /=2;
+ mask = 1 ;
+ for( i=0; i < 64 - 1 - (width +step); i++)
+ mask = (mask <<1) | 1;
+ mask = ~mask;
+ prefix = v[0] & mask;
+ for(i=0; i< limit; i++)
+ if( (v[i] & mask) != prefix)
+ break;
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask
"LLFMT" %d\n", width, step, mask, limit-i);
+#endif
+ if( i == limit){
+ width += step;
+ *bits = width;
+ *prefixmask = mask;
+ }
+ } while (step > 1 && *bits < 32);
+ // we only use at most 32 bits as prefix due to bitvector implementation
+#ifdef _DEBUG_PREFIX_
+ mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+ (void) cntxt;
+#endif
+}
+
#define Prefix(Prefix,Mask,X,Y,N) \
{ int k, m = 1; \
for(k=0; k<N; k+=1, X>>=1, Y>>=1){\
@@ -201,6 +332,7 @@ MOSskip_prefix(Client cntxt, MOStask tas
}
+#define LOOKAHEAD (limit <10? limit:10)
// calculate the expected reduction
flt
MOSestimate_prefix(Client cntxt, MOStask task)
@@ -217,22 +349,13 @@ MOSestimate_prefix(Client cntxt, MOStask
if( task->elm >= 2)
switch(size){
case 1:
- { unsigned char *v = ((unsigned char*) task->src) +
task->start, *w= v+1, val= *v,val2= *w, mask;
- // search first non-identical value
- for(i = 0;i < limit-1; i++, w++)
- if( *v != *w ){
- val2 = *w;
- break;
- }
- // all are the same?
- if ( i == limit -1)
- break;
- Prefix(prefixbits, mask, val, val2, 8);
+ { unsigned char *v = ((unsigned char*) task->src) +
task->start, *w= v+1, val= *v, mask;
+ findPrefixBit(cntxt, v, LOOKAHEAD, &prefixbits, &mask);
if( prefixbits == 0)
break;
#ifdef _DEBUG_PREFIX_
- mnstr_printf(cntxt->fdout,"#prefix estimate 1 %o %o val %d bits
%d mask %o\n",
+ mnstr_printf(cntxt->fdout,"#prefix estimate size 1 %o %o val %d
bits %d mask %o\n",
*v,*w, val, prefixbits, mask);
#endif
if( task->range[MOSAIC_PREFIX] > task->start +1 /* need
at least two*/){
@@ -264,20 +387,12 @@ MOSestimate_prefix(Client cntxt, MOStask
}
break;
case 2:
- { unsigned short *v = ((unsigned short*) task->src) +
task->start, *w= v+1, val= *v,val2= *w, mask;
- // search first non-identical value
- for(i = 0;i < limit-1;i++, w++)
- if( *v != *w ){
- val2 = *w;
- break;
- }
- if ( i == limit-1)
- break;
- Prefix(prefixbits, mask, val, val2, 16);
+ { unsigned short *v = ((unsigned short*) task->src) +
task->start, *w= v+1, val= *v, mask;
+ findPrefixSht(cntxt, v, LOOKAHEAD, &prefixbits, &mask);
if( prefixbits == 0)
break;
#ifdef _DEBUG_PREFIX_
- mnstr_printf(cntxt->fdout,"#prefix estimate 2 %o %o elm "BUNFMT"
val %d bits %d mask %o\n",
+ mnstr_printf(cntxt->fdout,"#prefix estimate size 2 %u %o elm
"BUNFMT" val %d bits %d mask %o\n",
*v,*w, i,val, prefixbits, mask);
#endif
@@ -309,22 +424,14 @@ MOSestimate_prefix(Client cntxt, MOStask
}
break;
case 4:
- { unsigned int *v = ((unsigned int*) task->src) +
task->start, *w= v+1, val= *v,val2= *w, mask;
- // search first non-identical value
- for(i = 0;i < limit-1 ;i++, w++)
- if( *v != *w ){
- val2 = *w;
- break;
- }
- if ( i == limit-1)
- break;
- Prefix(prefixbits, mask, val, val2, 32);
+ { unsigned int *v = ((unsigned int*) task->src) +
task->start, *w= v+1, val= *v, mask;
+ findPrefixInt(cntxt, v, LOOKAHEAD, &prefixbits,&mask);
if( prefixbits == 0)
break;
#ifdef _DEBUG_PREFIX_
- mnstr_printf(cntxt->fdout,"#prefix estimate 4 %o %o elm "BUNFMT"
val %d bits %d mask %o\n",
- *v,*w, i, val, prefixbits, mask);
+ mnstr_printf(cntxt->fdout,"#prefix estimate size 4 %o elm "BUNFMT"
val %d bits %d mask %o\n",
+ *v, i, val, prefixbits, mask);
#endif
if( task->range[MOSAIC_PREFIX] > task->start + 1){
bits = (task->range[MOSAIC_PREFIX] -
task->start) * (32-prefixbits);
@@ -355,17 +462,9 @@ MOSestimate_prefix(Client cntxt, MOStask
}
break;
case 8:
- { ulng *v = ((ulng*) task->src) + task->start, *w= v+1,
val= *v,val2= *w, mask;
- // search first non-identical value
- for(i = 0; i < limit-1 ;i++, w++)
- if( *v != *w ){
- val2 = *w;
- break;
- }
- if ( i == limit-1 )
- break;
- Prefix(prefixbits, mask, val, val2, 32); // bitvector
has limit of 32-bit fields
- if( prefixbits == 0 || 64-prefixbits >= 32)
+ { ulng *v = ((ulng*) task->src) + task->start, *w= v+1,
val= *v,mask;
+ findPrefixLng(cntxt, v, LOOKAHEAD, &prefixbits,&mask);
+ if( prefixbits == 0 )
break;
if( task->range[MOSAIC_PREFIX] > task->start + 1){
@@ -394,7 +493,7 @@ MOSestimate_prefix(Client cntxt, MOStask
factor = ( (flt)i * sizeof(lng))/ store;
}
}
-#ifdef _DEBUG_MOSAIC_
+#ifdef _DEBUG_PREFIX_
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list