Update of /cvsroot/monetdb/pathfinder/runtime
In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv26648/runtime
Modified Files:
pathfinder.mx
Log Message:
- remove more skewed data from the value index and tune both creation and
lookup
Index: pathfinder.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/runtime/pathfinder.mx,v
retrieving revision 1.320
retrieving revision 1.321
diff -u -d -r1.320 -r1.321
--- pathfinder.mx 5 Apr 2007 17:07:39 -0000 1.320
+++ pathfinder.mx 6 Apr 2007 00:03:23 -0000 1.321
@@ -1858,6 +1858,25 @@
}
+PROC vx_reduce(BAT[oid,int] vxt, BAT[void,oid] vxp) : BAT[oid,oid]
+{
+ # sort on hash code, to make next step bearable
+ vxt := vxt.tsort();
+ vxp := vxt.hmark([EMAIL
PROTECTED]).leftfetchjoin(vxp).access(BAT_WRITE);
+ vxt := vxt.tmark([EMAIL PROTECTED]);
+
+ # eliminate too frequent values from the index
+ var lim := 16*log2(lng(count(vxt)));
+ var sel := histogram(vxt);
+ sel := [<](sel, lim);
+ sel := vxt.leftjoin(sel).tmark([EMAIL PROTECTED]).access(BAT_WRITE); #
true for infrequent values
+ var tmp := reverse(kunique(reverse(vxt))).project(true); # true for
first frequent value only
+ vxp :=
vxp.inplace(mirror(tmp).leftfetchjoin(sel).ord_uselect(false).project(oid_nil));
+ sel := sel.inplace(tmp).ord_uselect(true).mirror(); tmp := nil; #
select first hsh occurrence anyway
+
+ return reverse(sel.leftfetchjoin(vxt)).leftfetchjoin(vxp);
+}
+
# double-underscore __X() means we must have the RT_LOCK_FREELIST of this
collection
# this computes a new QN_NID and HSH_NID indices for a suffix of the PRE-table
@@ -1897,18 +1916,7 @@
var vxt := vxm.leftfetchjoin(prp); vxm := nil;
vxt := vxt.leftfetchjoin(prp_txt);
vxt := [hash](vxt).access(BAT_APPEND);
-
- # filter out highly repeated text values from the hsh_nid index
- var lim := log2(lng(count(vxt)));
- vxm := histogram(vxt);
- tmp := [<](vxm, lim*lim);
- tmp := vxt.leftjoin(tmp).tmark([EMAIL PROTECTED]).access(BAT_WRITE); #
true for infrequent values
- vxp := [ifthenelse](tmp, vxp, oid_nil); # key for infrequent,
nil for frequent
- vxm := kunique(reverse(vxt));
- vxm := reverse(vxm).project(true);
- tmp := tmp.inplace(vxm).uselect(true).hmark([EMAIL PROTECTED]); vxm :=
nil; # select first hsh occurrence anyway
- vxt := tmp.leftfetchjoin(vxt);
- vxp := tmp.leftfetchjoin(vxp); tmp := nil;
+ vxm := vx_reduce(vxt, vxp).access(BAT_WRITE); vxp := nil; vxt := nil;
# add the newly shredded attributes. The new attributes are >att and have
an attr_own > pre
tmp := attr_own.reverse().ord_select(att,oid_nil).reverse();
@@ -1919,22 +1927,19 @@
tmp := tmp.[swizzle](map_pid);
}
tmp := tmp.ord_uselect(pre,oid_nil).hmark([EMAIL PROTECTED]);
- vxm := [int](tmp.leftfetchjoin(attr_qn)).access(BAT_WRITE);
- vxm := [:rotate_xor_hash=](vxm, 27,
tmp.leftfetchjoin(attr_prp).leftfetchjoin(prp_val));
-
- # combine text/attribute index
- vxt := vxt.access(BAT_APPEND).append(vxm); vxm := nil;
- tmp := tmp.leftfetchjoin(attr_own);
- vxp := vxp.access(BAT_APPEND).append(tmp); tmp := nil;
- vxm := reverse(vxt).leftfetchjoin(vxp); vxp := nil; vxt := nil;
+ vxt := [int](tmp.leftfetchjoin(attr_qn)).access(BAT_WRITE);
+ vxt := [:rotate_xor_hash=](vxt, 13,
tmp.leftfetchjoin(attr_prp).leftfetchjoin(prp_val));
+ vxp := tmp.leftfetchjoin(attr_own); tmp := nil;
+ tmp := vx_reduce(vxt, vxp); vxp := nil; vxt := nil;
if (updatable) { # hash-index, created here each time document is first
used
- (idx := bat(oid, oid, (count(idx)*4)/3).insert(idx)).accbuild("hash");
- (vxm := bat(int, oid, (count(vxm)*4)/3).insert(vxm)).accbuild("hash");
+ (idx := bat(oid, oid, (count(idx)*8)/7).insert(idx)).accbuild("hash");
+ (vxm := bat(int, oid,
((count(vxm)+count(tmp))*8)/7).insert(vxm).insert(tmp)).accbuild("hash");
} else { # read-only case. indices are sorted (binary search) and
persistent
# idx [qn,pre] lexico-ordered (thanks to stable-sort)
idx :=
idx.access(BAT_WRITE).sorder().access(BAT_READ).rename(nme).mmap(1).persists(true);
- vxm :=
vxm.access(BAT_WRITE).order().access(BAT_READ).rename(vx_nme).mmap(1).persists(true);
+ tmp := merged_union(vxm.hmark([EMAIL PROTECTED]), tmp.hmark([EMAIL
PROTECTED]), vxm.tmark([EMAIL PROTECTED]), tmp.tmark([EMAIL PROTECTED])); vxm
:= nil;
+ vxm := reverse(tmp.fetch(0)).leftfetchjoin(tmp.fetch(1));
if (pre = [EMAIL PROTECTED])
CATCH(pf_checkpoint(bat(void,str).append(nme).append(vx_nme))); # immediate
commit
}
return new(void,bat).insert(nil,idx).insert(nil,vxm);
@@ -2217,10 +2222,11 @@
var iter_hsh := cross(iter_val, [int](qns));
var key_iter := iter_hsh.hmark([EMAIL PROTECTED]);
var key_hsh := iter_hsh.tmark([EMAIL
PROTECTED]).copy().access(BAT_WRITE);
- key_hsh := [:rotate_xor_hash=](key_hsh, 27,
key_iter.leftjoin(iter_val).tmark([EMAIL PROTECTED]));
+ key_hsh := [:rotate_xor_hash=](key_hsh, 13,
key_iter.leftjoin(iter_val).tmark([EMAIL PROTECTED]));
iter_hsh := reverse(key_iter).leftfetchjoin(key_hsh);
# lookup uses the hash-index on vx_hsh_nid; provide isolation with
_ins/_del bats
+CATCH(ERROR("pp"));
lock_set(coll_shortlock);
var err := CATCH(iter_cand := iter_hsh.leftjoin(vx_hsh_nid));
lock_unset(coll_shortlock);
@@ -2239,10 +2245,7 @@
if (count(iter_cand) = 0) return empty_bat;
# check against finding nil NIDs. These indicate attribute values
omitted from the index
- if (isnil(min(iter_cand))) {
- if (gettext) iter_cand := iter_cand.ord_select(oid_nil,oid_nil);
- else use_index := false; # was omitted, we have to scan
- }
+ if (isnil(min(iter_cand))) use_index := false; # was omitted, we have
to scan
}
if (not(use_index)) { # everything is a candidate
iter_cand := cross(iter_val,reverse(pre_kind));
@@ -2539,7 +2542,7 @@
BAT[oid,oid] qn,
BAT[oid,str] val) : BAT[int,oid]
{
- return reverse([:rotate_xor_hash=]([int](qn).access(BAT_WRITE), 27,
val)).leftfetchjoin(nid);
+ return reverse([:rotate_xor_hash=]([int](qn).access(BAT_WRITE), 13,
val)).leftfetchjoin(nid);
}
PROC vx_maintain(BAT[oid,oid] nid,
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Monetdb-pf-checkins mailing list
monetdb-pf-checkins@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins