In perl.git, the branch yves/superfasthash5 has been updated <http://perl5.git.perl.org/perl.git/commitdiff/d7aa6a58fb3cf5079424556f831c7051cc820c6a?hp=6b3e2080fb0e74ed46bca0c58d75d9dd9dd9d8a2>
- Log ----------------------------------------------------------------- commit d7aa6a58fb3cf5079424556f831c7051cc820c6a Author: Yves Orton <[email protected]> Date: Mon Sep 10 17:45:43 2012 +0200 tweaks to PERL_HASH logic (experimental) M hv.h commit 585782958a0e7ec7fec9dc4634106439628e4ed0 Author: Yves Orton <[email protected]> Date: Mon Sep 10 17:45:11 2012 +0200 experimental crap to play with how the hash logic allocates buckets M hv.c commit 064454668c9c3bb81c99f1b23776d9ada6305c86 Author: Yves Orton <[email protected]> Date: Mon Sep 10 17:39:13 2012 +0200 add routines for introspecting hash utilization to the Internals namespace my ($keys,$buckets,$used_buckets, $max_used_bucket_chain, $min_used_bucket_chain, $average_length_of_used_bucket_chain, $stddev_of_used_bucket_chain, $bucket_length_counts_array_ref)= Internals::bucket_info($hashref); my ($key_info_array)= Internals::bucket_array($hashref); bucket_array() returns an AoA of keys per bucket in the order they are stored in the buckets. It is "useful" for introspecting the actual bucket orders. bucket_into() returns a detailed summary of the utilization and statistics of the hash. Work In Progress. M universal.c ----------------------------------------------------------------------- Summary of changes: hv.c | 74 ++++++++++++++++++------------- hv.h | 6 +- universal.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 33 deletions(-) diff --git a/hv.c b/hv.c index b5e3d91..b33ee52 100644 --- a/hv.c +++ b/hv.c @@ -793,37 +793,51 @@ Perl_hv_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen, else /* gotta do the real thing */ HeKEY_hek(entry) = save_hek_flags(key, klen, hash, flags); HeVAL(entry) = val; - HeNEXT(entry) = *oentry; - *oentry = entry; - - if (val == &PL_sv_placeholder) - HvPLACEHOLDERS(hv)++; - if (masked_flags & HVhek_ENABLEHVKFLAGS) - HvHASKFLAGS_on(hv); - { - const HE *counter = HeNEXT(entry); - - xhv->xhv_keys++; /* HvTOTALKEYS(hv)++ */ - if (!counter) { /* initial entry? */ - } else if (xhv->xhv_keys > xhv->xhv_max) { - /* Use only the old HvUSEDKEYS(hv) > HvMAX(hv) condition to limit - bucket splits on a rehashed hash, as we're not going to - split it again, and if someone is lucky (evil) enough to - get all the keys in one list they could exhaust our memory - as we repeatedly double the number of buckets on every - entry. Linear search feels a less worse thing to do. */ - hsplit(hv); - } else if(!HvREHASH(hv)) { - U32 n_links = 1; - - while ((counter = HeNEXT(counter))) - n_links++; - - if (n_links > HV_MAX_LENGTH_BEFORE_SPLIT) { - hsplit(hv); - } - } + U32 n_links = 1; + + if (0 && key) { + while ( *oentry && ( + HeHASH(*oentry) <= hash && + HeKLEN(*oentry) <= HeKLEN(entry) && + strLT(HeKEY(*oentry),HeKEY(entry)) ) ) + { + oentry=&(HeNEXT(*oentry)); + n_links++; + } + } + HeNEXT(entry) = *oentry; + *oentry = entry; + + if (val == &PL_sv_placeholder) + HvPLACEHOLDERS(hv)++; + if (masked_flags & HVhek_ENABLEHVKFLAGS) + HvHASKFLAGS_on(hv); + + { + const HE *counter = HeNEXT(entry); + + xhv->xhv_keys++; /* HvTOTALKEYS(hv)++ */ + if (!counter && n_links == 1) { + /* empty bucket, so dont bother splitting now */ + } else if ( xhv->xhv_keys > xhv->xhv_max ) { + /* Use only the old HvUSEDKEYS(hv) > HvMAX(hv) condition to limit + bucket splits on a rehashed hash, as we're not going to + split it again, and if someone is lucky (evil) enough to + get all the keys in one list they could exhaust our memory + as we repeatedly double the number of buckets on every + entry. Linear search feels a less worse thing to do. */ + hsplit(hv); + } else if(counter && !HvREHASH(hv)) { + + while ((counter = HeNEXT(counter))) + n_links++; + + if (n_links > HV_MAX_LENGTH_BEFORE_SPLIT) { + hsplit(hv); + } + } + } } if (return_svp) { diff --git a/hv.h b/hv.h index 491d899..13ab73a 100644 --- a/hv.h +++ b/hv.h @@ -141,7 +141,7 @@ struct xpvhv { * perls hashing algorithm), if it is not defined we use the "Super Fast" * hash from Paul Hsieh */ -#undef HASH_FUNC_ONE_AT_A_TIME +#define HASH_FUNC_ONE_AT_A_TIME #ifdef HASH_FUNC_ONE_AT_A_TIME /* FYI: This is the "One-at-a-Time" algorithm by Bob Jenkins @@ -152,7 +152,7 @@ struct xpvhv { const char * const s_PeRlHaSh_tmp = (str); \ const unsigned char *s_PeRlHaSh = (const unsigned char *)s_PeRlHaSh_tmp; \ I32 i_PeRlHaSh = (len); \ - U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : PERL_HASH_SEED); \ + U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : PERL_HASH_SEED) + len; \ assert(hash_PeRlHaSh!=0); \ while (i_PeRlHaSh--) { \ hash_PeRlHaSh += *s_PeRlHaSh++; \ @@ -186,7 +186,7 @@ struct xpvhv { register const char * const strtmp_PeRlHaSh = (str); \ register const unsigned char *str_PeRlHaSh = (const unsigned char *)strtmp_PeRlHaSh; \ register U32 len_PeRlHaSh = (len); \ - register U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : PERL_HASH_SEED) ^ len; \ + register U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : PERL_HASH_SEED) + len; \ register U32 tmp_PeRlHaSh; \ register int rem_PeRlHaSh= len_PeRlHaSh & 3; \ len_PeRlHaSh >>= 2; \ diff --git a/universal.c b/universal.c index da7b50e..bec0cd3 100644 --- a/universal.c +++ b/universal.c @@ -1140,6 +1140,144 @@ XS(XS_Internals_PERL_HASH) /* Subject to change */ Perl_croak(aTHX_ "Internals::PERL_HASH $hashref, $value"); } +XS(XS_Internals_bucket_info) +{ + dVAR; + dXSARGS; + PERL_UNUSED_ARG(cv); + if (items == 1 && SvROK(ST(0)) && SvTYPE(SvRV(ST(0)))==SVt_PVHV && !SvMAGICAL(SvRV(ST(0)))) { + const HV * const hv = (const HV *) SvRV(POPs); + U32 max_bucket= HvMAX(hv); + U32 total_keys= HvUSEDKEYS(hv); + U32 used_buckets= 0; + U32 max_buckets= 0; + U32 min_buckets= 0; + U32 i,j; + NV sum_sq= 0; + NV sum= 0; + HE **he_ptr= HvARRAY(hv); + HE *he; + NV stddev; + U32 n= total_keys; + AV *histo= newAV(); + SV **histo_raw; + av_store(histo,0,newSViv(0)); + histo_raw= AvARRAY(histo); + + for ( i= 0; i <= max_bucket; i++ ) { + U32 count= 0; + for (he= he_ptr[i]; he; he= HeNEXT(he) ) { + if (HeVAL(he) != &PL_sv_placeholder) { + count++; + if (!--n) { + i= max_bucket+1; + break; + } + } + } + if (count) { + if (used_buckets) { + if ( max_buckets < count ) { + for (j= max_buckets+1; j <= count; j++) { + av_store(histo,j,newSViv(0)); + } + histo_raw= AvARRAY(histo); + max_buckets= count; + } + if ( min_buckets > count ) + min_buckets= count; + } else { + for (j= 0; j <= count; j++) { + av_store(histo,j,newSViv(0)); + } + histo_raw= AvARRAY(histo); + max_buckets= count; + min_buckets= count; + } + used_buckets++; + sum_sq+= (count * count); + sum += count; + } + SvIVX(histo_raw[count])++; + } + mXPUSHi(total_keys); + mXPUSHi(max_bucket+1); + mXPUSHi(used_buckets); + + mXPUSHi(max_buckets); + mXPUSHi(min_buckets); + + mXPUSHn(sum/used_buckets); + mXPUSHn((sum_sq - ((sum * sum)/(NV)used_buckets))/(NV)used_buckets); + mXPUSHs(newRV_noinc(histo)); + XSRETURN(8); + } else { + Perl_croak(aTHX_ "Internals::bucket_info takes only one argument, a $hashref"); + } +} + +XS(XS_Internals_bucket_array) +{ + dVAR; + dXSARGS; + PERL_UNUSED_ARG(cv); + if (items == 1 && SvROK(ST(0))) { + const HV * const hv = (const HV *) SvRV(ST(0)); + if (SvTYPE(hv) == SVt_PVHV) { + U32 n, i, max; + AV *info_av; + HE **he_ptr; + HE *he; + if (SvMAGICAL(hv)) { + Perl_croak(aTHX_ "Internals::bucket_info only works on 'normal' hashes"); + } + info_av= newAV(); + n= HvUSEDKEYS(hv); + he_ptr= HvARRAY(hv); + max= HvMAX(hv); + av_store(info_av,max,&PL_sv_undef); + ST(0)= sv_2mortal(newRV_noinc((SV*)info_av)); + for ( i= 0; i <= max; i++ ) { + AV *key_av= NULL; + for (he= he_ptr[i]; he; he= HeNEXT(he) ) { + SV *key_sv; + if (!key_av) { + key_av= newAV(); + av_store(info_av,i,(SV *)newRV_noinc((SV *)key_av)); + } + if (HeVAL(he) != &PL_sv_placeholder) { + char *str; + STRLEN len; + char mode; + if (HeKLEN(he) == HEf_SVKEY) { + SV *sv= HeSVKEY(he); + SvGETMAGIC(sv); + str= SvPV(sv, len); + mode= SvUTF8(sv) ? 1 : 0; + } else { + str= HeKEY(he); + len= HeKLEN(he); + mode= HeKUTF8(he) ? 1 : 0; + } + key_sv= newSVpvn(str,len); + if (mode) { + SvUTF8_on(key_sv); + } + } else { + key_sv= newSV_type(SVt_NULL); + } + av_push(key_av, key_sv); + } + } + } + else { + Perl_croak(aTHX_ "Internals::bucket_info only works on 'normal' hashes"); + } + } else { + Perl_croak(aTHX_ "Internals::bucket_info takes only one argument, a $hashref"); + } +} + XS(XS_re_is_regexp) { dVAR; @@ -1420,6 +1558,8 @@ struct xsub_details details[] = { {"Internals::rehash_seed", XS_Internals_rehash_seed, ""}, {"Internals::HvREHASH", XS_Internals_HvREHASH, "\\%"}, {"Internals::PERL_HASH", XS_Internals_PERL_HASH, "\\%$"}, + {"Internals::bucket_array", XS_Internals_bucket_array, "$"}, + {"Internals::bucket_info", XS_Internals_bucket_info, "$"}, {"re::is_regexp", XS_re_is_regexp, "$"}, {"re::regname", XS_re_regname, ";$$"}, {"re::regnames", XS_re_regnames, ";$"}, -- Perl5 Master Repository
