In perl.git, the branch yves/superfasthash5 has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/d7aa6a58fb3cf5079424556f831c7051cc820c6a?hp=6b3e2080fb0e74ed46bca0c58d75d9dd9dd9d8a2>

- Log -----------------------------------------------------------------
commit d7aa6a58fb3cf5079424556f831c7051cc820c6a
Author: Yves Orton <[email protected]>
Date:   Mon Sep 10 17:45:43 2012 +0200

    tweaks to PERL_HASH logic (experimental)

M       hv.h

commit 585782958a0e7ec7fec9dc4634106439628e4ed0
Author: Yves Orton <[email protected]>
Date:   Mon Sep 10 17:45:11 2012 +0200

    experimental crap to play with how the hash logic allocates buckets

M       hv.c

commit 064454668c9c3bb81c99f1b23776d9ada6305c86
Author: Yves Orton <[email protected]>
Date:   Mon Sep 10 17:39:13 2012 +0200

    add routines for introspecting hash utilization to the Internals namespace
    
        my ($keys,$buckets,$used_buckets, $max_used_bucket_chain,
        $min_used_bucket_chain, $average_length_of_used_bucket_chain,
        $stddev_of_used_bucket_chain, $bucket_length_counts_array_ref)=
            Internals::bucket_info($hashref);
    
        my ($key_info_array)= Internals::bucket_array($hashref);
    
    bucket_array() returns an AoA of keys per bucket in the order they are
    stored in the buckets. It is "useful" for introspecting the actual
    bucket orders.
    
    bucket_info() returns a detailed summary of the utilization and
    statistics of the hash.
    
    Work In Progress.

M       universal.c
-----------------------------------------------------------------------

Summary of changes:
 hv.c        |   74 ++++++++++++++++++-------------
 hv.h        |    6 +-
 universal.c |  140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+), 33 deletions(-)

diff --git a/hv.c b/hv.c
index b5e3d91..b33ee52 100644
--- a/hv.c
+++ b/hv.c
@@ -793,37 +793,51 @@ Perl_hv_common(pTHX_ HV *hv, SV *keysv, const char *key, 
STRLEN klen,
     else                                       /* gotta do the real thing */
        HeKEY_hek(entry) = save_hek_flags(key, klen, hash, flags);
     HeVAL(entry) = val;
-    HeNEXT(entry) = *oentry;
-    *oentry = entry;
-
-    if (val == &PL_sv_placeholder)
-       HvPLACEHOLDERS(hv)++;
-    if (masked_flags & HVhek_ENABLEHVKFLAGS)
-       HvHASKFLAGS_on(hv);
-
     {
-       const HE *counter = HeNEXT(entry);
-
-       xhv->xhv_keys++; /* HvTOTALKEYS(hv)++ */
-       if (!counter) {                         /* initial entry? */
-       } else if (xhv->xhv_keys > xhv->xhv_max) {
-               /* Use only the old HvUSEDKEYS(hv) > HvMAX(hv) condition to 
limit
-                  bucket splits on a rehashed hash, as we're not going to
-                  split it again, and if someone is lucky (evil) enough to
-                  get all the keys in one list they could exhaust our memory
-                  as we repeatedly double the number of buckets on every
-                  entry. Linear search feels a less worse thing to do.  */
-           hsplit(hv);
-       } else if(!HvREHASH(hv)) {
-           U32 n_links = 1;
-
-           while ((counter = HeNEXT(counter)))
-               n_links++;
-
-           if (n_links > HV_MAX_LENGTH_BEFORE_SPLIT) {
-               hsplit(hv);
-           }
-       }
+        U32 n_links = 1;
+
+        if (0 && key) {
+            while ( *oentry && (
+                    HeHASH(*oentry) <= hash &&
+                    HeKLEN(*oentry) <= HeKLEN(entry) &&
+                    strLT(HeKEY(*oentry),HeKEY(entry)) ) )
+            {
+                oentry=&(HeNEXT(*oentry));
+                n_links++;
+            }
+        }
+        HeNEXT(entry) = *oentry;
+        *oentry = entry;
+
+        if (val == &PL_sv_placeholder)
+            HvPLACEHOLDERS(hv)++;
+        if (masked_flags & HVhek_ENABLEHVKFLAGS)
+            HvHASKFLAGS_on(hv);
+
+        {
+            const HE *counter = HeNEXT(entry);
+
+            xhv->xhv_keys++; /* HvTOTALKEYS(hv)++ */
+            if (!counter && n_links == 1) {
+                /* empty bucket, so dont bother splitting now */
+            } else if ( xhv->xhv_keys > xhv->xhv_max ) {
+                    /* Use only the old HvUSEDKEYS(hv) > HvMAX(hv) condition 
to limit
+                       bucket splits on a rehashed hash, as we're not going to
+                       split it again, and if someone is lucky (evil) enough to
+                       get all the keys in one list they could exhaust our 
memory
+                       as we repeatedly double the number of buckets on every
+                       entry. Linear search feels a less worse thing to do.  */
+                hsplit(hv);
+            } else if(counter && !HvREHASH(hv)) {
+
+                while ((counter = HeNEXT(counter)))
+                    n_links++;
+
+                if (n_links > HV_MAX_LENGTH_BEFORE_SPLIT) {
+                    hsplit(hv);
+                }
+            }
+        }
     }
 
     if (return_svp) {
diff --git a/hv.h b/hv.h
index 491d899..13ab73a 100644
--- a/hv.h
+++ b/hv.h
@@ -141,7 +141,7 @@ struct xpvhv {
  * perls hashing algorithm), if it is not defined we use the "Super Fast"
  * hash from Paul Hsieh
  */
-#undef HASH_FUNC_ONE_AT_A_TIME
+#define HASH_FUNC_ONE_AT_A_TIME
 
 #ifdef HASH_FUNC_ONE_AT_A_TIME
 /* FYI: This is the "One-at-a-Time" algorithm by Bob Jenkins
@@ -152,7 +152,7 @@ struct xpvhv {
         const char * const s_PeRlHaSh_tmp = (str); \
        const unsigned char *s_PeRlHaSh = (const unsigned char 
*)s_PeRlHaSh_tmp; \
         I32 i_PeRlHaSh = (len); \
-        U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : PERL_HASH_SEED); \
+        U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : PERL_HASH_SEED) + 
len; \
         assert(hash_PeRlHaSh!=0);                    \
        while (i_PeRlHaSh--) { \
            hash_PeRlHaSh += *s_PeRlHaSh++; \
@@ -186,7 +186,7 @@ struct xpvhv {
         register const char * const strtmp_PeRlHaSh = (str); \
         register const unsigned char *str_PeRlHaSh = (const unsigned char 
*)strtmp_PeRlHaSh; \
         register U32 len_PeRlHaSh = (len); \
-        register U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : 
PERL_HASH_SEED) ^ len; \
+        register U32 hash_PeRlHaSh = ((internal) ? PL_rehash_seed : 
PERL_HASH_SEED) + len; \
         register U32 tmp_PeRlHaSh; \
         register int rem_PeRlHaSh= len_PeRlHaSh & 3; \
         len_PeRlHaSh >>= 2; \
diff --git a/universal.c b/universal.c
index da7b50e..bec0cd3 100644
--- a/universal.c
+++ b/universal.c
@@ -1140,6 +1140,144 @@ XS(XS_Internals_PERL_HASH)        /* Subject to change  
*/
     Perl_croak(aTHX_ "Internals::PERL_HASH $hashref, $value");
 }
 
+XS(XS_Internals_bucket_info)
+{
+    dVAR;
+    dXSARGS;
+    PERL_UNUSED_ARG(cv);
+    if (items == 1 && SvROK(ST(0)) && SvTYPE(SvRV(ST(0)))==SVt_PVHV && 
!SvMAGICAL(SvRV(ST(0)))) {
+        const HV * const hv = (const HV *) SvRV(POPs);
+        U32 max_bucket= HvMAX(hv);
+        U32 total_keys= HvUSEDKEYS(hv);
+        U32 used_buckets= 0;
+        U32 max_buckets= 0;
+        U32 min_buckets= 0;
+        U32 i,j;
+        NV sum_sq= 0;
+        NV sum= 0;
+        HE **he_ptr= HvARRAY(hv);
+        HE *he;
+        NV stddev;
+        U32 n= total_keys;
+        AV *histo= newAV();
+        SV **histo_raw;
+        av_store(histo,0,newSViv(0));
+        histo_raw= AvARRAY(histo);
+
+        for ( i= 0; i <= max_bucket; i++ ) {
+            U32 count= 0;
+            for (he= he_ptr[i]; he; he= HeNEXT(he) ) {
+                if (HeVAL(he) != &PL_sv_placeholder) {
+                    count++;
+                    if (!--n) {
+                        i= max_bucket+1;
+                        break;
+                    }
+                }
+            }
+            if (count) {
+                if (used_buckets) {
+                    if ( max_buckets < count ) {
+                        for (j= max_buckets+1; j <= count; j++) {
+                            av_store(histo,j,newSViv(0));
+                        }
+                        histo_raw= AvARRAY(histo);
+                        max_buckets= count;
+                    }
+                    if ( min_buckets > count )
+                        min_buckets= count;
+                } else {
+                    for (j= 0; j <= count; j++) {
+                        av_store(histo,j,newSViv(0));
+                    }
+                    histo_raw= AvARRAY(histo);
+                    max_buckets= count;
+                    min_buckets= count;
+                }
+                used_buckets++;
+                sum_sq+= (count * count);
+                sum += count;
+            }
+            SvIVX(histo_raw[count])++;
+        }
+        mXPUSHi(total_keys);
+        mXPUSHi(max_bucket+1);
+        mXPUSHi(used_buckets);
+
+        mXPUSHi(max_buckets);
+        mXPUSHi(min_buckets);
+
+        mXPUSHn(sum/used_buckets);
+        mXPUSHn((sum_sq - ((sum * sum)/(NV)used_buckets))/(NV)used_buckets);
+        mXPUSHs(newRV_noinc(histo));
+        XSRETURN(8);
+    } else {
+        Perl_croak(aTHX_ "Internals::bucket_info takes only one argument, a 
$hashref");
+    }
+}
+
+XS(XS_Internals_bucket_array)
+{
+    dVAR;
+    dXSARGS;
+    PERL_UNUSED_ARG(cv);
+    if (items == 1 && SvROK(ST(0))) {
+        const HV * const hv = (const HV *) SvRV(ST(0));
+        if (SvTYPE(hv) == SVt_PVHV) {
+            U32 n, i, max;
+            AV *info_av;
+            HE **he_ptr;
+            HE *he;
+            if (SvMAGICAL(hv)) {
+                Perl_croak(aTHX_ "Internals::bucket_info only works on 
'normal' hashes");
+            }
+            info_av= newAV();
+            n= HvUSEDKEYS(hv);
+            he_ptr= HvARRAY(hv);
+            max= HvMAX(hv);
+            av_store(info_av,max,&PL_sv_undef);
+            ST(0)= sv_2mortal(newRV_noinc((SV*)info_av));
+            for ( i= 0; i <= max; i++ ) {
+                AV *key_av= NULL;
+                for (he= he_ptr[i]; he; he= HeNEXT(he) ) {
+                    SV *key_sv;
+                    if (!key_av) {
+                        key_av= newAV();
+                        av_store(info_av,i,(SV *)newRV_noinc((SV *)key_av));
+                    }
+                    if (HeVAL(he) != &PL_sv_placeholder) {
+                        char *str;
+                        STRLEN len;
+                        char mode;
+                        if (HeKLEN(he) == HEf_SVKEY) {
+                            SV *sv= HeSVKEY(he);
+                            SvGETMAGIC(sv);
+                            str= SvPV(sv, len);
+                            mode= SvUTF8(sv) ? 1 : 0;
+                        } else {
+                            str= HeKEY(he);
+                            len= HeKLEN(he);
+                            mode= HeKUTF8(he) ? 1 : 0;
+                        }
+                        key_sv= newSVpvn(str,len);
+                        if (mode) {
+                            SvUTF8_on(key_sv);
+                        }
+                    } else {
+                        key_sv= newSV_type(SVt_NULL);
+                    }
+                    av_push(key_av, key_sv);
+                }
+            }
+        }
+        else {
+            Perl_croak(aTHX_ "Internals::bucket_info only works on 'normal' 
hashes");
+        }
+    } else {
+        Perl_croak(aTHX_ "Internals::bucket_info takes only one argument, a 
$hashref");
+    }
+}
+
 XS(XS_re_is_regexp)
 {
     dVAR; 
@@ -1420,6 +1558,8 @@ struct xsub_details details[] = {
     {"Internals::rehash_seed", XS_Internals_rehash_seed, ""},
     {"Internals::HvREHASH", XS_Internals_HvREHASH, "\\%"},
     {"Internals::PERL_HASH", XS_Internals_PERL_HASH, "\\%$"},
+    {"Internals::bucket_array", XS_Internals_bucket_array, "$"},
+    {"Internals::bucket_info", XS_Internals_bucket_info, "$"},
     {"re::is_regexp", XS_re_is_regexp, "$"},
     {"re::regname", XS_re_regname, ";$$"},
     {"re::regnames", XS_re_regnames, ";$"},

--
Perl5 Master Repository

Reply via email to