In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/615a2e7f2463ed7fed1d6e1819306477ba1709d1?hp=b94d11d663eee9a5d693a9ebb9cdad0f6ba6b881>
- Log ----------------------------------------------------------------- commit 615a2e7f2463ed7fed1d6e1819306477ba1709d1 Author: Yves Orton <[email protected]> Date: Sat Apr 19 14:42:47 2014 +0200 regcomp.c - cleanup the ahocorasick start class logic so it is more self-documenting The logic of setting up an AHO-CORASICK regex start class was not fully encapsulated in the make_trie_failtable() function, which itself was poorly named. Merged the code into make_trie_failtable() and renamed it to construct_ahocorasick_from_trie(). M embed.fnc M embed.h M proto.h M regcomp.c commit 3eaa3d146cbf0b603941d8c6dac35bdb13d79ee6 Author: Yves Orton <[email protected]> Date: Sun Apr 13 13:29:44 2014 +0200 Hash::Util - fixes to hash stats and add bucket_stats_formatted() * we should do the mean/stddev on the occupied buckets not all buckets. This was always intended to be the average chain-length, which implies that empty buckets with no-chains at all are excluded. * Add bucket_stats_formatted(), bump version Creates reports like this: Keys: 500 Buckets: 314/512 Quality-Score: 1.01 (Good) Utilized Buckets: 61.33% Optimal: 97.66% Keys In Collision: 37.20% Chain Length - mean: 1.59 stddev: 0.81 Buckets 512 [0000000000000000000000000111111111111111111111122222222222233334] Len 0 Pct: 38.67 [#########################] Len 1 Pct: 34.57 [######################] Len 2 Pct: 19.53 [############] Len 3 Pct: 5.47 [####] Len 4 Pct: 1.17 [#] Len 5 Pct: 0.59 [] Keys 500 [1111111111111111111111111111111111111111222222222222222222333334] Pos 1 Pct: 62.80 [########################################] Pos 2 Pct: 27.40 [##################] Pos 3 Pct: 7.40 [#####] Pos 4 Pct: 1.80 [#] Pos 5 Pct: 0.60 [] * Make it possible to get stats on PL_strtab * bump version to 0.17 M ext/Hash-Util/Changes M ext/Hash-Util/Util.xs M ext/Hash-Util/lib/Hash/Util.pm M pod/perldelta.pod commit 54e07e2b21cb1f58c04d67bca2a311715ba8815e Author: Yves Orton <[email protected]> Date: Sun Apr 13 12:54:12 2014 +0200 hv_func.h 
- fix seed initialization in sdbm and djb2 hashing algorithms. In a previous commit I added code to "mix in" the length of the string into the seed used by these functions, to avoid issues with zero seeds, and with the hope that it makes it harder to create multicollision attacks against these hash functions. Unfortunately when I restructured the seed logic for the inline functions in hv_func.h I messed it up, and these hash functions were broken. I never noticed because they are both such bad hash functions for our needs that I never built with them, and we have no infrastructure to make it easy to test building with non-standard hash functions so it never got automatically tested. Hopefully at some point someone will find a round-tuit and teach Configure about selecting alternate hash functions. M hv_func.h commit 3ca75eca84b9d0987b69e271b2c50cae574df77e Author: Yves Orton <[email protected]> Date: Fri Mar 21 17:47:45 2014 +0100 universal.c - utf8::downgrade($x,FAIL_OK) is not supposed to treat FAIL_OK as an integer M pod/perldelta.pod M universal.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 5 +- embed.h | 2 +- ext/Hash-Util/Changes | 15 +++-- ext/Hash-Util/Util.xs | 14 ++++- ext/Hash-Util/lib/Hash/Util.pm | 131 +++++++++++++++++++++++++++++++++++++---- hv_func.h | 4 +- pod/perldelta.pod | 16 +++++ proto.h | 13 ++-- regcomp.c | 49 ++++++++------- universal.c | 2 +- 10 files changed, 196 insertions(+), 55 deletions(-) diff --git a/embed.fnc b/embed.fnc index 85dbdf4..159f26c 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2156,9 +2156,8 @@ Es |I32 |make_trie |NN RExC_state_t *pRExC_state \ |NN regnode *startbranch|NN regnode *first \ |NN regnode *last|NN regnode *tail \ |U32 word_count|U32 flags|U32 depth -Es |void |make_trie_failtable |NN RExC_state_t *pRExC_state \ - |NN regnode *source|NN regnode *stclass \ - |U32 depth +Es |regnode *|construct_ahocorasick_from_trie|NN RExC_state_t *pRExC_state \ + |NN 
regnode *source|U32 depth # ifdef DEBUGGING Es |void |regdump_intflags|NULLOK const char *lead| const U32 flags Es |void |regdump_extflags|NULLOK const char *lead| const U32 flags diff --git a/embed.h b/embed.h index ca9983d..ca1b91b 100644 --- a/embed.h +++ b/embed.h @@ -917,6 +917,7 @@ #define add_data S_add_data #define alloc_maybe_populate_EXACT(a,b,c,d,e,f) S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e,f) #define compute_EXACTish(a) S_compute_EXACTish(aTHX_ a) +#define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c) #define could_it_be_a_POSIX_class(a) S_could_it_be_a_POSIX_class(aTHX_ a) #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) #define get_invlist_iter_addr(a) S_get_invlist_iter_addr(aTHX_ a) @@ -938,7 +939,6 @@ #define invlist_trim(a) S_invlist_trim(aTHX_ a) #define join_exact(a,b,c,d,e,f,g) S_join_exact(aTHX_ a,b,c,d,e,f,g) #define make_trie(a,b,c,d,e,f,g,h) S_make_trie(aTHX_ a,b,c,d,e,f,g,h) -#define make_trie_failtable(a,b,c,d) S_make_trie_failtable(aTHX_ a,b,c,d) #define nextchar(a) S_nextchar(aTHX_ a) #define parse_lparen_question_flags(a) S_parse_lparen_question_flags(aTHX_ a) #define populate_ANYOF_from_invlist(a,b) S_populate_ANYOF_from_invlist(aTHX_ a,b) diff --git a/ext/Hash-Util/Changes b/ext/Hash-Util/Changes index 06589b5..ddef72c 100644 --- a/ext/Hash-Util/Changes +++ b/ext/Hash-Util/Changes @@ -1,9 +1,12 @@ Revision history for Perl extension Hash::Util. -0.05 +0.17 + Add bucket_stats_formatted() as utility method to Hash::Util + Bug fixes to hash_stats() -Pre /ext version of the code. By Michael G Schwern <[email protected]> -on top of code by Nick Ing-Simmons and Jeffrey Friedl. +0.07 Sun Jun 11 21:24:15 CEST 2006 + - added front-end support for the new Hash::Util::FieldHash + (Anno Siegel) 0.06 Thu Mar 25 20:26:32 2004 - original XS version; created by h2xs 1.21 with options @@ -13,8 +16,8 @@ on top of code by Nick Ing-Simmons and Jeffrey Friedl. 
developed to support restricted hashes in Data::Dump::Streamer (shameless plug :-) +0.05 +Pre /ext version of the code. By Michael G Schwern <[email protected]> +on top of code by Nick Ing-Simmons and Jeffrey Friedl. -0.07 Sun Jun 11 21:24:15 CEST 2006 - - added front-end support for the new Hash::Util::FieldHash - (Anno Siegel) diff --git a/ext/Hash-Util/Util.xs b/ext/Hash-Util/Util.xs index 2758d69..3210200 100644 --- a/ext/Hash-Util/Util.xs +++ b/ext/Hash-Util/Util.xs @@ -128,8 +128,13 @@ bucket_info(rhv) nothing (the empty list). */ + const HV * hv; if (SvROK(rhv) && SvTYPE(SvRV(rhv))==SVt_PVHV && !SvMAGICAL(SvRV(rhv))) { - const HV * const hv = (const HV *) SvRV(rhv); + hv = (const HV *) SvRV(rhv); + } else if (!SvOK(rhv)) { + hv = PL_strtab; + } + if (hv) { U32 max_bucket_index= HvMAX(hv); U32 total_keys= HvUSEDKEYS(hv); HE **bucket_array= HvARRAY(hv); @@ -183,8 +188,13 @@ bucket_array(rhv) * of the hash store, combined with regular remappings means that relative * order of keys changes each remap. */ + const HV * hv; if (SvROK(rhv) && SvTYPE(SvRV(rhv))==SVt_PVHV && !SvMAGICAL(SvRV(rhv))) { - const HV * const hv = (const HV *) SvRV(rhv); + hv = (const HV *) SvRV(rhv); + } else if (!SvOK(rhv)) { + hv = PL_strtab; + } + if (hv) { HE **he_ptr= HvARRAY(hv); if (!he_ptr) { XSRETURN(0); diff --git a/ext/Hash-Util/lib/Hash/Util.pm b/ext/Hash-Util/lib/Hash/Util.pm index 8ae25d1..fb98d5a 100644 --- a/ext/Hash-Util/lib/Hash/Util.pm +++ b/ext/Hash-Util/lib/Hash/Util.pm @@ -29,12 +29,12 @@ our @EXPORT_OK = qw( hidden_ref_keys legal_ref_keys hash_seed hash_value hv_store - bucket_stats bucket_info bucket_array + bucket_stats bucket_stats_formatted bucket_info bucket_array lock_hash_recurse unlock_hash_recurse hash_traversal_mask ); -our $VERSION = '0.16'; +our $VERSION = '0.17'; require XSLoader; XSLoader::load(); @@ -523,21 +523,19 @@ See also bucket_stats() and bucket_array(). Returns a list of statistics about a hash. 
- my ($keys, buckets, $used, $utilization_ratio, $collision_pct, - $mean, $stddev, @length_counts) = bucket_info($hashref); - + my ($keys, $buckets, $used, $quality, $utilization_ratio, $collision_pct, + $mean, $stddev, @length_counts) = bucket_stats($hashref); Fields are as follows: - 0: Number of keys in the hash 1: Number of buckets in the hash 2: Number of used buckets in the hash 3: Hash Quality Score 4: Percent of buckets used 5: Percent of keys which are in collision - 6: Average bucket length - 7: Standard Deviation of bucket lengths. + 6: Mean bucket length of occupied buckets + 7: Standard Deviation of bucket lengths of occupied buckets rest : list of counts, Kth element is the number of buckets with K keys in it. @@ -581,21 +579,128 @@ sub bucket_stats { my ($keys, $buckets, $used, @length_counts) = bucket_info($hash); my $sum; my $score; - for (0 .. $#length_counts) { + for (1 .. $#length_counts) { $sum += ($length_counts[$_] * $_); $score += $length_counts[$_] * ( $_ * ($_ + 1 ) / 2 ); } $score = $score / (( $keys / (2 * $buckets )) * ( $keys + ( 2 * $buckets ) - 1 )) if $keys; - my $mean= $sum/$buckets; - $sum= 0; - $sum += ($length_counts[$_] * (($_-$mean)**2)) for 0 .. $#length_counts; + my ($mean, $stddev)= (0, 0); + if ($used) { + $mean= $sum / $used; + $sum= 0; + $sum += ($length_counts[$_] * (($_-$mean)**2)) for 1 .. $#length_counts; - my $stddev= sqrt($sum/$buckets); + $stddev= sqrt($sum/$used); + } return $keys, $buckets, $used, $keys ? ($score, $used/$buckets, ($keys-$used)/$keys, $mean, $stddev, @length_counts) : (); } +=item B<bucket_stats_formatted> + + print bucket_stats_formatted($hashref); + +Return a formatted report of the information returned by bucket_stats(). 
+An example report looks like this: + + Keys: 50 Buckets: 33/64 Quality-Score: 1.01 (Good) + Utilized Buckets: 51.56% Optimal: 78.12% Keys In Collision: 34.00% + Chain Length - mean: 1.52 stddev: 0.66 + Buckets 64 [0000000000000000000000000000000111111111111111111122222222222333] + Len 0 Pct: 48.44 [###############################] + Len 1 Pct: 29.69 [###################] + Len 2 Pct: 17.19 [###########] + Len 3 Pct: 4.69 [###] + Keys 50 [11111111111111111111111111111111122222222222222333] + Pos 1 Pct: 66.00 [#################################] + Pos 2 Pct: 28.00 [##############] + Pos 3 Pct: 6.00 [###] + +The first set of stats gives some summary statistical information, +including the quality score translated into "Good", "Poor" and "Bad", +(score<=1.05, score<=1.2, score>1.2). See the documentation in +bucket_stats() for more details. + +The two sets of barcharts give stats and a visual indication of performance +of the hash. + +The first gives data on bucket chain lengths and provides insight on how +much work a fetch *miss* will take. In this case we have to inspect every item +in a bucket before we can be sure the item is not in the list. The performance +for an insert is equivalent to this case, as is a delete where the item +is not in the hash. + +The second gives data on how many keys are at each depth in the chain, and +gives an idea of how much work a fetch *hit* will take. The performance for +an update or delete of an item in the hash is equivalent to this case. + +Note that these statistics are summary only. Actual performance will depend +on real hit/miss ratios accessing the hash. If you are concerned by hit ratios +you are recommended to "oversize" your hash by using something like: + + keys(%hash)= keys(%hash) << $k; + +With $k chosen carefully, and likely to be a small number like 1 or 2. In +theory the larger the bucket array the less chance of collision. 
+ +=cut + + +sub _bucket_stats_formatted_bars { + my ($total, $ary, $start_idx, $title, $row_title)= @_; + + my $return = ""; + my $max_width= $total > 64 ? 64 : $total; + my $bar_width= $max_width / $total; + + my $str= ""; + if ( @$ary < 10) { + for my $idx ($start_idx .. $#$ary) { + $str .= $idx x sprintf("%.0f", ($ary->[$idx] * $bar_width)); + } + } else { + $str= "-" x $max_width; + } + $return .= sprintf "%-7s %6d [%s]\n",$title, $total, $str; + + foreach my $idx ($start_idx .. $#$ary) { + $return .= sprintf "%-.3s %3d %6.2f%% %6d [%s]\n", + $row_title, + $idx, + $ary->[$idx] / $total * 100, + $ary->[$idx], + "#" x sprintf("%.0f", ($ary->[$idx] * $bar_width)), + ; + } + return $return; +} + +sub bucket_stats_formatted { + my ($hashref)= @_; + my ($keys, $buckets, $used, $score, $utilization_ratio, $collision_pct, + $mean, $stddev, @length_counts) = bucket_stats($hashref); + + my $return= sprintf "Keys: %d Buckets: %d/%d Quality-Score: %.2f (%s)\n" + . "Utilized Buckets: %.2f%% Optimal: %.2f%% Keys In Collision: %.2f%%\n" + . "Chain Length - mean: %.2f stddev: %.2f\n", + $keys, $used, $buckets, $score, $score <= 1.05 ? "Good" : $score < 1.2 ? "Poor" : "Bad", + $utilization_ratio * 100, + $keys/$buckets * 100, + $collision_pct * 100, + $mean, $stddev; + + my @key_depth; + $key_depth[$_]= $length_counts[$_] + ( $key_depth[$_+1] || 0 ) + for reverse 1 .. 
$#length_counts; + + if ($keys) { + $return .= _bucket_stats_formatted_bars($buckets, \@length_counts, 0, "Buckets", "Len"); + $return .= _bucket_stats_formatted_bars($keys, \@key_depth, 1, "Keys", "Pos"); + } + return $return +} + =item B<hv_store> my $sv = 0; diff --git a/hv_func.h b/hv_func.h index 53230ae..473ec46 100644 --- a/hv_func.h +++ b/hv_func.h @@ -455,7 +455,7 @@ S_perl_hash_murmur3(const unsigned char * const seed, const unsigned char *ptr, PERL_STATIC_INLINE U32 S_perl_hash_djb2(const unsigned char * const seed, const unsigned char *str, const STRLEN len) { const unsigned char * const end = (const unsigned char *)str + len; - U32 hash = *((U32*)seed + len); + U32 hash = *((U32*)seed) + len; while (str < end) { hash = ((hash << 5) + hash) + *str++; } @@ -465,7 +465,7 @@ S_perl_hash_djb2(const unsigned char * const seed, const unsigned char *str, con PERL_STATIC_INLINE U32 S_perl_hash_sdbm(const unsigned char * const seed, const unsigned char *str, const STRLEN len) { const unsigned char * const end = (const unsigned char *)str + len; - U32 hash = *((U32*)seed + len); + U32 hash = *((U32*)seed) + len; while (str < end) { hash = (hash << 6) + (hash << 16) - hash + *str++; } diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 09401cb..2760ad2 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -208,6 +208,15 @@ Synchronize POD changes from the CPAN release. L<perl5db.pl> has been upgraded from version 1.44 to 1.45. +=item * + +A mismatch between the documentation and the code in utf8::downgrade() +was fixed in favour of the documentation. The optional second argument +is now correctly treated as a perl boolean (true/false semantics) and +not as an integer. + +=item * + fork() in the debugger under C<tmux> will now create a new window for the forked process. L<[perl #121333]|https://rt.perl.org/Ticket/Display.html?id=121333> @@ -227,6 +236,13 @@ L<Unicode::Normalize> has been upgraded from version 1.17 to 1.18. 
The XSUB implementation has been removed in favour of pure Perl. +=item * + +L<Hash::Util> has been upgraded from version 0.16 to 0.17. + +Minor bug fixes and documentation fixes to Hash::Util::hash_stats() + + =back =head2 Removed Modules and Pragmata diff --git a/proto.h b/proto.h index 70cf3de..eb60a4f 100644 --- a/proto.h +++ b/proto.h @@ -6701,6 +6701,12 @@ PERL_STATIC_INLINE U8 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) #define PERL_ARGS_ASSERT_COMPUTE_EXACTISH \ assert(pRExC_state) +STATIC regnode * S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2); +#define PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE \ + assert(pRExC_state); assert(source) + STATIC bool S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS \ @@ -6828,13 +6834,6 @@ STATIC I32 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, re #define PERL_ARGS_ASSERT_MAKE_TRIE \ assert(pRExC_state); assert(startbranch); assert(first); assert(last); assert(tail) -STATIC void S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) - __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_2) - __attribute__nonnull__(pTHX_3); -#define PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE \ - assert(pRExC_state); assert(source); assert(stclass) - STATIC char * S_nextchar(pTHX_ RExC_state_t *pRExC_state) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_NEXTCHAR \ diff --git a/regcomp.c b/regcomp.c index baeea53..a395e2b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2984,8 +2984,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, : MADE_TRIE; } -STATIC void -S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t 
*pRExC_state, regnode *source, U32 depth) { /* The Trie is constructed and compressed now so we can build a fail array if * it's needed @@ -3023,13 +3023,26 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 *fail; reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ ARG_SET( stclass, data_slot ); aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); @@ -3096,6 +3109,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }); Safefree(q); /*RExC_seen |= REG_TRIEDFA_SEEN;*/ + return stclass; } @@ -6824,22 +6838,8 @@ reStudy: else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { - regnode *trie_op; - /* this can happen only on restudy */ - if ( OP(first) == TRIE ) { - struct regnode_1 *trieop = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(first,trieop,struct regnode_1); - trie_op=(regnode *)trieop; - } else { - struct regnode_charclass *trieop = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(first,trieop,struct regnode_charclass); - trie_op=(regnode *)trieop; - } - OP(trie_op)+=2; - make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); - 
ri->regstclass = trie_op; + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); } #endif else if (REGNODE_SIMPLE(OP(first))) @@ -16192,7 +16192,16 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PerlMemShared_free(aho->fail); /* do this last!!!! */ PerlMemShared_free(ri->data->data[n]); - PerlMemShared_free(ri->regstclass); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } } } break; diff --git a/universal.c b/universal.c index 65e02df..777a924 100644 --- a/universal.c +++ b/universal.c @@ -513,7 +513,7 @@ XS(XS_utf8_downgrade) croak_xs_usage(cv, "sv, failok=0"); else { SV * const sv = ST(0); - const bool failok = (items < 2) ? 0 : (int)SvIV(ST(1)); + const bool failok = (items < 2) ? 0 : SvTRUE(ST(1)) ? 1 : 0; const bool RETVAL = sv_utf8_downgrade(sv, failok); ST(0) = boolSV(RETVAL); -- Perl5 Master Repository
