In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/8ea4c679c8b563e13695d0d28d68e63e34ead44b?hp=2439e03355dec26654acf614900c077433bc27e0>
- Log ----------------------------------------------------------------- commit 8ea4c679c8b563e13695d0d28d68e63e34ead44b Author: Brian Fraser <[email protected]> Date: Tue Mar 5 20:40:34 2013 -0300 open FOO || die; doesn't need a special identifier parser. Turns out this can just use scan_word(). M toke.c commit 32833930e32dc619abdaaab54e88de2a2765fb86 Author: Brian Fraser <[email protected]> Date: Tue Mar 5 18:18:49 2013 -0300 Restrict the valid identifier syntax, fix some identifier bugs. Fixes: * Length-one identifiers are now restricted to [\p{XIDS}\p{POSIX_Punct}\p{POSIX_Digit}\p{POSIX_Cntrl}] plus, if under 'no utf8', the 128 non-ASCII characters in the Latin1 range. * Identifiers that start with ASCII letters can be followed with XIDC characters (The committer made some small edits in the pod) M gv.c M pod/perldata.pod M pod/perldelta.pod M t/lib/croak/op M t/uni/variables.t M toke.c commit 07f7264624e0307ed32e3b140ef2a0ea9d86a07f Author: Brian Fraser <[email protected]> Date: Tue Mar 5 17:46:52 2013 -0300 Fix several differences in the parsing of $.. and ${...} Namely: * The first character in ${...} used to have no restrictions * ${foo:bar} used to be legal * ${foo::bar} worked, but ${foo'bar} didn't And possibly other subtle, so far undiscovered bugs. This was resolved by simply using the same code for both things. Note that this commit is not entirely useful on its own; While tests pass, it requires changes from the following commit to work entirely. M MANIFEST M embed.fnc M embed.h M pod/perldelta.pod M proto.h M t/uni/labels.t A t/uni/variables.t M toke.c ----------------------------------------------------------------------- Summary of changes: MANIFEST | 1 + embed.fnc | 3 + embed.h | 1 + gv.c | 2 +- pod/perldata.pod | 104 ++++++++++++++++++++++++++++++++- pod/perldelta.pod | 17 ++++-- proto.h | 7 ++ t/lib/croak/op | 8 --- t/uni/labels.t | 12 ++-- t/uni/variables.t | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++ toke.c | 170 +++++++++++++++++++++++------------------------------ 11 files changed, 378 insertions(+), 117 deletions(-) create mode 100644 t/uni/variables.t diff --git a/MANIFEST b/MANIFEST index 4a49acd..832dc9c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -5609,6 +5609,7 @@ t/uni/tr_sjis.t See if Unicode tr/// in sjis works t/uni/tr_utf8.t See if Unicode tr/// in utf8 works t/uni/universal.t See if Unicode in calls to UNIVERSAL works t/uni/upper.t See if Unicode casing works +t/uni/variables.t See that the rules for variable names work t/uni/write.t See if Unicode formats work t/win32/fs.t Test Win32 link for compatibility t/win32/runenv.t Test if Win* perl honors its env variables diff --git a/embed.fnc b/embed.fnc index c9832d4..2f5e089 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2239,6 +2239,9 @@ so |SV* |new_constant |NULLOK const char *s|STRLEN len \ |STRLEN typelen s |int |deprecate_commaless_var_list s |int |ao |int toketype +s |void|parse_ident|NN char **s|NN char **d \ + |NN char * const e|int allow_package \ + |bool is_utf8 # if defined(PERL_CR_FILTER) s |I32 |cr_textfilter |int idx|NULLOK SV *sv|int maxlen s |void |strip_return |NN SV *sv diff --git a/embed.h b/embed.h index 9654979..248ed50 100644 --- a/embed.h +++ b/embed.h @@ -1609,6 +1609,7 @@ #define lop(a,b,c) S_lop(aTHX_ a,b,c) #define missingterm(a) S_missingterm(aTHX_ a) #define no_op(a,b) S_no_op(aTHX_ a,b) +#define parse_ident(a,b,c,d,e) S_parse_ident(aTHX_ a,b,c,d,e) #define pending_ident() S_pending_ident(aTHX) #define readpipe_override() S_readpipe_override(aTHX) #define scan_const(a) S_scan_const(aTHX_ a) diff --git a/gv.c b/gv.c index e8f5402..8ac08ab 100644 --- a/gv.c +++ b/gv.c @@ -1598,7 +1598,7 @@ Perl_gv_fetchpvn_flags(pTHX_ const char *nambeg, STRLEN full_len, I32 flags, : ""), SVfARG(namesv)); GV *gv; SvREFCNT_dec_NN(namesv); - if (USE_UTF8_IN_NAMES) + if (is_utf8) SvUTF8_on(err); qerror(err); gv = gv_fetchpvs("<none>::", GV_ADDMULTI, SVt_PVHV); diff --git a/pod/perldata.pod b/pod/perldata.pod index 9bff98f..8bf3dfd 100644 --- a/pod/perldata.pod +++ b/pod/perldata.pod @@ -24,8 +24,9 @@ containing letters, underscores, and digits. In some cases, it may be a chain of identifiers, separated by C<::> (or by the slightly archaic C<'>); all but the last are interpreted as names of packages, to locate the namespace in which to look up the final identifier -(see L<perlmod/Packages> for details). It's possible to substitute -for a simple identifier, an expression that produces a reference +(see L<perlmod/Packages> for details). For a more in-depth discussion +on identifiers, see L<Identifier parsing>. It's possible to +substitute for a simple identifier, an expression that produces a reference to the value at runtime. This is described in more detail below and in L<perlref>. X<identifier> @@ -104,6 +105,105 @@ C<$$>. (Most of these one character names have a predefined significance to Perl. For instance, C<$$> is the current process id.) +=head2 Identifier parsing +X<identifiers> + +Up until Perl 5.18, the actual rules of what a valid identifier +was were a bit fuzzy. However, in general, anything defined here should +work on previous versions of Perl, while the opposite -- edge cases +that work in previous versions, but aren't defined here -- probably +won't work on newer versions. +As an important side note, please note that the following only applies +to bareword identifiers as found in Perl source code, not identifiers +introduced through symbolic references, which have much fewer +restrictions. +If working under the effect of the C<use utf8;> pragma, the following +rules apply: + + / (?[ ( \p{Word} & \p{XID_Start} ) + [_] ]) \p{XID_Continue}* /x + +If not under C<use utf8>, the source is treated as ASCII + 128 extra +controls, and identifiers should match + + / (?aa) (?!\d) \w+ /x + +That is, any word character in the ASCII range, as long as the first +character is not a digit. + +There are two package separators in Perl: A double colon (C<::>) and a single +quote (C<'>). Normal identifiers can start or end with a double colon, and +can contain several parts delimited by double colons. +Single quotes have similar rules, but with the exception that they are not +legal at the end of an identifier: That is, C<$'foo> and C<$foo'bar> are +legal, but C<$foo'bar'> are not. + + +Finally, if the identifier is preceded by a sigil -- +More so, normal identifiers can start or end with any number +of double colons (::), and can contain several parts delimited +by double colons. +And additionally, if the identifier is preceded by a sigil -- +that is, if the identifier is part of a variable name -- it +may optionally be enclosed in braces. + +While you can mix double colons with singles quotes, the quotes must come +after the colons: C<$::::'foo> and C<$foo::'bar> are legal, but C<$::'::foo> +and C<$foo'::bar> are not. + +Put together, a grammar to match a basic identifier becomes + + / + (?(DEFINE) + (?<variable> + (?&sigil) + (?: + (?&normal_identifier) + | \{ \s* (?&normal_identifier) \s* \} + ) + ) + (?<normal_identifier> + (?: :: )* '? + (?&basic_identifier) + (?: (?= (?: :: )+ '? | (?: :: )* ' ) (?&normal_identifier) )? + (?: :: )* + ) + (?<basic_identifier> + # is use utf8 on? + (?(?{ (caller(0))[8] & $utf8::hint_bits }) + (?&Perl_XIDS) \p{XID_Continue}* + | (?aa) (?!\d) \w+ + ) + ) + (?<sigil> [&*\$\@\%]) + (?<Perl_XIDS> (?[ ( \p{Word} & \p{XID_Start} ) + [_] ]) ) + ) + /x + +Meanwhile, special identifiers don't follow the above rules; For the most +part, all of the identifiers in this category have a special meaning given +by Perl. Because they have special parsing rules, these generally can't be +fully-qualified. They come in four forms: + +=over + +=item A sigil, followed solely by digits matching \p{POSIX_Digit}, like C<$0>, +C<$1>, or C<$10000>. + +=item A sigil, followed by either a caret and a single POSIX uppercase letter, +like C<$^V> or C<$^W>, or a sigil followed by a literal control character +matching the C<\p{POSIX_Cntrl}> property. Due to a historical oddity, if not +running under C<use utf8>, the 128 extra controls in the C<[0x80-0xff]> range +may also be used in length one variables. + +=item Similar to the above, a sigil, followed by bareword text in brackets, +where the first character is either a caret followed by an uppercase letter, +or a literal control, like C<${^GLOBAL_PHASE}> or C<${\7LOBAL_PHASE}>. + +=item A sigil followed by a single character matching the C<\p{POSIX_Punct}> +property, like C<$!> or C<%+>. + +=back + =head2 Context X<context> X<scalar context> X<list context> diff --git a/pod/perldelta.pod b/pod/perldelta.pod index c692139..0b91648 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -37,11 +37,18 @@ L</Selected Bug Fixes> section. =head1 Incompatible Changes -XXX For a release on a stable branch, this section aspires to be: - - There are no changes intentionally incompatible with 5.XXX.XXX - If any exist, they are bugs, and we request that you submit a - report. See L</Reporting Bugs> below. +=head2 Explicit rules for variable names and identifiers + +Due to an oversight, length-one variable names in 5.16 were completely +unrestricted, and opened the door to several kinds of insanity. As of +5.18, these now follow the rules of other identifiers, in addition +to accepting characters that match the \p{POSIX_Punct} property. + +There are no longer any differences in the parsing of identifiers specified +as $... or ${...}; previously, they were dealt with in different parts of +the core, and so had slightly different behavior. For instance, +C<${foo:bar}> was a legal variable name. Since they are now both parsed +by the same code, that is no longer the case. [ List each incompatible change as a =head2 entry ] diff --git a/proto.h b/proto.h index 9192960..35d49db 100644 --- a/proto.h +++ b/proto.h @@ -7272,6 +7272,13 @@ STATIC void S_no_op(pTHX_ const char *const what, char *s) #define PERL_ARGS_ASSERT_NO_OP \ assert(what) +STATIC void S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT_PARSE_IDENT \ + assert(s); assert(d); assert(e) + STATIC int S_pending_ident(pTHX); STATIC void S_readpipe_override(pTHX); STATIC char* S_scan_const(pTHX_ char *start) diff --git a/t/lib/croak/op b/t/lib/croak/op index 86e40f8..22f1e76 100644 --- a/t/lib/croak/op +++ b/t/lib/croak/op @@ -5,14 +5,6 @@ EXPECT Can't use global $! in "my" at - line 1, near "my $!" Execution of - aborted due to compilation errors. ######## -# NAME my $<special_unicode> -use utf8; -BEGIN { binmode STDERR, ":utf8" } -my $â ; -EXPECT -Can't use global $â in "my" at - line 3, near "my $â " -Execution of - aborted due to compilation errors. -######## # NAME OP_HELEM fields package Foo; use fields qw(a b); diff --git a/t/uni/labels.t b/t/uni/labels.t index 3d7d476..3fa9d38 100644 --- a/t/uni/labels.t +++ b/t/uni/labels.t @@ -15,7 +15,7 @@ use feature qw 'unicode_strings evalbytes'; use charnames qw( :full ); -plan(9); +plan(10); LABEL: { pass("Sanity check, UTF-8 labels don't throw a syntax error."); @@ -54,11 +54,13 @@ SKIP: { like $@, qr/Label not found for "next ï¼¥" at/u, "next's error is UTF-8 clean"; } -my $d = 4; +my $d = 2; LÃBEL: { + my $e = $@; my $prog = "redo L\N{LATIN CAPITAL LETTER A WITH ACUTE}BEL"; - if ($d % 2) { + if ($d == 1) { + is $e, '', "redo UTF8 works"; utf8::downgrade($prog); } if ($d--) { @@ -68,8 +70,8 @@ LÃBEL: { } } -is $@, '', "redo to downgradeable labels works"; -is $d, -1, "Latin-1 labels reachable regardless of UTF-8ness"; +like $@, qr/Unrecognized character/, "redo to downgradeable labels"; +is $d, 0, "Latin-1 labels are reachable"; { no warnings; diff --git a/t/uni/variables.t b/t/uni/variables.t new file mode 100644 index 0000000..0e810a4 --- /dev/null +++ b/t/uni/variables.t @@ -0,0 +1,170 @@ +#!./perl + +# Checks if the parser behaves correctly in edge case +# (including weird syntax errors) + +BEGIN { + require './test.pl'; +} + +use 5.016; +use utf8; +use open qw( :utf8 :std ); +no warnings qw(misc reserved); + +plan (tests => 65850); + +# ${single:colon} should not be valid syntax +{ + no strict; + + local $@; + eval "\${\x{30cd}single:\x{30cd}colon} = 1"; + like($@, + qr/syntax error .* near "\x{30cd}single:/, + '${\x{30cd}single:\x{30cd}colon} should not be valid syntax' + ); + + local $@; + no utf8; + evalbytes '${single:colon} = 1'; + like($@, + qr/syntax error .* near "single:/, + '...same with ${single:colon}' + ); +} + +# ${yadda'etc} and ${yadda::etc} should both work under strict +{ + local $@; + eval q<use strict; ${flark::fleem}>; + is($@, '', q<${package::var} works>); + + local $@; + eval q<use strict; ${fleem'flark}>; + is($@, '', q<...as does ${package'var}>); +} + +# The first character in ${...} should respect the rules +{ + local $@; + use utf8; + eval '${âasd} = 1'; + like($@, qr/\QUnrecognized character/, q(the first character in ${...} isn't special)) +} + +# Checking that at least some of the special variables work +for my $v (qw( ^V ; < > ( ) {^GLOBAL_PHASE} ^W _ 1 4 0 [ ] ! @ / \ = )) { + local $@; + evalbytes "\$$v;"; + is $@, '', "No syntax error for \$$v"; + + local $@; + eval "use utf8; \$$v;"; + is $@, '', "No syntax error for \$$v under use utf8"; +} + +# Checking if the Latin-1 range behaves as expected, and that the behavior is the +# same whenever under strict or not. +for ( 0x80..0xff ) { + no warnings 'closure'; + my $chr = chr; + my $esc = sprintf("%X", ord $chr); + utf8::downgrade($chr); + if ($chr !~ /\p{XIDS}/u) { + is evalbytes "no strict; \$$chr = 10", + 10, + sprintf("\\x%02x, part of the latin-1 range, is legal as a length-1 variable", $_); + + utf8::upgrade($chr); + local $@; + eval "no strict; use utf8; \$$chr = 1"; + like $@, + qr/\QUnrecognized character \x{\E\L$esc/, + sprintf("..but is illegal as a length-1 variable under use utf8", $_); + } + else { + { + no utf8; + local $@; + evalbytes "no strict; \$$chr = 1"; + is($@, '', sprintf("\\x%02x, =~ \\p{XIDS}, latin-1, no utf8, no strict, is a valid length-1 variable", $_)); + + local $@; + evalbytes "use strict; \$$chr = 1"; + is($@, + '', + sprintf("\\x%02x under no utf8 does not have to be required under strict, even though it matches XIDS", $_) + ); + + local $@; + evalbytes "\$a$chr = 1"; + like($@, + qr/Unrecognized character /, + sprintf("...but under no utf8, it's not allowed in two-or-more character variables") + ); + + local $@; + evalbytes "\$a$chr = 1"; + like($@, + qr/Unrecognized character /, + sprintf("...but under no utf8, it's not allowed in two-or-more character variables") + ); + } + { + use utf8; + my $u = $chr; + utf8::upgrade($u); + local $@; + eval "no strict; \$$u = 1"; + is($@, '', sprintf("\\x%02x, =~ \\p{XIDS}, UTF-8, use utf8, no strict, is a valid length-1 variable", $_)); + + local $@; + eval "use strict; \$$u = 1"; + like($@, + qr/Global symbol "\$$u" requires explicit package name/, + sprintf("\\x%02x under utf8 has to be required under strict", $_) + ); + } + } +} + +{ + use utf8; + my $ret = eval "my \$c\x{327} = 100; \$c\x{327}"; # c + cedilla + is($@, '', "ASCII character + combining character works as a variable name"); + is($ret, 100, "...and returns the correct value"); +} + +# From Tom Christiansen's 'highly illegal variable names are now accidentally legal' mail +for my $chr ( + "\N{EM DASH}", "\x{F8FF}", "\N{POUND SIGN}", "\N{SOFT HYPHEN}", + "\N{THIN SPACE}", "\x{11_1111}", "\x{DC00}", "\N{COMBINING DIAERESIS}", + "\N{COMBINING ENCLOSING CIRCLE BACKSLASH}", + ) +{ + no warnings 'non_unicode'; + my $esc = sprintf("%x", ord $chr); + local $@; + eval "\$$chr = 1; \$$chr"; + like($@, + qr/\QUnrecognized character \x{$esc};/, + "\\x{$esc} is illegal for a length-one identifier" + ); +} + +for my $i (0x100..0xffff) { + my $chr = chr($i); + my $esc = sprintf("%x", $i); + local $@; + eval "my \$$chr = q<test>; \$$chr;"; + if ( $chr =~ /^\p{_Perl_IDStart}$/ ) { + is($@, '', sprintf("\\x{%04x} is XIDS, works as a length-1 variable", $i)); + } + else { + like($@, + qr/\QUnrecognized character \x{$esc};/, + "\\x{$esc} isn't XIDS, illegal as a length-1 variable", + ) + } +} \ No newline at end of file diff --git a/toke.c b/toke.c index 006f885..4579e63 100644 --- a/toke.c +++ b/toke.c @@ -5025,7 +5025,7 @@ Perl_yylex(pTHX) #endif switch (*s) { default: - if (isIDFIRST_lazy_if(s,UTF)) + if (UTF ? isIDFIRST_utf8((U8*)s) : isALNUMC(*s)) goto keylookup; { SV *dsv = newSVpvs_flags("", SVs_TEMP); @@ -8105,15 +8105,9 @@ Perl_yylex(pTHX) case KEY_open: s = SKIPSPACE1(s); if (isIDFIRST_lazy_if(s,UTF)) { - const char *t; - for (d = s; isWORDCHAR_lazy_if(d,UTF);) { - d += UTF ? UTF8SKIP(d) : 1; - if (UTF) { - while (UTF8_IS_CONTINUED(*d) && _is_utf8_mark((U8*)d)) { - d += UTF ? UTF8SKIP(d) : 1; - } - } - } + const char *t; + d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, + &len); for (t=d; isSPACE(*t);) t++; if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE) @@ -9188,6 +9182,54 @@ now_ok: return res; } +PERL_STATIC_INLINE void +S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8) { + dVAR; + PERL_ARGS_ASSERT_PARSE_IDENT; + + for (;;) { + if (*d >= e) + Perl_croak(aTHX_ "%s", ident_too_long); + if (is_utf8 && isIDFIRST_utf8((U8*)*s)) { + /* The UTF-8 case must come first, otherwise things + * like c\N{COMBINING TILDE} would start failing, as the + * isWORDCHAR_A case below would gobble the 'c' up. + */ + + char *t = *s + UTF8SKIP(*s); + while (isIDCONT_utf8((U8*)t)) + t += UTF8SKIP(t); + if (*d + (t - *s) > e) + Perl_croak(aTHX_ "%s", ident_too_long); + Copy(*s, *d, t - *s, char); + *d += t - *s; + *s = t; + } + else if ( isWORDCHAR_A(**s) ) { + do { + *(*d)++ = *(*s)++; + } while isWORDCHAR_A(**s); + } + else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) { + *(*d)++ = ':'; + *(*d)++ = ':'; + (*s)++; + } + else if (allow_package && **s == ':' && (*s)[1] == ':' + /* Disallow things like Foo::$bar. For the curious, this is + * the code path that triggers the "Bad name after" warning + * when looking for barewords. + */ + && (*s)[2] != '$') { + *(*d)++ = *(*s)++; + *(*d)++ = *(*s)++; + } + else + break; + } + return; +} + /* Returns a NUL terminated string, with the length of the string written to *slp */ @@ -9197,44 +9239,14 @@ S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN dVAR; char *d = dest; char * const e = d + destlen - 3; /* two-character token, ending NUL */ + bool is_utf8 = cBOOL(UTF); PERL_ARGS_ASSERT_SCAN_WORD; - for (;;) { - if (d >= e) - Perl_croak(aTHX_ "%s", ident_too_long); - if (isWORDCHAR(*s) - || (!UTF && isALPHANUMERIC_L1(*s))) /* UTF handled below */ - { - *d++ = *s++; - } - else if (allow_package && (*s == '\'') && isIDFIRST_lazy_if(s+1,UTF)) { - *d++ = ':'; - *d++ = ':'; - s++; - } - else if (allow_package && (s[0] == ':') && (s[1] == ':') && (s[2] != '$')) { - *d++ = *s++; - *d++ = *s++; - } - else if (UTF && UTF8_IS_START(*s) && isWORDCHAR_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - size_t len; - while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - len = t - s; - if (d + len > e) - Perl_croak(aTHX_ "%s", ident_too_long); - Copy(s, d, len, char); - d += len; - s = t; - } - else { - *d = '\0'; - *slp = d - dest; - return s; - } - } + parse_ident(&s, &d, e, allow_package, is_utf8); + *d = '\0'; + *slp = d - dest; + return s; } STATIC char * @@ -9245,6 +9257,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck char funny = *s++; char *d = dest; char * const e = d + destlen - 3; /* two-character token, ending NUL */ + bool is_utf8 = cBOOL(UTF); PERL_ARGS_ASSERT_SCAN_IDENT; @@ -9258,33 +9271,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck } } else { - for (;;) { - if (d >= e) - Perl_croak(aTHX_ "%s", ident_too_long); - if (isWORDCHAR(*s)) /* UTF handled below */ - *d++ = *s++; - else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) { - *d++ = ':'; - *d++ = ':'; - s++; - } - else if (*s == ':' && s[1] == ':') { - *d++ = *s++; - *d++ = *s++; - } - else if (UTF && UTF8_IS_START(*s) && isWORDCHAR_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - if (d + (t - s) > e) - Perl_croak(aTHX_ "%s", ident_too_long); - Copy(s, d, t - s, char); - d += t - s; - s = t; - } - else - break; - } + parse_ident(&s, &d, e, 1, is_utf8); } *d = '\0'; d = dest; @@ -9294,7 +9281,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck return s; } if (*s == '$' && s[1] && - (isWORDCHAR_lazy_if(s+1,UTF) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) + (isIDFIRST_lazy_if(s+1,is_utf8) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) { return s; } @@ -9302,8 +9289,15 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck bracket = s; s++; } - if (s < send) { - if (UTF) { + +#define VALID_LEN_ONE_IDENT(d, u) (isPUNCT_A((U8)*(d)) \ + || isCNTRL_A((U8)*(d)) \ + || isDIGIT_A((U8)*(d)) \ + || (!(u) && !UTF8_IS_INVARIANT((U8)*(d)))) + if (s < send + && (isIDFIRST_lazy_if(s, is_utf8) || VALID_LEN_ONE_IDENT(s, is_utf8))) + { + if (is_utf8) { const STRLEN skip = UTF8SKIP(s); STRLEN i; d[skip] = '\0'; @@ -9331,25 +9325,9 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck } } } - if (isIDFIRST_lazy_if(d,UTF)) { - d += UTF8SKIP(d); - if (UTF) { - char *end = s; - while ((end < send && isWORDCHAR_lazy_if(end,UTF)) || *end == ':') { - end += UTF8SKIP(end); - while (end < send && UTF8_IS_CONTINUED(*end) && _is_utf8_mark((U8*)end)) - end += UTF8SKIP(end); - } - Copy(s, d, end - s, char); - d += end - s; - s = end; - } - else { - while ((isWORDCHAR(*s) || *s == ':') && d < e) - *d++ = *s++; - if (d >= e) - Perl_croak(aTHX_ "%s", ident_too_long); - } + if (isIDFIRST_lazy_if(d,is_utf8)) { + d += is_utf8 ? UTF8SKIP(d) : 1; + parse_ident(&s, &d, e, 1, is_utf8); *d = '\0'; while (s < send && SPACE_OR_TAB(*s)) s++; @@ -9391,10 +9369,10 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck if (PL_lex_state == LEX_NORMAL) { if (ckWARN(WARN_AMBIGUOUS) && (keyword(dest, d - dest, 0) - || get_cvn_flags(dest, d - dest, UTF ? SVf_UTF8 : 0))) + || get_cvn_flags(dest, d - dest, is_utf8 ? SVf_UTF8 : 0))) { SV *tmp = newSVpvn_flags( dest, d - dest, - SVs_TEMP | (UTF ? SVf_UTF8 : 0) ); + SVs_TEMP | (is_utf8 ? SVf_UTF8 : 0) ); if (funny == '#') funny = '@'; Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS), -- Perl5 Master Repository
