In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/2d66f61e3a78c413bbb02ffa1bab2ad8aa6aed5a?hp=6e89a33a76ad97097e5b456387f1289b6addf741>

- Log -----------------------------------------------------------------
commit 2d66f61e3a78c413bbb02ffa1bab2ad8aa6aed5a
Author: Karl Williamson <[email protected]>
Date:   Wed Jan 1 11:36:17 2014 -0700

    regexec.c: Clarify comment

M       regexec.c

commit 3db24e1e3c8c10bf892a8f48ff3d780fdd1f88a0
Author: Karl Williamson <[email protected]>
Date:   Wed Jan 1 09:59:20 2014 -0700

    regexec.c: Guard against malformed UTF-8 in [...]
    
    The code that handles bracketed character classes assumed that the
    string being matched against did not have the too-short malformation;
    this could lead to reading beyond-the-end-of-buffer.  (It did check for
    other malformations.)  This is solved by changing the function that
    operates on bracketed character classes to take and use an extra
    parameter, the actaul buffer end.

M       embed.fnc
M       embed.h
M       proto.h
M       regexec.c

commit f3943cf2ca1b7a02583ef8bbeb1ced9414bf1fac
Author: Karl Williamson <[email protected]>
Date:   Wed Jan 1 09:52:55 2014 -0700

    pp.c: Remove unnecessary mask operation.
    
    An unsigned character (U8) should not have more than 8 bits of data, so
    no need to force that by masking with 0xFF.

M       pp.c

commit 4f6386b6c255e97472036daac543efef3399b495
Author: Karl Williamson <[email protected]>
Date:   Wed Jan 1 09:49:04 2014 -0700

    pp.c: Guard against malformed UTF-8 input in ord()
    
    This code got the actual length of the input scalar, but discarded it.
    If that scalar contains malformed UTF-8 that has fewer bytes than is
    indicated, a read beyond-buffer-end could happen.  Simply use the actual
    length.

M       pp.c
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc |  5 ++++-
 embed.h   |  2 +-
 pp.c      |  5 +++--
 proto.h   |  7 ++++---
 regexec.c | 26 +++++++++++++++-----------
 5 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index dd9e6cf..18610ae 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2144,7 +2144,10 @@ ERs      |I32    |regrepeat      |NN regexp *prog|NN 
char **startposp \
                                |I32 max \
                                |int depth
 ERs    |I32    |regtry         |NN regmatch_info *reginfo|NN char **startposp
-ERs    |bool   |reginclass     |NULLOK regexp * const prog|NN const regnode * 
const n|NN const U8 * const p\
+ERs    |bool   |reginclass     |NULLOK regexp * const prog  \
+                               |NN const regnode * const n  \
+                               |NN const U8 * const p       \
+                               |NN const U8 * const p_end   \
                                |bool const utf8_target
 Es     |CHECKPOINT|regcppush   |NN const regexp *rex|I32 parenfloor\
                                |U32 maxopenparen
diff --git a/embed.h b/embed.h
index b84b5e4..e918bcb 100644
--- a/embed.h
+++ b/embed.h
@@ -1002,7 +1002,7 @@
 #define regcppush(a,b,c)       S_regcppush(aTHX_ a,b,c)
 #define reghop3                        S_reghop3
 #define reghopmaybe3           S_reghopmaybe3
-#define reginclass(a,b,c,d)    S_reginclass(aTHX_ a,b,c,d)
+#define reginclass(a,b,c,d,e)  S_reginclass(aTHX_ a,b,c,d,e)
 #define regmatch(a,b,c)                S_regmatch(aTHX_ a,b,c)
 #define regrepeat(a,b,c,d,e,f) S_regrepeat(aTHX_ a,b,c,d,e,f)
 #define regtry(a,b)            S_regtry(aTHX_ a,b)
diff --git a/pp.c b/pp.c
index dd4d89a..6925f49 100644
--- a/pp.c
+++ b/pp.c
@@ -3324,12 +3324,13 @@ PP(pp_ord)
     if (PL_encoding && SvPOK(argsv) && !DO_UTF8(argsv)) {
         SV * const tmpsv = sv_2mortal(newSVsv(argsv));
         s = (U8*)sv_recode_to_utf8(tmpsv, PL_encoding);
+        len = UTF8SKIP(s);  /* Should be well-formed; so this is its length */
         argsv = tmpsv;
     }
 
     XPUSHu(DO_UTF8(argsv)
-           ? utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, UTF8_ALLOW_ANYUV)
-           : (UV)(*s & 0xff));
+           ? utf8n_to_uvchr(s, len, 0, UTF8_ALLOW_ANYUV)
+           : (UV)(*s));
 
     RETURN;
 }
diff --git a/proto.h b/proto.h
index c486045..647d357 100644
--- a/proto.h
+++ b/proto.h
@@ -7158,12 +7158,13 @@ STATIC U8*      S_reghopmaybe3(U8 *s, SSize_t off, 
const U8 *lim)
 #define PERL_ARGS_ASSERT_REGHOPMAYBE3  \
        assert(s); assert(lim)
 
-STATIC bool    S_reginclass(pTHX_ regexp * const prog, const regnode * const 
n, const U8 * const p, bool const utf8_target)
+STATIC bool    S_reginclass(pTHX_ regexp * const prog, const regnode * const 
n, const U8 * const p, const U8 * const p_end, bool const utf8_target)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_2)
-                       __attribute__nonnull__(pTHX_3);
+                       __attribute__nonnull__(pTHX_3)
+                       __attribute__nonnull__(pTHX_4);
 #define PERL_ARGS_ASSERT_REGINCLASS    \
-       assert(n); assert(p)
+       assert(n); assert(p); assert(p_end)
 
 STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                        __attribute__warn_unused_result__
diff --git a/regexec.c b/regexec.c
index d0e0fd2..3e82bc4 100644
--- a/regexec.c
+++ b/regexec.c
@@ -101,10 +101,10 @@ static const char* const non_utf8_target_but_utf8_required
 #define        STATIC  static
 #endif
 
-/* Valid for non-utf8 strings: avoids the reginclass
+/* Valid only for non-utf8 strings: avoids the reginclass
  * call if there are no complications: i.e., if everything matchable is
  * straight forward in the bitmap */
-#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
+#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
                                              : ANYOF_BITMAP_TEST(p,*(c)))
 
 /*
@@ -1392,12 +1392,13 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \
        }                                                                      \
        else {                                                                 \
            U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
-           tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
+           tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                 \
+                                                       0, UTF8_ALLOW_DEFAULT); 
\
        }                                                                      \
        tmp = TeSt1_UtF8;                                                      \
-       LOAD_UTF8_CHARCLASS_ALNUM();                                            
                    \
+       LOAD_UTF8_CHARCLASS_ALNUM();                                           \
        REXEC_FBC_UTF8_SCAN(                                                   \
-           if (tmp == ! (TeSt2_UtF8)) { \
+           if (tmp == ! (TeSt2_UtF8)) {                                       \
                tmp = !tmp;                                                    \
                IF_SUCCESS;                                                    \
            }                                                                  \
@@ -1491,7 +1492,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
     case ANYOF_SYNTHETIC:
         if (utf8_target) {
             REXEC_FBC_UTF8_CLASS_SCAN(
-                      reginclass(prog, c, (U8*)s, utf8_target));
+                      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
         }
         else {
             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
@@ -4323,7 +4324,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                    const U8 * const r =
                             reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
 
-                   ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
+                   ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
+                                                                   0, 
uniflags);
                }
                if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
                    ln = isWORDCHAR_uni(ln);
@@ -4388,7 +4390,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
             if (NEXTCHR_IS_EOS)
                 sayNO;
            if (utf8_target) {
-               if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
+               if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+                                                                   
utf8_target))
                    sayNO;
                locinput += UTF8SKIP(locinput);
            }
@@ -7002,7 +7005,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
        if (utf8_target) {
            while (hardcount < max
                    && scan < loceol
-                  && reginclass(prog, p, (U8*)scan, utf8_target))
+                  && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
            {
                scan += UTF8SKIP(scan);
                hardcount++;
@@ -7401,6 +7404,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const 
regnode* node, bool doinit
  
   n is the ANYOF regnode
   p is the target string
+  p_end points to one byte beyond the end of the target string
   utf8_target tells whether p is in UTF-8.
 
   Returns true if matched; false otherwise.
@@ -7412,7 +7416,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const 
regnode* node, bool doinit
  */
 
 STATIC bool
-S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* 
const p, const bool utf8_target)
+S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* 
const p, const U8* const p_end, const bool utf8_target)
 {
     dVAR;
     const char flags = ANYOF_FLAGS(n);
@@ -7425,7 +7429,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * 
const n, const U8* const
      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
         STRLEN c_len = 0;
-       c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
+       c = utf8n_to_uvchr(p, p_end - p, &c_len,
                (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
                | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
                /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for

--
Perl5 Master Repository

Reply via email to