Change 27582: Pass the (byte) length of the entire string into utf8_mg_pos_cache_update()

Nicholas Clark Thu, 23 Mar 2006 04:45:15 -0800

Change 27582 by [EMAIL PROTECTED] on 2006/03/23 12:44:35

        Pass the (byte) length of the entire string into
        utf8_mg_pos_cache_update()
        Start to use the cache to store two pairs of byte/utf-8 pairs.
        Add the first third of the cache update code.


Affected files ...

... //depot/perl/embed.fnc#336 edit
... //depot/perl/embed.h#573 edit
... //depot/perl/proto.h#683 edit
... //depot/perl/sv.c#1206 edit

Differences ...

==== //depot/perl/embed.fnc#336 (text) ====
Index: perl/embed.fnc
--- perl/embed.fnc#335~27580~   2006-03-23 03:33:48.000000000 -0800
+++ perl/embed.fnc      2006-03-23 04:44:35.000000000 -0800
@@ -1361,7 +1361,7 @@
                |NN const U8 *const start|NN const U8 *const send \
                |STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0
 s      |void   |utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \
-               |STRLEN byte|STRLEN utf8
+               |STRLEN byte|STRLEN utf8|STRLEN blen
 s      |STRLEN |sv_pos_b2u_forwards|NN const U8 *s|NN const U8 *const target
 s      |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \
                |NN const U8 *end|STRLEN endu

==== //depot/perl/embed.h#573 (text+w) ====
Index: perl/embed.h
--- perl/embed.h#572~27580~     2006-03-23 03:33:48.000000000 -0800
+++ perl/embed.h        2006-03-23 04:44:35.000000000 -0800
@@ -3537,7 +3537,7 @@
 #define sv_pos_u2b_forwards(a,b,c)     S_sv_pos_u2b_forwards(aTHX_ a,b,c)
 #define sv_pos_u2b_midway(a,b,c,d)     S_sv_pos_u2b_midway(aTHX_ a,b,c,d)
 #define sv_pos_u2b_cached(a,b,c,d,e,f,g)       S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g)
-#define utf8_mg_pos_cache_update(a,b,c,d)      S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d)
+#define utf8_mg_pos_cache_update(a,b,c,d,e)    S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
 #define sv_pos_b2u_forwards(a,b)       S_sv_pos_b2u_forwards(aTHX_ a,b)
 #define sv_pos_b2u_midway(a,b,c,d)     S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
 #define stringify_regexp(a,b,c)        S_stringify_regexp(aTHX_ a,b,c)

==== //depot/perl/proto.h#683 (text+w) ====
Index: perl/proto.h
--- perl/proto.h#682~27580~     2006-03-23 03:33:48.000000000 -0800
+++ perl/proto.h        2006-03-23 04:44:35.000000000 -0800
@@ -3725,7 +3725,7 @@
                        __attribute__nonnull__(pTHX_3)
                        __attribute__nonnull__(pTHX_4);
 
-STATIC void    S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+STATIC void    S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8, STRLEN blen)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2);
 

==== //depot/perl/sv.c#1206 (text) ====
Index: perl/sv.c
--- perl/sv.c#1205~27580~       2006-03-23 03:33:48.000000000 -0800
+++ perl/sv.c   2006-03-23 04:44:35.000000000 -0800
@@ -30,19 +30,16 @@
 #endif
 
 #ifdef PERL_UTF8_CACHE_ASSERT
-/* The cache element 0 is the Unicode offset;
- * the cache element 1 is the byte offset of the element 0;
- * the cache element 2 is the Unicode length of the substring;
- * the cache element 3 is the byte length of the substring;
- * The checking of the substring side would be good
- * but substr() has enough code paths to make my head spin;
- * if adding more checks watch out for the following tests:
+/* if adding more checks watch out for the following tests:
  *   t/op/index.t t/op/length.t t/op/pat.t t/op/substr.t
  *   lib/utf8.t lib/Unicode/Collate/t/index.t
  * --jhi
  */
 #define ASSERT_UTF8_CACHE(cache) \
-    STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); } } STMT_END
+    STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); \
+                             assert((cache)[2] <= (cache)[3]); \
+                             assert((cache)[3] <= (cache)[1]);} \
+                             } STMT_END
 #else
 #define ASSERT_UTF8_CACHE(cache) NOOP
 #endif
@@ -5405,6 +5402,10 @@
                /* An exact match. */
                return cache[1];
            }
+           if (cache[2] == uoffset) {
+               /* An exact match. */
+               return cache[3];
+           }
 
            if (cache[0] < uoffset) {
                /* The cache already knows part of the way.   */
@@ -5464,7 +5465,7 @@
        boffset = real_boffset;
     }
 
-    S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset);
+    S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset, send - start);
     return boffset;
 }
 
@@ -5524,7 +5525,8 @@
  */
 
 static void
-S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8,
+                          STRLEN blen)
 {
     STRLEN *cache;
     if (SvREADONLY(sv))
@@ -5567,10 +5569,62 @@
                       " real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv);
        }
     }
-    cache[0] = utf8;
-    cache[1] = byte;
+
+    /* Cache is held with the later position first, to simplify the code
+       that deals with unbounded ends.  */
+       
+    ASSERT_UTF8_CACHE(cache);
+    if (cache[1] == 0) {
+       /* Cache is totally empty  */
+       cache[0] = utf8;
+       cache[1] = byte;
+    } else if (cache[3] == 0) {
+       if (byte > cache[1]) {
+           /* New one is larger, so goes first.  */
+           cache[2] = cache[0];
+           cache[3] = cache[1];
+           cache[0] = utf8;
+           cache[1] = byte;
+       } else {
+           cache[2] = utf8;
+           cache[3] = byte;
+       }
+    } else {
+#define THREEWAY_SQUARE(a,b,c,d) \
+           ((float)((d) - (c))) * ((float)((d) - (c))) \
+           + ((float)((c) - (b))) * ((float)((c) - (b))) \
+              + ((float)((b) - (a))) * ((float)((b) - (a)))
+
+       /* Cache has 2 slots in use, and we know three potential pairs.
+          Keep the two that give the lowest RMS distance. Do the
+          calculation in bytes simply because we always know the byte
+          length.  The square root preserves the ordering of non-negative
+          values, so don't bother with the actual square root.  */
+       const float existing = THREEWAY_SQUARE(0, cache[3], cache[1], blen);
+       if (byte > cache[1]) {
+           /* New position is after the existing pair of pairs.  */
+           const float keep_earlier
+               = THREEWAY_SQUARE(0, cache[3], byte, blen);
+           const float keep_later
+               = THREEWAY_SQUARE(0, cache[1], byte, blen);
+
+           if (keep_later < keep_earlier) {
+               if (keep_later < existing) {
+                   cache[2] = cache[0];
+                   cache[3] = cache[1];
+                   cache[0] = utf8;
+                   cache[1] = byte;
+               }
+           }
+           else {
+               if (keep_earlier < existing) {
+                   cache[0] = utf8;
+                   cache[1] = byte;
+               }
+           }
+       }
+    }
     ASSERT_UTF8_CACHE(cache);
-    /* Drop the stale "length" cache */
 }
 
 /* If we don't know the character offset of the end of a region, our only
@@ -5626,15 +5680,16 @@
     const U8* s;
     const STRLEN byte = *offsetp;
     STRLEN len;
+    STRLEN blen;
     MAGIC* mg = NULL;
     const U8* send;
 
     if (!sv)
        return;
 
-    s = (const U8*)SvPV_const(sv, len);
+    s = (const U8*)SvPV_const(sv, blen);
 
-    if (len < byte)
+    if (blen < byte)
        Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
 
     send = s + byte;
@@ -5648,6 +5703,11 @@
                *offsetp = cache[0];
                return;
            }
+           if (cache[3] == byte) {
+               /* An exact match. */
+               *offsetp = cache[2];
+               return;
+           }
 
            if (cache[1] < byte) {
                /* We already know part of the way. */
@@ -5655,7 +5715,7 @@
                    /* Actually, we know the end too.  */
                    len = cache[0]
                        + S_sv_pos_b2u_midway(aTHX_ s + cache[1], send,
-                                             s + len, mg->mg_len - cache[0]);
+                                             s + blen, mg->mg_len - cache[0]);
                } else {
                    len = cache[0]
                        + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send);
@@ -5681,7 +5741,7 @@
                }
            }
        } else if (mg->mg_len != -1) {
-           len = S_sv_pos_b2u_midway(aTHX_ s, send, s + len, mg->mg_len);
+           len = S_sv_pos_b2u_midway(aTHX_ s, send, s + blen, mg->mg_len);
        } else {
            len = S_sv_pos_b2u_forwards(aTHX_ s, send);
        }
@@ -5691,7 +5751,7 @@
     }
     *offsetp = len;
 
-    S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len);
+    S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len, blen);
 }
 
 /*
End of Patch.

Change 27582: Pass the (byte) length of the entire string into utf8_mg_pos_cache_update()

Reply via email to