Change 27582 by [EMAIL PROTECTED] on 2006/03/23 12:44:35
Pass the (byte) length of the entire string into
utf8_mg_pos_cache_update().
Start to use the cache to store two byte/utf-8 offset pairs.
Add the first third of the cache update code.
Affected files ...
... //depot/perl/embed.fnc#336 edit
... //depot/perl/embed.h#573 edit
... //depot/perl/proto.h#683 edit
... //depot/perl/sv.c#1206 edit
Differences ...
==== //depot/perl/embed.fnc#336 (text) ====
Index: perl/embed.fnc
--- perl/embed.fnc#335~27580~ 2006-03-23 03:33:48.000000000 -0800
+++ perl/embed.fnc 2006-03-23 04:44:35.000000000 -0800
@@ -1361,7 +1361,7 @@
|NN const U8 *const start|NN const U8 *const send \
|STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0
s |void |utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \
- |STRLEN byte|STRLEN utf8
+ |STRLEN byte|STRLEN utf8|STRLEN blen
s |STRLEN |sv_pos_b2u_forwards|NN const U8 *s|NN const U8 *const target
s |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \
|NN const U8 *end|STRLEN endu
==== //depot/perl/embed.h#573 (text+w) ====
Index: perl/embed.h
--- perl/embed.h#572~27580~ 2006-03-23 03:33:48.000000000 -0800
+++ perl/embed.h 2006-03-23 04:44:35.000000000 -0800
@@ -3537,7 +3537,7 @@
#define sv_pos_u2b_forwards(a,b,c) S_sv_pos_u2b_forwards(aTHX_ a,b,c)
#define sv_pos_u2b_midway(a,b,c,d) S_sv_pos_u2b_midway(aTHX_ a,b,c,d)
#define sv_pos_u2b_cached(a,b,c,d,e,f,g) S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g)
-#define utf8_mg_pos_cache_update(a,b,c,d) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d)
+#define utf8_mg_pos_cache_update(a,b,c,d,e) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
#define sv_pos_b2u_forwards(a,b) S_sv_pos_b2u_forwards(aTHX_ a,b)
#define sv_pos_b2u_midway(a,b,c,d) S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
#define stringify_regexp(a,b,c) S_stringify_regexp(aTHX_ a,b,c)
==== //depot/perl/proto.h#683 (text+w) ====
Index: perl/proto.h
--- perl/proto.h#682~27580~ 2006-03-23 03:33:48.000000000 -0800
+++ perl/proto.h 2006-03-23 04:44:35.000000000 -0800
@@ -3725,7 +3725,7 @@
__attribute__nonnull__(pTHX_3)
__attribute__nonnull__(pTHX_4);
-STATIC void S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+STATIC void S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8, STRLEN blen)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
==== //depot/perl/sv.c#1206 (text) ====
Index: perl/sv.c
--- perl/sv.c#1205~27580~ 2006-03-23 03:33:48.000000000 -0800
+++ perl/sv.c 2006-03-23 04:44:35.000000000 -0800
@@ -30,19 +30,16 @@
#endif
#ifdef PERL_UTF8_CACHE_ASSERT
-/* The cache element 0 is the Unicode offset;
- * the cache element 1 is the byte offset of the element 0;
- * the cache element 2 is the Unicode length of the substring;
- * the cache element 3 is the byte length of the substring;
- * The checking of the substring side would be good
- * but substr() has enough code paths to make my head spin;
- * if adding more checks watch out for the following tests:
+/* if adding more checks watch out for the following tests:
* t/op/index.t t/op/length.t t/op/pat.t t/op/substr.t
* lib/utf8.t lib/Unicode/Collate/t/index.t
* --jhi
*/
#define ASSERT_UTF8_CACHE(cache) \
- STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); } } STMT_END
+ STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); \
+ assert((cache)[2] <= (cache)[3]); \
+ assert((cache)[3] <= (cache)[1]);} \
+ } STMT_END
#else
#define ASSERT_UTF8_CACHE(cache) NOOP
#endif
@@ -5405,6 +5402,10 @@
/* An exact match. */
return cache[1];
}
+ if (cache[2] == uoffset) {
+ /* An exact match. */
+ return cache[3];
+ }
if (cache[0] < uoffset) {
/* The cache already knows part of the way. */
@@ -5464,7 +5465,7 @@
boffset = real_boffset;
}
- S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset);
+ S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset, send - start);
return boffset;
}
@@ -5524,7 +5525,8 @@
*/
static void
-S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8,
+ STRLEN blen)
{
STRLEN *cache;
if (SvREADONLY(sv))
@@ -5567,10 +5569,62 @@
" real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv);
}
}
- cache[0] = utf8;
- cache[1] = byte;
+
+ /* Cache is held with the later position first, to simplify the code
+ that deals with unbounded ends. */
+
+ ASSERT_UTF8_CACHE(cache);
+ if (cache[1] == 0) {
+ /* Cache is totally empty */
+ cache[0] = utf8;
+ cache[1] = byte;
+ } else if (cache[3] == 0) {
+ if (byte > cache[1]) {
+ /* New one is larger, so goes first. */
+ cache[2] = cache[0];
+ cache[3] = cache[1];
+ cache[0] = utf8;
+ cache[1] = byte;
+ } else {
+ cache[2] = utf8;
+ cache[3] = byte;
+ }
+ } else {
+#define THREEWAY_SQUARE(a,b,c,d) \
+ ((float)((d) - (c))) * ((float)((d) - (c))) \
+ + ((float)((c) - (b))) * ((float)((c) - (b))) \
+ + ((float)((b) - (a))) * ((float)((b) - (a)))
+
+ /* Cache has 2 slots in use, and we know three potential pairs.
+ Keep the two that give the lowest RMS distance. Do the
+ calculation in bytes simply because we always know the byte
+ length. The square root has the same ordering as the positive value,
+ so don't bother with the actual square root. */
+ const float existing = THREEWAY_SQUARE(0, cache[3], cache[1], blen);
+ if (byte > cache[1]) {
+ /* New position is after the existing pair of pairs. */
+ const float keep_earlier
+ = THREEWAY_SQUARE(0, cache[3], byte, blen);
+ const float keep_later
+ = THREEWAY_SQUARE(0, cache[1], byte, blen);
+
+ if (keep_later < keep_earlier) {
+ if (keep_later < existing) {
+ cache[2] = cache[0];
+ cache[3] = cache[1];
+ cache[0] = utf8;
+ cache[1] = byte;
+ }
+ }
+ else {
+ if (keep_earlier < existing) {
+ cache[0] = utf8;
+ cache[1] = byte;
+ }
+ }
+ }
+ }
ASSERT_UTF8_CACHE(cache);
- /* Drop the stale "length" cache */
}
/* If we don't know the character offset of the end of a region, our only
@@ -5626,15 +5680,16 @@
const U8* s;
const STRLEN byte = *offsetp;
STRLEN len;
+ STRLEN blen;
MAGIC* mg = NULL;
const U8* send;
if (!sv)
return;
- s = (const U8*)SvPV_const(sv, len);
+ s = (const U8*)SvPV_const(sv, blen);
- if (len < byte)
+ if (blen < byte)
Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
send = s + byte;
@@ -5648,6 +5703,11 @@
*offsetp = cache[0];
return;
}
+ if (cache[3] == byte) {
+ /* An exact match. */
+ *offsetp = cache[2];
+ return;
+ }
if (cache[1] < byte) {
/* We already know part of the way. */
@@ -5655,7 +5715,7 @@
/* Actually, we know the end too. */
len = cache[0]
+ S_sv_pos_b2u_midway(aTHX_ s + cache[1], send,
- s + len, mg->mg_len - cache[0]);
+ s + blen, mg->mg_len - cache[0]);
} else {
len = cache[0]
+ S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send);
@@ -5681,7 +5741,7 @@
}
}
} else if (mg->mg_len != -1) {
- len = S_sv_pos_b2u_midway(aTHX_ s, send, s + len, mg->mg_len);
+ len = S_sv_pos_b2u_midway(aTHX_ s, send, s + blen, mg->mg_len);
} else {
len = S_sv_pos_b2u_forwards(aTHX_ s, send);
}
@@ -5691,7 +5751,7 @@
}
*offsetp = len;
- S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len);
+ S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len, blen);
}
/*
End of Patch.