Revision: 1699
          http://mrbs.svn.sourceforge.net/mrbs/?rev=1699&view=rev
Author:   jberanek
Date:     2010-12-13 23:21:37 +0000 (Mon, 13 Dec 2010)

Log Message:
-----------
- New, generally faster utf8 handling functions, hopefully. utf8_substr()
 with a large offset does appear to be slower though. :(
- New utf8 helper, utf8_seq(). Returns the UTF-8 sequence at the given
 byte offset, and updates that offset (passed by reference) to point at
 the following sequence. Allows you to step through a UTF-8 string
 without continuously stepping from the start or building temporary strings.

Modified Paths:
--------------
    mrbs/branches/ics_attachments/web/language.inc

Modified: mrbs/branches/ics_attachments/web/language.inc
===================================================================
--- mrbs/branches/ics_attachments/web/language.inc      2010-12-13 22:01:54 UTC (rev 1698)
+++ mrbs/branches/ics_attachments/web/language.inc      2010-12-13 23:21:37 UTC (rev 1699)
@@ -770,7 +770,9 @@
   }
   return $string;
 }
-  
+
+
+//  
 function utf8_strftime($format, $time)
 {
   // %p doesn't actually work in some locales, we have to patch it up ourselves
@@ -794,7 +796,7 @@
 
 // UTF-8 compatible substr function obtained from a contribution by
 // "frank at jkelloggs dot dk" in the PHP online manual for substr()
-function utf8_substr($str,$start)
+function utf8_substr_old($str,$start)
 {
   preg_match_all("/./su", $str, $ar);
 
@@ -806,14 +808,59 @@
   }
 }
 
+function utf8_substr($str, $start)
+{
+  $i = 0;
+  $index = 0;
+  while ((ord($str[$index]) != 0) && ($i < $start))
+  {
+    $index = utf8_next_index($str, $index);
+    $i++;
+  }
 
+  if (!isset($index))
+  {
+    return NULL;
+  }
+  if (func_num_args() >= 3)
+  {
+    $length = func_get_arg(2);
+    $end_index = $index;
+
+    $j = 0;
+    while (isset($end_index) && ($j < $length))
+    {
+      $end_index = utf8_next_index($str, $end_index);
+      $j++;
+    }
+    $j = 0;
+    $ret = "."; // dummy to fool PHP
+    for ($i = $index;
+         (ord($str[$i]) != 0) && (!isset($end_index) || ($i < $end_index));
+         $i++)
+    { 
+      $ret[$j++] = $str[$i];
+    }
+    return $ret;
+  }
+  else
+  {
+    $j = 0;
+    $ret = "."; // dummy to fool PHP
+    for ($i = $index; ord($str[$i]) != 0; $i++)
+    {
+      $ret[$j++] = $str[$i];
+    }
+    return $ret;
+  }
+}
+
+
 // Takes a string (which may be UTF-8) and returns how long it is in
 // _bytes_
 function utf8_bytecount($str)
 {
-  preg_match_all("/./s", $str, $ar);
-
-  return count($ar[0]);
+  return count(str_split($str));
 }
 
 
@@ -821,17 +868,97 @@
 // removed from the front
 function utf8_next($str)
 {
-  return utf8_substr($str, 1);
+  $ret = NULL;
+
+  if (isset($str))
+  {
+    $index = utf8_next_index($str, 0);
+
+    if ($index)
+    {
+      $i = 0;
+      $ret = "."; // dummy to fool PHP
+      while (ord($str[$index]) != 0)
+      {
+        $ret[$i++] = $str[$index++];
+      }
+    }
+  }
+  return $ret;
 }
 
 
+// Takes a UTF-8 string and a byte index, and returns the byte index of the
+// start of the next UTF-8 sequence, or NULL if there isn't one
+function utf8_next_index($str, $start)
+{
+  $ret = NULL;
+
+  $i = $start;
+
+  if (isset($str))
+  {
+    if (ord($str[$i]) < 0xc0)
+    {
+      $i++;
+    }
+    else
+    {
+      $i++;
+      while ((ord($str[$i]) & 0xc0) == 0x80)
+      {
+        $i++;
+      }
+    }
+    if (ord($str[$i]) != 0)
+    {
+      $ret = $i;
+    }
+  }
+  return $ret;
+}
+
+
+function utf8_seq($str, &$byte_index)
+{
+  $ret = "."; // dummy to fool PHP
+
+  $next = utf8_next_index($str, $byte_index);
+
+  if (isset($next))
+  {
+    $j = 0;
+    for ($i = $byte_index; $i < $next; $i++)
+    {
+      $ret[$j] = $str[$i];
+      $j++;
+    }
+  }
+  else
+  {
+    $j = 0;
+    for ($i = $byte_index; ord($str[$i]) != 0; $i++)
+    {
+      $ret[$j] = $str[$i];
+      $j++;
+    }
+  }
+  $byte_index = $next;
+  return $ret;
+}
+
+  
 // Takes a UTF-8 string, and returns the number of _characters_ in the
 // string
 function utf8_strlen($str)
 {
-  preg_match_all("/./su", $str, $ar);
-
-  return count($ar[0]);
+  $len = 0;
+  $next = 0;
+  while ($next = utf8_next_index($str, $next))
+  {
+    $len++;
+  }
+  return $len;
 }
 
 


This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.

------------------------------------------------------------------------------
Lotusphere 2011
Register now for Lotusphere 2011 and learn how
to connect the dots, take your collaborative environment
to the next level, and enter the era of Social Business.
http://p.sf.net/sfu/lotusphere-d2d
_______________________________________________
Mrbs-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mrbs-commits

Reply via email to