string: Use Byte-Shift algorithm for very long needles

xiaoxiang Thu, 14 Apr 2022 06:46:23 -0700

This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git


commit c3cefe6da22d794b03ec130190ad9bf023b23eb8
Author: chao.an <[email protected]>
AuthorDate: Thu Mar 25 23:37:30 2021 +0800

    libc/string: Use Byte-Shift algorithm for very long needles
    
    Reference here:
    https://github.com/taleinat/byteshift_strstr
    
    Signed-off-by: chao.an <[email protected]>
    Signed-off-by: anjiahao <[email protected]>
---
 libs/libc/string/lib_strstr.c | 227 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 192 insertions(+), 35 deletions(-)

diff --git a/libs/libc/string/lib_strstr.c b/libs/libc/string/lib_strstr.c
index d8ee9f2768..ab7bfd7cc1 100644
--- a/libs/libc/string/lib_strstr.c
+++ b/libs/libc/string/lib_strstr.c
@@ -25,67 +25,224 @@
 #include <nuttx/config.h>
 
 #include <string.h>
+#include <stdbool.h>
+#include <arpa/inet.h>
+
+/****************************************************************************
+ * Pre-processor Definitions
+ ****************************************************************************/
+
+#if LONG_MAX == INT32_MAX
+  #define LONG_INT_IS_4_BYTES 1
+  #define LONG_INT_N_BYTES    4
+#else
+  #define LONG_INT_N_BYTES    sizeof(long)
+#endif
+
+/* Determine the size, in bytes, of a long integer. */
+
+#if defined (CONFIG_ENDIAN_BIG)
+  #define ULONG_BIGENDIAN(x) (x)
+#elif defined (LONG_INT_IS_4_BYTES)
+  #define ULONG_BIGENDIAN(x) htonl(x)
+#else
+  #undef ULONG_BIGENDIAN
+#endif
 
 /****************************************************************************
  * Public Functions
  ****************************************************************************/
 
-#undef strstr /* See mm/README.txt */
-FAR char *strstr(FAR const char *str, FAR const char *substr)
+/* Finds the first occurrence of the sub-string needle in the
+ * string haystack. Returns NULL if needle was not found.
+ */
+
+FAR char *strstr(FAR const char *haystack, FAR const char *needle)
 {
-  FAR const char *candidate; /* Candidate in str with matching start character 
*/
-  char ch;                   /* First character of the substring */
-  size_t len;                /* The length of the substring */
+#ifndef ULONG_BIGENDIAN
+  FAR const unsigned char *needle_cmp_end;
+#endif
+  FAR const unsigned char *i_haystack;
+  const char needle_first = *needle;
+  FAR const unsigned char *i_needle;
+  unsigned long last_haystack_chars;
+  unsigned long last_needle_chars;
+  FAR const char *sub_start;
+  size_t needle_cmp_len;
+  bool identical = true;
+  unsigned long mask;
+  size_t compare_len;
+  size_t needle_len;
+
+  if (!*needle)
+    {
+      return (FAR char *)haystack;
+    }
+
+  /* Runs strchr() on the first section of the haystack as it has a lower
+   * algorithmic complexity for discarding the first non-matching characters.
+   */
+
+  haystack = strchr(haystack, needle_first);
+  if (!haystack) /* First character of needle is not in the haystack. */
+    {
+      return NULL;
+    }
 
-  /* Special case the empty substring */
+  /* First characters of haystack and needle are the same now. Both are
+   * guaranteed to be at least one character long.
+   * Now computes the sum of the first needle_len characters of haystack
+   * minus the sum of characters values of needle.
+   */
 
-  len = strlen(substr);
-  ch  = *substr;
+  i_haystack = (FAR const unsigned char *)haystack + 1;
+  i_needle = (FAR const unsigned char *)needle + 1;
 
-  if (!ch)
+  while (*i_haystack && *i_needle)
     {
-      /* We'll say that an empty substring matches at the beginning of
-       * the string
-       */
+      identical &= *i_haystack++ == *i_needle++;
+    }
+
+  /* i_haystack now references the (needle_len + 1)-th character. */
+
+  if (*i_needle) /* haystack is smaller than needle. */
+    {
+      return NULL;
+    }
+  else if (identical)
+    {
+      return (FAR char *)haystack;
+    }
+
+  needle_len = i_needle - (FAR const unsigned char *)needle;
 
-      return (FAR char *)str;
+  /* Note:
+   * needle_len > 1, because we checked that it isn't zero, and if it
+   * is 1 then identical must be true because the first strchr() ensured
+   * that the first characters are identical
+   */
+
+  sub_start = haystack;
+  needle_cmp_len = (needle_len < LONG_INT_N_BYTES) ?
+                    needle_len : LONG_INT_N_BYTES;
+
+#ifdef ULONG_BIGENDIAN
+  last_needle_chars =
+    ULONG_BIGENDIAN(*((FAR unsigned long *)(i_needle - needle_cmp_len)))
+    >> (8 * (LONG_INT_N_BYTES - needle_cmp_len));
+  last_haystack_chars =
+    ULONG_BIGENDIAN(*((FAR unsigned long *)(i_haystack - needle_cmp_len)))
+    >> (8 * (LONG_INT_N_BYTES - needle_cmp_len));
+#else
+  needle_cmp_end = i_needle;
+
+  i_needle -= needle_cmp_len;
+  i_haystack -= needle_cmp_len;
+  last_needle_chars = 0;
+  last_haystack_chars = 0;
+
+  while (i_needle != needle_cmp_end)
+    {
+      last_needle_chars <<= 8;
+      last_needle_chars ^= *i_needle++;
+      last_haystack_chars <<= 8;
+      last_haystack_chars ^= *i_haystack++;
     }
+#endif
 
-  /* Search for the substring */
+  /* At this point:
+   * needle is at least two characters long
+   * haystack is at least needle_len characters long (also at least two)
+   * the first characters of needle and haystack are identical
+   */
 
-  candidate = str;
-  for (; ; )
+  if (needle_len > LONG_INT_N_BYTES + 1)
     {
-      /* strchr() will return a pointer to the next occurrence of the
-       * character ch in the string
+      /* we will call memcmp() only once we know that the LONG_INT_N_BYTES
+       * last chars are equal, so it will be enough to compare all but the
+       * last LONG_INT_N_BYTES characters
        */
 
-      candidate = strchr(candidate, ch);
-      if (!candidate || strlen(candidate) < len)
-        {
-          /* First character of the substring does not appear in the string
-           * or the remainder of the string is not long enough to contain the
-           * substring.
-           */
+      compare_len = needle_len - LONG_INT_N_BYTES;
 
-          return NULL;
-        }
+      /* iterate through the remainder of haystack while checking for
+       * identity of the last LONG_INT_N_BYTES, and checking the rest
+       * with memcmp()
+       */
 
-      /* Check if this is the beginning of a matching substring */
+      while (*i_haystack)
+        {
+          last_haystack_chars <<= 8;
+          last_haystack_chars ^= *i_haystack++;
+
+          sub_start++;
+          if (last_haystack_chars == last_needle_chars &&
+              memcmp(sub_start, needle, compare_len) == 0)
+            {
+              return (FAR char *)sub_start;
+            }
+        }
+    }
+  else if (needle_len == LONG_INT_N_BYTES + 1)
+    {
+      /* iterate through the remainder of haystack while checking for
+       * identity of the last LONG_INT_N_BYTES as well as the single
+       * additional character, which is the first one
+       */
 
-      if (strncmp(candidate, substr, len) == 0)
+      while (*i_haystack)
         {
-          return (FAR char *)candidate;
+          last_haystack_chars <<= 8;
+          last_haystack_chars ^= *i_haystack++;
+
+          sub_start++;
+          if (last_haystack_chars == last_needle_chars &&
+              *sub_start == needle_first)
+            {
+              return (FAR char *)sub_start;
+            }
         }
+    }
+  else if (needle_len == LONG_INT_N_BYTES)
+    {
+      /* iterate through the remainder of haystack while checking for
+       * identity of the last LONG_INT_N_BYTES characters, which
+       * should exactly match the entire needle
+       */
 
-      /* No, find the next candidate after this one */
+      while (*i_haystack)
+        {
+          last_haystack_chars <<= 8;
+          last_haystack_chars ^= *i_haystack++;
 
-      candidate++;
+          if (last_haystack_chars == last_needle_chars)
+            {
+              return (FAR char *)(i_haystack - needle_len);
+            }
+        }
     }
+  else /* needle_len < LONG_INT_N_BYTES */
+    {
+      mask = (((unsigned long)1) << (needle_len * 8)) - 1;
+      last_needle_chars &= mask;
 
-  /* Won't get here, but some compilers might complain.  Other compilers
-   * might complain about this code being unreachable too.
-   */
+      /* iterate through the remainder of haystack, updating the sums'
+       * difference and checking for identity whenever the difference
+       * is zero
+       */
+
+      while (*i_haystack)
+        {
+          last_haystack_chars <<= 8;
+          last_haystack_chars ^= *i_haystack++;
+          last_haystack_chars &= mask;
+
+          if (last_haystack_chars == last_needle_chars)
+            {
+              return (FAR char *)(i_haystack - needle_len);
+            }
+        }
+    }
 
   return NULL;
 }

[incubator-nuttx] 01/03: libc/string: Use Byte-Shift algorithm for very long needles

Reply via email to