This is an automated email from the ASF dual-hosted git repository. xiaoxiang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git
commit c3cefe6da22d794b03ec130190ad9bf023b23eb8 Author: chao.an <[email protected]> AuthorDate: Thu Mar 25 23:37:30 2021 +0800 libc/string: Use Byte-Shift algorithm for very long needles Reference here: https://github.com/taleinat/byteshift_strstr Signed-off-by: chao.an <[email protected]> Signed-off-by: anjiahao <[email protected]> --- libs/libc/string/lib_strstr.c | 227 +++++++++++++++++++++++++++++++++++------- 1 file changed, 192 insertions(+), 35 deletions(-) diff --git a/libs/libc/string/lib_strstr.c b/libs/libc/string/lib_strstr.c index d8ee9f2768..ab7bfd7cc1 100644 --- a/libs/libc/string/lib_strstr.c +++ b/libs/libc/string/lib_strstr.c @@ -25,67 +25,224 @@ #include <nuttx/config.h> #include <string.h> +#include <stdbool.h> +#include <arpa/inet.h> + +/**************************************************************************** + * Pre-processor Definitions + ****************************************************************************/ + +#if LONG_MAX == INT32_MAX + #define LONG_INT_IS_4_BYTES 1 + #define LONG_INT_N_BYTES 4 +#else + #define LONG_INT_N_BYTES sizeof(long) +#endif + +/* Determine the size, in bytes, of a long integer. */ + +#if defined (CONFIG_ENDIAN_BIG) + #define ULONG_BIGENDIAN(x) (x) +#elif defined (LONG_INT_IS_4_BYTES) + #define ULONG_BIGENDIAN(x) htonl(x) +#else + #undef ULONG_BIGENDIAN +#endif /**************************************************************************** * Public Functions ****************************************************************************/ -#undef strstr /* See mm/README.txt */ -FAR char *strstr(FAR const char *str, FAR const char *substr) +/* Finds the first occurrence of the sub-string needle in the + * string haystack. Returns NULL if needle was not found. + */ + +FAR char *strstr(FAR const char *haystack, FAR const char *needle) { - FAR const char *candidate; /* Candidate in str with matching start character */ - char ch; /* First character of the substring */ - size_t len; /* The length of the substring */ +#ifndef ULONG_BIGENDIAN + FAR const unsigned char *needle_cmp_end; +#endif + FAR const unsigned char *i_haystack; + const char needle_first = *needle; + FAR const unsigned char *i_needle; + unsigned long last_haystack_chars; + unsigned long last_needle_chars; + FAR const char *sub_start; + size_t needle_cmp_len; + bool identical = true; + unsigned long mask; + size_t compare_len; + size_t needle_len; + + if (!*needle) + { + return (FAR char *)haystack; + } + + /* Runs strchr() on the first section of the haystack as it has a lower + * algorithmic complexity for discarding the first non-matching characters. + */ + + haystack = strchr(haystack, needle_first); + if (!haystack) /* First character of needle is not in the haystack. */ + { + return NULL; + } - /* Special case the empty substring */ + /* First characters of haystack and needle are the same now. Both are + * guaranteed to be at least one character long. + * Now computes the sum of the first needle_len characters of haystack + * minus the sum of characters values of needle. + */ - len = strlen(substr); - ch = *substr; + i_haystack = (FAR const unsigned char *)haystack + 1; + i_needle = (FAR const unsigned char *)needle + 1; - if (!ch) + while (*i_haystack && *i_needle) { - /* We'll say that an empty substring matches at the beginning of - * the string - */ + identical &= *i_haystack++ == *i_needle++; + } + + /* i_haystack now references the (needle_len + 1)-th character. */ + + if (*i_needle) /* haystack is smaller than needle. */ + { + return NULL; + } + else if (identical) + { + return (FAR char *)haystack; + } + + needle_len = i_needle - (FAR const unsigned char *)needle; - return (FAR char *)str; + /* Note: + * needle_len > 1, because we checked that it isn't zero, and if it + * is 1 then identical must be true because the first strchr() ensured + * that the first characters are identical + */ + + sub_start = haystack; + needle_cmp_len = (needle_len < LONG_INT_N_BYTES) ? + needle_len : LONG_INT_N_BYTES; + +#ifdef ULONG_BIGENDIAN + last_needle_chars = + ULONG_BIGENDIAN(*((FAR unsigned long *)(i_needle - needle_cmp_len))) + >> (8 * (LONG_INT_N_BYTES - needle_cmp_len)); + last_haystack_chars = + ULONG_BIGENDIAN(*((FAR unsigned long *)(i_haystack - needle_cmp_len))) + >> (8 * (LONG_INT_N_BYTES - needle_cmp_len)); +#else + needle_cmp_end = i_needle; + + i_needle -= needle_cmp_len; + i_haystack -= needle_cmp_len; + last_needle_chars = 0; + last_haystack_chars = 0; + + while (i_needle != needle_cmp_end) + { + last_needle_chars <<= 8; + last_needle_chars ^= *i_needle++; + last_haystack_chars <<= 8; + last_haystack_chars ^= *i_haystack++; } +#endif - /* Search for the substring */ + /* At this point: + * needle is at least two characters long + * haystack is at least needle_len characters long (also at least two) + * the first characters of needle and haystack are identical + */ - candidate = str; - for (; ; ) + if (needle_len > LONG_INT_N_BYTES + 1) { - /* strchr() will return a pointer to the next occurrence of the - * character ch in the string + /* we will call memcmp() only once we know that the LONG_INT_N_BYTES + * last chars are equal, so it will be enough to compare all but the + * last LONG_INT_N_BYTES characters */ - candidate = strchr(candidate, ch); - if (!candidate || strlen(candidate) < len) - { - /* First character of the substring does not appear in the string - * or the remainder of the string is not long enough to contain the - * substring. - */ + compare_len = needle_len - LONG_INT_N_BYTES; - return NULL; - } + /* iterate through the remainder of haystack while checking for + * identity of the last LONG_INT_N_BYTES, and checking the rest + * with memcmp() + */ - /* Check if this is the beginning of a matching substring */ + while (*i_haystack) + { + last_haystack_chars <<= 8; + last_haystack_chars ^= *i_haystack++; + + sub_start++; + if (last_haystack_chars == last_needle_chars && + memcmp(sub_start, needle, compare_len) == 0) + { + return (FAR char *)sub_start; + } + } + } + else if (needle_len == LONG_INT_N_BYTES + 1) + { + /* iterate through the remainder of haystack while checking for + * identity of the last LONG_INT_N_BYTES as well as the single + * additional character, which is the first one + */ - if (strncmp(candidate, substr, len) == 0) + while (*i_haystack) { - return (FAR char *)candidate; + last_haystack_chars <<= 8; + last_haystack_chars ^= *i_haystack++; + + sub_start++; + if (last_haystack_chars == last_needle_chars && + *sub_start == needle_first) + { + return (FAR char *)sub_start; + } } + } + else if (needle_len == LONG_INT_N_BYTES) + { + /* iterate through the remainder of haystack while checking for + * identity of the last LONG_INT_N_BYTES characters, which + * should exactly match the entire needle + */ - /* No, find the next candidate after this one */ + while (*i_haystack) + { + last_haystack_chars <<= 8; + last_haystack_chars ^= *i_haystack++; - candidate++; + if (last_haystack_chars == last_needle_chars) + { + return (FAR char *)(i_haystack - needle_len); + } + } } + else /* needle_len < LONG_INT_N_BYTES */ + { + mask = (((unsigned long)1) << (needle_len * 8)) - 1; + last_needle_chars &= mask; - /* Won't get here, but some compilers might complain. Other compilers - * might complain about this code being unreachable too. - */ + /* iterate through the remainder of haystack, updating the sums' + * difference and checking for identity whenever the difference + * is zero + */ + + while (*i_haystack) + { + last_haystack_chars <<= 8; + last_haystack_chars ^= *i_haystack++; + last_haystack_chars &= mask; + + if (last_haystack_chars == last_needle_chars) + { + return (FAR char *)(i_haystack - needle_len); + } + } + } return NULL; }
