Re: [PATCH] SBuf c-string comparisons

Amos Jeffries Thu, 17 Apr 2014 03:44:16 -0700

On 17/04/2014 7:23 a.m., Alex Rousskov wrote:
> On 04/16/2014 12:30 PM, Amos Jeffries wrote:
>> On 17/04/2014 2:32 a.m., Alex Rousskov wrote:
>>> On 04/16/2014 12:05 AM, Amos Jeffries wrote:
>>>> I don't see any way around this without hand-crafing a full byte-by-byte
>>>> strncmp replacement. 
> 
>>> I am not against hand-crafting if it is really necessary -- we already
>>> hand-craft memCaseCmp IIRC. Personally, I would hand-craft _if_ system
>>> implementation of strncmp() is just a basic loop rather than some
>>> complicated, optimized low-level code. Otherwise, I would find a way to
>>> avoid strlen().
> 
>> Which system? which architecture? which compiler? which library?
> 
> Any reasonable/popular implementation selected by the developer. This is
> a one-time check done by the developer, not an automated check done
> during Squid build. Sorry I was not clear about that.
> 
> 
>> That is a tricky "_if_" to code for.
> 
> I hope the above clarifies that no coding is necessary for this _if_.
>


So glibc: a do-while loop scanning word-by-word with individual
byte-by-byte loop (unrolled) over the bytes in each word.


37  if (n >= 4)
38  {
39  size_t n4 = n >> 2;
40  do
41  {
42  c1 = (unsigned char) *s1++;
43  c2 = (unsigned char) *s2++;
44  if (c1 == '\0' || c1 != c2)
45  return c1 - c2;
46  c1 = (unsigned char) *s1++;
47  c2 = (unsigned char) *s2++;
48  if (c1 == '\0' || c1 != c2)
49  return c1 - c2;
50  c1 = (unsigned char) *s1++;
51  c2 = (unsigned char) *s2++;
52  if (c1 == '\0' || c1 != c2)
53  return c1 - c2;
54  c1 = (unsigned char) *s1++;
55  c2 = (unsigned char) *s2++;
56  if (c1 == '\0' || c1 != c2)
57  return c1 - c2;
58  } while (--n4 > 0);
59  n &= 3;
60  }
61
62  while (n > 0)
63  {
64  c1 = (unsigned char) *s1++;
65  c2 = (unsigned char) *s2++;
66  if (c1 == '\0' || c1 != c2)
67  return c1 - c2;
68  n--;
69  }
70

> 
>> So...
>>  trying to find a way to determine the length of a c-string potentially
>> unterminated, without using strlen() or otherwise looping over it.
>> OR,
>>  trying to find out where the system strn*() function stopped.
>>
>> I'm all ears for suggestions on that little gem.
> 
> I do not think the above is possible.
> 

Indeed.

> 
>>> Since the hand-crafted implementation is simple, I do not consider it an
>>> overkill. And I am sure there is a way to avoid it if needed.
> 
>> I would absolutely love to hear what that is.
> 
> See the cloning sketch in the previous email. To summarize, known
> solutions are:
> 
>   1) a custom loop to properly limit SBuf iteration
>   2) cloning to guarantee SBuf 0-termination

The cloning mechanism uses strlen() internally. So no benefit, but extra
malloc+free costs.

> 
> Since I expect (2) to be sometimes a lot slower than (1), I would go for
> (1), especially if a quick check of a popular strncmp() implementation
> does not expose some low-level optimizations that we would not be able
> (or would not want) to duplicate in Squid.
> 
> 
> Hope this clarifies,
> 
> Alex.
> 

Patch with hand-rolled scanner attached.

Amos

=== modified file 'src/SBuf.cc'
--- src/SBuf.cc 2014-04-06 07:08:04 +0000
+++ src/SBuf.cc 2014-04-17 10:39:36 +0000
@@ -360,64 +360,101 @@
     store_->mem[off_+pos] = toset;
     ++stats.setChar;
 }
 
 static int
 memcasecmp(const char *b1, const char *b2, SBuf::size_type len)
 {
     int rv=0;
     while (len > 0) {
         rv = tolower(*b1)-tolower(*b2);
         if (rv != 0)
             return rv;
         ++b1;
         ++b2;
         --len;
     }
     return rv;
 }
 
 int
-SBuf::compare(const SBuf &S, SBufCaseSensitive isCaseSensitive, size_type n) 
const
+SBuf::compare(const SBuf &S, const SBufCaseSensitive isCaseSensitive, const 
size_type n) const
 {
     if (n != npos)
         return substr(0,n).compare(S.substr(0,n),isCaseSensitive);
 
-    size_type byteCompareLen = min(S.length(), length());
+    const size_type byteCompareLen = min(S.length(), length());
     ++stats.compareSlow;
     int rv = 0;
     if (isCaseSensitive == caseSensitive) {
         rv = memcmp(buf(), S.buf(), byteCompareLen);
     } else {
         rv = memcasecmp(buf(), S.buf(), byteCompareLen);
     }
     if (rv != 0)
         return rv;
     if (length() == S.length())
         return 0;
     if (length() > S.length())
         return 1;
     return -1;
 }
 
+int
+SBuf::compare(const char *s, const SBufCaseSensitive isCaseSensitive, const 
size_type n) const
+{
+    // 0-length comparison is always true regardless of buffer states
+    if (!n || (!length() && *s=='\0')) {
+        ++stats.compareFast;
+        return 0;
+    }
+
+    // N-length compare MUST provide a non-NULL C-string pointer
+    assert(s);
+
+    // brute-force scan in order to avoid ever needing strlen() on a c-string.
+    ++stats.compareSlow;
+    const char *left = buf();
+    const char *right = s;
+    size_type byteCount = min(length(), n);
+    int rv = 0;
+
+    if (isCaseSensitive == caseSensitive) {
+        while ((rv = *left - *right++) == 0) {
+            if (*left++ == '\0' || --byteCount == 0)
+                break;
+        }
+    } else {
+        while ((rv = tolower(*left) - tolower(*right++)) == 0) {
+            if (*left++ == '\0' || --byteCount == 0)
+                break;
+        }
+    }
+    if (byteCount)
+        return rv;
+    if (length() < n)
+        return '\0' - *right;
+    return rv;
+}
+
 bool
-SBuf::startsWith(const SBuf &S, SBufCaseSensitive isCaseSensitive) const
+SBuf::startsWith(const SBuf &S, const SBufCaseSensitive isCaseSensitive) const
 {
     debugs(24, 8, id << " startsWith " << S.id << ", caseSensitive: " <<
            isCaseSensitive);
     if (length() < S.length()) {
         debugs(24, 8, "no, too short");
         ++stats.compareFast;
         return false;
     }
     return (compare(S, isCaseSensitive, S.length()) == 0);
 }
 
 bool
 SBuf::operator ==(const SBuf & S) const
 {
     debugs(24, 8, id << " == " << S.id);
     if (length() != S.length()) {
         debugs(24, 8, "no, different lengths");
         ++stats.compareFast;
         return false; //shortcut: must be equal length
     }

=== modified file 'src/SBuf.h'
--- src/SBuf.h  2014-04-06 07:08:04 +0000
+++ src/SBuf.h  2014-04-16 17:49:48 +0000
@@ -238,58 +238,71 @@
     char at(size_type pos) const {checkAccessBounds(pos); return 
operator[](pos);}
 
     /** direct-access set a byte at a specified operation.
      *
      * \param pos the position to be overwritten
      * \param toset the value to be written
      * \throw OutOfBoundsException when pos is of bounds
      * \note bounds is 0 <= pos < length(); caller must pay attention to 
signedness
      * \note performs a copy-on-write if needed.
      */
     void setAt(size_type pos, char toset);
 
     /** compare to other SBuf, str(case)cmp-style
      *
      * \param isCaseSensitive one of caseSensitive or caseInsensitive
      * \param n compare up to this many bytes. if npos (default), compare 
whole SBufs
      * \retval >0 argument of the call is greater than called SBuf
      * \retval <0 argument of the call is smaller than called SBuf
      * \retval 0  argument of the call has the same contents of called SBuf
      */
-    int compare(const SBuf &S, SBufCaseSensitive isCaseSensitive, size_type n 
= npos) const;
+    int compare(const SBuf &S, const SBufCaseSensitive isCaseSensitive, const 
size_type n = npos) const;
 
-    /// shorthand version for compare
-    inline int cmp(const SBuf &S, size_type n = npos) const {
+    /// shorthand version for compare()
+    inline int cmp(const SBuf &S, const size_type n = npos) const {
         return compare(S,caseSensitive,n);
     }
 
-    /// shorthand version for case-insensitive comparison
-    inline int caseCmp(const SBuf &S, size_type n = npos) const {
+    /// shorthand version for case-insensitive compare()
+    inline int caseCmp(const SBuf &S, const size_type n = npos) const {
+        return compare(S,caseInsensitive,n);
+    }
+
+    /// Comparison with a C-string.
+    int compare(const char *s, const SBufCaseSensitive isCaseSensitive, const 
size_type n = npos) const;
+
+    /// Shorthand version for C-string compare().
+    inline int cmp(const char *S, const size_type n = npos) const {
+        return compare(S,caseSensitive,n);
+    }
+
+    /// Shorthand version for case-insensitive C-string compare().
+    inline int caseCmp(const char *S, const size_type n = npos) const {
         return compare(S,caseInsensitive,n);
     }
 
     /** check whether the entire supplied argument is a prefix of the SBuf.
      *  \param S the prefix to match against
      *  \param isCaseSensitive one of caseSensitive or caseInsensitive
      *  \retval true argument is a prefix of the SBuf
      */
-    bool startsWith(const SBuf &S, SBufCaseSensitive isCaseSensitive = 
caseSensitive) const;
+    bool startsWith(const SBuf &S, const SBufCaseSensitive isCaseSensitive = 
caseSensitive) const;
 
     bool operator ==(const SBuf & S) const;
     bool operator !=(const SBuf & S) const;
     bool operator <(const SBuf &S) const {return (cmp(S) < 0);}
     bool operator >(const SBuf &S) const {return (cmp(S) > 0);}
     bool operator <=(const SBuf &S) const {return (cmp(S) <= 0);}
     bool operator >=(const SBuf &S) const {return (cmp(S) >= 0);}
 
     /** Consume bytes at the head of the SBuf
      *
      * Consume N chars at SBuf head, or to SBuf's end,
      * whichever is shorter. If more bytes are consumed than available,
      * the SBuf is emptied
      * \param n how many bytes to remove; could be zero.
      *     npos (or no argument) means 'to the end of SBuf'
      * \return a new SBuf containing the consumed bytes.
      */
     SBuf consume(size_type n = npos);
 
     /// gets global statistic informations

Re: [PATCH] SBuf c-string comparisons

Reply via email to