Hi all,

As I like algorithms, and I don't like bad performances, I started to
look at the performances of the address book searches. There are several
places where a search for a pattern in a set of strings happen: the find
function in the address book GUI, the search for matching address when
composing a message, and the search for an existing entry when
auto-collecting addresses upon reception of a message.

All of those case share the same properties: we repeatedly look for the
same fixed pattern in a huge set of strings. The complications come from
the fact that we (sometimes) allow wildcards in the pattern, and we want
to be able to do case-insensitive searches.

Currently, the simplest case (no wildcard, case-sensitive) is handled
using string.Matches("*pattern*"). This is a pity in the case where the
pattern does not have wildcards inside, as some more efficient
algorithms could be used.

But the most time-consuming task seems to be to convert *all* the
searched strings to lower case so that the search is actually
case-insensitive. Would it be possible to cache the lower-case version
of the strings used in the address-book? The wxString class itself can
not be inherited as it has no virtual functions. Maybe we could build a
handle to both the string and it's lower case version. But then all the
code of the ADB should be modified to use this new class... Other
suggestions?

With respect to the search algorithm itself, I implemented a very simple
one given (among tons of others) in 

  http://www-igm.univ-mlv.fr/~lecroq/string/node19.html#SECTION00190

The code pretty simple and quite effective. As for regexps, it requires
that the pattern is 'compiled' before starting any search. Of course, it
does not handle wildcards, so either we remove this feature or we switch
depending on the fact that the given pattern has a wildcard or not. Note
that address auto-collection never uses wildcard, so would always
benefit from the new algorithm.

Here are some results I got with the code below:

When matching:
Using wxQSPattern... 1993ms
Using wxString::Matches... 7791ms
When NOT matching:
Using wxQSPattern... 3856ms
Using wxString::Matches... 8863ms
When NOT matching (lowercase):
Using wxQSPattern... 24355ms
Using wxString::Matches... 29602ms
Press any key to continue


#include <ctype.h>
#include "Mcommon.h"
#include "wx/string.h"
#include "wx/regex.h"
#include "wx/timer.h"
#include "wx/datetime.h"


class wxQSPattern {
private:
   wxString m_pattern;
   int      m_qsBc[256];
public:
   wxQSPattern(const wxString& pattern);
   size_t Find(const wxString& text);
   bool IsIn(const wxString& text) {
      return wxString::npos != Find(text);
   }
private:
   void init();
};


wxQSPattern::wxQSPattern(const wxString& pattern) 
   : m_pattern(pattern)
{
   init();
}

void wxQSPattern::init() {
   size_t i;
   size_t m = m_pattern.Length();
   const char* x = m_pattern.c_str();
   for (i = 0; i < 256; ++i) {
      m_qsBc[i] = m + 1;
   }
   for (i = 0; i < m; ++i) {
      m_qsBc[x[i]] = m - i;
   }
}

size_t wxQSPattern::Find(const wxString& text) {
   size_t m = m_pattern.Length();
   size_t n = text.Length();
   const char* x = m_pattern.c_str();
   const char* y = text.c_str();
   size_t j = 0;
   while (j <= n - m) {
      if (memcmp(x, y + j, m) == 0) {
         return j;
      }
      j += m_qsBc[y[j + m]];
   }
   return wxString::npos;
}


void testQSPattern() {
   const size_t maxIter = 10000000;
   size_t i;
   
   wxString text("GCATCGCAGAGAGTATACAGTACG");

   {
      wxString lookFor("GCAGAGAG");
      cout << "When matching:" << endl;
      cout << "Using wxQSPattern... " << flush;
      wxQSPattern pattern(lookFor);
      wxStopWatch chrono; 
      for (i = 0; i < maxIter; ++i) {
         (void)pattern.IsIn(text);
      }
      cout << chrono.Time() << "ms" << endl;


      cout << "Using wxString::Matches... " << flush;
      wxString re;
      re << '*' << lookFor << '*';
      chrono.Start(); 
      for (i = 0; i < maxIter; ++i) {
         (void)text.Matches(re);
      }
      cout << chrono.Time() << "ms" << endl;
   }
   {
      wxString lookFor("GCAGAGAGC");
      cout << "When NOT matching:" << endl;
      cout << "Using wxQSPattern... " << flush;
      wxQSPattern pattern(lookFor);
      wxStopWatch chrono; 
      for (i = 0; i < maxIter; ++i) {
         (void)pattern.IsIn(text);
      }
      cout << chrono.Time() << "ms" << endl;


      cout << "Using wxString::Matches... " << flush;
      wxString re;
      re << '*' << lookFor << '*';
      chrono.Start(); 
      for (i = 0; i < maxIter; ++i) {
         (void)text.Matches(re);
      }
      cout << chrono.Time() << "ms" << endl;
   }
   {
      wxString lookFor("GCAGAGAGC");
      cout << "When NOT matching (lowercase):" << endl;
      cout << "Using wxQSPattern... " << flush;
      wxQSPattern pattern(lookFor.Lower());
      wxStopWatch chrono; 
      for (i = 0; i < maxIter; ++i) {
         (void)pattern.IsIn(text.Lower());
      }
      cout << chrono.Time() << "ms" << endl;


      cout << "Using wxString::Matches... " << flush;
      wxString re;
      re << '*' << lookFor.Lower() << '*';
      chrono.Start(); 
      for (i = 0; i < maxIter; ++i) {
         (void)text.Lower().Matches(re);
      }
      cout << chrono.Time() << "ms" << endl;
   }
}


int main(int argc, char** argv)
{
#if 0
   testRemoveReplyPrefix();
   testDate();
#endif
   testQSPattern();

   return 0;


}


-- 
Xavier Nodet
"They that can give up essential liberty to obtain a little temporary
safety deserve neither liberty nor safety." - Benjamin Franklin, 1759.

Attachment: pgp00000.pgp
Description: PGP signature

Reply via email to