Hello,

I found LIKE operators are slower on multi-byte encoding databases
than single-byte encoding ones. It comes from difference between
MatchText() and MBMatchText().

We've had an optimization for single-byte encodings using 
pg_database_encoding_max_length() == 1 test. I'll propose to extend it
in UTF-8 with locale-C case. All of trailing bytes are different from first
bytes in UTF-8 multi-byte characters, so we can use functions for single-bytes
and byte-wise comparison in the case. With the attached patch, the performance
of UTF-8 LIKE operators are pushed up to near other single-bytes encodings.

Databases initialized with locale-C are widely used in Japan, because
Japanese locale are broken in almost of platforms. Japanese user can
choose EUC-jp or UTF-8 as a server encoding, but I think UTF-8 will be
more and more used in the future.


---- test ----

initdb --no-locale --encoding=<encoding>

[HEAD]
SQL_ASCII :  7171 ms /  7203 ms /  7187 ms
LATIN1    :  7172 ms /  7156 ms /  7141 ms
UTF8      : 16235 ms / 16281 ms / 16281 ms
EUC_JP    : 17454 ms / 17453 ms / 17438 ms

[with patch]
SQL_ASCII :  7062 ms /  7125 ms /  7125 ms
LATIN1    :  7047 ms /  7063 ms /  7047 ms
UTF8      :  7188 ms /  7234 ms /  7235 ms
EUC_JP    : 17468 ms / 17453 ms / 17453 ms


CREATE OR REPLACE FUNCTION test() RETURNS INTEGER AS $$
DECLARE
    cnt integer;
BEGIN
    FOR i IN 1..1000 LOOP
        SELECT count(*) INTO cnt FROM item WHERE i_title LIKE 
'%BABABABABARIBA%' LIMIT 50;
    END LOOP;
    RETURN cnt;
END;
$$ LANGUAGE plpgsql;

SELECT count(*) FROM item; -- borrowed from DBT-1 (TPC-W)
 count 
-------
 10000
(1 row)

---- patch ----

Index: src/backend/utils/adt/like.c
===================================================================
--- src/backend/utils/adt/like.c        (head)
+++ src/backend/utils/adt/like.c        (working copy)
@@ -21,6 +21,7 @@
 
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
+#include "utils/pg_locale.h"
 
 
 #define LIKE_TRUE                                              1
@@ -119,6 +120,13 @@
 
 
 /*
+ * true iff match functions for single-byte characters are available.
+ */
+#define sb_match_available() \
+       (pg_database_encoding_max_length() == 1 || \
+        (lc_collate_is_c() && GetDatabaseEncoding() == PG_UTF8))
+
+/*
  *     interface routines called by the function manager
  */
 
@@ -138,7 +146,7 @@
        p = VARDATA(pat);
        plen = (VARSIZE(pat) - VARHDRSZ);
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
                result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
        else
                result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
@@ -162,7 +170,7 @@
        p = VARDATA(pat);
        plen = (VARSIZE(pat) - VARHDRSZ);
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
                result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
        else
                result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
@@ -186,7 +194,7 @@
        p = VARDATA(pat);
        plen = (VARSIZE(pat) - VARHDRSZ);
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
                result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
        else
                result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
@@ -210,7 +218,7 @@
        p = VARDATA(pat);
        plen = (VARSIZE(pat) - VARHDRSZ);
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
                result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
        else
                result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
@@ -275,7 +283,7 @@
        int                     slen,
                                plen;
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
        {
                s = NameStr(*str);
                slen = strlen(s);
@@ -316,7 +324,7 @@
        int                     slen,
                                plen;
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
        {
                s = NameStr(*str);
                slen = strlen(s);
@@ -357,7 +365,7 @@
        int                     slen,
                                plen;
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
        {
                s = VARDATA(str);
                slen = (VARSIZE(str) - VARHDRSZ);
@@ -393,7 +401,7 @@
        int                     slen,
                                plen;
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
        {
                s = VARDATA(str);
                slen = (VARSIZE(str) - VARHDRSZ);
@@ -429,7 +437,7 @@
        text       *esc = PG_GETARG_TEXT_P(1);
        text       *result;
 
-       if (pg_database_encoding_max_length() == 1)
+       if (sb_match_available())
                result = do_like_escape(pat, esc);
        else
                result = MB_do_like_escape(pat, esc);


Regards,
---
ITAGAKI Takahiro
NTT Open Source Software Center


---------------------------(end of broadcast)---------------------------
TIP 7: You can help support the PostgreSQL project by donating at

                http://www.postgresql.org/about/donate

Reply via email to