HAWQ-1317. Port "Fix some regex issues with out-of-range characters and large char ranges" from pg


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/23c45c74
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/23c45c74
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/23c45c74

Branch: refs/heads/2.1.0.0-incubating
Commit: 23c45c746e6223e039cff29ca61d8ec5a2bb63bd
Parents: 35ed3ad
Author: amyrazz44 <[email protected]>
Authored: Wed Feb 8 10:36:49 2017 +0800
Committer: ivan <[email protected]>
Committed: Thu Feb 9 14:34:14 2017 +0800

----------------------------------------------------------------------
 src/backend/regex/regc_lex.c               |   8 +-
 src/backend/regex/regc_locale.c            |  55 ++++++---
 src/backend/regex/regcomp.c                |   3 +
 src/include/regex/regcustom.h              |   3 +-
 src/include/regex/regex.h                  |   2 +-
 src/test/feature/full_tests.txt            |   2 +-
 src/test/feature/regex/ans/regex_basic.ans | 144 ++++++++++++++++++++++++
 src/test/feature/regex/sql/regex_basic.sql |  41 +++++++
 src/test/feature/regex/test_regex.cpp      |  27 +++++
 9 files changed, 262 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regc_lex.c
----------------------------------------------------------------------
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index 782c008..4f6b8ea 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -792,13 +792,13 @@ lexescape(struct vars * v)
                        break;
                case CHR('u'):
                        c = lexdigits(v, 16, 4, 4);
-                       if (ISERR())
+                       if (ISERR() || c < CHR_MIN || c > CHR_MAX)
                                FAILW(REG_EESCAPE);
                        RETV(PLAIN, c);
                        break;
                case CHR('U'):
                        c = lexdigits(v, 16, 8, 8);
-                       if (ISERR())
+                       if (ISERR() || c < CHR_MIN || c > CHR_MAX) 
                                FAILW(REG_EESCAPE);
                        RETV(PLAIN, c);
                        break;
@@ -816,7 +816,7 @@ lexescape(struct vars * v)
                case CHR('x'):
                        NOTE(REG_UUNPORT);
                        c = lexdigits(v, 16, 1, 255);           /* REs >255 long outside spec */
-                       if (ISERR())
+                       if (ISERR() || c < CHR_MIN || c > CHR_MAX)
                                FAILW(REG_EESCAPE);
                        RETV(PLAIN, c);
                        break;
@@ -872,6 +872,8 @@ lexescape(struct vars * v)
 
 /*
  * lexdigits - slurp up digits and return chr value
+ * This does not account for overflow; callers should range-check the result
+ * if maxlen is large enough to make that possible.
  */
 static chr                                             /* chr value; errors signalled via ERR */
 lexdigits(struct vars * v,

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regc_locale.c
----------------------------------------------------------------------
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 339380e..6ca59b2 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -471,8 +471,7 @@ range(struct vars * v,                      /* context */
        int                     nchrs;
        struct cvec *cv;
        celt            c,
-                               lc,
-                               uc;
+                               cc;
 
        if (a != b && !before(a, b))
        {
@@ -489,25 +488,47 @@ range(struct vars * v,                    /* context */
        }
 
        /*
-        * When case-independent, it's hard to decide when cvec ranges are usable,
-        * so for now at least, we won't try.  We allocate enough space for two
-        * case variants plus a little extra for the two title case variants.
-        */
-
-       nchrs = (b - a + 1) * 2 + 4;
-
-       cv = getcvec(v, nchrs, 0);
+       * When case-independent, it's hard to decide when cvec ranges are usable,
+       * so for now at least, we won't try.  We use a range for the originally
+       * specified chrs and then add on any case-equivalents that are outside
+       * that range as individual chrs.
+       *
+       * To ensure sane behavior if someone specifies a very large range, limit
+       * the allocation size to 100000 chrs (arbitrary) and check for overrun
+       * inside the loop below.
+       */
+
+       nchrs = b - a + 1;
+    
+       if (nchrs <= 0 || nchrs > 100000)
+               nchrs = 100000;
+
+       cv = getcvec(v, nchrs, 1);
        NOERRN();
+       addrange(cv, a, b);
 
        for (c = a; c <= b; c++)
        {
-               addchr(cv, c);
-               lc = pg_wc_tolower((chr) c);
-               if (c != lc)
-                       addchr(cv, lc);
-               uc = pg_wc_toupper((chr) c);
-               if (c != uc)
-                       addchr(cv, uc);
+               cc = pg_wc_tolower((chr) c);
+               if (cc !=c && (before(cc, a) || before(b, cc)))
+               {
+                       if (cv->nchrs >= cv->chrspace)
+                       {
+                               ERR(REG_ETOOBIG);
+                               return NULL;
+                       }
+                       addchr(cv, cc);
+               }
+               cc = pg_wc_toupper((chr) c);
+               if (cc != c && (before(cc, a) || before(b, cc)))
+               {
+                       if (cv->nchrs >= cv->chrspace)
+                       {
+                               ERR(REG_ETOOBIG);
+                               return NULL;
+                       }
+                       addchr(cv, cc);
+               }
        }
 
        return cv;

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regcomp.c
----------------------------------------------------------------------
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 0c08237..35c4d99 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1509,6 +1509,7 @@ dovec(struct vars * v,
        {
                ch = *p;
                newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
+               NOERR();
        }
 
        /* and the ranges */
@@ -1518,6 +1519,7 @@ dovec(struct vars * v,
                to = *(p + 1);
                if (from <= to)
                        subrange(v, from, to, lp, rp);
+               NOERR();
        }
 }
 
@@ -1844,6 +1846,7 @@ rfree(regex_t *re)
        FREE(g);
 }
 
+
 #ifdef REG_DEBUG
 
 /*

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/include/regex/regcustom.h
----------------------------------------------------------------------
diff --git a/src/include/regex/regcustom.h b/src/include/regex/regcustom.h
index 269f926..cd43eca 100644
--- a/src/include/regex/regcustom.h
+++ b/src/include/regex/regcustom.h
@@ -54,7 +54,8 @@ typedef int celt;                             /* type to hold chr, or NOCELT */
 #define DIGITVAL(c) ((c)-'0')  /* turn chr digit into its value */
 #define CHRBITS 32                             /* bits in a chr; must not use sizeof */
 #define CHR_MIN 0x00000000             /* smallest and largest chr; the value */
-#define CHR_MAX 0xfffffffe             /* CHR_MAX-CHR_MIN+1 should fit in uchr */
+#define CHR_MAX 0x7ffffffe             /* CHR_MAX-CHR_MIN+1 must fit in an int, and
+                                 * CHR_MAX+1 must fit in both chr and celt */
 
 /* functions operating on chr */
 #define iscalnum(x) pg_wc_isalnum(x)

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/include/regex/regex.h
----------------------------------------------------------------------
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index abd90bc..154438a 100644
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -151,7 +151,7 @@ typedef struct
 #define REG_INVARG     16                      /* invalid argument to regex function */
 #define REG_MIXED      17                      /* character widths of regex and string differ */
 #define REG_BADOPT     18                      /* invalid embedded option */
-#define REG_ETOOBIG 19                 /* nfa has too many states */
+#define REG_ETOOBIG 19                 /* regular expression is too complex */
 /* two specials for debugging and testing */
 #define REG_ATOI       101                     /* convert error-code name to number */
 #define REG_ITOA       102                     /* convert error-code number to name */

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/full_tests.txt
----------------------------------------------------------------------
diff --git a/src/test/feature/full_tests.txt b/src/test/feature/full_tests.txt
index 254f866..beda4e8 100644
--- a/src/test/feature/full_tests.txt
+++ b/src/test/feature/full_tests.txt
@@ -2,5 +2,5 @@
 #SERIAL=* are the serial tests to run, optional but should not be empty
 #you can have several PARALLEL or SRRIAL
 
-PARALLEL=TestErrorTable.*:TestPreparedStatement.*:TestUDF.*:TestAOSnappy.*:TestAlterOwner.*:TestAlterTable.*:TestCreateTable.*:TestGuc.*:TestType.*:TestDatabase.*:TestParquet.*:TestPartition.*:TestSubplan.*:TestAggregate.*:TestCreateTypeComposite.*:TestGpDistRandom.*:TestInformationSchema.*:TestQueryInsert.*:TestQueryNestedCaseNull.*:TestQueryPolymorphism.*:TestQueryPortal.*:TestQueryPrepare.*:TestQuerySequence.*:TestCommonLib.*:TestToast.*:TestTransaction.*:TestCommand.*:TestCopy.*:TestParser.*:TestHawqRegister.*
+PARALLEL=TestErrorTable.*:TestPreparedStatement.*:TestUDF.*:TestAOSnappy.*:TestAlterOwner.*:TestAlterTable.*:TestCreateTable.*:TestGuc.*:TestType.*:TestDatabase.*:TestParquet.*:TestPartition.*:TestSubplan.*:TestAggregate.*:TestCreateTypeComposite.*:TestGpDistRandom.*:TestInformationSchema.*:TestQueryInsert.*:TestQueryNestedCaseNull.*:TestQueryPolymorphism.*:TestQueryPortal.*:TestQueryPrepare.*:TestQuerySequence.*:TestCommonLib.*:TestToast.*:TestTransaction.*:TestCommand.*:TestCopy.*:TestParser.*:TestHawqRegister.*:TestRegex.*
 
SERIAL=TestExternalOid.TestExternalOidAll:TestExternalTable.TestExternalTableAll:TestTemp.BasicTest:TestRowTypes.*

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/ans/regex_basic.ans
----------------------------------------------------------------------
diff --git a/src/test/feature/regex/ans/regex_basic.ans b/src/test/feature/regex/ans/regex_basic.ans
new file mode 100644
index 0000000..52a9077
--- /dev/null
+++ b/src/test/feature/regex/ans/regex_basic.ans
@@ -0,0 +1,144 @@
+-- start_ignore
+SET SEARCH_PATH=TestRegex_TestRegexBasic;
+SET
+-- end_ignore
+--
+-- Regular expression tests
+--
+-- Don't want to have to double backslashes in regexes
+set standard_conforming_strings = on;
+SET
+-- Test simple quantified backrefs
+select 'bbbbb' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'ccc' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'xxx' ~ '^([bc])\1*$' as f;
+ f 
+---
+ f
+(1 row)
+
+select 'b' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+ regexp_matches 
+----------------
+ {ab}
+(1 row)
+
+select regexp_matches('a', 'a(?=b)b*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+ regexp_matches 
+----------------
+ {abc}
+(1 row)
+
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('ab', 'a(?!b)b*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('a', 'a(?!b)b*');
+ regexp_matches 
+----------------
+ {a}
+(1 row)
+
+select regexp_matches('b', '(?=b)b');
+ regexp_matches 
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('a', '(?=b)b');
+ regexp_matches 
+----------------
+(0 rows)
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'xy' ~ 'x(?=[xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'xz' ~ 'x(?![xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'xy' ~ 'x(?![xy])';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'x'  ~ 'x(?![xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'zyy' ~ '(?<![xy])yy+';
+psql:/tmp/TestRegex_TestRegexBasic.sql:33: ERROR:  invalid regular expression: quantifier operand invalid
+-- Test for infinite loop in cfindloop with zero-length possible match
+-- but no actual match (can only happen in the presence of backrefs)
+select 'a' ~ '$()|^\1';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'a' ~ '.. ()|\1';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'a' ~ '()*\1';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'a' ~ '()+\1';
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Error conditions
+select 'xyz' ~ 'x(\w)(?=\1)';  -- no backrefs in LACONs
+psql:/tmp/TestRegex_TestRegexBasic.sql:43: ERROR:  invalid regular expression: invalid backreference number
+select 'a' ~ '\x7fffffff';  -- invalid chr code
+psql:/tmp/TestRegex_TestRegexBasic.sql:44: ERROR:  invalid regular expression: invalid escape \ sequence

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/sql/regex_basic.sql
----------------------------------------------------------------------
diff --git a/src/test/feature/regex/sql/regex_basic.sql b/src/test/feature/regex/sql/regex_basic.sql
new file mode 100644
index 0000000..7dfad9c
--- /dev/null
+++ b/src/test/feature/regex/sql/regex_basic.sql
@@ -0,0 +1,41 @@
+--
+-- Regular expression tests
+--
+
+-- Don't want to have to double backslashes in regexes
+set standard_conforming_strings = on;
+
+-- Test simple quantified backrefs
+select 'bbbbb' ~ '^([bc])\1*$' as t;
+select 'ccc' ~ '^([bc])\1*$' as t;
+select 'xxx' ~ '^([bc])\1*$' as f;
+select 'b' ~ '^([bc])\1*$' as t;
+
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+select regexp_matches('a', 'a(?=b)b*');
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?!b)b*');
+select regexp_matches('a', 'a(?!b)b*');
+select regexp_matches('b', '(?=b)b');
+select regexp_matches('a', '(?=b)b');
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+select 'xy' ~ 'x(?=[xy])';
+select 'xz' ~ 'x(?![xy])';
+select 'xy' ~ 'x(?![xy])';
+select 'x'  ~ 'x(?![xy])';
+select 'zyy' ~ '(?<![xy])yy+';
+
+-- Test for infinite loop in cfindloop with zero-length possible match
+-- but no actual match (can only happen in the presence of backrefs)
+select 'a' ~ '$()|^\1';
+select 'a' ~ '.. ()|\1';
+select 'a' ~ '()*\1';
+select 'a' ~ '()+\1';
+
+-- Error conditions
+select 'xyz' ~ 'x(\w)(?=\1)';  -- no backrefs in LACONs
+select 'a' ~ '\x7fffffff';  -- invalid chr code

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/test_regex.cpp
----------------------------------------------------------------------
diff --git a/src/test/feature/regex/test_regex.cpp b/src/test/feature/regex/test_regex.cpp
new file mode 100644
index 0000000..5b08357
--- /dev/null
+++ b/src/test/feature/regex/test_regex.cpp
@@ -0,0 +1,27 @@
+#include <pwd.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <iostream>
+#include <string>
+
+#include "lib/sql_util.h"
+
+#include "gtest/gtest.h"
+
+class TestRegex : public ::testing::Test {
+ public:
+  TestRegex() {}
+  ~TestRegex() {}
+};
+
+
+TEST_F(TestRegex, TestRegexBasic) {
+  hawq::test::SQLUtility util;
+  util.execSQLFile("regex/sql/regex_basic.sql",
+                                  "regex/ans/regex_basic.ans");
+}
+

Reply via email to