Repository: incubator-hawq Updated Branches: refs/heads/master 35ed3ad38 -> 23c45c746
HAWQ-1317. Port "Fix some regex issues with out-of-range characters and large char ranges" from pg Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/23c45c74 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/23c45c74 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/23c45c74 Branch: refs/heads/master Commit: 23c45c746e6223e039cff29ca61d8ec5a2bb63bd Parents: 35ed3ad Author: amyrazz44 <[email protected]> Authored: Wed Feb 8 10:36:49 2017 +0800 Committer: ivan <[email protected]> Committed: Thu Feb 9 14:34:14 2017 +0800 ---------------------------------------------------------------------- src/backend/regex/regc_lex.c | 8 +- src/backend/regex/regc_locale.c | 55 ++++++--- src/backend/regex/regcomp.c | 3 + src/include/regex/regcustom.h | 3 +- src/include/regex/regex.h | 2 +- src/test/feature/full_tests.txt | 2 +- src/test/feature/regex/ans/regex_basic.ans | 144 ++++++++++++++++++++++++ src/test/feature/regex/sql/regex_basic.sql | 41 +++++++ src/test/feature/regex/test_regex.cpp | 27 +++++ 9 files changed, 262 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regc_lex.c ---------------------------------------------------------------------- diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 782c008..4f6b8ea 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -792,13 +792,13 @@ lexescape(struct vars * v) break; case CHR('u'): c = lexdigits(v, 16, 4, 4); - if (ISERR()) + if (ISERR() || c < CHR_MIN || c > CHR_MAX) FAILW(REG_EESCAPE); RETV(PLAIN, c); break; case CHR('U'): c = lexdigits(v, 16, 8, 8); - if (ISERR()) + if (ISERR() || c < CHR_MIN || c > CHR_MAX) FAILW(REG_EESCAPE); RETV(PLAIN, c); break; @@ -816,7 +816,7 @@ lexescape(struct vars * v) case CHR('x'): NOTE(REG_UUNPORT); c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ - if (ISERR()) + if (ISERR() || c < CHR_MIN || c > CHR_MAX) FAILW(REG_EESCAPE); RETV(PLAIN, c); break; @@ -872,6 +872,8 @@ lexescape(struct vars * v) /* * lexdigits - slurp up digits and return chr value + * This does not account for overflow; callers should range-check the result + * if maxlen is large enough to make that possible. */ static chr /* chr value; errors signalled via ERR */ lexdigits(struct vars * v, http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regc_locale.c ---------------------------------------------------------------------- diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 339380e..6ca59b2 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -471,8 +471,7 @@ range(struct vars * v, /* context */ int nchrs; struct cvec *cv; celt c, - lc, - uc; + cc; if (a != b && !before(a, b)) { @@ -489,25 +488,47 @@ range(struct vars * v, /* context */ } /* - * When case-independent, it's hard to decide when cvec ranges are usable, - * so for now at least, we won't try. We allocate enough space for two - * case variants plus a little extra for the two title case variants. - */ - - nchrs = (b - a + 1) * 2 + 4; - - cv = getcvec(v, nchrs, 0); + * When case-independent, it's hard to decide when cvec ranges are usable, + * so for now at least, we won't try. We use a range for the originally + * specified chrs and then add on any case-equivalents that are outside + * that range as individual chrs. + * + * To ensure sane behavior if someone specifies a very large range, limit + * the allocation size to 100000 chrs (arbitrary) and check for overrun + * inside the loop below. + */ + + nchrs = b - a + 1; + + if (nchrs <= 0 || nchrs > 100000) + nchrs = 100000; + + cv = getcvec(v, nchrs, 1); NOERRN(); + addrange(cv, a, b); for (c = a; c <= b; c++) { - addchr(cv, c); - lc = pg_wc_tolower((chr) c); - if (c != lc) - addchr(cv, lc); - uc = pg_wc_toupper((chr) c); - if (c != uc) - addchr(cv, uc); + cc = pg_wc_tolower((chr) c); + if (cc !=c && (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } + cc = pg_wc_toupper((chr) c); + if (cc != c && (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } } return cv; http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regcomp.c ---------------------------------------------------------------------- diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 0c08237..35c4d99 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1509,6 +1509,7 @@ dovec(struct vars * v, { ch = *p; newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp); + NOERR(); } /* and the ranges */ @@ -1518,6 +1519,7 @@ dovec(struct vars * v, to = *(p + 1); if (from <= to) subrange(v, from, to, lp, rp); + NOERR(); } } @@ -1844,6 +1846,7 @@ rfree(regex_t *re) FREE(g); } + #ifdef REG_DEBUG /* http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/include/regex/regcustom.h ---------------------------------------------------------------------- diff --git a/src/include/regex/regcustom.h b/src/include/regex/regcustom.h index 269f926..cd43eca 100644 --- a/src/include/regex/regcustom.h +++ b/src/include/regex/regcustom.h @@ -54,7 +54,8 @@ typedef int celt; /* type to hold chr, or NOCELT */ #define DIGITVAL(c) ((c)-'0') /* turn chr digit into its value */ #define CHRBITS 32 /* bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* smallest and largest chr; the value */ -#define CHR_MAX 0xfffffffe /* CHR_MAX-CHR_MIN+1 should fit in uchr */ +#define CHR_MAX 0x7ffffffe /* CHR_MAX-CHR_MIN+1 must fit in an int, and + * CHR_MAX+1 must fit in both chr and celt */ /* functions operating on chr */ #define iscalnum(x) pg_wc_isalnum(x) http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/include/regex/regex.h ---------------------------------------------------------------------- diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index abd90bc..154438a 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -151,7 +151,7 @@ typedef struct #define REG_INVARG 16 /* invalid argument to regex function */ #define REG_MIXED 17 /* character widths of regex and string differ */ #define REG_BADOPT 18 /* invalid embedded option */ -#define REG_ETOOBIG 19 /* nfa has too many states */ +#define REG_ETOOBIG 19 /* regular expression is too complex */ /* two specials for debugging and testing */ #define REG_ATOI 101 /* convert error-code name to number */ #define REG_ITOA 102 /* convert error-code number to name */ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/full_tests.txt ---------------------------------------------------------------------- diff --git a/src/test/feature/full_tests.txt b/src/test/feature/full_tests.txt index 254f866..beda4e8 100644 --- a/src/test/feature/full_tests.txt +++ b/src/test/feature/full_tests.txt @@ -2,5 +2,5 @@ #SERIAL=* are the serial tests to run, optional but should not be empty #you can have several PARALLEL or SRRIAL -PARALLEL=TestErrorTable.*:TestPreparedStatement.*:TestUDF.*:TestAOSnappy.*:TestAlterOwner.*:TestAlterTable.*:TestCreateTable.*:TestGuc.*:TestType.*:TestDatabase.*:TestParquet.*:TestPartition.*:TestSubplan.*:TestAggregate.*:TestCreateTypeComposite.*:TestGpDistRandom.*:TestInformationSchema.*:TestQueryInsert.*:TestQueryNestedCaseNull.*:TestQueryPolymorphism.*:TestQueryPortal.*:TestQueryPrepare.*:TestQuerySequence.*:TestCommonLib.*:TestToast.*:TestTransaction.*:TestCommand.*:TestCopy.*:TestParser.*:TestHawqRegister.* +PARALLEL=TestErrorTable.*:TestPreparedStatement.*:TestUDF.*:TestAOSnappy.*:TestAlterOwner.*:TestAlterTable.*:TestCreateTable.*:TestGuc.*:TestType.*:TestDatabase.*:TestParquet.*:TestPartition.*:TestSubplan.*:TestAggregate.*:TestCreateTypeComposite.*:TestGpDistRandom.*:TestInformationSchema.*:TestQueryInsert.*:TestQueryNestedCaseNull.*:TestQueryPolymorphism.*:TestQueryPortal.*:TestQueryPrepare.*:TestQuerySequence.*:TestCommonLib.*:TestToast.*:TestTransaction.*:TestCommand.*:TestCopy.*:TestParser.*:TestHawqRegister.*:TestRegex.* SERIAL=TestExternalOid.TestExternalOidAll:TestExternalTable.TestExternalTableAll:TestTemp.BasicTest:TestRowTypes.* http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/ans/regex_basic.ans ---------------------------------------------------------------------- diff --git a/src/test/feature/regex/ans/regex_basic.ans b/src/test/feature/regex/ans/regex_basic.ans new file mode 100644 index 0000000..52a9077 --- /dev/null +++ b/src/test/feature/regex/ans/regex_basic.ans @@ -0,0 +1,144 @@ +-- start_ignore +SET SEARCH_PATH=TestRegex_TestRegexBasic; +SET +-- end_ignore +-- +-- Regular expression tests +-- +-- Don't want to have to double backslashes in regexes +set standard_conforming_strings = on; +SET +-- Test simple quantified backrefs +select 'bbbbb' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + +select 'ccc' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + +select 'xxx' ~ '^([bc])\1*$' as f; + f +--- + f +(1 row) + +select 'b' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + +-- Test lookahead constraints +select regexp_matches('ab', 'a(?=b)b*'); + regexp_matches +---------------- + {ab} +(1 row) + +select regexp_matches('a', 'a(?=b)b*'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('abc', 'a(?=b)b*(?=c)c*'); + regexp_matches +---------------- + {abc} +(1 row) + +select regexp_matches('ab', 'a(?=b)b*(?=c)c*'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('ab', 'a(?!b)b*'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('a', 'a(?!b)b*'); + regexp_matches +---------------- + {a} +(1 row) + +select regexp_matches('b', '(?=b)b'); + regexp_matches +---------------- + {b} +(1 row) + +select regexp_matches('a', '(?=b)b'); + regexp_matches +---------------- +(0 rows) + +-- Test optimization of single-chr-or-bracket-expression lookaround constraints +select 'xz' ~ 'x(?=[xy])'; + ?column? +---------- + f +(1 row) + +select 'xy' ~ 'x(?=[xy])'; + ?column? +---------- + t +(1 row) + +select 'xz' ~ 'x(?![xy])'; + ?column? +---------- + t +(1 row) + +select 'xy' ~ 'x(?![xy])'; + ?column? +---------- + f +(1 row) + +select 'x' ~ 'x(?![xy])'; + ?column? +---------- + t +(1 row) + +select 'zyy' ~ '(?<![xy])yy+'; +psql:/tmp/TestRegex_TestRegexBasic.sql:33: ERROR: invalid regular expression: quantifier operand invalid +-- Test for infinite loop in cfindloop with zero-length possible match +-- but no actual match (can only happen in the presence of backrefs) +select 'a' ~ '$()|^\1'; + ?column? +---------- + f +(1 row) + +select 'a' ~ '.. ()|\1'; + ?column? +---------- + f +(1 row) + +select 'a' ~ '()*\1'; + ?column? +---------- + t +(1 row) + +select 'a' ~ '()+\1'; + ?column? +---------- + t +(1 row) + +-- Error conditions +select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs +psql:/tmp/TestRegex_TestRegexBasic.sql:43: ERROR: invalid regular expression: invalid backreference number +select 'a' ~ '\x7fffffff'; -- invalid chr code +psql:/tmp/TestRegex_TestRegexBasic.sql:44: ERROR: invalid regular expression: invalid escape \ sequence http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/sql/regex_basic.sql ---------------------------------------------------------------------- diff --git a/src/test/feature/regex/sql/regex_basic.sql b/src/test/feature/regex/sql/regex_basic.sql new file mode 100644 index 0000000..7dfad9c --- /dev/null +++ b/src/test/feature/regex/sql/regex_basic.sql @@ -0,0 +1,41 @@ +-- +-- Regular expression tests +-- + +-- Don't want to have to double backslashes in regexes +set standard_conforming_strings = on; + +-- Test simple quantified backrefs +select 'bbbbb' ~ '^([bc])\1*$' as t; +select 'ccc' ~ '^([bc])\1*$' as t; +select 'xxx' ~ '^([bc])\1*$' as f; +select 'b' ~ '^([bc])\1*$' as t; + +-- Test lookahead constraints +select regexp_matches('ab', 'a(?=b)b*'); +select regexp_matches('a', 'a(?=b)b*'); +select regexp_matches('abc', 'a(?=b)b*(?=c)c*'); +select regexp_matches('ab', 'a(?=b)b*(?=c)c*'); +select regexp_matches('ab', 'a(?!b)b*'); +select regexp_matches('a', 'a(?!b)b*'); +select regexp_matches('b', '(?=b)b'); +select regexp_matches('a', '(?=b)b'); + +-- Test optimization of single-chr-or-bracket-expression lookaround constraints +select 'xz' ~ 'x(?=[xy])'; +select 'xy' ~ 'x(?=[xy])'; +select 'xz' ~ 'x(?![xy])'; +select 'xy' ~ 'x(?![xy])'; +select 'x' ~ 'x(?![xy])'; +select 'zyy' ~ '(?<![xy])yy+'; + +-- Test for infinite loop in cfindloop with zero-length possible match +-- but no actual match (can only happen in the presence of backrefs) +select 'a' ~ '$()|^\1'; +select 'a' ~ '.. ()|\1'; +select 'a' ~ '()*\1'; +select 'a' ~ '()+\1'; + +-- Error conditions +select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs +select 'a' ~ '\x7fffffff'; -- invalid chr code http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/test_regex.cpp ---------------------------------------------------------------------- diff --git a/src/test/feature/regex/test_regex.cpp b/src/test/feature/regex/test_regex.cpp new file mode 100644 index 0000000..5b08357 --- /dev/null +++ b/src/test/feature/regex/test_regex.cpp @@ -0,0 +1,27 @@ +#include <pwd.h> +#include <sys/types.h> +#include <unistd.h> +#include <vector> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <iostream> +#include <string> + +#include "lib/sql_util.h" + +#include "gtest/gtest.h" + +class TestRegex : public ::testing::Test { + public: + TestRegex() {} + ~TestRegex() {} +}; + + +TEST_F(TestRegex, TestRegexBasic) { + hawq::test::SQLUtility util; + util.execSQLFile("regex/sql/regex_basic.sql", + "regex/ans/regex_basic.ans"); +} +
