On Fri, 23 Mar 2018 11:45:33 +0300
Alexander Korotkov <[email protected]> wrote:
> On Thu, Mar 22, 2018 at 7:09 PM, Teodor Sigaev <[email protected]>
> wrote:
>
> > Patch looks resonable, but I see some place to improvement:
> > spg_text_leaf_consistent() only needs to check with
> > text_startswith() if reconstucted value came to leaf consistent is
> > shorter than given prefix. For example, if level >= length of
> > prefix then we guarantee that fully reconstracted is matched too.
> > But do not miss that you may need to return value for index only
> > scan, consult returnData field
> >
> > In attachment rebased and minorly edited version of your patch.
>
>
> I took a look at this patch. In addition to Teodor's comments I can
> note following.
>
> * This line looks strange for me.
>
> - if (strategy > 10)
> + if (strategy > 10 && strategy !=
> RTPrefixStrategyNumber)
>
> It's not because we added strategy != RTPrefixStrategyNumber condition
> there.
> It's because we already used magic number here and now have a mix of
> magic number and macro constant in one line. Once we anyway touch
> this place, could we get rid of magic numbers here?
>
> * I'm a little concern about operator name. We're going to choose @^
> operator for
> prefix search without any preliminary discussion. However,
> personally I don't
> have better ideas :)
Teodor, Alexander, thanks for review. In new version I have added the
optimization in spgist using level variable and also got rid of magic
numbers.
About the operator it's actually ^@ (not @^ :)), I thought about it and
don't really have any idea what operator can be used instead.
Attached version 5 of the patch.
--
---
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 9d1772f349..a1b4724a88 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -2274,6 +2274,21 @@
<entry><literal>ph</literal></entry>
</row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>text_startswith</primary>
+ </indexterm>
+ <literal><function>text_startswith(<parameter>string</parameter>, <parameter>prefix</parameter>)</function></literal>
+ </entry>
+ <entry><type>bool</type></entry>
+ <entry>
+ Returns true if <parameter>string</parameter> starts with <parameter>prefix</parameter>.
+ </entry>
+ <entry><literal>text_startswith('alphabet', 'alph')</literal></entry>
+ <entry><literal>t</literal></entry>
+ </row>
+
<row>
<entry>
<indexterm>
@@ -4033,6 +4048,12 @@ cast(-44 as bit(12)) <lineannotation>111111010100</lineannotation>
ILIKE</function>, respectively. All of these operators are
<productname>PostgreSQL</productname>-specific.
</para>
+
+ <para>
+ There is also the prefix operator <literal>^@</literal> and corresponding
+ <function>text_startswith</function> function which covers cases when only
+ searching by beginning of the string is needed.
+ </para>
</sect2>
diff --git a/doc/src/sgml/spgist.sgml b/doc/src/sgml/spgist.sgml
index e47f70be89..06b7519052 100644
--- a/doc/src/sgml/spgist.sgml
+++ b/doc/src/sgml/spgist.sgml
@@ -161,6 +161,7 @@
<literal>~<~</literal>
<literal>~>=~</literal>
<literal>~>~</literal>
+ <literal>^@</literal>
</entry>
</row>
<row>
diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c
index f156b2166e..c5e27dd7aa 100644
--- a/src/backend/access/spgist/spgtextproc.c
+++ b/src/backend/access/spgist/spgtextproc.c
@@ -67,6 +67,20 @@
*/
#define SPGIST_MAX_PREFIX_LENGTH Max((int) (BLCKSZ - 258 * 16 - 100), 32)
+/*
+ * Strategy for collation aware operator on text is equal to btree strategy
+ * plus value of 10.
+ *
+ * Current collation aware strategies and their corresponding btree strategies:
+ * 11 BTLessStrategyNumber
+ * 12 BTLessEqualStrategyNumber
+ * 14 BTGreaterEqualStrategyNumber
+ * 15 BTGreaterStrategyNumber
+ */
+#define SPG_STRATEGY_ADDITION (10)
+#define SPG_IS_COLLATION_AWARE_STRATEGY(s) ((s) > SPG_STRATEGY_ADDITION \
+ && (s) != RTPrefixStrategyNumber)
+
/* Struct for sorting values in picksplit */
typedef struct spgNodePtr
{
@@ -496,10 +510,10 @@ spg_text_inner_consistent(PG_FUNCTION_ARGS)
* well end with a partial multibyte character, so that applying
* any encoding-sensitive test to it would be risky anyhow.)
*/
- if (strategy > 10)
+ if (SPG_IS_COLLATION_AWARE_STRATEGY(strategy))
{
if (collate_is_c)
- strategy -= 10;
+ strategy -= SPG_STRATEGY_ADDITION;
else
continue;
}
@@ -526,6 +540,10 @@ spg_text_inner_consistent(PG_FUNCTION_ARGS)
if (r < 0)
res = false;
break;
+ case RTPrefixStrategyNumber:
+ if (r != 0)
+ res = false;
+ break;
default:
elog(ERROR, "unrecognized strategy number: %d",
in->scankeys[j].sk_strategy);
@@ -605,10 +623,31 @@ spg_text_leaf_consistent(PG_FUNCTION_ARGS)
int queryLen = VARSIZE_ANY_EXHDR(query);
int r;
- if (strategy > 10)
+ if (strategy == RTPrefixStrategyNumber)
+ {
+ if (!in->returnData && level >= queryLen)
+ {
+ /*
+ * If we got to the leaf when level is equal or longer than
+ * query then it is a prefix. We skip cases when returnData is
+ * true, it would mean then leaf could be visited anyway.
+ */
+ continue;
+ }
+
+ res = DatumGetBool(DirectFunctionCall2(text_startswith,
+ out->leafValue, PointerGetDatum(query)));
+
+ if (!res) /* no need to consider remaining conditions */
+ break;
+
+ continue;
+ }
+
+ if (SPG_IS_COLLATION_AWARE_STRATEGY(strategy))
{
/* Collation-aware comparison */
- strategy -= 10;
+ strategy -= SPG_STRATEGY_ADDITION;
/* If asserts enabled, verify encoding of reconstructed string */
Assert(pg_verifymbstr(fullValue, fullLen, false));
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index bf240aa9c5..2bde1d48e9 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -1204,7 +1204,6 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
Oid vartype;
Oid opfamily;
Pattern_Prefix_Status pstatus;
- Const *patt;
Const *prefix = NULL;
Selectivity rest_selec = 0;
double nullfrac = 0.0;
@@ -1307,18 +1306,28 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
nullfrac = stats->stanullfrac;
}
- /*
- * Pull out any fixed prefix implied by the pattern, and estimate the
- * fractional selectivity of the remainder of the pattern. Unlike many of
- * the other functions in this file, we use the pattern operator's actual
- * collation for this step. This is not because we expect the collation
- * to make a big difference in the selectivity estimate (it seldom would),
- * but because we want to be sure we cache compiled regexps under the
- * right cache key, so that they can be re-used at runtime.
- */
- patt = (Const *) other;
- pstatus = pattern_fixed_prefix(patt, ptype, collation,
- &prefix, &rest_selec);
+ if (ptype == Pattern_Type_Prefix)
+ {
+ Assert(consttype == TEXTOID);
+ pstatus = Pattern_Prefix_Partial;
+ rest_selec = 1.0; /* all */
+ prefix = (Const *) other;
+ }
+ else
+ {
+ /*
+ * Pull out any fixed prefix implied by the pattern, and estimate the
+ * fractional selectivity of the remainder of the pattern. Unlike
+ * many of the other functions in this file, we use the pattern
+ * operator's actual collation for this step. This is not because
+ * we expect the collation to make a big difference in the selectivity
+ * estimate (it seldom would), but because we want to be sure we cache
+ * compiled regexps under the right cache key, so that they can be
+ * re-used at runtime.
+ */
+ pstatus = pattern_fixed_prefix((Const *) other, ptype, collation,
+ &prefix, &rest_selec);
+ }
/*
* If necessary, coerce the prefix constant to the right type.
@@ -1334,7 +1343,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
break;
case BYTEAOID:
prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
- prefix->constvalue));
+ prefix->constvalue));
break;
default:
elog(ERROR, "unrecognized consttype: %u",
@@ -1449,7 +1458,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(result);
- if (prefix)
+ if (prefix && prefix->constvalue != constval)
{
pfree(DatumGetPointer(prefix->constvalue));
pfree(prefix);
@@ -1488,6 +1497,16 @@ likesel(PG_FUNCTION_ARGS)
}
/*
+ * prefixsel - selectivity of prefix operator
+ */
+Datum
+prefixsel(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
+}
+
+/*
+ *
* iclikesel - Selectivity of ILIKE pattern match.
*/
Datum
@@ -2906,6 +2925,15 @@ likejoinsel(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
}
+/*
+ * prefixjoinsel - Join selectivity of prefix operator
+ */
+Datum
+prefixjoinsel(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
+}
+
/*
* iclikejoinsel - Join selectivity of ILIKE pattern match.
*/
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 4346410d5a..cb721e9a57 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1761,6 +1761,34 @@ text_ge(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(result);
}
+Datum
+text_startswith(PG_FUNCTION_ARGS)
+{
+ Datum arg1 = PG_GETARG_DATUM(0);
+ Datum arg2 = PG_GETARG_DATUM(1);
+ bool result;
+ Size len1,
+ len2;
+
+ len1 = toast_raw_datum_size(arg1);
+ len2 = toast_raw_datum_size(arg2);
+ if (len2 > len1)
+ result = false;
+ else
+ {
+ text *targ1 = DatumGetTextPP(arg1);
+ text *targ2 = DatumGetTextPP(arg2);
+
+ result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
+ len2 - VARHDRSZ) == 0);
+
+ PG_FREE_IF_COPY(targ1, 0);
+ PG_FREE_IF_COPY(targ2, 1);
+ }
+
+ PG_RETURN_BOOL(result);
+}
+
Datum
bttextcmp(PG_FUNCTION_ARGS)
{
diff --git a/src/include/access/stratnum.h b/src/include/access/stratnum.h
index bddfac4c10..0db11a1117 100644
--- a/src/include/access/stratnum.h
+++ b/src/include/access/stratnum.h
@@ -68,8 +68,9 @@ typedef uint16 StrategyNumber;
#define RTSubEqualStrategyNumber 25 /* for inet <<= */
#define RTSuperStrategyNumber 26 /* for inet << */
#define RTSuperEqualStrategyNumber 27 /* for inet >>= */
+#define RTPrefixStrategyNumber 28 /* for text ^@ */
-#define RTMaxStrategyNumber 27
+#define RTMaxStrategyNumber 28
#endif /* STRATNUM_H */
diff --git a/src/include/catalog/pg_amop.h b/src/include/catalog/pg_amop.h
index 03af581df4..8eb74a21ca 100644
--- a/src/include/catalog/pg_amop.h
+++ b/src/include/catalog/pg_amop.h
@@ -799,6 +799,7 @@ DATA(insert ( 4017 25 25 11 s 664 4000 0 ));
DATA(insert ( 4017 25 25 12 s 665 4000 0 ));
DATA(insert ( 4017 25 25 14 s 667 4000 0 ));
DATA(insert ( 4017 25 25 15 s 666 4000 0 ));
+DATA(insert ( 4017 25 25 28 s 3449 4000 0 ));
/*
* btree jsonb_ops
diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h
index e74f963eb5..4f7f4812fe 100644
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -134,6 +134,8 @@ DESCR("less than");
DATA(insert OID = 98 ( "=" PGNSP PGUID b t t 25 25 16 98 531 texteq eqsel eqjoinsel ));
DESCR("equal");
#define TextEqualOperator 98
+DATA(insert OID = 3449 ( "^@" PGNSP PGUID b f f 25 25 16 0 0 text_startswith prefixsel prefixjoinsel ));
+DESCR("starts with");
DATA(insert OID = 349 ( "||" PGNSP PGUID b f f 2277 2283 2277 0 0 array_append - - ));
DESCR("append element onto end of array");
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index bfc90098f8..d672b33cc9 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -209,6 +209,7 @@ DATA(insert OID = 64 ( int2lt PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16
DATA(insert OID = 65 ( int4eq PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16 "23 23" _null_ _null_ _null_ _null_ _null_ int4eq _null_ _null_ _null_ ));
DATA(insert OID = 66 ( int4lt PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16 "23 23" _null_ _null_ _null_ _null_ _null_ int4lt _null_ _null_ _null_ ));
DATA(insert OID = 67 ( texteq PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16 "25 25" _null_ _null_ _null_ _null_ _null_ texteq _null_ _null_ _null_ ));
+DATA(insert OID = 3450 ( text_startswith PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16 "25 25" _null_ _null_ _null_ _null_ _null_ text_startswith _null_ _null_ _null_ ));
DATA(insert OID = 68 ( xideq PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16 "28 28" _null_ _null_ _null_ _null_ _null_ xideq _null_ _null_ _null_ ));
DATA(insert OID = 3308 ( xidneq PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16 "28 28" _null_ _null_ _null_ _null_ _null_ xidneq _null_ _null_ _null_ ));
DATA(insert OID = 69 ( cideq PGNSP PGUID 12 1 0 0 0 f f t t f i s 2 0 16 "29 29" _null_ _null_ _null_ _null_ _null_ cideq _null_ _null_ _null_ ));
@@ -2568,6 +2569,10 @@ DATA(insert OID = 1828 ( nlikejoinsel PGNSP PGUID 12 1 0 0 0 f f f t f s s 5 0
DESCR("join selectivity of NOT LIKE");
DATA(insert OID = 1829 ( icregexnejoinsel PGNSP PGUID 12 1 0 0 0 f f f t f s s 5 0 701 "2281 26 2281 21 2281" _null_ _null_ _null_ _null_ _null_ icregexnejoinsel _null_ _null_ _null_ ));
DESCR("join selectivity of case-insensitive regex non-match");
+DATA(insert OID = 3437 ( prefixsel PGNSP PGUID 12 1 0 0 0 f f f t f s s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ _null_ prefixsel _null_ _null_ _null_ ));
+DESCR("restriction selectivity of exact prefix");
+DATA(insert OID = 3438 ( prefixjoinsel PGNSP PGUID 12 1 0 0 0 f f f t f s s 5 0 701 "2281 26 2281 21 2281" _null_ _null_ _null_ _null_ _null_ prefixjoinsel _null_ _null_ _null_ ));
+DESCR("join selectivity of exact prefix");
/* Aggregate-related functions */
DATA(insert OID = 1830 ( float8_avg PGNSP PGUID 12 1 0 0 0 f f f t f i s 1 0 701 "1022" _null_ _null_ _null_ _null_ _null_ float8_avg _null_ _null_ _null_ ));
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 299c9f846a..95e44280c4 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -87,8 +87,11 @@ typedef struct VariableStatData
typedef enum
{
- Pattern_Type_Like, Pattern_Type_Like_IC,
- Pattern_Type_Regex, Pattern_Type_Regex_IC
+ Pattern_Type_Like,
+ Pattern_Type_Like_IC,
+ Pattern_Type_Regex,
+ Pattern_Type_Regex_IC,
+ Pattern_Type_Prefix
} Pattern_Type;
typedef enum
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index 057faff2e5..09757c5a74 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -372,6 +372,12 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth
48
(1 row)
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+ count
+-------
+ 2
+(1 row)
+
SELECT * FROM gpolygon_tbl ORDER BY f1 <-> '(0,0)'::point LIMIT 10;
f1
-------------------------------------------------
@@ -1182,6 +1188,21 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth
48
(1 row)
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+ QUERY PLAN
+------------------------------------------------------------
+ Aggregate
+ -> Index Only Scan using sp_radix_ind on radix_text_tbl
+ Index Cond: (t ^@ 'Worth'::text)
+(3 rows)
+
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+ count
+-------
+ 2
+(1 row)
+
EXPLAIN (COSTS OFF)
SELECT * FROM gpolygon_tbl ORDER BY f1 <-> '(0,0)'::point LIMIT 10;
QUERY PLAN
@@ -1763,6 +1784,23 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth
48
(1 row)
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+ QUERY PLAN
+------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on radix_text_tbl
+ Recheck Cond: (t ^@ 'Worth'::text)
+ -> Bitmap Index Scan on sp_radix_ind
+ Index Cond: (t ^@ 'Worth'::text)
+(5 rows)
+
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+ count
+-------
+ 2
+(1 row)
+
RESET enable_seqscan;
RESET enable_indexscan;
RESET enable_bitmapscan;
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 01608d2c04..919248cdd2 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -718,6 +718,7 @@ sha224(bytea)
sha256(bytea)
sha384(bytea)
sha512(bytea)
+text_startswith(text,text)
macaddr8_eq(macaddr8,macaddr8)
macaddr8_lt(macaddr8,macaddr8)
macaddr8_le(macaddr8,macaddr8)
@@ -1887,7 +1888,8 @@ ORDER BY 1, 2, 3;
4000 | 25 | <<=
4000 | 26 | >>
4000 | 27 | >>=
-(121 rows)
+ 4000 | 28 | ^@
+(122 rows)
-- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index 7f17588b0d..c9671a4e13 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -224,6 +224,8 @@ SELECT count(*) FROM radix_text_tbl WHERE t > 'Worth
SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth St ';
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+
SELECT * FROM gpolygon_tbl ORDER BY f1 <-> '(0,0)'::point LIMIT 10;
SELECT circle_center(f1), round(radius(f1)) as radius FROM gcircle_tbl ORDER BY f1 <-> '(200,300)'::point LIMIT 10;
@@ -441,6 +443,10 @@ EXPLAIN (COSTS OFF)
SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth St ';
SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth St ';
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+
EXPLAIN (COSTS OFF)
SELECT * FROM gpolygon_tbl ORDER BY f1 <-> '(0,0)'::point LIMIT 10;
SELECT * FROM gpolygon_tbl ORDER BY f1 <-> '(0,0)'::point LIMIT 10;
@@ -578,6 +584,10 @@ EXPLAIN (COSTS OFF)
SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth St ';
SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth St ';
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth';
+
RESET enable_seqscan;
RESET enable_indexscan;
RESET enable_bitmapscan;