[HACKERS] [PATCH] few fts functions for jsonb

Dmitry Dolgov Tue, 28 Feb 2017 03:01:17 -0800

Hi all

I would like to propose patch with a set of new small functions for fts in
case of
jsonb data type:


* to_tsvector(config, jsonb) - make a tsvector from all string values and
  elements of jsonb object. To prevent the situation, when tsquery can find
a
  phrase consisting of lexemes from two different values/elements, this
  function will add an increment to position of each lexeme from every new
  value/element.

* ts_headline(config, jsonb, tsquery, options) - generate a headline
directly
  from jsonb object

Here are the examples how they work:

```
=# select to_tsvector('{"a": "aaa bbb", "b": ["ccc ddd"], "c": {"d": "eee
fff"}}'::jsonb);
                   to_tsvector
-------------------------------------------------
 'aaa':1 'bbb':2 'ccc':4 'ddd':5 'eee':7 'fff':8
(1 row)


=# select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc
ddd"}}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
     ts_headline
----------------------
 aaa <bbb> ccc <ddd>
(1 row)
```

Any comments or suggestions?

diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index 6e5de8f..08e08e5 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -16,6 +16,8 @@
 #include "tsearch/ts_cache.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
+#include "utils/jsonb.h"
+#include "utils/fmgrprotos.h"
 
 
 typedef struct MorphOpaque
@@ -256,6 +258,58 @@ to_tsvector(PG_FUNCTION_ARGS)
 										PointerGetDatum(in)));
 }
 
+Datum
+jsonb_to_tsvector(PG_FUNCTION_ARGS)
+{
+	Jsonb				*jb = PG_GETARG_JSONB(0);
+	JsonbIterator		*it;
+	JsonbValue			v;
+	Oid					cfgId;
+	ParsedText			prs;
+	TSVector			result, item_vector;
+	JsonbIteratorToken	type;
+	int					i;
+
+	cfgId = getTSCurrentConfig(true);
+	it = JsonbIteratorInit(&jb->root);
+
+	while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+	{
+		if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString)
+		{
+			prs.lenwords = v.val.string.len / 6;
+
+			if (prs.lenwords == 0)
+				prs.lenwords = 2;
+
+			prs.curwords = 0;
+			prs.pos = 0;
+			prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+
+			parsetext(cfgId, &prs, v.val.string.val, v.val.string.len);
+
+			if (prs.curwords)
+			{
+				if (result->size != 0)
+				{
+					for (i = 0; i < prs.curwords; i++)
+						prs.words[i].pos.pos = prs.words[i].pos.pos + TS_JUMP;
+
+					item_vector = make_tsvector(&prs);
+
+					result = DirectFunctionCall2(tsvector_concat,
+											TSVectorGetDatum(result),
+											PointerGetDatum(item_vector));
+				}
+				else
+					result = make_tsvector(&prs);
+			}
+		}
+	}
+
+	PG_RETURN_DATUM(result);
+}
+
 /*
  * to_tsquery
  */
diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c
index 8ca1c62..035632e 100644
--- a/src/backend/tsearch/wparser.c
+++ b/src/backend/tsearch/wparser.c
@@ -21,6 +21,7 @@
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
 #include "utils/varlena.h"
+#include "utils/jsonb.h"
 
 
 /******sql-level interface******/
@@ -362,3 +363,41 @@ ts_headline_opt(PG_FUNCTION_ARGS)
 										PG_GETARG_DATUM(1),
 										PG_GETARG_DATUM(2)));
 }
+
+Datum
+ts_headline_jsonb(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+								  ObjectIdGetDatum(getTSCurrentConfig(true)),
+										CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(0))),
+										PG_GETARG_DATUM(1)));
+}
+
+Datum
+ts_headline_jsonb_byid(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+										PG_GETARG_DATUM(0),
+										CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(1))),
+										PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_jsonb_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
+								  ObjectIdGetDatum(getTSCurrentConfig(true)),
+										CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(0))),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_jsonb_byid_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
+										PG_GETARG_DATUM(0),
+										CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(1))),
+										PG_GETARG_DATUM(2),
+										PG_GETARG_DATUM(3)));
+}
diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c
index 6a7aab2..d504b87 100644
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -4130,3 +4130,29 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls,
 		}
 	}
 }
+
+/*
+ * Gather all string values and elements from jsonb into one string buffer.
+ * It's convenient for using inside ts_headline_* functions.
+ */
+char*
+jsonb_values_as_string(Jsonb *jb)
+{
+	JsonbIterator		*it;
+	JsonbValue			v;
+	JsonbIteratorToken	type;
+	StringInfo			buffer = makeStringInfo();
+
+	it = JsonbIteratorInit(&jb->root);
+
+	while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+	{
+		if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString)
+		{
+			appendStringInfoString(buffer, v.val.string.val);
+			appendBinaryStringInfo(buffer, " ", 1);
+		}
+	}
+
+	return buffer->data;
+}
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index a4cc86d..ccedece 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4759,6 +4759,15 @@ DESCR("generate headline");
 DATA(insert OID = 3755 (  ts_headline	PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 25 "25 3615" _null_ _null_ _null_ _null_ _null_ ts_headline _null_ _null_ _null_ ));
 DESCR("generate headline");
 
+DATA(insert OID = 4201 (  ts_headline	PGNSP PGUID 12 100 0 0 0 f f f f t f i s 4 0 25 "3734 3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid_opt _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4202 (  ts_headline	PGNSP PGUID 12 100 0 0 0 f f f f t f i s 3 0 25 "3734 3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4203 (  ts_headline	PGNSP PGUID 12 100 0 0 0 f f f f t f s s 3 0 25 "3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_opt _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4204 (  ts_headline	PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 25 "3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+
 DATA(insert OID = 3745 (  to_tsvector		PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3614 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsvector_byid _null_ _null_ _null_ ));
 DESCR("transform to tsvector");
 DATA(insert OID = 3746 (  to_tsquery		PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsquery_byid _null_ _null_ _null_ ));
@@ -4775,6 +4784,8 @@ DATA(insert OID = 3751 (  plainto_tsquery	PGNSP PGUID 12 100 0 0 0 f f f f t f s
 DESCR("transform to tsquery");
 DATA(insert OID = 5001 (  phraseto_tsquery	PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ ));
 DESCR("transform to tsquery");
+DATA(insert OID = 3800 (  to_tsvector		PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ ));
+DESCR("transform jsonb to tsvector");
 
 DATA(insert OID = 3752 (  tsvector_update_trigger			PGNSP PGUID 12 1 0 0 0 f f f f f f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ tsvector_update_trigger_byid _null_ _null_ _null_ ));
 DESCR("trigger for automatic update of tsvector column");
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 155650c..873e2e1 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -86,6 +86,15 @@ typedef struct
 #define MAXNUMPOS	(256)
 #define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )
 
+/*
+ * In case if a TSVector contains several parts and we want to treat them as
+ * separate, it's necessary to add an artificial increment to position of each
+ * lexeme from every next part. It's required to avoid the situation when
+ * tsquery can find a phrase consisting of lexemes from two of such parts.
+ * TS_JUMP defined a value of this increment.
+ */
+#define TS_JUMP 1
+
 /* This struct represents a complete tsvector datum */
 typedef struct
 {
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 411e158..233f7b7 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -377,5 +377,7 @@ extern char *JsonbToCString(StringInfo out, JsonbContainer *in,
 extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in,
 					 int estimated_len);
 
+extern char *jsonb_values_as_string(Jsonb *jsonb);
+
 
 #endif   /* __JSONB_H__ */
diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
index 8ec4150..3333730 100644
--- a/src/test/regress/expected/jsonb.out
+++ b/src/test/regress/expected/jsonb.out
@@ -3474,3 +3474,86 @@ HINT:  Try using the function jsonb_set to replace key value.
 select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true);
 ERROR:  cannot replace existing key
 HINT:  Try using the function jsonb_set to replace key value.
+-- jsonb to tsvector
+select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb);
+                                to_tsvector                                
+---------------------------------------------------------------------------
+ 'aaa':1 'bbb':2 'ccc':4 'ddd':3 'eee':6 'fff':7 'ggg':8 'hhh':10 'iii':11
+(1 row)
+
+-- jsonb to tsvector with stop words
+select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb);
+                                to_tsvector                                 
+----------------------------------------------------------------------------
+ 'aaa':1 'bbb':3 'ccc':5 'ddd':4 'eee':8 'fff':9 'ggg':10 'hhh':12 'iii':13
+(1 row)
+
+-- ts_vector corner cases
+select to_tsvector('""'::jsonb);
+ to_tsvector 
+-------------
+ 
+(1 row)
+
+select to_tsvector('{}'::jsonb);
+ to_tsvector 
+-------------
+ 
+(1 row)
+
+select to_tsvector('[]'::jsonb);
+ to_tsvector 
+-------------
+ 
+(1 row)
+
+select to_tsvector('null'::jsonb);
+ to_tsvector 
+-------------
+ 
+(1 row)
+
+-- ts_headline for jsonb
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+                                 ts_headline                                  
+------------------------------------------------------------------------------
+ aaa <b>bbb</b> ccc <b>ddd</b> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj 
+(1 row)
+
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+                          ts_headline                          
+---------------------------------------------------------------
+ aaa <b>bbb</b> ccc <b>ddd</b> fff\x02 ggg hhhiii jjj iii jjj 
+(1 row)
+
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+                            ts_headline                             
+--------------------------------------------------------------------
+ aaa <bbb> ccc <ddd> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj 
+(1 row)
+
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+                            ts_headline                             
+--------------------------------------------------------------------
+ aaa <bbb> ccc <ddd> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj 
+(1 row)
+
+-- corner cases for ts_headline with jsonb
+select ts_headline('null'::jsonb, tsquery('aaa & bbb'));
+ ts_headline 
+-------------
+ 
+(1 row)
+
+select ts_headline('{}'::jsonb, tsquery('aaa & bbb'));
+ ts_headline 
+-------------
+ 
+(1 row)
+
+select ts_headline('[]'::jsonb, tsquery('aaa & bbb'));
+ ts_headline 
+-------------
+ 
+(1 row)
+
diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql
index e2eaca0..2e31ee6 100644
--- a/src/test/regress/sql/jsonb.sql
+++ b/src/test/regress/sql/jsonb.sql
@@ -878,3 +878,26 @@ select jsonb_insert('{"a": {"b": "value"}}', '{a, c}', '"new_value"', true);
 
 select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"');
 select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true);
+
+-- jsonb to tsvector
+select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb);
+
+-- jsonb to tsvector with stop words
+select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb);
+
+-- ts_vector corner cases
+select to_tsvector('""'::jsonb);
+select to_tsvector('{}'::jsonb);
+select to_tsvector('[]'::jsonb);
+select to_tsvector('null'::jsonb);
+
+-- ts_headline for jsonb
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+
+-- corner cases for ts_headline with jsonb
+select ts_headline('null'::jsonb, tsquery('aaa & bbb'));
+select ts_headline('{}'::jsonb, tsquery('aaa & bbb'));
+select ts_headline('[]'::jsonb, tsquery('aaa & bbb'));

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] [PATCH] few fts functions for jsonb

Reply via email to