Hi all I would like to propose patch with a set of new small functions for fts in case of jsonb data type:
* to_tsvector(config, jsonb) - make a tsvector from all string values and elements of jsonb object. To prevent the situation, when tsquery can find a phrase consisting of lexemes from two different values/elements, this function will add an increment to position of each lexeme from every new value/element. * ts_headline(config, jsonb, tsquery, options) - generate a headline directly from jsonb object Here are the examples how they work: ``` =# select to_tsvector('{"a": "aaa bbb", "b": ["ccc ddd"], "c": {"d": "eee fff"}}'::jsonb); to_tsvector ------------------------------------------------- 'aaa':1 'bbb':2 'ccc':4 'ddd':5 'eee':7 'fff':8 (1 row) =# select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd"}}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); ts_headline ---------------------- aaa <bbb> ccc <ddd> (1 row) ``` Any comments or suggestions?
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index 6e5de8f..08e08e5 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -16,6 +16,8 @@ #include "tsearch/ts_cache.h" #include "tsearch/ts_utils.h" #include "utils/builtins.h" +#include "utils/jsonb.h" +#include "utils/fmgrprotos.h" typedef struct MorphOpaque @@ -256,6 +258,58 @@ to_tsvector(PG_FUNCTION_ARGS) PointerGetDatum(in))); } +Datum +jsonb_to_tsvector(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB(0); + JsonbIterator *it; + JsonbValue v; + Oid cfgId; + ParsedText prs; + TSVector result, item_vector; + JsonbIteratorToken type; + int i; + + cfgId = getTSCurrentConfig(true); + it = JsonbIteratorInit(&jb->root); + + while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) + { + if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString) + { + prs.lenwords = v.val.string.len / 6; + + if (prs.lenwords == 0) + prs.lenwords = 2; + + prs.curwords = 0; + prs.pos = 0; + prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + + parsetext(cfgId, &prs, v.val.string.val, v.val.string.len); + + if (prs.curwords) + { + if (result->size != 0) + { + for (i = 0; i < prs.curwords; i++) + prs.words[i].pos.pos = prs.words[i].pos.pos + TS_JUMP; + + item_vector = make_tsvector(&prs); + + result = DirectFunctionCall2(tsvector_concat, + TSVectorGetDatum(result), + PointerGetDatum(item_vector)); + } + else + result = make_tsvector(&prs); + } + } + } + + PG_RETURN_DATUM(result); +} + /* * to_tsquery */ diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c index 8ca1c62..035632e 100644 --- a/src/backend/tsearch/wparser.c +++ b/src/backend/tsearch/wparser.c @@ -21,6 +21,7 @@ #include "tsearch/ts_utils.h" #include "utils/builtins.h" #include "utils/varlena.h" +#include "utils/jsonb.h" /******sql-level interface******/ @@ -362,3 +363,41 @@ ts_headline_opt(PG_FUNCTION_ARGS) PG_GETARG_DATUM(1), PG_GETARG_DATUM(2))); } + +Datum +ts_headline_jsonb(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt, + ObjectIdGetDatum(getTSCurrentConfig(true)), + CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(0))), + PG_GETARG_DATUM(1))); +} + +Datum +ts_headline_jsonb_byid(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt, + PG_GETARG_DATUM(0), + CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(1))), + PG_GETARG_DATUM(2))); +} + +Datum +ts_headline_jsonb_opt(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt, + ObjectIdGetDatum(getTSCurrentConfig(true)), + CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(0))), + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(2))); +} + +Datum +ts_headline_jsonb_byid_opt(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt, + PG_GETARG_DATUM(0), + CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(1))), + PG_GETARG_DATUM(2), + PG_GETARG_DATUM(3))); +} diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index 6a7aab2..d504b87 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -4130,3 +4130,29 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls, } } } + +/* + * Gather all string values and elements from jsonb into one string buffer. + * It's convenient for using inside ts_headline_* functions. + */ +char* +jsonb_values_as_string(Jsonb *jb) +{ + JsonbIterator *it; + JsonbValue v; + JsonbIteratorToken type; + StringInfo buffer = makeStringInfo(); + + it = JsonbIteratorInit(&jb->root); + + while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) + { + if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString) + { + appendStringInfoString(buffer, v.val.string.val); + appendBinaryStringInfo(buffer, " ", 1); + } + } + + return buffer->data; +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index a4cc86d..ccedece 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4759,6 +4759,15 @@ DESCR("generate headline"); DATA(insert OID = 3755 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 25 "25 3615" _null_ _null_ _null_ _null_ _null_ ts_headline _null_ _null_ _null_ )); DESCR("generate headline"); +DATA(insert OID = 4201 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 4 0 25 "3734 3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid_opt _null_ _null_ _null_ )); +DESCR("generate headline from jsonb"); +DATA(insert OID = 4202 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 3 0 25 "3734 3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid _null_ _null_ _null_ )); +DESCR("generate headline from jsonb"); +DATA(insert OID = 4203 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 3 0 25 "3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_opt _null_ _null_ _null_ )); +DESCR("generate headline from jsonb"); +DATA(insert OID = 4204 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 25 "3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb _null_ _null_ _null_ )); +DESCR("generate headline from jsonb"); + DATA(insert OID = 3745 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3614 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsvector_byid _null_ _null_ _null_ )); DESCR("transform to tsvector"); DATA(insert OID = 3746 ( to_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsquery_byid _null_ _null_ _null_ )); @@ -4775,6 +4784,8 @@ DATA(insert OID = 3751 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s DESCR("transform to tsquery"); DATA(insert OID = 5001 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ )); DESCR("transform to tsquery"); +DATA(insert OID = 3800 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ )); +DESCR("transform jsonb to tsvector"); DATA(insert OID = 3752 ( tsvector_update_trigger PGNSP PGUID 12 1 0 0 0 f f f f f f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ tsvector_update_trigger_byid _null_ _null_ _null_ )); DESCR("trigger for automatic update of tsvector column"); diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index 155650c..873e2e1 100644 --- a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -86,6 +86,15 @@ typedef struct #define MAXNUMPOS (256) #define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) ) +/* + * In case if a TSVector contains several parts and we want to treat them as + * separate, it's necessary to add an artificial increment to position of each + * lexeme from every next part. It's required to avoid the situation when + * tsquery can find a phrase consisting of lexemes from two of such parts. + * TS_JUMP defined a value of this increment. + */ +#define TS_JUMP 1 + /* This struct represents a complete tsvector datum */ typedef struct { diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h index 411e158..233f7b7 100644 --- a/src/include/utils/jsonb.h +++ b/src/include/utils/jsonb.h @@ -377,5 +377,7 @@ extern char *JsonbToCString(StringInfo out, JsonbContainer *in, extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in, int estimated_len); +extern char *jsonb_values_as_string(Jsonb *jsonb); + #endif /* __JSONB_H__ */ diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out index 8ec4150..3333730 100644 --- a/src/test/regress/expected/jsonb.out +++ b/src/test/regress/expected/jsonb.out @@ -3474,3 +3474,86 @@ HINT: Try using the function jsonb_set to replace key value. select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true); ERROR: cannot replace existing key HINT: Try using the function jsonb_set to replace key value. +-- jsonb to tsvector +select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb); + to_tsvector +--------------------------------------------------------------------------- + 'aaa':1 'bbb':2 'ccc':4 'ddd':3 'eee':6 'fff':7 'ggg':8 'hhh':10 'iii':11 +(1 row) + +-- jsonb to tsvector with stop words +select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb); + to_tsvector +---------------------------------------------------------------------------- + 'aaa':1 'bbb':3 'ccc':5 'ddd':4 'eee':8 'fff':9 'ggg':10 'hhh':12 'iii':13 +(1 row) + +-- ts_vector corner cases +select to_tsvector('""'::jsonb); + to_tsvector +------------- + +(1 row) + +select to_tsvector('{}'::jsonb); + to_tsvector +------------- + +(1 row) + +select to_tsvector('[]'::jsonb); + to_tsvector +------------- + +(1 row) + +select to_tsvector('null'::jsonb); + to_tsvector +------------- + +(1 row) + +-- ts_headline for jsonb +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); + ts_headline +------------------------------------------------------------------------------ + aaa <b>bbb</b> ccc <b>ddd</b> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj +(1 row) + +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); + ts_headline +--------------------------------------------------------------- + aaa <b>bbb</b> ccc <b>ddd</b> fff\x02 ggg hhhiii jjj iii jjj +(1 row) + +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); + ts_headline +-------------------------------------------------------------------- + aaa <bbb> ccc <ddd> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj +(1 row) + +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); + ts_headline +-------------------------------------------------------------------- + aaa <bbb> ccc <ddd> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj +(1 row) + +-- corner cases for ts_headline with jsonb +select ts_headline('null'::jsonb, tsquery('aaa & bbb')); + ts_headline +------------- + +(1 row) + +select ts_headline('{}'::jsonb, tsquery('aaa & bbb')); + ts_headline +------------- + +(1 row) + +select ts_headline('[]'::jsonb, tsquery('aaa & bbb')); + ts_headline +------------- + +(1 row) + diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql index e2eaca0..2e31ee6 100644 --- a/src/test/regress/sql/jsonb.sql +++ b/src/test/regress/sql/jsonb.sql @@ -878,3 +878,26 @@ select jsonb_insert('{"a": {"b": "value"}}', '{a, c}', '"new_value"', true); select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"'); select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true); + +-- jsonb to tsvector +select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb); + +-- jsonb to tsvector with stop words +select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb); + +-- ts_vector corner cases +select to_tsvector('""'::jsonb); +select to_tsvector('{}'::jsonb); +select to_tsvector('[]'::jsonb); +select to_tsvector('null'::jsonb); + +-- ts_headline for jsonb +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); + +-- corner cases for ts_headline with jsonb +select ts_headline('null'::jsonb, tsquery('aaa & bbb')); +select ts_headline('{}'::jsonb, tsquery('aaa & bbb')); +select ts_headline('[]'::jsonb, tsquery('aaa & bbb'));
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers