[SQL] tsearch2: ts_headline performance problem

Anders Østergaard Jensen Sun, 06 Jan 2008 06:17:30 -0800

Hi there people,

I'm trying to build a text search engine for relatively large PDFdocuments with a web frontend. I use PostgreSQL 8.3 beta3/4 withtsearch2 for full-text indexing. Everything performs quite well, untilI start indexing 800 pages of text into the database (with the utf8danish locale and dictionary) -- then ts_headline performsunreasonably slow. I use a GIN index on the table (calleddocument_revisions which is linked to a master document table called'documents' -- the system is made for version tracking of multipledocuments into the same index) and an update trigger for maintainingthe tsvector row on the table.


Omitting ts_headline yields the following EXPLAIN result:

<snip>
metabase=# explain analyze select * from f_search_revision('website');
NOTICE:  f_search_revisions_arr: loop: (1) doc_id = (1000)
CONTEXT:  SQL statement "SELECT f_search_revision_arr( $1 ) AS res"
PL/pgSQL function "f_search_revision_int" line 13 at SQL statement
SQL function "f_search_revision" statement 1
NOTICE:  hest: ({1000})
CONTEXT:  SQL function "f_search_revision" statement 1
NOTICE:  total ranking for (1) eq (0.0865452) for id = (1000)
CONTEXT:  SQL function "f_search_revision" statement 1
                                                         QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------

Function Scan on f_search_revision (cost=0.00..260.00 rows=1000width=408) (actual time=2413.384..2413.384 rows=1 loops=1)

 Total runtime: 2413.465 ms
(2 rows)
</snip>

The function f_search_revision invokes another function that fetchesall matching document_revisions without calling ts_headline (methodf_search_revision_arr), followed by a post-processing of the matchingrows for fetching the ts_ranking values and ts_headline. The followingsnippet shows the query plan for the function call with the same query:


<snip>

metabase=# explain analyze select * fromf_search_revision_arr('website');

NOTICE:  f_search_revisions_arr: loop: (1) doc_id = (1000)
                                                     QUERY PLAN
---------------------------------------------------------------------------------------------------------------------

Function Scan on f_search_revision_arr (cost=0.00..0.26 rows=1width=32) (actual time=3.417..3.418 rows=1 loops=1)

 Total runtime: 3.442 ms
(2 rows)
</snip>

Here is the code for the functions that I am using:

CREATE OR REPLACE FUNCTION f_search_revision_arr(q TEXT) RETURNSINTEGER[] AS $$

        declare
                iter            record; 
                results integer[];
                i                               integer;
        begin
                i := 1;
                for iter in SELECT * FROM documents doc, plainto_tsquery(q) AS 
tsq
                        WHERE (doc.search_idx @@ tsq)

OR doc.id IN (SELECT document_id FROM document_revisions dr,plainto_tsquery(q) AS tsq2

                        WHERE dr.search_idx @@ tsq2) loop
                        results[i] = iter.id;

raise notice 'f_search_revisions_arr: loop: (%) doc_id = (%)', i,results[i];

                        i := i + 1;
                end loop;
                return results;
        end;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION f_search_revision_int(q TEXT) RETURNSt_search_result[] AS $$

        declare
                doc_ids integer[];
                doc_rankings float[];
                i integer;
                iter record;
                pointer_doc record;
                pointer_rev record;
                pointer_type record;
                results t_search_result[];
        begin
                i := 1;

                SELECT INTO iter f_search_revision_arr(q) AS res;
                doc_ids := iter.res;
                raise notice 'hest: (%)', doc_ids;

                -- Ensure that something was found:
                if (doc_ids IS NULL) then
                        return results;
                end if;

                for i in 1..array_upper(doc_ids, 1) loop
                        -- raise notice 'upper = (%)', doc_ids[i];
                        SELECT INTO pointer_doc ts_rank(doc.search_idx, tsq) AS 
rank_doc,
                                ts_headline(doc.name, tsq) AS headline_name,
                                ts_headline(doc.description, tsq) AS 
headline_description,
                                ts_headline(doc.keywords, tsq) AS 
headline_keywords,
                                doc.name,
                                doc.description,
                                doc.keywords,
                                doc.document_type_id
                                FROM documents doc, plainto_tsquery(q) AS tsq
                                WHERE doc.id = doc_ids[i];

                        SELECT INTO pointer_rev ts_rank(rev.search_idx, tsq) AS 
rank_rev,
                                ts_headline(rev.content, tsq) AS 
headline_content,
                                rev.id AS revision_id
                                FROM document_revisions rev, plainto_tsquery(q) 
AS tsq
                                WHERE rev.document_id = doc_ids[i];

                        -- Select the document type:

SELECT INTO pointer_type mime_type, type_name, image_url,extension, internal_type

                                FROM document_types WHERE id = 
pointer_doc.document_type_id;

doc_rankings[i] = pointer_doc.rank_doc + pointer_rev.rank_rev; --SUM!raise notice 'total ranking for (%) eq (%) for id = (%)', i,doc_rankings[i], doc_ids[i];

                        results[i] = ROW(pointer_rev.revision_id,
                                         doc_ids[i],
                                         pointer_doc.name, 
pointer_doc.description, pointer_doc.keywords,

pointer_doc.document_type_id, pointer_type.mime_type,pointer_type.type_name, pointer_type.image_url,pointer_type.extension, pointer_type.internal_type,doc_rankings[i], pointer_doc.headline_name,pointer_doc.headline_description, pointer_doc.headline_keywords,pointer_rev.headline_content,

                                         0);
                end loop;
                return results;
        end;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION f_search_revision(q TEXT) RETURNS SETOFt_search_result AS $$

        SELECT * FROM f_unnest(f_search_revision_int($1));
$$ LANGUAGE sql;

------

How come that ts_headline yields so bad performance? Is there any wayof tuning ts_headline? I need the highlighting functionality forpresenting the search results in the web frontend properly.Alternately, are there any alternatives to the ts_headline function?


Sincerely,

Anders




---------------------------(end of broadcast)---------------------------
TIP 2: Don't 'kill -9' the postmaster

[SQL] tsearch2: ts_headline performance problem

Reply via email to