I'm trying to figure out how to compile text search statistics on a
per-document basis.
I can already compute text search statistics for the entire corpus by calling
ts_stat once all documents have been inserted. What I also want is to run
ts_stat on the tsvector of each individual row, so as to get the term
frequency per document.

Sample code and comments follow.


-- Dumped from database version 9.5.7

-- Create a dedicated database and switch the psql session over to it
-- (\connect is a psql meta-command, not an SQL statement).
CREATE DATABASE nlp;

\connect nlp

-- The trigger functions in this script are declared LANGUAGE plpgsql.
CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;

-- Unqualified names are looked up in public first, then pg_catalog.
SET search_path TO public, pg_catalog;

-- One row per document. document_vector is not supplied by the client;
-- the document_bit trigger fills it in from content.

CREATE TABLE document (
    document_id     SERIAL,
    content         TEXT NOT NULL,
    document_vector TSVECTOR,
    PRIMARY KEY (document_id)
);

-- This is the table I need help with how to populate
-- with term frequency per document.
-- It holds one row for every distinct word of a document, so the key is
-- (document_id, word); a key on document_id alone would admit only a
-- single word per document.

CREATE TABLE document_statistics (
    document_id integer NOT NULL,
    word text NOT NULL,
    ndoc bigint, /* this will be one, since there is only one document */
    nentry bigint, /* occurrences of word in the document: the number of interest */
    CONSTRAINT document_statistics_pkey PRIMARY KEY (document_id, word)
);


-- Every statistics row must refer to an existing document.
ALTER TABLE ONLY document_statistics
    ADD CONSTRAINT document_statistics_document_id_fkey
        FOREIGN KEY (document_id) REFERENCES document (document_id);


-- Row-level trigger function: derives the row's search vector from its
-- content using the 'simple' text search configuration, then returns the
-- modified row so that it is the one stored.
CREATE FUNCTION document_bit() RETURNS trigger
    LANGUAGE plpgsql
    AS $$
BEGIN
    NEW.document_vector := to_tsvector('simple', NEW.content);
    RETURN NEW;
END;
$$;

-- Recompute document_vector before every insert and every update of a row.
CREATE TRIGGER document_bit
    BEFORE INSERT OR UPDATE
    ON document
    FOR EACH ROW
    EXECUTE PROCEDURE document_bit();


-- Sample data: three short documents. Only content is supplied;
-- document_id comes from the serial default.

INSERT INTO document (content) VALUES ('Hello World!');
INSERT INTO document (content) VALUES ('The quick brown dog jumped over the lazy dog');
-- The doubled quote ('') is an escaped apostrophe inside the literal.
INSERT INTO document (content) VALUES ('One flew over the coo coo''s nest');

-- With every document loaded, gather word statistics for the whole corpus:
-- ts_stat executes the quoted query and summarises the tsvector column
-- that query returns.
-- NOTE(review): corpus_statistics is not created in the visible part of this
-- script; presumably it already exists with columns matching ts_stat's
-- output -- confirm.

INSERT INTO corpus_statistics
SELECT *
FROM ts_stat('select document_vector 
from document');


-- AFTER INSERT row trigger function: records the term frequencies of the
-- newly inserted document in document_statistics.
--
-- ts_stat() takes the text of a query that returns a single tsvector
-- column, not a tsvector value. The new row's vector is therefore embedded
-- in that query as a safely quoted literal (format's %L) and cast back to
-- tsvector, so ts_stat sees exactly one document.
--
-- Requires document_statistics to accept one row per (document_id, word).

CREATE FUNCTION document_ait() RETURNS trigger
    LANGUAGE plpgsql
    AS $$
BEGIN
        -- One set-based insert: a statistics row per distinct word.
        INSERT INTO public.document_statistics (document_id, word, ndoc, nentry)
        SELECT new.document_id, stat.word, stat.ndoc, stat.nentry
        FROM ts_stat(format('SELECT %L::tsvector', new.document_vector)) AS stat;

        -- The return value of an AFTER row trigger is ignored.
        RETURN new;
END;
$$;

-- Run document_ait once for each newly inserted document row.
CREATE TRIGGER document_ait
    AFTER INSERT ON document
    FOR EACH ROW
    EXECUTE PROCEDURE document_ait();

Reply via email to