create table data
(
data_id integer primary key,
...
);
create table keywords
(
keyword_id integer primary key,
keyword text collate nocase unique
);
create table n2mKeywords
(
data_id integer references data,
keyword_id integer references keywords,
primary key (data_id, keyword_id),
unique (keyword_id, data_id)
) WITHOUT ROWID;
select * form data where id in
(select data_id from keywords natural join n2mkeywords where keyword = 'onca'
or keyword = 'bonca'
INTERSECT
select data_id from keywords natural join n2mkeyworks where keyword =
'chocolate');
will return all the data tuples associated with the keyword ('once' or 'bonca')
and 'chocolate'
> -----Original Message-----
> From: sqlite-users-bounces at mailinglists.sqlite.org [mailto:sqlite-users-
> bounces at mailinglists.sqlite.org] On Behalf Of Andy Rahn
> Sent: Thursday, 9 July, 2015 11:26
> To: General
> Subject: [sqlite] Suggestions for Fast Set Logic?
>
> I want to build an application that can search for specific documents
> based
> on a lot of criteria from a user (e.g. matching keywords, text, etc...).
> And then to combine these results using boolean logic ... For example,
> keyword 'animal' AND rating > 3
>
> Each document has an integer id.
>
> My strategy so far is to gather the id for each subquery into a temporary
> table. Then I can combine the tables using UNION or INTERSECT keywords.
>
> However, when I try a sample that does this on 250,000 rows, I find the
> performance is rather slow: 2 seconds. (I've built sqlite3 myself on
> MacOS)
>
> sqlite3 --version
> > 3.8.10.2 2015-05-20 18:17:19 2ef4f3a5b1d1d0c4338f8243d40a2452cc1f7fe4
>
>
>
> > cat config.sql intersection.sql | sqlite3
> > 250001
> > Run Time: real 2.000 user 1.956734 sys 0.044040
> >
>
> (samples files included below.)
>
> I wonder if anyone has suggestions for how to improve this approach. For
> example, I've considered trying to build a monolithic query which does the
> subqueries and UNION/INTERSECTION logic all at once, but my previous
> experience has shown that the resulting queries are very complex and hard
> for me to reason about if/when they get slow. This approach lets me
> profile each subquery easily. It also lets me tackle sorting the results
> as a separate step.
>
> Another idea I've toyed with is building a custom implementation of UNION
> /
> INTERSECTION for result sets that are just sets of integers. I could do
> this as a virtual table in sqlite.
>
> Thanks for your thoughts on this problem.
>
> Andy
>
>
> config.sql contains:
>
> CREATE TABLE config( numAssets );
>
> INSERT INTO config VALUES( 250000 );
>
> intersection.sql contains:
>
> CREATE TEMPORARY TABLE idset1 ( id PRIMARY KEY );
> CREATE TEMPORARY TABLE idset2 ( id PRIMARY KEY );
>
> CREATE INDEX idset1_id ON idset1 ( id );
> CREATE INDEX idset2_id ON idset2 ( id );
>
> BEGIN TRANSACTION;
>
> INSERT INTO idset1
> SELECT id FROM ( WITH RECURSIVE
> cnt( id ) AS (
> VALUES( 0 ) UNION ALL
> SELECT id+1 FROM cnt WHERE id < ( SELECT MAX( numAssets ) FROM
> config
> ))
> select * from cnt ), config;
>
> COMMIT TRANSACTION;
>
>
> BEGIN TRANSACTION;
>
> INSERT INTO idset2
> SELECT id + 1000000 FROM ( WITH RECURSIVE
> cnt( id ) AS (
> VALUES( 0 ) UNION ALL
> SELECT id+1 FROM cnt WHERE id < ( SELECT MAX( numAssets ) FROM
> config
> ))
> select * from cnt ), config;
>
> COMMIT TRANSACTION;
>
> .timer ON
> SELECT count(id) FROM idset1 WHERE id IN ( SELECT id FROM idset1 UNION
> SELECT id FROM idset2 );
> .timer OFF
> _______________________________________________
> sqlite-users mailing list
> sqlite-users at mailinglists.sqlite.org
> http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users