http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod b/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod deleted file mode 100644 index 250d536..0000000 --- a/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod +++ /dev/null @@ -1,236 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Cookbook::CustomQueryParser - Sample subclass of QueryParser. - -=head1 ABSTRACT - -Implement a custom search query language using a subclass of -L<Lucy::Search::QueryParser>. - -=head1 The language - -At first, our query language will support only simple term queries and phrases -delimited by double quotes. For simplicity's sake, it will not support -parenthetical groupings, boolean operators, or prepended plus/minus. The -results for all subqueries will be unioned together -- i.e. joined using an OR --- which is usually the best approach for small-to-medium-sized document -collections. - -Later, we'll add support for trailing wildcards. 
- -=head1 Single-field parser - -Our initial parser implentation will generate queries against a single fixed -field, "content", and it will analyze text using a fixed choice of English -EasyAnalyzer. We won't subclass Lucy::Search::QueryParser just yet. - - package FlatQueryParser; - use Lucy::Search::TermQuery; - use Lucy::Search::PhraseQuery; - use Lucy::Search::ORQuery; - use Carp; - - sub new { - my $analyzer = Lucy::Analysis::EasyAnalyzer->new( - language => 'en', - ); - return bless { - field => 'content', - analyzer => $analyzer, - }, __PACKAGE__; - } - -Some private helper subs for creating TermQuery and PhraseQuery objects will -help keep the size of our main parse() subroutine down: - - sub _make_term_query { - my ( $self, $term ) = @_; - return Lucy::Search::TermQuery->new( - field => $self->{field}, - term => $term, - ); - } - - sub _make_phrase_query { - my ( $self, $terms ) = @_; - return Lucy::Search::PhraseQuery->new( - field => $self->{field}, - terms => $terms, - ); - } - -Our private _tokenize() method treats double-quote delimited material as a -single token and splits on whitespace everywhere else. - - sub _tokenize { - my ( $self, $query_string ) = @_; - my @tokens; - while ( length $query_string ) { - if ( $query_string =~ s/^\s+// ) { - next; # skip whitespace - } - elsif ( $query_string =~ s/^("[^"]*(?:"|$))// ) { - push @tokens, $1; # double-quoted phrase - } - else { - $query_string =~ s/(\S+)//; - push @tokens, $1; # single word - } - } - return \@tokens; - } - -The main parsing routine creates an array of tokens by calling _tokenize(), -runs the tokens through through the EasyAnalyzer, creates TermQuery or -PhraseQuery objects according to how many tokens emerge from the -EasyAnalyzer's split() method, and adds each of the sub-queries to the primary -ORQuery. 
- - sub parse { - my ( $self, $query_string ) = @_; - my $tokens = $self->_tokenize($query_string); - my $analyzer = $self->{analyzer}; - my $or_query = Lucy::Search::ORQuery->new; - - for my $token (@$tokens) { - if ( $token =~ s/^"// ) { - $token =~ s/"$//; - my $terms = $analyzer->split($token); - my $query = $self->_make_phrase_query($terms); - $or_query->add_child($phrase_query); - } - else { - my $terms = $analyzer->split($token); - if ( @$terms == 1 ) { - my $query = $self->_make_term_query( $terms->[0] ); - $or_query->add_child($query); - } - elsif ( @$terms > 1 ) { - my $query = $self->_make_phrase_query($terms); - $or_query->add_child($query); - } - } - } - - return $or_query; - } - -=head1 Multi-field parser - -Most often, the end user will want their search query to match not only a -single 'content' field, but also 'title' and so on. To make that happen, we -have to turn queries such as this... - - foo AND NOT bar - -... into the logical equivalent of this: - - (title:foo OR content:foo) AND NOT (title:bar OR content:bar) - -Rather than continue with our own from-scratch parser class and write the -routines to accomplish that expansion, we're now going to subclass Lucy::Search::QueryParser -and take advantage of some of its existing methods. - -Our first parser implementation had the "content" field name and the choice of -English EasyAnalyzer hard-coded for simplicity, but we don't need to do that -once we subclass Lucy::Search::QueryParser. QueryParser's constructor -- -which we will inherit, allowing us to eliminate our own constructor -- -requires a Schema which conveys field -and Analyzer information, so we can just defer to that. 
- - package FlatQueryParser; - use base qw( Lucy::Search::QueryParser ); - use Lucy::Search::TermQuery; - use Lucy::Search::PhraseQuery; - use Lucy::Search::ORQuery; - use PrefixQuery; - use Carp; - - # Inherit new() - -We're also going to jettison our _make_term_query() and _make_phrase_query() -helper subs and chop our parse() subroutine way down. Our revised parse() -routine will generate Lucy::Search::LeafQuery objects instead of TermQueries -and PhraseQueries: - - sub parse { - my ( $self, $query_string ) = @_; - my $tokens = $self->_tokenize($query_string); - my $or_query = Lucy::Search::ORQuery->new; - for my $token (@$tokens) { - my $leaf_query = Lucy::Search::LeafQuery->new( text => $token ); - $or_query->add_child($leaf_query); - } - return $self->expand($or_query); - } - -The magic happens in QueryParser's expand() method, which walks the ORQuery -object we supply to it looking for LeafQuery objects, and calls expand_leaf() -for each one it finds. expand_leaf() performs field-specific analysis, -decides whether each query should be a TermQuery or a PhraseQuery, and if -multiple fields are required, creates an ORQuery which mults out e.g. C<foo> -into C<(title:foo OR content:foo)>. - -=head1 Extending the query language - -To add support for trailing wildcards to our query language, we need to -override expand_leaf() to accommodate PrefixQuery, while deferring to the -parent class implementation on TermQuery and PhraseQuery. 
- - sub expand_leaf { - my ( $self, $leaf_query ) = @_; - my $text = $leaf_query->get_text; - if ( $text =~ /\*$/ ) { - my $or_query = Lucy::Search::ORQuery->new; - for my $field ( @{ $self->get_fields } ) { - my $prefix_query = PrefixQuery->new( - field => $field, - query_string => $text, - ); - $or_query->add_child($prefix_query); - } - return $or_query; - } - else { - return $self->SUPER::expand_leaf($leaf_query); - } - } - -Ordinarily, those asterisks would have been stripped when running tokens -through the EasyAnalyzer -- query strings containing "foo*" would produce -TermQueries for the term "foo". Our override intercepts tokens with trailing -asterisks and processes them as PrefixQueries before C<SUPER::expand_leaf> can -discard them, so that a search for "foo*" can match "food", "foosball", and so -on. - -=head1 Usage - -Insert our custom parser into the search.cgi sample app to get a feel for how -it behaves: - - my $parser = FlatQueryParser->new( schema => $searcher->get_schema ); - my $query = $parser->parse( decode( 'UTF-8', $cgi->param('q') || '' ) ); - my $hits = $searcher->hits( - query => $query, - offset => $offset, - num_wanted => $page_size, - ); - ... - -=cut -
http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Cookbook/FastUpdates.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Cookbook/FastUpdates.pod b/perl/lib/Lucy/Docs/Cookbook/FastUpdates.pod deleted file mode 100644 index eff8e54..0000000 --- a/perl/lib/Lucy/Docs/Cookbook/FastUpdates.pod +++ /dev/null @@ -1,153 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Cookbook::FastUpdates - Near real-time index updates. - -=head1 ABSTRACT - -While index updates are fast on average, worst-case update performance may be -significantly slower. To make index updates consistently quick, we must -manually intervene to control the process of index segment consolidation. - -=head1 The problem - -Ordinarily, modifying an index is cheap. New data is added to new segments, -and the time to write a new segment scales more or less linearly with the -number of documents added during the indexing session. - -Deletions are also cheap most of the time, because we don't remove documents -immediately but instead mark them as deleted, and adding the deletion mark is -cheap. 
- -However, as new segments are added and the deletion rate for existing segments -increases, search-time performance slowly begins to degrade. At some point, -it becomes necessary to consolidate existing segments, rewriting their data -into a new segment. - -If the recycled segments are small, the time it takes to rewrite them may not -be significant. Every once in a while, though, a large amount of data must be -rewritten. - -=head1 Procrastinating and playing catch-up - -The simplest way to force fast index updates is to avoid rewriting anything. - -Indexer relies upon L<IndexManager|Lucy::Index::IndexManager>'s -recycle() method to tell it which segments should be consolidated. If we -subclass IndexManager and override recycle() so that it always returns an -empty array, we get consistently quick performance: - - package NoMergeManager; - use base qw( Lucy::Index::IndexManager ); - sub recycle { [] } - - package main; - my $indexer = Lucy::Index::Indexer->new( - index => '/path/to/index', - manager => NoMergeManager->new, - ); - ... - $indexer->commit; - -However, we can't procrastinate forever. Eventually, we'll have to run an -ordinary, uncontrolled indexing session, potentially triggering a large -rewrite of lots of small and/or degraded segments: - - my $indexer = Lucy::Index::Indexer->new( - index => '/path/to/index', - # manager => NoMergeManager->new, - ); - ... - $indexer->commit; - -=head1 Acceptable worst-case update time, slower degradation - -Never merging anything at all in the main indexing process is probably -overkill. Small segments are relatively cheap to merge; we just need to guard -against the big rewrites. 
- -Setting a ceiling on the number of documents in the segments to be recycled -allows us to avoid a mass proliferation of tiny, single-document segments, -while still offering decent worst-case update speed: - - package LightMergeManager; - use base qw( Lucy::Index::IndexManager ); - - sub recycle { - my $self = shift; - my $seg_readers = $self->SUPER::recycle(@_); - @$seg_readers = grep { $_->doc_max < 10 } @$seg_readers; - return $seg_readers; - } - -However, we still have to consolidate every once in a while, and while that -happens content updates will be locked out. - -=head1 Background merging - -If it's not acceptable to lock out updates while the index consolidation -process runs, the alternative is to move the consolidation process out of -band, using Lucy::Index::BackgroundMerger. - -It's never safe to have more than one Indexer attempting to modify the content -of an index at the same time, but a BackgroundMerger and an Indexer can -operate simultaneously: - - # Indexing process. - use Scalar::Util qw( blessed ); - my $retries = 0; - while (1) { - eval { - my $indexer = Lucy::Index::Indexer->new( - index => '/path/to/index', - manager => LightMergeManager->new, - ); - $indexer->add_doc($doc); - $indexer->commit; - }; - last unless $@; - if ( blessed($@) and $@->isa("Lucy::Store::LockErr") ) { - # Catch LockErr. - warn "Couldn't get lock ($retries retries)"; - $retries++; - } - else { - die "Write failed: $@"; - } - } - - # Background merge process. - my $manager = Lucy::Index::IndexManager->new; - $manager->set_write_lock_timeout(60_000); - my $bg_merger = Lucy::Index::BackgroundMerger->new( - index => '/path/to/index', - manager => $manager, - ); - $bg_merger->commit; - -The exception handling code becomes useful once you have more than one index -modification process happening simultaneously. By default, Indexer tries -several times to acquire a write lock over the span of one second, then holds -it until commit() completes. 
BackgroundMerger handles most of its work -without the write lock, but it does need it briefly once at the beginning and -once again near the end. Under normal loads, the internal retry logic will -resolve conflicts, but if it's not acceptable to miss an insert, you probably -want to catch LockErr exceptions thrown by Indexer. In contrast, a LockErr -from BackgroundMerger probably just needs to be logged. - -=cut - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/DocIDs.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/DocIDs.pod b/perl/lib/Lucy/Docs/DocIDs.pod deleted file mode 100644 index 4210f3d..0000000 --- a/perl/lib/Lucy/Docs/DocIDs.pod +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::DocIDs - Characteristics of Apache Lucy document ids. - -=head1 DESCRIPTION - -=head2 Document ids are signed 32-bit integers - -Document ids in Apache Lucy start at 1. Because 0 is never a valid doc id, we -can use it as a sentinel value: - - while ( my $doc_id = $posting_list->next ) { - ... 
- } - -=head2 Document ids are ephemeral - -The document ids used by Lucy are associated with a single index -snapshot. The moment an index is updated, the mapping of document ids to -documents is subject to change. - -Since IndexReader objects represent a point-in-time view of an index, document -ids are guaranteed to remain static for the life of the reader. However, -because they are not permanent, Lucy document ids cannot be used as -foreign keys to locate records in external data sources. If you truly need a -primary key field, you must define it and populate it yourself. - -Furthermore, the order of document ids does not tell you anything about the -sequence in which documents were added to the index. - -=cut - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/FileFormat.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/FileFormat.pod b/perl/lib/Lucy/Docs/FileFormat.pod deleted file mode 100644 index 2859442..0000000 --- a/perl/lib/Lucy/Docs/FileFormat.pod +++ /dev/null @@ -1,239 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::FileFormat - Overview of index file format. 
- -=head1 OVERVIEW - -It is not necessary to understand the current implementation details of the -index file format in order to use Apache Lucy effectively, but it may be -helpful if you are interested in tweaking for high performance, exotic usage, -or debugging and development. - -On a file system, an index is a directory. The files inside have a -hierarchical relationship: an index is made up of "segments", each of which is -an independent inverted index with its own subdirectory; each segment is made -up of several component parts. - - [index]--| - |--snapshot_XXX.json - |--schema_XXX.json - |--write.lock - | - |--seg_1--| - | |--segmeta.json - | |--cfmeta.json - | |--cf.dat-------| - | |--[lexicon] - | |--[postings] - | |--[documents] - | |--[highlight] - | |--[deletions] - | - |--seg_2--| - | |--segmeta.json - | |--cfmeta.json - | |--cf.dat-------| - | |--[lexicon] - | |--[postings] - | |--[documents] - | |--[highlight] - | |--[deletions] - | - |--[...]--| - -=head1 Write-once philosophy - -All segment directory names consist of the string "seg_" followed by a number -in base 36: seg_1, seg_5m, seg_p9s2 and so on, with higher numbers indicating -more recent segments. Once a segment is finished and committed, its name is -never re-used and its files are never modified. - -Old segments become obsolete and can be removed when their data has been -consolidated into new segments during the process of segment merging and -optimization. A fully-optimized index has only one segment. - -=head1 Top-level entries - -There are a handful of "top-level" files and directories which belong to the -entire index rather than to a particular segment. - -=head2 snapshot_XXX.json - -A "snapshot" file, e.g. C<snapshot_m7p.json>, is list of index files and -directories. Because index files, once written, are never modified, the list -of entries in a snapshot defines a point-in-time view of the data in an index. 
- -Like segment directories, snapshot files also utilize the -unique-base-36-number naming convention; the higher the number, the more -recent the file. The appearance of a new snapshot file within the index -directory constitutes an index update. While a new segment is being written -new files may be added to the index directory, but until a new snapshot file -gets written, a Searcher opening the index for reading won't know about them. - -=head2 schema_XXX.json - -The schema file is a Schema object describing the index's format, serialized -as JSON. It, too, is versioned, and a given snapshot file will reference one -and only one schema file. - -=head2 locks - -By default, only one indexing process may safely modify the index at any given -time. Processes reserve an index by laying claim to the C<write.lock> file -within the C<locks/> directory. A smattering of other lock files may be used -from time to time, as well. - -=head1 A segment's component parts - -By default, each segment has up to five logical components: lexicon, postings, -document storage, highlight data, and deletions. Binary data from these -components gets stored in virtual files within the "cf.dat" compound file; -metadata is stored in a shared "segmeta.json" file. - -=head2 segmeta.json - -The segmeta.json file is a central repository for segment metadata. In -addition to information such as document counts and field numbers, it also -warehouses arbitrary metadata on behalf of individual index components. - -=head2 Lexicon - -Each indexed field gets its own lexicon in each segment. The exact files -involved depend on the field's type, but generally speaking there will be two -parts. First, there's a primary C<lexicon-XXX.dat> file which houses a -complete term list associating terms with corpus frequency statistics, -postings file locations, etc. Second, one or more "lexicon index" files may -be present which contain periodic samples from the primary lexicon file to -facilitate fast lookups. 
- -=head2 Postings - -"Posting" is a technical term from the field of -L<information retrieval|Lucy::Docs::IRTheory>, defined as a single -instance of a one term indexing one document. If you are looking at the index -in the back of a book, and you see that "freedom" is referenced on pages 8, -86, and 240, that would be three postings, which taken together form a -"posting list". The same terminology applies to an index in electronic form. - -Each segment has one postings file per indexed field. When a search is -performed for a single term, first that term is looked up in the lexicon. If -the term exists in the segment, the record in the lexicon will contain -information about which postings file to look at and where to look. - -The first thing any posting record tells you is a document id. By iterating -over all the postings associated with a term, you can find all the documents -that match that term, a process which is analogous to looking up page numbers -in a book's index. However, each posting record typically contains other -information in addition to document id, e.g. the positions at which the term -occurs within the field. - -=head2 Documents - -The document storage section is a simple database, organized into two files: - -=over - -=item * - -B<documents.dat> - Serialized documents. - -=item * - -B<documents.ix> - Document storage index, a solid array of 64-bit integers -where each integer location corresponds to a document id, and the value at -that location points at a file position in the documents.dat file. - -=back - -=head2 Highlight data - -The files which store data used for excerpting and highlighting are organized -similarly to the files used to store documents. - -=over - -=item * - -B<highlight.dat> - Chunks of serialized highlight data, one per doc id. - -=item * - -B<highlight.ix> - Highlight data index -- as with the C<documents.ix> file, a -solid array of 64-bit file pointers. 
- -=back - -=head2 Deletions - -When a document is "deleted" from a segment, it is not actually purged right -away; it is merely marked as "deleted" via a deletions file. Deletions files -contains bit vectors with one bit for each document in the segment; if bit -#254 is set then document 254 is deleted, and if that document turns up in a -search it will be masked out. - -It is only when a segment's contents are rewritten to a new segment during the -segment-merging process that deleted documents truly go away. - -=head1 Compound Files - -If you peer inside an index directory, you won't actually find any files named -"documents.dat", "highlight.ix", etc. unless there is an indexing process -underway. What you will find instead is one "cf.dat" and one "cfmeta.json" -file per segment. - -To minimize the need for file descriptors at search-time, all per-segment -binary data files are concatenated together in "cf.dat" at the close of each -indexing session. Information about where each file begins and ends is stored -in C<cfmeta.json>. When the segment is opened for reading, a single file -descriptor per "cf.dat" file can be shared among several readers. - -=head1 A Typical Search - -Here's a simplified narrative, dramatizing how a search for "freedom" against -a given segment plays out: - -=over - -=item 1 - -The searcher asks the relevant Lexicon Index, "Do you know anything about -'freedom'?" Lexicon Index replies, "Can't say for sure, but if the main -Lexicon file does, 'freedom' is probably somewhere around byte 21008". - -=item 2 - -The main Lexicon tells the searcher "One moment, let me scan our records... -Yes, we have 2 documents which contain 'freedom'. You'll find them in -seg_6/postings-4.dat starting at byte 66991." - -=item 3 - -The Postings file says "Yep, we have 'freedom', all right! Document id 40 -has 1 'freedom', and document 44 has 8. If you need to know more, like if any -'freedom' is part of the phrase 'freedom of speech', ask me about positions! 
- -=item 4 - -If the searcher is only looking for 'freedom' in isolation, that's where it -stops. It now knows enough to assign the documents scores against "freedom", -with the 8-freedom document likely ranking higher than the single-freedom -document. - -=back - - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/IRTheory.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/IRTheory.pod b/perl/lib/Lucy/Docs/IRTheory.pod deleted file mode 100644 index 7696ea8..0000000 --- a/perl/lib/Lucy/Docs/IRTheory.pod +++ /dev/null @@ -1,94 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::IRTheory - Crash course in information retrieval. - -=head1 ABSTRACT - -Just enough Information Retrieval theory to find your way around Apache Lucy. - -=head1 Terminology - -Lucy uses some terminology from the field of information retrieval which -may be unfamiliar to many users. "Document" and "term" mean pretty much what -you'd expect them to, but others such as "posting" and "inverted index" need a -formal introduction: - -=over - -=item * - -I<document> - An atomic unit of retrieval. 
- -=item * - -I<term> - An attribute which describes a document. - -=item * - -I<posting> - One term indexing one document. - -=item * - -I<term list> - The complete list of terms which describe a document. - -=item * - -I<posting list> - The complete list of documents which a term indexes. - -=item * - -I<inverted index> - A data structure which maps from terms to documents. - -=back - -Since Lucy is a practical implementation of IR theory, it loads these -abstract, distilled definitions down with useful traits. For instance, a -"posting" in its most rarefied form is simply a term-document pairing; in -Lucy, the class L<Lucy::Index::Posting::MatchPosting> fills this -role. However, by associating additional information with a posting like the -number of times the term occurs in the document, we can turn it into a -L<ScorePosting|Lucy::Index::Posting::ScorePosting>, making it possible -to rank documents by relevance rather than just list documents which happen to -match in no particular order. - -=head1 TF/IDF ranking algorithm - -Lucy uses a variant of the well-established "Term Frequency / Inverse -Document Frequency" weighting scheme. A thorough treatment of TF/IDF is too -ambitious for our present purposes, but in a nutshell, it means that... - -=over - -=item - -in a search for C<skate park>, documents which score well for the -comparatively rare term C<skate> will rank higher than documents which score -well for the more common term C<park>. - -=item - -a 10-word text which has one occurrence each of both C<skate> and C<park> will -rank higher than a 1000-word text which also contains one occurrence of each. - -=back - -A web search for "tf idf" will turn up many excellent explanations of the -algorithm. 
- -=cut - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Tutorial.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Tutorial.pod b/perl/lib/Lucy/Docs/Tutorial.pod deleted file mode 100644 index 7ec7467..0000000 --- a/perl/lib/Lucy/Docs/Tutorial.pod +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Tutorial - Step-by-step introduction to Apache Lucy. - -=head1 ABSTRACT - -Explore Apache Lucy's basic functionality by starting with a minimalist CGI -search app based on L<Lucy::Simple> and transforming it, step by step, into an -"advanced search" interface utilizing more flexible core modules like -L<Lucy::Index::Indexer> and L<Lucy::Search::IndexSearcher>. - -=head1 DESCRIPTION - -=head2 Chapters - -=over - -=item * - -L<Lucy::Docs::Tutorial::Simple> - Build a bare-bones search app using -L<Lucy::Simple>. - -=item * - -L<Lucy::Docs::Tutorial::BeyondSimple> - Rebuild the app using core -classes like L<Indexer|Lucy::Index::Indexer> and -L<IndexSearcher|Lucy::Search::IndexSearcher> in place of Lucy::Simple. 
- -=item * - -L<Lucy::Docs::Tutorial::FieldType> - Experiment with different field -characteristics using subclasses of L<Lucy::Plan::FieldType>. - -=item * - -L<Lucy::Docs::Tutorial::Analysis> - Examine how the choice of -L<Lucy::Analysis::Analyzer> subclass affects search results. - -=item * - -L<Lucy::Docs::Tutorial::Highlighter> - Augment search results with -highlighted excerpts. - -=item * - -L<Lucy::Docs::Tutorial::QueryObjects> - Unlock advanced search features -by using Query objects instead of query strings. - -=back - -=head2 Source materials - -The source material used by the tutorial app -- a multi-text-file presentation -of the United States constitution -- can be found in the C<sample> directory -at the root of the Lucy distribution, along with finished indexing and search -apps. - - sample/indexer.pl # indexing app - sample/search.cgi # search app - sample/us_constitution # corpus - -=head2 Conventions - -The user is expected to be familiar with OO Perl and basic CGI programming. - -The code in this tutorial assumes a Unix-flavored operating system and the -Apache webserver, but will work with minor modifications on other setups. - -=head1 SEE ALSO - -More advanced and esoteric subjects are covered in -L<Lucy::Docs::Cookbook>. - - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Tutorial/Analysis.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Tutorial/Analysis.pod b/perl/lib/Lucy/Docs/Tutorial/Analysis.pod deleted file mode 100644 index 24c0b58..0000000 --- a/perl/lib/Lucy/Docs/Tutorial/Analysis.pod +++ /dev/null @@ -1,94 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Tutorial::Analysis - How to choose and use Analyzers. - -=head1 DESCRIPTION - -Try swapping out the EasyAnalyzer in our Schema for a StandardTokenizer: - - my $tokenizer = Lucy::Analysis::StandardTokenizer->new; - my $type = Lucy::Plan::FullTextType->new( - analyzer => $tokenizer, - ); - -Search for C<senate>, C<Senate>, and C<Senator> before and after making the -change and re-indexing. - -Under EasyAnalyzer, the results are identical for all three searches, but -under StandardTokenizer, searches are case-sensitive, and the result sets for -C<Senate> and C<Senator> are distinct. - -=head2 EasyAnalyzer - -What's happening is that EasyAnalyzer is performing more aggressive processing -than StandardTokenizer. In addition to tokenizing, it's also converting all -text to lower case so that searches are case-insensitive, and using a -"stemming" algorithm to reduce related words to a common stem (C<senat>, in -this case). - -EasyAnalyzer is actually multiple Analyzers wrapped up in a single package. 
-In this case, it's three-in-one, since specifying an EasyAnalyzer with -C<< language => 'en' >> is equivalent to this snippet: - - my $tokenizer = Lucy::Analysis::StandardTokenizer->new; - my $normalizer = Lucy::Analysis::Normalizer->new; - my $stemmer = Lucy::Analysis::SnowballStemmer->new( language => 'en' ); - my $polyanalyzer = Lucy::Analysis::PolyAnalyzer->new( - analyzers => [ $tokenizer, $normalizer, $stemmer ], - ); - -You can add or subtract Analyzers from there if you like. Try adding a fourth -Analyzer, a SnowballStopFilter for suppressing "stopwords" like "the", "if", -and "maybe". - - my $stopfilter = Lucy::Analysis::SnowballStopFilter->new( - language => 'en', - ); - my $polyanalyzer = Lucy::Analysis::PolyAnalyzer->new( - analyzers => [ $tokenizer, $normalizer, $stopfilter, $stemmer ], - ); - -Also, try removing the SnowballStemmer. - - my $polyanalyzer = Lucy::Analysis::PolyAnalyzer->new( - analyzers => [ $tokenizer, $normalizer ], - ); - -The original choice of a stock English EasyAnalyzer probably still yields the -best results for this document collection, but you get the idea: sometimes you -want a different Analyzer. - -=head2 When the best Analyzer is no Analyzer - -Sometimes you don't want an Analyzer at all. That was true for our "url" -field because we didn't need it to be searchable, but it's also true for -certain types of searchable fields. For instance, "category" fields are often -set up to match exactly or not at all, as are fields like "last_name" (because -you may not want to conflate results for "Humphrey" and "Humphries"). - -To specify that there should be no analysis performed at all, use StringType: - - my $type = Lucy::Plan::StringType->new; - $schema->spec_field( name => 'category', type => $type ); - -=head2 Highlighting up next - -In our next tutorial chapter, L<Lucy::Docs::Tutorial::Highlighter>, -we'll add highlighted excerpts from the "content" field to our search results. 
- - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Tutorial/BeyondSimple.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Tutorial/BeyondSimple.pod b/perl/lib/Lucy/Docs/Tutorial/BeyondSimple.pod deleted file mode 100644 index 6ce1261..0000000 --- a/perl/lib/Lucy/Docs/Tutorial/BeyondSimple.pod +++ /dev/null @@ -1,153 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Tutorial::BeyondSimple - A more flexible app structure. - -=head1 DESCRIPTION - -=head2 Goal - -In this tutorial chapter, we'll refactor the apps we built in -L<Lucy::Docs::Tutorial::Simple> so that they look exactly the same from -the end user's point of view, but offer the developer greater possibilities for -expansion. - -To achieve this, we'll ditch Lucy::Simple and replace it with the -classes that it uses internally: - -=over - -=item * - -L<Lucy::Plan::Schema> - Plan out your index. - -=item * - -L<Lucy::Plan::FullTextType> - Field type for full text search. - -=item * - -L<Lucy::Analysis::EasyAnalyzer> - A one-size-fits-all parser/tokenizer. - -=item * - -L<Lucy::Index::Indexer> - Manipulate index content. 
- -=item * - -L<Lucy::Search::IndexSearcher> - Search an index. - -=item * - -L<Lucy::Search::Hits> - Iterate over hits returned by a Searcher. - -=back - -=head2 Adaptations to indexer.pl - -After we load our modules... - - use Lucy::Plan::Schema; - use Lucy::Plan::FullTextType; - use Lucy::Analysis::EasyAnalyzer; - use Lucy::Index::Indexer; - -... the first item we're going to need is a L<Schema|Lucy::Plan::Schema>. - -The primary job of a Schema is to specify what fields are available and how -they're defined. We'll start off with three fields: title, content and url. - - # Create Schema. - my $schema = Lucy::Plan::Schema->new; - my $easyanalyzer = Lucy::Analysis::EasyAnalyzer->new( - language => 'en', - ); - my $type = Lucy::Plan::FullTextType->new( - analyzer => $easyanalyzer, - ); - $schema->spec_field( name => 'title', type => $type ); - $schema->spec_field( name => 'content', type => $type ); - $schema->spec_field( name => 'url', type => $type ); - -All of the fields are spec'd out using the "FullTextType" FieldType, -indicating that they will be searchable as "full text" -- which means that -they can be searched for individual words. The "analyzer", which is unique to -FullTextType fields, is what breaks up the text into searchable tokens. - -Next, we'll swap our Lucy::Simple object out for a Lucy::Index::Indexer. -The substitution will be straightforward because Simple has merely been -serving as a thin wrapper around an inner Indexer, and we'll just be peeling -away the wrapper. - -First, replace the constructor: - - # Create Indexer. 
- my $indexer = Lucy::Index::Indexer->new( - index => $path_to_index, - schema => $schema, - create => 1, - truncate => 1, - ); - -Next, have the C<$indexer> object C<add_doc> where we were having the -C<$lucy> object C<add_doc> before: - - foreach my $filename (@filenames) { - my $doc = parse_file($filename); - $indexer->add_doc($doc); - } - -There's only one extra step required: at the end of the app, you must call -commit() explicitly to close the indexing session and commit your changes. -(Lucy::Simple hides this detail, calling commit() implicitly when it needs to). - - $indexer->commit; - -=head2 Adaptations to search.cgi - -In our search app as in our indexing app, Lucy::Simple has served as a -thin wrapper -- this time around L<Lucy::Search::IndexSearcher> and -L<Lucy::Search::Hits>. Swapping out Simple for these two classes is -also straightforward: - - use Lucy::Search::IndexSearcher; - - my $searcher = Lucy::Search::IndexSearcher->new( - index => $path_to_index, - ); - my $hits = $searcher->hits( # returns a Hits object, not a hit count - query => $q, - offset => $offset, - num_wanted => $page_size, - ); - my $hit_count = $hits->total_hits; # get the hit count here - - ... - - while ( my $hit = $hits->next ) { - ... - } - -=head2 Hooray! - -Congratulations! Your apps do the same thing as before... but now they'll be -easier to customize. - -In our next chapter, L<Lucy::Docs::Tutorial::FieldType>, we'll explore -how to assign different behaviors to different fields. 
- - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Tutorial/FieldType.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Tutorial/FieldType.pod b/perl/lib/Lucy/Docs/Tutorial/FieldType.pod deleted file mode 100644 index 05d0e82..0000000 --- a/perl/lib/Lucy/Docs/Tutorial/FieldType.pod +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Tutorial::FieldType - Specify per-field properties and -behaviors. - -=head1 DESCRIPTION - -The Schema we used in the last chapter specifies three fields: - - my $type = Lucy::Plan::FullTextType->new( - analyzer => $polyanalyzer, - ); - $schema->spec_field( name => 'title', type => $type ); - $schema->spec_field( name => 'content', type => $type ); - $schema->spec_field( name => 'url', type => $type ); - -Since they are all defined as "full text" fields, they are all searchable -- -including the C<url> field, a dubious choice. Some URLs contain meaningful -information, but these don't, really: - - http://example.com/us_constitution/amend1.txt - -We may as well not bother indexing the URL content. 
To achieve that we need -to assign the C<url> field to a different FieldType. - -=head2 StringType - -Instead of FullTextType, we'll use a -L<StringType|Lucy::Plan::StringType>, which doesn't use an -Analyzer to break up text into individual fields. Furthermore, we'll mark -this StringType as unindexed, so that its content won't be searchable at all. - - my $url_type = Lucy::Plan::StringType->new( indexed => 0 ); - $schema->spec_field( name => 'url', type => $url_type ); - -To observe the change in behavior, try searching for C<us_constitution> both -before and after changing the Schema and re-indexing. - -=head2 Toggling 'stored' - -For a taste of other FieldType possibilities, try turning off C<stored> for -one or more fields. - - my $content_type = Lucy::Plan::FullTextType->new( - analyzer => $polyanalyzer, - stored => 0, - ); - -Turning off C<stored> for either C<title> or C<url> mangles our results page, -but since we're not displaying C<content>, turning it off for C<content> has -no effect -- except on index size. - -=head2 Analyzers up next - -Analyzers play a crucial role in the behavior of FullTextType fields. In our -next tutorial chapter, L<Lucy::Docs::Tutorial::Analysis>, we'll see how -changing up the Analyzer changes search results. - - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Tutorial/Highlighter.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Tutorial/Highlighter.pod b/perl/lib/Lucy/Docs/Tutorial/Highlighter.pod deleted file mode 100644 index 9b6879c..0000000 --- a/perl/lib/Lucy/Docs/Tutorial/Highlighter.pod +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Tutorial::Highlighter - Augment search results with -highlighted excerpts. - -=head1 DESCRIPTION - -Adding relevant excerpts with highlighted search terms to your search results -display makes it much easier for end users to scan the page and assess which -hits look promising, dramatically improving their search experience. - -=head2 Adaptations to indexer.pl - -L<Lucy::Highlight::Highlighter> uses information generated at index -time. To save resources, highlighting is disabled by default and must be -turned on for individual fields. - - my $highlightable = Lucy::Plan::FullTextType->new( - analyzer => $polyanalyzer, - highlightable => 1, - ); - $schema->spec_field( name => 'content', type => $highlightable ); - -=head2 Adaptations to search.cgi - -To add highlighting and excerpting to the search.cgi sample app, create a -C<$highlighter> object outside the hits iterating loop... - - my $highlighter = Lucy::Highlight::Highlighter->new( - searcher => $searcher, - query => $q, - field => 'content' - ); - -... then modify the loop and the per-hit display to generate and include the -excerpt. - - # Create result list. 
- my $report = ''; - while ( my $hit = $hits->next ) { - my $score = sprintf( "%0.3f", $hit->get_score ); - my $excerpt = $highlighter->create_excerpt($hit); - $report .= qq| - <p> - <a href="$hit->{url}"><strong>$hit->{title}</strong></a> - <em>$score</em> - <br /> - $excerpt - <br /> - <span class="excerptURL">$hit->{url}</span> - </p> - |; - } - -=head2 Next chapter: Query objects - -Our next tutorial chapter, L<Lucy::Docs::Tutorial::QueryObjects>, -illustrates how to build an "advanced search" interface using -L<Query|Lucy::Search::Query> objects instead of query strings. - - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Tutorial/QueryObjects.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Tutorial/QueryObjects.pod b/perl/lib/Lucy/Docs/Tutorial/QueryObjects.pod deleted file mode 100644 index 6ff812a..0000000 --- a/perl/lib/Lucy/Docs/Tutorial/QueryObjects.pod +++ /dev/null @@ -1,198 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Tutorial::QueryObjects - Use Query objects instead of query -strings. - -=head1 DESCRIPTION - -Until now, our search app has had only a single search box. 
In this tutorial -chapter, we'll move towards an "advanced search" interface, by adding a -"category" drop-down menu. Three new classes will be required: - -=over - -=item * - -L<QueryParser|Lucy::Search::QueryParser> - Turn a query string into a -L<Query|Lucy::Search::Query> object. - -=item * - -L<TermQuery|Lucy::Search::TermQuery> - Query for a specific term within -a specific field. - -=item * - -L<ANDQuery|Lucy::Search::ANDQuery> - "AND" together multiple Query -objects to produce an intersected result set. - -=back - -=head2 Adaptations to indexer.pl - -Our new "category" field will be a StringType field rather than a FullTextType -field, because we will only be looking for exact matches. It needs to be -indexed, but since we won't display its value, it doesn't need to be stored. - - my $cat_type = Lucy::Plan::StringType->new( stored => 0 ); - $schema->spec_field( name => 'category', type => $cat_type ); - -There will be three possible values: "article", "amendment", and "preamble", -which we'll hack out of the source file's name during our C<parse_file> -subroutine: - - my $category - = $filename =~ /art/ ? 'article' - : $filename =~ /amend/ ? 'amendment' - : $filename =~ /preamble/ ? 'preamble' - : die "Can't derive category for $filename"; - return { - title => $title, - content => $bodytext, - url => "/us_constitution/$filename", - category => $category, - }; - -=head2 Adaptations to search.cgi - -The "category" constraint will be added to our search interface using an HTML -"select" element (this routine will need to be integrated into the HTML -generation section of search.cgi): - - # Build up the HTML "select" object for the "category" field. 
- sub generate_category_select { - my $cat = shift; - my $select = qq| - <select name="category"> - <option value="">All Sections</option> - <option value="article">Articles</option> - <option value="amendment">Amendments</option> - </select>|; - if ($cat) { - $select =~ s/"$cat"/"$cat" selected/; - } - return $select; - } - -We'll start off by loading our new modules and extracting our new CGI -parameter. - - use Lucy::Search::QueryParser; - use Lucy::Search::TermQuery; - use Lucy::Search::ANDQuery; - - ... - - my $category = decode( "UTF-8", $cgi->param('category') || '' ); - -QueryParser's constructor requires a "schema" argument. We can get that from -our IndexSearcher: - - # Create an IndexSearcher and a QueryParser. - my $searcher = Lucy::Search::IndexSearcher->new( - index => $path_to_index, - ); - my $qparser = Lucy::Search::QueryParser->new( - schema => $searcher->get_schema, - ); - -Previously, we have been handing raw query strings to IndexSearcher. Behind -the scenes, IndexSearcher has been using a QueryParser to turn those query -strings into Query objects. Now, we will bring QueryParser into the -foreground and parse the strings explicitly. - - my $query = $qparser->parse($q); - -If the user has specified a category, we'll use an ANDQuery to join our parsed -query together with a TermQuery representing the category. - - if ($category) { - my $category_query = Lucy::Search::TermQuery->new( - field => 'category', - term => $category, - ); - $query = Lucy::Search::ANDQuery->new( - children => [ $query, $category_query ] - ); - } - -Now when we execute the query... - - # Execute the Query and get a Hits object. - my $hits = $searcher->hits( - query => $query, - offset => $offset, - num_wanted => $page_size, - ); - -... we'll get a result set which is the intersection of the parsed query and -the category query. - -=head1 Using TermQuery with full text fields - -When querying full text fields, the easiest way is to create query objects -using QueryParser. 
But sometimes you want to create TermQuery for a single -term in a FullTextType field directly. In this case, we have to run the -search term through the field's analyzer to make sure it gets normalized in -the same way as the field's content. - - sub make_term_query { - my ($field, $term) = @_; - - my $token; - my $type = $schema->fetch_type($field); - - if ( $type->isa('Lucy::Plan::FullTextType') ) { - # Run the term through the full text analysis chain. - my $analyzer = $type->get_analyzer; - my $tokens = $analyzer->split($term); - - if ( @$tokens != 1 ) { - # If the term expands to more than one token, or no - # tokens at all, it will never match a token in the - # full text field. - return Lucy::Search::NoMatchQuery->new; - } - - $token = $tokens->[0]; - } - else { - # Exact match for other types. - $token = $term; - } - - return Lucy::Search::TermQuery->new( - field => $field, - term => $token, - ); - } - -=head1 Congratulations! - -You've made it to the end of the tutorial. - -=head1 SEE ALSO - -For additional thematic documentation, see the Apache Lucy -L<Cookbook|Lucy::Docs::Cookbook>. - -ANDQuery has a companion class, L<ORQuery|Lucy::Search::ORQuery>, and a -close relative, -L<RequiredOptionalQuery|Lucy::Search::RequiredOptionalQuery>. - - http://git-wip-us.apache.org/repos/asf/lucy/blob/5618020f/perl/lib/Lucy/Docs/Tutorial/Simple.pod ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Docs/Tutorial/Simple.pod b/perl/lib/Lucy/Docs/Tutorial/Simple.pod deleted file mode 100644 index b40d7a1..0000000 --- a/perl/lib/Lucy/Docs/Tutorial/Simple.pod +++ /dev/null @@ -1,298 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -=head1 NAME - -Lucy::Docs::Tutorial::Simple - Bare-bones search app. - -=head2 Setup - -Copy the text presentation of the US Constitution from the C<sample> directory -of the Apache Lucy distribution to the base level of your web server's -C<htdocs> directory. - - $ cp -R sample/us_constitution /usr/local/apache2/htdocs/ - -=head2 Indexing: indexer.pl - -Our first task will be to create an application called C<indexer.pl> which -builds a searchable "inverted index" from a collection of documents. - -After we specify some configuration variables and load all necessary -modules... - - #!/usr/local/bin/perl - use strict; - use warnings; - - # (Change configuration variables as needed.) - my $path_to_index = '/path/to/index'; - my $uscon_source = '/usr/local/apache2/htdocs/us_constitution'; - - use Lucy::Simple; - use File::Spec::Functions qw( catfile ); - -... we'll start by creating a Lucy::Simple object, telling it where we'd -like the index to be located and the language of the source material. - - my $lucy = Lucy::Simple->new( - path => $path_to_index, - language => 'en', - ); - -Next, we'll add a subroutine which parses our sample documents. - - # Parse a file from our US Constitution collection and return a hashref with - # the fields title, body, and url. 
- sub parse_file { - my $filename = shift; - my $filepath = catfile( $uscon_source, $filename ); - open( my $fh, '<', $filepath ) or die "Can't open '$filepath': $!"; - my $text = do { local $/; <$fh> }; # slurp file content - $text =~ /\A(.+?)^\s+(.*)/ms - or die "Can't extract title/bodytext from '$filepath'"; - my $title = $1; - my $bodytext = $2; - return { - title => $title, - content => $bodytext, - url => "/us_constitution/$filename", - }; - } - -Add some elementary directory reading code... - - # Collect names of source files. - opendir( my $dh, $uscon_source ) - or die "Couldn't opendir '$uscon_source': $!"; - my @filenames = grep { $_ =~ /\.txt/ } readdir $dh; - -... and now we're ready for the meat of indexer.pl -- which occupies exactly -one line of code. - - foreach my $filename (@filenames) { - my $doc = parse_file($filename); - $lucy->add_doc($doc); # ta-da! - } - -=head2 Search: search.cgi - -As with our indexing app, the bulk of the code in our search script won't be -Lucy-specific. - -The beginning is dedicated to CGI processing and configuration. - - #!/usr/local/bin/perl -T - use strict; - use warnings; - - # (Change configuration variables as needed.) - my $path_to_index = '/path/to/index'; - - use CGI; - use List::Util qw( max min ); - use POSIX qw( ceil ); - use Encode qw( decode ); - use Lucy::Simple; - - my $cgi = CGI->new; - my $q = decode( "UTF-8", $cgi->param('q') || '' ); - my $offset = decode( "UTF-8", $cgi->param('offset') || 0 ); - my $page_size = 10; - -Once that's out of the way, we create our Lucy::Simple object and feed -it a query string. - - my $lucy = Lucy::Simple->new( - path => $path_to_index, - language => 'en', - ); - my $hit_count = $lucy->search( - query => $q, - offset => $offset, - num_wanted => $page_size, - ); - -The value returned by search() is the total number of documents in the -collection which matched the query. 
We'll show this hit count to the user, -and also use it in conjunction with the parameters C<offset> and C<num_wanted> -to break up results into "pages" of manageable size. - -Calling search() on our Simple object turns it into an iterator. Invoking -next() now returns hits one at a time as L<Lucy::Document::HitDoc> -objects, starting with the most relevant. - - # Create result list. - my $report = ''; - while ( my $hit = $lucy->next ) { - my $score = sprintf( "%0.3f", $hit->get_score ); - $report .= qq| - <p> - <a href="$hit->{url}"><strong>$hit->{title}</strong></a> - <em>$score</em> - <br> - <span class="excerptURL">$hit->{url}</span> - </p> - |; - } - -The rest of the script is just text wrangling. - - #---------------------------------------------------------------# - # No tutorial material below this point - just html generation. # - #---------------------------------------------------------------# - - # Generate paging links and hit count, print and exit. - my $paging_links = generate_paging_info( $q, $hit_count ); - blast_out_content( $q, $report, $paging_links ); - - # Create html fragment with links for paging through results n-at-a-time. - sub generate_paging_info { - my ( $query_string, $total_hits ) = @_; - my $escaped_q = CGI::escapeHTML($query_string); - my $paging_info; - if ( !length $query_string ) { - # No query? No display. - $paging_info = ''; - } - elsif ( $total_hits == 0 ) { - # Alert the user that their search failed. - $paging_info - = qq|<p>No matches for <strong>$escaped_q</strong></p>|; - } - else { - # Calculate the nums for the first and last hit to display. - my $last_result = min( ( $offset + $page_size ), $total_hits ); - my $first_result = min( ( $offset + 1 ), $last_result ); - - # Display the result nums, start paging info. - $paging_info = qq| - <p> - Results <strong>$first_result-$last_result</strong> - of <strong>$total_hits</strong> - for <strong>$escaped_q</strong>. 
- </p> - <p> - Results Page: - |; - - # Calculate first and last hits pages to display / link to. - my $current_page = int( $first_result / $page_size ) + 1; - my $last_page = ceil( $total_hits / $page_size ); - my $first_page = max( 1, ( $current_page - 9 ) ); - $last_page = min( $last_page, ( $current_page + 10 ) ); - - # Create a url for use in paging links. - my $href = $cgi->url( -relative => 1 ); - $href .= "?q=" . CGI::escape($query_string); - $href .= ";offset=" . CGI::escape($offset); - - # Generate the "Prev" link. - if ( $current_page > 1 ) { - my $new_offset = ( $current_page - 2 ) * $page_size; - $href =~ s/(?<=offset=)\d+/$new_offset/; - $paging_info .= qq|<a href="$href"><= Prev</a>\n|; - } - - # Generate paging links. - for my $page_num ( $first_page .. $last_page ) { - if ( $page_num == $current_page ) { - $paging_info .= qq|$page_num \n|; - } - else { - my $new_offset = ( $page_num - 1 ) * $page_size; - $href =~ s/(?<=offset=)\d+/$new_offset/; - $paging_info .= qq|<a href="$href">$page_num</a>\n|; - } - } - - # Generate the "Next" link. - if ( $current_page != $last_page ) { - my $new_offset = $current_page * $page_size; - $href =~ s/(?<=offset=)\d+/$new_offset/; - $paging_info .= qq|<a href="$href">Next =></a>\n|; - } - - # Close tag. - $paging_info .= "</p>\n"; - } - - return $paging_info; - } - - # Print content to output. 
- sub blast_out_content { - my ( $query_string, $hit_list, $paging_info ) = @_; - my $escaped_q = CGI::escapeHTML($query_string); - binmode( STDOUT, ":encoding(UTF-8)" ); - print qq|Content-type: text/html; charset=UTF-8\n\n|; - print qq| - <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" - "http://www.w3.org/TR/html4/loose.dtd"> - <html> - <head> - <meta http-equiv="Content-type" - content="text/html;charset=UTF-8"> - <link rel="stylesheet" type="text/css" - href="/us_constitution/uscon.css"> - <title>Lucy: $escaped_q</title> - </head> - - <body> - - <div id="navigation"> - <form id="usconSearch" action=""> - <strong> - Search the - <a href="/us_constitution/index.html">US Constitution</a>: - </strong> - <input type="text" name="q" id="q" value="$escaped_q"> - <input type="submit" value="=>"> - </form> - </div><!--navigation--> - - <div id="bodytext"> - - $hit_list - - $paging_info - - <p style="font-size: smaller; color: #666"> - <em> - Powered by <a href="http://lucy.apache.org/" - >Apache Lucy<small><sup>TM</sup></small></a> - </em> - </p> - </div><!--bodytext--> - - </body> - - </html> - |; - } - -=head2 OK... now what? - -Lucy::Simple is perfectly adequate for some tasks, but it's not very flexible. -Many people find that it doesn't do at least one or two things they can't live -without. - -In our next tutorial chapter, -L<BeyondSimple|Lucy::Docs::Tutorial::BeyondSimple>, we'll rewrite our -indexing and search scripts using the classes that Lucy::Simple hides -from view, opening up the possibilities for expansion; then, we'll spend the -rest of the tutorial chapters exploring these possibilities. - -=cut
