Port Lucy::Simple to C
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/ebde55f3 Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/ebde55f3 Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/ebde55f3 Branch: refs/heads/master Commit: ebde55f3716e3f746095ce957054b1eecf936a4d Parents: f1c3021 Author: Nick Wellnhofer <[email protected]> Authored: Sat Aug 1 03:43:00 2015 +0200 Committer: Nick Wellnhofer <[email protected]> Committed: Wed Aug 5 15:26:15 2015 +0200 ---------------------------------------------------------------------- core/Lucy/Simple.c | 190 +++++++++++++++++++ core/Lucy/Simple.cfh | 99 ++++++++++ perl/buildlib/Lucy/Build/Binding/Misc.pm | 107 +++++++++++ perl/lib/Lucy/Simple.pm | 261 -------------------------- 4 files changed, 396 insertions(+), 261 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/core/Lucy/Simple.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Simple.c b/core/Lucy/Simple.c new file mode 100644 index 0000000..2271984 --- /dev/null +++ b/core/Lucy/Simple.c @@ -0,0 +1,190 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define CFISH_USE_SHORT_NAMES +#define LUCY_USE_SHORT_NAMES + +#define C_LUCY_SIMPLE +#include "Lucy/Simple.h" + +#include "Clownfish/Err.h" +#include "Clownfish/Hash.h" +#include "Clownfish/HashIterator.h" +#include "Clownfish/String.h" +#include "Clownfish/Vector.h" +#include "Lucy/Analysis/EasyAnalyzer.h" +#include "Lucy/Document/Doc.h" +#include "Lucy/Document/HitDoc.h" +#include "Lucy/Index/Indexer.h" +#include "Lucy/Index/PolyReader.h" +#include "Lucy/Plan/FullTextType.h" +#include "Lucy/Plan/Schema.h" +#include "Lucy/Search/Hits.h" +#include "Lucy/Search/IndexSearcher.h" + +Simple* +Simple_new(Obj *index, String *language) { + Simple *self = (Simple*)Class_Make_Obj(SIMPLE); + return Simple_init(self, index, language); +} + +Simple* +Simple_init(Simple *self, Obj *index, String *language) { + SimpleIVARS *const ivars = Simple_IVARS(self); + ivars->index = INCREF(index); + ivars->language = Str_Clone(language); + return self; +} + +void +Simple_Destroy_IMP(Simple *self) { + SimpleIVARS *const ivars = Simple_IVARS(self); + + Simple_Finish_Indexing(self); + + DECREF(ivars->index); + DECREF(ivars->language); + DECREF(ivars->schema); + DECREF(ivars->type); + DECREF(ivars->indexer); + DECREF(ivars->searcher); + DECREF(ivars->hits); + + SUPER_DESTROY(self, SIMPLE); +} + +static void +S_create_indexer(Simple *self) { + SimpleIVARS *const ivars = Simple_IVARS(self); + + // Trigger searcher refresh. + DECREF(ivars->searcher); + DECREF(ivars->hits); + ivars->searcher = NULL; + ivars->hits = NULL; + + // Get type and schema + Schema *schema = NULL; + FieldType *type = NULL; + PolyReader *reader = PolyReader_open(ivars->index, NULL, NULL); + Vector *seg_readers = PolyReader_Get_Seg_Readers(reader); + + if (Vec_Get_Size(seg_readers) == 0) { + // Index is empty, create new schema and type. + schema = Schema_new(); + EasyAnalyzer *analyzer = EasyAnalyzer_new(ivars->language); + type = (FieldType*)FullTextType_new((Analyzer*)analyzer); + DECREF(analyzer); + } + else { + // Get schema from reader. + schema = (Schema*)INCREF(PolyReader_Get_Schema(reader)); + Vector *fields = Schema_All_Fields(schema); + String *field = (String*)CERTIFY(Vec_Fetch(fields, 0), STRING); + type = (FieldType*)INCREF(Schema_Fetch_Type(schema, field)); + DECREF(fields); + } + + ivars->indexer = Indexer_new(schema, ivars->index, NULL, 0); + ivars->schema = schema; + ivars->type = type; + + DECREF(reader); +} + +void +Simple_Add_Doc_IMP(Simple *self, Doc *doc) { + SimpleIVARS *const ivars = Simple_IVARS(self); + + if (!ivars->indexer) { + S_create_indexer(self); + } + + Vector *field_names = Doc_Field_Names(doc); + + for (size_t i = 0, max = Vec_Get_Size(field_names); i < max; i++) { + String *field = (String*)Vec_Fetch(field_names, i); + Schema_Spec_Field(ivars->schema, field, ivars->type); + } + + Indexer_Add_Doc(ivars->indexer, doc, 1.0); + + DECREF(field_names); +} + +uint32_t +Simple_Search_IMP(Simple *self, String *query, uint32_t offset, + uint32_t num_wanted) { + SimpleIVARS *const ivars = Simple_IVARS(self); + + // Flush recent adds; lazily create searcher. + Simple_Finish_Indexing(self); + if (!ivars->searcher) { + ivars->searcher = IxSearcher_new(ivars->index); + } + + DECREF(ivars->hits); + ivars->hits = IxSearcher_Hits(ivars->searcher, (Obj*)query, offset, + num_wanted, NULL); + + return Hits_Total_Hits(ivars->hits); +} + +HitDoc* +Simple_Next_IMP(Simple *self) { + SimpleIVARS *const ivars = Simple_IVARS(self); + + if (!ivars->hits) { return NULL; } + + // Get the hit, bail if hits are exhausted. + HitDoc *doc = Hits_Next(ivars->hits); + if (!doc) { + DECREF(ivars->hits); + ivars->hits = NULL; + } + + return doc; +} + +Indexer* +Simple_Get_Indexer_IMP(Simple *self) { + SimpleIVARS *const ivars = Simple_IVARS(self); + + if (!ivars->indexer) { + S_create_indexer(self); + } + + return ivars->indexer; +} + +void +Simple_Finish_Indexing_IMP(Simple *self) { + SimpleIVARS *const ivars = Simple_IVARS(self); + + // Don't bother to throw an error if index not modified. + if (ivars->indexer) { + Indexer_Commit(ivars->indexer); + + // Trigger searcher and indexer refresh. + DECREF(ivars->schema); + DECREF(ivars->type); + DECREF(ivars->indexer); + ivars->schema = NULL; + ivars->type = NULL; + ivars->indexer = NULL; + } +} + http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/core/Lucy/Simple.cfh ---------------------------------------------------------------------- diff --git a/core/Lucy/Simple.cfh b/core/Lucy/Simple.cfh new file mode 100644 index 0000000..3680ce5 --- /dev/null +++ b/core/Lucy/Simple.cfh @@ -0,0 +1,99 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +parcel Lucy; + +/** Basic search engine. + * + * Lucy::Simple is a stripped-down interface for the Apache Lucy search + * engine library. + */ +public class Lucy::Simple { + + Obj *index; + String *language; + Schema *schema; + FieldType *type; + Indexer *indexer; + IndexSearcher *searcher; + Hits *hits; + + /** Create a Lucy::Simple object, which can be used for both indexing and + * searching. Both parameters `path` and `language` are required. + * + * @param path Where the index directory should be located. If no index + * is found at the specified location, one will be created. + * @param language The language of the documents in your collection, + * indicated by a two-letter ISO code. 12 languages are supported: + * + * |-----------------------| + * | Language | ISO code | + * |-----------------------| + * | Danish | da | + * | Dutch | nl | + * | English | en | + * | Finnish | fi | + * | French | fr | + * | German | de | + * | Italian | it | + * | Norwegian | no | + * | Portuguese | pt | + * | Spanish | es | + * | Swedish | sv | + * | Russian | ru | + * |-----------------------| + */ + public inert Simple* + new(Obj *path, String *language); + + public inert Simple* + init(Simple *self, Obj *path, String *language); + + /** Add a document to the index. + */ + public void + Add_Doc(Simple *self, Doc *doc); + + /** Search the index. Returns the total number of documents which match + * the query. (This number is unlikely to match `num_wanted`.) + * + * @param query A search query string. + * @param offset The number of most-relevant hits to discard, typically + * used when "paging" through hits N at a time. Setting offset to 20 and + * num_wanted to 10 retrieves hits 21-30, assuming that 30 hits can be + * found. + * @param num_wanted The number of hits you would like to see after + * `offset` is taken into account. + */ + public uint32_t + Search(Simple *self, String *query, uint32_t offset = 0, + uint32_t num_wanted = 10); + + /** Return the next hit, or [](cfish:@null) when the iterator is exhausted. + */ + public incremented nullable HitDoc* + Next(Simple *self); + + Indexer* + Get_Indexer(Simple *self); + + void + Finish_Indexing(Simple *self); + + public void + Destroy(Simple *self); +} + http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/perl/buildlib/Lucy/Build/Binding/Misc.pm ---------------------------------------------------------------------- diff --git a/perl/buildlib/Lucy/Build/Binding/Misc.pm b/perl/buildlib/Lucy/Build/Binding/Misc.pm index ebd4fea..92c8b95 100644 --- a/perl/buildlib/Lucy/Build/Binding/Misc.pm +++ b/perl/buildlib/Lucy/Build/Binding/Misc.pm @@ -26,6 +26,7 @@ sub bind_all { $hierarchy->inherit_metadata; $class->bind_lucy; + $class->bind_simple; $class->bind_test; } @@ -148,6 +149,112 @@ END_XS_CODE Clownfish::CFC::Binding::Perl::Class->register($binding); } +sub bind_simple { + my @exposed = qw( + Search + Next + ); + my @hand_rolled = qw( Add_Doc ); + + my $pod_spec = Clownfish::CFC::Binding::Perl::Pod->new; + my $synopsis = <<'END_SYNOPSIS'; +First, build an index of your documents. + + my $index = Lucy::Simple->new( + path => '/path/to/index/' + language => 'en', + ); + + while ( my ( $title, $content ) = each %source_docs ) { + $index->add_doc({ + title => $title, + content => $content, + }); + } + +Later, search the index. + + my $total_hits = $index->search( + query => $query_string, + offset => 0, + num_wanted => 10, + ); + + print "Total hits: $total_hits\n"; + while ( my $hit = $index->next ) { + print "$hit->{title}\n", + } +END_SYNOPSIS + my $add_doc_pod = <<'END_ADD_DOC_POD'; +=head2 add_doc + + $lucy->add_doc({ + location => $url, + title => $title, + content => $content, + }); + +Add a document to the index. The document must be supplied as a hashref, +with field names as keys and content as values. + +END_ADD_DOC_POD + $pod_spec->set_synopsis($synopsis); + + # Override is necessary because there's no standard way to explain + # hash/hashref across multiple host languages. + $pod_spec->add_method( + method => 'Add_Doc', + alias => 'add_doc', + pod => $add_doc_pod, + ); + $pod_spec->add_method( method => $_, alias => lc($_) ) for @exposed; + + my $xs_code = <<'END_XS_CODE'; +MODULE = Lucy PACKAGE = Lucy::Simple + +void +add_doc(self, doc_sv) + lucy_Simple *self; + SV *doc_sv; +PPCODE: +{ + lucy_Doc *doc = NULL; + + // Either get a Doc or use the stock doc. + if (sv_isobject(doc_sv) + && sv_derived_from(doc_sv, "Lucy::Document::Doc") + ) { + IV tmp = SvIV(SvRV(doc_sv)); + doc = INT2PTR(lucy_Doc*, tmp); + } + else if (XSBind_sv_defined(aTHX_ doc_sv) && SvROK(doc_sv)) { + HV *maybe_fields = (HV*)SvRV(doc_sv); + if (SvTYPE((SV*)maybe_fields) == SVt_PVHV) { + lucy_Indexer *indexer = LUCY_Simple_Get_Indexer(self); + doc = LUCY_Indexer_Get_Stock_Doc(indexer); + LUCY_Doc_Set_Fields(doc, maybe_fields); + } + } + if (!doc) { + THROW(CFISH_ERR, "Need either a hashref or a %o", + CFISH_Class_Get_Name(LUCY_DOC)); + } + + LUCY_Simple_Add_Doc(self, doc); +} +END_XS_CODE + + my $binding = Clownfish::CFC::Binding::Perl::Class->new( + parcel => "Lucy", + class_name => "Lucy::Simple", + ); + $binding->exclude_method($_) for @hand_rolled; + $binding->append_xs($xs_code); + $binding->set_pod_spec($pod_spec); + + Clownfish::CFC::Binding::Perl::Class->register($binding); +} + sub bind_test { my $xs_code = <<'END_XS_CODE'; MODULE = Lucy PACKAGE = Lucy::Test http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/perl/lib/Lucy/Simple.pm ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Simple.pm b/perl/lib/Lucy/Simple.pm index e409615..e56a6ba 100644 --- a/perl/lib/Lucy/Simple.pm +++ b/perl/lib/Lucy/Simple.pm @@ -13,274 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -use strict; -use warnings; - package Lucy::Simple; use Lucy; our $VERSION = '0.004000'; $VERSION = eval $VERSION; -use Carp; -use Scalar::Util qw( weaken reftype refaddr ); - -use Lucy::Plan::Schema; -use Lucy::Analysis::EasyAnalyzer; -use Lucy::Index::Indexer; -use Lucy::Search::IndexSearcher; - -my %obj_cache; - -sub new { - my ( $either, %args ) = @_; - my $path = delete $args{path}; - my $language = lc( delete $args{language} ); - confess("Missing required parameter 'path'") unless defined $path; - confess("Invalid language: '$language'") - unless $language =~ /^(?:da|de|en|es|fi|fr|it|nl|no|pt|ru|sv)$/; - my @remaining = keys %args; - confess("Invalid params: @remaining") if @remaining; - my $self = bless { - type => undef, - schema => undef, - indexer => undef, - searcher => undef, - hits => undef, - language => $language, - path => $path, - }, - ref($either) || $either; - - # Cache the object for later clean-up. - weaken( $obj_cache{ refaddr $self } = $self ); - - return $self; -} - -sub _lazily_create_indexer { - my $self = shift; - if ( !defined $self->{indexer} ) { - # Get type and schema - my $schema; - my $reader = Lucy::Index::PolyReader->open( index => $self->{path} ); - if ( !@{ $reader->seg_readers } ) { - # index is empty, create new schema and type - $schema = Lucy::Plan::Schema->new; - my $analyzer = Lucy::Analysis::EasyAnalyzer->new( - language => $self->{language}, ); - $self->{type} - = Lucy::Plan::FullTextType->new( analyzer => $analyzer, ); - } - else { - # get schema from reader - $schema = $reader->get_schema; - my $field = $schema->all_fields->[0]; - $self->{type} = $schema->fetch_type($field); - } - $self->{schema} = $schema; - $self->{indexer} = Lucy::Index::Indexer->new( - schema => $schema, - index => $self->{path}, - ); - } -} - -sub add_doc { - my ( $self, $hashref ) = @_; - croak("add_doc requires exactly one argument: a hashref") - unless ( @_ == 2 and reftype($hashref) eq 'HASH' ); - $self->_lazily_create_indexer; - my $schema = $self->{schema}; - my $type = $self->{type}; - $schema->spec_field( name => $_, type => $type ) for keys %$hashref; - $self->{indexer}->add_doc($hashref); -} - -sub _finish_indexing { - my $self = shift; - - # Don't bother to throw an error if index not modified. - if ( defined $self->{indexer} ) { - $self->{indexer}->commit; - - # Trigger searcher and indexer refresh. - undef $self->{indexer}; - undef $self->{searcher}; - } -} - -sub search { - my ( $self, %args ) = @_; - - # Flush recent adds; lazily create searcher. - $self->_finish_indexing; - if ( !defined $self->{searcher} ) { - $self->{searcher} - = Lucy::Search::IndexSearcher->new( index => $self->{path} ); - } - - $self->{hits} = $self->{searcher}->hits(%args); - - return $self->{hits}->total_hits; -} - -sub next { - my $self = shift; - return unless defined $self->{hits}; - - # Get the hit, bail if hits are exhausted. - my $hit = $self->{hits}->next; - if ( !defined $hit ) { - undef $self->{hits}; - return; - } - - return $hit; -} - -sub DESTROY { - for (shift) { - $_->_finish_indexing; - delete $obj_cache{ refaddr $_ }; - } -} - -END { - # Finish indexing for any objects that still exist, since, if we wait - # until global destruction, our Indexer might no longer exist, - # (see bug #32689) - $_->_finish_indexing for values %obj_cache; -} 1; __END__ -__POD__ - -=head1 NAME - -Lucy::Simple - Basic search engine. - -=head1 SYNOPSIS - -First, build an index of your documents. - - my $index = Lucy::Simple->new( - path => '/path/to/index/' - language => 'en', - ); - - while ( my ( $title, $content ) = each %source_docs ) { - $index->add_doc({ - title => $title, - content => $content, - }); - } - -Later, search the index. - - my $total_hits = $index->search( - query => $query_string, - offset => 0, - num_wanted => 10, - ); - - print "Total hits: $total_hits\n"; - while ( my $hit = $index->next ) { - print "$hit->{title}\n", - } - -=head1 DESCRIPTION - -Lucy::Simple is a stripped-down interface for the L<Apache Lucy|Lucy> search -engine library. - -=head1 METHODS - -=head2 new - - my $lucy = Lucy::Simple->new( - path => '/path/to/index/', - language => 'en', - ); - -Create a Lucy::Simple object, which can be used for both indexing and -searching. Two hash-style parameters are required. - -=over - -=item * - -B<path> - Where the index directory should be located. If no index is found -at the specified location, one will be created. - -=item * - -B<language> - The language of the documents in your collection, indicated -by a two-letter ISO code. 12 languages are supported: - - |-----------------------| - | Language | ISO code | - |-----------------------| - | Danish | da | - | Dutch | nl | - | English | en | - | Finnish | fi | - | French | fr | - | German | de | - | Italian | it | - | Norwegian | no | - | Portuguese | pt | - | Spanish | es | - | Swedish | sv | - | Russian | ru | - |-----------------------| - -=back - -=head2 add_doc - - $lucy->add_doc({ - location => $url, - title => $title, - content => $content, - }); - -Add a document to the index. The document must be supplied as a hashref, with -field names as keys and content as values. - -=head2 search - - my $total_hits = $lucy->search( - query => $query_string, # required - offset => 40, # default 0 - num_wanted => 20, # default 10 - ); - -Search the index. Returns the total number of documents which match the -query. (This number is unlikely to match C<num_wanted>.) - -=over - -=item * - -B<query> - A search query string. - -=item * - -B<offset> - The number of most-relevant hits to discard, typically used when -"paging" through hits N at a time. Setting offset to 20 and num_wanted to 10 -retrieves hits 21-30, assuming that 30 hits can be found. - -=item * - -B<num_wanted> - The number of hits you would like to see after C<offset> is -taken into account. - -=back - -=head1 BUGS - -Not thread-safe. -=cut
