[4/5] lucy git commit: Port Lucy::Simple to C

nwellnhof Thu, 06 Aug 2015 09:22:57 -0700

Port Lucy::Simple to C


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/ebde55f3
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/ebde55f3
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/ebde55f3

Branch: refs/heads/master
Commit: ebde55f3716e3f746095ce957054b1eecf936a4d
Parents: f1c3021
Author: Nick Wellnhofer <[email protected]>
Authored: Sat Aug 1 03:43:00 2015 +0200
Committer: Nick Wellnhofer <[email protected]>
Committed: Wed Aug 5 15:26:15 2015 +0200

----------------------------------------------------------------------
 core/Lucy/Simple.c                       | 190 +++++++++++++++++++
 core/Lucy/Simple.cfh                     |  99 ++++++++++
 perl/buildlib/Lucy/Build/Binding/Misc.pm | 107 +++++++++++
 perl/lib/Lucy/Simple.pm                  | 261 --------------------------
 4 files changed, 396 insertions(+), 261 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/core/Lucy/Simple.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Simple.c b/core/Lucy/Simple.c
new file mode 100644
index 0000000..2271984
--- /dev/null
+++ b/core/Lucy/Simple.c
@@ -0,0 +1,190 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define CFISH_USE_SHORT_NAMES
+#define LUCY_USE_SHORT_NAMES
+
+#define C_LUCY_SIMPLE
+#include "Lucy/Simple.h"
+
+#include "Clownfish/Err.h"
+#include "Clownfish/Hash.h"
+#include "Clownfish/HashIterator.h"
+#include "Clownfish/String.h"
+#include "Clownfish/Vector.h"
+#include "Lucy/Analysis/EasyAnalyzer.h"
+#include "Lucy/Document/Doc.h"
+#include "Lucy/Document/HitDoc.h"
+#include "Lucy/Index/Indexer.h"
+#include "Lucy/Index/PolyReader.h"
+#include "Lucy/Plan/FullTextType.h"
+#include "Lucy/Plan/Schema.h"
+#include "Lucy/Search/Hits.h"
+#include "Lucy/Search/IndexSearcher.h"
+
+Simple*
+Simple_new(Obj *index, String *language) {
+    // Allocate a blank instance, then let the initializer fill it in.
+    Simple *blank = (Simple*)Class_Make_Obj(SIMPLE);
+    Simple *self  = Simple_init(blank, index, language);
+    return self;
+}
+
+Simple*
+Simple_init(Simple *self, Obj *index, String *language) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // Take a reference to the index object and store a clone of the
+    // language string.
+    iv->index = INCREF(index);
+    iv->language = Str_Clone(language);
+
+    return self;
+}
+
+void
+Simple_Destroy_IMP(Simple *self) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // Commit whatever is still pending before the object goes away.
+    Simple_Finish_Indexing(self);
+
+    // Release every reference held in the instance variables.
+    DECREF(iv->index);
+    DECREF(iv->language);
+    DECREF(iv->schema);
+    DECREF(iv->type);
+    DECREF(iv->indexer);
+    DECREF(iv->searcher);
+    DECREF(iv->hits);
+
+    // Hand over to the parent class destructor.
+    SUPER_DESTROY(self, SIMPLE);
+}
+
+// Create the indexer together with the schema and field type it works
+// with, and store all three in the instance variables.
+static void
+S_create_indexer(Simple *self) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // Throw away the cached searcher and hits so that the next search
+    // has to open a fresh searcher.
+    DECREF(iv->searcher);
+    DECREF(iv->hits);
+    iv->searcher = NULL;
+    iv->hits     = NULL;
+
+    // Open a reader to find out whether the index already has segments.
+    PolyReader *reader = PolyReader_open(iv->index, NULL, NULL);
+    Schema     *schema = NULL;
+    FieldType  *type   = NULL;
+
+    if (Vec_Get_Size(PolyReader_Get_Seg_Readers(reader)) != 0) {
+        // Existing index: reuse its schema and the type of its first field.
+        schema = (Schema*)INCREF(PolyReader_Get_Schema(reader));
+        Vector *all_fields  = Schema_All_Fields(schema);
+        String *first_field
+            = (String*)CERTIFY(Vec_Fetch(all_fields, 0), STRING);
+        type = (FieldType*)INCREF(Schema_Fetch_Type(schema, first_field));
+        DECREF(all_fields);
+    }
+    else {
+        // Empty index: start a new schema and build a full text type
+        // around an EasyAnalyzer for the configured language.
+        schema = Schema_new();
+        EasyAnalyzer *analyzer = EasyAnalyzer_new(iv->language);
+        type = (FieldType*)FullTextType_new((Analyzer*)analyzer);
+        DECREF(analyzer);
+    }
+
+    // The object keeps the references to schema and type obtained above.
+    iv->indexer = Indexer_new(schema, iv->index, NULL, 0);
+    iv->schema  = schema;
+    iv->type    = type;
+
+    DECREF(reader);
+}
+
+void
+Simple_Add_Doc_IMP(Simple *self, Doc *doc) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // The indexer, schema and type are created on first use.
+    if (iv->indexer == NULL) {
+        S_create_indexer(self);
+    }
+
+    // Register every field of the document with the schema, always using
+    // the one shared field type.
+    Vector *names     = Doc_Field_Names(doc);
+    size_t  num_names = Vec_Get_Size(names);
+    size_t  tick      = 0;
+    while (tick < num_names) {
+        String *name = (String*)Vec_Fetch(names, tick);
+        Schema_Spec_Field(iv->schema, name, iv->type);
+        tick++;
+    }
+
+    Indexer_Add_Doc(iv->indexer, doc, 1.0);
+
+    DECREF(names);
+}
+
+uint32_t
+Simple_Search_IMP(Simple *self, String *query, uint32_t offset,
+                  uint32_t num_wanted) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // Commit pending additions first.
+    Simple_Finish_Indexing(self);
+
+    // Open a searcher only if none is cached.
+    if (iv->searcher == NULL) {
+        iv->searcher = IxSearcher_new(iv->index);
+    }
+
+    // Replace the hits of any previous search with the new result set.
+    DECREF(iv->hits);
+    Hits *fresh_hits = IxSearcher_Hits(iv->searcher, (Obj*)query, offset,
+                                       num_wanted, NULL);
+    iv->hits = fresh_hits;
+
+    return Hits_Total_Hits(fresh_hits);
+}
+
+HitDoc*
+Simple_Next_IMP(Simple *self) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // Without a current result set there is nothing to iterate over.
+    if (iv->hits == NULL) {
+        return NULL;
+    }
+
+    HitDoc *hit = Hits_Next(iv->hits);
+    if (hit != NULL) {
+        return hit;
+    }
+
+    // Hits are exhausted: drop the result set and report the end.
+    DECREF(iv->hits);
+    iv->hits = NULL;
+    return NULL;
+}
+
+Indexer*
+Simple_Get_Indexer_IMP(Simple *self) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // Create the indexer on demand; the returned pointer is the one
+    // stored in the object.
+    if (iv->indexer == NULL) {
+        S_create_indexer(self);
+    }
+
+    return iv->indexer;
+}
+
+void
+Simple_Finish_Indexing_IMP(Simple *self) {
+    SimpleIVARS *const iv = Simple_IVARS(self);
+
+    // No indexer means the index was not modified; return quietly
+    // instead of raising an error.
+    if (iv->indexer == NULL) {
+        return;
+    }
+
+    Indexer_Commit(iv->indexer);
+
+    // Drop schema, type and indexer so that a later call needing an
+    // indexer has to create them anew.
+    DECREF(iv->schema);
+    DECREF(iv->type);
+    DECREF(iv->indexer);
+    iv->schema  = NULL;
+    iv->type    = NULL;
+    iv->indexer = NULL;
+}
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/core/Lucy/Simple.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Simple.cfh b/core/Lucy/Simple.cfh
new file mode 100644
index 0000000..3680ce5
--- /dev/null
+++ b/core/Lucy/Simple.cfh
@@ -0,0 +1,99 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+/** Basic search engine.
+ *
+ * Lucy::Simple is a stripped-down interface for the Apache Lucy search
+ * engine library.
+ */
+public class Lucy::Simple {
+
+    Obj           *index;
+    String        *language;
+    Schema        *schema;
+    FieldType     *type;
+    Indexer       *indexer;
+    IndexSearcher *searcher;
+    Hits          *hits;
+
+    /** Create a Lucy::Simple object, which can be used for both indexing and
+     * searching.  Both parameters `path` and `language` are required.
+     *
+     * @param path Where the index directory should be located.  If no index
+     * is found at the specified location, one will be created.
+     * @param language The language of the documents in your collection,
+     * indicated by a two-letter ISO code.  12 languages are supported:
+     *
+     *     |-----------------------|
+     *     | Language   | ISO code |
+     *     |-----------------------|
+     *     | Danish     | da       |
+     *     | Dutch      | nl       |
+     *     | English    | en       |
+     *     | Finnish    | fi       |
+     *     | French     | fr       |
+     *     | German     | de       |
+     *     | Italian    | it       |
+     *     | Norwegian  | no       |
+     *     | Portuguese | pt       |
+     *     | Spanish    | es       |
+     *     | Swedish    | sv       |
+     *     | Russian    | ru       |
+     *     |-----------------------|
+     */
+    public inert Simple*
+    new(Obj *path, String *language);
+
+    /** Initialize a Lucy::Simple object.  Takes the same `path` and
+     * `language` arguments as new().
+     */
+    public inert Simple*
+    init(Simple *self, Obj *path, String *language);
+
+    /** Add a document to the index.
+     */
+    public void
+    Add_Doc(Simple *self, Doc *doc);
+
+    /** Search the index.  Returns the total number of documents which match
+     * the query.  (This number is unlikely to match `num_wanted`.)
+     *
+     * @param query A search query string.
+     * @param offset The number of most-relevant hits to discard, typically
+     * used when "paging" through hits N at a time.  Setting offset to 20 and
+     * num_wanted to 10 retrieves hits 21-30, assuming that 30 hits can be
+     * found.
+     * @param num_wanted The number of hits you would like to see after
+     * `offset` is taken into account.
+     */
+    public uint32_t
+    Search(Simple *self, String *query, uint32_t offset = 0,
+           uint32_t num_wanted = 10);
+
+    /** Return the next hit, or [](cfish:@null) when the iterator is exhausted.
+     */
+    public incremented nullable HitDoc*
+    Next(Simple *self);
+
+    /** Return the Indexer used for adding documents, creating it first if
+     * it does not exist yet.
+     */
+    Indexer*
+    Get_Indexer(Simple *self);
+
+    /** Commit pending changes and release the Indexer.  Does nothing if no
+     * Indexer has been created.
+     */
+    void
+    Finish_Indexing(Simple *self);
+
+    /** Finish indexing, then release all objects held by this instance.
+     */
+    public void
+    Destroy(Simple *self);
+}
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/perl/buildlib/Lucy/Build/Binding/Misc.pm
----------------------------------------------------------------------
diff --git a/perl/buildlib/Lucy/Build/Binding/Misc.pm b/perl/buildlib/Lucy/Build/Binding/Misc.pm
index ebd4fea..92c8b95 100644
--- a/perl/buildlib/Lucy/Build/Binding/Misc.pm
+++ b/perl/buildlib/Lucy/Build/Binding/Misc.pm
@@ -26,6 +26,7 @@ sub bind_all {
     $hierarchy->inherit_metadata;
 
     $class->bind_lucy;
+    $class->bind_simple;
     $class->bind_test;
 }
 
@@ -148,6 +149,112 @@ END_XS_CODE
     Clownfish::CFC::Binding::Perl::Class->register($binding);
 }
 
+# Register the Perl binding for Lucy::Simple: POD for the documented
+# methods plus a hand-written XS wrapper for add_doc.
+sub bind_simple {
+    # Methods documented in the POD under their lowercased names.
+    my @exposed = qw(
+        Search
+        Next
+    );
+    # Methods excluded from the binding and replaced by the XS code below.
+    my @hand_rolled = qw( Add_Doc );
+
+    my $pod_spec = Clownfish::CFC::Binding::Perl::Pod->new;
+    my $synopsis = <<'END_SYNOPSIS';
+First, build an index of your documents.
+
+    my $index = Lucy::Simple->new(
+        path     => '/path/to/index/',
+        language => 'en',
+    );
+
+    while ( my ( $title, $content ) = each %source_docs ) {
+        $index->add_doc({
+            title    => $title,
+            content  => $content,
+        });
+    }
+
+Later, search the index.
+
+    my $total_hits = $index->search(
+        query      => $query_string,
+        offset     => 0,
+        num_wanted => 10,
+    );
+
+    print "Total hits: $total_hits\n";
+    while ( my $hit = $index->next ) {
+        print "$hit->{title}\n";
+    }
+END_SYNOPSIS
+    my $add_doc_pod = <<'END_ADD_DOC_POD';
+=head2 add_doc
+
+    $lucy->add_doc({
+        location => $url,
+        title    => $title,
+        content  => $content,
+    });
+
+Add a document to the index. The document must be supplied as a hashref,
+with field names as keys and content as values.
+
+END_ADD_DOC_POD
+    $pod_spec->set_synopsis($synopsis);
+
+    # Override is necessary because there's no standard way to explain
+    # hash/hashref across multiple host languages.
+    $pod_spec->add_method(
+        method => 'Add_Doc',
+        alias  => 'add_doc',
+        pod    => $add_doc_pod,
+    );
+    $pod_spec->add_method( method => $_, alias => lc($_) ) for @exposed;
+
+    # XS wrapper for add_doc: accepts either a Lucy::Document::Doc object
+    # or a plain hashref, which is loaded into the indexer's stock doc.
+    my $xs_code = <<'END_XS_CODE';
+MODULE = Lucy  PACKAGE = Lucy::Simple
+
+void
+add_doc(self, doc_sv)
+    lucy_Simple *self;
+    SV *doc_sv;
+PPCODE:
+{
+    lucy_Doc *doc = NULL;
+
+    // Either get a Doc or use the stock doc.
+    if (sv_isobject(doc_sv)
+        && sv_derived_from(doc_sv, "Lucy::Document::Doc")
+       ) {
+        IV tmp = SvIV(SvRV(doc_sv));
+        doc = INT2PTR(lucy_Doc*, tmp);
+    }
+    else if (XSBind_sv_defined(aTHX_ doc_sv) && SvROK(doc_sv)) {
+        HV *maybe_fields = (HV*)SvRV(doc_sv);
+        if (SvTYPE((SV*)maybe_fields) == SVt_PVHV) {
+            lucy_Indexer *indexer = LUCY_Simple_Get_Indexer(self);
+            doc = LUCY_Indexer_Get_Stock_Doc(indexer);
+            LUCY_Doc_Set_Fields(doc, maybe_fields);
+        }
+    }
+    if (!doc) {
+        THROW(CFISH_ERR, "Need either a hashref or a %o",
+              CFISH_Class_Get_Name(LUCY_DOC));
+    }
+
+    LUCY_Simple_Add_Doc(self, doc);
+}
+END_XS_CODE
+
+    my $binding = Clownfish::CFC::Binding::Perl::Class->new(
+        parcel     => "Lucy",
+        class_name => "Lucy::Simple",
+    );
+    $binding->exclude_method($_) for @hand_rolled;
+    $binding->append_xs($xs_code);
+    $binding->set_pod_spec($pod_spec);
+
+    Clownfish::CFC::Binding::Perl::Class->register($binding);
+}
+
 sub bind_test {
     my $xs_code = <<'END_XS_CODE';
 MODULE = Lucy   PACKAGE = Lucy::Test

http://git-wip-us.apache.org/repos/asf/lucy/blob/ebde55f3/perl/lib/Lucy/Simple.pm
----------------------------------------------------------------------
diff --git a/perl/lib/Lucy/Simple.pm b/perl/lib/Lucy/Simple.pm
index e409615..e56a6ba 100644
--- a/perl/lib/Lucy/Simple.pm
+++ b/perl/lib/Lucy/Simple.pm
@@ -13,274 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-use strict;
-use warnings;
-
 package Lucy::Simple;
 use Lucy;
 our $VERSION = '0.004000';
 $VERSION = eval $VERSION;
-use Carp;
-use Scalar::Util qw( weaken reftype refaddr );
-
-use Lucy::Plan::Schema;
-use Lucy::Analysis::EasyAnalyzer;
-use Lucy::Index::Indexer;
-use Lucy::Search::IndexSearcher;
-
-my %obj_cache;
-
-sub new {
-    my ( $either, %args ) = @_;
-    my $path     = delete $args{path};
-    my $language = lc( delete $args{language} );
-    confess("Missing required parameter 'path'") unless defined $path;
-    confess("Invalid language: '$language'")
-        unless $language =~ /^(?:da|de|en|es|fi|fr|it|nl|no|pt|ru|sv)$/;
-    my @remaining = keys %args;
-    confess("Invalid params: @remaining") if @remaining;
-    my $self = bless {
-        type     => undef,
-        schema   => undef,
-        indexer  => undef,
-        searcher => undef,
-        hits     => undef,
-        language => $language,
-        path     => $path,
-        },
-        ref($either) || $either;
-
-    # Cache the object for later clean-up.
-    weaken( $obj_cache{ refaddr $self } = $self );
-
-    return $self;
-}
-
-sub _lazily_create_indexer {
-    my $self = shift;
-    if ( !defined $self->{indexer} ) {
-        # Get type and schema
-        my $schema;
-        my $reader = Lucy::Index::PolyReader->open( index => $self->{path} );
-        if ( !@{ $reader->seg_readers } ) {
-            # index is empty, create new schema and type
-            $schema = Lucy::Plan::Schema->new;
-            my $analyzer = Lucy::Analysis::EasyAnalyzer->new(
-                language => $self->{language}, );
-            $self->{type}
-                = Lucy::Plan::FullTextType->new( analyzer => $analyzer, );
-        }
-        else {
-            # get schema from reader
-            $schema = $reader->get_schema;
-            my $field = $schema->all_fields->[0];
-            $self->{type} = $schema->fetch_type($field);
-        }
-        $self->{schema}  = $schema;
-        $self->{indexer} = Lucy::Index::Indexer->new(
-            schema => $schema,
-            index  => $self->{path},
-        );
-    }
-}
-
-sub add_doc {
-    my ( $self, $hashref ) = @_;
-    croak("add_doc requires exactly one argument: a hashref")
-        unless ( @_ == 2 and reftype($hashref) eq 'HASH' );
-    $self->_lazily_create_indexer;
-    my $schema = $self->{schema};
-    my $type   = $self->{type};
-    $schema->spec_field( name => $_, type => $type ) for keys %$hashref;
-    $self->{indexer}->add_doc($hashref);
-}
-
-sub _finish_indexing {
-    my $self = shift;
-
-    # Don't bother to throw an error if index not modified.
-    if ( defined $self->{indexer} ) {
-        $self->{indexer}->commit;
-
-        # Trigger searcher and indexer refresh.
-        undef $self->{indexer};
-        undef $self->{searcher};
-    }
-}
-
-sub search {
-    my ( $self, %args ) = @_;
-
-    # Flush recent adds; lazily create searcher.
-    $self->_finish_indexing;
-    if ( !defined $self->{searcher} ) {
-        $self->{searcher}
-            = Lucy::Search::IndexSearcher->new( index => $self->{path} );
-    }
-
-    $self->{hits} = $self->{searcher}->hits(%args);
-
-    return $self->{hits}->total_hits;
-}
-
-sub next {
-    my $self = shift;
-    return unless defined $self->{hits};
-
-    # Get the hit, bail if hits are exhausted.
-    my $hit = $self->{hits}->next;
-    if ( !defined $hit ) {
-        undef $self->{hits};
-        return;
-    }
-
-    return $hit;
-}
-
-sub DESTROY {
-    for (shift) {
-        $_->_finish_indexing;
-        delete $obj_cache{ refaddr $_ };
-    }
-}
-
-END {
-    # Finish indexing for any objects that still exist, since, if we wait
-    # until global destruction, our Indexer might no longer exist,
-    # (see bug #32689)
-    $_->_finish_indexing for values %obj_cache;
-}
 
 1;
 
 __END__
 
-__POD__
-
-=head1 NAME
-
-Lucy::Simple - Basic search engine.
-
-=head1 SYNOPSIS
-
-First, build an index of your documents.
-
-    my $index = Lucy::Simple->new(
-        path     => '/path/to/index/'
-        language => 'en',
-    );
-
-    while ( my ( $title, $content ) = each %source_docs ) {
-        $index->add_doc({
-            title    => $title,
-            content  => $content,
-        });
-    }
-
-Later, search the index.
-
-    my $total_hits = $index->search( 
-        query      => $query_string,
-        offset     => 0,
-        num_wanted => 10,
-    );
-
-    print "Total hits: $total_hits\n";
-    while ( my $hit = $index->next ) {
-        print "$hit->{title}\n",
-    }
-
-=head1 DESCRIPTION
-
-Lucy::Simple is a stripped-down interface for the L<Apache Lucy|Lucy> search
-engine library.  
-
-=head1 METHODS 
-
-=head2 new
-
-    my $lucy = Lucy::Simple->new(
-        path     => '/path/to/index/',
-        language => 'en',
-    );
-
-Create a Lucy::Simple object, which can be used for both indexing and
-searching.  Two hash-style parameters are required.
-
-=over 
-
-=item *
-
-B<path> - Where the index directory should be located.  If no index is found
-at the specified location, one will be created.
-
-=item *
-
-B<language> - The language of the documents in your collection, indicated 
-by a two-letter ISO code.  12 languages are supported:
-
-    |-----------------------|
-    | Language   | ISO code |
-    |-----------------------|
-    | Danish     | da       |
-    | Dutch      | nl       |
-    | English    | en       |
-    | Finnish    | fi       |
-    | French     | fr       |
-    | German     | de       |
-    | Italian    | it       |
-    | Norwegian  | no       |
-    | Portuguese | pt       |
-    | Spanish    | es       |
-    | Swedish    | sv       |
-    | Russian    | ru       |
-    |-----------------------|
-
-=back
-
-=head2 add_doc 
-
-    $lucy->add_doc({
-        location => $url,
-        title    => $title,
-        content  => $content,
-    });
-
-Add a document to the index.  The document must be supplied as a hashref, with
-field names as keys and content as values.
-
-=head2 search
-
-    my $total_hits = $lucy->search( 
-        query      => $query_string,    # required
-        offset     => 40,               # default 0
-        num_wanted => 20,               # default 10
-    );
-
-Search the index.  Returns the total number of documents which match the
-query.  (This number is unlikely to match C<num_wanted>.)
-
-=over
-
-=item *
-
-B<query> - A search query string.
-
-=item *
-
-B<offset> - The number of most-relevant hits to discard, typically used when
-"paging" through hits N at a time.  Setting offset to 20 and num_wanted to 10
-retrieves hits 21-30, assuming that 30 hits can be found.
-
-=item *
-
-B<num_wanted> - The number of hits you would like to see after C<offset> is
-taken into account.  
-
-=back
-
-=head1 BUGS
-
-Not thread-safe.
 
-=cut

[4/5] lucy git commit: Port Lucy::Simple to C

Reply via email to