On Jan 23, 2007, at 5:41 AM, Adam Sjøgren wrote:

On Sun, 14 Jan 2007 19:01:00 -0800, Adam wrote:

Personally, I wrote my own model to Xapian. I also wound up re-writing
the query parser, because I needed things like range searching and
                                                 ^^..       ..^^
Sounds interesting! Do you care to share your code, or briefly explain
what is needed to do range searching?

I would be happy to share the code, although it's not really CPAN-ready yet. The short version is that implementing things like range searching and regular expressions requires manually walking through the term list and adding all the terms that match to the query.

It's basically the same way that Xapian's default query parser handles right-side wildcarding: you jump to the point in the term list where matching should begin, and keep going until you get past the second condition.

Just for kicks, I've attached the basic class here. It's taken from part of a larger project, and it definitely has some bugs. If you decide to improve it, I would love patches.

Thanks!

Adam
#
# Search::XapianParser;
# Created by: Adam Jacob, Marchex, <[EMAIL PROTECTED]>
# Created on: 01/11/2007 04:11:38 PM PST
#
# This package is licensed under the same terms as Perl itself.  See L<perlartistic>
# or L<perlgpl>
# 
# $Id: $

package Search::XapianParser;

use Moose;
use HOP::Lexer qw(string_lexer);
use Storable qw(thaw);
use Search::Xapian qw(:standard);
use Params::Validate qw(:all);

# Read-only handle on the Xapian database. The _parse_* helpers walk its
# term list and search() runs queries against it. BUILD assigns it directly
# from the 'xapian' constructor argument.
# NOTE(review): 'Obj' is not one of Moose's built-in type names ('Object'
# is) -- confirm whether a class literally named Obj was intended.
has 'xdb' => ( is => 'ro', isa => 'Obj' );

# Executed once, when this file is loaded.
# NOTE(review): presumably asks Xapian to prefer the Flint backend --
# confirm against the Xapian version in use.
$ENV{'XAPIAN_PREFER_FLINT'} = 1;

# BUILD( $self, \%params )
#
# Moose construction hook. Opens the database named by the 'xapian'
# constructor argument and stores the handle in the 'xdb' slot; dies if the
# constructor hands back a false value.
sub BUILD {
    my $self   = shift;
    my $params = shift;

    my $database = Search::Xapian::Database->new( $params->{'xapian'} );

    # The slot is written directly, so the attribute's accessor is bypassed.
    $self->{'xdb'} = $database;
    die "Cannot create read-able Xapian Database!" unless $database;

    return $database;
}

# parse_query( query => $string, [ default_op => $op ] )
#
# Lexes a query string and converts it into a "query stack": a reference to
# an array whose elements are Search::Xapian::Query objects, operator
# constants (OP_AND / OP_OR / OP_AND_NOT) or nested array references holding
# a sub-stack. build_query() folds that structure into a single query.
#
# Recognised syntax:
#   field:value  field:"value"  field:'value'
#                         term "X" . uc(field) . lc(value)
#   field:[a TO b]        range; the two end-point terms are added explicitly,
#                         everything in between comes from _parse_range()
#   field:{a TO b}        range without the end-point terms
#   field=~/rx/  field!~/rx/
#                         terms (not) matching a regex, via _parse_regex()
#   word                  the word itself as a term
#   <term or word>*       right-hand wildcard, expanded via _parse_splat()
#   AND  OR  NOT          operators (case-insensitive)
#   ([ ... ])             group; the contents are parsed recursively
#
# Parameters:
#   query      - the query string (required).
#   default_op - accepted and validated, but not consulted by this method.
#
# Returns: array reference (the query stack).
sub parse_query {
    my $self = shift;
    my %p    = validate(
        @_,
        {
            query      => { type => SCALAR },
            default_op => { type => SCALAR, default => OP_AND },
        },
    );

    # Order matters: the lexer tries the patterns top to bottom, so the more
    # specific forms must come before the generic WORD / OTHER fallbacks.
    my @input_tokens = (
        [ 'PGROUP',  qr/\(\[.*?\]\)/ ],
        [ 'REGEX',   qr/[\w\-\_\\\/\.]+\=\~\/.+?\// ],
        [ 'REGEX',   qr/[\w\-\_\\\/\.]+\!\~\/.+?\// ],
        [ 'INRANGE', qr/[\w\-\_\\\/\.]+\:\[.+?\% TO .+?\%\]/ ],
        [ 'EXRANGE', qr/[\w\-\_\\\/\.]+\:\{.+?\% TO .+?\%\}/ ],
        [ 'INRANGE', qr/[\w\-\_\\\/\.]+\:\[.+? TO .+?\]/ ],
        [ 'EXRANGE', qr/[\w\-\_\\\/\.]+\:\{.+? TO .+?\}/ ],
        [ 'TERM',    qr/[\w\-\_\\\/\.]+\:".+?"/ ],
        [ 'TERM',    qr/[\w\-\_\\\/\.]+\:'.+?'/ ],
        [ 'TERM',    qr/[\w\-\_\\\/\.]+\:[\w\-\_\\\/\.]+/ ],
        [ 'TERM',    qr/[\w\-\_\\\/\.]+\:/ ],
        [ 'OP',      qr/\bAND\b/i, ],
        [ 'OP',      qr/\bOR\b/i, ],
        [ 'OP',      qr/\bNOT\b/i, ],
        [ 'SPLAT',   qr/\*/ ],
        [ 'WORD',    qr/[\w\-\_\\\/\.]+/ ],
        [ 'SPACE', qr/\s*/, sub { () } ],
        [ 'OTHER', qr/./ ],
    );

    my %op_for = (
        AND => OP_AND,
        OR  => OP_OR,
        NOT => OP_AND_NOT,
    );

    my $lexer = string_lexer( $p{'query'}, @input_tokens );
    my @qstack;

    # Pushes $query onto the stack. When the next token is a '*', the terms
    # sharing $prefix are looked up and the whole alternative list is pushed
    # as a sub-stack instead of the single query.
    my $push_expandable = sub {
        my ( $query, $prefix ) = @_;
        my @extra;
        my $next = $lexer->('peek');
        if (   defined $next
            && ref($next) eq 'ARRAY'
            && $next->[0] eq 'SPLAT' )
        {
            @extra = $self->_parse_splat( term => $prefix );
        }
        push( @qstack, ( @extra ? [ $query, @extra ] : $query ) );
    };

  TOKE: while ( my $token = $lexer->() ) {

        # Anything that is not a [ label, value ] pair cannot be interpreted.
        next TOKE unless ref($token) eq 'ARRAY';
        my ( $label, $value ) = @{$token};

        if ( $label eq "TERM" ) {

            # Limit the split to two fields: a quoted value may itself
            # contain ':' and a bare "field:" must yield an empty value
            # rather than undef.
            my ( $field, $fv ) = split( /:/, $value, 2 );
            $fv =~ s/^["']//;
            $fv =~ s/["']$//;
            my $term = "X" . uc($field) . lc($fv);
            $push_expandable->( Search::Xapian::Query->new($term), $term );
        } elsif ( $label eq "INRANGE" ) {
            my ( $field, $from, $to ) =
              $value =~ /^([\w\-\_\\\/\.]+)\:\[(.+?) TO (.+?)\]$/;
            my $term = "X" . uc($field);

            # _parse_range() compares strictly, so the two end points are
            # added here to make the range inclusive.
            my $query = Search::Xapian::Query->new( OP_OR, $term . $from,
                $term . $to );
            my @range_results =
              $self->_parse_range( term => $term, from => $from, to => $to );
            push( @qstack, [ $query, @range_results ] );
        } elsif ( $label eq "EXRANGE" ) {
            my ( $field, $from, $to ) =
              $value =~ /^([\w\-\_\\\/\.]+)\:\{(.+?) TO (.+?)\}$/;
            my $term = "X" . uc($field);

            # Seed query for the sub-stack; the range terms are OR-ed on.
            my $query =
              Search::Xapian::Query->new( OP_OR, $term, "never_match_me" );
            my @range_results =
              $self->_parse_range( term => $term, from => $from, to => $to );
            push( @qstack, [ $query, @range_results ] );
        } elsif ( $label eq "REGEX" ) {
            my ( $field, $rx, $rxtype );
            if ( $value =~ /^(.+)\=\~\/(.+)\/$/ ) {
                ( $field, $rx, $rxtype ) = ( $1, $2, "standard" );
            } elsif ( $value =~ /^(.+)\!\~\/(.+)\/$/ ) {
                ( $field, $rx, $rxtype ) = ( $1, $2, "negative" );
            }
            my $term          = "X" . uc($field);
            my $query         = Search::Xapian::Query->new( $term . $rx );
            my @regex_results = $self->_parse_regex(
                term  => $term,
                regex => $rx,
                type  => $rxtype
            );
            push( @qstack, [ $query, @regex_results ] );
        } elsif ( $label eq "WORD" ) {
            $value =~ s/\s//g;
            $push_expandable->( Search::Xapian::Query->new($value), $value );
        } elsif ( $label eq "OP" ) {
            push( @qstack, $op_for{ uc($value) } );
        } elsif ( $label eq "SPLAT" ) {

            # Already handled by look-ahead when the preceding TERM / WORD
            # was processed; a free-standing '*' is ignored.
        } elsif ( $label eq 'PGROUP' ) {
            $value =~ s/^\(//;
            $value =~ s/\)$//;
            my $results = $self->parse_query( query => $value );
            push( @qstack, $results );
        }
    }
    return \@qstack;
}

# _parse_range( term => $prefix, from => $lower, to => $upper )
#
# Walks the database term list starting at $prefix and collects every term
# whose remainder (the part after the prefix) lies strictly between $lower
# and $upper. Either bound may be '*', meaning "unbounded". A trailing '%' on
# a bound is ignored.
#
# Comparison is numeric when the lower bound is a number optionally followed
# by letters (e.g. "10" or "10k"), or when the lower bound is '*' and the
# upper bound has that shape; the letter suffix is dropped before comparing.
# Otherwise the comparison is a plain string comparison.
#
# Returns a flat list of ( OP_OR, $query, OP_OR, $query, ... ) pairs, empty
# when nothing matches.
sub _parse_range {
    my $self = shift;
    my %p    = validate(
        @_,
        {
            'term' => { type => SCALAR },
            'from' => { type => SCALAR },
            'to'   => { type => SCALAR },
        },
    );
    my @splat;
    my $term = $p{'term'};

    # The prefix is literal text, not a pattern: field names may contain
    # '.', '\', '/' and '-', so it has to be quoted before use in a regex.
    my $has_prefix = qr/^\Q$term\E/;

    my $allterms = $self->xdb->allterms_begin;
    my $allend   = $self->xdb->allterms_end;
    $allterms->skip_to($term);

    $p{'from'} =~ s/\%$//;
    $p{'to'}   =~ s/\%$//;

    # With an open lower bound the upper bound decides the comparison mode;
    # otherwise "[* TO 100]" would be compared as strings.
    my $looks_numeric = qr/^\d+($|[[:alpha:]]+$)/;
    my $numeric       = $p{'from'} =~ $looks_numeric
      || ( $p{'from'} eq '*' && $p{'to'} =~ $looks_numeric );
    if ($numeric) {
        $p{'from'} =~ s/^(\d+)[[:alpha:]]+$/$1/;
        $p{'to'}   =~ s/^(\d+)[[:alpha:]]+$/$1/;
    }

    # Terms are visited in order, so the walk can stop at the first term
    # that no longer carries the prefix.
    while ( $allterms != $allend && $allterms =~ $has_prefix ) {

        # The iterator stringifies to the current term.
        my $comp_term = "$allterms";
        $comp_term =~ s/$has_prefix//;

        my ( $above_from, $below_to );
        if ($numeric) {
            $comp_term =~ s/^(\d+)([[:alpha:]]|\%)+$/$1/;
            $above_from = $p{'from'} eq '*' || $comp_term > $p{'from'};
            $below_to   = $p{'to'} eq '*'   || $comp_term < $p{'to'};
        } else {
            $above_from = $p{'from'} eq '*' || $comp_term gt $p{'from'};
            $below_to   = $p{'to'} eq '*'   || $comp_term lt $p{'to'};
        }

        push( @splat, OP_OR, Search::Xapian::Query->new($allterms) )
          if $above_from && $below_to;
        ++$allterms;
    }
    return @splat;
}

# _parse_regex( term => $prefix, regex => $pattern, type => $type )
#
# Walks the database term list starting at $prefix and tests the remainder
# of each term (the part after the prefix) against $pattern. With type
# "standard" the terms that match are collected; with type "negative" the
# terms that do not match are collected. Any other type collects nothing.
#
# Returns a flat list of ( OP_OR, $query, OP_OR, $query, ... ) pairs.
#
# NOTE: $pattern is compiled as a Perl regex as given. An invalid pattern
# makes this method die, and a pathological one can be slow -- do not feed it
# untrusted input without vetting.
sub _parse_regex {
    my $self = shift;
    my %p    = validate(
        @_,
        {
            'term'  => { type => SCALAR },
            'regex' => { type => SCALAR },
            'type'  => { type => SCALAR },
        },
    );

    my @splat;
    my $term = $p{'term'};

    # Decide up front whether a match or a non-match selects a term.
    my $want_match;
    if ( $p{'type'} eq "standard" ) {
        $want_match = 1;
    } elsif ( $p{'type'} eq "negative" ) {
        $want_match = 0;
    } else {
        return @splat;
    }

    # The prefix is literal text and must be quoted; the user's pattern is
    # deliberately left as a regex and compiled once for the whole walk.
    my $has_prefix = qr/^\Q$term\E/;
    my $user_rx    = qr/$p{'regex'}/;

    my $allterms = $self->xdb->allterms_begin;
    my $allend   = $self->xdb->allterms_end;
    $allterms->skip_to($term);
    while ( $allterms != $allend && $allterms =~ $has_prefix ) {

        # The iterator stringifies to the current term.
        my $comp_term = "$allterms";
        $comp_term =~ s/$has_prefix//;

        my $matched = $comp_term =~ $user_rx ? 1 : 0;
        if ( $matched == $want_match ) {
            push( @splat, OP_OR, Search::Xapian::Query->new($allterms) );
        }
        ++$allterms;
    }
    return @splat;
}

# _parse_splat( term => $prefix )
#
# Expands a right-hand wildcard: walks the database term list starting at
# $prefix and collects every term that begins with it.
#
# Returns a flat list of ( OP_OR, $query, OP_OR, $query, ... ) pairs, empty
# when no term carries the prefix.
sub _parse_splat {
    my $self = shift;
    my %p    = validate( @_, { 'term' => { type => SCALAR }, }, );

    my @splat;
    my $term = $p{'term'};

    # The prefix is literal text: words may contain '.', '\', '/' and '-',
    # which must not be read as regex syntax.
    my $has_prefix = qr/^\Q$term\E/;

    my $allterms = $self->xdb->allterms_begin;
    my $allend   = $self->xdb->allterms_end;
    $allterms->skip_to($term);

    # Terms are visited in order, so stop at the first one without the prefix.
    while ( $allterms != $allend && $allterms =~ $has_prefix ) {
        push( @splat, OP_OR, Search::Xapian::Query->new($allterms) );
        ++$allterms;
    }
    return @splat;
}

# build_query( qstack => \@stack )
#
# Folds a query stack into one Search::Xapian::Query. The stack is read left
# to right; each element is one of:
#   - a Search::Xapian::Query object          (an operand)
#   - an array reference holding a sub-stack  (an operand, built recursively)
#   - anything else                           (the operator for the NEXT operand)
#
# Operands are combined pairwise with the pending operator. After every
# operand the pending operator falls back to OP_AND, so adjacent operands with
# no explicit operator between them are AND-ed. An operator that precedes the
# very first operand has nothing on its left and is discarded.
#
# Returns the combined query, or undef when the stack holds no operands.
sub build_query {
    my $self = shift;
    my %p    = validate( @_, { 'qstack' => { type => ARRAYREF }, }, );

    my $q          = undef;
    my $default_op = OP_AND;
    my $op         = $default_op;
  PART: foreach my $part ( @{ $p{'qstack'} } ) {
        my $operand;
        if ( ref($part) eq 'Search::Xapian::Query' ) {
            $operand = $part;
        } elsif ( ref($part) eq 'ARRAY' ) {
            $operand = $self->build_query( qstack => $part );

            # An empty sub-stack contributes nothing; passing undef on to
            # Query->new would be an error.
            next PART unless defined $operand;
        } else {
            $op = $part;
            next PART;
        }

        # Guards against an undefined operator entry on the stack.
        $op ||= $default_op;

        $q =
          defined($q)
          ? Search::Xapian::Query->new( $op, $q, $operand )
          : $operand;
        $op = $default_op;
    }
    return $q;
}

# search( query => $string, [ as_doc => $bool ], [ num_per => $n, page => $p ] )
#
# Parses $string, runs it against the database with boolean weighting and
# returns the hits. Two progress lines are printed to STDOUT on the way.
#
# Parameters:
#   query   - query string in the syntax understood by parse_query().
#   as_doc  - when true each result is [ $docid, $document ]; otherwise each
#             result is the document data passed through Storable's thaw().
#   num_per - results per page. Paging happens only when both num_per and
#   page      page are supplied.
#
# Returns:
#   list context   - ( \@results, $page ), where $page is the Data::Page
#                    object, or undef when no paging was requested
#   scalar context - \@results
#   void context   - nothing (the query is still executed)
#
# A query string that yields no searchable terms returns an empty result set.
sub search {
    my $self = shift;
    my %p    = validate(
        @_,
        {
            query   => { type => SCALAR },
            as_doc  => { type => SCALAR, default => 0 },
            num_per => { type => SCALAR, optional => 1 },
            page    => { type => SCALAR, optional => 1 },
        },
    );
    my $db = $self->xdb;

    my $enq = $db->enquire;
    $enq->set_weighting_scheme( Search::Xapian::BoolWeight->new );
    my $qstack = $self->parse_query( query  => $p{'query'} );
    my $query  = $self->build_query( qstack => $qstack );

    # build_query() returns undef for an empty stack; there is nothing to
    # hand to the enquire object in that case.
    unless ( defined $query ) {
        return unless defined wantarray;
        return wantarray ? ( [], undef ) : [];
    }
    $enq->set_query($query);

    printf "Parsing query '%s'\n", $enq->get_query()->get_description();

    my @matches = $enq->matches( 0, 10000000 );
    my $num_matches = scalar(@matches);
    print scalar(@matches) . " results found\n";

    # Capture the calling context once; void context needs no result list.
    my $wantarray = wantarray;
    return unless defined $wantarray;

    my $page;
    my $realset = \@matches;
    if ( exists( $p{'num_per'} ) && exists( $p{'page'} ) ) {

        # Loaded on demand: only paged searches need it.
        require Data::Page;
        $page = Data::Page->new();
        $page->total_entries($num_matches);
        $page->entries_per_page( $p{'num_per'} );
        $page->current_page( $p{'page'} );
        $realset = [ $page->splice( \@matches ) ];
    }

    my @results;
    foreach my $match ( @{$realset} ) {
        my $doc = $match->get_document;
        if ( $p{'as_doc'} ) {
            push( @results, [ $match->get_docid, $doc ] );
        } else {

            # NOTE: thaw() trusts the stored bytes; only use this with an
            # index whose contents you control.
            push( @results, thaw( $doc->get_data ) );
        }
    }

    return $wantarray ? ( \@results, $page ) : \@results;
}

1;

_______________________________________________
List: [email protected]
Listinfo: http://lists.rawmode.org/mailman/listinfo/catalyst
Searchable archive: http://www.mail-archive.com/[email protected]/
Dev site: http://dev.catalyst.perl.org/

Reply via email to