Author: marvin
Date: Sat Jun 30 05:30:10 2012
New Revision: 1355638

URL: http://svn.apache.org/viewvc?rev=1355638&view=rev
Log:
Delegate lexing of query strings to QueryLexer.

Add new helper class QueryLexer which breaks tokenization of query
strings out of QueryParser.

Added:
    lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c   (with props)
    lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh   (with props)
Modified:
    lucy/trunk/core/Lucy/Search/QueryParser.c
    lucy/trunk/core/Lucy/Search/QueryParser.cfh

Modified: lucy/trunk/core/Lucy/Search/QueryParser.c
URL: 
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser.c?rev=1355638&r1=1355637&r2=1355638&view=diff
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser.c (original)
+++ lucy/trunk/core/Lucy/Search/QueryParser.c Sat Jun 30 05:30:10 2012
@@ -22,6 +22,7 @@
 
 #include "Lucy/Search/QueryParser.h"
 #include "Lucy/Search/QueryParser/ParserElem.h"
+#include "Lucy/Search/QueryParser/QueryLexer.h"
 #include "Lucy/Analysis/Analyzer.h"
 #include "Lucy/Plan/FieldType.h"
 #include "Lucy/Plan/Schema.h"
@@ -50,10 +51,10 @@
 #define TOKEN_STRING      LUCY_QPARSER_TOKEN_STRING
 #define TOKEN_QUERY       LUCY_QPARSER_TOKEN_QUERY
 
-// Recursing helper function for Tree().
+// Helper function for Tree().
 static Query*
-S_do_tree(QueryParser *self, VArray *elems, CharBuf *default_field,
-          Hash *extractions, bool_t enclosed);
+S_parse_subquery(QueryParser *self, VArray *elems, CharBuf *default_field,
+                 bool_t enclosed);
 
 static CharBuf*
 S_balance_parens_in_string(CharBuf *qstring);
@@ -69,11 +70,11 @@ S_balance_parens(QueryParser *self, VArr
 // Work from the inside out, reducing the leftmost, innermost paren groups
 // first, until the array of elems contains no parens.
 static void
-S_parse_subqueries(QueryParser *self, VArray *elems, Hash *extractions);
+S_parse_subqueries(QueryParser *self, VArray *elems);
 
 static void
 S_compose_inner_queries(QueryParser *self, VArray *elems,
-                        CharBuf *default_field, Hash *extractions);
+                        CharBuf *default_field);
 
 // Apply +, -, NOT.
 static void
@@ -166,6 +167,7 @@ QParser_init(QueryParser *self, Schema *
     // Init.
     self->heed_colons = false;
     self->label_inc   = 0;
+    self->lexer       = QueryLexer_new();
 
     // Assign.
     self->schema         = (Schema*)INCREF(schema);
@@ -227,6 +229,7 @@ QParser_destroy(QueryParser *self) {
     DECREF(self->analyzer);
     DECREF(self->default_boolop);
     DECREF(self->fields);
+    DECREF(self->lexer);
     DECREF(self->phrase_label);
     DECREF(self->bool_group_label);
     SUPER_DESTROY(self, QUERYPARSER);
@@ -260,6 +263,7 @@ QParser_heed_colons(QueryParser *self) {
 void
 QParser_set_heed_colons(QueryParser *self, bool_t heed_colons) {
     self->heed_colons = heed_colons;
+    QueryLexer_Set_Heed_Colons(self->lexer, heed_colons);
 }
 
 
@@ -279,17 +283,12 @@ QParser_parse(QueryParser *self, const C
 
 Query*
 QParser_tree(QueryParser *self, const CharBuf *query_string) {
-    Hash    *extractions = Hash_new(0);
-    CharBuf *mod1        = S_extract_phrases(self, query_string, extractions);
-    CharBuf *mod2        = S_balance_parens_in_string(mod1);
-    VArray  *elems       = S_parse_flat_string(self, mod2);
-    S_parse_subqueries(self, elems, extractions);
-    Query   *retval      = S_do_tree(self, elems, NULL, extractions, false);
+    VArray *elems = QueryLexer_Tokenize(self->lexer, query_string);
+    S_balance_parens(self, elems);
+    S_parse_subqueries(self, elems);
+    Query *query = S_parse_subquery(self, elems, NULL, false);
     DECREF(elems);
-    DECREF(mod2);
-    DECREF(mod1);
-    DECREF(extractions);
-    return retval;
+    return query;
 }
 
 static VArray*
@@ -359,7 +358,7 @@ S_parse_flat_string(QueryParser *self, C
 }
 
 static void
-S_parse_subqueries(QueryParser *self, VArray *elems, Hash *extractions) {
+S_parse_subqueries(QueryParser *self, VArray *elems) {
     while (1) {
         // Work from the inside out, starting with the leftmost innermost
         // paren group.
@@ -393,7 +392,7 @@ S_parse_subqueries(QueryParser *self, VA
 
         // Create the subquery.
         VArray *sub_elems = VA_Slice(elems, left + 1, right - left - 1);
-        Query *subquery = S_do_tree(self, sub_elems, field, extractions, true);
+        Query *subquery = S_parse_subquery(self, sub_elems, field, true);
         ParserElem *new_elem = ParserElem_new(TOKEN_QUERY, (Obj*)subquery);
         ParserElem_Set_Occur(new_elem, self->default_occur);
         DECREF(sub_elems);
@@ -421,8 +420,8 @@ S_discard_elems(VArray *elems, uint32_t 
 }
 
 static Query*
-S_do_tree(QueryParser *self, VArray *elems, CharBuf *default_field,
-          Hash *extractions, bool_t enclosed) {
+S_parse_subquery(QueryParser *self, VArray *elems, CharBuf *default_field,
+                 bool_t enclosed) {
     if (VA_Get_Size(elems)) {
         ParserElem *first = (ParserElem*)VA_Fetch(elems, 0);
         if (ParserElem_Get_Type(first) == TOKEN_OPEN_PAREN) {
@@ -431,7 +430,7 @@ S_do_tree(QueryParser *self, VArray *ele
             DECREF(VA_Pop(elems));
         }
     }
-    S_compose_inner_queries(self, elems, default_field, extractions);
+    S_compose_inner_queries(self, elems, default_field);
     S_discard_elems(elems, TOKEN_FIELD);
     S_discard_elems(elems, TOKEN_STRING);
     S_apply_plusses_and_negations(self, elems);
@@ -519,7 +518,7 @@ S_balance_parens_in_string(CharBuf *qstr
 
 static void
 S_compose_inner_queries(QueryParser *self, VArray *elems,
-                        CharBuf *default_field, Hash *extractions) {
+                        CharBuf *default_field) {
     // Generate all queries.  Apply any fields.
     for (uint32_t i = VA_Get_Size(elems); i--;) {
         CharBuf *field = default_field;
@@ -536,25 +535,12 @@ S_compose_inner_queries(QueryParser *sel
         }
 
         if (ParserElem_Get_Type(elem) == TOKEN_STRING) {
-            // Generate a LeafQuery from a Phrase.
             const CharBuf *text = (CharBuf*)ParserElem_As(elem, CHARBUF);
-            if (CB_Starts_With(text, self->phrase_label)) {
-                CharBuf *inner_text
-                    = (CharBuf*)Hash_Fetch(extractions, (Obj*)text);
-                LeafQuery *query = LeafQuery_new(field, inner_text);
-                ParserElem *new_elem = ParserElem_new(TOKEN_QUERY, (Obj*)query);
-                ParserElem_Set_Occur(new_elem, self->default_occur);
-                DECREF(Hash_Delete(extractions, (Obj*)text));
-                VA_Store(elems, i, (Obj*)new_elem);
-            }
-            // What's left is probably a term, so generate a LeafQuery.
-            else {
-                LeafQuery *query = LeafQuery_new(field, text);
-                ParserElem *new_elem
-                    = ParserElem_new(TOKEN_QUERY, (Obj*)query);
-                ParserElem_Set_Occur(new_elem, self->default_occur);
-                VA_Store(elems, i, (Obj*)new_elem);
-            }
+            LeafQuery *query = LeafQuery_new(field, text);
+            ParserElem *new_elem
+                = ParserElem_new(TOKEN_QUERY, (Obj*)query);
+            ParserElem_Set_Occur(new_elem, self->default_occur);
+            VA_Store(elems, i, (Obj*)new_elem);
         }
     }
 }

Modified: lucy/trunk/core/Lucy/Search/QueryParser.cfh
URL: 
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser.cfh?rev=1355638&r1=1355637&r2=1355638&view=diff
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser.cfh (original)
+++ lucy/trunk/core/Lucy/Search/QueryParser.cfh Sat Jun 30 05:30:10 2012
@@ -43,15 +43,16 @@ parcel Lucy;
 public class Lucy::Search::QueryParser cnick QParser
     inherits Lucy::Object::Obj {
 
-    Schema   *schema;
-    Analyzer *analyzer;
-    CharBuf  *default_boolop;
-    VArray   *fields;
-    CharBuf  *phrase_label;
-    CharBuf  *bool_group_label;
-    bool_t    heed_colons;
-    uint32_t  label_inc;
-    bool_t    default_occur;
+    Schema     *schema;
+    Analyzer   *analyzer;
+    CharBuf    *default_boolop;
+    VArray     *fields;
+    QueryLexer *lexer;
+    CharBuf    *phrase_label;
+    CharBuf    *bool_group_label;
+    bool_t      heed_colons;
+    uint32_t    label_inc;
+    bool_t      default_occur;
 
     inert incremented QueryParser*
     new(Schema *schema, Analyzer *analyzer = NULL,

Added: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c
URL: 
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c?rev=1355638&view=auto
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c (added)
+++ lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c Sat Jun 30 05:30:10 2012
@@ -0,0 +1,288 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define C_LUCY_QUERYLEXER
+#include <stdlib.h>
+#include <ctype.h>
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Search/QueryParser/QueryLexer.h"
+#include "Lucy/Search/QueryParser.h"
+#include "Lucy/Search/QueryParser/ParserElem.h"
+
+#define TOKEN_OPEN_PAREN  LUCY_QPARSER_TOKEN_OPEN_PAREN
+#define TOKEN_CLOSE_PAREN LUCY_QPARSER_TOKEN_CLOSE_PAREN
+#define TOKEN_MINUS       LUCY_QPARSER_TOKEN_MINUS
+#define TOKEN_PLUS        LUCY_QPARSER_TOKEN_PLUS
+#define TOKEN_NOT         LUCY_QPARSER_TOKEN_NOT
+#define TOKEN_OR          LUCY_QPARSER_TOKEN_OR
+#define TOKEN_AND         LUCY_QPARSER_TOKEN_AND
+#define TOKEN_FIELD       LUCY_QPARSER_TOKEN_FIELD
+#define TOKEN_STRING      LUCY_QPARSER_TOKEN_STRING
+#define TOKEN_QUERY       LUCY_QPARSER_TOKEN_QUERY
+
+static ParserElem*
+S_consume_keyword(ZombieCharBuf *qstring, const char *keyword,
+                  size_t keyword_len, int type);
+
+static ParserElem*
+S_consume_field(ZombieCharBuf *qstring);
+
+static ParserElem*
+S_consume_text(ZombieCharBuf *qstring);
+
+static ParserElem*
+S_consume_quoted_string(ZombieCharBuf *qstring);
+
+/* Allocate a QueryLexer and hand it to QueryLexer_init().
+ *
+ * The parameter list is spelled `(void)` so that the definition is a real
+ * prototype; an empty `()` would leave the argument list unchecked in C.
+ *
+ * @return the object returned by QueryLexer_init().
+ */
+QueryLexer*
+QueryLexer_new(void) {
+    QueryLexer *self = (QueryLexer*)VTable_Make_Obj(QUERYLEXER);
+    return QueryLexer_init(self);
+}
+
+/* Initializer: a fresh lexer starts with field-name parsing disabled.
+ *
+ * @param self The object to initialize.
+ * @return self.
+ */
+QueryLexer*
+QueryLexer_init(QueryLexer *self) {
+    const bool_t heed_colons_default = false;
+    self->heed_colons = heed_colons_default;
+    return self;
+}
+
+/* Accessor for the heed_colons flag.
+ *
+ * @return the current value of self->heed_colons.
+ */
+bool_t
+QueryLexer_heed_colons(QueryLexer *self) {
+    const bool_t current = self->heed_colons;
+    return current;
+}
+
+/* Setter for the heed_colons flag, which QueryLexer_tokenize() consults
+ * before trying to consume a field-name token.
+ *
+ * @param heed_colons New value for the flag.
+ */
+void
+QueryLexer_set_heed_colons(QueryLexer *self, bool_t heed_colons) {
+    QueryLexer *const lexer = self;
+    lexer->heed_colons = heed_colons;
+}
+
+/* Break a query string into a flat array of ParserElem tokens.
+ *
+ * The input is copied and the copy is consumed from the front, one token
+ * per loop iteration, until nothing is left.
+ *
+ * @param query_string The string to tokenize.  NULL is treated like an
+ * empty string.
+ * @return a new VArray of ParserElems, in the order they were consumed.
+ */
+VArray*
+QueryLexer_tokenize(QueryLexer *self, const CharBuf *query_string) {
+    CharBuf *copy;
+    if (query_string) {
+        copy = CB_Clone(query_string);
+    }
+    else {
+        copy = CB_new_from_trusted_utf8("", 0);
+    }
+    ZombieCharBuf *qstring = ZCB_WRAP((CharBuf*)copy);
+    VArray *elems = VA_new(0);
+    ZCB_Trim(qstring);
+
+    while (ZCB_Get_Size(qstring)) {
+        // Fast-forward past whitespace.
+        if (ZCB_Trim_Top(qstring)) {
+            continue;
+        }
+
+        // A `fieldname:` prefix, when enabled and present, becomes its own
+        // token ahead of whatever follows it.
+        if (self->heed_colons) {
+            ParserElem *field_elem = S_consume_field(qstring);
+            if (field_elem) {
+                VA_Push(elems, (Obj*)field_elem);
+            }
+        }
+
+        ParserElem *elem = NULL;
+        const uint32_t code_point = ZCB_Code_Point_At(qstring, 0);
+
+        if (code_point == '(') {
+            ZCB_Nip(qstring, 1);
+            elem = ParserElem_new(TOKEN_OPEN_PAREN, NULL);
+        }
+        else if (code_point == ')') {
+            ZCB_Nip(qstring, 1);
+            elem = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
+        }
+        else if (code_point == '+' || code_point == '-') {
+            // A sign is an operator only when glued to what follows it;
+            // otherwise it is kept as a one-character string token.
+            const bool_t is_plus = (code_point == '+');
+            if (ZCB_Get_Size(qstring) > 1
+                && !StrHelp_is_whitespace(ZCB_Code_Point_At(qstring, 1))
+               ) {
+                elem = ParserElem_new(is_plus ? TOKEN_PLUS : TOKEN_MINUS,
+                                      NULL);
+            }
+            else {
+                elem = ParserElem_new(TOKEN_STRING,
+                                      (Obj*)CB_newf(is_plus ? "+" : "-"));
+            }
+            ZCB_Nip(qstring, 1);
+        }
+        else if (code_point == '"') {
+            elem = S_consume_quoted_string(qstring);
+        }
+        else {
+            // Words starting with O/A/N may be boolean operator keywords;
+            // anything that is not a keyword is ordinary text.
+            if (code_point == 'O') {
+                elem = S_consume_keyword(qstring, "OR", 2, TOKEN_OR);
+            }
+            else if (code_point == 'A') {
+                elem = S_consume_keyword(qstring, "AND", 3, TOKEN_AND);
+            }
+            else if (code_point == 'N') {
+                elem = S_consume_keyword(qstring, "NOT", 3, TOKEN_NOT);
+            }
+            if (!elem) {
+                elem = S_consume_text(qstring);
+            }
+        }
+
+        VA_Push(elems, (Obj*)elem);
+    }
+
+    DECREF(copy);
+    return elems;
+}
+
+
+/* Try to consume a keyword such as "OR" from the front of qstring.
+ *
+ * The keyword only counts when it is immediately followed by whitespace or
+ * one of `"`, `(`, `)`, `+`, `-`.  A zero lookahead (presumably the end of
+ * the string) does not count.  On success only the keyword itself is
+ * removed from qstring; the character after it stays in place.
+ *
+ * @return a new ParserElem of the given type, or NULL if qstring was left
+ * untouched.
+ */
+static ParserElem*
+S_consume_keyword(ZombieCharBuf *qstring, const char *keyword,
+                  size_t keyword_len, int type) {
+    if (ZCB_Starts_With_Str(qstring, keyword, keyword_len)) {
+        const uint32_t next = ZCB_Code_Point_At(qstring, keyword_len);
+        bool_t delimited;
+        switch (next) {
+            case 0:
+                delimited = false;
+                break;
+            case '"':
+            case '(':
+            case ')':
+            case '+':
+            case '-':
+                delimited = true;
+                break;
+            default:
+                delimited = StrHelp_is_whitespace(next) ? true : false;
+                break;
+        }
+        if (delimited) {
+            ZCB_Nip(qstring, keyword_len);
+            return ParserElem_new(type, NULL);
+        }
+    }
+    return NULL;
+}
+
+/* Try to consume a `fieldname:` construct from the front of qstring.
+ *
+ * The <ctype.h> classifiers are only defined for arguments representable as
+ * unsigned char (or EOF), so every code point is range-checked before it is
+ * passed to isalpha()/isalnum().
+ *
+ * @return a new TOKEN_FIELD ParserElem holding a copy of the field name
+ * (without the colon), or NULL if no field construct was found, in which
+ * case qstring is left untouched.
+ */
+static ParserElem*
+S_consume_field(ZombieCharBuf *qstring) {
+    size_t tick = 0;
+
+    // Field name constructs must start with a letter or underscore.
+    uint32_t code_point = ZCB_Code_Point_At(qstring, tick);
+    if ((code_point <= 0xFF && isalpha((int)code_point))
+        || code_point == '_'
+       ) {
+        tick++;
+    }
+    else {
+        return NULL;
+    }
+
+    // Only alphanumerics and underscores are allowed in field names.
+    while (1) {
+        code_point = ZCB_Code_Point_At(qstring, tick);
+        if ((code_point <= 0xFF && isalnum((int)code_point))
+            || code_point == '_'
+           ) {
+            tick++;
+        }
+        else if (code_point == ':') {
+            tick++;
+            break;
+        }
+        else {
+            return NULL;
+        }
+    }
+
+    // Field name constructs must be followed by something sensible.  The
+    // non-ASCII test comes first so isalnum() only sees values up to 127.
+    uint32_t lookahead = ZCB_Code_Point_At(qstring, tick);
+    if (!(lookahead > 127
+          || isalnum((int)lookahead)
+          || lookahead == '_'
+          || lookahead == '"'
+          || lookahead == '('
+         )
+       ) {
+        return NULL;
+    }
+
+    // Consume string data: `tick` counts the name plus the colon, so the
+    // name alone is `tick - 1` code points long.
+    ZombieCharBuf *field = ZCB_WRAP((CharBuf*)qstring);
+    ZCB_Truncate(field, tick - 1);
+    ZCB_Nip(qstring, tick);
+    return ParserElem_new(TOKEN_FIELD, (Obj*)ZCB_Clone(field));
+}
+
+/* Consume a run of ordinary text from the front of qstring.
+ *
+ * The run ends at whitespace, `"`, `(`, `)` or the end of the string.  A
+ * backslash keeps the following code point in the run even if it would
+ * otherwise be a delimiter; the backslash itself stays in the text.
+ *
+ * The terminating delimiter is only peeked at, never consumed, so that the
+ * caller still sees it and can turn it into its own token (e.g. the `(` in
+ * `foo(bar)`).  This matches S_consume_keyword(), which also leaves its
+ * lookahead character in place.
+ *
+ * @return a new TOKEN_STRING ParserElem holding a copy of the text.
+ */
+static ParserElem*
+S_consume_text(ZombieCharBuf *qstring) {
+    ZombieCharBuf *text  = ZCB_WRAP((CharBuf*)qstring);
+    size_t tick = 0;
+    while (1) {
+        uint32_t code_point = ZCB_Code_Point_At(qstring, 0);
+        if (StrHelp_is_whitespace(code_point)
+            || code_point == '"'
+            || code_point == '('
+            || code_point == ')'
+           ) {
+            // Leave the delimiter for the tokenizer.
+            break;
+        }
+
+        ZCB_Nip_One(qstring);
+        if (code_point == 0) {
+            // Nothing left to read.
+            break;
+        }
+        tick++;
+
+        if (code_point == '\\') {
+            // Take the escaped code point too, unless the backslash was the
+            // last thing in the string.
+            if (ZCB_Nip_One(qstring) == 0) {
+                break;
+            }
+            tick++;
+        }
+    }
+
+    ZCB_Truncate(text, tick);
+    return ParserElem_new(TOKEN_STRING, (Obj*)ZCB_Clone(text));
+}
+
+/* Consume a double-quoted string from the front of qstring.
+ *
+ * The returned text includes the opening quote and, if one is found, the
+ * closing quote.  A backslash causes the following code point to be skipped
+ * over without being examined, so an escaped quote does not end the string.
+ * If the input runs out first, everything up to the end is taken.
+ *
+ * `tick` counts exactly the code points consumed, so a dangling backslash
+ * at the very end of the input counts as one code point, not two.
+ *
+ * Throws an error if qstring does not begin with a double quote.
+ *
+ * @return a new TOKEN_STRING ParserElem holding a copy of the quoted text.
+ */
+static ParserElem*
+S_consume_quoted_string(ZombieCharBuf *qstring) {
+    ZombieCharBuf *text = ZCB_WRAP((CharBuf*)qstring);
+    if (ZCB_Nip_One(qstring) != '"') {
+        THROW(ERR, "Internal error: expected a quote");
+    }
+
+    // The opening quote has already been consumed.
+    size_t tick = 1;
+    while (1) {
+        uint32_t code_point = ZCB_Nip_One(qstring);
+        if (code_point == '"') {
+            tick += 1;
+            break;
+        }
+        else if (code_point == 0) {
+            break;
+        }
+        else if (code_point == '\\') {
+            tick += 1;
+            // Only count the escaped code point if there is one to skip.
+            if (ZCB_Get_Size(qstring)) {
+                ZCB_Nip_One(qstring);
+                tick += 1;
+            }
+        }
+        else {
+            tick += 1;
+        }
+    }
+
+    ZCB_Truncate(text, tick);
+    return ParserElem_new(TOKEN_STRING, (Obj*)ZCB_Clone(text));
+}
+

Propchange: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh
URL: 
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh?rev=1355638&view=auto
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh (added)
+++ lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh Sat Jun 30 05:30:10 2012
@@ -0,0 +1,45 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+/** Helper class for QueryParser which breaks a query string up into an
+ * array of ParserElem tokens.
+ */
+class Lucy::Search::QueryParser::QueryLexer inherits Lucy::Object::Obj {
+
+    /* Whether <code>fieldname:foo</code> constructs are parsed. */
+    bool_t heed_colons;
+
+    /** Create a new QueryLexer.
+     */
+    inert incremented QueryLexer*
+    new();
+
+    /** Initialize a QueryLexer.  Parsing of <code>fieldname:foo</code>
+     * constructs starts out disabled.
+     */
+    inert QueryLexer*
+    init(QueryLexer *self);
+
+    /** Parse a query string into an array of tokens.
+     *
+     * @param query_string The string to be parsed.  May be NULL.
+     * @return an array of ParserElems.
+     */
+    incremented VArray*
+    Tokenize(QueryLexer *self, const CharBuf *query_string = NULL);
+
+    /** Indicate whether parsing of <code>fieldname:foo</code> constructs is
+     * enabled.
+     */
+    bool_t
+    Heed_Colons(QueryLexer *self);
+
+    /** Enable/disable parsing of <code>fieldname:foo</code> constructs.
+     */
+    void
+    Set_Heed_Colons(QueryLexer *self, bool_t heed_colons);
+}
+

Propchange: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to