Author: marvin
Date: Sat Jun 30 04:58:07 2012
New Revision: 1355635

URL: http://svn.apache.org/viewvc?rev=1355635&view=rev
Log:
Tokenize first in QueryParser's Tree().

Instead of recursing on a query string which gets chewed up bit by bit,
tokenize the query string up front and recurse on the resulting array of
tokens.

Modified:
    lucy/trunk/core/Lucy/Search/QueryParser.c

Modified: lucy/trunk/core/Lucy/Search/QueryParser.c
URL: 
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser.c?rev=1355635&r1=1355634&r2=1355635&view=diff
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser.c (original)
+++ lucy/trunk/core/Lucy/Search/QueryParser.c Sat Jun 30 04:58:07 2012
@@ -52,17 +52,25 @@
 
 // Recursing helper function for Tree().
 static Query*
-S_do_tree(QueryParser *self, CharBuf *query_string, CharBuf *default_field,
-          Hash *extractions);
+S_do_tree(QueryParser *self, VArray *elems, CharBuf *default_field,
+          Hash *extractions, bool_t enclosed);
 
 static CharBuf*
 S_balance_parens_in_string(CharBuf *qstring);
 
+static VArray*
+S_parse_flat_string(QueryParser *self, CharBuf *query_string);
+
 // Drop unmatched right parens and add matching right parens at end to
 // close paren groups implicitly.
 static void
 S_balance_parens(QueryParser *self, VArray *elems);
 
+// Work from the inside out, reducing the leftmost, innermost paren groups
+// first, until the array of elems contains no parens.
+static void
+S_parse_subqueries(QueryParser *self, VArray *elems, Hash *extractions);
+
 static void
 S_compose_inner_queries(QueryParser *self, VArray *elems,
                         CharBuf *default_field, Hash *extractions);
@@ -139,7 +147,7 @@ S_consume_field(ViewCharBuf *qstring, Vi
 
 // Consume non-whitespace from qstring and store the match in target.
 static bool_t
-S_consume_non_whitespace(ViewCharBuf *qstring, ViewCharBuf *target);
+S_consume_non_whitespace_non_paren(ViewCharBuf *qstring, ViewCharBuf *target);
 
 #define RAND_STRING_LEN      16
 #define PHRASE_LABEL_LEN     (RAND_STRING_LEN + sizeof("_phrase") - 1)
@@ -274,9 +282,10 @@ QParser_tree(QueryParser *self, const Ch
     Hash    *extractions = Hash_new(0);
     CharBuf *mod1        = S_extract_phrases(self, query_string, extractions);
     CharBuf *mod2        = S_balance_parens_in_string(mod1);
-    CharBuf *mod3        = S_extract_paren_groups(self, mod2, extractions);
-    Query   *retval      = S_do_tree(self, mod3, NULL, extractions);
-    DECREF(mod3);
+    VArray  *elems       = S_parse_flat_string(self, mod2);
+    S_parse_subqueries(self, elems, extractions);
+    Query   *retval      = S_do_tree(self, elems, NULL, extractions, false);
+    DECREF(elems);
     DECREF(mod2);
     DECREF(mod1);
     DECREF(extractions);
@@ -288,18 +297,9 @@ S_parse_flat_string(QueryParser *self, C
     VArray      *parse_tree       = VA_new(0);
     CharBuf     *qstring_copy     = CB_Clone(query_string);
     ViewCharBuf *qstring          = (ViewCharBuf*)ZCB_WRAP(qstring_copy);
-    bool_t       need_close_paren = false;
 
     ViewCB_Trim(qstring);
 
-    if (S_consume_ascii(qstring, "(", 1)) {
-        VA_Push(parse_tree, (Obj*)ParserElem_new(TOKEN_OPEN_PAREN, NULL));
-        if (ViewCB_Code_Point_From(qstring, 1) == ')') {
-            need_close_paren = true;
-            ViewCB_Chop(qstring, 1);
-        }
-    }
-
     ViewCharBuf *temp = (ViewCharBuf*)ZCB_BLANK();
     while (ViewCB_Get_Size(qstring)) {
         ParserElem *token = NULL;
@@ -308,6 +308,12 @@ S_parse_flat_string(QueryParser *self, C
             // Fast-forward past whitespace.
             continue;
         }
+        else if (S_consume_ascii(qstring, "(", 1)) {
+            token = ParserElem_new(TOKEN_OPEN_PAREN, NULL);
+        }
+        else if (S_consume_ascii(qstring, ")", 1)) {
+            token = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
+        }
         else if (S_consume_ascii(qstring, "+", 1)) {
             if (ViewCB_Trim_Top(qstring)) {
                 token = ParserElem_new(TOKEN_STRING, (Obj*)CB_newf("+"));
@@ -336,7 +342,7 @@ S_parse_flat_string(QueryParser *self, C
         else if (self->heed_colons && S_consume_field(qstring, temp)) {
             token = ParserElem_new(TOKEN_FIELD, (Obj*)ViewCB_Clone(temp));
         }
-        else if (S_consume_non_whitespace(qstring, temp)) {
+        else if (S_consume_non_whitespace_non_paren(qstring, temp)) {
             token = ParserElem_new(TOKEN_STRING, (Obj*)ViewCB_Clone(temp));
         }
         else {
@@ -346,11 +352,6 @@ S_parse_flat_string(QueryParser *self, C
         VA_Push(parse_tree, (Obj*)token);
     }
 
-    if (need_close_paren) {
-        VA_Push(parse_tree,
-                (Obj*)ParserElem_new(TOKEN_CLOSE_PAREN, NULL));
-    }
-
     // Clean up.
     DECREF(qstring_copy);
 
@@ -358,6 +359,60 @@ S_parse_flat_string(QueryParser *self, C
 }
 
 static void
+S_parse_subqueries(QueryParser *self, VArray *elems, Hash *extractions) {
+    while (1) {
+        // Work from the inside out, starting with the leftmost innermost
+        // paren group.
+        size_t left = SIZE_MAX;
+        size_t right = SIZE_MAX;
+        CharBuf *field = NULL;
+        for (size_t i = 0, max = VA_Get_Size(elems); i < max; i++) {
+            ParserElem *elem = (ParserElem*)VA_Fetch(elems, i);
+            uint32_t type = ParserElem_Get_Type(elem);
+            if (type == TOKEN_OPEN_PAREN) {
+                left = i;
+            }
+            else if (type == TOKEN_CLOSE_PAREN) {
+                right = i;
+                break;
+            }
+            else if (type == TOKEN_FIELD && i < max - 1) {
+                // If a field applies to an enclosing paren, pass it along.
+                ParserElem *next_elem = (ParserElem*)VA_Fetch(elems, i + 1);
+                uint32_t next_type = ParserElem_Get_Type(next_elem);
+                if (next_type == TOKEN_OPEN_PAREN) {
+                    field = (CharBuf*)ParserElem_As(elem, CHARBUF);
+                }
+            }
+        }
+
+        // Break out of loop when there are no parens left.
+        if (right == SIZE_MAX) {
+            break;
+        }
+
+        // Create the subquery.
+        VArray *sub_elems = VA_Slice(elems, left + 1, right - left - 1);
+        Query *subquery = S_do_tree(self, sub_elems, field, extractions, true);
+        ParserElem *new_elem = ParserElem_new(TOKEN_QUERY, (Obj*)subquery);
+        ParserElem_Set_Occur(new_elem, self->default_occur);
+        DECREF(sub_elems);
+
+        // Replace the elements used to create the subquery with the subquery
+        // itself.
+        if (left > 0) {
+            ParserElem *maybe_field = (ParserElem*)VA_Fetch(elems, left - 1);
+            uint32_t maybe_field_type = ParserElem_Get_Type(maybe_field);
+            if (maybe_field_type == TOKEN_FIELD) {
+                left -= 1;
+            }
+        }
+        VA_Excise(elems, left + 1, right - left);
+        VA_Store(elems, left, (Obj*)new_elem);
+    }
+}
+
+static void
 S_discard_elems(VArray *elems, uint32_t type) {
     for (size_t i = VA_Get_Size(elems); i--;) {
         ParserElem *elem = (ParserElem*)VA_Fetch(elems, i);
@@ -366,11 +421,8 @@ S_discard_elems(VArray *elems, uint32_t 
 }
 
 static Query*
-S_do_tree(QueryParser *self, CharBuf *query_string, CharBuf *default_field,
-          Hash *extractions) {
-    VArray *elems = S_parse_flat_string(self, query_string);
-    S_balance_parens(self, elems);
-    bool_t enclosed = false;
+S_do_tree(QueryParser *self, VArray *elems, CharBuf *default_field,
+          Hash *extractions, bool_t enclosed) {
     if (VA_Get_Size(elems)) {
         ParserElem *first = (ParserElem*)VA_Fetch(elems, 0);
         if (ParserElem_Get_Type(first) == TOKEN_OPEN_PAREN) {
@@ -401,7 +453,7 @@ S_do_tree(QueryParser *self, CharBuf *qu
     }
     S_discard_elems(elems, TOKEN_OR);
     Query *retval = S_compose_subquery(self, elems, enclosed);
-    DECREF(elems);
+
     return retval;
 }
 
@@ -495,20 +547,6 @@ S_compose_inner_queries(QueryParser *sel
                 DECREF(Hash_Delete(extractions, (Obj*)text));
                 VA_Store(elems, i, (Obj*)new_elem);
             }
-            // Recursively parse parenthetical groupings.
-            else if (CB_Starts_With(text, self->bool_group_label)) {
-                CharBuf *inner_text
-                    = (CharBuf*)Hash_Fetch(extractions, (Obj*)text);
-                Query *query
-                    = S_do_tree(self, inner_text, field, extractions);
-                DECREF(Hash_Delete(extractions, (Obj*)text));
-                if (query) {
-                    ParserElem *new_elem
-                        = ParserElem_new(TOKEN_QUERY, (Obj*)query);
-                    ParserElem_Set_Occur(new_elem, self->default_occur);
-                    VA_Store(elems, i, (Obj*)new_elem);
-                }
-            }
             // What's left is probably a term, so generate a LeafQuery.
             else {
                 LeafQuery *query = LeafQuery_new(field, text);
@@ -934,11 +972,15 @@ S_consume_field(ViewCharBuf *qstring, Vi
 }
 
 static bool_t
-S_consume_non_whitespace(ViewCharBuf *qstring, ViewCharBuf *target) {
+S_consume_non_whitespace_non_paren(ViewCharBuf *qstring, ViewCharBuf *target) {
     uint32_t code_point = ViewCB_Code_Point_At(qstring, 0);
     bool_t   success    = false;
     ViewCB_Assign(target, (CharBuf*)qstring);
-    while (code_point && !StrHelp_is_whitespace(code_point)) {
+    while (code_point
+        && !StrHelp_is_whitespace(code_point)
+        && code_point != '('
+        && code_point != ')'
+       ) {
         ViewCB_Nip_One(qstring);
         code_point = ViewCB_Code_Point_At(qstring, 0);
         success = true;


Reply via email to