Author: marvin
Date: Sat Jun 30 05:30:10 2012
New Revision: 1355638
URL: http://svn.apache.org/viewvc?rev=1355638&view=rev
Log:
Delegate lexing of query strings to QueryLexer.
Add new helper class QueryLexer which breaks tokenization of query
strings out of QueryParser.
Added:
lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c (with props)
lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh (with props)
Modified:
lucy/trunk/core/Lucy/Search/QueryParser.c
lucy/trunk/core/Lucy/Search/QueryParser.cfh
Modified: lucy/trunk/core/Lucy/Search/QueryParser.c
URL:
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser.c?rev=1355638&r1=1355637&r2=1355638&view=diff
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser.c (original)
+++ lucy/trunk/core/Lucy/Search/QueryParser.c Sat Jun 30 05:30:10 2012
@@ -22,6 +22,7 @@
#include "Lucy/Search/QueryParser.h"
#include "Lucy/Search/QueryParser/ParserElem.h"
+#include "Lucy/Search/QueryParser/QueryLexer.h"
#include "Lucy/Analysis/Analyzer.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
@@ -50,10 +51,10 @@
#define TOKEN_STRING LUCY_QPARSER_TOKEN_STRING
#define TOKEN_QUERY LUCY_QPARSER_TOKEN_QUERY
-// Recursing helper function for Tree().
+// Helper function for Tree().
static Query*
-S_do_tree(QueryParser *self, VArray *elems, CharBuf *default_field,
- Hash *extractions, bool_t enclosed);
+S_parse_subquery(QueryParser *self, VArray *elems, CharBuf *default_field,
+ bool_t enclosed);
static CharBuf*
S_balance_parens_in_string(CharBuf *qstring);
@@ -69,11 +70,11 @@ S_balance_parens(QueryParser *self, VArr
// Work from the inside out, reducing the leftmost, innermost paren groups
// first, until the array of elems contains no parens.
static void
-S_parse_subqueries(QueryParser *self, VArray *elems, Hash *extractions);
+S_parse_subqueries(QueryParser *self, VArray *elems);
static void
S_compose_inner_queries(QueryParser *self, VArray *elems,
- CharBuf *default_field, Hash *extractions);
+ CharBuf *default_field);
// Apply +, -, NOT.
static void
@@ -166,6 +167,7 @@ QParser_init(QueryParser *self, Schema *
// Init.
self->heed_colons = false;
self->label_inc = 0;
+ self->lexer = QueryLexer_new();
// Assign.
self->schema = (Schema*)INCREF(schema);
@@ -227,6 +229,7 @@ QParser_destroy(QueryParser *self) {
DECREF(self->analyzer);
DECREF(self->default_boolop);
DECREF(self->fields);
+ DECREF(self->lexer);
DECREF(self->phrase_label);
DECREF(self->bool_group_label);
SUPER_DESTROY(self, QUERYPARSER);
@@ -260,6 +263,7 @@ QParser_heed_colons(QueryParser *self) {
void
QParser_set_heed_colons(QueryParser *self, bool_t heed_colons) {
self->heed_colons = heed_colons;
+ QueryLexer_Set_Heed_Colons(self->lexer, heed_colons);
}
@@ -279,17 +283,12 @@ QParser_parse(QueryParser *self, const C
Query*
QParser_tree(QueryParser *self, const CharBuf *query_string) {
- Hash *extractions = Hash_new(0);
- CharBuf *mod1 = S_extract_phrases(self, query_string, extractions);
- CharBuf *mod2 = S_balance_parens_in_string(mod1);
- VArray *elems = S_parse_flat_string(self, mod2);
- S_parse_subqueries(self, elems, extractions);
- Query *retval = S_do_tree(self, elems, NULL, extractions, false);
+ VArray *elems = QueryLexer_Tokenize(self->lexer, query_string);
+ S_balance_parens(self, elems);
+ S_parse_subqueries(self, elems);
+ Query *query = S_parse_subquery(self, elems, NULL, false);
DECREF(elems);
- DECREF(mod2);
- DECREF(mod1);
- DECREF(extractions);
- return retval;
+ return query;
}
static VArray*
@@ -359,7 +358,7 @@ S_parse_flat_string(QueryParser *self, C
}
static void
-S_parse_subqueries(QueryParser *self, VArray *elems, Hash *extractions) {
+S_parse_subqueries(QueryParser *self, VArray *elems) {
while (1) {
// Work from the inside out, starting with the leftmost innermost
// paren group.
@@ -393,7 +392,7 @@ S_parse_subqueries(QueryParser *self, VA
// Create the subquery.
VArray *sub_elems = VA_Slice(elems, left + 1, right - left - 1);
- Query *subquery = S_do_tree(self, sub_elems, field, extractions, true);
+ Query *subquery = S_parse_subquery(self, sub_elems, field, true);
ParserElem *new_elem = ParserElem_new(TOKEN_QUERY, (Obj*)subquery);
ParserElem_Set_Occur(new_elem, self->default_occur);
DECREF(sub_elems);
@@ -421,8 +420,8 @@ S_discard_elems(VArray *elems, uint32_t
}
static Query*
-S_do_tree(QueryParser *self, VArray *elems, CharBuf *default_field,
- Hash *extractions, bool_t enclosed) {
+S_parse_subquery(QueryParser *self, VArray *elems, CharBuf *default_field,
+ bool_t enclosed) {
if (VA_Get_Size(elems)) {
ParserElem *first = (ParserElem*)VA_Fetch(elems, 0);
if (ParserElem_Get_Type(first) == TOKEN_OPEN_PAREN) {
@@ -431,7 +430,7 @@ S_do_tree(QueryParser *self, VArray *ele
DECREF(VA_Pop(elems));
}
}
- S_compose_inner_queries(self, elems, default_field, extractions);
+ S_compose_inner_queries(self, elems, default_field);
S_discard_elems(elems, TOKEN_FIELD);
S_discard_elems(elems, TOKEN_STRING);
S_apply_plusses_and_negations(self, elems);
@@ -519,7 +518,7 @@ S_balance_parens_in_string(CharBuf *qstr
static void
S_compose_inner_queries(QueryParser *self, VArray *elems,
- CharBuf *default_field, Hash *extractions) {
+ CharBuf *default_field) {
// Generate all queries. Apply any fields.
for (uint32_t i = VA_Get_Size(elems); i--;) {
CharBuf *field = default_field;
@@ -536,25 +535,12 @@ S_compose_inner_queries(QueryParser *sel
}
if (ParserElem_Get_Type(elem) == TOKEN_STRING) {
- // Generate a LeafQuery from a Phrase.
const CharBuf *text = (CharBuf*)ParserElem_As(elem, CHARBUF);
- if (CB_Starts_With(text, self->phrase_label)) {
- CharBuf *inner_text
- = (CharBuf*)Hash_Fetch(extractions, (Obj*)text);
- LeafQuery *query = LeafQuery_new(field, inner_text);
- ParserElem *new_elem = ParserElem_new(TOKEN_QUERY,
(Obj*)query);
- ParserElem_Set_Occur(new_elem, self->default_occur);
- DECREF(Hash_Delete(extractions, (Obj*)text));
- VA_Store(elems, i, (Obj*)new_elem);
- }
- // What's left is probably a term, so generate a LeafQuery.
- else {
- LeafQuery *query = LeafQuery_new(field, text);
- ParserElem *new_elem
- = ParserElem_new(TOKEN_QUERY, (Obj*)query);
- ParserElem_Set_Occur(new_elem, self->default_occur);
- VA_Store(elems, i, (Obj*)new_elem);
- }
+ LeafQuery *query = LeafQuery_new(field, text);
+ ParserElem *new_elem
+ = ParserElem_new(TOKEN_QUERY, (Obj*)query);
+ ParserElem_Set_Occur(new_elem, self->default_occur);
+ VA_Store(elems, i, (Obj*)new_elem);
}
}
}
Modified: lucy/trunk/core/Lucy/Search/QueryParser.cfh
URL:
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser.cfh?rev=1355638&r1=1355637&r2=1355638&view=diff
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser.cfh (original)
+++ lucy/trunk/core/Lucy/Search/QueryParser.cfh Sat Jun 30 05:30:10 2012
@@ -43,15 +43,16 @@ parcel Lucy;
public class Lucy::Search::QueryParser cnick QParser
inherits Lucy::Object::Obj {
- Schema *schema;
- Analyzer *analyzer;
- CharBuf *default_boolop;
- VArray *fields;
- CharBuf *phrase_label;
- CharBuf *bool_group_label;
- bool_t heed_colons;
- uint32_t label_inc;
- bool_t default_occur;
+ Schema *schema;
+ Analyzer *analyzer;
+ CharBuf *default_boolop;
+ VArray *fields;
+ QueryLexer *lexer;
+ CharBuf *phrase_label;
+ CharBuf *bool_group_label;
+ bool_t heed_colons;
+ uint32_t label_inc;
+ bool_t default_occur;
inert incremented QueryParser*
new(Schema *schema, Analyzer *analyzer = NULL,
Added: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c
URL:
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c?rev=1355638&view=auto
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c (added)
+++ lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c Sat Jun 30 05:30:10
2012
@@ -0,0 +1,288 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define C_LUCY_QUERYLEXER
+#include <stdlib.h>
+#include <ctype.h>
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Search/QueryParser/QueryLexer.h"
+#include "Lucy/Search/QueryParser.h"
+#include "Lucy/Search/QueryParser/ParserElem.h"
+
+#define TOKEN_OPEN_PAREN LUCY_QPARSER_TOKEN_OPEN_PAREN
+#define TOKEN_CLOSE_PAREN LUCY_QPARSER_TOKEN_CLOSE_PAREN
+#define TOKEN_MINUS LUCY_QPARSER_TOKEN_MINUS
+#define TOKEN_PLUS LUCY_QPARSER_TOKEN_PLUS
+#define TOKEN_NOT LUCY_QPARSER_TOKEN_NOT
+#define TOKEN_OR LUCY_QPARSER_TOKEN_OR
+#define TOKEN_AND LUCY_QPARSER_TOKEN_AND
+#define TOKEN_FIELD LUCY_QPARSER_TOKEN_FIELD
+#define TOKEN_STRING LUCY_QPARSER_TOKEN_STRING
+#define TOKEN_QUERY LUCY_QPARSER_TOKEN_QUERY
+
+static ParserElem*
+S_consume_keyword(ZombieCharBuf *qstring, const char *keyword,
+ size_t keyword_len, int type);
+
+static ParserElem*
+S_consume_field(ZombieCharBuf *qstring);
+
+static ParserElem*
+S_consume_text(ZombieCharBuf *qstring);
+
+static ParserElem*
+S_consume_quoted_string(ZombieCharBuf *qstring);
+
+// Constructor: make a blank QueryLexer object from its VTable and hand it
+// straight to QueryLexer_init(), returning whatever the initializer returns.
+QueryLexer*
+QueryLexer_new() {
+    return QueryLexer_init((QueryLexer*)VTable_Make_Obj(QUERYLEXER));
+}
+
+// Initializer: a fresh lexer starts with "field:" recognition disabled
+// (heed_colons == false).  Returns self so calls can be chained.
+QueryLexer*
+QueryLexer_init(QueryLexer *self) {
+ self->heed_colons = false;
+ return self;
+}
+
+// Accessor: report whether Tokenize() currently attempts to recognize
+// "field:" prefixes.
+bool_t
+QueryLexer_heed_colons(QueryLexer *self) {
+ return self->heed_colons;
+}
+
+// Mutator: turn recognition of "field:" prefixes during tokenization on or
+// off.
+void
+QueryLexer_set_heed_colons(QueryLexer *self, bool_t heed_colons) {
+ self->heed_colons = heed_colons;
+}
+
+// Split a query string into a flat sequence of ParserElem tokens: parens,
+// plus/minus prefix operators, the keywords OR/AND/NOT, quoted strings, free
+// text, and -- when heed_colons is enabled -- "field:" prefixes.  A NULL
+// query_string is handled as an empty string.  Returns a new VArray.
+VArray*
+QueryLexer_tokenize(QueryLexer *self, const CharBuf *query_string) {
+    // Work on a private copy, wrapped in a ZombieCharBuf which is consumed
+    // from the front as tokens are recognized.
+    CharBuf *copy;
+    if (query_string) {
+        copy = CB_Clone(query_string);
+    }
+    else {
+        copy = CB_new_from_trusted_utf8("", 0);
+    }
+    ZombieCharBuf *remaining = ZCB_WRAP((CharBuf*)copy);
+    VArray        *tokens    = VA_new(0);
+    ZCB_Trim(remaining);
+
+    while (ZCB_Get_Size(remaining)) {
+        // Fast-forward past whitespace, then re-test for end of input.
+        if (ZCB_Trim_Top(remaining)) {
+            continue;
+        }
+
+        // A field prefix becomes its own token, pushed ahead of whatever
+        // token follows it.
+        if (self->heed_colons) {
+            ParserElem *field_elem = S_consume_field(remaining);
+            if (field_elem) {
+                VA_Push(tokens, (Obj*)field_elem);
+            }
+        }
+
+        ParserElem *elem = NULL;
+        const uint32_t lead = ZCB_Code_Point_At(remaining, 0);
+
+        if (lead == '(') {
+            ZCB_Nip(remaining, 1);
+            elem = ParserElem_new(TOKEN_OPEN_PAREN, NULL);
+        }
+        else if (lead == ')') {
+            ZCB_Nip(remaining, 1);
+            elem = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
+        }
+        else if (lead == '+') {
+            // Only an operator when glued to a following non-whitespace
+            // character; a lone "+" is plain text.
+            bool_t is_operator
+                = ZCB_Get_Size(remaining) > 1
+                  && !StrHelp_is_whitespace(ZCB_Code_Point_At(remaining, 1));
+            if (is_operator) {
+                elem = ParserElem_new(TOKEN_PLUS, NULL);
+            }
+            else {
+                elem = ParserElem_new(TOKEN_STRING, (Obj*)CB_newf("+"));
+            }
+            ZCB_Nip(remaining, 1);
+        }
+        else if (lead == '-') {
+            // Same rule as for "+".
+            bool_t is_operator
+                = ZCB_Get_Size(remaining) > 1
+                  && !StrHelp_is_whitespace(ZCB_Code_Point_At(remaining, 1));
+            if (is_operator) {
+                elem = ParserElem_new(TOKEN_MINUS, NULL);
+            }
+            else {
+                elem = ParserElem_new(TOKEN_STRING, (Obj*)CB_newf("-"));
+            }
+            ZCB_Nip(remaining, 1);
+        }
+        else if (lead == '"') {
+            elem = S_consume_quoted_string(remaining);
+        }
+        else if (lead == 'O') {
+            // Possibly the keyword OR; otherwise ordinary text.
+            elem = S_consume_keyword(remaining, "OR", 2, TOKEN_OR);
+            if (!elem) {
+                elem = S_consume_text(remaining);
+            }
+        }
+        else if (lead == 'A') {
+            // Possibly the keyword AND; otherwise ordinary text.
+            elem = S_consume_keyword(remaining, "AND", 3, TOKEN_AND);
+            if (!elem) {
+                elem = S_consume_text(remaining);
+            }
+        }
+        else if (lead == 'N') {
+            // Possibly the keyword NOT; otherwise ordinary text.
+            elem = S_consume_keyword(remaining, "NOT", 3, TOKEN_NOT);
+            if (!elem) {
+                elem = S_consume_text(remaining);
+            }
+        }
+        else {
+            elem = S_consume_text(remaining);
+        }
+
+        VA_Push(tokens, (Obj*)elem);
+    }
+
+    DECREF(copy);
+    return tokens;
+}
+
+
+// Try to match `keyword` (keyword_len code points long) at the front of
+// qstring.  The match only counts when the keyword is followed by
+// whitespace, a double quote, a paren, '+' or '-'; a keyword which runs up
+// against the end of the string or against any other character is rejected.
+// On success the keyword is nipped off qstring and a payload-less ParserElem
+// of the given type is returned; otherwise qstring is untouched and NULL is
+// returned.
+static ParserElem*
+S_consume_keyword(ZombieCharBuf *qstring, const char *keyword,
+                  size_t keyword_len, int type) {
+    if (!ZCB_Starts_With_Str(qstring, keyword, keyword_len)) {
+        return NULL;
+    }
+
+    const uint32_t next = ZCB_Code_Point_At(qstring, keyword_len);
+    switch (next) {
+        case 0:
+            // Nothing follows the keyword.
+            return NULL;
+        case '"':
+        case '(':
+        case ')':
+        case '+':
+        case '-':
+            break;
+        default:
+            if (!StrHelp_is_whitespace(next)) {
+                return NULL;
+            }
+            break;
+    }
+
+    ZCB_Nip(qstring, keyword_len);
+    return ParserElem_new(type, NULL);
+}
+
+// Try to consume a "fieldname:" prefix from the front of qstring.
+//
+// A field name starts with an ASCII letter or underscore, continues with
+// ASCII alphanumerics or underscores, and is terminated by a colon.  The
+// colon must in turn be followed by something which can begin a term: an
+// alphanumeric, an underscore, any code point above 127, a double quote, or
+// an open paren.
+//
+// On success the name and its colon are nipped off qstring and a TOKEN_FIELD
+// ParserElem holding a copy of the name (without the colon) is returned.
+// Otherwise qstring is left untouched and NULL is returned.
+static ParserElem*
+S_consume_field(ZombieCharBuf *qstring) {
+    size_t tick = 0;
+
+    // The <ctype.h> classifiers are only defined for arguments representable
+    // as unsigned char (or EOF), so code points are range-checked before
+    // being handed to isalpha()/isalnum().
+
+    // Field name constructs must start with a letter or underscore.
+    uint32_t code_point = ZCB_Code_Point_At(qstring, tick);
+    if ((code_point < 128 && isalpha((int)code_point))
+        || code_point == '_'
+       ) {
+        tick++;
+    }
+    else {
+        return NULL;
+    }
+
+    // Only alphanumerics and underscores are allowed in field names.
+    while (1) {
+        code_point = ZCB_Code_Point_At(qstring, tick);
+        if ((code_point < 128 && isalnum((int)code_point))
+            || code_point == '_'
+           ) {
+            tick++;
+        }
+        else if (code_point == ':') {
+            tick++;
+            break;
+        }
+        else {
+            // Includes running off the end of the string (code point 0).
+            return NULL;
+        }
+    }
+
+    // Field name constructs must be followed by something sensible.
+    uint32_t lookahead = ZCB_Code_Point_At(qstring, tick);
+    bool_t sensible = lookahead > 127
+                      || isalnum((int)lookahead)
+                      || lookahead == '_'
+                      || lookahead == '"'
+                      || lookahead == '(';
+    if (!sensible) {
+        return NULL;
+    }
+
+    // Consume string data.  `tick` counts the name plus the colon, so the
+    // name itself is the first tick - 1 code points.
+    ZombieCharBuf *field = ZCB_WRAP((CharBuf*)qstring);
+    ZCB_Truncate(field, tick - 1);
+    ZCB_Nip(qstring, tick);
+    return ParserElem_new(TOKEN_FIELD, (Obj*)ZCB_Clone(field));
+}
+
+// Consume a run of free text from the front of qstring and return it as a
+// TOKEN_STRING ParserElem.
+//
+// The run ends at whitespace, a double quote, a paren, or the end of the
+// string.  A backslash escapes the code point after it; both the backslash
+// and the escaped code point are kept in the token text.
+//
+// The terminating delimiter is NOT consumed: it is left at the front of
+// qstring so that the tokenizer can turn it into its own token (or trim it,
+// in the case of whitespace).  Nipping it here would silently drop parens
+// and opening quotes which directly follow a term, e.g. in `foo(bar)`.
+static ParserElem*
+S_consume_text(ZombieCharBuf *qstring) {
+    ZombieCharBuf *text = ZCB_WRAP((CharBuf*)qstring);
+    size_t tick = 0;  // Number of code points belonging to the token.
+
+    while (ZCB_Get_Size(qstring)) {
+        // Peek, so that a delimiter can be left in place.
+        uint32_t code_point = ZCB_Code_Point_At(qstring, 0);
+
+        if (code_point == '\\') {
+            // Keep the backslash...
+            ZCB_Nip_One(qstring);
+            tick++;
+            // ... and the escaped code point, unless there is none.
+            if (ZCB_Nip_One(qstring) == 0) {
+                break;
+            }
+            tick++;
+        }
+        else if (code_point == 0) {
+            // A NUL ends the token and is discarded.  Consuming it
+            // guarantees forward progress for the caller.
+            ZCB_Nip_One(qstring);
+            break;
+        }
+        else if (StrHelp_is_whitespace(code_point)
+                 || code_point == '"'
+                 || code_point == '('
+                 || code_point == ')'
+                ) {
+            if (tick == 0) {
+                // Defensive: never return without consuming anything, or
+                // the tokenizer loop could spin forever.  The tokenizer
+                // does not call this function on a leading delimiter.
+                ZCB_Nip_One(qstring);
+            }
+            break;
+        }
+        else {
+            ZCB_Nip_One(qstring);
+            tick++;
+        }
+    }
+
+    ZCB_Truncate(text, tick);
+    return ParserElem_new(TOKEN_STRING, (Obj*)ZCB_Clone(text));
+}
+
+// Consume a double-quoted string from the front of qstring and return it as
+// a TOKEN_STRING ParserElem.  The token text includes the opening quote and,
+// when present, the closing quote.  A backslash escapes the code point which
+// follows it.  An unterminated string simply extends to the end of input.
+// Throws if qstring does not begin with a double quote.
+static ParserElem*
+S_consume_quoted_string(ZombieCharBuf *qstring) {
+    ZombieCharBuf *text = ZCB_WRAP((CharBuf*)qstring);
+    if (ZCB_Nip_One(qstring) != '"') {
+        THROW(ERR, "Internal error: expected a quote");
+    }
+
+    // Count of code points in the token; the opening quote is the first.
+    size_t tick = 1;
+    bool_t done = false;
+    while (!done) {
+        switch (ZCB_Nip_One(qstring)) {
+            case 0:
+                // Ran out of input before a closing quote.
+                done = true;
+                break;
+            case '"':
+                // Closing quote is part of the token.
+                tick += 1;
+                done = true;
+                break;
+            case '\\':
+                // Backslash plus the code point it escapes.
+                ZCB_Nip_One(qstring);
+                tick += 2;
+                break;
+            default:
+                tick += 1;
+                break;
+        }
+    }
+
+    ZCB_Truncate(text, tick);
+    return ParserElem_new(TOKEN_STRING, (Obj*)ZCB_Clone(text));
+}
+
Propchange: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.c
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh
URL:
http://svn.apache.org/viewvc/lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh?rev=1355638&view=auto
==============================================================================
--- lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh (added)
+++ lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh Sat Jun 30 05:30:10
2012
@@ -0,0 +1,45 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+/** Helper class for QueryParser which breaks a query string up into an
+ * array of ParserElem tokens.
+ */
+class Lucy::Search::QueryParser::QueryLexer inherits Lucy::Object::Obj {
+
+ /* Whether <code>fieldname:foo</code> constructs are recognized while
+ * tokenizing.
+ */
+ bool_t heed_colons;
+
+ /** Constructor. Takes no arguments.
+ */
+ inert incremented QueryLexer*
+ new();
+
+ /** Initializer. Recognition of <code>fieldname:foo</code> constructs
+ * starts out disabled.
+ */
+ inert QueryLexer*
+ init(QueryLexer *self);
+
+ /** Parse a query string into an array of tokens.
+ *
+ * @param query_string The string to be parsed. May be NULL.
+ * @return an array of ParserElems.
+ */
+ incremented VArray*
+ Tokenize(QueryLexer *self, const CharBuf *query_string = NULL);
+
+ /** Indicate whether parsing of <code>fieldname:foo</code> constructs is
+ * currently enabled.
+ */
+ bool_t
+ Heed_Colons(QueryLexer *self);
+
+ /** Enable/disable parsing of <code>fieldname:foo</code> constructs.
+ */
+ void
+ Set_Heed_Colons(QueryLexer *self, bool_t heed_colons);
+}
+
Propchange: lucy/trunk/core/Lucy/Search/QueryParser/QueryLexer.cfh
------------------------------------------------------------------------------
svn:eol-style = native