Fixing issues with the parser. It now analyzes query input correctly for fuzzy, prefix, wildcard, range, regex, etc.
Project: http://git-wip-us.apache.org/repos/asf/incubator-blur/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-blur/commit/f8c45c38 Tree: http://git-wip-us.apache.org/repos/asf/incubator-blur/tree/f8c45c38 Diff: http://git-wip-us.apache.org/repos/asf/incubator-blur/diff/f8c45c38 Branch: refs/heads/master Commit: f8c45c38051d02e775cb826c502b7b1e25270a67 Parents: 828c127 Author: Aaron McCurry <amccu...@gmail.com> Authored: Mon Sep 21 15:46:46 2015 -0400 Committer: Aaron McCurry <amccu...@gmail.com> Committed: Mon Sep 21 15:46:46 2015 -0400 ---------------------------------------------------------------------- .../blur/lucene/search/BlurQueryParser.java | 203 ++++++++++++++++++- .../apache/blur/lucene/search/SuperParser.java | 24 ++- .../blur/lucene/search/SuperParserTest.java | 155 +++++++++++++- 3 files changed, 377 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/f8c45c38/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java ---------------------------------------------------------------------- diff --git a/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java b/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java index 801c73f..dd95aaa 100644 --- a/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java +++ b/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java @@ -17,14 +17,28 @@ package org.apache.blur.lucene.search; * limitations under the License. 
*/ import java.io.IOException; +import java.io.StringReader; +import java.text.DateFormat; +import java.util.Calendar; +import java.util.Date; import java.util.HashMap; +import java.util.Locale; import java.util.Map; +import java.util.TimeZone; import java.util.UUID; import org.apache.blur.analysis.FieldManager; +import org.apache.blur.analysis.FieldTypeDefinition; import org.apache.blur.utils.BlurConstants; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; @@ -39,11 +53,17 @@ public class BlurQueryParser extends QueryParser { protected final Map<Query, String> _fieldNames; protected final FieldManager _fieldManager; + protected final Locale _locale = Locale.getDefault(); + protected final TimeZone _timeZone = TimeZone.getDefault(); + protected final boolean _allowLeadingWildcard; + protected final int _fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; + public BlurQueryParser(Version matchVersion, String f, Map<Query, String> fieldNames, FieldManager fieldManager) { super(matchVersion, f, fieldManager.getAnalyzerForQuery()); _fieldNames = fieldNames == null ? 
new HashMap<Query, String>() : fieldNames; _fieldManager = fieldManager; - setAllowLeadingWildcard(true); + _allowLeadingWildcard = true; + setAllowLeadingWildcard(_allowLeadingWildcard); setAutoGeneratePhraseQueries(true); } @@ -129,7 +149,7 @@ public class BlurQueryParser extends QueryParser { return addField(super.newRangeQuery(resolvedField, part1, part2, startInclusive, endInclusive), resolvedField); } - private void customQueryCheck(String field) { + protected void customQueryCheck(String field) { try { Boolean b = _fieldManager.checkSupportForCustomQuery(field); if (b != null && b) { @@ -206,9 +226,186 @@ public class BlurQueryParser extends QueryParser { return addField(super.newRegexpQuery(new Term(resolvedField, t.text())), resolvedField); } - private Query addField(Query q, String field) { + protected Query addField(Query q, String field) { _fieldNames.put(q, field); return q; } + protected String analyzeField(String field, String text) throws ParseException { + try { + FieldTypeDefinition fieldTypeDefinition = _fieldManager.getFieldTypeDefinition(field); + if (fieldTypeDefinition == null) { + return text; + } + Analyzer analyzerForQuery = fieldTypeDefinition.getAnalyzerForQuery(field); + if (analyzerForQuery instanceof KeywordAnalyzer) { + return text; + } + + StringBuilder builder = new StringBuilder(); + StringBuilder result = new StringBuilder(); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (isSpecialChar(c) && !isEscaped(text, i - 1)) { + if (builder.length() > 0) { + result.append(analyze(field, builder.toString(), analyzerForQuery)); + builder.setLength(0); + } + if (isSpecialRange(c)) { + char closingChar = getClosingChar(c); + int indexOf = text.indexOf(closingChar, i); + if (indexOf < 0) { + throw new ParseException("Could not find closing char [" + closingChar + "] in text [" + text + "]"); + } + String s = text.substring(i, indexOf + 1); + result.append(s); + i += s.length() - 1; + } else { + result.append(c); + 
} + } else { + builder.append(c); + } + } + if (builder.length() > 0) { + result.append(analyze(field, builder.toString(), analyzerForQuery)); + builder.setLength(0); + } + return result.toString(); + } catch (IOException e) { + throw new ParseException(e.getMessage()); + } + } + + private char getClosingChar(char c) throws ParseException { + switch (c) { + case '[': + return ']'; + default: + throw new ParseException("Closing char for " + c + " not found."); + } + } + + private boolean isSpecialRange(char c) { + switch (c) { + case '[': + return true; + case '{': + return true; + default: + return false; + } + } + + private boolean isSpecialChar(char c) { + switch (c) { + case '?': + case '/': + case '[': + case ']': + case '}': + case '{': + case '*': + return true; + default: + return false; + } + } + + private boolean isEscaped(String text, int pos) { + if (pos == 0) { + return false; + } + return text.charAt(pos) == '\\'; + } + + private String analyze(String field, String text, Analyzer analyzerForQuery) throws IOException, ParseException { + StringBuilder result = new StringBuilder(); + TokenStream tokenStream = analyzerForQuery.tokenStream(field, new StringReader(text)); + CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class); + tokenStream.reset(); + if (tokenStream.incrementToken()) { + result.append(termAttribute.toString()); + } + if (tokenStream.incrementToken()) { + throw new ParseException("Should not have multiple tokens in text [" + text + "] for field [" + field + "]."); + } + return result.toString(); + } + + @Override + protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) + throws ParseException { + part1 = part1 == null ? null : analyzeField(field, part1); + part2 = part2 == null ? 
null : analyzeField(field, part2); + + DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, _locale); + df.setLenient(true); + DateTools.Resolution resolution = getDateResolution(field); + + try { + part1 = DateTools.dateToString(df.parse(part1), resolution); + } catch (Exception e) { + } + + try { + Date d2 = df.parse(part2); + if (endInclusive) { + // The user can only specify the date, not the time, so make sure + // the time is set to the latest possible time of that date to really + // include all documents: + Calendar cal = Calendar.getInstance(_timeZone, _locale); + cal.setTime(d2); + cal.set(Calendar.HOUR_OF_DAY, 23); + cal.set(Calendar.MINUTE, 59); + cal.set(Calendar.SECOND, 59); + cal.set(Calendar.MILLISECOND, 999); + d2 = cal.getTime(); + } + part2 = DateTools.dateToString(d2, resolution); + } catch (Exception e) { + } + return newRangeQuery(field, part1, part2, startInclusive, endInclusive); + } + + @Override + protected Query getWildcardQuery(String field, String termStr) throws ParseException { + if ("*".equals(field)) { + if ("*".equals(termStr)) { + return newMatchAllDocsQuery(); + } + } + if (!_allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?"))) { + throw new ParseException("'*' or '?' 
not allowed as first character in WildcardQuery"); + } + if (!"*".equals(termStr)) { + termStr = analyzeField(field, termStr); + } + Term t = new Term(field, termStr); + return newWildcardQuery(t); + } + + @Override + protected Query getRegexpQuery(String field, String termStr) throws ParseException { + termStr = analyzeField(field, termStr); + Term t = new Term(field, termStr); + return newRegexpQuery(t); + } + + @Override + protected Query getPrefixQuery(String field, String termStr) throws ParseException { + if (!_allowLeadingWildcard && termStr.startsWith("*")) + throw new ParseException("'*' not allowed as first character in PrefixQuery"); + termStr = analyzeField(field, termStr); + Term t = new Term(field, termStr); + return newPrefixQuery(t); + } + + @Override + protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { + termStr = analyzeField(field, termStr); + Term t = new Term(field, termStr); + return newFuzzyQuery(t, minSimilarity, _fuzzyPrefixLength); + } + } http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/f8c45c38/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java ---------------------------------------------------------------------- diff --git a/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java b/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java index b2bc871..40cde15 100644 --- a/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java +++ b/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java @@ -274,11 +274,30 @@ public class SuperParser extends BlurQueryParser { private boolean isSameGroupName(BooleanQuery booleanQuery) { String groupName = findFirstGroupName(booleanQuery); if (groupName == null) { + if (allFieldQueriesAreSystemFields(booleanQuery)) { + return true; + } return false; } return isSameGroupName(booleanQuery, groupName); } + private boolean allFieldQueriesAreSystemFields(Query 
query) { + if (query instanceof BooleanQuery) { + BooleanQuery booleanQuery = (BooleanQuery) query; + for (BooleanClause clause : booleanQuery.clauses()) { + if (!allFieldQueriesAreSystemFields(clause.getQuery())) { + return false; + } + } + return true; + } else if (query instanceof SuperQuery) { + return allFieldQueriesAreSystemFields(((SuperQuery) query).getQuery()); + } else { + return isSystemField(_fieldNames.get(query)); + } + } + private boolean isSameGroupName(Query query, String groupName) { if (query instanceof BooleanQuery) { BooleanQuery booleanQuery = (BooleanQuery) query; @@ -317,7 +336,10 @@ public class SuperParser extends BlurQueryParser { if (query instanceof BooleanQuery) { BooleanQuery booleanQuery = (BooleanQuery) query; for (BooleanClause clause : booleanQuery.clauses()) { - return findFirstGroupName(clause.getQuery()); + String groupName = findFirstGroupName(clause.getQuery()); + if (groupName != null) { + return groupName; + } } return null; } else if (query instanceof SuperQuery) { http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/f8c45c38/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java ---------------------------------------------------------------------- diff --git a/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java b/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java index 937589f..f3d8326 100644 --- a/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java +++ b/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java @@ -22,8 +22,10 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; +import java.lang.reflect.Field; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.concurrent.TimeUnit; @@ -41,12 +43,18 @@ import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.AutomatonQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.NumericRangeQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; @@ -96,6 +104,7 @@ public class SuperParserTest { fieldManager.addColumnDefinitionDouble("a", "id_d"); fieldManager.addColumnDefinitionFloat("a", "id_f"); fieldManager.addColumnDefinitionLong("a", "id_l"); + fieldManager.addColumnDefinitionString("a", "id_s"); fieldManager.addColumnDefinitionDate("a", "id_date", "yyyy-MM-dd"); fieldManager.addColumnDefinitionGisRecursivePrefixTree("a", "id_gis"); return fieldManager; @@ -565,6 +574,84 @@ public class SuperParserTest { assertTrue(equals); } + @Test + public void test48() throws ParseException { + Query q = parseSq("<a.id_s:ABC*>"); + Query q1 = sq(pq("a.id_s", "ABC")); + assertQuery(q, q1); + } + + @Test + public void test49() throws ParseException { + Query q = parseSq("<the cow Jumped Over the moon>"); + Query q1 = sq(bq(bc(tq("super", "the")), bc(tq("super", "cow")), bc(tq("super", "jumped")), + bc(tq("super", "over")), bc(tq("super", "the")), bc(tq("super", "moon")))); + assertQuery(q, q1); + } + + @Test + public void test50() throws ParseException { + Query q = parseSq("<the cow Jumped Over the moon a.id_s:ABC>"); + Query q1 = sq(bq(bc(tq("super", "the")), 
bc(tq("super", "cow")), bc(tq("super", "jumped")), + bc(tq("super", "over")), bc(tq("super", "the")), bc(tq("super", "moon")), bc(tq("a.id_s", "ABC")))); + assertQuery(q, q1); + } + + @Test + public void test51() throws ParseException { + Query q = parseSq("<Here* We Go*>"); + Query q1 = sq(bq(bc(pq("super", "here")), bc(tq("super", "we")), bc(pq("super", "go")))); + assertQuery(q, q1); + } + + @Test + public void test52() throws ParseException { + Query q = parseSq("<He?e We Go*>"); + Query q1 = sq(bq(bc(wq("super", "he?e")), bc(tq("super", "we")), bc(pq("super", "go")))); + assertQuery(q, q1); + } + + @Test + public void test53() throws ParseException { + Query q = parseSq("</He[rR]e/ We Go*>"); + Query q1 = sq(bq(bc(rxq("super", "he[rR]e")), bc(tq("super", "we")), bc(pq("super", "go")))); + assertQuery(q, q1); + } + + @Test + public void test54() throws ParseException { + Query q = parseSq("<Here~1 We Go*>"); + Query q1 = sq(bq(bc(fzq("super", "here", 1)), bc(tq("super", "we")), bc(pq("super", "go")))); + assertQuery(q, q1); + } + + private Query fzq(String field, String text, int maxEdits) { + return new FuzzyQuery(new Term(field, text), maxEdits); + } + + @Test + public void test55() throws ParseException { + Query q = parseSq("<a.id_s:[A TO Z}>"); + Query q1 = sq(rq_ie("a.id_s", "A", "Z")); + assertQuery(q, q1); + } + + private Query rq_ie(String field, String part1, String part2) { + return TermRangeQuery.newStringRange(field, part1, part2, true, false); + } + + private RegexpQuery rxq(String field, String text) { + return new RegexpQuery(new Term(field, text)); + } + + private WildcardQuery wq(String field, String text) { + return new WildcardQuery(new Term(field, text)); + } + + private PrefixQuery pq(String field, String text) { + return new PrefixQuery(new Term(field, text)); + } + public static BooleanClause bc_m(Query q) { return new BooleanClause(q, Occur.MUST); } @@ -591,6 +678,22 @@ public class SuperParserTest { assertEqualsSuperQuery((SuperQuery) 
expected, (SuperQuery) actual); } else if (expected instanceof TermQuery) { assertEqualsTermQuery((TermQuery) expected, (TermQuery) actual); + } else if (expected instanceof PrefixQuery) { + assertEqualsPrefixQuery((PrefixQuery) expected, (PrefixQuery) actual); + } else if (expected instanceof WildcardQuery) { + assertEqualsWildcardQuery((WildcardQuery) expected, (WildcardQuery) actual); + } else if (expected instanceof FuzzyQuery) { + assertEqualsFuzzyQuery((FuzzyQuery) expected, (FuzzyQuery) actual); + } else if (expected instanceof RegexpQuery) { + assertEqualsRegexpQuery((RegexpQuery) expected, (RegexpQuery) actual); + } else if (expected instanceof TermRangeQuery) { + assertEqualsTermRangeQuery((TermRangeQuery) expected, (TermRangeQuery) actual); + } else if (expected instanceof MatchAllDocsQuery) { + assertEqualsMatchAllDocsQuery((MatchAllDocsQuery) expected, (MatchAllDocsQuery) actual); + } else if (expected instanceof MultiPhraseQuery) { + assertEqualsMultiPhraseQuery((MultiPhraseQuery) expected, (MultiPhraseQuery) actual); + } else if (expected instanceof PhraseQuery) { + assertEqualsPhraseQuery((PhraseQuery) expected, (PhraseQuery) actual); } else if (expected instanceof NumericRangeQuery<?>) { assertEqualsNumericRangeQuery((NumericRangeQuery<?>) expected, (NumericRangeQuery<?>) actual); } else { @@ -598,6 +701,56 @@ public class SuperParserTest { } } + private static void assertEqualsFuzzyQuery(FuzzyQuery expected, FuzzyQuery actual) { + assertEquals(expected.getField(), actual.getField()); + assertEquals(expected.getTerm(), actual.getTerm()); + assertEquals(expected.getMaxEdits(), actual.getMaxEdits()); + } + + private static void assertEqualsPhraseQuery(PhraseQuery expected, PhraseQuery actual) { + assertTrue(Arrays.equals(expected.getTerms(), actual.getTerms())); + assertTrue(Arrays.equals(expected.getPositions(), actual.getPositions())); + } + + private static void assertEqualsMultiPhraseQuery(MultiPhraseQuery expected, MultiPhraseQuery actual) { + 
throw new RuntimeException("Not Implemented"); + } + + private static void assertEqualsMatchAllDocsQuery(MatchAllDocsQuery expected, MatchAllDocsQuery actual) { + // do nothing + } + + private static void assertEqualsTermRangeQuery(TermRangeQuery expected, TermRangeQuery actual) { + assertEquals(expected.getField(), actual.getField()); + assertEquals(expected.getLowerTerm(), actual.getLowerTerm()); + assertEquals(expected.getUpperTerm(), actual.getUpperTerm()); + } + + private static void assertEqualsRegexpQuery(RegexpQuery expected, RegexpQuery actual) { + assertEquals(expected.getField(), actual.getField()); + assertEquals(getTerm(expected), getTerm(actual)); + } + + private static Term getTerm(RegexpQuery regexpQuery) { + try { + Field field = AutomatonQuery.class.getDeclaredField("term"); + field.setAccessible(true); + return (Term) field.get(regexpQuery); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private static void assertEqualsWildcardQuery(WildcardQuery expected, WildcardQuery actual) { + assertEquals(expected.getField(), actual.getField()); + assertEquals(expected.getTerm(), actual.getTerm()); + } + + private static void assertEqualsPrefixQuery(PrefixQuery expected, PrefixQuery actual) { + assertEquals(expected.getField(), actual.getField()); + assertEquals(expected.getPrefix(), actual.getPrefix()); + } + public static void assertEqualsTermQuery(TermQuery expected, TermQuery actual) { Term term1 = expected.getTerm(); Term term2 = actual.getTerm(); @@ -609,7 +762,7 @@ public class SuperParserTest { } public static void assertEqualsSuperQuery(SuperQuery expected, SuperQuery actual) { - assertEquals(expected.getQuery(), actual.getQuery()); + assertEqualsQuery(expected.getQuery(), actual.getQuery()); } public static void assertEqualsBooleanQuery(BooleanQuery expected, BooleanQuery actual) {