[
https://issues.apache.org/jira/browse/LUCENE-949?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13630226#comment-13630226
]
Robert Muir commented on LUCENE-949:
------------------------------------
Thank you, Timothy. The patch looks very good to me; thanks also for adding
tests!
A few questions:
* The regex simplification looks good to me, but I'm not a regex expert. Maybe
someone who is better with regex, like [~steve_rowe], can have a look. If nobody
objects after a few days, I'm inclined to move forward though.
* What about the case where someone has escaped wildcards? I'm not sure what's
even happening today in this case... perhaps it already has surprising behavior
and should really be a separate bug, or maybe it's working and I just don't see
it. I doubt it's tested though... but it seems the regex would need to
accommodate that?
* The String.format() invocations should probably pass getLocale() from the
superclass as the first argument, rather than depending on the default locale.
Since they are locale-sensitive, I think it's best to use the one that someone
configured on the query parser (e.g. via setLocale).
> AnalyzingQueryParser can't work with leading wildcards.
> -------------------------------------------------------
>
> Key: LUCENE-949
> URL: https://issues.apache.org/jira/browse/LUCENE-949
> Project: Lucene - Core
> Issue Type: Bug
> Components: core/queryparser
> Affects Versions: 2.2
> Reporter: Stefan Klein
> Attachments: AnalyzingQueryParser.java, LUCENE-949.patch
>
>
> The getWildcardQuery method in AnalyzingQueryParser.java needs the following
> changes to accept leading wildcards:
> protected Query getWildcardQuery(String field, String termStr) throws
> ParseException
> {
> String useTermStr = termStr;
> String leadingWildcard = null;
> if ("*".equals(field))
> {
> if ("*".equals(useTermStr))
> return new MatchAllDocsQuery();
> }
> boolean hasLeadingWildcard = (useTermStr.startsWith("*") ||
> useTermStr.startsWith("?")) ? true : false;
> if (!getAllowLeadingWildcard() && hasLeadingWildcard)
> throw new ParseException("'*' or '?' not allowed as
> first character in WildcardQuery");
> if (getLowercaseExpandedTerms())
> {
> useTermStr = useTermStr.toLowerCase();
> }
> if (hasLeadingWildcard)
> {
> leadingWildcard = useTermStr.substring(0, 1);
> useTermStr = useTermStr.substring(1);
> }
> List tlist = new ArrayList();
> List wlist = new ArrayList();
> /*
> * somewhat a hack: find/store wildcard chars in order to put
> them back
> * after analyzing
> */
> boolean isWithinToken = (!useTermStr.startsWith("?") &&
> !useTermStr.startsWith("*"));
> isWithinToken = true;
> StringBuffer tmpBuffer = new StringBuffer();
> char[] chars = useTermStr.toCharArray();
> for (int i = 0; i < useTermStr.length(); i++)
> {
> if (chars[i] == '?' || chars[i] == '*')
> {
> if (isWithinToken)
> {
> tlist.add(tmpBuffer.toString());
> tmpBuffer.setLength(0);
> }
> isWithinToken = false;
> }
> else
> {
> if (!isWithinToken)
> {
> wlist.add(tmpBuffer.toString());
> tmpBuffer.setLength(0);
> }
> isWithinToken = true;
> }
> tmpBuffer.append(chars[i]);
> }
> if (isWithinToken)
> {
> tlist.add(tmpBuffer.toString());
> }
> else
> {
> wlist.add(tmpBuffer.toString());
> }
> // get Analyzer from superclass and tokenize the term
> TokenStream source = getAnalyzer().tokenStream(field, new
> StringReader(useTermStr));
> org.apache.lucene.analysis.Token t;
> int countTokens = 0;
> while (true)
> {
> try
> {
> t = source.next();
> }
> catch (IOException e)
> {
> t = null;
> }
> if (t == null)
> {
> break;
> }
> if (!"".equals(t.termText()))
> {
> try
> {
> tlist.set(countTokens++, t.termText());
> }
> catch (IndexOutOfBoundsException ioobe)
> {
> countTokens = -1;
> }
> }
> }
> try
> {
> source.close();
> }
> catch (IOException e)
> {
> // ignore
> }
> if (countTokens != tlist.size())
> {
> /*
> * this means that the analyzer used either added or
> consumed
> * (common for a stemmer) tokens, and we can't build a
> WildcardQuery
> */
> throw new ParseException("Cannot build WildcardQuery
> with analyzer " + getAnalyzer().getClass()
> + " - tokens added or lost");
> }
> if (tlist.size() == 0)
> {
> return null;
> }
> else if (tlist.size() == 1)
> {
> if (wlist.size() == 1)
> {
> /*
> * if wlist contains one wildcard, it must be
> at the end,
> * because: 1) wildcards at 1st position of a
> term by
> * QueryParser where truncated 2) if wildcard
> was *not* in end,
> * there would be *two* or more tokens
> */
> StringBuffer sb = new StringBuffer();
> if (hasLeadingWildcard)
> {
> // adding leadingWildcard
> sb.append(leadingWildcard);
> }
> sb.append((String) tlist.get(0));
> sb.append(wlist.get(0).toString());
> return super.getWildcardQuery(field,
> sb.toString());
> }
> else if (wlist.size() == 0 && hasLeadingWildcard)
> {
> /*
> * if wlist contains no wildcard, it must be at
> 1st position
> */
> StringBuffer sb = new StringBuffer();
> if (hasLeadingWildcard)
> {
> // adding leadingWildcard
> sb.append(leadingWildcard);
> }
> sb.append((String) tlist.get(0));
> sb.append(wlist.get(0).toString());
> return super.getWildcardQuery(field,
> sb.toString());
> }
> else
> {
> /*
> * we should never get here! if so, this method
> was called with
> * a termStr containing no wildcard ...
> */
> throw new
> IllegalArgumentException("getWildcardQuery called without wildcard");
> }
> }
> else
> {
> /*
> * the term was tokenized, let's rebuild to one token
> with wildcards
> * put back in postion
> */
> StringBuffer sb = new StringBuffer();
> if (hasLeadingWildcard)
> {
> // adding leadingWildcard
> sb.append(leadingWildcard);
> }
> for (int i = 0; i < tlist.size(); i++)
> {
> sb.append((String) tlist.get(i));
> if (wlist != null && wlist.size() > i)
> {
> sb.append((String) wlist.get(i));
> }
> }
> return super.getWildcardQuery(field, sb.toString());
> }
> }
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]