I spent some time tracking down strange OutOfMemory errors in Nutch, and once I found the cause I discovered two issues on the Nutch issue tracker that are related to it:
[ 1110947 ] Serious bug: OutOfMemoryError: Java heap space
[ 957684 ] endless loop with unbalanced quote in query
I have prepared a JUnit test and a patch for NutchAnalysis.jj. The JUnit test throws OutOfMemoryError with the current NutchAnalysis. After applying the patch, all tests succeed. At first I decided to throw a ParseException on an unbalanced quote, but after looking at how Google handles such queries and a short discussion with a friend, I chose to automatically add the missing quote at the end of the query. If you think throwing an exception is better, I can submit the earlier version. I think my solution should not change any other behavior of NutchAnalysis, but I hope it can be quickly reviewed by a more experienced JavaCC developer.
After applying the patch you can close both bugs mentioned earlier.
Regards
Piotr Kosiorowski
Index: NutchAnalysis.jj
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/analysis/NutchAnalysis.jj,v
retrieving revision 1.10
diff -u -r1.10 NutchAnalysis.jj
--- NutchAnalysis.jj 29 Sep 2004 18:55:35 -0000 1.10
+++ NutchAnalysis.jj 22 Feb 2005 20:22:42 -0000
@@ -226,7 +226,7 @@
{ end = token.endColumn; }
- <QUOTE>
+ (<QUOTE>|<EOF>)
{
if (QueryFilters.isRawField(field)) {
@@ -281,7 +281,13 @@
void nonTerm() :
{}
{
- <WHITE> | infix() | <EOF>
+ <WHITE> | infix()
+}
+
+void nonTermOrEOF() :
+{}
+{
+ nonTerm() | <EOF>
}
@@ -289,7 +295,7 @@
void nonOpOrTerm() :
{}
{
- (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTerm())))*
+ (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}
/** Characters which can be used to form compound terms. */
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.analysis;
import java.io.IOException;
import net.nutch.searcher.Query;
import junit.framework.TestCase;
/**
 * Unit tests for query parsing through {@code NutchAnalysis.parseQuery},
 * covering plain terms, the minus operator, quoted phrases and queries
 * whose quotes are unbalanced.
 *
 * @author Piotr Kosiorowski
 */
public class TestNutchAnalysis extends TestCase {

  /** Runs this test case with the JUnit text runner. */
  public static void main(String[] args) {
    junit.textui.TestRunner.run(TestNutchAnalysis.class);
  }

  /** A single word is expected to parse into one term. */
  public void testSimpleQuery() throws IOException {
    checkTerms("test", "test", new String[] {"test"});
  }

  /** Two whitespace-separated words are expected to parse into two terms. */
  public void testTwoTokenQuery() throws IOException {
    checkTerms("test abc", "test abc", new String[] {"test", "abc"});
  }

  /** A minus directly attached to a word is expected to give a prohibited clause. */
  public void testMinusQuery() throws IOException {
    checkSingleClause("-def ", "-def", true);
  }

  /** A minus separated from the word by a space is expected to be ignored. */
  public void testIgnoredMinusQuery() throws IOException {
    checkSingleClause("- def ", "def", false);
  }

  /** A properly quoted phrase is expected to keep its quotes and both terms. */
  public void testPhraseQuery() throws IOException {
    checkTerms("\"abc def\"", "\"abc def\"", new String[] {"abc", "def"});
  }

  /** A lone quote character is expected to parse into an empty query. */
  public void testBrokenQuote() throws IOException {
    checkTerms("\"", "", new String[0]);
  }

  /** An unterminated phrase is expected to be treated as if the quote were closed. */
  public void testBrokenQuote2() throws IOException {
    checkTerms("\" abc def ", "\"abc def\"", new String[] {"abc", "def"});
  }

  /** A closed phrase followed by a stray quote is expected to yield just the phrase. */
  public void testBrokenQuote3() throws IOException {
    checkTerms("\" abc def \"\"", "\"abc def\"", new String[] {"abc", "def"});
  }

  /** A closed phrase followed by an unterminated one is expected to yield two phrases. */
  public void testBrokenQuote5() throws IOException {
    checkTerms(
        "\" abc def \" \" def ghi ",
        "\"abc def\" \"def ghi\"",
        new String[] {"abc", "def", "def", "ghi"});
  }

  /**
   * Parses {@code input}, then asserts that the result is non-null and that
   * its string form equals {@code expectedText}.
   *
   * @return the parsed query, for further assertions by the caller
   */
  private static Query parseAndCheckText(String input, String expectedText)
      throws IOException {
    Query parsed = NutchAnalysis.parseQuery(input);
    assertNotNull(parsed);
    assertEquals(expectedText, parsed.toString());
    return parsed;
  }

  /**
   * Parses {@code input} and asserts its string form, the number of terms
   * and each term in order.
   */
  private static void checkTerms(String input, String expectedText, String[] expectedTerms)
      throws IOException {
    Query parsed = parseAndCheckText(input, expectedText);
    String[] actualTerms = parsed.getTerms();
    // Compare the count first so a length mismatch is reported before any element.
    assertEquals(expectedTerms.length, actualTerms.length);
    for (int i = 0; i < expectedTerms.length; i++) {
      assertEquals(expectedTerms[i], actualTerms[i]);
    }
  }

  /**
   * Parses {@code input} and asserts its string form, that it has exactly
   * one clause, and whether that clause is prohibited.
   */
  private static void checkSingleClause(String input, String expectedText, boolean prohibited)
      throws IOException {
    Query parsed = parseAndCheckText(input, expectedText);
    Query.Clause[] clauses = parsed.getClauses();
    assertEquals(1, clauses.length);
    if (prohibited) {
      assertTrue(clauses[0].isProhibited());
    } else {
      assertFalse(clauses[0].isProhibited());
    }
  }
}
