Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java Fri Mar 19 11:34:33 2010
@@ -20,6 +20,7 @@ package org.creativecommons.nutch;
 
 import org.apache.nutch.indexer.Indexer;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.document.Document;
 
 import org.apache.commons.logging.Log;
@@ -83,7 +84,7 @@ public class CCDeleteUnlicensedTool {
       File indexDone = new File(directories[i], Indexer.DONE_NAME);
       if (indexDone.exists() && indexDone.isFile()){
         File indexDir = new File(directories[i], "index");
-        IndexReader reader = IndexReader.open(indexDir);
+        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
         maxDoc += reader.maxDoc();
         vReaders.add(reader);
       }
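
A brief aside (not part of the commit): the change above reflects the Lucene 3.0 rule that IndexReader.open(...) no longer accepts a java.io.File directly, so a Directory obtained from FSDirectory.open(...) is passed instead. A minimal sketch of the new idiom; the index path and class name below are illustrative only.

// Sketch of the Lucene 3.0 reader-opening idiom used in the hunk above.
// The index location is a placeholder, not something this commit defines.
import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class OpenReaderSketch {
  public static void main(String[] args) throws IOException {
    File indexDir = new File("crawl/index");   // hypothetical index directory
    // 3.0: wrap the File in a Directory before handing it to IndexReader
    IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
    try {
      System.out.println("maxDoc = " + reader.maxDoc());
    } finally {
      reader.close();
    }
  }
}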
Modified: lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java Fri Mar 19 11:34:33 2010
@@ -80,8 +80,10 @@ public class BasicFieldFilter
 
         // create lucene fields from the FieldWritable objects
         Field.Store store = field.isStored() ? Field.Store.YES : Field.Store.NO;
-        Field.Index indexed = field.isIndexed() ? field.isTokenized()
-          ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED : Field.Index.NO;
+        Field.Index indexed =
+          field.isIndexed()
+            ? field.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED
+            : Field.Index.NO;
         Field docField = new Field(fieldName, field.getValue(), store, indexed);

Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Fri Mar 19 11:34:33 2010
@@ -25,11 +25,11 @@
 <plugin
    id="lib-lucene-analyzers"
    name="Lucene Analysers"
-   version="2.9.1"
+   version="3.0.1"
    provider-name="org.apache.lucene">
 
    <runtime>
-     <library name="lucene-analyzers-2.9.1.jar">
+     <library name="lucene-analyzers-3.0.1.jar">
        <export name="*"/>
      </library>
    </runtime>

Modified: lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java Fri Mar 19 11:34:33 2010
@@ -29,8 +29,7 @@ import org.apache.commons.logging.LogFac
 
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.RangeQuery;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.search.TermRangeQuery;
 
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
@@ -74,11 +73,12 @@ public class DateQueryFilter implements
     }
 
     // do it as lucene RangeQuery
-    Term xLower = new Term(FIELD_NAME, matcher.group(1));
-    Term xUpper = new Term(FIELD_NAME, matcher.group(2));
+    String xLower = matcher.group(1);
+    String xUpper = matcher.group(2);
 
     // inclusive
-    RangeQuery rangeQuery = new RangeQuery(xLower, xUpper, true);
+    TermRangeQuery rangeQuery = new TermRangeQuery(
+        c.getField(), xLower, xUpper, true, true);
 
     rangeQuery.setBoost(0.0f);     // trigger filterization
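
For context only (not part of the commit): Lucene 3.0 drops RangeQuery in favour of TermRangeQuery, which takes the field name and the lower/upper term texts directly, with explicit inclusive flags for both ends. A small sketch of that migration; the field name and date bounds are made up for illustration.

// Sketch of the RangeQuery -> TermRangeQuery change shown in DateQueryFilter above.
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermRangeQuery;

public class DateRangeSketch {
  public static void main(String[] args) {
    // Lucene 2.x: new RangeQuery(new Term("date", lower), new Term("date", upper), true)
    // Lucene 3.x: field name and term texts are passed directly, plus inclusive flags
    TermRangeQuery range = new TermRangeQuery("date", "20100101", "20101231", true, true);
    range.setBoost(0.0f); // same trick as DateQueryFilter: zero boost, range only filters

    BooleanQuery query = new BooleanQuery();
    query.add(range, BooleanClause.Occur.MUST);
    System.out.println(query);
  }
}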
Modified: lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java (original)
+++ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java Fri Mar 19 11:34:33 2010
@@ -39,6 +39,7 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.*;
 
 // Nutch imports
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
@@ -152,7 +153,7 @@ public class BasicSummarizer implements
       //
       // If we find a term that's in the query...
       //
-      if (highlight.contains(tokens[i].termText())) {
+      if (highlight.contains(tokens[i].term())) {
        //
        // Start searching at a point SUM_CONTEXT terms back,
        // and move SUM_CONTEXT terms into the future.
        //
@@ -182,8 +183,8 @@ public class BasicSummarizer implements
          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
-         if (highlight.contains(t.termText())) {
-           excerpt.addToken(t.termText());
+         if (highlight.contains(t.term())) {
+           excerpt.addToken(t.term());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
@@ -354,18 +355,25 @@ public class BasicSummarizer implements
 
   private Token[] getTokens(String text) {
-    ArrayList result = new ArrayList();
+    ArrayList<Token> result = new ArrayList<Token>();
     TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
-    Token token = null;
-    while (result.size()<token_deep) {
-      try {
-        token = ts.next();
-      } catch (IOException e) {
-        token = null;
+    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
+    TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
+    try {
+      while (result.size() < token_deep && ts.incrementToken()) {
+        final Token token = new Token(
+            termAtt.termBuffer(), 0, termAtt.termLength(),
+            offsetAtt.startOffset(), offsetAtt.endOffset());
+        token.setType(typeAtt.type());
+        token.setPositionIncrement(posIncrAtt.getPositionIncrement());
+        result.add(token);
       }
-      if (token == null) { break; }
-      result.add(token);
+    } catch (IOException e) {
+      // Ignore (?)
     }
+
     try {
       ts.close();
     } catch (IOException e) {
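
For context only (not part of the commit): the getTokens() rewrite above moves from the removed Token-returning ts.next() loop to the Lucene 3.0 attribute API, where incrementToken() advances the stream and the attribute instances are updated in place. A self-contained sketch of that pattern; the analyzer, field name, and sample text are arbitrary choices, not something this commit prescribes.

// Sketch of attribute-based TokenStream consumption in Lucene 3.0.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenStreamSketch {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer();          // any analyzer works here
    TokenStream ts = analyzer.tokenStream("content", new StringReader("hello token stream"));
    // Register the attributes we want to read; they are filled in by incrementToken()
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term() + " [" + offsetAtt.startOffset()
          + "," + offsetAtt.endOffset() + "]");
    }
    ts.close();
  }
}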
Added: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar?rev=925179&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml Fri Mar 19 11:34:33 2010
@@ -25,7 +25,7 @@
       <library name="summary-lucene.jar">
          <export name="*"/>
       </library>
-      <library name="lucene-highlighter-2.9.1.jar"/>
+      <library name="lucene-highlighter-3.0.1.jar"/>
    </runtime>
 
    <requires>

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Fri Mar 19 11:34:33 2010
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.indexer;
 
+import java.io.File;
 import java.util.Random;
 
 import org.apache.hadoop.conf.Configuration;
@@ -62,7 +63,7 @@ public class TestDeleteDuplicates extend
   private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
     Path idx = new Path(root, name);
     Path sub = new Path(idx, "part-0000");
-    Directory dir = FSDirectory.getDirectory(sub.toString());
+    Directory dir = FSDirectory.open(new File(sub.toString()));
     IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true,
         MaxFieldLength.UNLIMITED);
     Document doc = makeDoc(name,
@@ -89,7 +90,7 @@ public class TestDeleteDuplicates extend
   private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
     Path idx = new Path(root, name);
     Path sub = new Path(idx, "part-0000");
-    Directory dir = FSDirectory.getDirectory(sub.toString());
+    Directory dir = FSDirectory.open(new File(sub.toString()));
     IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true,
         MaxFieldLength.UNLIMITED);
     Document doc = makeDoc(name,
@@ -105,7 +106,7 @@ public class TestDeleteDuplicates extend
     Document doc = new Document();
     doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
     doc.add(new Field("digest", digest, Field.Store.YES, Field.Index.NO));
-    doc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
+    doc.add(new Field("url", url, Field.Store.YES, Field.Index.ANALYZED));
     doc.setBoost(boost);
     doc.add(new Field("boost", "" + boost, Field.Store.YES, Field.Index.NO));
     doc.add(new Field("tstamp", DateTools.timeToString(time, Resolution.MILLISECOND), Field.Store.YES, Field.Index.NO));
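
As a side note (not part of the commit): the test changes above replace FSDirectory.getDirectory(String) with FSDirectory.open(File) and add the now-required MaxFieldLength argument to the IndexWriter constructor. A small sketch of that index-creation idiom; the path and analyzer below are illustrative assumptions, the tests themselves use NutchDocumentAnalyzer.

// Sketch of creating an index with the Lucene 3.0 constructors used above.
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class WriterSketch {
  public static void main(String[] args) throws IOException {
    Directory dir = FSDirectory.open(new File("build/test-index"));  // hypothetical path
    IndexWriter writer = new IndexWriter(dir,
        new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.UNLIMITED);

    Document doc = new Document();
    doc.add(new Field("url", "http://example.com/", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.close();
  }
}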
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java?rev=925179&r1=925178&r2=925179&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java Fri Mar 19 11:34:33 2010
@@ -29,6 +29,7 @@ import org.apache.lucene.document.Field.
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
@@ -67,8 +68,9 @@ public class TestIndexSorter extends Tes
     }
     LOG.info("Creating test index: " + testDir.getAbsolutePath());
     File plain = new File(testDir, INDEX_PLAIN);
-    Directory dir = FSDirectory.getDirectory(plain);
-    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+    Directory dir = FSDirectory.open(plain);
+    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true,
+        MaxFieldLength.UNLIMITED);
     // create test documents
     for (int i = 0; i < NUM_DOCS; i++) {
       Document doc = new Document();
@@ -79,19 +81,19 @@ public class TestIndexSorter extends Tes
         String val = null;
         if (fieldNames[k].equals("id")) {
           s = Store.YES;
-          ix = Index.UN_TOKENIZED;
+          ix = Index.NOT_ANALYZED;
           val = String.valueOf(i);
         } else if (fieldNames[k].equals("host")) {
           s = Store.YES;
-          ix = Index.UN_TOKENIZED;
+          ix = Index.NOT_ANALYZED;
           val = "www.example" + i + ".com";
         } else if (fieldNames[k].equals("site")) {
           s = Store.NO;
-          ix = Index.UN_TOKENIZED;
+          ix = Index.NOT_ANALYZED;
           val = "www.example" + i + ".com";
         } else if (fieldNames[k].equals("content")) {
           s = Store.NO;
-          ix = Index.TOKENIZED;
+          ix = Index.ANALYZED;
           val = "This is the content of the " + i + "-th document.";
         } else if (fieldNames[k].equals("boost")) {
           s = Store.YES;
@@ -104,7 +106,7 @@ public class TestIndexSorter extends Tes
           doc.setBoost(boost);
         } else {
           s = Store.YES;
-          ix = Index.TOKENIZED;
+          ix = Index.ANALYZED;
           if (fieldNames[k].equals("anchor")) {
             val = "anchors to " + i + "-th page.";
           } else if (fieldNames[k].equals("url")) {
@@ -127,8 +129,9 @@ public class TestIndexSorter extends Tes
   public void testSorting() throws Exception {
     IndexSorter sorter = new IndexSorter(conf);
     sorter.sort(testDir);
+
     // read back documents
-    IndexReader reader = IndexReader.open(new File(testDir, INDEX_SORTED));
+    IndexReader reader = IndexReader.open(FSDirectory.open(new File(testDir, INDEX_SORTED)));
     assertEquals(reader.numDocs(), NUM_DOCS);
     for (int i = 0; i < reader.maxDoc(); i++) {
       Document doc = reader.document(i);
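
One last aside (not part of the commit): the remaining renames in these tests are the Field.Index constants, with TOKENIZED becoming ANALYZED and UN_TOKENIZED becoming NOT_ANALYZED. A tiny sketch of the distinction; the field names and values are made up.

// Sketch of the renamed Field.Index constants in Lucene 3.0.
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class FieldIndexSketch {
  public static void main(String[] args) {
    Document doc = new Document();
    // Analyzed: value is run through the analyzer (old Field.Index.TOKENIZED)
    doc.add(new Field("content", "This is the content.", Field.Store.NO, Field.Index.ANALYZED));
    // Not analyzed: indexed as a single term, e.g. ids or hosts (old Field.Index.UN_TOKENIZED)
    doc.add(new Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Not indexed at all, only stored
    doc.add(new Field("boost", "1.0", Field.Store.YES, Field.Index.NO));
    System.out.println(doc);
  }
}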