Author: ab Date: Mon Aug 18 16:56:20 2008 New Revision: 686900 URL: http://svn.apache.org/viewvc?rev=686900&view=rev Log: NUTCH-641 IndexSorter incorrectly copies stored fields.
Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=686900&r1=686899&r2=686900&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Aug 18 16:56:20 2008 @@ -256,6 +256,9 @@ 93. NUTCH-634 - Upgrade Nutch to Hadoop 0.17.1 (Michael Gottesman, Lincoln Ritter, ab) +94. NUTCH-641 - IndexSorter incorrectly copies stored fields (ab) + + Release 0.9 - 2007-04-02 1. Changed log4j configuration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=686900&r1=686899&r2=686900&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Mon Aug 18 16:56:20 2008 @@ -191,6 +191,11 @@ return super.document(newToOld[n]); } + public Document document(int n, FieldSelector fieldSelector) + throws CorruptIndexException, IOException { + return super.document(newToOld[n], fieldSelector); + } + public boolean isDeleted(int n) { return false; } @@ -240,6 +245,10 @@ return this.score < that.score ? 
1 : -1 ; } } + + public String toString() { + return "oldDoc=" + oldDoc + ",score=" + score; + } } public IndexSorter() { Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java?rev=686900&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java Mon Aug 18 16:56:20 2008 @@ -0,0 +1,145 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.indexer; + +import java.io.File; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileUtil; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.analysis.NutchDocumentAnalyzer; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestIndexSorter extends TestCase { + private static final Log LOG = LogFactory.getLog(TestIndexSorter.class); + + private static final String INDEX_PLAIN = "index"; + private static final String INDEX_SORTED = "index-sorted"; + private static final int NUM_DOCS = 254; + private String[] fieldNames = new String[] { + "id", + "url", + "site", + "content", + "host", + "anchor", + "boost" + }; + + Configuration conf = null; + File testDir = null; + Directory dir = null; + + + protected void setUp() throws Exception { + if (conf == null) conf = NutchConfiguration.create(); + // create test index + testDir = new File("indexSorter-test-" + System.currentTimeMillis()); + if (!testDir.mkdirs()) { + throw new Exception("Can't create test dir " + testDir.toString()); + } + LOG.info("Creating test index: " + testDir.getAbsolutePath()); + File plain = new File(testDir, INDEX_PLAIN); + Directory dir = FSDirectory.getDirectory(plain); + IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true); + // create test documents + for (int i = 0; i < NUM_DOCS; i++) { + Document doc = new Document(); + for (int k = 0; k < fieldNames.length; k++) { + Field f; + Store s; + Index 
ix; + String val = null; + if (fieldNames[k].equals("id")) { + s = Store.YES; + ix = Index.UN_TOKENIZED; + val = String.valueOf(i); + } else if (fieldNames[k].equals("host")) { + s = Store.YES; + ix = Index.UN_TOKENIZED; + val = "www.example" + i + ".com"; + } else if (fieldNames[k].equals("site")) { + s = Store.NO; + ix = Index.UN_TOKENIZED; + val = "www.example" + i + ".com"; + } else if (fieldNames[k].equals("content")) { + s = Store.NO; + ix = Index.TOKENIZED; + val = "This is the content of the " + i + "-th document."; + } else if (fieldNames[k].equals("boost")) { + s = Store.YES; + ix = Index.NO; + // XXX note that this way we ensure different values of encoded boost + // XXX note also that for this reason we can't reliably test more than + // XXX 255 documents. + float boost = Similarity.decodeNorm((byte)(i + 1)); + val = String.valueOf(boost); + doc.setBoost(boost); + } else { + s = Store.YES; + ix = Index.TOKENIZED; + if (fieldNames[k].equals("anchor")) { + val = "anchors to " + i + "-th page."; + } else if (fieldNames[k].equals("url")) { + val = "http://www.example" + i + ".com/" + i + ".html"; + } + } + f = new Field(fieldNames[k], val, s, ix); + doc.add(f); + } + writer.addDocument(doc); + } + writer.optimize(); + writer.close(); + } + + protected void tearDown() throws Exception { + FileUtil.fullyDelete(testDir); + } + + public void testSorting() throws Exception { + IndexSorter sorter = new IndexSorter(conf); + sorter.sort(testDir); + // read back documents + IndexReader reader = IndexReader.open(new File(testDir, INDEX_SORTED)); + assertEquals(reader.numDocs(), NUM_DOCS); + for (int i = 0; i < reader.maxDoc(); i++) { + Document doc = reader.document(i); + Field f = doc.getField("content"); + assertNull(f); + f = doc.getField("boost"); + float boost = Similarity.decodeNorm((byte)(NUM_DOCS - i)); + String cmp = String.valueOf(boost); + assertEquals(cmp, f.stringValue()); + } + reader.close(); + } + +} Propchange: 
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java ------------------------------------------------------------------------------ svn:eol-style = native