Author: kubes Date: Tue Dec 2 06:41:09 2008 New Revision: 722475 URL: http://svn.apache.org/viewvc?rev=722475&view=rev Log: NUTCH-662: Upgrade Nutch to use Lucene 2.4
Added: lucene/nutch/trunk/lib/lucene-core-2.4.0.jar (with props) lucene/nutch/trunk/lib/lucene-misc-2.4.0.jar (with props) lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.4.0.jar (with props) Removed: lucene/nutch/trunk/lib/lucene-core-2.3.0.jar lucene/nutch/trunk/lib/lucene-misc-2.3.0.jar lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.3.0.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=722475&r1=722474&r2=722475&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Dec 2 06:41:09 2008 @@ -283,7 +283,9 @@ 104. NUTCH-640 - confusing description "set it to Integer.MAX_VALUE". (dogacan) - + +105. NUTCH-662 - Upgrade Nutch to use Lucene 2.4. (kubes) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/lib/lucene-core-2.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-core-2.4.0.jar?rev=722475&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-2.4.0.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-2.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-misc-2.4.0.jar?rev=722475&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-2.4.0.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=722475&r1=722474&r2=722475&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Tue Dec 2 06:41:09 2008 @@ -145,10 +145,11 @@ /** Return each index as a split. */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - Path[] files = listPaths(job); + FileStatus[] files = listStatus(job); InputSplit[] splits = new InputSplit[files.length]; for (int i = 0; i < files.length; i++) { - splits[i] = new FileSplit(files[i], 0, INDEX_LENGTH, (String[])null); + FileStatus cur = files[i]; + splits[i] = new FileSplit(cur.getPath(), 0, INDEX_LENGTH, (String[])null); } return splits; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java?rev=722475&r1=722474&r2=722475&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java Tue Dec 2 06:41:09 2008 @@ -18,6 +18,8 @@ package org.apache.nutch.indexer; import java.io.*; +import java.util.Random; + import org.apache.lucene.store.*; import org.apache.nutch.util.HadoopFSUtil; import org.apache.hadoop.fs.*; @@ -207,30 +209,45 @@ private class DfsIndexOutput extends BufferedIndexOutput { private FSDataOutputStream out; + private RandomAccessFile local; + private File localFile; public DfsIndexOutput(Path path, int ioFileBufferSize) throws IOException { + + // create a temporary local file and set it to delete on exit + String randStr = Integer.toString(new Random().nextInt(Integer.MAX_VALUE)); + localFile = File.createTempFile("index_" + randStr, ".tmp"); + localFile.deleteOnExit(); + local = new RandomAccessFile(localFile, "rw"); + out = fs.create(path); } public void flushBuffer(byte[] b, int offset, int size) throws IOException { - out.write(b, offset, size); + local.write(b, offset, size); } public void close() throws IOException { super.close(); + + // transfer to dfs from local + byte[] buffer = new byte[4096]; + local.seek(0); + int read = -1; + while ((read = local.read(buffer)) != -1) { + out.write(buffer, 0, read); + } out.close(); + local.close(); } public void seek(long pos) throws IOException { - throw new UnsupportedOperationException(); + super.seek(pos); + local.seek(pos); } public long length() throws IOException { - return out.getPos(); - } - - protected void finalize() throws IOException { - out.close(); // close the file + return local.length(); } } Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.4.0.jar?rev=722475&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.4.0.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?rev=722475&r1=722474&r2=722475&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Tue Dec 2 06:41:09 2008 @@ -25,11 +25,11 @@ <plugin id="lib-lucene-analyzers" name="Lucene Analysers" - version="2.3.0" + version="2.4.0" provider-name="org.apache.lucene"> <runtime> - <library name="lucene-analyzers-2.3.0.jar"> + <library name="lucene-analyzers-2.4.0.jar"> <export name="*"/> </library> </runtime>