Author: siren Date: Sat Jan 19 00:59:29 2008 New Revision: 613378 URL: http://svn.apache.org/viewvc?rev=613378&view=rev Log: NUTCH-580 Remove deprecated hadoop api calls (FS)
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Jan 19 00:59:29 2008 @@ -191,6 +191,8 @@ 66. NUTCH-584 - urls missing from fetchlist (Ruslan Ermilov, ab) +67. NUTCH-580 - Remove deprecated hadoop api calls (FS) (siren) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sat Jan 19 00:59:29 2008 @@ -32,6 +32,7 @@ import org.apache.nutch.indexer.DeleteDuplicates; import org.apache.nutch.indexer.IndexMerger; import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -131,9 +132,9 @@ linkDbTool.invert(linkDb, segments, true, true, false); // invert links // index, dedup & merge - indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments)); + indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments, HadoopFSUtil.getPassAllFilter())); dedup.dedup(new Path[] { indexes }); - merger.merge(fs.listPaths(indexes), index, tmpDir); + merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()), index, tmpDir); } else { LOG.warn("No URLs to fetch - check your seed list and URL filters."); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Sat Jan 19 00:59:29 2008 @@ -31,6 +31,7 @@ import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ToolBase; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -181,15 +182,7 @@ } else if (args[i].equals("-noAdditions")) { additionsAllowed = false; } else if (args[i].equals("-dir")) { - Path[] paths = fs.listPaths(new Path(args[++i]), new PathFilter() { - public boolean accept(Path dir) { - try { - return fs.isDirectory(dir); - } catch (IOException ioe) { - return false; - } - } - }); + Path[] paths = fs.listPaths(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); dirs.addAll(Arrays.asList(paths)); } else { dirs.add(new Path(args[i])); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Sat Jan 19 00:59:29 2008 @@ -36,6 +36,7 @@ import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.*; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -146,14 +147,7 @@ public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException { final FileSystem fs = FileSystem.get(getConf()); - Path[] files = fs.listPaths(segmentsDir, new PathFilter() { - public boolean accept(Path f) { - try { - if (fs.isDirectory(f)) return true; - } catch (IOException ioe) {}; - return false; - } - }); + Path[] files = fs.listPaths(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); invert(linkDb, files, normalize, filter, force); } @@ -283,7 +277,7 @@ Path[] files = fs.listPaths(segDir, new PathFilter() { public boolean accept(Path f) { try { - if (fs.isDirectory(f)) return true; + if (fs.getFileStatus(f).isDir()) return true; } catch (IOException ioe) {}; return false; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Jan 19 00:59:29 2008 @@ -63,7 +63,7 @@ FileSystem fs = FileSystem.get(job); InputSplit[] splits = new InputSplit[files.length]; for (int i = 0; i < files.length; i++) { - splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]), job); + splits[i] = new FileSplit(files[i], 0, fs.getFileStatus(files[i]).getLen(), job); } return splits; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sat Jan 19 00:59:29 2008 @@ -95,7 +95,7 @@ FileSplit[] splits = new FileSplit[files.length]; FileSystem fs = FileSystem.get(job); for (int i = 0; i < files.length; i++) { - splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]), job); + splits[i] = new FileSplit(files[i], 0, fs.getFileStatus(files[i]).getLen(), job); } return splits; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java Sat Jan 19 00:59:29 2008 @@ -19,6 +19,7 @@ import java.io.*; import org.apache.lucene.store.*; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.Configuration; @@ -40,7 +41,7 @@ create(); } - if (!fs.isDirectory(directory)) + if (!fs.getFileStatus(directory).isDir()) throw new IOException(directory + " not a directory"); } @@ -49,11 +50,11 @@ fs.mkdirs(directory); } - if (!fs.isDirectory(directory)) + if (!fs.getFileStatus(directory).isDir()) throw new IOException(directory + " not a directory"); // clear old files - Path[] files = fs.listPaths(directory); + Path[] files = fs.listPaths(directory, HadoopFSUtil.getPassAllFilter()); for (int i = 0; i < files.length; i++) { if (!fs.delete(files[i])) throw new IOException("Cannot delete " + files[i]); @@ -61,7 +62,7 @@ } public String[] list() throws IOException { - Path[] files = fs.listPaths(directory); + Path[] files = fs.listPaths(directory, HadoopFSUtil.getPassAllFilter()); if (files == null) return null; String[] result = new String[files.length]; @@ -84,7 +85,7 @@ } public long fileLength(String name) throws IOException { - return fs.getLength(new Path(directory, name)); + return fs.getFileStatus(new Path(directory, name)).getLen(); } public void deleteFile(String name) throws IOException { @@ -157,7 +158,7 @@ public DfsIndexInput(Path path, int ioFileBufferSize) throws IOException { descriptor = new Descriptor(path,ioFileBufferSize); - length = fs.getLength(path); + length = fs.getFileStatus(path).getLen(); } protected void readInternal(byte[] b, int offset, int len) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Sat Jan 19 00:59:29 2008 @@ -29,6 +29,7 @@ import org.apache.hadoop.util.ToolBase; import org.apache.hadoop.conf.*; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LogUtil; import org.apache.nutch.util.NutchConfiguration; @@ -135,7 +136,7 @@ Path outputIndex = new Path(args[i++]); for (; i < args.length; i++) { - indexDirs.addAll(Arrays.asList(fs.listPaths(new Path(args[i])))); + indexDirs.addAll(Arrays.asList(fs.listPaths(new Path(args[i]), HadoopFSUtil.getPassAllFilter()))); } // Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Sat Jan 19 00:59:29 2008 @@ -82,7 +82,7 @@ } private Directory getDirectory(Path file) throws IOException { - if ("local".equals(this.fs.getName())) { + if ("file".equals(this.fs.getUri().getScheme())) { Path qualified = file.makeQualified(FileSystem.getLocal(conf)); File fsLocal = new File(qualified.toUri()); return FSDirectory.getDirectory(fsLocal.getAbsolutePath(), false); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Sat Jan 19 00:59:29 2008 @@ -31,6 +31,7 @@ import org.apache.nutch.parse.*; import org.apache.nutch.indexer.*; import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; /** @@ -121,8 +122,8 @@ } Vector vDirs=new Vector(); - Path [] directories = fs.listPaths(indexesDir); - for(int i = 0; i < fs.listPaths(indexesDir).length; i++) { + Path [] directories = fs.listPaths(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + for(int i = 0; i < directories.length; i++) { Path indexdone = new Path(directories[i], Indexer.DONE_NAME); if(fs.isFile(indexdone)) { vDirs.add(directories[i]); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Sat Jan 19 00:59:29 2008 @@ -29,7 +29,6 @@ import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; @@ -60,6 +59,7 @@ import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseText; import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -626,15 +626,7 @@ boolean normalize = false; for (int i = 1; i < args.length; i++) { if (args[i].equals("-dir")) { - Path[] files = fs.listPaths(new Path(args[++i]), new PathFilter() { - public boolean accept(Path f) { - try { - if (fs.isDirectory(f)) return true; - } catch (IOException e) {} - ; - return false; - } - }); + Path[] files = fs.listPaths(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); for (int j = 0; j < files.length; j++) segs.add(files[j]); } else if (args[i].equals("-filter")) { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Sat Jan 19 00:59:29 2008 @@ -40,7 +40,6 @@ import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; @@ -64,6 +63,7 @@ import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseText; import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LogUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -220,7 +220,7 @@ // remove the old file fs.delete(dumpFile); - Path[] files = fs.listPaths(tempDir); + Path[] files = fs.listPaths(tempDir, HadoopFSUtil.getPassAllFilter()); PrintWriter writer = null; int currentRecordNumber = 0; @@ -451,7 +451,7 @@ } stats.generated = cnt; Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME); - if (fs.exists(fetchDir) && fs.isDirectory(fetchDir)) { + if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) { cnt = 0L; long start = Long.MAX_VALUE; long end = Long.MIN_VALUE; @@ -470,7 +470,7 @@ stats.fetched = cnt; } Path parseDir = new Path(segment, ParseData.DIR_NAME); - if (fs.exists(fetchDir) && fs.isDirectory(fetchDir)) { + if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) { cnt = 0L; long errors = 0L; ParseData value = new ParseData(); @@ -559,14 +559,7 @@ if (args[i] == null) continue; if (args[i].equals("-dir")) { Path dir = new Path(args[++i]); - Path[] files = fs.listPaths(dir, new PathFilter() { - public boolean accept(Path pathname) { - try { - if (fs.isDirectory(pathname)) return true; - } catch (IOException e) {}; - return false; - } - }); + Path[] files = fs.listPaths(dir, HadoopFSUtil.getPassDirectoriesFilter(fs)); if (files != null && files.length > 0) { dirs.addAll(Arrays.asList(files)); } Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=613378&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Sat Jan 19 00:59:29 2008 @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.util; + +import java.io.IOException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; + +public class HadoopFSUtil { + + /** + * Returns PathFilter that passes all paths through. + */ + public static PathFilter getPassAllFilter() { + return new PathFilter() { + public boolean accept(Path arg0) { + return true; + } + }; + } + + /** + * Returns PathFilter that passes directories through. + */ + public static PathFilter getPassDirectoriesFilter(final FileSystem fs) { + return new PathFilter() { + public boolean accept(final Path path) { + try { + return fs.getFileStatus(path).isDir(); + } catch (IOException ioe) { + return false; + } + } + + }; + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java?rev=613378&r1=613377&r2=613378&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Sat Jan 19 00:59:29 2008 @@ -42,7 +42,7 @@ if (fs.exists(lockFile)) { if(!accept) throw new IOException("lock file " + lockFile + " already exists."); - if (fs.isDirectory(lockFile)) + if (fs.getFileStatus(lockFile).isDir()) throw new IOException("lock file " + lockFile + " already exists and is a directory."); // do nothing - the file already exists. } else { @@ -63,7 +63,7 @@ */ public static boolean removeLockFile(FileSystem fs, Path lockFile) throws IOException { if (!fs.exists(lockFile)) return false; - if (fs.isDirectory(lockFile)) + if (fs.getFileStatus(lockFile).isDir()) throw new IOException("lock file " + lockFile + " exists but is a directory!"); return fs.delete(lockFile); }