package foobit.nutch;

import net.nutch.db.Link;
import net.nutch.db.Page;
import net.nutch.db.WebDBInjector;
import net.nutch.db.WebDBReader;
import net.nutch.fetcher.Fetcher;
import net.nutch.fetcher.FetcherOutput;
import net.nutch.fs.LocalFileSystem;
import net.nutch.fs.NutchFileSystem;
import net.nutch.io.UTF8;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.segment.SegmentReader;
import net.nutch.tools.FetchListTool;
import net.nutch.tools.LinkAnalysisTool;
import net.nutch.tools.UpdateDatabaseTool;
import net.nutch.tools.WebDBAdminTool;
import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;

import java.io.File;
import java.io.FileFilter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.Logger;

/**
 * Test suite for SegmentNormalizeTool:
 * (a) generate a sample on-disk crawl
 * (b) verify that all segment pages in the crawl have correct anchors & scores
 *
 * FIXME: verifyNutchConfig() DOES NOT successfully override the on-disk 
 * NutchConf, so we CANNOT use this as a unit test in a normal Nutch setup
 *
 * (1/2005 kangas)
 */
public class TestSegmentNormalizeTool
{
  public static final Logger LOG = LogFormatter.getLogger("foobit.TestSegmentNormalizeTool");

  private NutchFileSystem nfs;   // filesystem handle used by the WebDB/segment readers
  private String rootDir;        // crawl root supplied on the command line
  private String testDir;        // all test artifacts (htdocs, db, segments, conf) live here
  private String urlfile_text;   // seed-URL file contents handed to WebDBInjector
  private String nutchconf_file; // path of the test-specific Nutch config written by buildCrawl()

  // Content that Nutch will crawl for this test
  static final String INDEX_HTML =
      "<html>\n<head><title>do you foobit?</title></head>\n<body>\n" +
      "<h1>do you foobit?</h1>\n" +
      "<a href=\"eggs1.html\">a dish fit for the eggs1</a><p>\n" +
      "<a href=\"eggs2.html\">this was the noblest eggs2 of them all</a><p>\n" +
      "<a href=\"eggs3.html\">et tu, eggs3!</a><p>\n" +
      "<a href=\"eggs4.html\">beware the ides of eggs4</a><p>\n" +
      "</body>\n</html>";

  static final String EGGS1_HTML =
      "<html>\n<head><title>eggs1</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 1</a>\n" +
      "</body>\n</html>";

  static final String EGGS2_HTML =
      "<html>\n<head><title>eggs2</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 2</a>\n" +
      "<a href=\"eggs1.html\">go eggs1</a>\n" +
      "</body>\n</html>";

  static final String EGGS3_HTML =
      "<html>\n<head><title>eggs3</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 3</a>\n" +
      "<a href=\"eggs1.html\">go eggs1</a>\n" +
      "</body>\n</html>";

  static final String EGGS4_HTML =
      "<html>\n<head><title>eggs4</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 4</a>\n" +
      "</body>\n</html>";

  // Written into nutchconf_file to ensure that protocol-file is enabled
  static final String NUTCHTEST_CONF = "<nutch-conf>\n"+
      "<property>\n" +
      "  <name>file.content.ignored</name>\n" +
      "  <value>false</value>\n" +
      "</property>\n" +
      "<property>\n" +
      "  <name>plugin.includes</name>\n" +
      "  <value>protocol-file|parse-html|index-basic</value>\n" +
      "</property>\n" +
      "<property>\n" +
      "  <name>db.ignore.internal.links</name>\n" +
      "  <value>false</value>\n" +
      "</property>\n" +
      "</nutch-conf>";


  // ----------------------------------

  /**
   * Prepares (but does not create) the on-disk layout under
   * {@code rootDir + "/TEST"}: the test config file path and the seed URL
   * pointing at the to-be-written htdocs/index.html.
   *
   * @param rootDir directory under which the TEST working tree is created
   * @throws IOException if the local filesystem cannot be opened
   */
  public TestSegmentNormalizeTool(String rootDir) throws IOException
  {
    this.nfs = new LocalFileSystem();
    this.rootDir = rootDir;
    testDir = rootDir +"/TEST";

    // File written/read for test-specific Nutch config
    nutchconf_file = testDir +"/nutchtest.conf";

    // This will be the URL Nutch first crawls
    urlfile_text = "file:"+ testDir +"/htdocs/index.html\n";
  }

  /**
   * Tries to override Nutch config parameters and verifies the changes stuck:
   * 'file.content.ignored'=false, 'db.ignore.internal.links'=false, and
   * 'protocol-file' present in 'plugin.includes'.
   *
   * FIXME: NEED A BETTER WAY TO OVERRIDE NUTCH CONFIG
   */
  private void verifyNutchConfig() {
    if (! new File(nutchconf_file).canRead())
      throw new RuntimeException("Cannot read conf file: '"+nutchconf_file+"'");

    // FIXME: This does NOT work unless testDir is already in CLASSPATH!
    NutchConf.addConfResource("nutchtest.conf");

    if (! NutchConf.get("file.content.ignored").equals("false"))
      throw new RuntimeException("NutchConf ERROR: test requires 'file.content.ignored'=false");
    if (! NutchConf.get("db.ignore.internal.links").equals("false"))
      throw new RuntimeException("NutchConf ERROR: test requires 'db.ignore.internal.links'=false");
    // FIX: message previously named 'plugins.includes'; the property is 'plugin.includes'
    if (NutchConf.get("plugin.includes").indexOf("protocol-file") == -1)
      throw new RuntimeException("NutchConf ERROR: test requires 'protocol-file' in 'plugin.includes'");
  }

  /**
   * Returns the path of the lexicographically-latest entry in a segments
   * directory (segment names encode their creation time, so the latest name
   * is the newest segment). Adapted from CrawlTool.
   *
   * FIX: the original dereferenced listFiles() unconditionally; it returns
   * null on I/O error, and bestSegment stays null for an empty directory —
   * both ended in NullPointerException. Now throws a descriptive exception.
   */
  private static String getLatestSegment(File segmentsDir)  {
    File[] allSegmentFiles = segmentsDir.listFiles();
    if (allSegmentFiles == null || allSegmentFiles.length == 0)
      throw new RuntimeException("No segments found in '"+segmentsDir+"'");
    File bestSegment = allSegmentFiles[0];
    for (int i = 1; i < allSegmentFiles.length; i++) {
      if (bestSegment.getName().compareTo(allSegmentFiles[i].getName()) < 0)
        bestSegment = allSegmentFiles[i];
    }
    return bestSegment.getPath();
  }

  /** Given a "segments" dir, return subdirs. Copied from SegmentMergeTool. */
  private static ArrayList getSegmentDirs(File sDir)
  {
    ArrayList dirs = new ArrayList();
    if (!sDir.exists() || !sDir.isDirectory()) {
      LOG.warning("Invalid path: " + sDir);
    } else {
      // Only directories are segments; plain files are ignored
      File[] files = sDir.listFiles(new FileFilter() {
        public boolean accept(File f) { return f.isDirectory(); }
      });
      if (files != null && files.length > 0) {
        for (int i = 0; i < files.length; i++) dirs.add(files[i]);
      }
    }
    return dirs;
  }

  /**
   * Writes {@code content} to {@code file}.
   * FIX: the writer is now closed even if write() throws, so a failed write
   * no longer leaks a file handle.
   */
  private static void writeFile(File file, String content) throws IOException {
    FileWriter fw = new FileWriter(file);
    try {
      fw.write(content);
    } finally {
      fw.close();
    }
  }

  /**
   * Writes the sample site, seed-url file and test config to disk, then runs
   * the Nutch crawl cycle (fetchlist -> fetch -> updatedb -> link analysis)
   * to a fixed depth of 3. A pre-existing testDir is renamed aside (with its
   * lastModified timestamp appended) rather than deleted.
   *
   * @return isError boolean (always false; failures are thrown as exceptions)
   */
  public boolean buildCrawl() throws Exception {
    File test = new File(testDir);
    File htdocs = new File(testDir +"/htdocs");
    File db = new File(testDir +"/db");
    File segments = new File(testDir +"/segments");

    // Make working directories
    if (test.isDirectory()) {
      String testDirOld = testDir +"-"+ test.lastModified();
      LOG.info("Moving old testDir '"+testDir+"' -> '"+testDirOld+"'");
      if (! test.renameTo(new File(testDirOld)))
        throw new RuntimeException("Failed to rename '"+testDir+"' to '"+testDirOld+"'");
    }
    if (! test.mkdir())
      throw new RuntimeException("Failed to create testDir '"+testDir+"'");
    if (! (htdocs.mkdir() && db.mkdir() && segments.mkdir()))
      throw new RuntimeException("Failed to create one of: htdocs db segments");

    // Write content to htdocs, testDir
    writeFile(new File(nutchconf_file), NUTCHTEST_CONF);
    writeFile(new File(testDir, "urlfile"), urlfile_text);
    writeFile(new File(htdocs, "index.html"), INDEX_HTML);
    writeFile(new File(htdocs, "eggs1.html"), EGGS1_HTML);
    writeFile(new File(htdocs, "eggs2.html"), EGGS2_HTML);
    writeFile(new File(htdocs, "eggs3.html"), EGGS3_HTML);
    writeFile(new File(htdocs, "eggs4.html"), EGGS4_HTML);

    verifyNutchConfig();

    // Now run the crawler against these files
    int depth = 3;

    LOG.info("--ADMIN-- "+db.getPath()+"-create");
    WebDBAdminTool.main(new String[] {"-local", db.getPath(), "-create"});
    LOG.info("--INJECT-- -urlfile");
    WebDBInjector.main(new String[] {"-local", db.getPath(), "-urlfile", testDir+"/urlfile"});

    for (int i = 0; i < depth; i++) {
      // generate a new segment and fetch it
      LOG.info("--------- s1 "+i+" ----------");
      FetchListTool.main(new String[] {"-local", db.getPath(), segments.getPath()} );
      String s1 = getLatestSegment(segments);
      Fetcher.main(new String[] {"-local", "-logLevel", "warning", "-threads", "1", s1 } );

      // Verify that the very first fetch picked up index.html; if it did not,
      // the crawl-urlfilter is probably rejecting file: URLs
      if (i == 0) {
        SegmentReader sr = new SegmentReader(new File(s1));
        FetcherOutput fo = new FetcherOutput();
        sr.next(fo, null, null, null);
        sr.close();   // FIX: reader was previously left open
        // FIX: null-check the URL before toString(); the original called
        // getUrl().toString() first, so its null check could never fire
        Object url = fo.getUrl();
        if (url == null || (! url.toString().endsWith("index.html")))
          throw new RuntimeException("Fetcher failed to retrieve index.html "+
              "(check 'crawl-urlfilter.txt' for 'skip file:' rule)");
      }

      UpdateDatabaseTool.main(new String[] {"-local", db.getPath(), s1 } );
      LinkAnalysisTool.main(new String[] {"-local", db.getPath(), "2"} );
    }

    return false;
  }

  /**
   * Verify that score and anchors for each page in segments matches the same
   * data in the WebDB. For every page in every segment, the segment score
   * must equal the WebDB score, and every inbound-link anchor recorded in the
   * WebDB must appear among the segment's anchors.
   *
   * @return isError boolean (true if any mismatch was logged)
   */
  public boolean testSegments() throws Exception
  {
    File db = new File(testDir +"/db");
    File segments = new File(testDir +"/segments");

    WebDBReader dbReader = new WebDBReader(nfs, db);
    FetcherOutput fo = new FetcherOutput();

    int errors=0;
    ArrayList dirs = getSegmentDirs(segments);
    HashMap segAnchorMap = new HashMap();   // reused per page as a set of anchors

    try {
      // Step through segment dirs
      for (int i = 0; i < dirs.size(); i++) {
        File segment = (File) dirs.get(i);
        SegmentReader segReader = new SegmentReader(nfs, segment);
        LOG.info("--segment-- "+ segment);

        try {
          // Step through each page of this segment
          while (segReader.next(fo, null, null, null)) {
            FetchListEntry fle = fo.getFetchListEntry();
            UTF8 url = fle.getUrl();
            LOG.info("<url> "+url);
            String[] anchors = fle.getAnchors();
            float score = fle.getPage().getScore();

            segAnchorMap.clear();
            for (int j = 0; j < anchors.length; j++)
              segAnchorMap.put(anchors[j], null);

            // Compare with WebDb details
            // Note: these are dbLinks that point TO the url
            Page dbPage = dbReader.getPage(url.toString());
            float dbScore = dbPage.getScore();
            if (dbScore != score) {
              LOG.warning("ERROR_SCORE:     url='"+url+"' dbScore="+dbPage.getScore()+" segScore="+score);
              errors++;
            }

            Link[] dbLinks = dbReader.getLinks(url);

            for (int j = 0; j < dbLinks.length; j++) {
              String linkAnchor = dbLinks[j].getAnchorText().toString();
              if (! segAnchorMap.containsKey(linkAnchor)) {
                LOG.warning("ERROR_NO_ANCHOR: url='"+url+"' anchor='"+linkAnchor+"'");
                errors++;
              }
            }
          }
        } finally {
          segReader.close();   // FIX: reader was previously never closed
        }
      }
    } finally {
      dbReader.close();        // FIX: reader was previously never closed
    }

    return errors != 0;
  }

  static void usage() {
    // FIX: message previously named the wrong class (TestSegmentLinksConsistent)
    System.err.println("TestSegmentNormalizeTool <crawl_root> (-init|-test)");
  }

  /**
   * Entry point. Usage: {@code <crawl_root> (-init|-test)} — '-init' builds
   * the sample crawl under crawl_root/TEST; '-test' verifies its segments
   * against the WebDB.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Too few arguments.\n");
      usage();
      System.exit(-1);
    }
    String root = args[0];
    String cmd = args[1];

    TestSegmentNormalizeTool test = new TestSegmentNormalizeTool(root);

    if (cmd.equals("-init")) {
      if (test.buildCrawl())
        System.err.println("ERROR init");
      else
        System.err.println("OK init");
    }
    else if (cmd.equals("-test")) {
      if (test.testSegments())
        System.err.println("ERROR testSegments");
      else
        System.err.println("OK testSegments");
    }
    else
      System.err.println("Unrecognized command '"+cmd+"'");
  }
}
