package foobit.nutch;

import net.nutch.db.WebDBReader;
import net.nutch.db.Link;
import net.nutch.fs.NutchFileSystem;
import net.nutch.segment.SegmentReader;
import net.nutch.util.LogFormatter;
import net.nutch.fetcher.FetcherOutput;
import net.nutch.protocol.Content;
import net.nutch.parse.ParseText;
import net.nutch.parse.ParseData;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.io.UTF8;

import java.io.File;
import java.io.IOException;
import java.io.FileFilter;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.ArrayList;
import java.util.HashMap;

/**
 * Quick hack to verify if a set of segments' pages all have consistent inlinks
 * (1/2005 kangas)
 *
 * <p>Reads the web database under <code>root/db</code> and every segment
 * directory under <code>root/segments</code>, and logs, for each fetched page,
 * the anchors recorded in the segment and the links the database returns for
 * the page's URL. The actual consistency comparison is still disabled (see the
 * FIXME notes in {@link #run()}).
 */
public class TestSegmentLinksConsistent {
  public static final Logger LOG =
    LogFormatter.getLogger("foobit.TestSegmentLinksConsistent");

  private final NutchFileSystem nfs;
  private final String root;

  /**
   * @param nfs     file system handed to the database and segment readers
   * @param rootDir directory expected to contain "db" and "segments"
   *                subdirectories
   */
  public TestSegmentLinksConsistent(NutchFileSystem nfs, String rootDir) {
    this.nfs = nfs;
    this.root = rootDir;
  }

  // FIXME: Boilerplate; scrap if possible
  /**
   * Lists the immediate subdirectories of <code>sDir</code>.
   *
   * @param sDir directory to scan
   * @return a list of {@link File} objects, one per subdirectory; empty (after
   *         logging a warning) when <code>sDir</code> does not exist or is not
   *         a directory, and empty when the listing fails
   */
  private ArrayList getSegmentDirs(File sDir) {
    ArrayList dirs = new ArrayList();
    if (!sDir.exists() || !sDir.isDirectory()) {
      LOG.warning("Invalid path: " + sDir);
      return dirs;
    }

    File[] files = sDir.listFiles(new FileFilter() {
        public boolean accept(File f) {
          return f.isDirectory();
        }
      });
    // listFiles() returns null on an I/O error
    if (files != null) {
      for (int i = 0; i < files.length; i++) {
        dirs.add(files[i]);
      }
    }
    return dirs;
  }

  /**
   * Run tests
   *
   * <p>Walks every segment and logs each page's anchors next to the links the
   * web database returns for that page. Segments that cannot be opened are
   * logged and skipped. Every reader opened here is closed before returning.
   *
   * @return <code>true</code> when no errors were counted. Note that the
   *         error counter is never incremented while the comparison below
   *         remains commented out, so this currently always returns
   *         <code>true</code>.
   * @throws IOException if reading the database or a segment fails
   */
  public boolean run() throws IOException {
    File dbDir = new File(root + "/db");
    File sDirs = new File(root + "/segments");
    ArrayList dirs = getSegmentDirs(sDirs);

    // Record holders passed to SegmentReader.next() on every iteration
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseText pt = new ParseText();
    ParseData pd = new ParseData();

    int errors = 0;

    WebDBReader dbReader = new WebDBReader(nfs, dbDir);
    try {
      // Loop over segments
      for (int i = 0; i < dirs.size(); i++) {
        File segment = (File) dirs.get(i);

        SegmentReader segReader;
        try {
          segReader = new SegmentReader(nfs, segment, false);
        } catch (Exception e) {
          // Report through the logger (with the stack trace) and move on to
          // the next segment
          LOG.log(Level.WARNING, "Error opening segment: " + segment, e);
          continue;
        }

        LOG.info(" " + segment);

        try {
          while (segReader.next(fo, co, pt, pd)) {

            // Get segment details
            FetchListEntry fle = fo.getFetchListEntry();
            UTF8 url = fle.getUrl();

            // FIXME: this gets us anchor texts, NOT inlinks to a page!
            // (So, where is that in the segment?)
            String[] anchors = fle.getAnchors();

            LOG.info("URL: " + url.toString() + " -----");

            // Used as a set of anchors; only consulted by the disabled
            // comparison below
            HashMap anchorMap = new HashMap();
            for (int j = 0; j < anchors.length; j++) {
              anchorMap.put(anchors[j], null);
              LOG.info("--anchor: " + anchors[j]);
            }

            // Compare with WebDb details
            // Note: these are links that point TO the url
            Link[] links = dbReader.getLinks(url);
            for (int j = 0; j < links.length; j++) {
              String dblink = links[j].getURL().toString();
              LOG.info("Page: " + url + " inlink: " + dblink);

              // FIXME: THIS IS NOT THE CORRECT TEST
              // if (! anchorMap.containsKey(dblink) ) {
              //   LOG.warning("Page missing inlink: "+url +", "+dblink);
              //   errors++;
              // }
              // else {
              //   LOG.info("Page found inlink: "+url +", "+dblink);
              // }
            }
          }
        } finally {
          // Release the segment's underlying files before opening the next one
          segReader.close();
        }
      }
    } finally {
      dbReader.close();
    }

    return errors == 0;
  }

  /** Prints a usage line to standard error. */
  static void usage() {
    System.err.println("TestSegmentLinksConsistent ");
  }

  /**
   * Command-line entry point. Expects at least one argument; the first
   * argument is used as the root directory. Exits with status -1 when no
   * arguments are given and with status 1 when {@link #run()} reports failure.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Too few arguments.\n");
      usage();
      System.exit(-1);
    }

    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
    boolean ok;
    try {
      String root = args[0];
      TestSegmentLinksConsistent test = new TestSegmentLinksConsistent(nfs, root);
      ok = test.run();
    } finally {
      nfs.close();
    }

    // Exit outside the try block: System.exit() would skip the finally clause
    if (!ok) {
      System.exit(1);
    }
  }
}