package foobit.nutch;

import net.nutch.db.WebDBReader;
import net.nutch.db.Link;
import net.nutch.fs.NutchFileSystem;
import net.nutch.segment.SegmentReader;
import net.nutch.util.LogFormatter;
import net.nutch.fetcher.FetcherOutput;
import net.nutch.protocol.Content;
import net.nutch.parse.ParseText;
import net.nutch.parse.ParseData;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.io.UTF8;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Quick hack to verify if a set of segments' pages all have consistent inlinks
 * (1/2005 kangas)
 *
 * <p>Given a crawl root directory, this tool opens the web database under
 * <code>&lt;root&gt;/db</code> and every sub-directory of
 * <code>&lt;root&gt;/segments</code>, then logs, for each entry in each segment,
 * the anchors recorded in its fetch-list entry and the links the web database
 * returns for its URL.
 *
 * <p>NOTE: the actual consistency comparison is still disabled (see the FIXME
 * notes in {@link #run()}), so at present the tool only dumps information and
 * never counts an error.
 */
public class TestSegmentLinksConsistent
{
  public static final Logger LOG = LogFormatter.getLogger("foobit.TestSegmentLinksConsistent");

  /** File system used to open the web database and the segments. */
  private final NutchFileSystem nfs;

  /** Crawl root; expected to contain "db" and "segments" sub-directories. */
  private final String root;

  /**
   * @param nfs     file system used to open the web database and segments
   * @param rootDir crawl root directory
   */
  public TestSegmentLinksConsistent(NutchFileSystem nfs, String rootDir)
  {
    this.nfs = nfs;
    this.root = rootDir;
  }

  // FIXME: Boilerplate; scrap if possible
  /**
   * Lists the immediate sub-directories of the given directory.
   *
   * @param sDir directory expected to hold one sub-directory per segment
   * @return list of {@link File} objects, one per sub-directory; empty (and a
   *         warning is logged) if <code>sDir</code> does not exist or is not a
   *         directory
   */
  private ArrayList getSegmentDirs(File sDir)
  {
    ArrayList dirs = new ArrayList();
    if (!sDir.exists() || !sDir.isDirectory()) {
      LOG.warning("Invalid path: " + sDir);
      return dirs;
    }

    File[] files = sDir.listFiles(new FileFilter() {
      public boolean accept(File f) {
        return f.isDirectory();
      }
    });
    // listFiles() returns null on an I/O error, so guard before iterating.
    if (files != null) {
      for (int i = 0; i < files.length; i++) {
        dirs.add(files[i]);
      }
    }
    return dirs;
  }

  /**
   * Run tests
   *
   * <p>Walks every segment below <code>&lt;root&gt;/segments</code> and logs the
   * anchors of each entry next to the links the web database holds for the
   * entry's URL. Segments that cannot be opened are logged and skipped. The
   * web database reader and every segment reader are closed before returning,
   * including when an exception is thrown part-way through.
   *
   * @return <code>true</code> if no errors were counted; since the comparison
   *         itself is currently commented out, this is always
   *         <code>true</code> when the method returns normally
   * @throws IOException if opening the web database, or reading from or
   *         closing it or a segment, fails
   */
  public boolean run() throws IOException
  {
    File dbDir = new File(root +"/db");
    WebDBReader dbReader = new WebDBReader(nfs, dbDir);
    int errors = 0;

    try {
      File sDirs = new File(root +"/segments");
      ArrayList dirs = getSegmentDirs(sDirs);

      // Holder objects handed to SegmentReader.next() for every record.
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();

      // Loop over segments
      for (int i = 0; i < dirs.size(); i++) {
        File segment = (File) dirs.get(i);
        SegmentReader segReader;
        try {
          segReader = new SegmentReader(nfs, segment, false);
        } catch (Exception e) {
          // A broken segment should not abort the whole run; log it with
          // its stack trace through the logger and move on.
          LOG.log(Level.WARNING, "Error opening segment: "+ segment, e);
          continue;
        }

        LOG.info("<segment> "+ segment);

        try {
          while (segReader.next(fo, co, pt, pd)) {
            // Get segment details
            FetchListEntry fle = fo.getFetchListEntry();
            UTF8 url = fle.getUrl();

            // FIXME: this gets us anchor texts, NOT inlinks to a page!
            // (So, where is that in the segment?)
            String[] anchors = fle.getAnchors();
            LOG.info("URL: "+ url.toString() +" -----");

            // Used as a set; only consulted by the disabled check below.
            HashMap anchorMap = new HashMap();
            for (int j = 0; j < anchors.length; j++) {
              anchorMap.put(anchors[j], null);
              LOG.info("--anchor: "+ anchors[j]);
            }

            // Compare with WebDb details
            // Note: these are links that point TO the url
            Link[] links = dbReader.getLinks(url);
            for (int j = 0; j < links.length; j++) {
              String dblink = links[j].getURL().toString();
              LOG.info("Page: "+ url +" inlink: "+dblink);

              // FIXME: THIS IS NOT THE CORRECT TEST
//            if (! anchorMap.containsKey(dblink) ) {
//              LOG.warning("Page missing inlink: "+url +", "+dblink);
//              errors++;
//            }
//            else {
//              LOG.info("Page found inlink: "+url +", "+dblink);
//            }
            }
          }
        } finally {
          // Release the segment's readers even if reading failed.
          segReader.close();
        }
      }
    } finally {
      dbReader.close();
    }

    return errors == 0;
  }

  /** Prints the command-line synopsis to standard error. */
  static void usage() {
    System.err.println("TestSegmentLinksConsistent <crawl_root>");
  }

  /**
   * Command-line entry point: <code>TestSegmentLinksConsistent &lt;crawl_root&gt;</code>.
   *
   * <p>Exits with status -1 if no crawl root is given. The result of
   * {@link #run()} is not reflected in the exit status.
   *
   * @param args command-line arguments; the file system is obtained from
   *             them via <code>NutchFileSystem.parseArgs(args, 0)</code> and the
   *             crawl root is then read from <code>args[0]</code>
   * @throws Exception if setting up the file system or running the check fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Too few arguments.\n");
      usage();
      System.exit(-1);
    }
    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
    boolean missingRoot = false;
    try {
      // NOTE(review): parseArgs presumably consumes file-system selection
      // options and may leave args[0] null when nothing else was supplied --
      // confirm against NutchFileSystem. Guard so we never scan "null/db".
      String root = args[0];
      if (root == null) {
        missingRoot = true;
      } else {
        TestSegmentLinksConsistent test = new TestSegmentLinksConsistent(nfs, root);
        test.run();
      }
    }
    finally {
      nfs.close();
    }

    // Exit only after the finally block: System.exit() would skip nfs.close().
    if (missingRoot) {
      System.err.println("Too few arguments.\n");
      usage();
      System.exit(-1);
    }
  }
}
