package foobit.nutch;

import net.nutch.db.Link;
import net.nutch.db.Page;
import net.nutch.db.WebDBInjector;
import net.nutch.db.WebDBReader;
import net.nutch.fetcher.Fetcher;
import net.nutch.fetcher.FetcherOutput;
import net.nutch.fs.LocalFileSystem;
import net.nutch.fs.NutchFileSystem;
import net.nutch.io.UTF8;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.segment.SegmentReader;
import net.nutch.tools.FetchListTool;
import net.nutch.tools.LinkAnalysisTool;
import net.nutch.tools.UpdateDatabaseTool;
import net.nutch.tools.WebDBAdminTool;
import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;

import java.io.File;
import java.io.FileFilter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.Logger;

/**
 * Test suite for SegmentNormalizeTool:
 * (a) generate a sample on-disk crawl
 * (b) verify that all segment pages in the crawl have correct anchors & scores
 *
 * FIXME: verifyNutchConfig() DOES NOT successfully override the on-disk 
 * NutchConf, so we CANNOT use this as a unit test in a normal Nutch setup
 *
 * (1/2005 kangas)
 */
public class TestSegmentNormalizeTool
{
  public static final Logger LOG = LogFormatter.getLogger("foobit.TestSegmentNormalizeTool");

  private NutchFileSystem nfs;   // filesystem handle used by the WebDB/segment readers
  private String rootDir;        // crawl root supplied on the command line
  private String testDir;        // all test artifacts (htdocs, db, segments, conf) live here
  private String urlfile_text;   // seed-URL file contents handed to WebDBInjector
  private String nutchconf_file; // path of the test-specific Nutch config written by buildCrawl()

  // Content that Nutch will crawl for this test
  static final String INDEX_HTML =
      "<html>\n<head><title>do you foobit?</title></head>\n<body>\n" +
      "<h1>do you foobit?</h1>\n" +
      "<a href=\"eggs1.html\">a dish fit for the eggs1</a><p>\n" +
      "<a href=\"eggs2.html\">this was the noblest eggs2 of them all</a><p>\n" +
      "<a href=\"eggs3.html\">et tu, eggs3!</a><p>\n" +
      "<a href=\"eggs4.html\">beware the ides of eggs4</a><p>\n" +
      "</body>\n</html>";

  static final String EGGS1_HTML =
      "<html>\n<head><title>eggs1</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 1</a>\n" +
      "</body>\n</html>";

  static final String EGGS2_HTML =
      "<html>\n<head><title>eggs2</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 2</a>\n" +
      "<a href=\"eggs1.html\">go eggs1</a>\n" +
      "</body>\n</html>";

  static final String EGGS3_HTML =
      "<html>\n<head><title>eggs3</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 3</a>\n" +
      "<a href=\"eggs1.html\">go eggs1</a>\n" +
      "</body>\n</html>";

  static final String EGGS4_HTML =
      "<html>\n<head><title>eggs4</title></head>\n<body>\n" +
      "<a href=\"index.html\">go home 4</a>\n" +
      "</body>\n</html>";

  // Written into nutchconf_file to ensure that protocol-file is enabled
  static final String NUTCHTEST_CONF = "<nutch-conf>\n"+
      "<property>\n" +
      "  <name>file.content.ignored</name>\n" +
      "  <value>false</value>\n" +
      "</property>\n" +
      "<property>\n" +
      "  <name>plugin.includes</name>\n" +
      "  <value>protocol-file|parse-html|index-basic</value>\n" +
      "</property>\n" +
      "<property>\n" +
      "  <name>db.ignore.internal.links</name>\n" +
      "  <value>false</value>\n" +
      "</property>\n" +
      "</nutch-conf>";


  // ----------------------------------

  /**
   * Prepares (but does not create) the on-disk layout under
   * {@code rootDir + "/TEST"}: the test config file path and the seed URL
   * pointing at the to-be-written htdocs/index.html.
   *
   * @param rootDir directory under which the TEST working tree is created
   * @throws IOException if the local filesystem cannot be opened
   */
  public TestSegmentNormalizeTool(String rootDir) throws IOException
  {
    this.nfs = new LocalFileSystem();
    this.rootDir = rootDir;
    testDir = rootDir +"/TEST";

    // File written/read for test-specific Nutch config
    nutchconf_file = testDir +"/nutchtest.conf";

    // This will be the URL Nutch first crawls
    urlfile_text = "file:"+ testDir +"/htdocs/index.html\n";
  }

  /**
   * Tries to override Nutch config parameters and verifies the changes stuck:
   * 'file.content.ignored'=false, 'db.ignore.internal.links'=false, and
   * 'protocol-file' present in 'plugin.includes'.
   *
   * FIXME: NEED A BETTER WAY TO OVERRIDE NUTCH CONFIG
   */
  private void verifyNutchConfig() {
    if (! new File(nutchconf_file).canRead())
      throw new RuntimeException("Cannot read conf file: '"+nutchconf_file+"'");

    // FIXME: This does NOT work unless testDir is already in CLASSPATH!
    NutchConf.addConfResource("nutchtest.conf");

    if (! NutchConf.get("file.content.ignored").equals("false"))
      throw new RuntimeException("NutchConf ERROR: test requires 'file.content.ignored'=false");
    if (! NutchConf.get("db.ignore.internal.links").equals("false"))
      throw new RuntimeException("NutchConf ERROR: test requires 'db.ignore.internal.links'=false");
    // FIX: message previously named 'plugins.includes'; the property is 'plugin.includes'
    if (NutchConf.get("plugin.includes").indexOf("protocol-file") == -1)
      throw new RuntimeException("NutchConf ERROR: test requires 'protocol-file' in 'plugin.includes'");
  }

  /**
   * Returns the path of the lexicographically-latest entry in a segments
   * directory (segment names encode their creation time, so the latest name
   * is the newest segment). Adapted from CrawlTool.
   *
   * FIX: the original dereferenced listFiles() unconditionally; it returns
   * null on I/O error, and bestSegment stays null for an empty directory —
   * both ended in NullPointerException. Now throws a descriptive exception.
   */
  private static String getLatestSegment(File segmentsDir)  {
    File[] allSegmentFiles = segmentsDir.listFiles();
    if (allSegmentFiles == null || allSegmentFiles.length == 0)
      throw new RuntimeException("No segments found in '"+segmentsDir+"'");
    File bestSegment = allSegmentFiles[0];
    for (int i = 1; i < allSegmentFiles.length; i++) {
      if (bestSegment.getName().compareTo(allSegmentFiles[i].getName()) < 0)
        bestSegment = allSegmentFiles[i];
    }
    return bestSegment.getPath();
  }

  /** Given a "segments" dir, return subdirs. Copied from SegmentMergeTool. */
  private static ArrayList getSegmentDirs(File sDir)
  {
    ArrayList dirs = new ArrayList();
    if (!sDir.exists() || !sDir.isDirectory()) {
      LOG.warning("Invalid path: " + sDir);
    } else {
      // Only directories are segments; plain files are ignored
      File[] files = sDir.listFiles(new FileFilter() {
        public boolean accept(File f) { return f.isDirectory(); }
      });
      if (files != null && files.length > 0) {
        for (int i = 0; i < files.length; i++) dirs.add(files[i]);
      }
    }
    return dirs;
  }

  /**
   * Writes {@code content} to {@code file}.
   * FIX: the writer is now closed even if write() throws, so a failed write
   * no longer leaks a file handle.
   */
  private static void writeFile(File file, String content) throws IOException {
    FileWriter fw = new FileWriter(file);
    try {
      fw.write(content);
    } finally {
      fw.close();
    }
  }

  /**
   * Writes the sample site, seed-url file and test config to disk, then runs
   * the Nutch crawl cycle (fetchlist -> fetch -> updatedb -> link analysis)
   * to a fixed depth of 3. A pre-existing testDir is renamed aside (with its
   * lastModified timestamp appended) rather than deleted.
   *
   * @return isError boolean (always false; failures are thrown as exceptions)
   */
  public boolean buildCrawl() throws Exception {
    File test = new File(testDir);
    File htdocs = new File(testDir +"/htdocs");
    File db = new File(testDir +"/db");
    File segments = new File(testDir +"/segments");

    // Make working directories
    if (test.isDirectory()) {
      String testDirOld = testDir +"-"+ test.lastModified();
      LOG.info("Moving old testDir '"+testDir+"' -> '"+testDirOld+"'");
      if (! test.renameTo(new File(testDirOld)))
        throw new RuntimeException("Failed to rename '"+testDir+"' to '"+testDirOld+"'");
    }
    if (! test.mkdir())
      throw new RuntimeException("Failed to create testDir '"+testDir+"'");
    if (! (htdocs.mkdir() && db.mkdir() && segments.mkdir()))
      throw new RuntimeException("Failed to create one of: htdocs db segments");

    // Write content to htdocs, testDir
    writeFile(new File(nutchconf_file), NUTCHTEST_CONF);
    writeFile(new File(testDir, "urlfile"), urlfile_text);
    writeFile(new File(htdocs, "index.html"), INDEX_HTML);
    writeFile(new File(htdocs, "eggs1.html"), EGGS1_HTML);
    writeFile(new File(htdocs, "eggs2.html"), EGGS2_HTML);
    writeFile(new File(htdocs, "eggs3.html"), EGGS3_HTML);
    writeFile(new File(htdocs, "eggs4.html"), EGGS4_HTML);

    verifyNutchConfig();

    // Now run the crawler against these files
    int depth = 3;

    LOG.info("--ADMIN-- "+db.getPath()+"-create");
    WebDBAdminTool.main(new String[] {"-local", db.getPath(), "-create"});
    LOG.info("--INJECT-- -urlfile");
    WebDBInjector.main(new String[] {"-local", db.getPath(), "-urlfile", testDir+"/urlfile"});

    for (int i = 0; i < depth; i++) {
      // generate a new segment and fetch it
      LOG.info("--------- s1 "+i+" ----------");
      FetchListTool.main(new String[] {"-local", db.getPath(), segments.getPath()} );
      String s1 = getLatestSegment(segments);
      Fetcher.main(new String[] {"-local", "-logLevel", "warning", "-threads", "1", s1 } );

      // Verify that the very first fetch picked up index.html; if it did not,
      // the crawl-urlfilter is probably rejecting file: URLs
      if (i == 0) {
        SegmentReader sr = new SegmentReader(new File(s1));
        FetcherOutput fo = new FetcherOutput();
        sr.next(fo, null, null, null);
        sr.close();   // FIX: reader was previously left open
        // FIX: null-check the URL before toString(); the original called
        // getUrl().toString() first, so its null check could never fire
        Object url = fo.getUrl();
        if (url == null || (! url.toString().endsWith("index.html")))
          throw new RuntimeException("Fetcher failed to retrieve index.html "+
              "(check 'crawl-urlfilter.txt' for 'skip file:' rule)");
      }

      UpdateDatabaseTool.main(new String[] {"-local", db.getPath(), s1 } );
      LinkAnalysisTool.main(new String[] {"-local", db.getPath(), "2"} );
    }

    return false;
  }

  /**
   * Verify that score and anchors for each page in segments matches the same
   * data in the WebDB. For every page in every segment, the segment score
   * must equal the WebDB score, and every inbound-link anchor recorded in the
   * WebDB must appear among the segment's anchors.
   *
   * @return isError boolean (true if any mismatch was logged)
   */
  public boolean testSegments() throws Exception
  {
    File db = new File(testDir +"/db");
    File segments = new File(testDir +"/segments");

    WebDBReader dbReader = new WebDBReader(nfs, db);
    FetcherOutput fo = new FetcherOutput();

    int errors=0;
    ArrayList dirs = getSegmentDirs(segments);
    HashMap segAnchorMap = new HashMap();   // reused per page as a set of anchors

    try {
      // Step through segment dirs
      for (int i = 0; i < dirs.size(); i++) {
        File segment = (File) dirs.get(i);
        SegmentReader segReader = new SegmentReader(nfs, segment);
        LOG.info("--segment-- "+ segment);

        try {
          // Step through each page of this segment
          while (segReader.next(fo, null, null, null)) {
            FetchListEntry fle = fo.getFetchListEntry();
            UTF8 url = fle.getUrl();
            LOG.info("<url> "+url);
            String[] anchors = fle.getAnchors();
            float score = fle.getPage().getScore();

            segAnchorMap.clear();
            for (int j = 0; j < anchors.length; j++)
              segAnchorMap.put(anchors[j], null);

            // Compare with WebDb details
            // Note: these are dbLinks that point TO the url
            Page dbPage = dbReader.getPage(url.toString());
            float dbScore = dbPage.getScore();
            if (dbScore != score) {
              LOG.warning("ERROR_SCORE:     url='"+url+"' dbScore="+dbPage.getScore()+" segScore="+score);
              errors++;
            }

            Link[] dbLinks = dbReader.getLinks(url);

            for (int j = 0; j < dbLinks.length; j++) {
              String linkAnchor = dbLinks[j].getAnchorText().toString();
              if (! segAnchorMap.containsKey(linkAnchor)) {
                LOG.warning("ERROR_NO_ANCHOR: url='"+url+"' anchor='"+linkAnchor+"'");
                errors++;
              }
            }
          }
        } finally {
          segReader.close();   // FIX: reader was previously never closed
        }
      }
    } finally {
      dbReader.close();        // FIX: reader was previously never closed
    }

    return errors != 0;
  }

  static void usage() {
    // FIX: message previously named the wrong class (TestSegmentLinksConsistent)
    System.err.println("TestSegmentNormalizeTool <crawl_root> (-init|-test)");
  }

  /**
   * Entry point. Usage: {@code <crawl_root> (-init|-test)} — '-init' builds
   * the sample crawl under crawl_root/TEST; '-test' verifies its segments
   * against the WebDB.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Too few arguments.\n");
      usage();
      System.exit(-1);
    }
    String root = args[0];
    String cmd = args[1];

    TestSegmentNormalizeTool test = new TestSegmentNormalizeTool(root);

    if (cmd.equals("-init")) {
      if (test.buildCrawl())
        System.err.println("ERROR init");
      else
        System.err.println("OK init");
    }
    else if (cmd.equals("-test")) {
      if (test.testSegments())
        System.err.println("ERROR testSegments");
      else
        System.err.println("OK testSegments");
    }
    else
      System.err.println("Unrecognized command '"+cmd+"'");
  }
}
