Update of /cvsroot/nutch/playground/src/test/net/nutch/fetcher
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10313/src/test/net/nutch/fetcher
Added Files:
TestFetcherContent.java TestRobotRulesParser.java
TestFetcherText.java TestFetcher.java TestFetcherOutput.java
Log Message:
initial commit
--- NEW FILE: TestFetcherContent.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import java.io.*;
import net.nutch.io.*;
import net.nutch.pagedb.*;
import junit.framework.TestCase;
/** Unit tests for FetcherContent. */
public class TestFetcherContent extends TestCase {
public TestFetcherContent(String name) { super(name); }
public void testFetcherContent() throws Exception {
String page = "<HTML><BODY><H1>Hello World</H1><P>The Quick Brown Fox Jumped Over
the Lazy Fox.</BODY></HTML>";
FetcherContent r = new FetcherContent(page.getBytes("UTF8"));
TestWritable.testWritable(r);
}
}
--- NEW FILE: TestRobotRulesParser.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import net.nutch.fetcher.RobotRulesParser.RobotRuleSet;
import junit.framework.TestCase;
/**
 * Unit tests for RobotRulesParser.
 *
 * <p>The tests are table driven: each entry of {@link #ROBOTS_STRINGS} is
 * parsed on behalf of one or two agents from {@link #AGENT_STRINGS}, and every
 * path in {@link #TEST_PATHS} is checked against the matching row of
 * {@link #ALLOWED}.
 */
public class TestRobotRulesParser extends TestCase {

  private static final String LF= "\n";
  private static final String CR= "\r";
  private static final String CRLF= "\r\n";

  /** Sample robots.txt contents; lines are terminated with CR. */
  private static final String[] ROBOTS_STRINGS= new String[] {
    "User-Agent: Agent1 #foo" + CR
    + "Disallow: /a" + CR
    + "Disallow: /b/a" + CR
    + "#Disallow: /c" + CR
    + "" + CR
    + "" + CR
    + "User-Agent: Agent2 Agent3#foo" + CR
    + "User-Agent: Agent4" + CR
    + "Disallow: /d" + CR
    + "Disallow: /e/d/" + CR
    + "" + CR
    + "User-Agent: *" + CR
    + "Disallow: /foo/bar/" + CR,
  };

  /** Agent names the parser is asked to act for. */
  private static final String[] AGENT_STRINGS= new String[] {
    "Agent1",
    "Agent2",
    "Agent3",
    "Agent4",
    "Agent5",
  };

  /**
   * Indexed [robots string][agent]: true when the agent is not named in that
   * robots string (only Agent5 here).
   */
  private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {
    {
      false,
      false,
      false,
      false,
      true,
    }
  };

  /** Paths whose accessibility is checked for every agent. */
  private static final String[] TEST_PATHS= new String[] {
    "/a",
    "/a/",
    "/a/bloh/foo.html",
    "/b",
    "/b/a",
    "/b/a/index.html",
    "/b/b/foo.html",
    "/c",
    "/c/a",
    "/c/a/index.html",
    "/c/b/foo.html",
    "/d",
    "/d/a",
    "/e/a/index.html",
    "/e/d",
    "/e/d/foo.html",
    "/e/doh.html",
    "/f/index.html",
    "/foo/bar/baz.html",
    "/f/",
  };

  /**
   * Expected results, indexed [robots string][agent][path]; the innermost
   * arrays run parallel to {@link #TEST_PATHS}.
   */
  private static final boolean[][][] ALLOWED= new boolean[][][] {
    { // ROBOTS_STRINGS[0]
      { // Agent1
        false, // "/a",
        false, // "/a/",
        false, // "/a/bloh/foo.html"
        true,  // "/b",
        false, // "/b/a",
        false, // "/b/a/index.html",
        true,  // "/b/b/foo.html",
        true,  // "/c",
        true,  // "/c/a",
        true,  // "/c/a/index.html",
        true,  // "/c/b/foo.html",
        true,  // "/d",
        true,  // "/d/a",
        true,  // "/e/a/index.html",
        true,  // "/e/d",
        true,  // "/e/d/foo.html",
        true,  // "/e/doh.html",
        true,  // "/f/index.html",
        true,  // "/foo/bar/baz.html",
        true,  // "/f/",
      },
      { // Agent2
        true,  // "/a",
        true,  // "/a/",
        true,  // "/a/bloh/foo.html"
        true,  // "/b",
        true,  // "/b/a",
        true,  // "/b/a/index.html",
        true,  // "/b/b/foo.html",
        true,  // "/c",
        true,  // "/c/a",
        true,  // "/c/a/index.html",
        true,  // "/c/b/foo.html",
        false, // "/d",
        false, // "/d/a",
        true,  // "/e/a/index.html",
        true,  // "/e/d",
        false, // "/e/d/foo.html",
        true,  // "/e/doh.html",
        true,  // "/f/index.html",
        true,  // "/foo/bar/baz.html",
        true,  // "/f/",
      },
      { // Agent3
        true,  // "/a",
        true,  // "/a/",
        true,  // "/a/bloh/foo.html"
        true,  // "/b",
        true,  // "/b/a",
        true,  // "/b/a/index.html",
        true,  // "/b/b/foo.html",
        true,  // "/c",
        true,  // "/c/a",
        true,  // "/c/a/index.html",
        true,  // "/c/b/foo.html",
        false, // "/d",
        false, // "/d/a",
        true,  // "/e/a/index.html",
        true,  // "/e/d",
        false, // "/e/d/foo.html",
        true,  // "/e/doh.html",
        true,  // "/f/index.html",
        true,  // "/foo/bar/baz.html",
        true,  // "/f/",
      },
      { // Agent4
        true,  // "/a",
        true,  // "/a/",
        true,  // "/a/bloh/foo.html"
        true,  // "/b",
        true,  // "/b/a",
        true,  // "/b/a/index.html",
        true,  // "/b/b/foo.html",
        true,  // "/c",
        true,  // "/c/a",
        true,  // "/c/a/index.html",
        true,  // "/c/b/foo.html",
        false, // "/d",
        false, // "/d/a",
        true,  // "/e/a/index.html",
        true,  // "/e/d",
        false, // "/e/d/foo.html",
        true,  // "/e/doh.html",
        true,  // "/f/index.html",
        true,  // "/foo/bar/baz.html",
        true,  // "/f/",
      },
      { // Agent5/"*"
        true,  // "/a",
        true,  // "/a/",
        true,  // "/a/bloh/foo.html"
        true,  // "/b",
        true,  // "/b/a",
        true,  // "/b/a/index.html",
        true,  // "/b/b/foo.html",
        true,  // "/c",
        true,  // "/c/a",
        true,  // "/c/a/index.html",
        true,  // "/c/b/foo.html",
        true,  // "/d",
        true,  // "/d/a",
        true,  // "/e/a/index.html",
        true,  // "/e/d",
        true,  // "/e/d/foo.html",
        true,  // "/e/doh.html",
        true,  // "/f/index.html",
        false, // "/foo/bar/baz.html",
        true,  // "/f/",
      }
    }
  };

  public TestRobotRulesParser(String name) {
    super(name);
  }

  /** Checks every robots string against each agent on its own. */
  public void testRobotsOneAgent() {
    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j= 0; j < AGENT_STRINGS.length; j++) {
        testRobots(i, new String[] { AGENT_STRINGS[j] },
                   TEST_PATHS, ALLOWED[i][j]);
      }
    }
  }

  /**
   * Checks every robots string against each ordered pair of agents.
   *
   * <p>The expected row is that of the first agent, unless the first agent
   * is flagged in {@link #NOT_IN_ROBOTS_STRING}, in which case the second
   * agent's row is expected instead.
   */
  public void testRobotsTwoAgents() {
    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j= 0; j < AGENT_STRINGS.length; j++) {
        for (int k= 0; k < AGENT_STRINGS.length; k++) {
          int key= j;
          if (NOT_IN_ROBOTS_STRING[i][j])
            key= k;
          testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
                     TEST_PATHS, ALLOWED[i][key]);
        }
      }
    }
  }

  /**
   * Helper (not run by JUnit itself, since it takes arguments): parses one
   * robots string for the given agents and asserts the outcome for each path.
   *
   * @param robotsString index into {@link #ROBOTS_STRINGS}
   * @param agents       agent names handed to the RobotRulesParser
   *                     constructor; must contain at least one element
   * @param paths        paths to check
   * @param allowed      expected {@code isAllowed} result for each element
   *                     of {@code paths}; must have the same length
   */
  public void testRobots(int robotsString, String[] agents, String[] paths,
                         boolean[] allowed) {
    assertEquals("paths and allowed must be parallel arrays",
                 paths.length, allowed.length);

    // Comma-separated agent list, used only in failure messages.
    StringBuffer agentsString= new StringBuffer(agents[0]);
    for (int i= 1; i < agents.length; i++)
      agentsString.append(",").append(agents[i]);

    RobotRulesParser p= new RobotRulesParser(agents);
    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString].getBytes());

    for (int i= 0; i < paths.length; i++) {
      // Use the caller-supplied paths rather than the TEST_PATHS constant,
      // and query the rule set once so message and assertion agree.
      boolean actual= rules.isAllowed(paths[i]);
      assertTrue("testing robots file "+robotsString+", on agents ("
                 + agentsString + "), and path " + paths[i] + "; got "
                 + actual + ", rules are: " + LF
                 + rules,
                 actual == allowed[i]);
    }
  }
}
--- NEW FILE: TestFetcherText.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import java.io.*;
import net.nutch.io.*;
import net.nutch.pagedb.*;
import junit.framework.TestCase;
/** Unit tests for FetcherText. */
public class TestFetcherText extends TestCase {

  public TestFetcherText(String name) {
    super(name);
  }

  /**
   * Wraps a short sentence in a FetcherText and passes it to
   * {@code TestWritable.testWritable}.
   *
   * @throws Exception if the Writable check fails
   */
  public void testFetcherText() throws Exception {
    final String text = "Hello World The Quick Brown Fox Jumped Over the Lazy Fox";
    final FetcherText fetcherText = new FetcherText(text);
    TestWritable.testWritable(fetcherText);
  }
}
--- NEW FILE: TestFetcher.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.pagedb.*;
import java.io.*;
import java.util.logging.Level;
import junit.framework.TestCase;
/**
 * End-to-end test for Fetcher: writes a small fetch list, runs the fetcher
 * over it, then scans the resulting FetcherText records for expected words.
 *
 * <p>NOTE(review): the fetch list holds real http/ftp URLs, so this test
 * presumably needs live network access to pass -- confirm before running it
 * in an isolated build.
 */
public class TestFetcher extends TestCase {

  public TestFetcher(String name) {
    super(name);
  }

  /**
   * Runs the fetcher in the directory named by the {@code test.build.data}
   * system property (default: the current directory) and asserts that the
   * extracted text mentions both "Yahoo!" and "Nutch".
   *
   * @throws Exception on any I/O or fetcher failure
   */
  public void testFetcher() throws Exception {
    String directory = System.getProperty("test.build.data",".");

    String fetchListFilename = directory + "/" + FetchListEntry.DIR_NAME;
    ArrayFile.Writer testFetchList =
      new ArrayFile.Writer(fetchListFilename, FetchListEntry.class);
    // Close the writer even if building or appending an entry fails.
    try {
      MD5Hash id1 = new MD5Hash(new byte[]{0,0,0,0, 0,0,0,0, 0,0,0,0, 1,2,3,4});
      MD5Hash id2 = new MD5Hash(new byte[]{0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0});

      String url1 = "http://sourceforge.net/projects/nutch/";
      String url2 = "http://www.yahoo.com/";
      String url3 = "http://jakarta.apache.org/lucene/";
      String url4 = "http://nutch.org/docs/index.html";
      String url5 = "ftp://ftp.redhat.com/";

      Page page1 = new Page(url1, id1);
      Page page2 = new Page(url2, id2);
      Page page3 = new Page(url3, id2);
      Page page4 = new Page(url4, id2);
      Page page5 = new Page(url5, id2);

      String[] anchors = new String[] {"foo", "bar"};

      // Only the ftp entry is created with a false first argument.
      FetchListEntry fe1 = new FetchListEntry(true, page1, anchors);
      FetchListEntry fe2 = new FetchListEntry(true, page2, anchors);
      FetchListEntry fe3 = new FetchListEntry(true, page3, anchors);
      FetchListEntry fe4 = new FetchListEntry(true, page4, anchors);
      FetchListEntry fe5 = new FetchListEntry(false, page5, anchors);

      testFetchList.append(fe1);
      testFetchList.append(fe2);
      testFetchList.append(fe3);
      testFetchList.append(fe4);
      testFetchList.append(fe5);
    } finally {
      testFetchList.close();
    }

    Fetcher fetcher = new Fetcher(directory);
    fetcher.setLogLevel(Level.FINE);
    //fetcher.getHttp().setMaxContentLength(4096);
    fetcher.getHttp().setAgentString("NutchCVS");
    fetcher.run();

    String stripped = directory + "/" + FetcherText.DIR_NAME;
    FetcherText s = new FetcherText();
    ArrayFile.Reader fetcher_stripped = new ArrayFile.Reader(stripped);

    boolean yahoo = false;
    boolean nutch = false;
    // Close the reader even if reading a record fails.
    try {
      while (fetcher_stripped.next(s) != null) {
        String text = s.toString();
        if (text.indexOf("Yahoo!") >= 0)
          yahoo = true;
        if (text.indexOf("Nutch") >= 0)
          nutch = true;
      }
    } finally {
      fetcher_stripped.close();
    }

    assertTrue("no fetched text contained \"Yahoo!\"", yahoo);
    assertTrue("no fetched text contained \"Nutch\"", nutch);
  }
}
--- NEW FILE: TestFetcherOutput.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import java.io.*;
import net.nutch.io.*;
import net.nutch.pagedb.*;
import junit.framework.TestCase;
/** Unit tests for FetcherOutput. */
public class TestFetcherOutput extends TestCase {

  public TestFetcherOutput(String name) {
    super(name);
  }

  /**
   * Assembles a FetcherOutput from a fetch-list entry, a test hash, a title
   * string and two outlinks, then passes it to
   * {@code TestWritable.testWritable}.
   *
   * @throws Exception if the Writable check fails
   */
  public void testFetcherOutput() throws Exception {
    final String[] anchorTexts = new String[] {"foo", "bar"};

    final Outlink[] links = new Outlink[2];
    links[0] = new Outlink("http://foo.com/", "Foo");
    links[1] = new Outlink("http://bar.com/", "Bar");

    // The entry is built before the hash is obtained, as in a single
    // nested constructor call.
    final FetchListEntry entry =
      new FetchListEntry(true, TestPage.getTestPage(), anchorTexts);
    final FetcherOutput output =
      new FetcherOutput(entry, TestMD5Hash.getTestHash(), true, "Foo", links);

    TestWritable.testWritable(output);
  }
}
-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs