http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java new file mode 100644 index 0000000..b631319 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java @@ -0,0 +1,569 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.crawl; + +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.StringUtils; + +import org.apache.nutch.crawl.CrawlDatum; + +import static org.apache.nutch.crawl.CrawlDatum.*; + +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; + +import static org.junit.Assert.*; + +import org.apache.nutch.test.IntegrationTest; +import org.junit.Test; + +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test transitions of {@link CrawlDatum} states during an update of + * {@link CrawlDb} (command {@literal updatedb}): + * <ul> + * <li>simulate updatedb with the old CrawlDatum (db status) and the new one + * (fetch status) and test whether the resulting CrawlDatum has the appropriate + * status.</li> + * <li>also check for further CrawlDatum fields (signature, etc.)</li> + * <li>and additional conditions:</li> + * <ul> + * <li>retry counters</li> + * <li>signatures</li> + * <li>configuration properties</li> + * <li>(additional) CrawlDatums of status linked (stemming from inlinks)</li> + * </ul> + * </li> </ul> + */ +@Category({IntegrationTest.class}) +public class TestCrawlDbStates { + + private static final Logger LOG = LoggerFactory + .getLogger(TestCrawlDbStates.class); + + protected static final byte[][] fetchDbStatusPairs = { + { -1, STATUS_DB_UNFETCHED }, { STATUS_FETCH_SUCCESS, STATUS_DB_FETCHED }, + { STATUS_FETCH_GONE, STATUS_DB_GONE }, + { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP }, + { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM }, + { STATUS_FETCH_NOTMODIFIED, STATUS_DB_NOTMODIFIED }, + { STATUS_FETCH_RETRY, -1 }, // fetch_retry does not have a CrawlDb + // counter-part + { -1, STATUS_DB_DUPLICATE }, }; + + /** tested {@link 
FetchSchedule} implementations */ + protected String[] schedules = { "DefaultFetchSchedule", + "AdaptiveFetchSchedule" }; + + /** CrawlDatum as result of a link */ + protected final CrawlDatum linked = new CrawlDatum(STATUS_LINKED, + CrawlDBTestUtil.createConfiguration().getInt("db.fetch.interval.default", + 2592000), 0.1f); + + /** + * Test the matrix of state transitions: + * <ul> + * <li>for all available {@link FetchSchedule} implementations</li> + * <li>for every possible status in CrawlDb (including "not in CrawlDb")</li> + * <li>for every possible fetch status</li> + * <li>and zero or more (0-3) additional in-links</li> + * </ul> + * call {@literal updatedb} and check whether the resulting CrawlDb status is + * the expected one. + */ + @Test + public void testCrawlDbStateTransitionMatrix() { + LOG.info("Test CrawlDatum state transitions"); + Configuration conf = CrawlDBTestUtil.createConfiguration(); + CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>( + new CrawlDbReducer(), conf); + int retryMax = conf.getInt("db.fetch.retry.max", 3); + for (String sched : schedules) { + LOG.info("Testing state transitions with " + sched); + conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." 
+ sched); + FetchSchedule schedule = FetchScheduleFactory + .getFetchSchedule(new JobConf(conf)); + for (int i = 0; i < fetchDbStatusPairs.length; i++) { + byte fromDbStatus = fetchDbStatusPairs[i][1]; + for (int j = 0; j < fetchDbStatusPairs.length; j++) { + byte fetchStatus = fetchDbStatusPairs[j][0]; + CrawlDatum fromDb = null; + if (fromDbStatus == -1) { + // nothing yet in CrawlDb + // CrawlDatum added by FreeGenerator or via outlink + } else { + fromDb = new CrawlDatum(); + fromDb.setStatus(fromDbStatus); + // initialize fetchInterval: + schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb); + } + // expected db status + byte toDbStatus = fetchDbStatusPairs[j][1]; + if (fetchStatus == -1) { + if (fromDbStatus == -1) { + // nothing fetched yet: new document detected via outlink + toDbStatus = STATUS_DB_UNFETCHED; + } else { + // nothing fetched but new inlinks detected: status is unchanged + toDbStatus = fromDbStatus; + } + } else if (fetchStatus == STATUS_FETCH_RETRY) { + // a simple test of fetch_retry (without retries) + if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) { + toDbStatus = STATUS_DB_UNFETCHED; + } else { + toDbStatus = STATUS_DB_GONE; + } + } + String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>" + : getStatusName(fromDbStatus)); + String fetchStatusName = (fetchStatus == -1 ? 
"<only inlinks>" + : CrawlDatum.getStatusName(fetchStatus)); + LOG.info(fromDbStatusName + " + " + fetchStatusName + " => " + + getStatusName(toDbStatus)); + List<CrawlDatum> values = new ArrayList<CrawlDatum>(); + for (int l = 0; l <= 2; l++) { // number of additional in-links + CrawlDatum fetch = null; + if (fetchStatus == -1) { + // nothing fetched, need at least one in-link + if (l == 0) + continue; + } else { + fetch = new CrawlDatum(); + if (fromDb != null) { + fetch.set(fromDb); + } else { + // not yet in CrawlDb: added by FreeGenerator + schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch); + } + fetch.setStatus(fetchStatus); + fetch.setFetchTime(System.currentTimeMillis()); + } + if (fromDb != null) + values.add(fromDb); + if (fetch != null) + values.add(fetch); + for (int n = 0; n < l; n++) { + values.add(linked); + } + List<CrawlDatum> res = updateDb.update(values); + if (res.size() != 1) { + fail("CrawlDb update didn't result in one single CrawlDatum per URL"); + continue; + } + byte status = res.get(0).getStatus(); + if (status != toDbStatus) { + fail("CrawlDb update for " + fromDbStatusName + " and " + + fetchStatusName + " and " + l + " inlinks results in " + + getStatusName(status) + " (expected: " + + getStatusName(toDbStatus) + ")"); + } + values.clear(); + } + } + } + } + } + + /** + * Test states after inject: inject must not modify the status of CrawlDatums + * already in CrawlDb. Newly injected elements have status "db_unfetched". + * Inject is simulated by calling {@link Injector.InjectReducer#reduce()}. 
+ */ + @Test + public void testCrawlDbStatTransitionInject() { + LOG.info("Test CrawlDatum states in Injector after inject"); + Configuration conf = CrawlDBTestUtil.createConfiguration(); + Injector.InjectReducer injector = new Injector.InjectReducer(); + CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver = + new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf); + ScoringFilters scfilters = new ScoringFilters(conf); + for (String sched : schedules) { + LOG.info("Testing inject with " + sched); + conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched); + FetchSchedule schedule = FetchScheduleFactory + .getFetchSchedule(new JobConf(conf)); + List<CrawlDatum> values = new ArrayList<CrawlDatum>(); + for (int i = 0; i < fetchDbStatusPairs.length; i++) { + byte fromDbStatus = fetchDbStatusPairs[i][1]; + byte toDbStatus = fromDbStatus; + if (fromDbStatus == -1) { + toDbStatus = STATUS_DB_UNFETCHED; + } else { + CrawlDatum fromDb = new CrawlDatum(); + fromDb.setStatus(fromDbStatus); + schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb); + values.add(fromDb); + } + LOG.info("inject " + + (fromDbStatus == -1 ? "<not in CrawlDb>" : CrawlDatum + .getStatusName(fromDbStatus)) + " + " + + getStatusName(STATUS_INJECTED) + " => " + + getStatusName(toDbStatus)); + CrawlDatum injected = new CrawlDatum(STATUS_INJECTED, conf.getInt( + "db.fetch.interval.default", 2592000), 0.1f); + schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, injected); + try { + scfilters.injectedScore(CrawlDbUpdateUtil.dummyURL, injected); + } catch (ScoringFilterException e) { + LOG.error(StringUtils.stringifyException(e)); + } + values.add(injected); + List<CrawlDatum> res = injectDriver.update(values); + if (res.size() != 1) { + fail("Inject didn't result in one single CrawlDatum per URL"); + continue; + } + byte status = res.get(0).getStatus(); + if (status != toDbStatus) { + fail("Inject for " + + (fromDbStatus == -1 ? 
"" : getStatusName(fromDbStatus) + + " and ") + getStatusName(STATUS_INJECTED) + " results in " + + getStatusName(status) + " (expected: " + + getStatusName(toDbStatus) + ")"); + } + values.clear(); + } + } + } + + /** + * Test status db_notmodified detected by + * <ul> + * <li>signature comparison</li> + * <li>or HTTP 304</li> + * </ul> + * In addition, test for all available {@link FetchSchedule} implementations + * whether + * <ul> + * <li>modified time is set</li> + * <li>re-fetch is triggered after a certain time to force the fetched content + * to be in a recent segment (old segments are deleted, see comments in + * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}</li> + * </ul> + */ + @Test + public void testCrawlDbReducerNotModified() { + LOG.info("Test state notmodified"); + Configuration conf = CrawlDBTestUtil.createConfiguration(); + // test not modified detected by signature comparison + for (String sched : schedules) { + String desc = "test notmodified by signature comparison + " + sched; + LOG.info(desc); + conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched); + ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModified(conf); + if (!crawlUtil.run(20)) { + fail("failed: " + desc); + } + } + // test not modified detected by HTTP 304 + for (String sched : schedules) { + String desc = "test notmodified by HTTP 304 + " + sched; + LOG.info(desc); + conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." 
+ sched); + ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModifiedHttp304( + conf); + if (!crawlUtil.run(20)) { + fail("failed: " + desc); + } + } + } + + protected class CrawlTestFetchNotModified extends ContinuousCrawlTestUtil { + + /** time of the current fetch */ + protected long currFetchTime; + /** time the last fetch took place */ + protected long lastFetchTime; + /** + * time the document was fetched first (at all or after it has been changed) + */ + protected long firstFetchTime; + /** state in CrawlDb before the last fetch */ + protected byte previousDbState; + /** signature in CrawlDb of previous fetch */ + protected byte[] lastSignature; + + private long maxFetchInterval; + private FetchSchedule schedule; + + CrawlTestFetchNotModified(Configuration conf) { + super(conf); + maxFetchInterval = conf.getLong("db.fetch.interval.max", 7776000); // default + // = 90 + // days + maxFetchInterval += (24 * 60 * 60); // but take one day more to avoid + // false alarms + maxFetchInterval *= 1000; // in milli-seconds + schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf)); + } + + @Override + protected boolean check(CrawlDatum result) { + if (lastFetchTime > 0 + && (currFetchTime - lastFetchTime) > maxFetchInterval) { + LOG.error("last effective fetch (HTTP 200, not HTTP 304), at " + + new Date(lastFetchTime) + + ", took place more than db.fetch.interval.max time, " + + "segment containing fetched content may have been deleted"); + return false; + } + switch (result.getStatus()) { + case STATUS_DB_NOTMODIFIED: + // db_notmodified is correct if the document has been fetched previously + // and it has not been changed since + if ((previousDbState == STATUS_DB_FETCHED || previousDbState == STATUS_DB_NOTMODIFIED)) { + if (lastSignature != null + && result.getSignature() != null + && SignatureComparator._compare(lastSignature, + result.getSignature()) != 0) { + LOG.error("document has changed (signature changed) but state is still " + + 
getStatusName(STATUS_DB_NOTMODIFIED)); + return false; + } + LOG.info("ok: " + result); + return checkModifiedTime(result, firstFetchTime); + } + LOG.warn("notmodified without previous fetch"); + break; + case STATUS_DB_FETCHED: + if (previousDbState == STATUS_DB_UNFETCHED) { + LOG.info("ok (first fetch): " + result); + return checkModifiedTime(result, firstFetchTime); + } else if (lastSignature != null + && result.getSignature() != null + && SignatureComparator._compare(lastSignature, + result.getSignature()) != 0) { + LOG.info("ok (content changed): " + result); + // expect modified time == now + return checkModifiedTime(result, currFetchTime); + } else { + LOG.warn("document has not changed, db_notmodified expected"); + } + break; + case STATUS_DB_UNFETCHED: + /** + * Status db_unfetched is possible with {@link AdaptiveFetchSchedule} + * because {@link CrawlDbReducer#reduce} calls + * {@link FetchSchedule#forceRefetch} to force a re-fetch if fetch + * interval grows too large. + */ + if (schedule.getClass() == AdaptiveFetchSchedule.class) { + LOG.info("state set to unfetched by AdaptiveFetchSchedule"); + if (result.getSignature() != null) { + LOG.warn("must reset signature: " + result); + return false; + } + LOG.info("ok: " + result); + firstFetchTime = 0; + return true; + } + } + LOG.warn("wrong result: " + result); + return false; + } + + // test modified time + private boolean checkModifiedTime(CrawlDatum result, long modifiedTime) { + if (result.getModifiedTime() == 0) { + LOG.error("modified time not set (TODO: not set by DefaultFetchSchedule)"); + // TODO: return false (but DefaultFetchSchedule does not set modified + // time, see NUTCH-933) + return true; + } else if (modifiedTime == result.getModifiedTime()) { + return true; + } + LOG.error("wrong modified time: " + new Date(result.getModifiedTime()) + + " (expected " + new Date(modifiedTime) + ")"); + return false; + } + + @Override + protected CrawlDatum fetch(CrawlDatum datum, long currentTime) { + 
lastFetchTime = currFetchTime; + currFetchTime = currentTime; + previousDbState = datum.getStatus(); + lastSignature = datum.getSignature(); + datum = super.fetch(datum, currentTime); + if (firstFetchTime == 0) { + firstFetchTime = currFetchTime; + } else if ((currFetchTime - firstFetchTime) > (duration / 2)) { + // simulate a modification after "one year" + changeContent(); + firstFetchTime = currFetchTime; + } + return datum; + } + } + + protected class CrawlTestFetchNotModifiedHttp304 extends + CrawlTestFetchNotModified { + + CrawlTestFetchNotModifiedHttp304(Configuration conf) { + super(conf); + } + + @Override + protected CrawlDatum fetch(CrawlDatum datum, long currentTime) { + lastFetchTime = currFetchTime; + currFetchTime = currentTime; + previousDbState = datum.getStatus(); + lastSignature = datum.getSignature(); + int httpCode; + /* + * document is "really" fetched (no HTTP 304) - if last-modified time or + * signature are unset (page has not been fetched before or fetch is + * forced) - for test purposes, we simulate a modified after "one year" + */ + if (datum.getModifiedTime() == 0 && datum.getSignature() == null + || (currFetchTime - firstFetchTime) > (duration / 2)) { + firstFetchTime = currFetchTime; + httpCode = 200; + datum.setStatus(STATUS_FETCH_SUCCESS); + // modify content to change signature + changeContent(); + } else { + httpCode = 304; + datum.setStatus(STATUS_FETCH_NOTMODIFIED); + } + LOG.info("fetched with HTTP " + httpCode + " => " + + getStatusName(datum.getStatus())); + datum.setFetchTime(currentTime); + return datum; + } + } + + /** + * NUTCH-1245: a fetch_gone should always result in a db_gone. + * <p> + * Even in a long-running continuous crawl, when a gone page is re-fetched + * several times over time. 
+ * </p> + */ + @Test + public void testCrawlDbReducerPageGoneSchedule1() { + LOG.info("NUTCH-1245: test long running continuous crawl"); + ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil( + STATUS_FETCH_GONE, STATUS_DB_GONE); + if (!crawlUtil.run(20)) { + fail("fetch_gone did not result in a db_gone (NUTCH-1245)"); + } + } + + /** + * NUTCH-1245: a fetch_gone should always result in a db_gone. + * <p> + * As some kind of misconfiguration set db.fetch.interval.default to a value + * > (fetchIntervalMax * 1.5). + * </p> + */ + @Test + public void testCrawlDbReducerPageGoneSchedule2() { + LOG.info("NUTCH-1245 (misconfiguration): test with db.fetch.interval.default > (1.5 * db.fetch.interval.max)"); + Configuration conf = CrawlDBTestUtil.createConfiguration(); + int fetchIntervalMax = conf.getInt("db.fetch.interval.max", 0); + conf.setInt("db.fetch.interval.default", 3 + (int) (fetchIntervalMax * 1.5)); + ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(conf, + STATUS_FETCH_GONE, STATUS_DB_GONE); + if (!crawlUtil.run(0)) { + fail("fetch_gone did not result in a db_gone (NUTCH-1245)"); + } + } + + /** + * Test whether signatures are reset for "content-less" states (gone, + * redirect, etc.): otherwise, if this state is temporary and the document + * appears again with the old content, it may get marked as not_modified in + * CrawlDb just after the redirect state. In this case we cannot expect + * content in segments. Cf. NUTCH-1422: reset signature for redirects. + */ + // TODO: can only test if solution is done in CrawlDbReducer + @Test + public void testSignatureReset() { + LOG.info("NUTCH-1422 must reset signature for redirects and similar states"); + Configuration conf = CrawlDBTestUtil.createConfiguration(); + for (String sched : schedules) { + LOG.info("Testing reset signature with " + sched); + conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." 
+ sched); + ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf); + if (!crawlUtil.run(20)) { + fail("failed: signature not reset"); + } + } + } + + private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil { + + byte[][] noContentStates = { { STATUS_FETCH_GONE, STATUS_DB_GONE }, + { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP }, + { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } }; + + int counter = 0; + byte fetchState; + + public CrawlTestSignatureReset(Configuration conf) { + super(conf); + } + + @Override + protected CrawlDatum fetch(CrawlDatum datum, long currentTime) { + datum = super.fetch(datum, currentTime); + counter++; + // flip-flopping between successful fetch and one of content-less states + if (counter % 2 == 1) { + fetchState = STATUS_FETCH_SUCCESS; + } else { + fetchState = noContentStates[(counter % 6) / 2][0]; + } + LOG.info("Step " + counter + ": fetched with " + + getStatusName(fetchState)); + datum.setStatus(fetchState); + return datum; + } + + @Override + protected boolean check(CrawlDatum result) { + if (result.getStatus() == STATUS_DB_NOTMODIFIED + && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) { + LOG.error("Should never get into state " + + getStatusName(STATUS_DB_NOTMODIFIED) + " from " + + getStatusName(fetchState)); + return false; + } + if (result.getSignature() != null + && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) { + LOG.error("Signature not reset in state " + + getStatusName(result.getStatus())); + // ok here: since it's not the problem itself (the db_notmodified), but + // the reason for it + } + return true; + } + + } + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java new file mode 100644 index 0000000..0ce3c5f --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.Reader.Option; +import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum; +import org.apache.nutch.test.IntegrationTest; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to + * fetch 3. Verifies that number of generated urls match 4. Verifies that + * highest scoring urls are generated + * + */ +@Category({IntegrationTest.class}) +public class TestGenerator { + + Configuration conf; + + Path dbDir; + + Path segmentsDir; + + FileSystem fs; + + final static Path testdir = new Path("build/test/generator-test"); + + @Before + public void setUp() throws Exception { + conf = CrawlDBTestUtil.createConfiguration(); + fs = FileSystem.get(conf); + fs.delete(testdir, true); + } + + @After + public void tearDown() { + delete(testdir); + } + + private void delete(Path p) { + try { + fs.delete(p, true); + } catch (IOException e) { + } + } + + /** + * Test that generator generates fetchlish ordered by score (desc). 
+ * + * @throws Exception + */ + @Test + public void testGenerateHighest() throws Exception { + + final int NUM_RESULTS = 2; + + ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); + + for (int i = 0; i <= 100; i++) { + list.add(createURLCrawlDatum("http://aaa/" + pad(i), 1, i)); + } + + createCrawlDB(list); + + Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false); + + Path fetchlist = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + ArrayList<URLCrawlDatum> l = readContents(fetchlist); + + // sort urls by score desc + Collections.sort(l, new ScoreComparator()); + + // verify we got right amount of records + Assert.assertEquals(NUM_RESULTS, l.size()); + + // verify we have the highest scoring urls + Assert.assertEquals("http://aaa/100", (l.get(0).url.toString())); + Assert.assertEquals("http://aaa/099", (l.get(1).url.toString())); + } + + private String pad(int i) { + String s = Integer.toString(i); + while (s.length() < 3) { + s = "0" + s; + } + return s; + } + + /** + * Comparator that sorts by score desc. + */ + public class ScoreComparator implements Comparator<URLCrawlDatum> { + + public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) { + if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) { + return -1; + } + if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) { + return 1; + } + return 0; + } + } + + /** + * Test that generator obeys the property "generate.max.per.host". 
+ * + * @throws Exception + */ + @Test + public void testGenerateHostLimit() throws Exception { + ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); + + list.add(createURLCrawlDatum("http://www.example.com/index1.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.com/index2.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.com/index3.html", 1, 1)); + + createCrawlDB(list); + + Configuration myConfiguration = new Configuration(conf); + myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2); + Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, false); + + Path fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath); + + // verify we got right amount of records + Assert.assertEquals(1, fetchList.size()); + + myConfiguration = new Configuration(conf); + myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); + + fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + fetchList = readContents(fetchlistPath); + + // verify we got right amount of records + Assert.assertEquals(2, fetchList.size()); + + myConfiguration = new Configuration(conf); + myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); + + fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + fetchList = readContents(fetchlistPath); + + // verify we got right amount of records + Assert.assertEquals(3, fetchList.size()); + } + + /** + * Test that generator obeys the property "generator.max.count" and + * "generator.count.per.domain". 
+ * + * @throws Exception + */ + @Test + public void testGenerateDomainLimit() throws Exception { + ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); + + list.add(createURLCrawlDatum("http://a.example.com/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://b.example.com/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://c.example.com/index.html", 1, 1)); + + createCrawlDB(list); + + Configuration myConfiguration = new Configuration(conf); + myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2); + myConfiguration.set(Generator.GENERATOR_COUNT_MODE, + Generator.GENERATOR_COUNT_VALUE_DOMAIN); + + Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, false); + + Path fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath); + + // verify we got right amount of records + Assert.assertEquals(1, fetchList.size()); + + myConfiguration = new Configuration(myConfiguration); + myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); + + fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + fetchList = readContents(fetchlistPath); + + // verify we got right amount of records + Assert.assertEquals(2, fetchList.size()); + + myConfiguration = new Configuration(myConfiguration); + myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); + + fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + fetchList = readContents(fetchlistPath); + + // verify we got right amount of records + Assert.assertEquals(3, fetchList.size()); + } + + /** + * Test generator obeys the filter setting. 
+ * + * @throws Exception + * @throws IOException + */ + @Test + public void testFilter() throws IOException, Exception { + + ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); + + list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1)); + + createCrawlDB(list); + + Configuration myConfiguration = new Configuration(conf); + myConfiguration.set("urlfilter.suffix.file", "filter-all.txt"); + + Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, true); + + Assert.assertNull("should be null (0 entries)", generatedSegment); + + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); + + Path fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath); + + // verify nothing got filtered + Assert.assertEquals(list.size(), fetchList.size()); + + } + + /** + * Read contents of fetchlist. + * + * @param fetchlist + * path to Generated fetchlist + * @return Generated {@link URLCrawlDatum} objects + * @throws IOException + */ + private ArrayList<URLCrawlDatum> readContents(Path fetchlist) + throws IOException { + // verify results + Option rFile = SequenceFile.Reader.file(fetchlist); + SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); + + ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>(); + + READ: do { + Text key = new Text(); + CrawlDatum value = new CrawlDatum(); + if (!reader.next(key, value)) { + break READ; + } + l.add(new URLCrawlDatum(key, value)); + } while (true); + + reader.close(); + return l; + } + + /** + * Generate Fetchlist. 
+ * + * @param numResults + * number of results to generate + * @param config + * Configuration to use + * @return path to generated segment + * @throws IOException + */ + private Path generateFetchlist(int numResults, Configuration config, + boolean filter) throws IOException { + // generate segment + Generator g = new Generator(config); + Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults, + Long.MAX_VALUE, filter, false); + if (generatedSegment == null) + return null; + return generatedSegment[0]; + } + + /** + * Creates CrawlDB. + * + * @param list + * database contents + * @throws IOException + * @throws Exception + */ + private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws IOException, + Exception { + dbDir = new Path(testdir, "crawldb"); + segmentsDir = new Path(testdir, "segments"); + fs.mkdirs(dbDir); + fs.mkdirs(segmentsDir); + + // create crawldb + CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list); + } + + /** + * Constructs new {@link URLCrawlDatum} from submitted parameters. 
+ * + * @param url + * url to use + * @param fetchInterval + * {@link CrawlDatum#setFetchInterval(float)} + * @param score + * {@link CrawlDatum#setScore(float)} + * @return Constructed object + */ + private URLCrawlDatum createURLCrawlDatum(final String url, + final int fetchInterval, final float score) { + return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum( + CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java new file mode 100644 index 0000000..59a3e8c --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.Reader.Option; +import org.apache.nutch.test.IntegrationTest; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Basic injector test: 1. Creates a text file with urls 2. Injects them into + * crawldb 3. Reads crawldb entries and verifies contents 4. Injects more urls + * into webdb 5. Reads crawldb entries and verifies contents + * + */ +@Category({IntegrationTest.class}) +public class TestInjector { + + private Configuration conf; + private FileSystem fs; + final static Path testdir = new Path("build/test/inject-test"); + Path crawldbPath; + Path urlPath; + + @Before + public void setUp() throws Exception { + conf = CrawlDBTestUtil.createConfiguration(); + urlPath = new Path(testdir, "urls"); + crawldbPath = new Path(testdir, "crawldb"); + fs = FileSystem.get(conf); + if (fs.exists(urlPath)) + fs.delete(urlPath, false); + if (fs.exists(crawldbPath)) + fs.delete(crawldbPath, true); + } + + @After + public void tearDown() throws IOException { + fs.delete(testdir, true); + } + + @Test + public void testInject() + throws IOException, ClassNotFoundException, InterruptedException { + ArrayList<String> urls = new ArrayList<String>(); + // We'll use a separate list for MD so we can still compare url with + // containsAll + ArrayList<String> metadata = new ArrayList<String>(); + for (int i = 0; i < 100; i++) { + urls.add("http://zzz.com/" + i + ".html"); + metadata.add("\tnutch.score=2." 
+ i + + "\tnutch.fetchInterval=171717\tkey=value"); + } + CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata); + + Injector injector = new Injector(conf); + injector.inject(crawldbPath, urlPath); + + // verify results + List<String> read = readCrawldb(); + + Collections.sort(read); + Collections.sort(urls); + + Assert.assertEquals(urls.size(), read.size()); + + Assert.assertTrue(read.containsAll(urls)); + Assert.assertTrue(urls.containsAll(read)); + + // inject more urls + ArrayList<String> urls2 = new ArrayList<String>(); + for (int i = 0; i < 100; i++) { + urls2.add("http://xxx.com/" + i + ".html"); + // We'll overwrite previously injected records but preserve their original + // MD + urls2.add("http://zzz.com/" + i + ".html"); + } + CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2); + injector = new Injector(conf); + conf.setBoolean("db.injector.update", true); + injector.inject(crawldbPath, urlPath); + urls.addAll(urls2); + + // verify results + read = readCrawldb(); + + Collections.sort(read); + Collections.sort(urls); + + // We should have 100 less records because we've overwritten + Assert.assertEquals(urls.size() - 100, read.size()); + + Assert.assertTrue(read.containsAll(urls)); + Assert.assertTrue(urls.containsAll(read)); + + // Check if we correctly preserved MD + Map<String, CrawlDatum> records = readCrawldbRecords(); + + // Iterate over the urls, we're looking for http://zzz.com/ prefixed URLs + // so we can check for MD and score and interval + Text writableKey = new Text("key"); + Text writableValue = new Text("value"); + for (String url : urls) { + if (url.indexOf("http://zzz") == 0) { + // Check for fetch interval + Assert.assertTrue(records.get(url).getFetchInterval() == 171717); + // Check for default score + Assert.assertTrue(records.get(url).getScore() != 1.0); + // Check for MD key=value + Assert.assertEquals(writableValue, + records.get(url).getMetaData().get(writableKey)); + } + } + } + + private List<String> readCrawldb() 
throws IOException { + Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME + + "/part-r-00000/data"); + System.out.println("reading:" + dbfile); + Option rFile = SequenceFile.Reader.file(dbfile); + @SuppressWarnings("resource") + SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); + ArrayList<String> read = new ArrayList<String>(); + + READ: do { + Text key = new Text(); + CrawlDatum value = new CrawlDatum(); + if (!reader.next(key, value)) + break READ; + read.add(key.toString()); + } while (true); + + return read; + } + + private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException { + Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME + + "/part-r-00000/data"); + System.out.println("reading:" + dbfile); + Option rFile = SequenceFile.Reader.file(dbfile); + @SuppressWarnings("resource") + SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); + HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>(); + + READ: do { + Text key = new Text(); + CrawlDatum value = new CrawlDatum(); + if (!reader.next(key, value)) + break READ; + read.put(key.toString(), value); + } while (true); + + return read; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java new file mode 100644 index 0000000..23aaa88 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java @@ -0,0 +1,160 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.TreeMap; +import java.util.logging.Logger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestLinkDbMerger { + private static final Logger LOG = Logger.getLogger(TestLinkDbMerger.class + .getName()); + + String url10 = "http://example.com/foo"; + String[] urls10 = new String[] { "http://example.com/100", + "http://example.com/101" }; + + String url11 = "http://example.com/"; + String[] urls11 = new String[] { "http://example.com/110", + "http://example.com/111" }; + + String url20 = "http://example.com/"; + String[] urls20 = new String[] { "http://foo.com/200", "http://foo.com/201" }; + String url21 = "http://example.com/bar"; + String[] urls21 = new String[] { "http://foo.com/210", "http://foo.com/211" }; + + String[] urls10_expected = urls10; + String[] urls11_expected = new String[] { urls11[0], urls11[1], urls20[0], + urls20[1] }; + 
String[] urls20_expected = urls11_expected; + String[] urls21_expected = urls21; + + TreeMap<String, String[]> init1 = new TreeMap<String, String[]>(); + TreeMap<String, String[]> init2 = new TreeMap<String, String[]>(); + HashMap<String, String[]> expected = new HashMap<String, String[]>(); + Configuration conf; + Path testDir; + FileSystem fs; + LinkDbReader reader; + + @Before + public void setUp() throws Exception { + init1.put(url10, urls10); + init1.put(url11, urls11); + init2.put(url20, urls20); + init2.put(url21, urls21); + expected.put(url10, urls10_expected); + expected.put(url11, urls11_expected); + expected.put(url20, urls20_expected); + expected.put(url21, urls21_expected); + conf = NutchConfiguration.create(); + fs = FileSystem.get(conf); + testDir = new Path("build/test/test-linkdb-" + + new java.util.Random().nextInt()); + fs.mkdirs(testDir); + } + + @After + public void tearDown() { + try { + if (fs.exists(testDir)) + fs.delete(testDir, true); + } catch (Exception e) { + } + try { + reader.close(); + } catch (Exception e) { + } + } + + @Test + public void testMerge() throws Exception { + Configuration conf = NutchConfiguration.create(); + FileSystem fs = FileSystem.get(conf); + fs.mkdirs(testDir); + Path linkdb1 = new Path(testDir, "linkdb1"); + Path linkdb2 = new Path(testDir, "linkdb2"); + Path output = new Path(testDir, "output"); + createLinkDb(conf, fs, linkdb1, init1); + createLinkDb(conf, fs, linkdb2, init2); + LinkDbMerger merger = new LinkDbMerger(conf); + LOG.fine("* merging linkdbs to " + output); + merger.merge(output, new Path[] { linkdb1, linkdb2 }, false, false); + LOG.fine("* reading linkdb: " + output); + reader = new LinkDbReader(conf, output); + Iterator<String> it = expected.keySet().iterator(); + while (it.hasNext()) { + String url = it.next(); + LOG.fine("url=" + url); + String[] vals = expected.get(url); + Inlinks inlinks = reader.getInlinks(new Text(url)); + // may not be null + Assert.assertNotNull(inlinks); + 
ArrayList<String> links = new ArrayList<String>(); + Iterator<?> it2 = inlinks.iterator(); + while (it2.hasNext()) { + Inlink in = (Inlink) it2.next(); + links.add(in.getFromUrl()); + } + for (int i = 0; i < vals.length; i++) { + LOG.fine(" -> " + vals[i]); + Assert.assertTrue(links.contains(vals[i])); + } + } + reader.close(); + fs.delete(testDir, true); + } + + private void createLinkDb(Configuration config, FileSystem fs, Path linkdb, + TreeMap<String, String[]> init) throws Exception { + LOG.fine("* creating linkdb: " + linkdb); + Path dir = new Path(linkdb, LinkDb.CURRENT_NAME); + + Option wKeyOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(Inlinks.class); + MapFile.Writer writer = new MapFile.Writer(config, new Path(dir, + "part-00000"), wKeyOpt, wValueOpt); + Iterator<String> it = init.keySet().iterator(); + while (it.hasNext()) { + String key = it.next(); + Inlinks inlinks = new Inlinks(); + String[] vals = init.get(key); + for (int i = 0; i < vals.length; i++) { + Inlink in = new Inlink(vals[i], vals[i]); + inlinks.add(in); + } + writer.append(new Text(key), inlinks); + } + writer.close(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java new file mode 100644 index 0000000..db82d7a --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.crawl; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestSignatureFactory { + + @Test + public void testGetSignature() { + Configuration conf = NutchConfiguration.create(); + Signature signature1 = SignatureFactory.getSignature(conf); + Signature signature2 = SignatureFactory.getSignature(conf); + Assert.assertNotNull(signature1); + Assert.assertNotNull(signature2); + Assert.assertEquals(signature1, signature2); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java new file mode 100644 index 0000000..a23d080 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.fetcher;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.test.IntegrationTest;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.mortbay.jetty.Server;

/**
 * Basic fetcher test: 1. generate seedlist 2. inject 3. generate 4. fetch
 * 5. verify contents
 *
 */
public class TestFetcher {

  final static Path testdir = new Path("build/test/fetch-test");
  Configuration conf;
  FileSystem fs;
  Path crawldbPath;
  Path segmentsPath;
  Path urlPath;
  // embedded Jetty server serving the static fetch-test-site pages
  Server server;

  @Before
  public void setUp() throws Exception {
    conf = CrawlDBTestUtil.createConfiguration();
    fs = FileSystem.get(conf);
    fs.delete(testdir, true);
    urlPath = new Path(testdir, "urls");
    crawldbPath = new Path(testdir, "crawldb");
    segmentsPath = new Path(testdir, "segments");
    server = CrawlDBTestUtil.getServer(
        conf.getInt("content.server.port", 50000),
        "build/test/data/fetch-test-site");
    server.start();
  }

  @After
  public void tearDown() throws Exception {
    server.stop();
    // wait up to 5 seconds for the server to finish stopping before wiping
    // the test directory out from under it
    for (int i = 0; i < 5; i++) {
      if (!server.isStopped()) {
        Thread.sleep(1000);
      }
    }
    fs.delete(testdir, true);
  }

  /**
   * End-to-end fetch: seed -> inject -> generate -> fetch (with parsing),
   * then verify politeness timing, fetched content and parse metadata.
   */
  @Test
  @Category(IntegrationTest.class)
  public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {

    // generate seedlist
    ArrayList<String> urls = new ArrayList<String>();

    addUrl(urls, "index.html");
    addUrl(urls, "pagea.html");
    addUrl(urls, "pageb.html");
    addUrl(urls, "dup_of_pagea.html");
    addUrl(urls, "nested_spider_trap.html");
    addUrl(urls, "exception.html");

    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);

    // inject
    Injector injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);

    // generate
    Generator g = new Generator(conf);
    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
        Long.MAX_VALUE, Long.MAX_VALUE, false, false);

    long time = System.currentTimeMillis();
    // fetch
    Fetcher fetcher = new Fetcher(conf);

    // Set fetcher.parse to true
    conf.setBoolean("fetcher.parse", true);

    fetcher.fetch(generatedSegment[0], 1);

    time = System.currentTimeMillis() - time;

    // verify politeness, time taken should be more than (num_of_pages +1)*delay
    // NOTE(review): this makes the test inherently slow and potentially flaky
    // on a loaded machine
    int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat(
        "fetcher.server.delay", 5));
    Assert.assertTrue(time > minimumTime);

    // verify content
    Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
        "part-00000/data");
    @SuppressWarnings("resource")
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));

    ArrayList<String> handledurls = new ArrayList<String>();

    // collect all urls whose stored content looks like a test page
    READ_CONTENT: do {
      Text key = new Text();
      Content value = new Content();
      if (!reader.next(key, value))
        break READ_CONTENT;
      String contentString = new String(value.getContent());
      if (contentString.indexOf("Nutch fetcher test page") != -1) {
        handledurls.add(key.toString());
      }
    } while (true);

    reader.close();

    Collections.sort(urls);
    Collections.sort(handledurls);

    // verify that enough pages were handled
    Assert.assertEquals(urls.size(), handledurls.size());

    // verify that correct pages were handled
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));

    handledurls.clear();

    // verify parse data
    Path parseData = new Path(
        new Path(generatedSegment[0], ParseData.DIR_NAME), "part-00000/data");
    reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));

    READ_PARSE_DATA: do {
      Text key = new Text();
      ParseData value = new ParseData();
      if (!reader.next(key, value))
        break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and
      // "nutch.content.digest"
      // keys in parse metadata
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
          && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while (true);

    Collections.sort(handledurls);

    Assert.assertEquals(urls.size(), handledurls.size());

    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
  }

  /** Appends the server-local url for {@code page} to {@code urls}. */
  private void addUrl(ArrayList<String> urls, String page) {
    urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
        + page);
  }

  /**
   * The fetcher must refuse to run when no agent name is configured.
   */
  @Test
  public void testAgentNameCheck() {

    boolean failedNoAgentName = false;
    conf.set("http.agent.name", "");

    try {
      conf.setBoolean("fetcher.parse", false);
      Fetcher fetcher = new Fetcher(conf);
      fetcher.fetch(null, 1);
    } catch (IllegalArgumentException iae) {
      // expected path: the exact message identifies the agent-name check
      String message = iae.getMessage();
      failedNoAgentName = message.equals("Fetcher: No agents listed in "
          + "'http.agent.name' property.");
    } catch (Exception e) {
      // any other failure leaves failedNoAgentName false and fails the assert
    }

    Assert.assertTrue(failedNoAgentName);
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
new file mode 100644
index 0000000..3a25f26
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -0,0 +1,190 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +package org.apache.nutch.indexer; + +import org.apache.commons.codec.binary.Base64; +import org.apache.hadoop.mrunit.ReduceDriver; +import org.apache.hadoop.mrunit.types.Pair; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.Reducer; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; + +/** Test {@link IndexerMapReduce} */ +public class TestIndexerMapReduce { + + private static final Logger LOG = LoggerFactory + .getLogger(TestIndexerMapReduce.class); + + public static String testUrl = "http://nutch.apache.org/"; + public static Text testUrlText = new Text(testUrl); + public static String htmlContentType = "text/html"; + public static String testHtmlDoc = "<!DOCTYPE html>\n" + + "<html>\n" + + "<head>\n" + + "<title>Test Indexing Binary Content</title>\n" + + "<meta charset=\"utf-8\">\n" + + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n" + + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caractères\" />\n" + + "<meta name=\"keywords\" lang=\"cs\" content=\"kódovánà znaků\" />\n" + + "</head>\n" + + "<body>\n" + + "<p>\n" + + "<ul>\n" + + " <li 
lang=\"en\">English: character set, encoding\n" + + " <li lang=\"fr\">Français: codage des caractères\n" + + " <li lang=\"cs\">ÄeÅ¡tina: kódovánà znaků (not covered by Latin-1)\n" + + "</ul>\n" + + "</body>\n" + + "</html>"; + public static Metadata htmlMeta = new Metadata(); + static { + htmlMeta.add("Content-Type", "text/html"); + // add segment and signature to avoid NPEs + htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123"); + htmlMeta.add(Nutch.SIGNATURE_KEY, "123"); + } + public static ParseText parseText = new ParseText("Test"); + public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, + "Test", new Outlink[] {}, htmlMeta); + public static CrawlDatum crawlDatumDbFetched = new CrawlDatum( + CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24); + public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum( + CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24); + + private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce(); + private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver; + private Configuration configuration; + + + /** + * Test indexing of base64-encoded binary content. 
+ */ + @Test + @Category(IntegrationTest.class) + public void testBinaryContentBase64() { + configuration = NutchConfiguration.create(); + configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true); + + Charset[] testCharsets = { StandardCharsets.UTF_8, + Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") }; + for (Charset charset : testCharsets) { + LOG.info("Testing indexing binary content as base64 for charset {}", + charset.name()); + + String htmlDoc = testHtmlDoc; + if (charset != StandardCharsets.UTF_8) { + htmlDoc = htmlDoc.replaceAll("utf-8", charset.name()); + if (charset.name().equalsIgnoreCase("iso-8859-1")) { + // Western-European character set: remove Czech content + htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", ""); + } else if (charset.name().equalsIgnoreCase("iso-8859-2")) { + // Eastern-European character set: remove French content + htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", ""); + } + } + + Content content = new Content(testUrl, testUrl, + htmlDoc.getBytes(charset), htmlContentType, htmlMeta, + configuration); + + NutchDocument doc = runIndexer(crawlDatumDbFetched, + crawlDatumFetchSuccess, parseText, parseData, content); + assertNotNull("No NutchDocument indexed", doc); + + String binaryContentBase64 = (String) doc.getField("binaryContent") + .getValues().get(0); + LOG.info("binary content (base64): {}", binaryContentBase64); + String binaryContent = new String( + Base64.decodeBase64(binaryContentBase64), charset); + LOG.info("binary content (decoded): {}", binaryContent); + assertEquals( + "Binary content (" + charset + ") not correctly saved as base64", + htmlDoc, binaryContent); + } + } + + /** + * Run {@link IndexerMapReduce.reduce(...)} to get a "indexed" + * {@link NutchDocument} by passing objects from segment and CrawlDb to the + * indexer. 
+ * + * @param dbDatum + * crawl datum from CrawlDb + * @param fetchDatum + * crawl datum (fetch status) from segment + * @param parseText + * plain text from parsed document + * @param parseData + * parse data + * @param content + * (optional, if index binary content) protocol content + * @return "indexed" document + */ + public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum, + ParseText parseText, ParseData parseData, Content content) { + List<NutchWritable> values = new ArrayList<NutchWritable>(); + values.add(new NutchWritable(dbDatum)); + values.add(new NutchWritable(fetchDatum)); + values.add(new NutchWritable(parseText)); + values.add(new NutchWritable(parseData)); + values.add(new NutchWritable(content)); + reduceDriver = ReduceDriver.newReduceDriver(reducer); + reduceDriver.setConfiguration(configuration); + reduceDriver.withInput(testUrlText, values); + List<Pair<Text, NutchIndexAction>> reduceResult; + NutchDocument doc = null; + try { + reduceResult = reduceDriver.run(); + for (Pair<Text, NutchIndexAction> p : reduceResult) { + if (p.getSecond().action != NutchIndexAction.DELETE) { + doc = p.getSecond().doc; + } + } + } catch (IOException e) { + LOG.error(StringUtils.stringifyException(e)); + } + return doc; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java new file mode 100644 index 0000000..14b246b --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class TestIndexingFilters { + + /** + * Test behaviour when defined filter does not exist. 
+ * + * @throws IndexingException + */ + @Test + public void testNonExistingIndexingFilter() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + conf.addResource("nutch-default.xml"); + conf.addResource("crawl-tests.xml"); + + String class1 = "NonExistingFilter"; + String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter"; + conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2); + + IndexingFilters filters = new IndexingFilters(conf); + filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData( + new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text( + "http://www.example.com/"), new CrawlDatum(), new Inlinks()); + } + + /** + * Test behaviour when NutchDOcument is null + */ + @Test + public void testNutchDocumentNullIndexingFilter() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + conf.addResource("nutch-default.xml"); + conf.addResource("crawl-tests.xml"); + + IndexingFilters filters = new IndexingFilters(conf); + NutchDocument doc = filters.filter(null, new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], + new Metadata())), new Text("http://www.example.com/"), + new CrawlDatum(), new Inlinks()); + + Assert.assertNull(doc); + } + + /** + * Test behaviour when reset the index filter order will not take effect + * + * @throws IndexingException + */ + @Test + public void testFilterCacheIndexingFilter() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + conf.addResource("nutch-default.xml"); + conf.addResource("crawl-tests.xml"); + + String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter"; + conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1); + + IndexingFilters filters1 = new IndexingFilters(conf); + NutchDocument fdoc1 = filters1.filter(new NutchDocument(), new ParseImpl( + "text", new ParseData(new ParseStatus(), "title", new Outlink[0], + new Metadata())), new 
Text("http://www.example.com/"), + new CrawlDatum(), new Inlinks()); + + // add another index filter + String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer"; + // set content metadata + Metadata md = new Metadata(); + md.add("example", "data"); + // set content metadata property defined in MetadataIndexer + conf.set("index.content.md", "example"); + // add MetadataIndxer filter + conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2); + IndexingFilters filters2 = new IndexingFilters(conf); + NutchDocument fdoc2 = filters2.filter(new NutchDocument(), new ParseImpl( + "text", new ParseData(new ParseStatus(), "title", new Outlink[0], md)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + Assert.assertEquals(fdoc1.getFieldNames().size(), fdoc2.getFieldNames() + .size()); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java new file mode 100644 index 0000000..f3a320d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java @@ -0,0 +1,281 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Properties; + +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}. + */ +public class TestMetadata { + + private static final String CONTENTTYPE = "contenttype"; + + /** + * Test to ensure that only non-null values get written when the + * {@link Metadata} object is written using a Writeable. + * + * @since NUTCH-406 + * + */ + @Test + public void testWriteNonNull() { + Metadata met = new Metadata(); + met.add(CONTENTTYPE, null); + met.add(CONTENTTYPE, "text/bogus"); + met.add(CONTENTTYPE, "text/bogus2"); + met = writeRead(met); + + Assert.assertNotNull(met); + Assert.assertEquals(met.size(), 1); + + boolean hasBogus = false, hasBogus2 = false; + + String[] values = met.getValues(CONTENTTYPE); + Assert.assertNotNull(values); + Assert.assertEquals(values.length, 2); + + for (int i = 0; i < values.length; i++) { + if (values[i].equals("text/bogus")) { + hasBogus = true; + } + + if (values[i].equals("text/bogus2")) { + hasBogus2 = true; + } + } + + Assert.assertTrue(hasBogus && hasBogus2); + } + + /** Test for the <code>add(String, String)</code> method. 
*/ + @Test + public void testAdd() { + String[] values = null; + Metadata meta = new Metadata(); + + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(0, values.length); + + meta.add(CONTENTTYPE, "value1"); + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(1, values.length); + Assert.assertEquals("value1", values[0]); + + meta.add(CONTENTTYPE, "value2"); + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(2, values.length); + Assert.assertEquals("value1", values[0]); + Assert.assertEquals("value2", values[1]); + + // NOTE : For now, the same value can be added many times. + // Should it be changed? + meta.add(CONTENTTYPE, "value1"); + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(3, values.length); + Assert.assertEquals("value1", values[0]); + Assert.assertEquals("value2", values[1]); + Assert.assertEquals("value1", values[2]); + } + + /** Test for the <code>set(String, String)</code> method. */ + @Test + public void testSet() { + String[] values = null; + Metadata meta = new Metadata(); + + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(0, values.length); + + meta.set(CONTENTTYPE, "value1"); + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(1, values.length); + Assert.assertEquals("value1", values[0]); + + meta.set(CONTENTTYPE, "value2"); + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(1, values.length); + Assert.assertEquals("value2", values[0]); + + meta.set(CONTENTTYPE, "new value 1"); + meta.add("contenttype", "new value 2"); + values = meta.getValues(CONTENTTYPE); + Assert.assertEquals(2, values.length); + Assert.assertEquals("new value 1", values[0]); + Assert.assertEquals("new value 2", values[1]); + } + + /** Test for <code>setAll(Properties)</code> method. 
*/ + @Test + public void testSetProperties() { + String[] values = null; + Metadata meta = new Metadata(); + Properties props = new Properties(); + + meta.setAll(props); + Assert.assertEquals(0, meta.size()); + + props.setProperty("name-one", "value1.1"); + meta.setAll(props); + Assert.assertEquals(1, meta.size()); + values = meta.getValues("name-one"); + Assert.assertEquals(1, values.length); + Assert.assertEquals("value1.1", values[0]); + + props.setProperty("name-two", "value2.1"); + meta.setAll(props); + Assert.assertEquals(2, meta.size()); + values = meta.getValues("name-one"); + Assert.assertEquals(1, values.length); + Assert.assertEquals("value1.1", values[0]); + values = meta.getValues("name-two"); + Assert.assertEquals(1, values.length); + Assert.assertEquals("value2.1", values[0]); + } + + /** Test for <code>get(String)</code> method. */ + @Test + public void testGet() { + Metadata meta = new Metadata(); + Assert.assertNull(meta.get("a-name")); + meta.add("a-name", "value-1"); + Assert.assertEquals("value-1", meta.get("a-name")); + meta.add("a-name", "value-2"); + Assert.assertEquals("value-1", meta.get("a-name")); + } + + /** Test for <code>isMultiValued()</code> method. */ + @Test + public void testIsMultiValued() { + Metadata meta = new Metadata(); + Assert.assertFalse(meta.isMultiValued("key")); + meta.add("key", "value1"); + Assert.assertFalse(meta.isMultiValued("key")); + meta.add("key", "value2"); + Assert.assertTrue(meta.isMultiValued("key")); + } + + /** Test for <code>names</code> method. */ + @Test + public void testNames() { + String[] names = null; + Metadata meta = new Metadata(); + names = meta.names(); + Assert.assertEquals(0, names.length); + + meta.add("name-one", "value"); + names = meta.names(); + Assert.assertEquals(1, names.length); + Assert.assertEquals("name-one", names[0]); + meta.add("name-two", "value"); + names = meta.names(); + Assert.assertEquals(2, names.length); + } + + /** Test for <code>remove(String)</code> method. 
*/ + @Test + public void testRemove() { + Metadata meta = new Metadata(); + meta.remove("name-one"); + Assert.assertEquals(0, meta.size()); + meta.add("name-one", "value-1.1"); + meta.add("name-one", "value-1.2"); + meta.add("name-two", "value-2.2"); + Assert.assertEquals(2, meta.size()); + Assert.assertNotNull(meta.get("name-one")); + Assert.assertNotNull(meta.get("name-two")); + meta.remove("name-one"); + Assert.assertEquals(1, meta.size()); + Assert.assertNull(meta.get("name-one")); + Assert.assertNotNull(meta.get("name-two")); + meta.remove("name-two"); + Assert.assertEquals(0, meta.size()); + Assert.assertNull(meta.get("name-one")); + Assert.assertNull(meta.get("name-two")); + } + + /** Test for <code>equals(Object)</code> method. */ + @Test + public void testObject() { + Metadata meta1 = new Metadata(); + Metadata meta2 = new Metadata(); + Assert.assertFalse(meta1.equals(null)); + Assert.assertFalse(meta1.equals("String")); + Assert.assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.1"); + Assert.assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.1"); + Assert.assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.2"); + Assert.assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.2"); + Assert.assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.1"); + Assert.assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.1"); + Assert.assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.2"); + Assert.assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.x"); + Assert.assertFalse(meta1.equals(meta2)); + } + + /** Test for <code>Writable</code> implementation. 
*/ + @Test + public void testWritable() { + Metadata result = null; + Metadata meta = new Metadata(); + result = writeRead(meta); + Assert.assertEquals(0, result.size()); + meta.add("name-one", "value-1.1"); + result = writeRead(meta); + Assert.assertEquals(1, result.size()); + Assert.assertEquals(1, result.getValues("name-one").length); + Assert.assertEquals("value-1.1", result.get("name-one")); + meta.add("name-two", "value-2.1"); + meta.add("name-two", "value-2.2"); + result = writeRead(meta); + Assert.assertEquals(2, result.size()); + Assert.assertEquals(1, result.getValues("name-one").length); + Assert.assertEquals("value-1.1", result.getValues("name-one")[0]); + Assert.assertEquals(2, result.getValues("name-two").length); + Assert.assertEquals("value-2.1", result.getValues("name-two")[0]); + Assert.assertEquals("value-2.2", result.getValues("name-two")[1]); + } + + private Metadata writeRead(Metadata meta) { + Metadata readed = new Metadata(); + try { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + meta.write(new DataOutputStream(out)); + readed.readFields(new DataInputStream(new ByteArrayInputStream(out + .toByteArray()))); + } catch (IOException ioe) { + Assert.fail(ioe.toString()); + } + return readed; + } + +}
