Author: lewismc Date: Fri Aug 10 14:11:30 2012 New Revision: 1371708 URL: http://svn.apache.org/viewvc?rev=1371708&view=rev Log: NUTCH-1160 Write JUnit test for index-basic
Added: nutch/branches/2.x/src/plugin/index-basic/src/test/ nutch/branches/2.x/src/plugin/index-basic/src/test/org/ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Modified: nutch/branches/2.x/conf/schema-solr4.xml nutch/branches/2.x/src/plugin/build.xml nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Modified: nutch/branches/2.x/conf/schema-solr4.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1371708&r1=1371707&r2=1371708&view=diff ============================================================================== --- nutch/branches/2.x/conf/schema-solr4.xml (original) +++ nutch/branches/2.x/conf/schema-solr4.xml Fri Aug 10 14:11:30 2012 @@ -311,6 +311,8 @@ <!-- fields for index-basic plugin --> <field name="host" type="url" stored="false" indexed="true"/> <field name="url" type="url" stored="true" indexed="true" required="true"/> + <field name="orig" type="url" stored="true" indexed="true" /> + <field name="site" type="string" stored="false" indexed="true"/> <!-- stored=true for highlighting, use term vectors and positions for fast highlighting --> <field name="content" type="text_general" stored="true" indexed="true"/> <field name="title" type="text_general" stored="true" indexed="true"/> Modified: nutch/branches/2.x/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1371708&r1=1371707&r2=1371708&view=diff 
============================================================================== --- nutch/branches/2.x/src/plugin/build.xml (original) +++ nutch/branches/2.x/src/plugin/build.xml Fri Aug 10 14:11:30 2012 @@ -74,7 +74,8 @@ <ant dir="parse-tika" target="test"/> <ant dir="protocol-file" target="test"/> <ant dir="parse-html" target="test"/> - <ant dir="index-anchor" target="test"/> + <ant dir="index-anchor" target="test"/> + <ant dir="index-basic" target="test"/> <ant dir="index-more" target="test"/> <ant dir="language-identifier" target="test"/> <ant dir="protocol-httpclient" target="test"/> Modified: nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1371708&r1=1371707&r2=1371708&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Fri Aug 10 14:11:30 2012 @@ -16,15 +16,10 @@ */ package org.apache.nutch.indexer.anchor; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.util.Collection; - import junit.framework.TestCase; import org.apache.avro.util.Utf8; import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.NutchConfiguration; Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1371708&r1=1371707&r2=1371708&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Fri Aug 10 14:11:30 2012 @@ -36,7 +36,17 @@ import org.apache.nutch.util.Bytes; import org.apache.nutch.util.TableUtil; import org.apache.solr.common.util.DateUtil; -/** Adds basic searchable fields to a document. */ +/** Adds basic searchable fields to a document. The fields are: + * host - add host as un-stored, indexed and tokenized + * site - add site as un-stored, indexed and un-tokenized + * url - url is both stored and indexed, so it's both searchable and returned. + * This is also a required field. 
+ * orig - also store original url as both stored and indexed + * content - content is indexed, so that it's searchable, but not stored in index + * title - title is stored and indexed + * cache - add cached content/summary display policy, if available + * tstamp - add timestamp when fetched, for deduplication + */ public class BasicIndexingFilter implements IndexingFilter { public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class); @@ -51,6 +61,16 @@ public class BasicIndexingFilter impleme FIELDS.add(WebPage.Field.FETCH_TIME); } + /** + * The {@link BasicIndexingFilter} filter object which supports an integer + * configurable value for the maximum number of characters permitted within the + * title, see {@code indexer.max.title.length} in nutch-default.xml + * + * @param doc The {@link NutchDocument} object + * @param url URL of the page being indexed + * @param page {@link WebPage} object relative to the URL + * @return filtered NutchDocument + */ public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { @@ -83,7 +103,7 @@ public class BasicIndexingFilter impleme doc.add("url", reprUrl == null ? url : reprUrl); if (reprUrl != null) { - // also store original url as both stored and indexes + // also store original url as both stored and indexed doc.add("orig", url); } @@ -118,15 +138,28 @@ public class BasicIndexingFilter impleme public void addIndexBackendOptions(Configuration conf) { } + /** + * Set the {@link Configuration} object + */ public void setConf(Configuration conf) { this.conf = conf; this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); + LOG.info("Maximum title length for indexing set to: " + this.MAX_TITLE_LENGTH); } + /** + * Get the {@link Configuration} object + */ public Configuration getConf() { return this.conf; } + /** + * Gets all the fields for a given {@link WebPage}. + * Many datastores need to set up the mapreduce job by specifying the fields + * needed. 
All extensions that work on WebPage are able to specify what fields + * they need. + */ @Override public Collection<WebPage.Field> getFields() { return FIELDS; Added: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1371708&view=auto ============================================================================== --- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (added) +++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Fri Aug 10 14:11:30 2012 @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.basic; + +import java.nio.ByteBuffer; + +import org.apache.avro.util.Utf8; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.fetcher.FetcherJob; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; +import junit.framework.TestCase; + +/** + * JUnit test case for {@link BasicIndexingFilter} which tests + * 1. that the host, site, url, orig, content, title and tstamp fields + * are obtained by the filter (the cache field check is currently commented out). + * 2. that the configurable maximum length functionality for titles actually works. + * This property ({@code indexer.max.title.length}) defaults to 100 characters + * in nutch-default.xml but has been set to 10 for this test. + * NOTE(review): this class extends TestCase while also using {@code @Test}; presumably only one JUnit style is needed - confirm. + * @author lewismc + */ + +public class TestBasicIndexingFilter extends TestCase { + + @Test + public void testBasicFields() throws Exception { + Configuration conf = NutchConfiguration.create(); + BasicIndexingFilter filter = new BasicIndexingFilter(); + filter.setConf(conf); + assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + WebPage page = new WebPage(); + page.putToInlinks(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch")); + page.setTitle(new Utf8("Welcome to Nutch")); + page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org")); + //ByteBuffer bbuf = ByteBuffer.allocate(10); + //bbuf.putInt(123456789); + //page.putToMetadata(new Utf8("Cache_policy"), bbuf); + page.setFetchTime(System.currentTimeMillis()); + try { + filter.filter(doc, "http://www.apache.org/", page); + } catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + assertNotNull(doc); + assertTrue("check for host field ", doc.getFieldNames().contains("host")); + assertTrue("check for site field", doc.getFieldNames().contains("site")); + assertTrue("check for url field", doc.getFieldNames().contains("url")); + assertTrue("check for orig field", 
doc.getFieldNames().contains("orig")); + assertTrue("check for content field", doc.getFieldNames().contains("content")); + assertTrue("check for title field", doc.getFieldNames().contains("title")); + //assertTrue("check for cache field", doc.getFieldNames().contains("cache")); + assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp")); + } + + @Test + public void testTitleFieldLength() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.setInt("indexer.max.title.length", 10); + BasicIndexingFilter filter = new BasicIndexingFilter(); + filter.setConf(conf); + assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + WebPage page = new WebPage(); + page.putToInlinks(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site")); + page.setTitle(new Utf8("This title exceeds maximum characters")); + try { + filter.filter(doc, "http://www.apache.org/", page); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + assertNotNull(doc); + assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length()); + } +}