Author: lewismc
Date: Fri Aug 10 14:11:30 2012
New Revision: 1371708
URL: http://svn.apache.org/viewvc?rev=1371708&view=rev
Log:
NUTCH-1160 Write JUnit test for index-basic
Added:
nutch/branches/2.x/src/plugin/index-basic/src/test/
nutch/branches/2.x/src/plugin/index-basic/src/test/org/
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
Modified:
nutch/branches/2.x/conf/schema-solr4.xml
nutch/branches/2.x/src/plugin/build.xml
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Fri Aug 10 14:11:30 2012
@@ -311,6 +311,8 @@
<!-- fields for index-basic plugin -->
<field name="host" type="url" stored="false" indexed="true"/>
<field name="url" type="url" stored="true" indexed="true" required="true"/>
+ <field name="orig" type="url" stored="true" indexed="true" />
+ <field name="site" type="string" stored="false" indexed="true"/>
<!-- stored=true for highlighting, use term vectors and positions for
fast highlighting -->
<field name="content" type="text_general" stored="true" indexed="true"/>
<field name="title" type="text_general" stored="true" indexed="true"/>
Modified: nutch/branches/2.x/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Fri Aug 10 14:11:30 2012
@@ -74,7 +74,8 @@
<ant dir="parse-tika" target="test"/>
<ant dir="protocol-file" target="test"/>
<ant dir="parse-html" target="test"/>
- <ant dir="index-anchor" target="test"/>
+ <ant dir="index-anchor" target="test"/>
+ <ant dir="index-basic" target="test"/>
<ant dir="index-more" target="test"/>
<ant dir="language-identifier" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
Modified:
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
Fri Aug 10 14:11:30 2012
@@ -16,15 +16,10 @@
*/
package org.apache.nutch.indexer.anchor;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.util.Collection;
-
import junit.framework.TestCase;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
Modified:
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Fri Aug 10 14:11:30 2012
@@ -36,7 +36,17 @@ import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.TableUtil;
import org.apache.solr.common.util.DateUtil;
-/** Adds basic searchable fields to a document. */
+/** Adds basic searchable fields to a document. The fields are:
+ * host - add host as un-stored, indexed and tokenized
+ * site - add site as un-stored, indexed and un-tokenized
+ * url - url is both stored and indexed, so it's both searchable and returned.
+ * This is also a required field.
+ * orig - also store original url as both stored and indexed
+ * content - content is indexed, so that it's searchable, but not stored in index
+ * title - title is stored and indexed
+ * cache - add cached content/summary display policy, if available
+ * tstamp - add timestamp when fetched, for deduplication
+ */
public class BasicIndexingFilter implements IndexingFilter {
public static final Logger LOG =
LoggerFactory.getLogger(BasicIndexingFilter.class);
@@ -51,6 +61,16 @@ public class BasicIndexingFilter impleme
FIELDS.add(WebPage.Field.FETCH_TIME);
}
+ /**
+ * The {@link BasicIndexingFilter} filter object which supports an integer
+ * configurable value for the maximum number of characters permitted within
+ * the title @see {@code indexer.max.title.length} in nutch-default.xml
+ *
+ * @param doc The {@link NutchDocument} object
+ * @param url URL of the page being indexed
+ * @param page {@link WebPage} object relative to the URL
+ * @return filtered NutchDocument
+ */
public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
@@ -83,7 +103,7 @@ public class BasicIndexingFilter impleme
doc.add("url", reprUrl == null ? url : reprUrl);
if (reprUrl != null) {
- // also store original url as both stored and indexes
+ // also store original url as both stored and indexed
doc.add("orig", url);
}
@@ -118,15 +138,28 @@ public class BasicIndexingFilter impleme
public void addIndexBackendOptions(Configuration conf) {
}
+ /**
+ * Set the {@link Configuration} object
+ */
public void setConf(Configuration conf) {
this.conf = conf;
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+ LOG.info("Maximum title length for indexing set to: " +
this.MAX_TITLE_LENGTH);
}
+ /**
+ * Get the {@link Configuration} object
+ */
public Configuration getConf() {
return this.conf;
}
+ /**
+ * Gets all the fields for a given {@link WebPage}
+ * Many datastores need to setup the mapreduce job by specifying the fields
+ * needed. All extensions that work on WebPage are able to specify what fields
+ * they need.
+ */
@Override
public Collection<WebPage.Field> getFields() {
return FIELDS;
Added:
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1371708&view=auto
==============================================================================
---
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
(added)
+++
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
Fri Aug 10 14:11:30 2012
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.basic;
+
+import java.nio.ByteBuffer;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.fetcher.FetcherJob;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+import junit.framework.TestCase;
+
+/**
+ * JUnit test case which tests
+ * 1. that the host, site, url, orig, content, title, cache and tstamp fields
+ * are obtained by the filter.
+ * 2. that configurable maximum length functionality for titles actually works.
+ * This property defaults to 100 characters @see {@code indexer.max.title.length}
+ * in nutch-default.xml but has been set to 10 for this test.
+ *
+ * @author lewismc
+ */
+
+public class TestBasicIndexingFilter extends TestCase {
+
+ @Test
+ public void testBasicFields() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ BasicIndexingFilter filter = new BasicIndexingFilter();
+ filter.setConf(conf);
+ assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ WebPage page = new WebPage();
+ page.putToInlinks(new Utf8("http://nutch.apache.org/"), new
Utf8("Welcome to Nutch"));
+ page.setTitle(new Utf8("Welcome to Nutch"));
+ page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org"));
+ //ByteBuffer bbuf = ByteBuffer.allocate(10);
+ //bbuf.putInt(123456789);
+ //page.putToMetadata(new Utf8("Cache_policy"), bbuf);
+ page.setFetchTime(System.currentTimeMillis());
+ try {
+ filter.filter(doc, "http://www.apache.org/", page);
+ } catch(Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ assertNotNull(doc);
+ assertTrue("check for host field ",
doc.getFieldNames().contains("host"));
+ assertTrue("check for site field",
doc.getFieldNames().contains("site"));
+ assertTrue("check for url field", doc.getFieldNames().contains("url"));
+ assertTrue("check for orig field",
doc.getFieldNames().contains("orig"));
+ assertTrue("check for content field",
doc.getFieldNames().contains("content"));
+ assertTrue("check for title field",
doc.getFieldNames().contains("title"));
+ //assertTrue("check for cache field",
doc.getFieldNames().contains("cache"));
+ assertTrue("check for tstamp field",
doc.getFieldNames().contains("tstamp"));
+ }
+
+ @Test
+ public void testTitleFieldLength() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.setInt("indexer.max.title.length", 10);
+ BasicIndexingFilter filter = new BasicIndexingFilter();
+ filter.setConf(conf);
+ assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ WebPage page = new WebPage();
+ page.putToInlinks(new Utf8("http://exceedmaximumtitleurl.org/"), new
Utf8("exceeding title site"));
+ page.setTitle(new Utf8("This title exceeds maximum characters"));
+ try {
+ filter.filter(doc, "http://www.apache.org/", page);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ assertNotNull(doc);
+ assertEquals("assert title field only has 10 characters", 10,
doc.getFieldValue("title").length());
+ }
+}