http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java new file mode 100644 index 0000000..6e64ede --- /dev/null +++ b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -0,0 +1,344 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.more; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.metadata.Metadata; + +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; + +import org.apache.nutch.parse.Parse; + +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.util.MimeUtil; +import org.apache.tika.Tika; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import java.text.ParseException; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.Date; +import java.util.regex.*; +import java.util.HashMap; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang.time.DateUtils; + +/** + * Add (or reset) a few metaData properties as respective fields (if they are + * available), so that they can be accurately used within the search index. + * + * 'lastModifed' is indexed to support query by date, 'contentLength' obtains + * content length from the HTTP header, 'type' field is indexed to support query + * by type and finally the 'title' field is an attempt to reset the title if a + * content-disposition hint exists. The logic is that such a presence is + * indicative that the content provider wants the filename therein to be used as + * the title. + * + * Still need to make content-length searchable! + * + * @author John Xing + */ + +public class MoreIndexingFilter implements IndexingFilter { + public static final Logger LOG = LoggerFactory + .getLogger(MoreIndexingFilter.class); + + /** Get the MimeTypes resolver instance. 
*/ + private MimeUtil MIME; + private Tika tika = new Tika(); + + /** Map for mime-type substitution */ + private HashMap<String, String> mimeMap = null; + private boolean mapMimes = false; + + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + String url_s = url.toString(); + + addTime(doc, parse.getData(), url_s, datum); + addLength(doc, parse.getData(), url_s); + addType(doc, parse.getData(), url_s, datum); + resetTitle(doc, parse.getData(), url_s); + + return doc; + } + + // Add time related meta info. Add last-modified if present. Index date as + // last-modified, or, if that's not present, use fetch time. + private NutchDocument addTime(NutchDocument doc, ParseData data, String url, + CrawlDatum datum) { + long time = -1; + + String lastModified = data.getMeta(Metadata.LAST_MODIFIED); + if (lastModified != null) { // try parse last-modified + time = getTime(lastModified, url); // use as time + // store as string + doc.add("lastModified", new Date(time)); + } + + if (time == -1) { // if no last-modified specified in HTTP header + time = datum.getModifiedTime(); // use value in CrawlDatum + if (time <= 0) { // if also unset + time = datum.getFetchTime(); // use time the fetch took place (fetchTime + // of fetchDatum) + } + } + + // un-stored, indexed and un-tokenized + doc.add("date", new Date(time)); + return doc; + } + + private long getTime(String date, String url) { + long time = -1; + try { + time = HttpDateFormat.toLong(date); + } catch (ParseException e) { + // try to parse it as date in alternative format + try { + Date parsedDate = DateUtils.parseDate(date, new String[] { + "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", + "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", + "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", + "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", + "EEE, dd-MMM-yy HH:mm:ss zzz", 
"yyyy/MM/dd HH:mm:ss.SSS zzz", + "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", + "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", + "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz", + "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", + "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", + "yyyy-MM-dd'T'HH:mm:ss'Z'" }); + time = parsedDate.getTime(); + // if (LOG.isWarnEnabled()) { + // LOG.warn(url + ": parsed date: " + date +" to:"+time); + // } + } catch (Exception e2) { + if (LOG.isWarnEnabled()) { + LOG.warn(url + ": can't parse erroneous date: " + date); + } + } + } + return time; + } + + // Add Content-Length + private NutchDocument addLength(NutchDocument doc, ParseData data, String url) { + String contentLength = data.getMeta(Response.CONTENT_LENGTH); + + if (contentLength != null) { + // NUTCH-1010 ContentLength not trimmed + String trimmed = contentLength.toString().trim(); + if (!trimmed.isEmpty()) + doc.add("contentLength", trimmed); + } + return doc; + } + + /** + * <p> + * Add Content-Type and its primaryType and subType add contentType, + * primaryType and subType to field "type" as un-stored, indexed and + * un-tokenized, so that search results can be confined by contentType or its + * primaryType or its subType. + * </p> + * <p> + * For example, if contentType is application/vnd.ms-powerpoint, search can be + * done with one of the following qualifiers + * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint + * all case insensitive. The query filter is implemented in + * {@link TypeQueryFilter}. 
+ * </p> + * + * @param doc + * @param data + * @param url + * @return + */ + private NutchDocument addType(NutchDocument doc, ParseData data, String url, + CrawlDatum datum) { + String mimeType = null; + String contentType = null; + + Writable tcontentType = datum.getMetaData().get( + new Text(Response.CONTENT_TYPE)); + if (tcontentType != null) { + contentType = tcontentType.toString(); + } else + contentType = data.getMeta(Response.CONTENT_TYPE); + if (contentType == null) { + // Note by Jerome Charron on 20050415: + // Content Type not solved by a previous plugin + // Or unable to solve it... Trying to find it + // Should be better to use the doc content too + // (using MimeTypes.getMimeType(byte[], String), but I don't know + // which field it is? + // if (MAGIC) { + // contentType = MIME.getMimeType(url, content); + // } else { + // contentType = MIME.getMimeType(url); + // } + + mimeType = tika.detect(url); + } else { + mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); + } + + // Checks if we solved the content-type. + if (mimeType == null) { + return doc; + } + + // Check if we have to map mime types + if (mapMimes) { + // Check if the current mime is mapped + if (mimeMap.containsKey(mimeType)) { + // It's mapped, let's replace it + mimeType = mimeMap.get(mimeType); + } + } + + contentType = mimeType; + doc.add("type", contentType); + + // Check if we need to split the content type in sub parts + if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) { + String[] parts = getParts(contentType); + + for (String part : parts) { + doc.add("type", part); + } + } + + // leave this for future improvement + // MimeTypeParameterList parameterList = mimeType.getParameters() + + return doc; + } + + /** + * Utility method for splitting mime type into type and subtype. 
+ * + * @param mimeType + * @return + */ + static String[] getParts(String mimeType) { + return mimeType.split("/"); + } + + // Reset title if we see non-standard HTTP header "Content-Disposition". + // It's a good indication that content provider wants filename therein + // be used as the title of this url. + + // Patterns used to extract filename from possible non-standard + // HTTP header "Content-Disposition". Typically it looks like: + // Content-Disposition: inline; filename="foo.ppt" + private Configuration conf; + + static Pattern patterns[] = { null, null }; + + static { + try { + // order here is important + patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]"); + patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b"); + } catch (PatternSyntaxException e) { + // just ignore + } + } + + private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) { + String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION); + if (contentDisposition == null || doc.getFieldValue("title") != null) + return doc; + + for (int i = 0; i < patterns.length; i++) { + Matcher matcher = patterns[i].matcher(contentDisposition); + if (matcher.find()) { + doc.add("title", matcher.group(1)); + break; + } + } + + return doc; + } + + public void setConf(Configuration conf) { + this.conf = conf; + MIME = new MimeUtil(conf); + + if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false) == true) { + mapMimes = true; + + // Load the mapping + try { + readConfiguration(); + } catch (Exception e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + } + + public Configuration getConf() { + return this.conf; + } + + private void readConfiguration() throws IOException { + LOG.info("Reading content type mappings from file contenttype-mapping.txt"); + BufferedReader reader = new BufferedReader( + conf.getConfResourceAsReader("contenttype-mapping.txt")); + String line; + String parts[]; + boolean formatWarningShown = false; + + 
mimeMap = new HashMap<String, String>(); + + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + line.trim(); + parts = line.split("\t"); + + // Must be at least two parts + if (parts.length > 1) { + for (int i = 1; i < parts.length; i++) { + mimeMap.put(parts[i].trim(), parts[0].trim()); + } + } else { + LOG.warn("Wrong format of line: {}", line); + if (!formatWarningShown) { + LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]"); + formatWarningShown = true; + } + } + } + } + } +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html new file mode 100644 index 0000000..7b8fade --- /dev/null +++ b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html @@ -0,0 +1,6 @@ +<html> +<body> +<p>A more indexing plugin, adds "more" index fields: +last modified date, MIME type, content length.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java new file mode 100644 index 0000000..f918dde --- /dev/null +++ b/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.more; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestMoreIndexingFilter { + + @Test + public void testContentType() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + assertContentType(conf, "text/html", "text/html"); + assertContentType(conf, "text/html; charset=UTF-8", "text/html"); + } + + @Test + public void testGetParts() { + String[] parts = MoreIndexingFilter.getParts("text/html"); + assertParts(parts, 2, "text", "html"); + } + + /** + * @since NUTCH-901 + */ + @Test + public void testNoParts() { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + Assert.assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + ParseImpl parse = new ParseImpl("foo bar", new ParseData()); + + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + new CrawlDatum(), new Inlinks()); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + Assert.assertNotNull(doc); + Assert.assertTrue(doc.getFieldNames().contains("type")); + Assert.assertEquals(1, doc.getField("type").getValues().size()); + 
Assert.assertEquals("text/html", doc.getFieldValue("type")); + } + + @Test + public void testContentDispositionTitle() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + + Metadata metadata = new Metadata(); + metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext"); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + + Text url = new Text("http://www.example.com/"); + ParseImpl parseImpl = new ParseImpl("text", new ParseData( + new ParseStatus(), "title", new Outlink[0], metadata)); + + NutchDocument doc = new NutchDocument(); + doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); + + Assert.assertEquals("content-disposition not detected", "filename.ext", + doc.getFieldValue("title")); + + /* NUTCH-1140: do not add second title to avoid a multi-valued title field */ + doc = new NutchDocument(); + doc.add("title", "title"); + doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); + Assert.assertEquals("do not add second title by content-disposition", + "title", doc.getFieldValue("title")); + } + + private void assertParts(String[] parts, int count, String... 
expected) { + Assert.assertEquals(count, parts.length); + for (int i = 0; i < expected.length; i++) { + Assert.assertEquals(expected[i], parts[i]); + } + } + + private void assertContentType(Configuration conf, String source, + String expected) throws IndexingException { + Metadata metadata = new Metadata(); + metadata.add(Response.CONTENT_TYPE, source); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl( + "text", new ParseData(new ParseStatus(), "title", new Outlink[0], + metadata)), new Text("http://www.example.com/"), new CrawlDatum(), + new Inlinks()); + Assert.assertEquals("mime type not detected", expected, + doc.getFieldValue("type")); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/README.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/README.txt b/nutch-plugins/index-replace/README.txt new file mode 100644 index 0000000..4c866a7 --- /dev/null +++ b/nutch-plugins/index-replace/README.txt @@ -0,0 +1,95 @@ +IndexReplace plugin + +Allows indexing-time regexp replace manipulation of metadata fields. + +Configuration Example + <property> + <name>index.replace.regexp</name> + <value> + id=/file\:/http\:my.site.com/ + url=/file\:/http\:my.site.com/2 + </value> + </property> + +Property format: index.replace.regexp + The format of the property is a list of regexp replacements, one line per field being + modified. Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure. + + The fieldname precedes the equal sign. The first character after the equal sign signifies + the delimiter for the regexp, the replacement value and the flags. + +Replacement Sequence + The replacements will happen in the order listed. If a field needs multiple replacement operations + they may be listed more than once.
+ +RegExp Format + The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined + here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29 + Patterns are compiled when the plugin is initialized for efficiency. + +Replacement Format + The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement): + http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29 + +Flags + The flags is an integer sum of the flag values defined in + http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern) + +Creating New Fields + If you express the fieldname as fldname1:fldname2=[replacement], then the replacer will create a new field + from the source field. The source field remains unmodified. This is an alternative to solrindex-mapping + which is only able to copy fields verbatim. + +Multi-valued Fields + If a field has multiple values, the replacement will be applied to each value in turn. + +Non-string Datatypes + Replacement is possible only on String field datatypes. If the field you name in the property is + not a String datatype, it will be silently ignored. + +Host and URL specific replacements. + If the replacements should apply only to specific pages, then add a sequence like + + hostmatch=hostmatchpattern + fld1=/regexp/replace/flags + fld2=/regexp/replace/flags + + or + urlmatch=urlmatchpattern + fld1=/regexp/replace/flags + fld2=/regexp/replace/flags + +When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch +will apply to all parsed pages. Replacements following a hostmatch or urlmatch will be applied +to pages which match the host or url field (up to the next hostmatch or urlmatch line). hostmatch +and urlmatch patterns must be unique in this property. 
+ +Plugin order + In most cases you will want this plugin to run last. + +Testing your match patterns + Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html + can help get the basics of your pattern working. + To test in nutch: + Prepare a test HTML file with the field contents you want to test. + Place this in a directory accessible to nutch. + Use the file:/// syntax to list the test file(s) in a test/urls seed list. + See the nutch faq "index my local file system" for conf settings you will need. + (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This + test approach confirms only how your global matches behave, unless your urlmatch and hostmatch + patterns also match the file: URL pattern) + + Run.. + bin/nutch inject crawl/crawldb test + bin/nutch generate crawl/crawldb crawl/segments + bin/nutch fetch crawl/segments/[segment] + bin/nutch parse crawl/segments/[segment] + bin/nutch invertlinks crawl/linkdb -dir crawl/segments + ...index your document, for example with SOLR... + bin/nutch solrindex http://localhost:8983/solr crawl/crawldb/ -linkdb crawl/linkdb/ crawl/segments/[segment] -filter -normalize + + Inspect hadoop.log for info about pattern parsing and compilation.. + grep replace logs/hadoop.log + + To inspect your index with the solr admin panel... + http://localhost:8983/solr/#/ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/build.xml b/nutch-plugins/index-replace/build.xml new file mode 100644 index 0000000..ea8c95d --- /dev/null +++ b/nutch-plugins/index-replace/build.xml @@ -0,0 +1,55 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="index-replace" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/index-basic/*.jar" /> + <include name="**/index-metadata/*.jar" /> + </fileset> + <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> + </path> + + <!-- Compile Unit test dependencies --> + <target name="deps-test-compile"> + <ant target="compile-test" inheritall="false" dir="../index-basic"/> + <ant target="compile-test" inheritall="false" dir="../index-metadata"/> + </target> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" /> + <ant target="deploy" inheritall="false" dir="../protocol-file" /> + <ant target="deploy" inheritall="false" dir="../parse-html" /> + <ant target="deploy" inheritall="false" dir="../parse-metatags" /> + <ant target="deploy" inheritall="false" dir="../index-basic" /> + <ant target="deploy" inheritall="false" dir="../index-metadata" /> + </target> + + <!-- Copy test file for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.html" /> + </fileset> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/ivy.xml 
---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/ivy.xml b/nutch-plugins/index-replace/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/index-replace/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/plugin.xml b/nutch-plugins/index-replace/plugin.xml new file mode 100644 index 0000000..3cffe60 --- /dev/null +++ b/nutch-plugins/index-replace/plugin.xml @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="index-replace" + name="Replace Indexer" + version="1.0" + provider-name="PeterCiuffetti"> + + <runtime> + <library name="index-replace.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.indexer.replace" + name="Replace Indexer" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="ReplaceIndexer" + class="org.apache.nutch.indexer.replace.ReplaceIndexer"/> + </extension> + +</plugin> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/pom.xml b/nutch-plugins/index-replace/pom.xml new file mode 100644 index 0000000..599170c --- /dev/null +++ b/nutch-plugins/index-replace/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. 
See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>index-replace</artifactId> + <packaging>jar</packaging> + + <name>index-replace</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/sample/testIndexReplace.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/sample/testIndexReplace.html b/nutch-plugins/index-replace/sample/testIndexReplace.html new file mode 100644 index 0000000..0b90fc2 --- /dev/null +++ b/nutch-plugins/index-replace/sample/testIndexReplace.html @@ -0,0 +1,12 @@ +<html> + <head> + <title>Testing the power of the index-replace plugin</title> + <meta name="description" content="With this plugin, I control the description! 
Bwuhuhuhaha!"> + <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!"> + <meta name="author" content="Peter Ciuffetti"> + </head> + <body> + <p>This html file is used to test the Nutch index-replace regexp replacer plugin. + A decidedly boring thing to do.</p> + </body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java new file mode 100644 index 0000000..ddfe24d --- /dev/null +++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.replace; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * POJO to store a filename, its match pattern and its replacement string. + * + * A checkAndReplace method is provided where you can simultaneously check if + * the field matches this replacer and if the pattern matches your field value. + * + * @author Peter Ciuffetti + */ +public class FieldReplacer { + + private static final Log LOG = LogFactory.getLog(FieldReplacer.class + .getName()); + + private final String fieldName; + private final String toFieldName; + private final Pattern pattern; + private final String replacement; + private boolean isValid; + + /** + * Create a FieldReplacer for a field. + * + * Any pattern exceptions are caught within this constructor and the object is + * marked inValid. The error will be logged. This prevents this caller from + * attempting invalid replacements. + * + * @param fieldName + * the name of the source field to operate on. Required. + * @param toFieldName + * the name of the target field. Required. + * @param pattern + * the pattern the field must match. Required. + * @param replacement + * the replacement string + * @param flags + * the Pattern flags value, or null if no flags are needed + */ + public FieldReplacer(String fieldName, String toFieldName, String pattern, + String replacement, Integer flags) { + + this.isValid = true; + // Must have a non-empty field name and pattern. 
+ if (fieldName == null || fieldName.trim().length() == 0) { + LOG.error("Empty fieldName provided, FieldReplacer marked invalid."); + this.isValid = false; + } + if (pattern == null || pattern.trim().length() == 0) { + LOG.error("Empty pattern for field " + fieldName + + "provided, FieldReplacer marked invalid."); + this.isValid = false; + } + + if (replacement == null) { + this.replacement = ""; + } else { + this.replacement = replacement; + } + + this.fieldName = fieldName.trim(); + this.toFieldName = toFieldName.trim(); + + if (this.isValid) { + LOG.info("Compiling pattern " + pattern + " for field " + fieldName); + Pattern myPattern = null; + try { + if (flags != null) { + myPattern = Pattern.compile(pattern, flags); + } else { + myPattern = Pattern.compile(pattern); + } + } catch (PatternSyntaxException e) { + LOG.error("Pattern " + pattern + " for field " + fieldName + + " failed to compile: " + e.toString()); + this.isValid = false; + } + this.pattern = myPattern; + } else { + this.pattern = null; + } + } + + /** + * Field replacer with the input and output field the same. + * + * @param fieldName + * @param pattern + * @param replacement + * @param flags + */ + public FieldReplacer(String fieldName, String pattern, String replacement, + Integer flags) { + this(fieldName, fieldName, pattern, replacement, flags); + } + + public String getFieldName() { + return this.fieldName; + } + + public String getToFieldName() { + return this.toFieldName; + } + + public Pattern getPattern() { + return this.pattern; + } + + public String getReplacement() { + return this.replacement; + } + + /** + * Does this FieldReplacer have a valid fieldname and pattern? + * + * @return + */ + public boolean isValid() { + return this.isValid; + } + + /** + * Return the replacement value for a field value. + * + * This does not check for a matching field; the caller must decide if this + * FieldReplacer should operate on this value by checking getFieldName(). 
+ * + * The method returns the value with the replacement. If the value returned is + * not different then eiher the pattern didn't match or the replacement was a + * no-op. + * + * @param value + * @return + */ + public String replace(String value) { + if (this.isValid) { + return this.pattern.matcher(value).replaceAll(replacement); + } else { + return value; + } + } + + /** + * Return a replacement value for a field. + * + * This is designed to fail fast and trigger a replacement only when + * necessary. If this method returns null, either the field does not match or + * the value does not match the pattern (or possibly the pattern is invalid). + * + * So only if the method returns a non-null value will you need to replace the + * value for the field. + * + * @param fieldName + * the name of the field you are checking + * @param value + * the value of the field you are checking + * @return a replacement value. If null, either the field does not match or + * the value does not match. + */ + public String checkAndReplace(String fieldName, String value) { + if (this.fieldName.equals(fieldName)) { + if (value != null && value.length() > 0) { + if (this.isValid) { + Matcher m = this.pattern.matcher(value); + if (m.find()) { + return m.replaceAll(this.replacement); + } + } + } + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java new file mode 100644 index 0000000..7017603 --- /dev/null +++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java @@ -0,0 +1,330 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.replace; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchField; +import org.apache.nutch.parse.Parse; + +/** + * Do pattern replacements on selected field contents prior to indexing. + * + * To use this plugin, add <code>index-replace</code> to your + * <code>plugin.includes</code>. 
Example: + * + * <pre> + * <property> + * <name>plugin.includes</name> + * <value>protocol-(http)|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata|replace)|urlnormalizer-(pass|regex|basic)|indexer-solr</value> + * </property> + * </pre> + * + * And then add the <code>index.replace.regexp</code> property to + * <code>conf/nutch-site.xml</code>. This contains a list of replacement + * instructions per field name, one per line. eg. + * + * <pre> + * fieldname=/regexp/replacement/[flags] + * </pre> + * + * <pre> + * <property> + * <name>index.replace.regexp</name> + * <value> + * hostmatch=.*\\.com + * title=/search/replace/2 + * </value> + * </property> + * </pre> + * + * <code>hostmatch=</code> and <code>urlmatch=</code> lines indicate the match + * pattern for a host or url. The field replacements that follow this line will + * apply only to pages from the matching host or url. Replacements run in the + * order specified. Field names may appear multiple times if multiple + * replacements are needed. + * + * The property format is defined in greater detail in + * <code>conf/nutch-default.xml</code>. 
+ * + * @author Peter Ciuffetti + * @see <a + * href="https://issues.apache.org/jira/browse/NUTCH-2058">NUTCH-2058</a> + */ +public class ReplaceIndexer implements IndexingFilter { + + private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class + .getName()); + + /** Special field name signifying the start of a host-specific match set */ + private static final String HOSTMATCH = "hostmatch"; + /** Special field name signifying the start of a url-specific match set */ + private static final String URLMATCH = "urlmatch"; + + private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_HOST = new LinkedHashMap<Pattern, List<FieldReplacer>>(); + private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_URL = new LinkedHashMap<Pattern, List<FieldReplacer>>(); + + private static Pattern LINE_SPLIT = Pattern.compile("(^.+$)+", + Pattern.MULTILINE); + private static Pattern NAME_VALUE_SPLIT = Pattern.compile("(.*?)=(.*)"); + + private Configuration conf; + + /** + * {@inheritDoc} + */ + public void setConf(Configuration conf) { + this.conf = conf; + FIELDREPLACERS_BY_HOST.clear(); + FIELDREPLACERS_BY_URL.clear(); + String value = conf.get("index.replace.regexp", null); + if (value != null) { + LOG.debug("Parsing index.replace.regexp property"); + this.parseConf(value); + } + } + + /** + * {@inheritDoc} + */ + public Configuration getConf() { + return this.conf; + } + + /** + * Parse the property value into a set of maps that store a list of + * replacements by field for each host and url configured into the property. + * + * @param propertyValue + */ + private void parseConf(String propertyValue) { + if (propertyValue == null || propertyValue.trim().length() == 0) { + return; + } + + // At the start, all replacements apply globally to every host. 
+ Pattern hostPattern = Pattern.compile(".*"); + Pattern urlPattern = null; + + // Split the property into lines + Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue); + while (lineMatcher.find()) { + String line = lineMatcher.group(); + if (line != null && line.length() > 0) { + + // Split the line into field and value + Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim()); + if (nameValueMatcher.find()) { + String fieldName = nameValueMatcher.group(1).trim(); + String value = nameValueMatcher.group(2); + if (fieldName != null && value != null) { + // Check if the field name is one of our special cases. + if (HOSTMATCH.equals(fieldName)) { + urlPattern = null; + try { + hostPattern = Pattern.compile(value); + } catch (PatternSyntaxException pse) { + LOG.error("hostmatch pattern " + value + " does not compile: " + + pse.getMessage()); + // Deactivate this invalid match set by making it match no host. + hostPattern = Pattern.compile("willnotmatchanyhost"); + } + } else if (URLMATCH.equals(fieldName)) { + try { + urlPattern = Pattern.compile(value); + } catch (PatternSyntaxException pse) { + LOG.error("urlmatch pattern " + value + " does not compile: " + + pse.getMessage()); + // Deactivate this invalid match set by making it match no url. + urlPattern = Pattern.compile("willnotmatchanyurl"); + } + } else if (value.length() > 3) { + String toFieldName = fieldName; + // If the fieldname has a colon, this indicates a different target + // field. + if (fieldName.indexOf(':') > 0) { + toFieldName = fieldName.substring(fieldName.indexOf(':') + 1); + fieldName = fieldName.substring(0, fieldName.indexOf(':')); + } + String sep = value.substring(0, 1); + + // Divide the value into pattern / replacement / flags. + value = value.substring(1); + if (!value.contains(sep)) { + LOG.error("Pattern '" + line + + "', not parseable. 
Missing separator " + sep); + continue; + } + String pattern = value.substring(0, value.indexOf(sep)); + value = value.substring(pattern.length() + 1); + String replacement = value; + if (value.contains(sep)) { + replacement = value.substring(0, value.indexOf(sep)); + } + int flags = 0; + if (value.length() > replacement.length() + 1) { + value = value.substring(replacement.length() + 1).trim(); + try { + flags = Integer.parseInt(value); + } catch (NumberFormatException e) { + LOG.error("Pattern " + line + ", has invalid flags component"); + continue; + } + } + Integer iFlags = (flags > 0) ? new Integer(flags) : null; + + // Make a FieldReplacer out of these params. + FieldReplacer fr = new FieldReplacer(fieldName, toFieldName, + pattern, replacement, iFlags); + + // Add this field replacer to the list for this host or URL. + if (urlPattern != null) { + List<FieldReplacer> lfp = FIELDREPLACERS_BY_URL.get(urlPattern); + if (lfp == null) { + lfp = new ArrayList<FieldReplacer>(); + } + lfp.add(fr); + FIELDREPLACERS_BY_URL.put(urlPattern, lfp); + } else { + List<FieldReplacer> lfp = FIELDREPLACERS_BY_HOST + .get(hostPattern); + if (lfp == null) { + lfp = new ArrayList<FieldReplacer>(); + } + lfp.add(fr); + FIELDREPLACERS_BY_HOST.put(hostPattern, lfp); + } + } + } + } + } + } + } + + /** + * {@inheritDoc} + */ + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + if (doc != null) { + if (FIELDREPLACERS_BY_HOST.size() > 0) { + this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST); + } + + if (FIELDREPLACERS_BY_URL.size() > 0) { + this.doReplace(doc, "url", FIELDREPLACERS_BY_URL); + } + } + + return doc; + } + + /** + * Iterates through the replacement map provided, to update the fields in the + * Nutch Document. 
+ * + * @param doc + * the document we are modifying + * @param keyName + * either "host" or "url" -- the field that determines the + * replacement set used + * @param replaceMap + * the list of FieldReplacers that applies to this keyName. + */ + private void doReplace(NutchDocument doc, String keyName, + Map<Pattern, List<FieldReplacer>> replaceMap) { + + if (doc == null || replaceMap.size() == 0) { + return; + } + + Collection<String> docFieldNames = doc.getFieldNames(); + NutchField keyField = doc.getField(keyName); + if (keyField == null) { + // This document doesn't have the key field; no work to do. + return; + } + + List<Object> keyFieldValues = keyField.getValues(); + if (keyFieldValues.size() == 0) { + // This document doesn't have any values for the key field; no work to do. + return; + } + + // For every value of the keyField (one expected) + for (Object oKeyFieldValue : keyFieldValues) { + if (oKeyFieldValue != null && oKeyFieldValue instanceof java.lang.String) { + String keyFieldValue = (String) oKeyFieldValue; + + // For each pattern that we have a replacement list for... + for (Map.Entry<Pattern, List<FieldReplacer>> entries : replaceMap + .entrySet()) { + // If this key is a match for a replacement set... + if (entries.getKey().matcher(keyFieldValue).find()) { + + // For each field we will replace for this key... + for (FieldReplacer fp : entries.getValue()) { + String fieldName = fp.getFieldName(); + + // Does this document contain the FieldReplacer's field? + if (docFieldNames.contains(fieldName)) { + NutchField docField = doc.getField(fieldName); + List<Object> fieldValues = docField.getValues(); + ArrayList<String> newFieldValues = new ArrayList<String>(); + + // For each value of the field, match against our + // replacer... 
+ for (Object oFieldValue : fieldValues) { + if (oFieldValue != null + && oFieldValue instanceof java.lang.String) { + String fieldValue = (String) oFieldValue; + String newValue = fp.replace(fieldValue); + newFieldValues.add(newValue); + } + } + + // Remove the target field and add our replaced values. + String targetFieldName = fp.getToFieldName(); + doc.removeField(targetFieldName); + for (String newFieldValue : newFieldValues) { + doc.add(targetFieldName, newFieldValue); + } + } + } + } + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java new file mode 100644 index 0000000..28c24a4 --- /dev/null +++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Indexing filter to allow pattern replacements on metadata. 
+ */ +package org.apache.nutch.indexer.replace; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java b/nutch-plugins/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java new file mode 100644 index 0000000..ca90ca3 --- /dev/null +++ b/nutch-plugins/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java @@ -0,0 +1,456 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.indexer.replace; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.basic.BasicIndexingFilter; +import org.apache.nutch.indexer.metadata.MetadataIndexer; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit tests for the <code>index-replace</code> plugin. + * + * In these tests, the sample file has some meta tags added to the Nutch + * document by the <code>index-metadata</code> plugin. The + * <code>index-replace</code> plugin is then used to either change (or not + * change) the fields depending on the various values of + * <code>index.replace.regexp</code> property being provided to Nutch. + * + * + * @author Peter Ciuffetti + * + */ +public class TestIndexReplace { + + private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp"; + + private String fileSeparator = System.getProperty("file.separator"); + private String sampleDir = System.getProperty("test.data", "."); + private String sampleFile = "testIndexReplace.html"; + + /** + * Run a test file through the Nutch parser and index filters. 
+ * + * @param fileName + * @param conf + * @return the Nutch document with the replace indexer applied + */ + public NutchDocument parseAndFilterFile(String fileName, Configuration conf) { + NutchDocument doc = new NutchDocument(); + + BasicIndexingFilter basicIndexer = new BasicIndexingFilter(); + basicIndexer.setConf(conf); + Assert.assertNotNull(basicIndexer); + + MetadataIndexer metaIndexer = new MetadataIndexer(); + metaIndexer.setConf(conf); + Assert.assertNotNull(basicIndexer); + + ReplaceIndexer replaceIndexer = new ReplaceIndexer(); + replaceIndexer.setConf(conf); + Assert.assertNotNull(replaceIndexer); + + try { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Text text = new Text(urlString); + CrawlDatum crawlDatum = new CrawlDatum(); + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(text, crawlDatum) + .getContent(); + Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + crawlDatum.setFetchTime(100L); + + Inlinks inlinks = new Inlinks(); + doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks); + doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks); + doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.toString()); + } + + return doc; + } + + /** + * Test property parsing. + * + * The filter does not expose details of the parse. So all we are checking is + * that the parse does not throw a runtime exception and that the value + * provided is the value returned. 
+ */ + @Test + public void testPropertyParse() { + Configuration conf = NutchConfiguration.create(); + String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/2\n" + + " metatag.keywords=/\\,/\\!/\n" + + " hostmatch=.*.com\n" + + " metatag.keywords=/\\,/\\?/\n" + + " metatag.author:dc_author=/\\s+/ David /\n" + + " urlmatch=.*.html\n" + + " metatag.keywords=/\\,/\\./\n" + " metatag.author=/\\s+/ D. /\n"; + + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + + ReplaceIndexer rp = new ReplaceIndexer(); + try { + rp.setConf(conf); + } catch (RuntimeException ohno) { + Assert.fail("Unable to parse a valid index.replace.regexp property! " + + ohno.getMessage()); + } + + Configuration parsedConf = rp.getConf(); + + // Does the getter equal the setter? Too easy! + Assert.assertEquals(indexReplaceProperty, + parsedConf.get(INDEX_REPLACE_PROPERTY)); + } + + /** + * Test metatag value replacement using global replacement settings. + * + * The index.replace.regexp property does not use hostmatch or urlmatch, so + * all patterns are global. + */ + @Test + public void testGlobalReplacement() { + String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; + String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; + String expectedAuthor = "Peter D. Ciuffetti"; + String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/\n" + + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. 
/\n"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + Assert + .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); + Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); + } + + /** + * Test that invalid property settings are handled and ignored. + * + * This test provides an invalid property setting that will fail property + * parsing and Pattern.compile. The expected outcome is that the patterns will + * not cause failure and the targeted fields will not be modified by the + * filter. + */ + @Test + public void testInvalidPatterns() { + String expectedDescription = "With this plugin, I control the description! 
Bwuhuhuhaha!"; + String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; + String expectedAuthor = "Peter Ciuffetti"; + // Contains: invalid pattern, invalid flags, incomplete property + String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n" + + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Assert that our metatags have not changed. + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + Assert + .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); + Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); + + } + + /** + * Test URL pattern matching + */ + @Test + public void testUrlMatchesPattern() { + String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; + String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; + String expectedAuthor = "Peter D. Ciuffetti"; + String indexReplaceProperty = " urlmatch=.*.html\n" + + " metatag.description=/this(.*)plugin/this awesome plugin/\n" + + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. 
/\n"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Assert that our metatags have changed. + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + Assert + .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); + Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); + + } + + /** + * Test URL pattern not matching. + * + * Expected result is that the filter does not change the fields. + */ + @Test + public void testUrlNotMatchesPattern() { + String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; + String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; + String expectedAuthor = "Peter Ciuffetti"; + String indexReplaceProperty = " urlmatch=.*.xml\n" + + " metatag.description=/this(.*)plugin/this awesome plugin/\n" + + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. 
/\n"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Assert that our metatags have not changed. + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + Assert + .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); + Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); + + } + + /** + * Test a global pattern match for description and URL pattern match for + * keywords and author. + * + * All three should be triggered. It also tests replacement groups. + */ + @Test + public void testGlobalAndUrlMatchesPattern() { + String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; + String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; + String expectedAuthor = "Peter D. Ciuffetti"; + String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" + + " urlmatch=.*.html\n" + + " metatag.keywords=/\\,/\\!/\n" + + " metatag.author=/\\s+/ D. 
/\n"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Assert that our metatags have changed. + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + Assert + .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); + Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); + + } + + /** + * Test a global pattern match for description and URL pattern match for + * keywords and author. + * + * Only the global match should be triggered. + */ + @Test + public void testGlobalAndUrlNotMatchesPattern() { + String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; + String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; + String expectedAuthor = "Peter Ciuffetti"; + String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" + + " urlmatch=.*.xml\n" + + " metatag.keywords=/\\,/\\!/\n" + + " metatag.author=/\\s+/ D. 
/\n"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Assert that description has changed and the others have not changed. + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + Assert + .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); + Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); + } + + /** + * Test order-specific replacement settings. + * + * This makes multiple replacements on the same field and will produce the + * expected value only if the replacements are run in the order specified. + */ + @Test + public void testReplacementsRunInSpecifedOrder() { + String expectedDescription = "With this awesome plugin, I control the description! 
Bwuhuhuhaha!"; + String indexReplaceProperty = " metatag.description=/this plugin/this amazing plugin/\n" + + " metatag.description=/this amazing plugin/this valuable plugin/\n" + + " metatag.description=/this valuable plugin/this cool plugin/\n" + + " metatag.description=/this cool plugin/this wicked plugin/\n" + + " metatag.description=/this wicked plugin/this awesome plugin/\n"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Check that the value produced by the last replacement has worked. + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + } + + /** + * Test a replacement pattern that uses the flags feature. + * + * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match + * any case. + */ + @Test + public void testReplacementsWithFlags() { + String expectedDescription = "With this awesome plugin, I control the description! 
Bwuhuhuhaha!"; + String indexReplaceProperty = " metatag.description=/THIS PLUGIN/this awesome plugin/2"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. + conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Check that the value produced by the case-insensitive replacement has + // worked. + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + } + + /** + * Test a replacement pattern that uses the target field feature. + * Check that the input is not modified and that the target field is added. + */ + @Test + public void testReplacementsDifferentTarget() { + String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; + String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; + String indexReplaceProperty = " metatag.description:new=/this plugin/this awesome plugin/"; + + Configuration conf = NutchConfiguration.create(); + conf.set( + "plugin.includes", + "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); + conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); + conf.set("metatags.names", "author,description,keywords"); + conf.set("index.parse.md", + "metatag.author,metatag.description,metatag.keywords"); + // Not necessary but helpful when debugging the filter. 
+ conf.set("http.timeout", "99999999999"); + + // Run the document through the parser and index filters. + NutchDocument doc = parseAndFilterFile(sampleFile, conf); + + // Check that the input field has not been modified + Assert.assertEquals(expectedDescription, + doc.getFieldValue("metatag.description")); + // Check that the output field has been created + Assert.assertEquals(expectedTargetDescription, + doc.getFieldValue("new")); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-static/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-static/build.xml b/nutch-plugins/index-static/build.xml new file mode 100644 index 0000000..0ec5665 --- /dev/null +++ b/nutch-plugins/index-static/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="index-static" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-static/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-static/ivy.xml b/nutch-plugins/index-static/ivy.xml new file mode 100644 index 0000000..24d7606 --- /dev/null +++ b/nutch-plugins/index-static/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-static/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-static/plugin.xml b/nutch-plugins/index-static/plugin.xml new file mode 100644 index 0000000..539e355 --- /dev/null +++ b/nutch-plugins/index-static/plugin.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="index-static" + name="Index Static" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="index-static.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + + <extension id="org.apache.nutch.indexer.staticfield" + name="Nutch static field index" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="StaticField" + class="org.apache.nutch.indexer.staticfield.StaticFieldIndexer"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-static/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-static/pom.xml b/nutch-plugins/index-static/pom.xml new file mode 100644 index 0000000..6eaf0ba --- /dev/null +++ b/nutch-plugins/index-static/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>index-static</artifactId> + <packaging>jar</packaging> + + <name>index-static</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java new file mode 100644 index 0000000..1a81041 --- /dev/null +++ b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.staticfield; + +import java.util.HashMap; +import java.util.Map.Entry; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.conf.Configuration; + +/** + * A simple plugin called at indexing that adds fields with static data. You can + * specify a list of fieldname:fieldcontent per nutch job. It can be useful when + * collections can't be created by urlpatterns, like in subcollection, but on a + * job-basis. + */ + +public class StaticFieldIndexer implements IndexingFilter { + private Configuration conf; + private HashMap<String, String[]> fields; + private boolean addStaticFields = false; + private String fieldSep = ","; + private String kevSep = ":"; + private String valueSep = " "; + + /** + * The {@link StaticFieldIndexer} filter object which adds fields as per + * configuration setting. See {@code index.static} in nutch-default.xml. 
+ * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL of the document (unused by this filter) + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} of the document (unused by this filter) + * @return filtered NutchDocument + */ + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + if (this.addStaticFields == true) { + for (Entry<String, String[]> entry : this.fields.entrySet()) { + for (String val : entry.getValue()) { + doc.add(entry.getKey(), val); + } + } + } + return doc; + } + + /** + * Populate a HashMap from a list of fieldname:fieldcontent. See + * {@code index.static} in nutch-default.xml. + * + * @param fieldsString + * string containing field:value pairs + * @return HashMap of fields and their corresponding values + */ + private HashMap<String, String[]> parseFields(String fieldsString) { + HashMap<String, String[]> fields = new HashMap<String, String[]>(); + + /* + * The format is simple: by default, a comma-separated list of fields in the + * form <name>:<value> (the separators are configurable, see setConf) + */ + for (String field : fieldsString.split(this.fieldSep)) { + String[] entry = field.split(this.kevSep); + if (entry.length == 2) + fields.put(entry[0].trim(), entry[1].trim().split(this.valueSep)); + } + + return fields; + } + + /** + * Set the {@link Configuration} object + */ + public void setConf(Configuration conf) { + this.conf = conf; + + // NUTCH-2052: Allow user-defined delimiters in index.static + this.fieldSep = this.regexEscape(conf.get("index.static.fieldsep", ",")); + this.kevSep = this.regexEscape(conf.get("index.static.keysep", ":")); + this.valueSep = this.regexEscape(conf.get("index.static.valuesep", " ")); + + String fieldsString = conf.get("index.static", null); + if (fieldsString != null) { + this.addStaticFields = true; + this.fields = parseFields(fieldsString); + } + } + +
/** + * Get the {@link Configuration} object + */ + public Configuration getConf() { + return this.conf; + } + + /** + * Escapes any character that needs escaping so it can be used in a regexp. + */ + protected String regexEscape(String in) { + String result = in; + if (in != null) { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < in.length(); i++) { + CharSequence c = in.subSequence(i, i+1); + if ("<([{\\^-=$!|]})?*+.>".contains(c)) { + sb.append('\\'); + } + sb.append(c); + } + result = sb.toString(); + } + return result; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html new file mode 100644 index 0000000..f4b5146 --- /dev/null +++ b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>A simple plugin called at indexing that adds fields with static data. You can specify a list of fieldname:fieldcontent per nutch job. It can be useful when collections can't be created by urlpatterns, like in subcollection, but on a job-basis.</p><p></p> +</body> +</html>
