http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java deleted file mode 100644 index 8f2bee5..0000000 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Indexing filter to add document metadata to the index. - * Metadata may come from CrawlDb, parse or content metadata. - */ -package org.apache.nutch.indexer.metadata; -
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/build.xml b/src/plugin/index-more/build.xml deleted file mode 100644 index dec1e12..0000000 --- a/src/plugin/index-more/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="index-more" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/ivy.xml b/src/plugin/index-more/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/index-more/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/plugin.xml b/src/plugin/index-more/plugin.xml deleted file mode 100644 index d920f72..0000000 --- a/src/plugin/index-more/plugin.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="index-more" - name="More Indexing Filter" - version="1.0.0" - provider-name="nutch.org"> - - - <runtime> - <library name="index-more.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.indexer.more" - name="Nutch More Indexing Filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="MoreIndexingFilter" - class="org.apache.nutch.indexer.more.MoreIndexingFilter"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java deleted file mode 100644 index 6e64ede..0000000 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ /dev/null @@ -1,344 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.more; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.nutch.metadata.Metadata; - -import org.apache.nutch.net.protocols.HttpDateFormat; -import org.apache.nutch.net.protocols.Response; - -import org.apache.nutch.parse.Parse; - -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.NutchDocument; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.util.MimeUtil; -import org.apache.tika.Tika; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; - -import java.text.ParseException; - -import java.io.BufferedReader; -import java.io.IOException; -import java.util.Date; -import java.util.regex.*; -import java.util.HashMap; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.time.DateUtils; - -/** - * Add (or reset) a few metaData properties as respective fields (if they are - * available), so that they can be accurately used within the search index. - * - * 'lastModifed' is indexed to support query by date, 'contentLength' obtains - * content length from the HTTP header, 'type' field is indexed to support query - * by type and finally the 'title' field is an attempt to reset the title if a - * content-disposition hint exists. 
The logic is that such a presence is - * indicative that the content provider wants the filename therein to be used as - * the title. - * - * Still need to make content-length searchable! - * - * @author John Xing - */ - -public class MoreIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory - .getLogger(MoreIndexingFilter.class); - - /** Get the MimeTypes resolver instance. */ - private MimeUtil MIME; - private Tika tika = new Tika(); - - /** Map for mime-type substitution */ - private HashMap<String, String> mimeMap = null; - private boolean mapMimes = false; - - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - String url_s = url.toString(); - - addTime(doc, parse.getData(), url_s, datum); - addLength(doc, parse.getData(), url_s); - addType(doc, parse.getData(), url_s, datum); - resetTitle(doc, parse.getData(), url_s); - - return doc; - } - - // Add time related meta info. Add last-modified if present. Index date as - // last-modified, or, if that's not present, use fetch time. 
- private NutchDocument addTime(NutchDocument doc, ParseData data, String url, - CrawlDatum datum) { - long time = -1; - - String lastModified = data.getMeta(Metadata.LAST_MODIFIED); - if (lastModified != null) { // try parse last-modified - time = getTime(lastModified, url); // use as time - // store as string - doc.add("lastModified", new Date(time)); - } - - if (time == -1) { // if no last-modified specified in HTTP header - time = datum.getModifiedTime(); // use value in CrawlDatum - if (time <= 0) { // if also unset - time = datum.getFetchTime(); // use time the fetch took place (fetchTime - // of fetchDatum) - } - } - - // un-stored, indexed and un-tokenized - doc.add("date", new Date(time)); - return doc; - } - - private long getTime(String date, String url) { - long time = -1; - try { - time = HttpDateFormat.toLong(date); - } catch (ParseException e) { - // try to parse it as date in alternative format - try { - Date parsedDate = DateUtils.parseDate(date, new String[] { - "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", - "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", - "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", - "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", - "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", - "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", - "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", - "MMM dd yyyy HH:mm:ss. 
zzz", "MMM dd yyyy HH:mm:ss zzz", - "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", - "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", - "yyyy-MM-dd'T'HH:mm:ss'Z'" }); - time = parsedDate.getTime(); - // if (LOG.isWarnEnabled()) { - // LOG.warn(url + ": parsed date: " + date +" to:"+time); - // } - } catch (Exception e2) { - if (LOG.isWarnEnabled()) { - LOG.warn(url + ": can't parse erroneous date: " + date); - } - } - } - return time; - } - - // Add Content-Length - private NutchDocument addLength(NutchDocument doc, ParseData data, String url) { - String contentLength = data.getMeta(Response.CONTENT_LENGTH); - - if (contentLength != null) { - // NUTCH-1010 ContentLength not trimmed - String trimmed = contentLength.toString().trim(); - if (!trimmed.isEmpty()) - doc.add("contentLength", trimmed); - } - return doc; - } - - /** - * <p> - * Add Content-Type and its primaryType and subType add contentType, - * primaryType and subType to field "type" as un-stored, indexed and - * un-tokenized, so that search results can be confined by contentType or its - * primaryType or its subType. - * </p> - * <p> - * For example, if contentType is application/vnd.ms-powerpoint, search can be - * done with one of the following qualifiers - * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint - * all case insensitive. The query filter is implemented in - * {@link TypeQueryFilter}. 
- * </p> - * - * @param doc - * @param data - * @param url - * @return - */ - private NutchDocument addType(NutchDocument doc, ParseData data, String url, - CrawlDatum datum) { - String mimeType = null; - String contentType = null; - - Writable tcontentType = datum.getMetaData().get( - new Text(Response.CONTENT_TYPE)); - if (tcontentType != null) { - contentType = tcontentType.toString(); - } else - contentType = data.getMeta(Response.CONTENT_TYPE); - if (contentType == null) { - // Note by Jerome Charron on 20050415: - // Content Type not solved by a previous plugin - // Or unable to solve it... Trying to find it - // Should be better to use the doc content too - // (using MimeTypes.getMimeType(byte[], String), but I don't know - // which field it is? - // if (MAGIC) { - // contentType = MIME.getMimeType(url, content); - // } else { - // contentType = MIME.getMimeType(url); - // } - - mimeType = tika.detect(url); - } else { - mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); - } - - // Checks if we solved the content-type. - if (mimeType == null) { - return doc; - } - - // Check if we have to map mime types - if (mapMimes) { - // Check if the current mime is mapped - if (mimeMap.containsKey(mimeType)) { - // It's mapped, let's replace it - mimeType = mimeMap.get(mimeType); - } - } - - contentType = mimeType; - doc.add("type", contentType); - - // Check if we need to split the content type in sub parts - if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) { - String[] parts = getParts(contentType); - - for (String part : parts) { - doc.add("type", part); - } - } - - // leave this for future improvement - // MimeTypeParameterList parameterList = mimeType.getParameters() - - return doc; - } - - /** - * Utility method for splitting mime type into type and subtype. 
- * - * @param mimeType - * @return - */ - static String[] getParts(String mimeType) { - return mimeType.split("/"); - } - - // Reset title if we see non-standard HTTP header "Content-Disposition". - // It's a good indication that content provider wants filename therein - // be used as the title of this url. - - // Patterns used to extract filename from possible non-standard - // HTTP header "Content-Disposition". Typically it looks like: - // Content-Disposition: inline; filename="foo.ppt" - private Configuration conf; - - static Pattern patterns[] = { null, null }; - - static { - try { - // order here is important - patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]"); - patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b"); - } catch (PatternSyntaxException e) { - // just ignore - } - } - - private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) { - String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION); - if (contentDisposition == null || doc.getFieldValue("title") != null) - return doc; - - for (int i = 0; i < patterns.length; i++) { - Matcher matcher = patterns[i].matcher(contentDisposition); - if (matcher.find()) { - doc.add("title", matcher.group(1)); - break; - } - } - - return doc; - } - - public void setConf(Configuration conf) { - this.conf = conf; - MIME = new MimeUtil(conf); - - if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false) == true) { - mapMimes = true; - - // Load the mapping - try { - readConfiguration(); - } catch (Exception e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - } - } - } - - public Configuration getConf() { - return this.conf; - } - - private void readConfiguration() throws IOException { - LOG.info("Reading content type mappings from file contenttype-mapping.txt"); - BufferedReader reader = new BufferedReader( - conf.getConfResourceAsReader("contenttype-mapping.txt")); - String line; - String parts[]; - boolean formatWarningShown = false; - - 
mimeMap = new HashMap<String, String>(); - - while ((line = reader.readLine()) != null) { - if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { - line.trim(); - parts = line.split("\t"); - - // Must be at least two parts - if (parts.length > 1) { - for (int i = 1; i < parts.length; i++) { - mimeMap.put(parts[i].trim(), parts[0].trim()); - } - } else { - LOG.warn("Wrong format of line: {}", line); - if (!formatWarningShown) { - LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]"); - formatWarningShown = true; - } - } - } - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html deleted file mode 100644 index 7b8fade..0000000 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html +++ /dev/null @@ -1,6 +0,0 @@ -<html> -<body> -<p>A more indexing plugin, adds "more" index fields: -last modified date, MIME type, content length.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java deleted file mode 100644 index f918dde..0000000 --- a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.more; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestMoreIndexingFilter { - - @Test - public void testContentType() throws IndexingException { - Configuration conf = NutchConfiguration.create(); - assertContentType(conf, "text/html", "text/html"); - assertContentType(conf, "text/html; charset=UTF-8", "text/html"); - } - - @Test - public void testGetParts() { - String[] parts = MoreIndexingFilter.getParts("text/html"); - assertParts(parts, 2, "text", "html"); - } - - /** - * @since NUTCH-901 - */ - @Test - public void testNoParts() { - Configuration conf = NutchConfiguration.create(); - conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); - 
MoreIndexingFilter filter = new MoreIndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - ParseImpl parse = new ParseImpl("foo bar", new ParseData()); - - try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), - new CrawlDatum(), new Inlinks()); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - Assert.assertNotNull(doc); - Assert.assertTrue(doc.getFieldNames().contains("type")); - Assert.assertEquals(1, doc.getField("type").getValues().size()); - Assert.assertEquals("text/html", doc.getFieldValue("type")); - } - - @Test - public void testContentDispositionTitle() throws IndexingException { - Configuration conf = NutchConfiguration.create(); - - Metadata metadata = new Metadata(); - metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext"); - MoreIndexingFilter filter = new MoreIndexingFilter(); - filter.setConf(conf); - - Text url = new Text("http://www.example.com/"); - ParseImpl parseImpl = new ParseImpl("text", new ParseData( - new ParseStatus(), "title", new Outlink[0], metadata)); - - NutchDocument doc = new NutchDocument(); - doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); - - Assert.assertEquals("content-disposition not detected", "filename.ext", - doc.getFieldValue("title")); - - /* NUTCH-1140: do not add second title to avoid a multi-valued title field */ - doc = new NutchDocument(); - doc.add("title", "title"); - doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); - Assert.assertEquals("do not add second title by content-disposition", - "title", doc.getFieldValue("title")); - } - - private void assertParts(String[] parts, int count, String... 
expected) { - Assert.assertEquals(count, parts.length); - for (int i = 0; i < expected.length; i++) { - Assert.assertEquals(expected[i], parts[i]); - } - } - - private void assertContentType(Configuration conf, String source, - String expected) throws IndexingException { - Metadata metadata = new Metadata(); - metadata.add(Response.CONTENT_TYPE, source); - MoreIndexingFilter filter = new MoreIndexingFilter(); - filter.setConf(conf); - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl( - "text", new ParseData(new ParseStatus(), "title", new Outlink[0], - metadata)), new Text("http://www.example.com/"), new CrawlDatum(), - new Inlinks()); - Assert.assertEquals("mime type not detected", expected, - doc.getFieldValue("type")); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/README.txt ---------------------------------------------------------------------- diff --git a/src/plugin/index-replace/README.txt b/src/plugin/index-replace/README.txt deleted file mode 100644 index 4c866a7..0000000 --- a/src/plugin/index-replace/README.txt +++ /dev/null @@ -1,95 +0,0 @@ -IndexReplace plugin - -Allows indexing-time regexp replace manipulation of metadata fields. - -Configuration Example - <property> - <name>index.replace.regexp</name> - <value> - id=/file\:/http\:my.site.com/ - url=/file\:/http\:my.site.com/2 - </value> - </property - -Property format: index.replace.regexp - The format of the property is a list of regexp replacements, one line per field being - modified. Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure. - - The fieldname precedes the equal sign. The first character after the equal sign signifies - the delimiter for the regexp, the replacement value and the flags. - -Replacement Sequence - The replacements will happen in the order listed. If a field needs multiple replacement operations - they may be listed more than once. 
- -RegExp Format - The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined - here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29 - Patterns are compiled when the plugin is initialized for efficiency. - -Replacement Format - The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement): - http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29 - -Flags - The flags is an integer sum of the flag values defined in - http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern) - -Creating New Fields - If you express the fieldname as fldname1:fldname2=[replacement], then the replacer will create a new field - from the source field. The source field remains unmodified. This is an alternative to solrindex-mapping - which is only able to copy fields verbatim. - -Multi-valued Fields - If a field has multiple values, the replacement will be applied to each value in turn. - -Non-string Datatypes - Replacement is possible only on String field datatypes. If the field you name in the property is - not a String datatype, it will be silently ignored. - -Host and URL specific replacements. - If the replacements should apply only to specific pages, then add a sequence like - - hostmatch=hostmatchpattern - fld1=/regexp/replace/flags - fld2=/regexp/replace/flags - - or - urlmatch=urlmatchpattern - fld1=/regexp/replace/flags - fld2=/regexp/replace/flags - -When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch -will apply to all parsed pages. Replacements following a hostmatch or urlmatch will be applied -to pages which match the host or url field (up to the next hostmatch or urlmatch line). hostmatch -and urlmatch patterns must be unique in this property. 
- -Plugin order - In most cases you will want this plugin to run last. - -Testing your match patterns - Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html - can help get the basics of your pattern working. - To test in nutch: - Prepare a test HTML file with the field contents you want to test. - Place this in a directory accessible to nutch. - Use the file:/// syntax to list the test file(s) in a test/urls seed list. - See the nutch faq "index my local file system" for conf settings you will need. - (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This - test approach confirms only how your global matches behave, unless your urlmatch and hostmatch - patterns also match the file: URL pattern) - - Run.. - bin/nutch inject crawl/crawldb test - bin/nutch generate crawl/crawldb crawl/segments - bin/nutch fetch crawl/segments/[segment] - bin/nutch parse crawl/segments/[segment] - bin/nutch invertlinks crawl/linkdb -dir crawl/segments - ...index your document, for example with SOLR... - bin/nutch solrindex http://localhost:8983/solr crawl/crawldb/ -linkdb crawl/linkdb/ crawl/segement[segment] -filter -normalize - - Inspect hadoop.log for info about pattern parsing and compilation.. - grep replace logs/hadoop.log - - To inspect your index with the solr admin panel... - http://localhost:8983/solr/#/ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-replace/build.xml b/src/plugin/index-replace/build.xml deleted file mode 100644 index ea8c95d..0000000 --- a/src/plugin/index-replace/build.xml +++ /dev/null @@ -1,55 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="index-replace" default="jar-core"> - - <import file="../build-plugin.xml" /> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/index-basic/*.jar" /> - <include name="**/index-metadata/*.jar" /> - </fileset> - <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> - </path> - - <!-- Compile Unit test dependencies --> - <target name="deps-test-compile"> - <ant target="compile-test" inheritall="false" dir="../index-basic"/> - <ant target="compile-test" inheritall="false" dir="../index-metadata"/> - </target> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" /> - <ant target="deploy" inheritall="false" dir="../protocol-file" /> - <ant target="deploy" inheritall="false" dir="../parse-html" /> - <ant target="deploy" inheritall="false" dir="../parse-metatags" /> - <ant target="deploy" inheritall="false" dir="../index-basic" /> - <ant target="deploy" inheritall="false" dir="../index-metadata" /> - </target> - - <!-- Copy test file for junit test --> - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="sample"> - <include name="*.html" /> - </fileset> - </copy> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/ivy.xml 
---------------------------------------------------------------------- diff --git a/src/plugin/index-replace/ivy.xml b/src/plugin/index-replace/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/index-replace/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-replace/plugin.xml b/src/plugin/index-replace/plugin.xml deleted file mode 100644 index 3cffe60..0000000 --- a/src/plugin/index-replace/plugin.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<plugin - id="index-replace" - name="Replace Indexer" - version="1.0" - provider-name="PeterCiuffetti"> - - <runtime> - <library name="index-replace.jar"> - <export name="*"/> - </library> - </runtime> - - <extension id="org.apache.nutch.indexer.replace" - name="Replace Indexer" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="ReplaceIndexer" - class="org.apache.nutch.indexer.replace.ReplaceIndexer"/> - </extension> - -</plugin> - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/sample/testIndexReplace.html ---------------------------------------------------------------------- diff --git a/src/plugin/index-replace/sample/testIndexReplace.html b/src/plugin/index-replace/sample/testIndexReplace.html deleted file mode 100644 index 0b90fc2..0000000 --- a/src/plugin/index-replace/sample/testIndexReplace.html +++ /dev/null @@ -1,12 +0,0 @@ -<html> - <head> - <title>Testing the power of the index-replace plugin</title> - <meta name="description" content="With this plugin, I control the 
description! Bwuhuhuhaha!"> - <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!"> - <meta name="author" content="Peter Ciuffetti"> - </head> - <body> - <p>This html file is used to test the Nutch index-replace regexp replacer plugin. - A decidedly boring thing to do.</p> - </body> -</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java deleted file mode 100644 index ddfe24d..0000000 --- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java +++ /dev/null @@ -1,196 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
package org.apache.nutch.indexer.replace;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Immutable holder for a source field name, a target field name, a compiled
 * match pattern and its replacement string.
 *
 * A checkAndReplace method is provided where you can simultaneously check if
 * the field matches this replacer and if the pattern matches your field value.
 *
 * Construction never throws for bad arguments: a replacer built from missing
 * or uncompilable input is flagged via {@link #isValid()} and leaves every
 * value it is asked to process unchanged.
 *
 * @author Peter Ciuffetti
 */
public class FieldReplacer {

  private static final Log LOG = LogFactory.getLog(FieldReplacer.class
      .getName());

  private final String fieldName;
  private final String toFieldName;
  private final Pattern pattern;
  private final String replacement;
  private final boolean isValid;

  /**
   * Create a FieldReplacer for a field.
   *
   * Missing required arguments and pattern compilation failures are caught
   * within this constructor and the object is marked invalid. The error is
   * logged. This prevents the caller from attempting invalid replacements.
   *
   * @param fieldName
   *          the name of the source field to operate on. Required.
   * @param toFieldName
   *          the name of the target field. Required.
   * @param pattern
   *          the pattern the field must match. Required.
   * @param replacement
   *          the replacement string; null is treated as the empty string
   * @param flags
   *          the Pattern flags value, or null if no flags are needed
   */
  public FieldReplacer(String fieldName, String toFieldName, String pattern,
      String replacement, Integer flags) {

    boolean valid = true;

    // Must have a non-empty field name, a target field name and a pattern.
    if (isBlank(fieldName)) {
      LOG.error("Empty fieldName provided, FieldReplacer marked invalid.");
      valid = false;
    }
    if (toFieldName == null) {
      LOG.error("No target field name provided for field " + fieldName
          + ", FieldReplacer marked invalid.");
      valid = false;
    }
    if (isBlank(pattern)) {
      LOG.error("Empty pattern for field " + fieldName
          + " provided, FieldReplacer marked invalid.");
      valid = false;
    }

    // Null-safe: an invalid replacer still answers its getters and
    // checkAndReplace() without throwing.
    this.fieldName = (fieldName == null) ? "" : fieldName.trim();
    this.toFieldName = (toFieldName == null) ? "" : toFieldName.trim();
    this.replacement = (replacement == null) ? "" : replacement;

    Pattern compiled = null;
    if (valid) {
      LOG.info("Compiling pattern " + pattern + " for field " + fieldName);
      try {
        compiled = (flags != null) ? Pattern.compile(pattern, flags)
            : Pattern.compile(pattern);
      } catch (PatternSyntaxException e) {
        LOG.error("Pattern " + pattern + " for field " + fieldName
            + " failed to compile: " + e.toString());
        valid = false;
      } catch (IllegalArgumentException e) {
        // Pattern.compile(String, int) documents this for flag bits that do
        // not correspond to a defined match flag.
        LOG.error("Flags " + flags + " for pattern " + pattern + " for field "
            + fieldName + " are not valid: " + e.toString());
        valid = false;
      }
    }
    this.pattern = compiled;
    this.isValid = valid;
  }

  /**
   * Field replacer with the input and output field the same.
   *
   * @param fieldName
   *          the name of the field to read from and write to. Required.
   * @param pattern
   *          the pattern the field must match. Required.
   * @param replacement
   *          the replacement string; null is treated as the empty string
   * @param flags
   *          the Pattern flags value, or null if no flags are needed
   */
  public FieldReplacer(String fieldName, String pattern, String replacement,
      Integer flags) {
    this(fieldName, fieldName, pattern, replacement, flags);
  }

  /** @return true if the text is null or contains only whitespace */
  private static boolean isBlank(String text) {
    return text == null || text.trim().length() == 0;
  }

  /** @return the trimmed source field name ("" if none was supplied) */
  public String getFieldName() {
    return this.fieldName;
  }

  /** @return the trimmed target field name ("" if none was supplied) */
  public String getToFieldName() {
    return this.toFieldName;
  }

  /** @return the compiled pattern, or null if this replacer is invalid */
  public Pattern getPattern() {
    return this.pattern;
  }

  /** @return the replacement string, never null */
  public String getReplacement() {
    return this.replacement;
  }

  /**
   * Does this FieldReplacer have valid field names and a pattern that
   * compiled?
   *
   * @return true if replacements can be performed
   */
  public boolean isValid() {
    return this.isValid;
  }

  /**
   * Return the replacement value for a field value.
   *
   * This does not check for a matching field; the caller must decide if this
   * FieldReplacer should operate on this value by checking getFieldName().
   *
   * The method returns the value with the replacement. If the value returned is
   * not different then either the pattern didn't match or the replacement was a
   * no-op.
   *
   * @param value
   *          the field value to rewrite; may be null
   * @return the rewritten value, or the value unchanged if it is null or this
   *         replacer is invalid
   */
  public String replace(String value) {
    if (!this.isValid || value == null) {
      return value;
    }
    return this.pattern.matcher(value).replaceAll(this.replacement);
  }

  /**
   * Return a replacement value for a field.
   *
   * This is designed to fail fast and trigger a replacement only when
   * necessary. If this method returns null, either the field does not match or
   * the value does not match the pattern (or possibly the pattern is invalid).
   *
   * So only if the method returns a non-null value will you need to replace the
   * value for the field.
   *
   * @param fieldName
   *          the name of the field you are checking
   * @param value
   *          the value of the field you are checking
   * @return a replacement value. If null, either the field does not match or
   *         the value does not match.
   */
  public String checkAndReplace(String fieldName, String value) {
    if (!this.isValid || !this.fieldName.equals(fieldName)) {
      return null;
    }
    if (value == null || value.length() == 0) {
      return null;
    }
    Matcher m = this.pattern.matcher(value);
    return m.find() ? m.replaceAll(this.replacement) : null;
  }
}
package org.apache.nutch.indexer.replace;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.parse.Parse;

/**
 * Do pattern replacements on selected field contents prior to indexing.
 *
 * To use this plugin, add <code>index-replace</code> to your
 * <code>plugin.includes</code>. Example:
 *
 * <pre>
 *   &lt;property&gt;
 *    &lt;name&gt;plugin.includes&lt;/name&gt;
 *    &lt;value&gt;protocol-(http)|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata|replace)|urlnormalizer-(pass|regex|basic)|indexer-solr&lt;/value&gt;
 *   &lt;/property&gt;
 * </pre>
 *
 * And then add the <code>index.replace.regexp</code> property to
 * <code>conf/nutch-site.xml</code>. This contains a list of replacement
 * instructions per field name, one per line. eg.
 *
 * <pre>
 *   fieldname=/regexp/replacement/[flags]
 * </pre>
 *
 * <pre>
 *   &lt;property&gt;
 *    &lt;name&gt;index.replace.regexp&lt;/name&gt;
 *    &lt;value&gt;
 *      hostmatch=.*\\.com
 *      title=/search/replace/2
 *    &lt;/value&gt;
 *   &lt;/property&gt;
 * </pre>
 *
 * <code>hostmatch=</code> and <code>urlmatch=</code> lines indicate the match
 * pattern for a host or url. The field replacements that follow this line will
 * apply only to pages from the matching host or url. Replacements run in the
 * order specified. Field names may appear multiple times if multiple
 * replacements are needed. A field name of the form
 * <code>source:target</code> writes the replaced values of
 * <code>source</code> into <code>target</code>.
 *
 * The property format is defined in greater detail in
 * <code>conf/nutch-default.xml</code>.
 *
 * @author Peter Ciuffetti
 * @see <a
 *      href="https://issues.apache.org/jira/browse/NUTCH-2058">NUTCH-2058</a>
 */
public class ReplaceIndexer implements IndexingFilter {

  private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class
      .getName());

  /** Special field name signifying the start of a host-specific match set */
  private static final String HOSTMATCH = "hostmatch";
  /** Special field name signifying the start of a url-specific match set */
  private static final String URLMATCH = "urlmatch";

  // NOTE: these maps are static, so they are shared by every ReplaceIndexer
  // instance; each call to setConf() clears and repopulates them.
  private static final Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_HOST = new LinkedHashMap<Pattern, List<FieldReplacer>>();
  private static final Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_URL = new LinkedHashMap<Pattern, List<FieldReplacer>>();

  private static final Pattern LINE_SPLIT = Pattern.compile("(^.+$)+",
      Pattern.MULTILINE);
  private static final Pattern NAME_VALUE_SPLIT = Pattern
      .compile("(.*?)=(.*)");

  private Configuration conf;

  /**
   * {@inheritDoc}
   *
   * Discards any previously parsed replacement sets and re-parses the
   * <code>index.replace.regexp</code> property, if present.
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    FIELDREPLACERS_BY_HOST.clear();
    FIELDREPLACERS_BY_URL.clear();
    String value = conf.get("index.replace.regexp", null);
    if (value != null) {
      LOG.debug("Parsing index.replace.regexp property");
      this.parseConf(value);
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Parse the property value into a set of maps that store a list of
   * replacements by field for each host and url configured into the property.
   *
   * @param propertyValue
   *          the raw multi-line value of <code>index.replace.regexp</code>
   */
  private void parseConf(String propertyValue) {
    if (propertyValue == null || propertyValue.trim().length() == 0) {
      return;
    }

    // At the start, all replacements apply globally to every host.
    Pattern hostPattern = Pattern.compile(".*");
    Pattern urlPattern = null;

    // Split the property into lines
    Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue);
    while (lineMatcher.find()) {
      String line = lineMatcher.group();
      if (line == null || line.length() == 0) {
        continue;
      }

      // Split the line into field and value
      Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim());
      if (!nameValueMatcher.find()) {
        continue;
      }
      String fieldName = nameValueMatcher.group(1).trim();
      String value = nameValueMatcher.group(2);

      // Check if the field name is one of our special cases.
      if (HOSTMATCH.equals(fieldName)) {
        // A new host section ends any url section that was in effect.
        urlPattern = null;
        // On failure, deactivate this match set by making it match no host.
        hostPattern = compileMatchPattern(HOSTMATCH, value,
            "willnotmatchanyhost");
      } else if (URLMATCH.equals(fieldName)) {
        // On failure, deactivate this match set by making it match no url.
        urlPattern = compileMatchPattern(URLMATCH, value, "willnotmatchanyurl");
      } else {
        FieldReplacer fr = parseReplacer(fieldName, value, line);
        if (fr == null) {
          continue;
        }
        // Add this field replacer to the list for this host or URL.
        if (urlPattern != null) {
          register(FIELDREPLACERS_BY_URL, urlPattern, fr);
        } else {
          register(FIELDREPLACERS_BY_HOST, hostPattern, fr);
        }
      }
    }
  }

  /**
   * Compile a hostmatch/urlmatch expression, logging and substituting the
   * given fallback expression when it does not compile.
   */
  private static Pattern compileMatchPattern(String kind, String regex,
      String fallbackRegex) {
    try {
      return Pattern.compile(regex);
    } catch (PatternSyntaxException pse) {
      LOG.error(kind + " pattern " + regex + " does not compile: "
          + pse.getMessage());
      return Pattern.compile(fallbackRegex);
    }
  }

  /**
   * Turn one <code>field[:target]=/pattern/replacement/[flags]</code> entry
   * into a FieldReplacer, or return null (after logging) if it is unparseable.
   */
  private static FieldReplacer parseReplacer(String fieldName, String value,
      String line) {
    if (value.length() <= 3) {
      LOG.warn("Pattern '" + line + "' is too short to be a replacement, ignored");
      return null;
    }

    // If the fieldname has a colon, this indicates a different target field.
    String toFieldName = fieldName;
    int colon = fieldName.indexOf(':');
    if (colon > 0) {
      toFieldName = fieldName.substring(colon + 1);
      fieldName = fieldName.substring(0, colon);
    }

    // The first character of the value is the separator for this entry.
    String sep = value.substring(0, 1);

    // Divide the value into pattern / replacement / flags.
    String rest = value.substring(1);
    int patternEnd = rest.indexOf(sep);
    if (patternEnd < 0) {
      LOG.error("Pattern '" + line + "', not parseable.  Missing separator "
          + sep);
      return null;
    }
    String pattern = rest.substring(0, patternEnd);
    rest = rest.substring(patternEnd + 1);

    String replacement = rest;
    int replacementEnd = rest.indexOf(sep);
    if (replacementEnd >= 0) {
      replacement = rest.substring(0, replacementEnd);
    }

    int flags = 0;
    if (rest.length() > replacement.length() + 1) {
      String flagText = rest.substring(replacement.length() + 1).trim();
      try {
        flags = Integer.parseInt(flagText);
      } catch (NumberFormatException e) {
        LOG.error("Pattern " + line + ", has invalid flags component");
        return null;
      }
    }
    Integer iFlags = (flags > 0) ? Integer.valueOf(flags) : null;

    // Make a FieldReplacer out of these params.
    return new FieldReplacer(fieldName, toFieldName, pattern, replacement,
        iFlags);
  }

  /** Append a replacer to the list kept for the given host/url pattern. */
  private static void register(Map<Pattern, List<FieldReplacer>> replaceMap,
      Pattern key, FieldReplacer replacer) {
    List<FieldReplacer> replacers = replaceMap.get(key);
    if (replacers == null) {
      replacers = new ArrayList<FieldReplacer>();
      replaceMap.put(key, replacers);
    }
    replacers.add(replacer);
  }

  /**
   * {@inheritDoc}
   *
   * Applies the host-keyed replacement sets first, then the url-keyed ones.
   */
  @Override
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {

    if (doc != null) {
      if (FIELDREPLACERS_BY_HOST.size() > 0) {
        this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
      }

      if (FIELDREPLACERS_BY_URL.size() > 0) {
        this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
      }
    }

    return doc;
  }

  /**
   * Iterates through the replacement map provided, to update the fields in the
   * Nutch Document.
   *
   * @param doc
   *          the document we are modifying
   * @param keyName
   *          either "host" or "url" -- the field that determines the
   *          replacement set used
   * @param replaceMap
   *          the list of FieldReplacers that applies to this keyName.
   */
  private void doReplace(NutchDocument doc, String keyName,
      Map<Pattern, List<FieldReplacer>> replaceMap) {

    if (doc == null || replaceMap.size() == 0) {
      return;
    }

    Collection<String> docFieldNames = doc.getFieldNames();
    NutchField keyField = doc.getField(keyName);
    if (keyField == null) {
      // This document doesn't have the key field; no work to do.
      return;
    }

    List<Object> keyFieldValues = keyField.getValues();
    if (keyFieldValues.size() == 0) {
      // This document doesn't have any values for the key field; no work to do.
      return;
    }

    // For every value of the keyField (one expected)
    for (Object oKeyFieldValue : keyFieldValues) {
      if (!(oKeyFieldValue instanceof String)) {
        continue;
      }
      String keyFieldValue = (String) oKeyFieldValue;

      // For each pattern that we have a replacement list for...
      for (Map.Entry<Pattern, List<FieldReplacer>> entry : replaceMap
          .entrySet()) {
        // If this key is a match for a replacement set...
        if (!entry.getKey().matcher(keyFieldValue).find()) {
          continue;
        }
        // For each field we will replace for this key...
        for (FieldReplacer replacer : entry.getValue()) {
          applyReplacer(doc, docFieldNames, replacer);
        }
      }
    }
  }

  /**
   * Run one replacer over its source field and store the result in its target
   * field. String values are rewritten; values of any other type cannot be
   * pattern-matched and are carried over unchanged rather than dropped.
   */
  private static void applyReplacer(NutchDocument doc,
      Collection<String> docFieldNames, FieldReplacer replacer) {
    String fieldName = replacer.getFieldName();

    // Does this document contain the FieldReplacer's field?
    if (!docFieldNames.contains(fieldName)) {
      return;
    }
    NutchField docField = doc.getField(fieldName);
    if (docField == null) {
      return;
    }

    // Collect the new values before touching the document: the target field
    // may be the very field being read.
    List<Object> newFieldValues = new ArrayList<Object>();
    for (Object oFieldValue : docField.getValues()) {
      if (oFieldValue instanceof String) {
        newFieldValues.add(replacer.replace((String) oFieldValue));
      } else if (oFieldValue != null) {
        newFieldValues.add(oFieldValue);
      }
    }

    // Remove the target field and add our replaced values.
    String targetFieldName = replacer.getToFieldName();
    doc.removeField(targetFieldName);
    for (Object newFieldValue : newFieldValues) {
      doc.add(targetFieldName, newFieldValue);
    }
  }
}
- */ -package org.apache.nutch.indexer.replace; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java b/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java deleted file mode 100644 index ca90ca3..0000000 --- a/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java +++ /dev/null @@ -1,456 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.indexer.replace; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.indexer.basic.BasicIndexingFilter; -import org.apache.nutch.indexer.metadata.MetadataIndexer; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * JUnit tests for the <code>index-replace</code> plugin. - * - * In these tests, the sample file has some meta tags added to the Nutch - * document by the <code>index-metadata</code> plugin. The - * <code>index-replace</code> plugin is then used to either change (or not - * change) the fields depending on the various values of - * <code>index.replace.regexp</code> property being provided to Nutch. - * - * - * @author Peter Ciuffetti - * - */ -public class TestIndexReplace { - - private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp"; - - private String fileSeparator = System.getProperty("file.separator"); - private String sampleDir = System.getProperty("test.data", "."); - private String sampleFile = "testIndexReplace.html"; - - /** - * Run a test file through the Nutch parser and index filters. 
- * - * @param fileName - * @param conf - * @return the Nutch document with the replace indexer applied - */ - public NutchDocument parseAndFilterFile(String fileName, Configuration conf) { - NutchDocument doc = new NutchDocument(); - - BasicIndexingFilter basicIndexer = new BasicIndexingFilter(); - basicIndexer.setConf(conf); - Assert.assertNotNull(basicIndexer); - - MetadataIndexer metaIndexer = new MetadataIndexer(); - metaIndexer.setConf(conf); - Assert.assertNotNull(basicIndexer); - - ReplaceIndexer replaceIndexer = new ReplaceIndexer(); - replaceIndexer.setConf(conf); - Assert.assertNotNull(replaceIndexer); - - try { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Text text = new Text(urlString); - CrawlDatum crawlDatum = new CrawlDatum(); - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(text, crawlDatum) - .getContent(); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - crawlDatum.setFetchTime(100L); - - Inlinks inlinks = new Inlinks(); - doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks); - doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks); - doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.toString()); - } - - return doc; - } - - /** - * Test property parsing. - * - * The filter does not expose details of the parse. So all we are checking is - * that the parse does not throw a runtime exception and that the value - * provided is the value returned. 
- */ - @Test - public void testPropertyParse() { - Configuration conf = NutchConfiguration.create(); - String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/2\n" - + " metatag.keywords=/\\,/\\!/\n" - + " hostmatch=.*.com\n" - + " metatag.keywords=/\\,/\\?/\n" - + " metatag.author:dc_author=/\\s+/ David /\n" - + " urlmatch=.*.html\n" - + " metatag.keywords=/\\,/\\./\n" + " metatag.author=/\\s+/ D. /\n"; - - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - - ReplaceIndexer rp = new ReplaceIndexer(); - try { - rp.setConf(conf); - } catch (RuntimeException ohno) { - Assert.fail("Unable to parse a valid index.replace.regexp property! " - + ohno.getMessage()); - } - - Configuration parsedConf = rp.getConf(); - - // Does the getter equal the setter? Too easy! - Assert.assertEquals(indexReplaceProperty, - parsedConf.get(INDEX_REPLACE_PROPERTY)); - } - - /** - * Test metatag value replacement using global replacement settings. - * - * The index.replace.regexp property does not use hostmatch or urlmatch, so - * all patterns are global. - */ - @Test - public void testGlobalReplacement() { - String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; - String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; - String expectedAuthor = "Peter D. Ciuffetti"; - String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/\n" - + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. 
/\n"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - Assert - .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); - Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); - } - - /** - * Test that invalid property settings are handled and ignored. - * - * This test provides an invalid property setting that will fail property - * parsing and Pattern.compile. The expected outcome is that the patterns will - * not cause failure and the targeted fields will not be modified by the - * filter. - */ - @Test - public void testInvalidPatterns() { - String expectedDescription = "With this plugin, I control the description! 
Bwuhuhuhaha!"; - String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; - String expectedAuthor = "Peter Ciuffetti"; - // Contains: invalid pattern, invalid flags, incomplete property - String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n" - + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Assert that our metatags have not changed. - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - Assert - .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); - Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); - - } - - /** - * Test URL pattern matching - */ - @Test - public void testUrlMatchesPattern() { - String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; - String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; - String expectedAuthor = "Peter D. Ciuffetti"; - String indexReplaceProperty = " urlmatch=.*.html\n" - + " metatag.description=/this(.*)plugin/this awesome plugin/\n" - + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. 
/\n"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Assert that our metatags have changed. - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - Assert - .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); - Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); - - } - - /** - * Test URL pattern not matching. - * - * Expected result is that the filter does not change the fields. - */ - @Test - public void testUrlNotMatchesPattern() { - String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; - String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; - String expectedAuthor = "Peter Ciuffetti"; - String indexReplaceProperty = " urlmatch=.*.xml\n" - + " metatag.description=/this(.*)plugin/this awesome plugin/\n" - + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. 
/\n"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Assert that our metatags have not changed. - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - Assert - .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); - Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); - - } - - /** - * Test a global pattern match for description and URL pattern match for - * keywords and author. - * - * All three should be triggered. It also tests replacement groups. - */ - @Test - public void testGlobalAndUrlMatchesPattern() { - String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; - String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; - String expectedAuthor = "Peter D. Ciuffetti"; - String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" - + " urlmatch=.*.html\n" - + " metatag.keywords=/\\,/\\!/\n" - + " metatag.author=/\\s+/ D. 
/\n"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Assert that our metatags have changed. - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - Assert - .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); - Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); - - } - - /** - * Test a global pattern match for description and URL pattern match for - * keywords and author. - * - * Only the global match should be triggered. - */ - @Test - public void testGlobalAndUrlNotMatchesPattern() { - String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; - String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; - String expectedAuthor = "Peter Ciuffetti"; - String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" - + " urlmatch=.*.xml\n" - + " metatag.keywords=/\\,/\\!/\n" - + " metatag.author=/\\s+/ D. 
/\n"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Assert that description has changed and the others have not changed. - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - Assert - .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); - Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); - } - - /** - * Test order-specific replacement settings. - * - * This makes multiple replacements on the same field and will produce the - * expected value only if the replacements are run in the order specified. - */ - @Test - public void testReplacementsRunInSpecifedOrder() { - String expectedDescription = "With this awesome plugin, I control the description! 
Bwuhuhuhaha!"; - String indexReplaceProperty = " metatag.description=/this plugin/this amazing plugin/\n" - + " metatag.description=/this amazing plugin/this valuable plugin/\n" - + " metatag.description=/this valuable plugin/this cool plugin/\n" - + " metatag.description=/this cool plugin/this wicked plugin/\n" - + " metatag.description=/this wicked plugin/this awesome plugin/\n"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Check that the value produced by the last replacement has worked. - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - } - - /** - * Test a replacement pattern that uses the flags feature. - * - * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match - * any case. - */ - @Test - public void testReplacementsWithFlags() { - String expectedDescription = "With this awesome plugin, I control the description! 
Bwuhuhuhaha!"; - String indexReplaceProperty = " metatag.description=/THIS PLUGIN/this awesome plugin/2"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. - conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Check that the value produced by the case-insensitive replacement has - // worked. - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - } - - /** - * Test a replacement pattern that uses the target field feature. - * Check that the input is not modifid and that the taret field is added. - */ - @Test - public void testReplacementsDifferentTarget() { - String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; - String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; - String indexReplaceProperty = " metatag.description:new=/this plugin/this awesome plugin/"; - - Configuration conf = NutchConfiguration.create(); - conf.set( - "plugin.includes", - "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); - conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); - conf.set("metatags.names", "author,description,keywords"); - conf.set("index.parse.md", - "metatag.author,metatag.description,metatag.keywords"); - // Not necessary but helpful when debugging the filter. 
- conf.set("http.timeout", "99999999999"); - - // Run the document through the parser and index filters. - NutchDocument doc = parseAndFilterFile(sampleFile, conf); - - // Check that the input field has not been modified - Assert.assertEquals(expectedDescription, - doc.getFieldValue("metatag.description")); - // Check that the output field has created - Assert.assertEquals(expectedTargetDescription, - doc.getFieldValue("new")); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-static/build.xml b/src/plugin/index-static/build.xml deleted file mode 100644 index 0ec5665..0000000 --- a/src/plugin/index-static/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="index-static" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-static/ivy.xml b/src/plugin/index-static/ivy.xml deleted file mode 100644 index 24d7606..0000000 --- a/src/plugin/index-static/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../../ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/index-static/plugin.xml b/src/plugin/index-static/plugin.xml deleted file mode 100644 index 539e355..0000000 --- a/src/plugin/index-static/plugin.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<plugin - id="index-static" - name="Index Static" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="index-static.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - - <extension id="org.apache.nutch.indexer.staticfield" - name="Nutch static field index" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="StaticField" - class="org.apache.nutch.indexer.staticfield.StaticFieldIndexer"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java deleted file mode 100644 index 1a81041..0000000 --- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.indexer.staticfield; - -import java.util.HashMap; -import java.util.Map.Entry; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.Parse; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.conf.Configuration; - -/** - * A simple plugin called at indexing that adds fields with static data. You can - * specify a list of fieldname:fieldcontent per nutch job. It can be useful when - * collections can't be created by urlpatterns, like in subcollection, but on a - * job-basis. - */ - -public class StaticFieldIndexer implements IndexingFilter { - private Configuration conf; - private HashMap<String, String[]> fields; - private boolean addStaticFields = false; - private String fieldSep = ","; - private String kevSep = ":"; - private String valueSep = " "; - - /** - * The {@link StaticFieldIndexer} filter object which adds fields as per - * configuration setting. See {@code index.static} in nutch-default.xml. - * - * @param doc - * The {@link NutchDocument} object - * @param parse - * The relevant {@link Parse} object passing through the filter - * @param url - * URL to be filtered for anchor text - * @param datum - * The {@link CrawlDatum} entry - * @param inlinks - * The {@link Inlinks} containing anchor text - * @return filtered NutchDocument - */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - if (this.addStaticFields == true) { - for (Entry<String, String[]> entry : this.fields.entrySet()) { - for (String val : entry.getValue()) { - doc.add(entry.getKey(), val); - } - } - } - return doc; - } - - /** - * Populate a HashMap from a list of fieldname:fieldcontent. See - * {@index.static} in nutch-default.xml. 
- * - * @param fieldsString - * string containing field:value pairs - * @return HashMap of fields and their corresponding values - */ - private HashMap<String, String[]> parseFields(String fieldsString) { - HashMap<String, String[]> fields = new HashMap<String, String[]>(); - - /* - * The format is very easy, it's a comma-separated list of fields in the - * form <name>:<value> - */ - for (String field : fieldsString.split(this.fieldSep)) { - String[] entry = field.split(this.kevSep); - if (entry.length == 2) - fields.put(entry[0].trim(), entry[1].trim().split(this.valueSep)); - } - - return fields; - } - - /** - * Set the {@link Configuration} object - */ - public void setConf(Configuration conf) { - this.conf = conf; - - // NUTCH-2052: Allow user-defined delimiters in index.static - this.fieldSep = this.regexEscape(conf.get("index.static.fieldsep", ",")); - this.kevSep = this.regexEscape(conf.get("index.static.keysep", ":")); - this.valueSep = this.regexEscape(conf.get("index.static.valuesep", " ")); - - String fieldsString = conf.get("index.static", null); - if (fieldsString != null) { - this.addStaticFields = true; - this.fields = parseFields(fieldsString); - } - } - - /** - * Get the {@link Configuration} object - */ - public Configuration getConf() { - return this.conf; - } - - /** - * Escapes any character that needs escaping so it can be used in a regexp. 
- */ - protected String regexEscape(String in) { - String result = in; - if (in != null) { - StringBuffer sb = new StringBuffer(); - for (int i = 0; i < in.length(); i++) { - CharSequence c = in.subSequence(i, i+1); - if ("<([{\\^-=$!|]})?*+.>".contains(c)) { - sb.append('\\'); - } - sb.append(c); - } - result = sb.toString(); - } - return result; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html deleted file mode 100644 index f4b5146..0000000 --- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>A simple plugin called at indexing that adds fields with static data. You can specify a list of fieldname:fieldcontent per nutch job. It can be useful when collections can't be created by urlpatterns, like in subcollection, but on a job-basis.</p><p></p> -</body> -</html>
