[20/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

thammegowda Tue, 05 Jul 2016 15:49:38 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
deleted file mode 100644
index 8f2bee5..0000000
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Indexing filter to add document metadata to the index.
- * Metadata may come from CrawlDb, parse or content metadata.
- */
-package org.apache.nutch.indexer.metadata;
-


http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/build.xml b/src/plugin/index-more/build.xml
deleted file mode 100644
index dec1e12..0000000
--- a/src/plugin/index-more/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-more" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/ivy.xml b/src/plugin/index-more/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/index-more/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/plugin.xml b/src/plugin/index-more/plugin.xml
deleted file mode 100644
index d920f72..0000000
--- a/src/plugin/index-more/plugin.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="index-more"
-   name="More Indexing Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="index-more.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.indexer.more"
-              name="Nutch More Indexing Filter"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="MoreIndexingFilter"
-                      class="org.apache.nutch.indexer.more.MoreIndexingFilter"/>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
deleted file mode 100644
index 6e64ede..0000000
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ /dev/null
@@ -1,344 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.more;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.metadata.Metadata;
-
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-
-import org.apache.nutch.parse.Parse;
-
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.util.MimeUtil;
-import org.apache.tika.Tika;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-import java.text.ParseException;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.util.Date;
-import java.util.regex.*;
-import java.util.HashMap;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.lang.time.DateUtils;
-
-/**
- * Add (or reset) a few metaData properties as respective fields (if they are
- * available), so that they can be accurately used within the search index.
- * 
- * 'lastModifed' is indexed to support query by date, 'contentLength' obtains
- * content length from the HTTP header, 'type' field is indexed to support query
- * by type and finally the 'title' field is an attempt to reset the title if a
- * content-disposition hint exists. The logic is that such a presence is
- * indicative that the content provider wants the filename therein to be used as
- * the title.
- * 
- * Still need to make content-length searchable!
- * 
- * @author John Xing
- */
-
-public class MoreIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory
-      .getLogger(MoreIndexingFilter.class);
-
-  /** Get the MimeTypes resolver instance. */
-  private MimeUtil MIME;
-  private Tika tika = new Tika();
-
-  /** Map for mime-type substitution */
-  private HashMap<String, String> mimeMap = null;
-  private boolean mapMimes = false;
-
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    String url_s = url.toString();
-
-    addTime(doc, parse.getData(), url_s, datum);
-    addLength(doc, parse.getData(), url_s);
-    addType(doc, parse.getData(), url_s, datum);
-    resetTitle(doc, parse.getData(), url_s);
-
-    return doc;
-  }
-
-  // Add time related meta info. Add last-modified if present. Index date as
-  // last-modified, or, if that's not present, use fetch time.
-  private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
-      CrawlDatum datum) {
-    long time = -1;
-
-    String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
-    if (lastModified != null) { // try parse last-modified
-      time = getTime(lastModified, url); // use as time
-                                         // store as string
-      doc.add("lastModified", new Date(time));
-    }
-
-    if (time == -1) { // if no last-modified specified in HTTP header
-      time = datum.getModifiedTime(); // use value in CrawlDatum
-      if (time <= 0) { // if also unset
-        time = datum.getFetchTime(); // use time the fetch took place (fetchTime
-                                     // of fetchDatum)
-      }
-    }
-
-    // un-stored, indexed and un-tokenized
-    doc.add("date", new Date(time));
-    return doc;
-  }
-
-  private long getTime(String date, String url) {
-    long time = -1;
-    try {
-      time = HttpDateFormat.toLong(date);
-    } catch (ParseException e) {
-      // try to parse it as date in alternative format
-      try {
-        Date parsedDate = DateUtils.parseDate(date, new String[] {
-            "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
-            "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
-            "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
-            "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
-            "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
-            "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
-            "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
-            "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
-            "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
-            "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
-            "yyyy-MM-dd'T'HH:mm:ss'Z'" });
-        time = parsedDate.getTime();
-        // if (LOG.isWarnEnabled()) {
-        // LOG.warn(url + ": parsed date: " + date +" to:"+time);
-        // }
-      } catch (Exception e2) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn(url + ": can't parse erroneous date: " + date);
-        }
-      }
-    }
-    return time;
-  }
-
-  // Add Content-Length
-  private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
-    String contentLength = data.getMeta(Response.CONTENT_LENGTH);
-
-    if (contentLength != null) {
-      // NUTCH-1010 ContentLength not trimmed
-      String trimmed = contentLength.toString().trim();
-      if (!trimmed.isEmpty())
-        doc.add("contentLength", trimmed);
-    }
-    return doc;
-  }
-
-  /**
-   * <p>
-   * Add Content-Type and its primaryType and subType add contentType,
-   * primaryType and subType to field "type" as un-stored, indexed and
-   * un-tokenized, so that search results can be confined by contentType or its
-   * primaryType or its subType.
-   * </p>
-   * <p>
-   * For example, if contentType is application/vnd.ms-powerpoint, search can be
-   * done with one of the following qualifiers
-   * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
-   * all case insensitive. The query filter is implemented in
-   * {@link TypeQueryFilter}.
-   * </p>
-   * 
-   * @param doc
-   * @param data
-   * @param url
-   * @return
-   */
-  private NutchDocument addType(NutchDocument doc, ParseData data, String url,
-      CrawlDatum datum) {
-    String mimeType = null;
-    String contentType = null;
-
-    Writable tcontentType = datum.getMetaData().get(
-        new Text(Response.CONTENT_TYPE));
-    if (tcontentType != null) {
-      contentType = tcontentType.toString();
-    } else
-      contentType = data.getMeta(Response.CONTENT_TYPE);
-    if (contentType == null) {
-      // Note by Jerome Charron on 20050415:
-      // Content Type not solved by a previous plugin
-      // Or unable to solve it... Trying to find it
-      // Should be better to use the doc content too
-      // (using MimeTypes.getMimeType(byte[], String), but I don't know
-      // which field it is?
-      // if (MAGIC) {
-      // contentType = MIME.getMimeType(url, content);
-      // } else {
-      // contentType = MIME.getMimeType(url);
-      // }
-
-      mimeType = tika.detect(url);
-    } else {
-      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
-    }
-
-    // Checks if we solved the content-type.
-    if (mimeType == null) {
-      return doc;
-    }
-
-    // Check if we have to map mime types
-    if (mapMimes) {
-      // Check if the current mime is mapped
-      if (mimeMap.containsKey(mimeType)) {
-        // It's mapped, let's replace it
-        mimeType = mimeMap.get(mimeType);
-      }
-    }
-
-    contentType = mimeType;
-    doc.add("type", contentType);
-
-    // Check if we need to split the content type in sub parts
-    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
-      String[] parts = getParts(contentType);
-
-      for (String part : parts) {
-        doc.add("type", part);
-      }
-    }
-
-    // leave this for future improvement
-    // MimeTypeParameterList parameterList = mimeType.getParameters()
-
-    return doc;
-  }
-
-  /**
-   * Utility method for splitting mime type into type and subtype.
-   * 
-   * @param mimeType
-   * @return
-   */
-  static String[] getParts(String mimeType) {
-    return mimeType.split("/");
-  }
-
-  // Reset title if we see non-standard HTTP header "Content-Disposition".
-  // It's a good indication that content provider wants filename therein
-  // be used as the title of this url.
-
-  // Patterns used to extract filename from possible non-standard
-  // HTTP header "Content-Disposition". Typically it looks like:
-  // Content-Disposition: inline; filename="foo.ppt"
-  private Configuration conf;
-
-  static Pattern patterns[] = { null, null };
-
-  static {
-    try {
-      // order here is important
-      patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
-      patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
-    } catch (PatternSyntaxException e) {
-      // just ignore
-    }
-  }
-
-  private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
-    String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
-    if (contentDisposition == null || doc.getFieldValue("title") != null)
-      return doc;
-
-    for (int i = 0; i < patterns.length; i++) {
-      Matcher matcher = patterns[i].matcher(contentDisposition);
-      if (matcher.find()) {
-        doc.add("title", matcher.group(1));
-        break;
-      }
-    }
-
-    return doc;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    MIME = new MimeUtil(conf);
-
-    if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false) == true) {
-      mapMimes = true;
-
-      // Load the mapping
-      try {
-        readConfiguration();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-      }
-    }
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  private void readConfiguration() throws IOException {
-    LOG.info("Reading content type mappings from file contenttype-mapping.txt");
-    BufferedReader reader = new BufferedReader(
-        conf.getConfResourceAsReader("contenttype-mapping.txt"));
-    String line;
-    String parts[];
-    boolean formatWarningShown = false;
-
-    mimeMap = new HashMap<String, String>();
-
-    while ((line = reader.readLine()) != null) {
-      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line.trim();
-        parts = line.split("\t");
-
-        // Must be at least two parts
-        if (parts.length > 1) {
-          for (int i = 1; i < parts.length; i++) {
-            mimeMap.put(parts[i].trim(), parts[0].trim());
-          }
-        } else {
-          LOG.warn("Wrong format of line: {}", line);
-          if (!formatWarningShown) {
-            LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]");
-            formatWarningShown = true;
-          }
-        }
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html
deleted file mode 100644
index 7b8fade..0000000
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html
+++ /dev/null
@@ -1,6 +0,0 @@
-<html>
-<body>
-<p>A more indexing plugin, adds "more" index fields:
-last modified date, MIME type, content length.</p><p></p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
deleted file mode 100644
index f918dde..0000000
--- a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.more;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestMoreIndexingFilter {
-
-  @Test
-  public void testContentType() throws IndexingException {
-    Configuration conf = NutchConfiguration.create();
-    assertContentType(conf, "text/html", "text/html");
-    assertContentType(conf, "text/html; charset=UTF-8", "text/html");
-  }
-
-  @Test
-  public void testGetParts() {
-    String[] parts = MoreIndexingFilter.getParts("text/html");
-    assertParts(parts, 2, "text", "html");
-  }
-
-  /**
-   * @since NUTCH-901
-   */
-  @Test
-  public void testNoParts() {
-    Configuration conf = NutchConfiguration.create();
-    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
-    MoreIndexingFilter filter = new MoreIndexingFilter();
-    filter.setConf(conf);
-    Assert.assertNotNull(filter);
-    NutchDocument doc = new NutchDocument();
-    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
-
-    try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
-          new CrawlDatum(), new Inlinks());
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.getMessage());
-    }
-    Assert.assertNotNull(doc);
-    Assert.assertTrue(doc.getFieldNames().contains("type"));
-    Assert.assertEquals(1, doc.getField("type").getValues().size());
-    Assert.assertEquals("text/html", doc.getFieldValue("type"));
-  }
-
-  @Test
-  public void testContentDispositionTitle() throws IndexingException {
-    Configuration conf = NutchConfiguration.create();
-
-    Metadata metadata = new Metadata();
-    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
-    MoreIndexingFilter filter = new MoreIndexingFilter();
-    filter.setConf(conf);
-
-    Text url = new Text("http://www.example.com/");
-    ParseImpl parseImpl = new ParseImpl("text", new ParseData(
-        new ParseStatus(), "title", new Outlink[0], metadata));
-
-    NutchDocument doc = new NutchDocument();
-    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals("content-disposition not detected", "filename.ext",
-        doc.getFieldValue("title"));
-
-    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
-    doc = new NutchDocument();
-    doc.add("title", "title");
-    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
-    Assert.assertEquals("do not add second title by content-disposition",
-        "title", doc.getFieldValue("title"));
-  }
-
-  private void assertParts(String[] parts, int count, String... expected) {
-    Assert.assertEquals(count, parts.length);
-    for (int i = 0; i < expected.length; i++) {
-      Assert.assertEquals(expected[i], parts[i]);
-    }
-  }
-
-  private void assertContentType(Configuration conf, String source,
-      String expected) throws IndexingException {
-    Metadata metadata = new Metadata();
-    metadata.add(Response.CONTENT_TYPE, source);
-    MoreIndexingFilter filter = new MoreIndexingFilter();
-    filter.setConf(conf);
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
-        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
-            metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
-        new Inlinks());
-    Assert.assertEquals("mime type not detected", expected,
-        doc.getFieldValue("type"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/README.txt b/src/plugin/index-replace/README.txt
deleted file mode 100644
index 4c866a7..0000000
--- a/src/plugin/index-replace/README.txt
+++ /dev/null
@@ -1,95 +0,0 @@
-IndexReplace plugin
-
-Allows indexing-time regexp replace manipulation of metadata fields.
-
-Configuration Example
-    <property>
-      <name>index.replace.regexp</name>
-      <value>
-        id=/file\:/http\:my.site.com/
-        url=/file\:/http\:my.site.com/2
-      </value>
-    </property
-
-Property format: index.replace.regexp
-    The format of the property is a list of regexp replacements, one line per field being
-    modified.  Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
-
-    The fieldname precedes the equal sign.  The first character after the equal sign signifies
-    the delimiter for the regexp, the replacement value and the flags.
-
-Replacement Sequence
-    The replacements will happen in the order listed. If a field needs multiple replacement operations
-    they may be listed more than once.
-
-RegExp Format
-    The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
-    here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
-    Patterns are compiled when the plugin is initialized for efficiency.
-
-Replacement Format
-    The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
-    http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
-
-Flags
-    The flags is an integer sum of the flag values defined in
-    http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
-
-Creating New Fields
-    If you express the fieldname as fldname1:fldname2=[replacement], then the replacer will create a new field
-    from the source field.  The source field remains unmodified.  This is an alternative to solrindex-mapping
-    which is only able to copy fields verbatim.
-
-Multi-valued Fields
-    If a field has multiple values, the replacement will be applied to each value in turn.
-
-Non-string Datatypes
-    Replacement is possible only on String field datatypes.  If the field you name in the property is
-    not a String datatype, it will be silently ignored.
-
-Host and URL specific replacements.
-    If the replacements should apply only to specific pages, then add a sequence like
-
-    hostmatch=hostmatchpattern
-    fld1=/regexp/replace/flags
-    fld2=/regexp/replace/flags
-
-    or
-    urlmatch=urlmatchpattern
-    fld1=/regexp/replace/flags
-    fld2=/regexp/replace/flags
-
-When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
-will apply to all parsed pages.  Replacements following a hostmatch or urlmatch will be applied
-to pages which match the host or url field (up to the next hostmatch or urlmatch line).  hostmatch
-and urlmatch patterns must be unique in this property.
-
-Plugin order
-    In most cases you will want this plugin to run last.
-
-Testing your match patterns
-    Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
-    can help get the basics of your pattern working.
-    To test in nutch: 
-        Prepare a test HTML file with the field contents you want to test. 
-        Place this in a directory accessible to nutch.
-        Use the file:/// syntax to list the test file(s) in a test/urls seed list.
-        See the nutch faq "index my local file system" for conf settings you will need.
-        (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
-        test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
-        patterns also match the file: URL pattern)
- 
-    Run..
-        bin/nutch inject crawl/crawldb test
-        bin/nutch generate crawl/crawldb crawl/segments
-        bin/nutch fetch crawl/segments/[segment]
-        bin/nutch parse crawl/segments/[segment]
-        bin/nutch invertlinks crawl/linkdb -dir crawl/segments
-        ...index your document, for example with SOLR...
-        bin/nutch solrindex http://localhost:8983/solr crawl/crawldb/ -linkdb crawl/linkdb/ crawl/segement[segment] -filter -normalize
-
-    Inspect hadoop.log for info about pattern parsing and compilation..
-        grep replace logs/hadoop.log
-
-    To inspect your index with the solr admin panel...
-        http://localhost:8983/solr/#/

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/build.xml b/src/plugin/index-replace/build.xml
deleted file mode 100644
index ea8c95d..0000000
--- a/src/plugin/index-replace/build.xml
+++ /dev/null
@@ -1,55 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-replace" default="jar-core">
-
-       <import file="../build-plugin.xml" />
-
-       <!-- Add compilation dependencies to classpath -->
-       <path id="plugin.deps">
-               <fileset dir="${nutch.root}/build">
-                       <include name="**/index-basic/*.jar" />
-                       <include name="**/index-metadata/*.jar" />
-               </fileset>
-               <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
-       </path>
-
-       <!-- Compile Unit test dependencies -->
-       <target name="deps-test-compile">
-               <ant target="compile-test" inheritall="false" dir="../index-basic"/>
-               <ant target="compile-test" inheritall="false" dir="../index-metadata"/>
-       </target>
-
-       <!-- Deploy Unit test dependencies -->
-       <target name="deps-test">
-               <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
-               <ant target="deploy" inheritall="false" dir="../protocol-file" />
-               <ant target="deploy" inheritall="false" dir="../parse-html" />
-               <ant target="deploy" inheritall="false" dir="../parse-metatags" />
-               <ant target="deploy" inheritall="false" dir="../index-basic" />
-               <ant target="deploy" inheritall="false" dir="../index-metadata" />
-       </target>
-
-       <!-- Copy test file for junit test -->
-       <mkdir dir="${build.test}/data" />
-       <copy todir="${build.test}/data">
-               <fileset dir="sample">
-                       <include name="*.html" />
-               </fileset>
-       </copy>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/ivy.xml b/src/plugin/index-replace/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/index-replace/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/plugin.xml b/src/plugin/index-replace/plugin.xml
deleted file mode 100644
index 3cffe60..0000000
--- a/src/plugin/index-replace/plugin.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<plugin
-   id="index-replace"
-   name="Replace Indexer"
-   version="1.0"
-   provider-name="PeterCiuffetti">
-
-   <runtime>
-      <library name="index-replace.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <extension id="org.apache.nutch.indexer.replace"
-              name="Replace Indexer"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="ReplaceIndexer"
-                      class="org.apache.nutch.indexer.replace.ReplaceIndexer"/>
-   </extension>
-
-</plugin>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/sample/testIndexReplace.html
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/sample/testIndexReplace.html b/src/plugin/index-replace/sample/testIndexReplace.html
deleted file mode 100644
index 0b90fc2..0000000
--- a/src/plugin/index-replace/sample/testIndexReplace.html
+++ /dev/null
@@ -1,12 +0,0 @@
-<html>
-  <head>
-    <title>Testing the power of the index-replace plugin</title>
-    <meta name="description" content="With this plugin, I control the description! Bwuhuhuhaha!">
-    <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!">
-    <meta name="author" content="Peter Ciuffetti">
-  </head>
-  <body>
-    <p>This html file is used to test the Nutch index-replace regexp replacer plugin.
-    A decidedly boring thing to do.</p>
-  </body>
-</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
deleted file mode 100644
index ddfe24d..0000000
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
+++ /dev/null
@@ -1,196 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.replace;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.regex.PatternSyntaxException;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-/**
- * POJO to store a filename, its match pattern and its replacement string.
- *
- * A checkAndReplace method is provided where you can simultaneously check if
- * the field matches this replacer and if the pattern matches your field value.
- *
- * @author Peter Ciuffetti
- */
-public class FieldReplacer {
-
-  private static final Log LOG = LogFactory.getLog(FieldReplacer.class
-      .getName());
-
-  private final String fieldName;
-  private final String toFieldName;
-  private final Pattern pattern;
-  private final String replacement;
-  private boolean isValid;
-
-  /**
-   * Create a FieldReplacer for a field.
-   *
-   * Any pattern exceptions are caught within this constructor and the object is
-   * marked inValid. The error will be logged. This prevents this caller from
-   * attempting invalid replacements.
-   *
-   * @param fieldName
-   *          the name of the source field to operate on. Required.
-   * @param toFieldName
-   *          the name of the target field. Required.
-   * @param pattern
-   *          the pattern the field must match. Required.
-   * @param replacement
-   *          the replacement string
-   * @param flags
-   *          the Pattern flags value, or null if no flags are needed
-   */
-  public FieldReplacer(String fieldName, String toFieldName, String pattern,
-      String replacement, Integer flags) {
-
-    this.isValid = true;
-    // Must have a non-empty field name and pattern.
-    if (fieldName == null || fieldName.trim().length() == 0) {
-      LOG.error("Empty fieldName provided, FieldReplacer marked invalid.");
-      this.isValid = false;
-    }
-    if (pattern == null || pattern.trim().length() == 0) {
-      LOG.error("Empty pattern for field " + fieldName
-          + "provided, FieldReplacer marked invalid.");
-      this.isValid = false;
-    }
-
-    if (replacement == null) {
-      this.replacement = "";
-    } else {
-      this.replacement = replacement;
-    }
-
-    this.fieldName = fieldName.trim();
-    this.toFieldName = toFieldName.trim();
-
-    if (this.isValid) {
-      LOG.info("Compiling pattern " + pattern + " for field " + fieldName);
-      Pattern myPattern = null;
-      try {
-        if (flags != null) {
-          myPattern = Pattern.compile(pattern, flags);
-        } else {
-          myPattern = Pattern.compile(pattern);
-        }
-      } catch (PatternSyntaxException e) {
-        LOG.error("Pattern " + pattern + " for field " + fieldName
-            + " failed to compile: " + e.toString());
-        this.isValid = false;
-      }
-      this.pattern = myPattern;
-    } else {
-      this.pattern = null;
-    }
-  }
-
-  /**
-   * Field replacer with the input and output field the same.
-   *
-   * @param fieldName
-   * @param pattern
-   * @param replacement
-   * @param flags
-   */
-  public FieldReplacer(String fieldName, String pattern, String replacement,
-      Integer flags) {
-    this(fieldName, fieldName, pattern, replacement, flags);
-  }
-
-  public String getFieldName() {
-    return this.fieldName;
-  }
-
-  public String getToFieldName() {
-    return this.toFieldName;
-  }
-
-  public Pattern getPattern() {
-    return this.pattern;
-  }
-
-  public String getReplacement() {
-    return this.replacement;
-  }
-
-  /**
-   * Does this FieldReplacer have a valid fieldname and pattern?
-   *
-   * @return
-   */
-  public boolean isValid() {
-    return this.isValid;
-  }
-
-  /**
-   * Return the replacement value for a field value.
-   *
-   * This does not check for a matching field; the caller must decide if this
-   * FieldReplacer should operate on this value by checking getFieldName().
-   *
-   * The method returns the value with the replacement. If the value returned is
-   * not different then eiher the pattern didn't match or the replacement was a
-   * no-op.
-   *
-   * @param value
-   * @return
-   */
-  public String replace(String value) {
-    if (this.isValid) {
-      return this.pattern.matcher(value).replaceAll(replacement);
-    } else {
-      return value;
-    }
-  }
-
-  /**
-   * Return a replacement value for a field.
-   *
-   * This is designed to fail fast and trigger a replacement only when
-   * necessary. If this method returns null, either the field does not match or
-   * the value does not match the pattern (or possibly the pattern is invalid).
-   *
-   * So only if the method returns a non-null value will you need to replace the
-   * value for the field.
-   *
-   * @param fieldName
-   *          the name of the field you are checking
-   * @param value
-   *          the value of the field you are checking
-   * @return a replacement value. If null, either the field does not match or
-   *         the value does not match.
-   */
-  public String checkAndReplace(String fieldName, String value) {
-    if (this.fieldName.equals(fieldName)) {
-      if (value != null && value.length() > 0) {
-        if (this.isValid) {
-          Matcher m = this.pattern.matcher(value);
-          if (m.find()) {
-            return m.replaceAll(this.replacement);
-          }
-        }
-      }
-    }
-    return null;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
deleted file mode 100644
index 7017603..0000000
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
+++ /dev/null
@@ -1,330 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.replace;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.regex.PatternSyntaxException;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchField;
-import org.apache.nutch.parse.Parse;
-
-/**
- * Do pattern replacements on selected field contents prior to indexing.
- * 
- * To use this plugin, add <code>index-replace</code> to your
- * <code>plugin.includes</code>. Example:
- * 
- * <pre>
- *   &lt;property>
- *    &lt;name>plugin.includes&lt;/name>
- *    &lt;value>protocol-(http)|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata|replace)|urlnormalizer-(pass|regex|basic)|indexer-solr&lt;/value>
- *   &lt;/property>
- * </pre>
- *
- * And then add the <code>index.replace.regexp</code> property to
- * <code>conf/nutch-site.xml</code>. This contains a list of replacement
- * instructions per field name, one per line. eg.
- * 
- * <pre>
- *   fieldname=/regexp/replacement/[flags]
- * </pre>
- * 
- * <pre>
- *   &lt;property>
- *    &lt;name>index.replace.regexp&lt;/name>
- *    &lt;value>
- *      hostmatch=.*\\.com
- *      title=/search/replace/2
- *    &lt;/value>
- *   &lt;/property>
- * </pre>
- * 
- * <code>hostmatch=</code> and <code>urlmatch=</code> lines indicate the match
- * pattern for a host or url. The field replacements that follow this line will
- * apply only to pages from the matching host or url. Replacements run in the
- * order specified. Field names may appear multiple times if multiple
- * replacements are needed.
- * 
- * The property format is defined in greater detail in
- * <code>conf/nutch-default.xml</code>.
- *
- * @author Peter Ciuffetti
- * @see <a
- *      href="https://issues.apache.org/jira/browse/NUTCH-2058">NUTCH-2058</a>
- */
-public class ReplaceIndexer implements IndexingFilter {
-
-  private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class
-      .getName());
-
-  /** Special field name signifying the start of a host-specific match set */
-  private static final String HOSTMATCH = "hostmatch";
-  /** Special field name signifying the start of a url-specific match set */
-  private static final String URLMATCH = "urlmatch";
-
-  private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_HOST = new LinkedHashMap<Pattern, List<FieldReplacer>>();
-  private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_URL = new LinkedHashMap<Pattern, List<FieldReplacer>>();
-
-  private static Pattern LINE_SPLIT = Pattern.compile("(^.+$)+",
-      Pattern.MULTILINE);
-  private static Pattern NAME_VALUE_SPLIT = Pattern.compile("(.*?)=(.*)");
-
-  private Configuration conf;
-
-  /**
-   * {@inheritDoc}
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    FIELDREPLACERS_BY_HOST.clear();
-    FIELDREPLACERS_BY_URL.clear();
-    String value = conf.get("index.replace.regexp", null);
-    if (value != null) {
-      LOG.debug("Parsing index.replace.regexp property");
-      this.parseConf(value);
-    }
-  }
-
-  /**
-   * {@inheritDoc}
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Parse the property value into a set of maps that store a list of
-   * replacements by field for each host and url configured into the property.
-   * 
-   * @param propertyValue
-   */
-  private void parseConf(String propertyValue) {
-    if (propertyValue == null || propertyValue.trim().length() == 0) {
-      return;
-    }
-
-    // At the start, all replacements apply globally to every host.
-    Pattern hostPattern = Pattern.compile(".*");
-    Pattern urlPattern = null;
-
-    // Split the property into lines
-    Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue);
-    while (lineMatcher.find()) {
-      String line = lineMatcher.group();
-      if (line != null && line.length() > 0) {
-
-        // Split the line into field and value
-        Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim());
-        if (nameValueMatcher.find()) {
-          String fieldName = nameValueMatcher.group(1).trim();
-          String value = nameValueMatcher.group(2);
-          if (fieldName != null && value != null) {
-            // Check if the field name is one of our special cases.
-            if (HOSTMATCH.equals(fieldName)) {
-              urlPattern = null;
-              try {
-                hostPattern = Pattern.compile(value);
-              } catch (PatternSyntaxException pse) {
-                LOG.error("hostmatch pattern " + value + " does not compile: "
-                    + pse.getMessage());
-                // Deactivate this invalid match set by making it match no host.
-                hostPattern = Pattern.compile("willnotmatchanyhost");
-              }
-            } else if (URLMATCH.equals(fieldName)) {
-              try {
-                urlPattern = Pattern.compile(value);
-              } catch (PatternSyntaxException pse) {
-                LOG.error("urlmatch pattern " + value + " does not compile: "
-                    + pse.getMessage());
-                // Deactivate this invalid match set by making it match no url.
-                urlPattern = Pattern.compile("willnotmatchanyurl");
-              }
-            } else if (value.length() > 3) {
-              String toFieldName = fieldName;
-              // If the fieldname has a colon, this indicates a different target
-              // field.
-              if (fieldName.indexOf(':') > 0) {
-                toFieldName = fieldName.substring(fieldName.indexOf(':') + 1);
-                fieldName = fieldName.substring(0, fieldName.indexOf(':'));
-              }
-              String sep = value.substring(0, 1);
-
-              // Divide the value into pattern / replacement / flags.
-              value = value.substring(1);
-              if (!value.contains(sep)) {
-                LOG.error("Pattern '" + line
-                    + "', not parseable.  Missing separator " + sep);
-                continue;
-              }
-              String pattern = value.substring(0, value.indexOf(sep));
-              value = value.substring(pattern.length() + 1);
-              String replacement = value;
-              if (value.contains(sep)) {
-                replacement = value.substring(0, value.indexOf(sep));
-              }
-              int flags = 0;
-              if (value.length() > replacement.length() + 1) {
-                value = value.substring(replacement.length() + 1).trim();
-                try {
-                  flags = Integer.parseInt(value);
-                } catch (NumberFormatException e) {
-                  LOG.error("Pattern " + line + ", has invalid flags component");
-                  continue;
-                }
-              }
-              Integer iFlags = (flags > 0) ? new Integer(flags) : null;
-
-              // Make a FieldReplacer out of these params.
-              FieldReplacer fr = new FieldReplacer(fieldName, toFieldName,
-                  pattern, replacement, iFlags);
-
-              // Add this field replacer to the list for this host or URL.
-              if (urlPattern != null) {
-                List<FieldReplacer> lfp = FIELDREPLACERS_BY_URL.get(urlPattern);
-                if (lfp == null) {
-                  lfp = new ArrayList<FieldReplacer>();
-                }
-                lfp.add(fr);
-                FIELDREPLACERS_BY_URL.put(urlPattern, lfp);
-              } else {
-                List<FieldReplacer> lfp = FIELDREPLACERS_BY_HOST
-                    .get(hostPattern);
-                if (lfp == null) {
-                  lfp = new ArrayList<FieldReplacer>();
-                }
-                lfp.add(fr);
-                FIELDREPLACERS_BY_HOST.put(hostPattern, lfp);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  /**
-   * {@inheritDoc}
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    if (doc != null) {
-      if (FIELDREPLACERS_BY_HOST.size() > 0) {
-        this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
-      }
-
-      if (FIELDREPLACERS_BY_URL.size() > 0) {
-        this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
-      }
-    }
-
-    return doc;
-  }
-
-  /**
-   * Iterates through the replacement map provided, to update the fields in the
-   * Nutch Document.
-   * 
-   * @param doc
-   *          the document we are modifying
-   * @param keyName
-   *          either "host" or "url" -- the field that determines the
-   *          replacement set used
-   * @param replaceMap
-   *          the list of FieldReplacers that applies to this keyName.
-   */
-  private void doReplace(NutchDocument doc, String keyName,
-      Map<Pattern, List<FieldReplacer>> replaceMap) {
-
-    if (doc == null || replaceMap.size() == 0) {
-      return;
-    }
-
-    Collection<String> docFieldNames = doc.getFieldNames();
-    NutchField keyField = doc.getField(keyName);
-    if (keyField == null) {
-      // This document doesn't have the key field; no work to do.
-      return;
-    }
-
-    List<Object> keyFieldValues = keyField.getValues();
-    if (keyFieldValues.size() == 0) {
-      // This document doesn't have any values for the key field; no work to do.
-      return;
-    }
-
-    // For every value of the keyField (one expected)
-    for (Object oKeyFieldValue : keyFieldValues) {
-      if (oKeyFieldValue != null && oKeyFieldValue instanceof java.lang.String) {
-        String keyFieldValue = (String) oKeyFieldValue;
-
-        // For each pattern that we have a replacement list for...
-        for (Map.Entry<Pattern, List<FieldReplacer>> entries : replaceMap
-            .entrySet()) {
-          // If this key is a match for a replacement set...
-          if (entries.getKey().matcher(keyFieldValue).find()) {
-
-            // For each field we will replace for this key...
-            for (FieldReplacer fp : entries.getValue()) {
-              String fieldName = fp.getFieldName();
-
-              // Does this document contain the FieldReplacer's field?
-              if (docFieldNames.contains(fieldName)) {
-                NutchField docField = doc.getField(fieldName);
-                List<Object> fieldValues = docField.getValues();
-                ArrayList<String> newFieldValues = new ArrayList<String>();
-
-                // For each value of the field, match against our
-                // replacer...
-                for (Object oFieldValue : fieldValues) {
-                  if (oFieldValue != null
-                      && oFieldValue instanceof java.lang.String) {
-                    String fieldValue = (String) oFieldValue;
-                    String newValue = fp.replace(fieldValue);
-                    newFieldValues.add(newValue);
-                  }
-                }
-
-                // Remove the target field and add our replaced values.
-                String targetFieldName = fp.getToFieldName();
-                doc.removeField(targetFieldName);
-                for (String newFieldValue : newFieldValues) {
-                  doc.add(targetFieldName, newFieldValue);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java
deleted file mode 100644
index 28c24a4..0000000
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Indexing filter to allow pattern replacements on metadata.
- */
-package org.apache.nutch.indexer.replace;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java b/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
deleted file mode 100644
index ca90ca3..0000000
--- a/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
+++ /dev/null
@@ -1,456 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.replace;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.basic.BasicIndexingFilter;
-import org.apache.nutch.indexer.metadata.MetadataIndexer;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * JUnit tests for the <code>index-replace</code> plugin.
- * 
- * In these tests, the sample file has some meta tags added to the Nutch
- * document by the <code>index-metadata</code> plugin. The
- * <code>index-replace</code> plugin is then used to either change (or not
- * change) the fields depending on the various values of
- * <code>index.replace.regexp</code> property being provided to Nutch.
- * 
- * 
- * @author Peter Ciuffetti
- *
- */
-public class TestIndexReplace {
-
-  private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp";
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-  private String sampleFile = "testIndexReplace.html";
-
-  /**
-   * Run a test file through the Nutch parser and index filters.
-   * 
-   * @param fileName
-   * @param conf
-   * @return the Nutch document with the replace indexer applied
-   */
-  public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
-    NutchDocument doc = new NutchDocument();
-
-    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
-    basicIndexer.setConf(conf);
-    Assert.assertNotNull(basicIndexer);
-
-    MetadataIndexer metaIndexer = new MetadataIndexer();
-    metaIndexer.setConf(conf);
-    Assert.assertNotNull(basicIndexer);
-
-    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
-    replaceIndexer.setConf(conf);
-    Assert.assertNotNull(replaceIndexer);
-
-    try {
-      String urlString = "file:" + sampleDir + fileSeparator + fileName;
-      Text text = new Text(urlString);
-      CrawlDatum crawlDatum = new CrawlDatum();
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(text, crawlDatum)
-          .getContent();
-      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      crawlDatum.setFetchTime(100L);
-
-      Inlinks inlinks = new Inlinks();
-      doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
-      doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
-      doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.toString());
-    }
-
-    return doc;
-  }
-
-  /**
-   * Test property parsing.
-   * 
-   * The filter does not expose details of the parse. So all we are checking is
-   * that the parse does not throw a runtime exception and that the value
-   * provided is the value returned.
-   */
-  @Test
-  public void testPropertyParse() {
-    Configuration conf = NutchConfiguration.create();
-    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this awesome plugin/2\n"
-        + "  metatag.keywords=/\\,/\\!/\n"
-        + "  hostmatch=.*.com\n"
-        + "  metatag.keywords=/\\,/\\?/\n"
-        + "  metatag.author:dc_author=/\\s+/ David /\n"
-        + "  urlmatch=.*.html\n"
-        + "  metatag.keywords=/\\,/\\./\n" + "  metatag.author=/\\s+/ D. /\n";
-
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-
-    ReplaceIndexer rp = new ReplaceIndexer();
-    try {
-      rp.setConf(conf);
-    } catch (RuntimeException ohno) {
-      Assert.fail("Unable to parse a valid index.replace.regexp property! "
-          + ohno.getMessage());
-    }
-
-    Configuration parsedConf = rp.getConf();
-
-    // Does the getter equal the setter? Too easy!
-    Assert.assertEquals(indexReplaceProperty,
-        parsedConf.get(INDEX_REPLACE_PROPERTY));
-  }
-
-  /**
-   * Test metatag value replacement using global replacement settings.
-   * 
-   * The index.replace.regexp property does not use hostmatch or urlmatch, so
-   * all patterns are global.
-   */
-  @Test
-  public void testGlobalReplacement() {
-    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
-    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
-    String expectedAuthor = "Peter D. Ciuffetti";
-    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
-        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-    Assert
-        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
-    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
-  }
-
-  /**
-   * Test that invalid property settings are handled and ignored.
-   * 
-   * This test provides an invalid property setting that will fail property
-   * parsing and Pattern.compile. The expected outcome is that the patterns will
-   * not cause failure and the targeted fields will not be modified by the
-   * filter.
-   */
-  @Test
-  public void testInvalidPatterns() {
-    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
-    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
-    String expectedAuthor = "Peter Ciuffetti";
-    // Contains: invalid pattern, invalid flags, incomplete property
-    String indexReplaceProperty = "  metatag.description=/this\\s+**plugin/this awesome plugin/\n"
-        + "  metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Assert that our metatags have not changed.
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-    Assert
-        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
-    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
-
-  }
-
-  /**
-   * Test URL pattern matching
-   */
-  @Test
-  public void testUrlMatchesPattern() {
-    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
-    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
-    String expectedAuthor = "Peter D. Ciuffetti";
-    String indexReplaceProperty = " urlmatch=.*.html\n"
-        + "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
-        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Assert that our metatags have changed.
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-    Assert
-        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
-    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
-
-  }
-
-  /**
-   * Test URL pattern not matching.
-   * 
-   * Expected result is that the filter does not change the fields.
-   */
-  @Test
-  public void testUrlNotMatchesPattern() {
-    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
-    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
-    String expectedAuthor = "Peter Ciuffetti";
-    String indexReplaceProperty = " urlmatch=.*.xml\n"
-        + "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
-        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Assert that our metatags have not changed.
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-    Assert
-        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
-    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
-
-  }
-
-  /**
-   * Test a global pattern match for description and URL pattern match for
-   * keywords and author.
-   * 
-   * All three should be triggered. It also tests replacement groups.
-   */
-  @Test
-  public void testGlobalAndUrlMatchesPattern() {
-    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
-    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
-    String expectedAuthor = "Peter D. Ciuffetti";
-    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
-        + "  urlmatch=.*.html\n"
-        + "  metatag.keywords=/\\,/\\!/\n"
-        + "  metatag.author=/\\s+/ D. /\n";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Assert that our metatags have changed.
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-    Assert
-        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
-    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
-
-  }
-
-  /**
-   * Test a global pattern match for description and URL pattern match for
-   * keywords and author.
-   * 
-   * Only the global match should be triggered.
-   */
-  @Test
-  public void testGlobalAndUrlNotMatchesPattern() {
-    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
-    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
-    String expectedAuthor = "Peter Ciuffetti";
-    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
-        + "  urlmatch=.*.xml\n"
-        + "  metatag.keywords=/\\,/\\!/\n"
-        + "  metatag.author=/\\s+/ D. /\n";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Assert that description has changed and the others have not changed.
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-    Assert
-        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
-    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
-  }
-
-  /**
-   * Test order-specific replacement settings.
-   * 
-   * This makes multiple replacements on the same field and will produce the
-   * expected value only if the replacements are run in the order specified.
-   */
-  @Test
-  public void testReplacementsRunInSpecifedOrder() {
-    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
-    String indexReplaceProperty = "  metatag.description=/this plugin/this amazing plugin/\n"
-        + "  metatag.description=/this amazing plugin/this valuable plugin/\n"
-        + "  metatag.description=/this valuable plugin/this cool plugin/\n"
-        + "  metatag.description=/this cool plugin/this wicked plugin/\n"
-        + "  metatag.description=/this wicked plugin/this awesome plugin/\n";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Check that the value produced by the last replacement has worked.
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-  }
-
-  /**
-   * Test a replacement pattern that uses the flags feature.
-   * 
-   * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match
-   * any case.
-   */
-  @Test
-  public void testReplacementsWithFlags() {
-    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
-    String indexReplaceProperty = "  metatag.description=/THIS PLUGIN/this awesome plugin/2";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Check that the value produced by the case-insensitive replacement has
-    // worked.
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-  }
-
-  /**
-   * Test a replacement pattern that uses the target field feature.
-   * Check that the input is not modifid and that the taret field is added.
-   */
-  @Test
-  public void testReplacementsDifferentTarget() {
-    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
-    String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
-    String indexReplaceProperty = "  metatag.description:new=/this plugin/this awesome plugin/";
-
-    Configuration conf = NutchConfiguration.create();
-    conf.set(
-        "plugin.includes",
-        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
-    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
-    conf.set("metatags.names", "author,description,keywords");
-    conf.set("index.parse.md",
-        "metatag.author,metatag.description,metatag.keywords");
-    // Not necessary but helpful when debugging the filter.
-    conf.set("http.timeout", "99999999999");
-
-    // Run the document through the parser and index filters.
-    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
-
-    // Check that the input field has not been modified
-    Assert.assertEquals(expectedDescription,
-        doc.getFieldValue("metatag.description"));
-    // Check that the output field has created
-    Assert.assertEquals(expectedTargetDescription,
-        doc.getFieldValue("new"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-static/build.xml b/src/plugin/index-static/build.xml
deleted file mode 100644
index 0ec5665..0000000
--- a/src/plugin/index-static/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-static" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-static/ivy.xml b/src/plugin/index-static/ivy.xml
deleted file mode 100644
index 24d7606..0000000
--- a/src/plugin/index-static/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-static/plugin.xml b/src/plugin/index-static/plugin.xml
deleted file mode 100644
index 539e355..0000000
--- a/src/plugin/index-static/plugin.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="index-static"
-   name="Index Static"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-    <runtime>
-      <library name="index-static.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-
-   <extension id="org.apache.nutch.indexer.staticfield"
-              name="Nutch static field index"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="StaticField"
-                      class="org.apache.nutch.indexer.staticfield.StaticFieldIndexer"/>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
deleted file mode 100644
index 1a81041..0000000
--- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.staticfield;
-
-import java.util.HashMap;
-import java.util.Map.Entry;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * A simple plugin called at indexing that adds fields with static data. You can
- * specify a list of fieldname:fieldcontent per nutch job. It can be useful when
- * collections can't be created by urlpatterns, like in subcollection, but on a
- * job-basis.
- */
-
-public class StaticFieldIndexer implements IndexingFilter {
-  private Configuration conf;
-  private HashMap<String, String[]> fields;
-  private boolean addStaticFields = false;
-  private String fieldSep = ",";
-  private String kevSep = ":";
-  private String valueSep = " ";
-
-  /**
-   * The {@link StaticFieldIndexer} filter object which adds fields as per
-   * configuration setting. See {@code index.static} in nutch-default.xml.
-   * 
-   * @param doc
-   *          The {@link NutchDocument} object
-   * @param parse
-   *          The relevant {@link Parse} object passing through the filter
-   * @param url
-   *          URL to be filtered for anchor text
-   * @param datum
-   *          The {@link CrawlDatum} entry
-   * @param inlinks
-   *          The {@link Inlinks} containing anchor text
-   * @return filtered NutchDocument
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    if (this.addStaticFields == true) {
-      for (Entry<String, String[]> entry : this.fields.entrySet()) {
-        for (String val : entry.getValue()) {
-          doc.add(entry.getKey(), val);
-        }
-      }
-    }
-    return doc;
-  }
-
-  /**
-   * Populate a HashMap from a list of fieldname:fieldcontent. See
-   * {@index.static} in nutch-default.xml.
-   * 
-   * @param fieldsString
-   *          string containing field:value pairs
-   * @return HashMap of fields and their corresponding values
-   */
-  private HashMap<String, String[]> parseFields(String fieldsString) {
-    HashMap<String, String[]> fields = new HashMap<String, String[]>();
-
-    /*
-     * The format is very easy, it's a comma-separated list of fields in the
-     * form <name>:<value>
-     */
-    for (String field : fieldsString.split(this.fieldSep)) {
-      String[] entry = field.split(this.kevSep);
-      if (entry.length == 2)
-        fields.put(entry[0].trim(), entry[1].trim().split(this.valueSep));
-    }
-
-    return fields;
-  }
-
-  /**
-   * Set the {@link Configuration} object
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-
-    // NUTCH-2052: Allow user-defined delimiters in index.static
-    this.fieldSep = this.regexEscape(conf.get("index.static.fieldsep", ","));
-    this.kevSep = this.regexEscape(conf.get("index.static.keysep", ":"));
-    this.valueSep = this.regexEscape(conf.get("index.static.valuesep", " "));
-
-    String fieldsString = conf.get("index.static", null);
-    if (fieldsString != null) {
-      this.addStaticFields = true;
-      this.fields = parseFields(fieldsString);
-    }
-  }
-
-  /**
-   * Get the {@link Configuration} object
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Escapes any character that needs escaping so it can be used in a regexp.
-   */
-  protected String regexEscape(String in) {
-    String result = in;
-    if (in != null) {
-      StringBuffer sb = new StringBuffer();
-      for (int i = 0; i < in.length(); i++) {
-        CharSequence c = in.subSequence(i, i+1);
-        if ("<([{\\^-=$!|]})?*+.>".contains(c)) {
-          sb.append('\\');
-        }
-        sb.append(c);
-      }
-      result = sb.toString();
-    }
-    return result;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html
deleted file mode 100644
index f4b5146..0000000
--- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>A simple plugin called at indexing that adds fields with static data. You can specify a list of fieldname:fieldcontent per nutch job. It can be useful when collections can't be created by urlpatterns, like in subcollection, but on a job-basis.</p><p></p>
-</body>
-</html>

[20/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

Reply via email to