http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java new file mode 100644 index 0000000..5c4c990 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java @@ -0,0 +1,402 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.tika; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.util.NodeWalker; +import org.apache.nutch.util.URLUtil; +import org.apache.tika.sax.Link; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +/** + * A collection of methods for extracting content from DOM trees. 
+ * + * This class holds a few utility methods for pulling content out of DOM nodes, + * such as getOutlinks, getText, etc. + * + */ +public class DOMContentUtils { + + private static class LinkParams { + private String elName; + private String attrName; + private int childLen; + + private LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); + private HashSet<String> ignoredTags = new HashSet<String>(); + private Configuration conf; + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + ignoredTags.add(ignoreTags[i].toLowerCase()); + if (!forceTags.contains(ignoreTags[i])) + linkParams.remove(ignoreTags[i]); + } + } + + 
/** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append all the content text found beneath the DOM node to the + * <code>StringBuffer</code>. + * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted + * and the <code>StringBuffer</code> will not contain any text encountered + * after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + private boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + /** + * This is a convinience method, equivalent to + * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * + */ + public void getText(StringBuffer sb, Node node) { + getText(sb, node, false); + } + + // returns true if abortOnNestedAnchors is true and we find nested + // anchors + private boolean getTextHelper(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, int anchorDepth) { + boolean abort = false; + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("script".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if ("style".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { + anchorDepth++; + if (anchorDepth > 1) { + abort = true; + break; + } + } + if (nodeType == Node.COMMENT_NODE) { + walker.skipChildren(); + } + if (nodeType == Node.TEXT_NODE) { + // cleanup and trim the value + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + text = text.trim(); + if (text.length() > 0) { + if (sb.length() > 0) + sb.append(' '); + sb.append(text); + } + } + } + + return abort; + } + + /** + * This method takes a {@link StringBuffer} and 
a DOM {@link Node}, and will + * append the content text found beneath the first <code>title</code> node to + * the <code>StringBuffer</code>. + * + * @return true if a title node was found, false otherwise + */ + public boolean getTitle(StringBuffer sb, Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return false; + } + + if (nodeType == Node.ELEMENT_NODE) { + if ("title".equalsIgnoreCase(nodeName)) { + getText(sb, currentNode); + return true; + } + } + } + + return false; + } + + /** If Node contains a BASE tag then it's HREF is returned. */ + URL getBase(Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + // is this node a BASE tag? + if (nodeType == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return null; + } + + if ("base".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + if ("href".equalsIgnoreCase(attr.getNodeName())) { + try { + return new URL(attr.getNodeValue()); + } catch (MalformedURLException e) { + } + } + } + } + } + } + + // no. + return null; + } + + private boolean hasOnlyWhiteSpace(Node node) { + String val = node.getNodeValue(); + for (int i = 0; i < val.length(); i++) { + if (!Character.isWhitespace(val.charAt(i))) + return false; + } + return true; + } + + // this only covers a few cases of empty links that are symptomatic + // of nekohtml's DOM-fixup process... 
+ private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { + if (childLen == 0) { + // this has no inner structure + if (params.childLen == 0) + return false; + else + return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // single nested link + return true; + + } else if (childLen == 2) { + + Node c0 = children.item(0); + Node c1 = children.item(1); + + if ((c0.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c0.getNodeName())) + && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { + // single link followed by whitespace node + return true; + } + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { + // whitespace node followed by single link + return true; + } + + } else if (childLen == 3) { + Node c0 = children.item(0); + Node c1 = children.item(1); + Node c2 = children.item(2); + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2)) { + // single link surrounded by whitespace nodes + return true; + } + } + + return false; + } + + /** + * This method finds all anchors below the supplied DOM <code>node</code>, and + * creates appropriate {@link Outlink} records for each (relative to the + * supplied <code>base</code> URL), and adds them to the <code>outlinks</code> + * {@link ArrayList}. + * + * <p> + * + * Links without inner structure (tags, text, etc) are discarded, as are links + * which contain only single nested links and empty text nodes (this is a + * common DOM-fixup artifact, at least with nekohtml). 
+ */ + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { + + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + NodeList children = currentNode.getChildNodes(); + int childLen = (children != null) ? children.getLength() : 0; + + if (nodeType == Node.ELEMENT_NODE) { + + nodeName = nodeName.toLowerCase(); + LinkParams params = (LinkParams) linkParams.get(nodeName); + if (params != null) { + if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { + + StringBuffer linkText = new StringBuffer(); + getText(linkText, currentNode, true); + + NamedNodeMap attrs = currentNode.getAttributes(); + String target = null; + boolean noFollow = false; + boolean post = false; + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); + } else if ("rel".equalsIgnoreCase(attrName) + && "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } else if ("method".equalsIgnoreCase(attrName) + && "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; + } + } + if (target != null && !noFollow && !post) + try { + + URL url = URLUtil.resolveURL(base, target); + outlinks.add(new Outlink(url.toString(), linkText.toString() + .trim())); + } catch (MalformedURLException e) { + // don't care + } + } + // this should not have any children, skip them + if (params.childLen == 0) + continue; + } + } + } + } + + // This one is used by NUTCH-1918 + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) { + String target = null; + String anchor = null; + boolean noFollow = false; + + for (Link link : tikaExtractedOutlinks) { + target = link.getUri(); + noFollow = 
(link.getRel().toLowerCase().equals("nofollow")) ? true : false; + anchor = link.getText(); + + if (!ignoredTags.contains(link.getType())) { + if (target != null && !noFollow) { + try { + URL url = URLUtil.resolveURL(base, target); + + // clean the anchor + anchor = anchor.replaceAll("\\s+", " "); + anchor = anchor.trim(); + + outlinks.add(new Outlink(url.toString(), anchor)); + } catch (MalformedURLException e) { + // don't care + } + } + } + } + } +} \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java new file mode 100644 index 0000000..294bde9 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java @@ -0,0 +1,214 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.tika; + +import java.net.URL; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.w3c.dom.*; + +/** + * Class for parsing META Directives from DOM trees. This class handles + * specifically Robots META directives (all, none, nofollow, noindex), finding + * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are + * stored in a HTMLMetaTags instance. 
+ */ +public class HTMLMetaProcessor { + + /** + * Utility class with indicators for the robots directives "noindex" and + * "nofollow", and HTTP-EQUIV/no-cache + */ + + /** + * Sets the indicators in <code>robotsMeta</code> to appropriate values, based + * on any META tags found under the given <code>node</code>. + */ + public static final void getMetaTags(HTMLMetaTags metaTags, Node node, + URL currURL) { + + metaTags.reset(); + getMetaTagsHelper(metaTags, node, currURL); + } + + private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, + URL currURL) { + + if (node.getNodeType() == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(node.getNodeName())) { + // META tags should not be under body + return; + } + + if ("meta".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node nameNode = null; + Node equivNode = null; + Node contentNode = null; + // Retrieves name, http-equiv and content attribues + for (int i = 0; i < attrs.getLength(); i++) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName().toLowerCase(); + if (attrName.equals("name")) { + nameNode = attr; + } else if (attrName.equals("http-equiv")) { + equivNode = attr; + } else if (attrName.equals("content")) { + contentNode = attr; + } + } + + if (nameNode != null) { + if (contentNode != null) { + String name = nameNode.getNodeValue().toLowerCase(); + metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); + if ("robots".equals(name)) { + + if (contentNode != null) { + String directives = contentNode.getNodeValue().toLowerCase(); + int index = directives.indexOf("none"); + + if (index >= 0) { + metaTags.setNoIndex(); + metaTags.setNoFollow(); + } + + index = directives.indexOf("all"); + if (index >= 0) { + // do nothing... 
+ } + + index = directives.indexOf("noindex"); + if (index >= 0) { + metaTags.setNoIndex(); + } + + index = directives.indexOf("nofollow"); + if (index >= 0) { + metaTags.setNoFollow(); + } + + index = directives.indexOf("noarchive"); + if (index >= 0) { + metaTags.setNoCache(); + } + } + + } // end if (name == robots) + } + } + + if (equivNode != null) { + if (contentNode != null) { + String name = equivNode.getNodeValue().toLowerCase(); + String content = contentNode.getNodeValue(); + metaTags.getHttpEquivTags().setProperty(name, content); + if ("pragma".equals(name)) { + content = content.toLowerCase(); + int index = content.indexOf("no-cache"); + if (index >= 0) + metaTags.setNoCache(); + } else if ("refresh".equals(name)) { + int idx = content.indexOf(';'); + String time = null; + if (idx == -1) { // just the refresh time + time = content; + } else + time = content.substring(0, idx); + try { + metaTags.setRefreshTime(Integer.parseInt(time)); + // skip this if we couldn't parse the time + metaTags.setRefresh(true); + } catch (Exception e) { + ; + } + URL refreshUrl = null; + if (metaTags.getRefresh() && idx != -1) { // set the URL + idx = content.toLowerCase().indexOf("url="); + if (idx == -1) { // assume a mis-formatted entry with just the + // url + idx = content.indexOf(';') + 1; + } else + idx += 4; + if (idx != -1) { + String url = content.substring(idx); + try { + refreshUrl = new URL(url); + } catch (Exception e) { + // XXX according to the spec, this has to be an absolute + // XXX url. However, many websites use relative URLs and + // XXX expect browsers to handle that. + // XXX Unfortunately, in some cases this may create a + // XXX infinitely recursive paths (a crawler trap)... + // if (!url.startsWith("/")) url = "/" + url; + try { + refreshUrl = new URL(currURL, url); + } catch (Exception e1) { + refreshUrl = null; + } + } + } + } + if (metaTags.getRefresh()) { + if (refreshUrl == null) { + // apparently only refresh time was present. 
set the URL + // to the same URL. + refreshUrl = currURL; + } + metaTags.setRefreshHref(refreshUrl); + } + } + } + } + + } else if ("base".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node hrefNode = attrs.getNamedItem("href"); + + if (hrefNode != null) { + String urlString = hrefNode.getNodeValue(); + + URL url = null; + try { + if (currURL == null) + url = new URL(urlString); + else + url = new URL(currURL, urlString); + } catch (Exception e) { + ; + } + + if (url != null) + metaTags.setBaseHref(url); + } + + } + + } + + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + getMetaTagsHelper(metaTags, children.item(i), currURL); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java new file mode 100644 index 0000000..5d7eca9 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.tika; + +import java.io.ByteArrayInputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.html.dom.HTMLDocumentImpl; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilters; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.protocol.Content; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.html.BoilerpipeContentHandler; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.HtmlMapper; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.sax.Link; +import org.apache.tika.sax.LinkContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; +import org.xml.sax.ContentHandler; + +/** + * Wrapper for Tika parsers. 
Mimics the HTMLParser but using the XHTML + * representation returned by Tika as SAX events + ***/ + +public class TikaParser implements org.apache.nutch.parse.Parser { + + public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class); + + private Configuration conf; + private TikaConfig tikaConfig = null; + private DOMContentUtils utils; + private HtmlParseFilters htmlParseFilters; + private String cachingPolicy; + private HtmlMapper HTMLMapper; + private boolean upperCaseElementNames = true; + + @SuppressWarnings("deprecation") + public ParseResult getParse(Content content) { + String mimeType = content.getContentType(); + + boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe"); + String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); + + URL base; + try { + base = new URL(content.getBaseUrl()); + } catch (MalformedURLException e) { + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); + } + + // get the right parser using the mime type as a clue + Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); + byte[] raw = content.getContent(); + + if (parser == null) { + String message = "Can't retrieve Tika parser for mime-type " + mimeType; + LOG.error(message); + return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult( + content.getUrl(), getConf()); + } + + LOG.debug("Using Tika parser " + parser.getClass().getName() + + " for mime-type " + mimeType); + + Metadata tikamd = new Metadata(); + + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + doc.setErrorChecking(false); + DocumentFragment root = doc.createDocumentFragment(); + + ContentHandler domHandler; + + // Check whether to use Tika's BoilerplateContentHandler + if (useBoilerpipe) { + BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), + BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); + 
bpHandler.setIncludeMarkup(true); + domHandler = (ContentHandler)bpHandler; + } else { + DOMBuilder domBuilder = new DOMBuilder(doc, root); + domBuilder.setUpperCaseElementNames(upperCaseElementNames); + domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML); + domHandler = (ContentHandler)domBuilder; + } + + LinkContentHandler linkContentHandler = new LinkContentHandler(); + + ParseContext context = new ParseContext(); + TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler); + + if (HTMLMapper != null) + context.set(HtmlMapper.class, HTMLMapper); + tikamd.set(Metadata.CONTENT_TYPE, mimeType); + try { + parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context); + } catch (Exception e) { + LOG.error("Error parsing " + content.getUrl(), e); + return new ParseStatus(ParseStatus.FAILED, e.getMessage()) + .getEmptyParseResult(content.getUrl(), getConf()); + } + + HTMLMetaTags metaTags = new HTMLMetaTags(); + String text = ""; + String title = ""; + Outlink[] outlinks = new Outlink[0]; + org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata(); + + // we have converted the sax events generated by Tika into a DOM object + // so we can now use the usual HTML resources from Nutch + // get meta directives + HTMLMetaProcessor.getMetaTags(metaTags, root, base); + if (LOG.isTraceEnabled()) { + LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); + } + + // check meta directives + if (!metaTags.getNoIndex()) { // okay to index + StringBuffer sb = new StringBuffer(); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting text..."); + } + utils.getText(sb, root); // extract text + text = sb.toString(); + sb.setLength(0); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting title..."); + } + utils.getTitle(sb, root); // extract title + title = sb.toString().trim(); + } + + if (!metaTags.getNoFollow()) { // okay to follow links + ArrayList<Outlink> l = new 
ArrayList<Outlink>(); // extract outlinks + URL baseTag = utils.getBase(root); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting links..."); + } + + // pre-1233 outlink extraction + //utils.getOutlinks(baseTag != null ? baseTag : base, l, root); + // Get outlinks from Tika + List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); + utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks); + outlinks = l.toArray(new Outlink[l.size()]); + if (LOG.isTraceEnabled()) { + LOG.trace("found " + outlinks.length + " outlinks in " + + content.getUrl()); + } + } + + // populate Nutch metadata with Tika metadata + String[] TikaMDNames = tikamd.names(); + for (String tikaMDName : TikaMDNames) { + if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) + continue; + String[] values = tikamd.getValues(tikaMDName); + for (String v : values) + nutchMetadata.add(tikaMDName, v); + } + + // no outlinks? try OutlinkExtractor e.g works for mime types where no + // explicit markup for anchors + + if (outlinks.length == 0) { + outlinks = OutlinkExtractor.getOutlinks(text, getConf()); + } + + ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); + if (metaTags.getRefresh()) { + status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); + status.setArgs(new String[] { metaTags.getRefreshHref().toString(), + Integer.toString(metaTags.getRefreshTime()) }); + } + ParseData parseData = new ParseData(status, title, outlinks, + content.getMetadata(), nutchMetadata); + ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), + new ParseImpl(text, parseData)); + + // run filters on parse + ParseResult filteredParse = this.htmlParseFilters.filter(content, + parseResult, metaTags, root); + if (metaTags.getNoCache()) { // not okay to cache + for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) + entry.getValue().getData().getParseMeta() + .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); + } + return filteredParse; + } + + public void 
setConf(Configuration conf) { + this.conf = conf; + this.tikaConfig = null; + + // do we want a custom Tika configuration file + // deprecated since Tika 0.7 which is based on + // a service provider based configuration + String customConfFile = conf.get("tika.config.file"); + if (customConfFile != null) { + try { + // see if a Tika config file can be found in the job file + URL customTikaConfig = conf.getResource(customConfFile); + if (customTikaConfig != null) + tikaConfig = new TikaConfig(customTikaConfig); + } catch (Exception e1) { + String message = "Problem loading custom Tika configuration from " + + customConfFile; + LOG.error(message, e1); + } + } else { + try { + tikaConfig = new TikaConfig(this.getClass().getClassLoader()); + } catch (Exception e2) { + String message = "Problem loading default Tika configuration"; + LOG.error(message, e2); + } + } + + // use a custom htmlmapper + String htmlmapperClassName = conf.get("tika.htmlmapper.classname"); + if (StringUtils.isNotBlank(htmlmapperClassName)) { + try { + Class HTMLMapperClass = Class.forName(htmlmapperClassName); + boolean interfaceOK = HtmlMapper.class + .isAssignableFrom(HTMLMapperClass); + if (!interfaceOK) { + throw new RuntimeException("Class " + htmlmapperClassName + + " does not implement HtmlMapper"); + } + HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance(); + } catch (Exception e) { + LOG.error("Can't generate instance for class " + htmlmapperClassName); + throw new RuntimeException("Can't generate instance for class " + + htmlmapperClassName); + } + } + + this.htmlParseFilters = new HtmlParseFilters(getConf()); + this.utils = new DOMContentUtils(conf); + this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", + Nutch.CACHING_FORBIDDEN_CONTENT); + this.upperCaseElementNames = getConf().getBoolean( + "tika.uppercase.element.names", true); + } + + public Configuration getConf() { + return this.conf; + } + +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java new file mode 100644 index 0000000..d625c33 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java @@ -0,0 +1,112 @@ +/* + * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer, + * XXX in order to avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $ + */ +package org.apache.nutch.parse.tika; + +/** + * Class used to verify whether the specified <var>ch</var> conforms to the XML + * 1.0 definition of whitespace. 
/**
 * Utility for testing characters against the XML 1.0 definition of
 * whitespace (copied from Xalan-J to avoid a dependency on Xalan).
 */
class XMLCharacterRecognizer {

  /**
   * Tests a single character against the XML 1.0 whitespace production
   * <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"><CODE>S</CODE></A>
   * (space, tab, carriage return, line feed).
   *
   * @param ch
   *          Character to check as XML whitespace.
   * @return true if <var>ch</var> is XML whitespace, false otherwise.
   */
  static boolean isWhiteSpace(char ch) {
    switch (ch) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      return true;
    default:
      return false;
    }
  }

  /**
   * Tests whether a slice of a character array consists entirely of XML
   * whitespace.
   *
   * @param ch
   *          Character array to check as XML whitespace.
   * @param start
   *          Start index of characters in the array.
   * @param length
   *          Number of characters to examine.
   * @return true if every examined character is XML whitespace (an empty
   *         slice counts as whitespace), false otherwise.
   */
  static boolean isWhiteSpace(char ch[], int start, int length) {
    for (int i = start, end = start + length; i < end; i++) {
      if (!isWhiteSpace(ch[i])) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests whether a StringBuffer consists entirely of XML whitespace.
   *
   * @param buf
   *          StringBuffer to check as XML whitespace.
   * @return true if every character in the buffer is XML whitespace (an
   *         empty buffer counts as whitespace), false otherwise.
   */
  static boolean isWhiteSpace(StringBuffer buf) {
    for (int i = 0, n = buf.length(); i < n; i++) {
      if (!isWhiteSpace(buf.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests whether a String consists entirely of XML whitespace.
   *
   * @param s
   *          String to check as XML whitespace.
   * @return true if every character in the string is XML whitespace; a null
   *         or empty string is treated as whitespace (original Xalan
   *         behavior), false otherwise.
   */
  static boolean isWhiteSpace(String s) {
    if (s == null) {
      return true;
    }
    for (int i = 0, n = s.length(); i < n; i++) {
      if (!isWhiteSpace(s.charAt(i))) {
        return false;
      }
    }
    return true;
  }

}
+ */ +package org.apache.nutch.parse.tika; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java new file mode 100644 index 0000000..96029a6 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java @@ -0,0 +1,337 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.tika.DOMContentUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.ByteArrayInputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Unit tests for DOMContentUtils. + */ +public class TestDOMContentUtils { + + private static final String[] testPages = { + + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + "</body></html>"), + + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" + + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" + + "</body></html>"), + + new String("<html><head><title> </title>" + "</head><body> " + + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" + + "</a></a>" + "</body></html>"), + + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... <li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" + + "</body></html>"), + + // test frameset link extraction. The invalid frame in the middle + // will be + // fixed to a third standalone frame. 
+ new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" + + "</frame>" + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" + + "</frame>" + "<frame src=\"right.html\">" + "</frame>" + + "</frameset>" + "</frameset>" + "</body></html>"), + + // test <area> and <iframe> link extraction + url normalization + new String( + "<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), + + // test whitespace processing for plain text extraction + new String( + "<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." 
+ "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; + + private static int SKIP = 9; + + private static String[] testBaseHrefs = { "http://www.nutch.org", + "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", "http://www.nutch.org/", + "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/;something" }; + + private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; + + private static URL[] testBaseHrefURLs = new 
URL[testPages.length]; + + private static final String[] answerText = { + "title body anchor", + "title body home bots", + "separate this from this", + "my title body home 1 2", + "my title", + "my title the bottom", + "my title Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5" }; + + private static final String[] answerTitle = { "title", "title", "", + "my title", "my title", "my title", "my title", "", "", "", "title", + "title" }; + + // note: should be in page-order + private static Outlink[][] answerOutlinks; + + private static Configuration conf; + private static DOMContentUtils utils = null; + + @Before + public void setup() throws Exception { + conf = NutchConfiguration.create(); + conf.setBoolean("parser.html.form.use_action", true); + utils = new DOMContentUtils(conf); + DOMFragmentParser parser = new DOMFragmentParser(); + parser.setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + for (int i = 0; i < testPages.length; i++) { + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + try { + parser.parse( + new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), + node); + testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); + } catch (Exception e) { + Assert.assertTrue("caught exception: " + e, false); + } + testDOMs[i] = node; + } + answerOutlinks = new Outlink[][] { + { new Outlink("http://www.nutch.org", "anchor"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, + { new Outlink("http://www.nutch.org/", 
"separate this"), + new Outlink("http://www.nutch.org/docs/ok", "from this"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new Outlink("http://www.nutch.org/docs/2", "2"), }, + { new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, + { new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", ""), }, + { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + {}, + { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, + {}, + { new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, + { + // this is tricky - see RFC3986 section 5.4.1 example 7 + new Outlink("http://www.nutch.org/g", "anchor1"), + new Outlink("http://www.nutch.org/g?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/;something?y=1;somethingelse", + "anchor5") } }; + + } + + private static boolean equalsIgnoreWhitespace(String s1, String s2) { + StringTokenizer st1 = new StringTokenizer(s1); + StringTokenizer st2 = new StringTokenizer(s2); + + while (st1.hasMoreTokens()) { + if (!st2.hasMoreTokens()) + return false; + if (!st1.nextToken().equals(st2.nextToken())) + return false; + } + if (st2.hasMoreTokens()) + return false; + return true; + } + + @Test + public void testGetText() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { 
+ StringBuffer sb = new StringBuffer(); + utils.getText(sb, testDOMs[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerText[i], text)); + } + } + + @Test + public void testGetTitle() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); + utils.getTitle(sb, testDOMs[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerTitle[i], text)); + } + } + + @Test + public void testGetOutlinks() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); + if (i == SKIP) { + conf.setBoolean("parser.html.form.use_action", false); + utils.setConf(conf); + } else { + conf.setBoolean("parser.html.form.use_action", true); + utils.setConf(conf); + } + utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); + Outlink[] outlinkArr = new Outlink[outlinks.size()]; + outlinkArr = outlinks.toArray(outlinkArr); + compareOutlinks(answerOutlinks[i], outlinkArr); + } + } + + private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { + for (int i = 0; i < o.length; i++) { + sb.append(o[i].toString()); + sb.append(System.getProperty("line.separator")); + } + } + + private static final String outlinksString(Outlink[] o) { + StringBuffer sb = new StringBuffer(); + appendOutlinks(sb, o); + return sb.toString(); + } + + private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { + if (o1.length != o2.length) { + Assert.assertTrue( + "got wrong number of outlinks (expecting " + o1.length + ", got " + + o2.length + ")" + 
System.getProperty("line.separator") + + "answer: " + System.getProperty("line.separator") + + outlinksString(o1) + System.getProperty("line.separator") + + "got: " + System.getProperty("line.separator") + + outlinksString(o2) + System.getProperty("line.separator"), + false); + } + + for (int i = 0; i < o1.length; i++) { + if (!o1[i].equals(o2[i])) { + Assert.assertTrue( + "got wrong outlinks at position " + i + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + "'" + o1[i].getToUrl() + + "', anchor: '" + o1[i].getAnchor() + "'" + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + "'" + o2[i].getToUrl() + + "', anchor: '" + o2[i].getAnchor() + "'", false); + } + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java new file mode 100644 index 0000000..c9394dc --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.tika.TikaParser; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +/** + * + * @author mattmann / jnioche + * + * Test Suite for the RSS feeds with the {@link TikaParser}. 
+ * + */ +public class TestFeedParser { + + private String fileSeparator = System.getProperty("file.separator"); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + private String[] sampleFiles = { "rsstest.rss" }; + + public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class + .getName()); + + /** + * <p> + * The test method: tests out the following 2 asserts: + * </p> + * + * <ul> + * <li>There are 3 outlinks read from the sample rss file</li> + * <li>The 3 outlinks read are in fact the correct outlinks from the sample + * file</li> + * </ul> + */ + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + // check that there are 2 outlinks: + // unlike the original parse-rss + // tika ignores the URL and description of the channel + + // http://test.channel.com + // http://www-scf.usc.edu/~mattmann/ + // http://www.nutch.org + + ParseData theParseData = parse.getData(); + + Outlink[] theOutlinks = theParseData.getOutlinks(); + + Assert.assertTrue("There aren't 2 outlinks read!", + theOutlinks.length == 2); + + // now check to make sure that those are the two outlinks + boolean hasLink1 = false, hasLink2 = false; + + for (int j = 0; j < theOutlinks.length; j++) { + if (theOutlinks[j].getToUrl().equals( + "http://www-scf.usc.edu/~mattmann/")) { + hasLink1 = true; + } + + if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) { + 
hasLink2 = true; + } + } + + if (!hasLink1 || !hasLink2) { + Assert.fail("Outlinks read from sample rss file are not correct!"); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java new file mode 100644 index 0000000..b1762e6 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; + +/** + * Test extraction of image metadata + */ +public class TestImageMetadata { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + private String[] sampleFiles = { "nutch_logo_tm.gif", }; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + Configuration conf = NutchConfiguration.create(); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + Assert.assertEquals("121", parse.getData().getMeta("width")); + Assert.assertEquals("48", parse.getData().getMeta("height")); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java new file mode 100644 index 0000000..576b3df --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; + +/** + * Unit tests for MSWordParser. 
+ * + * @author John Xing + */ +public class TestMSWordParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-msword/build.xml during plugin compilation. + // Check ./src/plugin/parse-msword/sample/README.txt for what they are. + private String[] sampleFiles = { "word97.doc" }; + + private String expectedText = "This is a sample doc file prepared for nutch."; + + private Configuration conf; + + @Before + public void setUp() { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + } + + public String getTextContent(String fileName) throws ProtocolException, + ParseException { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + return parse.getText(); + } + + @Test + public void testIt() throws ProtocolException, ParseException { + for (int i = 0; i < sampleFiles.length; i++) { + String found = getTextContent(sampleFiles[i]); + Assert.assertTrue("text found : '" + found + "'", + found.startsWith(expectedText)); + } + } + + @Test + public void testOpeningDocs() throws ProtocolException, ParseException { + String[] filenames = new File(sampleDir).list(); + for (int i = 0; i < filenames.length; i++) { + if (filenames[i].endsWith(".doc") == false) + continue; + Assert.assertTrue("cann't read content of " + filenames[i], + getTextContent(filenames[i]).length() > 0); + } + } +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java new file mode 100644 index 0000000..6960bad --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import java.io.FileInputStream; +import java.io.InputStreamReader; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.protocol.*; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for OOParser. 
+ * + * @author Andrzej Bialecki + */ +public class TestOOParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-oo/build.xml during plugin compilation. + private String[] sampleFiles = { "ootest.odt", "ootest.sxw" }; + + private String expectedText; + + private String sampleText = "ootest.txt"; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Content content; + Parse parse; + Configuration conf = NutchConfiguration.create(); + Protocol protocol; + ProtocolFactory factory = new ProtocolFactory(conf); + + System.out.println("Expected : " + expectedText); + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + if (sampleFiles[i].startsWith("ootest") == false) + continue; + + protocol = factory.getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); + + // simply test for the presence of a text - the ordering of the elements + // may differ from what was expected + // in the previous tests + Assert.assertTrue(text != null && text.length() > 0); + + System.out.println("Found " + sampleFiles[i] + ": " + text); + } + } + + public TestOOParser() { + try { + // read the test string + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + + sampleText); + StringBuffer sb = new StringBuffer(); + int len = 0; + InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); + char[] buf = new char[1024]; + while ((len = isr.read(buf)) > 0) { + sb.append(buf, 0, len); + 
} + isr.close(); + expectedText = sb.toString(); + // normalize space + expectedText = expectedText.replaceAll("[ \t\r\n]+", " "); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java new file mode 100644 index 0000000..9884f0c --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for PdfParser. + * + * @author John Xing + */ +public class TestPdfParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-pdf/build.xml during plugin compilation. + // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. 
+ private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; + + private String expectedText = "A VERY SMALL PDF FILE"; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + Configuration conf = NutchConfiguration.create(); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + int index = parse.getText().indexOf(expectedText); + Assert.assertTrue(index > 0); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java new file mode 100644 index 0000000..f15d821 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.tika; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). + * + * @author Andy Hedges + */ +public class TestRTFParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-rtf/build.xml during plugin compilation. + // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. + private String rtfFile = "test.rtf"; + + @Ignore("There seems to be an issue with line 71 e.g. 
text.trim()") + @Test + public void testIt() throws ProtocolException, ParseException { + + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + urlString = "file:" + sampleDir + fileSeparator + rtfFile; + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) + .getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get( + content.getUrl()); + String text = parse.getText(); + Assert.assertEquals("The quick brown fox jumps over the lazy dog", + text.trim()); + + String title = parse.getData().getTitle(); + Metadata meta = parse.getData().getParseMeta(); + + Assert.assertEquals("test rft document", title); + Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT)); + + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java new file mode 100644 index 0000000..4224f93 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.tika.HTMLMetaProcessor; + +import java.io.ByteArrayInputStream; +import java.net.URL; + +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for HTMLMetaProcessor. */ +public class TestRobotsMetaProcessor { + + /* + * + * some sample tags: + * + * <meta name="robots" content="index,follow"> <meta name="robots" + * content="noindex,follow"> <meta name="robots" content="index,nofollow"> + * <meta name="robots" content="noindex,nofollow"> + * + * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> + */ + + public static String[] tests = { + "<html><head><title>test page</title>" + + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " + + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"all\"> " + + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " + + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test 
page</title>" + + "<meta name=\"robots\" content=\"noindex,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,follow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,follow\"> " + + "<base href=\"http://www.nutch.org/\">" + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + "<meta name=\"robots\"> " + + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" + + " some text" + "</body></html>", + + }; + + public static final boolean[][] answers = { { true, true, true }, // NONE + { false, false, true }, // all + { true, true, true }, // nOnE + { true, true, false }, // none + { true, true, false }, // noindex,nofollow + { true, false, false }, // noindex,follow + { false, true, false }, // index,nofollow + { false, false, false }, // index,follow + { false, false, false }, // missing! 
+ }; + + private URL[][] currURLsAndAnswers; + + @Test + public void testRobotsMetaProcessor() { + DOMFragmentParser parser = new DOMFragmentParser(); + ; + + try { + currURLsAndAnswers = new URL[][] { + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org/foo/"), + new URL("http://www.nutch.org/") }, + { new URL("http://www.nutch.org"), + new URL("http://www.nutch.org/base/") } }; + } catch (Exception e) { + Assert.assertTrue("couldn't make test URLs!", false); + } + + for (int i = 0; i < tests.length; i++) { + byte[] bytes = tests[i].getBytes(); + + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + + try { + parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node); + } catch (Exception e) { + e.printStackTrace(); + } + + HTMLMetaTags robotsMeta = new HTMLMetaTags(); + HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); + + Assert.assertTrue("got index wrong on test " + i, + robotsMeta.getNoIndex() == answers[i][0]); + Assert.assertTrue("got follow wrong on test " + i, + robotsMeta.getNoFollow() == answers[i][1]); + Assert.assertTrue("got cache wrong on test " + i, + robotsMeta.getNoCache() == answers[i][2]); + Assert + .assertTrue( + "got base href wrong on test " + i + " (got " + + robotsMeta.getBaseHref() + ")", + ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) + || ((robotsMeta.getBaseHref() != null) && robotsMeta + .getBaseHref().equals(currURLsAndAnswers[i][1]))); + + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf new file mode 100644 index 0000000..383cebb Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf differ
