[12/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

thammegowda Sat, 16 Jul 2016 12:48:37 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/build.xml b/nutch-plugins/parse-swf/build.xml
new file mode 100644
index 0000000..f4fb20f
--- /dev/null
+++ b/nutch-plugins/parse-swf/build.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-swf" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy the plugins the unit test depends on -->
+  <target name="deps-test">
+    <ant dir="../nutch-extensionpoints" target="deploy" inheritall="false"/>
+    <ant dir="../protocol-file" target="deploy" inheritall="false"/>
+  </target>
+
+
+  <!-- Stage the sample SWF files and their companion .txt files for the
+       JUnit test. These tasks sit outside any target, so Ant runs them each
+       time this build file is evaluated. -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <filelist dir="sample"
+              files="test1.swf,test2.swf,test3.swf,test1.txt,test2.txt,test3.txt"/>
+  </copy>
+
+</project>


http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/ivy.xml b/nutch-plugins/parse-swf/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-swf/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <!-- NOTE(review): presumably resolved relative to this file; confirm the ../ depth under nutch-plugins/ -->
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- the published artifact takes its name from the module -->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies/>
+
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
new file mode 100644
index 0000000..4138a66
--- /dev/null
+++ b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
@@ -0,0 +1,33 @@
+
+  Copyright (c) 2001-2005, David N. Main, All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or
+  without modification, are permitted provided that the 
+  following conditions are met:
+ 
+  1. Redistributions of source code must retain the above 
+  copyright notice, this list of conditions and the following 
+  disclaimer. 
+  
+  2. Redistributions in binary form must reproduce the above 
+  copyright notice, this list of conditions and the following 
+  disclaimer in the documentation and/or other materials 
+  provided with the distribution.
+  
+  3. The name of the author may not be used to endorse or 
+  promote products derived from this software without specific 
+  prior written permission. 
+  
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY 
+  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
+  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+  AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 
+  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
+  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf.jar
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/lib/javaswf.jar b/nutch-plugins/parse-swf/lib/javaswf.jar
new file mode 100644
index 0000000..78f9b0b
Binary files /dev/null and b/nutch-plugins/parse-swf/lib/javaswf.jar differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/plugin.xml b/nutch-plugins/parse-swf/plugin.xml
new file mode 100644
index 0000000..8cc72c0
--- /dev/null
+++ b/nutch-plugins/parse-swf/plugin.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin id="parse-swf"
+        name="SWF Parse Plug-in"
+        version="1.0.0"
+        provider-name="nutch.org">
+
+   <!-- Libraries listed for the plugin: its own jar (exporting everything)
+        and the JavaSWF jar shipped in lib/. -->
+   <runtime>
+      <library name="parse-swf.jar">
+         <export name="*"/>
+      </library>
+      <library name="javaswf.jar"/>
+   </runtime>
+
+   <!-- Registers SWFParser at the Parser extension point. -->
+   <extension id="org.apache.nutch.parse.swf"
+              name="SWFParse"
+              point="org.apache.nutch.parse.Parser">
+      <implementation id="org.apache.nutch.parse.swf.SWFParser"
+                      class="org.apache.nutch.parse.swf.SWFParser">
+         <!-- content type and path suffix declared for this parser -->
+         <parameter name="contentType" value="application/x-shockwave-flash"/>
+         <parameter name="pathSuffix" value="swf"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/pom.xml b/nutch-plugins/parse-swf/pom.xml
new file mode 100644
index 0000000..743511e
--- /dev/null
+++ b/nutch-plugins/parse-swf/pom.xml
@@ -0,0 +1,46 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-swf</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-swf</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <!-- NOTE(review): presumably supplies the com.anotherbigidea (JavaSWF) classes used by SWFParser - confirm -->
+        <dependency>
+            <groupId>com.google.gwt</groupId>
+            <artifactId>gwt-incubator</artifactId>
+            <version>2.0.1</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
new file mode 100644
index 0000000..9251366
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
@@ -0,0 +1,685 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.swf;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.anotherbigidea.flash.interfaces.*;
+import com.anotherbigidea.flash.readers.*;
+import com.anotherbigidea.flash.structs.*;
+import com.anotherbigidea.flash.writers.SWFActionBlockImpl;
+import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
+import com.anotherbigidea.io.InStream;
+
+/**
+ * Parser for Flash SWF files. Loosely based on the sample in JavaSWF
+ * distribution.
+ */
+public class SWFParser implements Parser {
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.swf");
+
+  private Configuration conf = null;
+
+  public SWFParser() {
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Extracts the text and outlinks of a SWF document.
+   *
+   * @param content
+   *          fetched content; when its metadata carries a Content-Length value
+   *          that value must equal the number of bytes actually present
+   * @return a successful parse holding the extracted text and outlinks, or an
+   *         empty failed parse when the content is truncated or any exception
+   *         is thrown while reading it
+   */
+  public ParseResult getParse(Content content) {
+
+    String text = null;
+    ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+
+    try {
+
+      byte[] raw = content.getContent();
+
+      // refuse content whose size disagrees with the recorded Content-Length
+      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
+      if (contentLength != null
+          && raw.length != Integer.parseInt(contentLength)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+                + " bytes. Parser can't handle incomplete files.")
+            .getEmptyParseResult(content.getUrl(), getConf());
+      }
+      ExtractText extractor = new ExtractText();
+
+      // TagParser implements SWFTags and drives a SWFTagTypes interface
+      TagParser parser = new TagParser(extractor);
+      // use this instead to debug the file
+      // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
+
+      // SWFReader reads an input file and drives a SWFTags interface
+      SWFReader reader = new SWFReader(parser, new InStream(raw));
+
+      // read the input SWF file and pass it through the interface pipeline
+      reader.readFile();
+      text = extractor.getText();
+      String atext = extractor.getActionText();
+      if (atext != null && atext.length() > 0)
+        text += "\n--------\n" + atext;
+      // harvest potential outlinks: first the URLs collected by the
+      // extractor, then whatever OutlinkExtractor finds in the text
+      for (String link : extractor.getUrls()) {
+        outlinks.add(new Outlink(link, ""));
+      }
+      Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
+      if (olinks != null) {
+        outlinks.addAll(Arrays.asList(olinks));
+      }
+    } catch (Exception e) { // run time exception
+      LOG.error("Error, runtime exception: ", e);
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as SWF document. " + e).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+    if (text == null)
+      text = "";
+
+    Outlink[] links = outlinks.toArray(new Outlink[outlinks.size()]);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
+        content.getMetadata());
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+        parseData));
+  }
+
+  /**
+   * Parses one SWF file and prints the resulting text and parse data.
+   * Arguments are: 0. Name of input SWF file.
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 1) {
+      System.err.println("Usage: SWFParser <file.swf>");
+      return;
+    }
+
+    // Read until end of stream: available() is only an estimate and a single
+    // read() is allowed to return fewer bytes than requested.
+    byte[] buf;
+    FileInputStream in = new FileInputStream(args[0]);
+    try {
+      java.io.ByteArrayOutputStream bytes = new java.io.ByteArrayOutputStream();
+      byte[] chunk = new byte[8192];
+      for (int n = in.read(chunk); n != -1; n = in.read(chunk)) {
+        bytes.write(chunk, 0, n);
+      }
+      buf = bytes.toByteArray();
+    } finally {
+      in.close(); // released even when reading fails
+    }
+
+    String url = "file:" + args[0];
+    SWFParser parser = new SWFParser();
+    ParseResult parseResult = parser.getParse(new Content(url, url, buf,
+        "application/x-shockwave-flash", new Metadata(),
+        NutchConfiguration.create()));
+    Parse p = parseResult.get(url);
+    System.out.println("Parse Text:");
+    System.out.println(p.getText());
+    System.out.println("Parse Data:");
+    System.out.println(p.getData());
+  }
+}
+
+/**
+ * Collects the text of a Flash movie: the text in Text symbols, the initial
+ * text in Edit Fields, and the strings and URLs gathered from action blocks.
+ * Nothing is printed; results are read back through getText(),
+ * getActionText() and getUrls().
+ *
+ * SWFParser.getParse sets up the "pipeline":
+ *
+ * SWFReader-->TagParser-->ExtractText
+ *
+ * SWFReader reads the input SWF file and separates out the header and the tags.
+ * The separated contents are passed to TagParser which parses out the
+ * individual tag types and passes them to ExtractText.
+ *
+ * ExtractText extends SWFTagTypesImpl and overrides some methods.
+ */
+class ExtractText extends SWFTagTypesImpl {
+  /**
+   * Store font info keyed by the font symbol id. Each entry is an int[] of
+   * character codes for the corresponding font glyphs (An empty array denotes a
+   * System Font).
+   */
+  protected HashMap<Integer, int[]> fontCodes = new HashMap<Integer, int[]>();
+
+  /** Text fragments, in the order the tags delivered them. */
+  public ArrayList<String> strings = new ArrayList<String>();
+
+  /** Strings handed to the NutchSWFActions instances created below. */
+  public HashSet<String> actionStrings = new HashSet<String>();
+
+  /** URLs handed to the NutchSWFActions instances created below. */
+  public ArrayList<String> urls = new ArrayList<String>();
+
+  public ExtractText() {
+    // NOTE(review): null presumably means "no downstream SWFTagTypes" - confirm
+    super(null);
+  }
+
+  /** Returns all collected text fragments joined by single spaces. */
+  public String getText() {
+    StringBuilder res = new StringBuilder();
+    for (String fragment : strings) {
+      if (res.length() > 0)
+        res.append(' ');
+      res.append(fragment);
+    }
+    return res.toString();
+  }
+
+  /** Returns the action strings, sorted and joined by newlines. */
+  public String getActionText() {
+    String[] sorted = actionStrings.toArray(new String[actionStrings.size()]);
+    Arrays.sort(sorted);
+    StringBuilder res = new StringBuilder();
+    for (int i = 0; i < sorted.length; i++) {
+      if (i > 0)
+        res.append('\n');
+      res.append(sorted[i]);
+    }
+    return res.toString();
+  }
+
+  /** Returns the collected URLs in insertion order. */
+  public String[] getUrls() {
+    return urls.toArray(new String[urls.size()]);
+  }
+
+  /** Delegates to tagDefineFontInfo, ignoring the extra trailing argument. */
+  public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
+      int arg4) throws IOException {
+    tagDefineFontInfo(arg0, arg1, arg2, arg3);
+  }
+
+  /**
+   * SWFTagTypes interface Save the Text Font character code info
+   */
+  public void tagDefineFontInfo(int fontId, String fontName, int flags,
+      int[] codes) throws IOException {
+    // System.out.println("-defineFontInfo id=" + fontId + ", name=" +
+    // fontName);
+    fontCodes.put(Integer.valueOf(fontId), codes);
+  }
+
+  // XXX too much hassle for too little return ... we cannot guess character
+  // XXX codes anyway, so we just give up.
+  /*
+   * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
+   * return null; }
+   */
+
+  /**
+   * SWFTagTypes interface. Save the character code info.
+   */
+  public SWFVectors tagDefineFont2(int id, int flags, String name,
+      int numGlyphs, int ascent, int descent, int leading, int[] codes,
+      int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
+      int[] kernAdjustments) throws IOException {
+    // System.out.println("-defineFontInfo id=" + id + ", name=" + name);
+    // a missing code table is stored as an empty array (System Font)
+    fontCodes.put(Integer.valueOf(id), (codes != null) ? codes : new int[0]);
+
+    return null;
+  }
+
+  /**
+   * SWFTagTypes interface. Dump any initial text in the field.
+   */
+  public void tagDefineTextField(int fieldId, String fieldName,
+      String initialText, Rect boundary, int flags, AlphaColor textColor,
+      int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
+      int rightMargin, int indentation, int lineSpacing) throws IOException {
+    if (initialText != null) {
+      strings.add(initialText);
+    }
+  }
+
+  /**
+   * SWFTagTypes interface
+   */
+  public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
+      throws IOException {
+    lastBounds = curBounds;
+    curBounds = bounds;
+    return new TextDumper();
+  }
+
+  // bounds of the previous and the current Text symbol
+  Rect lastBounds = null;
+  Rect curBounds = null;
+
+  /**
+   * SWFTagTypes interface
+   */
+  public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
+      throws IOException {
+    lastBounds = curBounds;
+    curBounds = bounds;
+    return new TextDumper();
+  }
+
+  /** Receives the records of one Text symbol and appends them to strings. */
+  public class TextDumper implements SWFText {
+    protected Integer fontId;
+
+    protected boolean firstY = true;
+
+    public void font(int fontId, int textHeight) {
+      this.fontId = Integer.valueOf(fontId);
+    }
+
+    public void setY(int y) {
+      if (firstY)
+        firstY = false;
+      else
+        strings.add("\n"); // Change in Y - dump a new line
+    }
+
+    /*
+     * There are some issues with this method: sometimes SWF files define their
+     * own font, so short of OCR we cannot guess what is the glyph code ->
+     * character mapping. Additionally, some files don't use literal space
+     * character, instead they adjust glyphAdvances. We don't handle it at all -
+     * in such cases the text will be all glued together.
+     */
+    public void text(int[] glyphIndices, int[] glyphAdvances) {
+      // System.out.println("-text id=" + fontId);
+      int[] codes = fontCodes.get(fontId);
+      if (codes == null) {
+        // unknown font, better not guess
+        strings.add("\n**** ?????????????? ****\n");
+        return;
+      }
+
+      // --Translate the glyph indices to character codes
+      char[] chars = new char[glyphIndices.length];
+
+      for (int i = 0; i < chars.length; i++) {
+        int index = glyphIndices[i];
+
+        if (index >= codes.length) // System Font ?
+        {
+          chars[i] = (char) index;
+        } else {
+          chars[i] = (char) (codes[index]);
+        }
+        // System.out.println("-ch[" + i + "]='" + chars[i] + "'(" +
+        // (int)chars[i] + ") +" + glyphAdvances[i]);
+      }
+      strings.add(new String(chars));
+    }
+
+    public void color(Color color) {
+    }
+
+    public void setX(int x) {
+    }
+
+    public void done() {
+      strings.add("\n");
+    }
+  }
+
+  /** Returns an action handler that feeds actionStrings and urls. */
+  public SWFActions tagDoAction() throws IOException {
+    // ActionTextWriter actions = new ActionTextWriter(new
+    // PrintWriter(System.out));
+    return new NutchSWFActions(actionStrings, urls);
+  }
+
+  /** Returns an action handler that feeds actionStrings and urls. */
+  public SWFActions tagDoInitAction(int arg0) throws IOException {
+    // ActionTextWriter actions = new ActionTextWriter(new
+    // PrintWriter(System.out));
+    return new NutchSWFActions(actionStrings, urls);
+  }
+
+  public void tagGeneratorFont(byte[] arg0) throws IOException {
+    // TODO Auto-generated method stub
+    super.tagGeneratorFont(arg0);
+  }
+
+  public void tagGeneratorText(byte[] arg0) throws IOException {
+    // TODO Auto-generated method stub
+    super.tagGeneratorText(arg0);
+  }
+
+}
+
+/**
+ * ActionScript parser. This parser tries to extract free text embedded inside
+ * the script, but without polluting it too much with names of variables,
+ * methods, etc. Not ideal, but it works.
+ *
+ * Every value of a lookup table starts out in the shared string set. A small
+ * stack shadows the pushes and pops of the script; strings that are consumed
+ * as names, labels or targets are removed from the set again, and whatever is
+ * still on the stack when the block is done is removed as well.
+ */
+class NutchSWFActions extends SWFActionBlockImpl implements SWFActions {
+  /** Shared set of candidate text strings (owned by the caller). */
+  private HashSet<String> strings = null;
+
+  /** Shared list receiving the URLs of getURL actions. */
+  private ArrayList<String> urls = null;
+
+  /** Most recent lookup table, indexed by lookup(int) and push(int). */
+  String[] dict = null;
+
+  /** Shadow of the script's operand stack; bounded to 100 entries. */
+  Stack<Object> stack = null;
+
+  public NutchSWFActions(HashSet<String> strings, ArrayList<String> urls) {
+    this.strings = strings;
+    this.urls = urls;
+    stack = new SmallStack(100, strings);
+  }
+
+  /** Pushes text on the shadow stack and drops it from the string set. */
+  private void pushAndDrop(String text) {
+    stack.push(text);
+    strings.remove(text);
+  }
+
+  /** Pops the shadow stack and drops the popped value from the string set. */
+  private void dropPopped() {
+    strings.remove(stack.pop());
+  }
+
+  public void lookupTable(String[] values) throws IOException {
+    // the set ignores values it already holds, so no contains() test is needed
+    for (String value : values) {
+      strings.add(value);
+    }
+    super.lookupTable(values);
+    dict = values;
+  }
+
+  public void defineLocal() throws IOException {
+    stack.pop();
+    super.defineLocal();
+  }
+
+  public void getURL(int vars, int mode) {
+    // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
+  }
+
+  public void getURL(String url, String target) throws IOException {
+    // System.out.println("-getURL: url=" + url + ", target=" + target);
+    stack.push(url);
+    stack.push(target);
+    strings.remove(url);
+    strings.remove(target);
+    urls.add(url);
+    super.getURL(url, target);
+  }
+
+  public SWFActionBlock.TryCatchFinally _try(String name) throws IOException {
+    // stack.push(var);
+    strings.remove(name);
+    return super._try(name);
+  }
+
+  public void comment(String text) throws IOException {
+    // stack.push(var);
+    strings.remove(text);
+    super.comment(text);
+  }
+
+  // NOTE(review): spelled goToFrame, unlike the gotoFrame overloads further
+  // down - confirm whether anything calls this variant.
+  public void goToFrame(String label) throws IOException {
+    pushAndDrop(label);
+    super.gotoFrame(label);
+  }
+
+  public void ifJump(String label) throws IOException {
+    strings.remove(label);
+    super.ifJump(label);
+  }
+
+  public void jump(String label) throws IOException {
+    strings.remove(label);
+    super.jump(label);
+  }
+
+  public void jumpLabel(String label) throws IOException {
+    strings.remove(label);
+    super.jumpLabel(label);
+  }
+
+  public void lookup(int index) throws IOException {
+    boolean known = dict != null && index >= 0 && index < dict.length;
+    if (known) {
+      stack.push(dict[index]);
+    }
+    super.lookup(index);
+  }
+
+  public void push(String value) throws IOException {
+    pushAndDrop(value);
+    super.push(value);
+  }
+
+  public void setTarget(String target) throws IOException {
+    pushAndDrop(target);
+    super.setTarget(target);
+  }
+
+  public SWFActionBlock startFunction(String name, String[] params)
+      throws IOException {
+    pushAndDrop(name);
+    if (params != null) {
+      for (String param : params) {
+        strings.remove(param);
+      }
+    }
+    return this;
+  }
+
+  public SWFActionBlock startFunction2(String name, int arg1, int arg2,
+      String[] params, int[] arg3) throws IOException {
+    pushAndDrop(name);
+    if (params != null) {
+      for (String param : params) {
+        strings.remove(param);
+      }
+    }
+    return this;
+  }
+
+  public void waitForFrame(int num, String label) throws IOException {
+    pushAndDrop(label);
+    super.waitForFrame(num, label);
+  }
+
+  public void waitForFrame(String label) throws IOException {
+    pushAndDrop(label);
+    super.waitForFrame(label);
+  }
+
+  public void done() throws IOException {
+    // whatever is left on the shadow stack is dropped from the string set
+    while (stack.size() > 0) {
+      dropPopped();
+    }
+  }
+
+  public SWFActionBlock start(int arg0, int arg1) throws IOException {
+    return this;
+  }
+
+  public SWFActionBlock start(int arg0) throws IOException {
+    return this;
+  }
+
+  public void add() throws IOException {
+    super.add();
+  }
+
+  public void asciiToChar() throws IOException {
+    super.asciiToChar();
+  }
+
+  public void asciiToCharMB() throws IOException {
+    super.asciiToCharMB();
+  }
+
+  public void push(int index) throws IOException {
+    boolean known = dict != null && index >= 0 && index < dict.length;
+    if (known) {
+      stack.push(dict[index]);
+    }
+    super.push(index);
+  }
+
+  public void callFunction() throws IOException {
+    dropPopped();
+    super.callFunction();
+  }
+
+  public void callMethod() throws IOException {
+    dropPopped();
+    super.callMethod();
+  }
+
+  public void getMember() throws IOException {
+    // 0: name
+    String member = (String) stack.pop();
+    strings.remove(member);
+    super.getMember();
+  }
+
+  public void setMember() throws IOException {
+    // 0: value -1: name
+    stack.pop(); // value
+    String member = (String) stack.pop();
+    strings.remove(member);
+    super.setMember();
+  }
+
+  public void setProperty() throws IOException {
+    super.setProperty();
+  }
+
+  public void setVariable() throws IOException {
+    super.setVariable();
+  }
+
+  public void call() throws IOException {
+    dropPopped();
+    super.call();
+  }
+
+  public void setTarget() throws IOException {
+    dropPopped();
+    super.setTarget();
+  }
+
+  public void pop() throws IOException {
+    dropPopped();
+    super.pop();
+  }
+
+  public void push(boolean value) throws IOException {
+    stack.push(String.valueOf(value));
+    super.push(value);
+  }
+
+  public void push(double value) throws IOException {
+    stack.push(String.valueOf(value));
+    super.push(value);
+  }
+
+  public void push(float value) throws IOException {
+    stack.push(String.valueOf(value));
+    super.push(value);
+  }
+
+  public void pushNull() throws IOException {
+    stack.push("");
+    super.pushNull();
+  }
+
+  public void pushRegister(int register) throws IOException {
+    stack.push(String.valueOf(register));
+    super.pushRegister(register);
+  }
+
+  public void pushUndefined() throws IOException {
+    stack.push("???");
+    super.pushUndefined();
+  }
+
+  public void getProperty() throws IOException {
+    stack.pop();
+    super.getProperty();
+  }
+
+  public void getVariable() throws IOException {
+    dropPopped();
+    super.getVariable();
+  }
+
+  public void gotoFrame(boolean play) throws IOException {
+    stack.push(String.valueOf(play));
+    super.gotoFrame(play);
+  }
+
+  public void gotoFrame(int frame) throws IOException {
+    stack.push(String.valueOf(frame));
+    super.gotoFrame(frame);
+  }
+
+  public void gotoFrame(String label) throws IOException {
+    // the concatenation turns a null label into the text "null" on the stack
+    stack.push("" + label);
+    strings.remove(label);
+    super.gotoFrame(label);
+  }
+
+  public void newObject() throws IOException {
+    stack.pop();
+    super.newObject();
+  }
+
+  public SWFActionBlock startWith() throws IOException {
+    return this;
+  }
+
+}
+
+/**
+ * Small bottom-less stack: pushing onto a full stack discards the bottom-most
+ * entry (which is also removed from the shared string set), and popping an
+ * empty stack returns null instead of throwing.
+ */
+class SmallStack extends Stack<Object> {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Largest number of entries kept on the stack. */
+  private int maxSize;
+
+  /** Caller-owned set from which evicted entries are removed. */
+  private HashSet<String> strings = null;
+
+  public SmallStack(int maxSize, HashSet<String> strings) {
+    this.maxSize = maxSize;
+    this.strings = strings;
+  }
+
+  /**
+   * Pushes o, first evicting the oldest entry when the stack is already full.
+   *
+   * @return the pushed object
+   */
+  public Object push(Object o) {
+    // Evict before pushing so the size never exceeds maxSize. The evicted
+    // entry needs no cast: Set.remove accepts any Object.
+    if (!isEmpty() && size() >= maxSize) {
+      strings.remove(remove(0));
+    }
+    return super.push(o);
+  }
+
+  /** Pops the top entry, or returns null when the stack is empty. */
+  public Object pop() {
+    // tolerate underruns
+    return isEmpty() ? null : super.pop();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
new file mode 100644
index 0000000..5942e64
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse Flash SWF files.
+ */
+package org.apache.nutch.parse.swf;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
new file mode 100644
index 0000000..129b85f
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.swf;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for SWFParser.
+ * <p>
+ * For every sample SWF file the test obtains a Protocol from ProtocolFactory
+ * for a <code>file:</code> URL, parses the fetched content with ParseUtil and
+ * compares the whitespace-normalized text with the whitespace-normalized
+ * content of the matching <code>.txt</code> file.
+ */
+public class TestSWFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
+      "test3.swf" };
+  // Initially the names of the expected-text files; the constructor replaces
+  // each entry with the whitespace-normalized content of that file.
+  private String[] sampleTexts = new String[] { "test1.txt", "test2.txt",
+      "test3.txt" };
+
+  /**
+   * Parses each sample SWF file and checks the extracted text against the
+   * expected text loaded by the constructor.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+
+      parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+      // assertEquals (unlike assertTrue on equals) reports both strings when
+      // the parsed text differs from the expected text.
+      Assert.assertEquals(sampleTexts[i], text);
+    }
+  }
+
+  /**
+   * Loads the expected text for every sample: each file named in
+   * sampleTexts is read as UTF-8, runs of whitespace are collapsed to a
+   * single space, and the result replaces the file name in the array.
+   * Read failures are printed and otherwise ignored.
+   */
+  public TestSWFParser() {
+    for (int i = 0; i < sampleTexts.length; i++) {
+      FileInputStream fis = null;
+      try {
+        // read the test string
+        fis = new FileInputStream(sampleDir + fileSeparator + sampleTexts[i]);
+        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+        StringBuilder sb = new StringBuilder();
+        char[] buf = new char[1024];
+        int len;
+        while ((len = isr.read(buf)) > 0) {
+          sb.append(buf, 0, len);
+        }
+        sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim();
+      } catch (Exception e) {
+        // The entry keeps its file name, so the comparison in testIt() will
+        // not match for this sample.
+        e.printStackTrace();
+      } finally {
+        // Release the file handle even when reading failed part-way.
+        if (fis != null) {
+          try {
+            fis.close();
+          } catch (Exception e) {
+            e.printStackTrace();
+          }
+        }
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.swf 
b/nutch-plugins/parse-swf/src/test/resources/test1.swf
new file mode 100644
index 0000000..cd2019b
Binary files /dev/null and 
b/nutch-plugins/parse-swf/src/test/resources/test1.swf differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.txt 
b/nutch-plugins/parse-swf/src/test/resources/test1.txt
new file mode 100644
index 0000000..68505d5
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test1.txt
@@ -0,0 +1,60 @@
+
+--------
+/go/gnav_cart
+/go/gnav_company
+/go/gnav_devnet
+/go/gnav_downloads
+/go/gnav_fl_minmessage
+/go/gnav_help
+/go/gnav_mm_home
+/go/gnav_products
+/go/gnav_search?loc=en_us
+/go/gnav_showcase
+/go/gnav_solutions
+/go/gnav_store
+/go/gnav_support
+/go/gnav_your_account
+Acquisition Info
+Adobe Home
+AppleGothic
+Array
+Company
+Developers
+Downloads
+Help
+Home
+International
+LocaleManager
+Macromedia Flash Player
+Macromedia Home
+MovieClip
+Products
+Showcase
+Solutions
+Store
+String
+Support
+TextFormat
+To ensure the best possible Internet Experience, please download the latest 
version of the free
+Verdana
+_sans
+active
+bluePill
+button
+color
+company
+devnet
+downloads
+en_us
+home
+javascript:openCrosslinkWindow('/go/adobeacquisition')
+javascript:openCrosslinkWindow('/go/gnav_adobe_home')
+products
+rollOut
+rollOver
+selected
+showcase
+solutions
+support
+tabHolder
+textColor

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.swf 
b/nutch-plugins/parse-swf/src/test/resources/test2.swf
new file mode 100644
index 0000000..eb9b03d
Binary files /dev/null and 
b/nutch-plugins/parse-swf/src/test/resources/test2.swf differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.txt 
b/nutch-plugins/parse-swf/src/test/resources/test2.txt
new file mode 100644
index 0000000..f77b78a
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test2.txt
@@ -0,0 +1,5 @@
+Impact Impact Impact  Arial Arial Arial  Webdings Webdings Webdings  Verdana 
Verdana Verdana  CourierNew CourierNew CourierNew  Bimini Bimini Bimini 
+--------
+TextFormat
+color
+font

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.swf 
b/nutch-plugins/parse-swf/src/test/resources/test3.swf
new file mode 100644
index 0000000..4df9f1e
Binary files /dev/null and 
b/nutch-plugins/parse-swf/src/test/resources/test3.swf differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.txt 
b/nutch-plugins/parse-swf/src/test/resources/test3.txt
new file mode 100644
index 0000000..66ae3d8
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test3.txt
@@ -0,0 +1,11 @@
+Mix. 
+ Edit. 
+ Master. 
+ Compose. 
+ Animate. 
+ With a single suite of powerful tools 
+ that work together as one. 
+ World-class video and audio tools that bring  
+ new power and efficiency to your film, video,  
+ DVD, and web workflows. 
+ Learn more. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/build-ivy.xml 
b/nutch-plugins/parse-tika/build-ivy.xml
new file mode 100644
index 0000000..e4984d8
--- /dev/null
+++ b/nutch-plugins/parse-tika/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Stand-alone Ant build that downloads Ivy if necessary and retrieves the
+     dependencies of this module into ./lib (default target: deps-jar). -->
+<project name="parse-tika" default="deps-jar"
+         xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <!-- Load environment variables as env.* properties; without this the
+         env.IVY_HOME check below can never succeed. -->
+    <property environment="env" />
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <!-- Use IVY_HOME when it is set; Ant properties are immutable, so the
+         fallback below only applies when the condition did not set ivy.home. -->
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <!-- Skipped when the "offline" property is set. -->
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without
+             any special installation; Maven Central is reached over HTTPS
+             because plain HTTP requests are rejected -->
+        <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not
+           already dropped it into ant's lib dir (note that the latter copy
+           will always take precedence). We will not fail as long as local
+           lib dir exists (it may be empty) and ivy is in at least one of
+           ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <!-- Retrieves the resolved artifacts into lib/ as artifact-revision.ext;
+       sync="true" makes the retrieve task synchronise the directory. -->
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/build.xml 
b/nutch-plugins/parse-tika/build.xml
new file mode 100644
index 0000000..4ecb3f8
--- /dev/null
+++ b/nutch-plugins/parse-tika/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Ant build for the parse-tika plugin. Imports ../build-plugin.xml and
+     declares the dependency targets, classpath and test-data copy below. -->
+<project name="parse-tika" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+  
+  <!-- Build compilation dependencies: runs the "jar" target of lib-nekohtml
+       in a separate Ant project (inheritall="false", so this project's
+       properties are not passed down). -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-nekohtml" />
+  </target>
+
+  <!-- Add compilation dependencies to classpath: every jar found in a
+       lib-nekohtml directory anywhere below ${nutch.root}/build. -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-nekohtml/*.jar" />
+    </fileset>
+  </path>
+  
+    <!-- Deploy Unit test dependencies: runs the "deploy" target of each
+         plugin directory listed below. -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+    <ant target="deploy" inheritall="false" dir="../lib-nekohtml" />
+  </target>
+
+  <!-- for junit test: these tasks are not inside a target, so Ant runs them
+       while loading this build file, before any target executes. They create
+       ${build.test}/data and copy the matching files from ./sample into it.
+       NOTE(review): build.test is not defined in this file; presumably it
+       comes from ../build-plugin.xml - confirm. -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.rss"/>
+      <include name="*.rtf"/>
+      <include name="*.pdf"/>
+      <include name="ootest.*"/>
+      <include name="*.doc"/>
+      <include name="*.gif"/>
+    </fileset>
+  </copy>
+  
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/howto_upgrade_tika.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/howto_upgrade_tika.txt 
b/nutch-plugins/parse-tika/howto_upgrade_tika.txt
new file mode 100644
index 0000000..63a05a4
--- /dev/null
+++ b/nutch-plugins/parse-tika/howto_upgrade_tika.txt
@@ -0,0 +1,8 @@
+1. Upgrade Tika dependency in trunk/ivy/ivy.xml
+
+2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
+
+3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
+   To get the list of dependencies and their versions execute:
+   $ ant -f ./build-ivy.xml
+   $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/ivy.xml b/nutch-plugins/parse-tika/ivy.xml
new file mode 100644
index 0000000..7a9e959
--- /dev/null
+++ b/nutch-plugins/parse-tika/ivy.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<!-- Ivy descriptor for this plugin: module metadata, shared configurations,
+     the published artifact and the Tika dependency. -->
+<ivy-module version="1.0">
+  <!-- The module name is taken from the Ant project that loads this file. -->
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <!-- NOTE(review): this relative path climbs three directories; verify it
+         still resolves to ivy-configurations.xml from this module's
+         location - TODO confirm. -->
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- tika-parsers with tika-core, httpclient and httpcore excluded from
+         its transitive dependencies; presumably those jars are supplied
+         elsewhere in the Nutch build - confirm. -->
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.12" 
conf="*->default">
+     <exclude org="org.apache.tika" name="tika-core" />
+     <exclude org="org.apache.httpcomponents" name="httpclient" />
+     <exclude org="org.apache.httpcomponents" name="httpcore" />
+    </dependency>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/plugin.xml 
b/nutch-plugins/parse-tika/plugin.xml
new file mode 100644
index 0000000..04fcd2e
--- /dev/null
+++ b/nutch-plugins/parse-tika/plugin.xml
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Plugin descriptor for parse-tika: the jars loaded with the plugin, the
+     plugins it imports, and the Parser extension it contributes. -->
+<plugin
+   id="parse-tika"
+   name="Tika Parser Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <!-- The plugin's own jar; it is the only library with an export
+           element. -->
+      <library name="parse-tika.jar">
+         <export name="*"/>
+      </library>
+      <!-- Third-party jars, one entry per file. howto_upgrade_tika.txt in
+           this directory describes regenerating this list from the contents
+           of lib/ after running build-ivy.xml. -->
+      <library name="apache-mime4j-core-0.7.2.jar"/>
+      <library name="apache-mime4j-dom-0.7.2.jar"/>
+      <library name="asm-5.0.4.jar"/>
+      <library name="aspectjrt-1.8.0.jar"/>
+      <library name="bcmail-jdk15on-1.52.jar"/>
+      <library name="bcpkix-jdk15on-1.52.jar"/>
+      <library name="bcprov-jdk15on-1.52.jar"/>
+      <library name="boilerpipe-1.1.0.jar"/>
+      <library name="bzip2-0.9.1.jar"/>
+      <library name="c3p0-0.9.1.1.jar"/>
+      <library name="cdm-4.5.5.jar"/>
+      <library name="commons-codec-1.6.jar"/>
+      <library name="commons-compress-1.10.jar"/>
+      <library name="commons-csv-1.0.jar"/>
+      <library name="commons-exec-1.3.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-lang-2.6.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="commons-logging-api-1.1.jar"/>
+      <library name="commons-vfs2-2.0.jar"/>
+      <library name="cxf-core-3.0.3.jar"/>
+      <library name="cxf-rt-frontend-jaxrs-3.0.3.jar"/>
+      <library name="cxf-rt-rs-client-3.0.3.jar"/>
+      <library name="cxf-rt-transports-http-3.0.3.jar"/>
+      <library name="ehcache-core-2.6.2.jar"/>
+      <library name="fontbox-1.8.10.jar"/>
+      <library name="geoapi-3.0.0.jar"/>
+      <library name="grib-4.5.5.jar"/>
+      <library name="gson-2.2.4.jar"/>
+      <library name="guava-17.0.jar"/>
+      <library name="httpmime-4.2.6.jar"/>
+      <library name="httpservices-4.5.5.jar"/>
+      <library name="isoparser-1.0.2.jar"/>
+      <library name="jackcess-2.1.2.jar"/>
+      <library name="jackcess-encrypt-2.1.1.jar"/>
+      <library name="java-libpst-0.8.1.jar"/>
+      <library name="javax.annotation-api-1.2.jar"/>
+      <library name="javax.ws.rs-api-2.0.1.jar"/>
+      <library name="jcip-annotations-1.0.jar"/>
+      <library name="jcommander-1.35.jar"/>
+      <library name="jdom-2.0.2.jar"/>
+      <library name="jdom2-2.0.4.jar"/>
+      <library name="jempbox-1.8.10.jar"/>
+      <library name="jhighlight-1.0.2.jar"/>
+      <library name="jj2000-5.2.jar"/>
+      <library name="jmatio-1.0.jar"/>
+      <library name="jna-4.1.0.jar"/>
+      <library name="joda-time-2.2.jar"/>
+      <library name="json-20140107.jar"/>
+      <library name="json-simple-1.1.1.jar"/>
+      <library name="jsoup-1.7.2.jar"/>
+      <library name="jsr-275-0.9.3.jar"/>
+      <library name="juniversalchardet-1.0.3.jar"/>
+      <library name="junrar-0.7.jar"/>
+      <library name="jwnl-1.3.3.jar"/>
+      <library name="maven-scm-api-1.4.jar"/>
+      <library name="maven-scm-provider-svn-commons-1.4.jar"/>
+      <library name="maven-scm-provider-svnexe-1.4.jar"/>
+      <library name="metadata-extractor-2.8.0.jar"/>
+      <library name="netcdf4-4.5.5.jar"/>
+      <library name="opennlp-maxent-3.0.3.jar"/>
+      <library name="opennlp-tools-1.5.3.jar"/>
+      <library name="pdfbox-1.8.10.jar"/>
+      <library name="plexus-utils-1.5.6.jar"/>
+      <library name="poi-3.13.jar"/>
+      <library name="poi-ooxml-3.13.jar"/>
+      <library name="poi-ooxml-schemas-3.13.jar"/>
+      <library name="poi-scratchpad-3.13.jar"/>
+      <library name="protobuf-java-2.5.0.jar"/>
+      <library name="quartz-2.2.0.jar"/>
+      <library name="regexp-1.3.jar"/>
+      <library name="rome-1.5.1.jar"/>
+      <library name="rome-utils-1.5.1.jar"/>
+      <library name="sis-metadata-0.5.jar"/>
+      <library name="sis-netcdf-0.5.jar"/>
+      <library name="sis-referencing-0.5.jar"/>
+      <library name="sis-storage-0.5.jar"/>
+      <library name="sis-utility-0.5.jar"/>
+      <library name="slf4j-api-1.7.12.jar"/>
+      <library name="stax2-api-3.1.4.jar"/>
+      <library name="tagsoup-1.2.1.jar"/>
+      <library name="tika-parsers-1.12.jar"/>
+      <library name="udunits-4.5.5.jar"/>
+      <library name="vorbis-java-core-0.6.jar"/>
+      <library name="vorbis-java-tika-0.6.jar"/>
+      <library name="woodstox-core-asl-4.4.1.jar"/>
+      <library name="xmlbeans-2.6.0.jar"/>
+      <library name="xmlschema-core-2.1.0.jar"/>
+      <library name="xmpcore-5.1.2.jar"/>
+      <library name="xz-1.5.jar"/>
+   </runtime>
+
+   <!-- Plugins this plugin imports. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-nekohtml"/>
+   </requires>
+
+   <!-- Contributes TikaParser to the org.apache.nutch.parse.Parser extension
+        point. NOTE(review): contentType="*" presumably registers it for
+        every content type - confirm against the parser lookup code. -->
+   <extension point="org.apache.nutch.parse.Parser"
+              id="org.apache.nutch.parse.tika"
+              name="TikaParser">
+
+      <implementation id="org.apache.nutch.parse.tika.TikaParser"
+                      class="org.apache.nutch.parse.tika.TikaParser">
+       <parameter name="contentType" value="*"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/pom.xml b/nutch-plugins/parse-tika/pom.xml
new file mode 100644
index 0000000..0cf2340
--- /dev/null
+++ b/nutch-plugins/parse-tika/pom.xml
@@ -0,0 +1,54 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<!-- Maven module for the parse-tika plugin; inherits from the nutch-plugins
+     parent POM one directory up. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-tika</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-tika</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <!-- NOTE(review): ivy.xml and plugin.xml of this module reference
+             tika-parsers 1.12; confirm that 1.13 is intended here. -->
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers</artifactId>
+            <version>1.13</version>
+            <exclusions>
+                <!-- TODO: ivy.xml excludes tika-core, httpclient and httpcore
+                     from tika-parsers; decide whether to mirror those
+                     exclusions here. -->
+            </exclusions>
+        </dependency>
+        <!-- NOTE(review): the Ant build (build.xml) puts the lib-nekohtml
+             jars on the compile classpath; confirm that test scope is
+             sufficient. -->
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-nekohtml</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
 
b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
new file mode 100644
index 0000000..7c0d71b
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.lang.ClassLoader;
+import java.lang.InstantiationException;
+import java.util.HashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.extractors.*;
+
+class BoilerpipeExtractorRepository {
+
+    public static final Log LOG = 
LogFactory.getLog(BoilerpipeExtractorRepository.class);
+    public static final HashMap<String, BoilerpipeExtractor> 
extractorRepository = new HashMap<String, BoilerpipeExtractor>();
+ 
+    /**
+     * Returns an instance of the specified extractor
+     */
+    public static synchronized BoilerpipeExtractor getExtractor(String 
boilerpipeExtractorName) {
+      // Check if there's no instance of this extractor
+      if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+        // FQCN
+        boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + 
boilerpipeExtractorName;
+
+        // Attempt to load the class
+        try {
+          ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+          Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+
+          // Add an instance to the repository
+          extractorRepository.put(boilerpipeExtractorName, 
(BoilerpipeExtractor)extractorClass.newInstance());
+
+        } catch (ClassNotFoundException e) {
+          LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not 
found!");
+        } catch (InstantiationException e) {
+          LOG.error("Could not instantiate " + boilerpipeExtractorName);
+        } catch (Exception e) {
+          LOG.error(e);
+        }
+      }
+
+      return extractorRepository.get(boilerpipeExtractorName);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
 
b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
new file mode 100644
index 0000000..77a1044
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
@@ -0,0 +1,794 @@
+/*
+ * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
+ */
+class DOMBuilder implements ContentHandler, LexicalHandler {
+  private boolean upperCaseElementNames = true;
+
+  /** Root document */
+  public Document m_doc;
+
+  /** Current node */
+  protected Node m_currentNode = null;
+
+  /** First node of document fragment or null if not a DocumentFragment */
+  public DocumentFragment m_docFrag = null;
+
+  /** Stack of the elements that are currently open, innermost on top */
+  protected Stack<Element> m_elemStack = new Stack<Element>();
+
+  /**
+  * Element recorded with this namespace will be converted to Node without a
+  * namespace
+  */
+  private String defaultNamespaceURI = null;
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * given node.
+   * 
+   * @param doc
+   *          Root document
+   * @param node
+   *          Current node
+   */
+  DOMBuilder(Document doc, Node node) {
+    m_doc = doc;
+    m_currentNode = node;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param docFrag
+   *          Document fragment
+   */
+  DOMBuilder(Document doc, DocumentFragment docFrag) {
+    m_doc = doc;
+    m_docFrag = docFrag;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document.
+   * 
+   * @param doc
+   *          Root document
+   */
+  DOMBuilder(Document doc) {
+    m_doc = doc;
+  }
+
+  /**
+   * Get the root node of the DOM being created. This is either a Document or a
+   * DocumentFragment.
+   * 
+   * @return The root document or document fragment if not null
+   */
+  Node getRootNode() {
+    // A document fragment, when one was supplied, takes precedence over the
+    // document itself.
+    if (m_docFrag != null) {
+      return m_docFrag;
+    }
+    return m_doc;
+  }
+
+  /**
+   * Get the node currently being processed.
+   * 
+   * @return the current node being processed
+   */
+  Node getCurrentNode() {
+    return m_currentNode;
+  }
+
+  /**
+   * Return null since there is no Writer for this class.
+   * 
+   * @return null
+   */
+  java.io.Writer getWriter() {
+    return null;
+  }
+
+  /**
+   * Append a node to the current container.
+   * 
+   * @param newNode
+   *          New node to append
+   */
+  protected void append(Node newNode) throws org.xml.sax.SAXException {
+    // Preferred parent: the node currently being built.
+    Node parent = m_currentNode;
+    if (parent != null) {
+      parent.appendChild(newNode);
+      return;
+    }
+
+    // No current node: fall back to the document fragment, if there is one.
+    if (m_docFrag != null) {
+      m_docFrag.appendChild(newNode);
+      return;
+    }
+
+    // Otherwise the node goes directly under the document, which restricts
+    // what may be added.
+    short nodeType = newNode.getNodeType();
+
+    if (nodeType == Node.TEXT_NODE) {
+      String text = newNode.getNodeValue();
+
+      if (text != null && text.trim().length() > 0) {
+        throw new org.xml.sax.SAXException(
+            "Warning: can't output text before document element!  "
+                + "Ignoring...");
+      }
+
+      // Null or whitespace-only text is dropped without being appended.
+      return;
+    }
+
+    if (nodeType == Node.ELEMENT_NODE && m_doc.getDocumentElement() != null) {
+      throw new org.xml.sax.SAXException(
+          "Can't have more than one root on a DOM!");
+    }
+
+    m_doc.appendChild(newNode);
+  }
+
+  /**
+   * Receive an object for locating the origin of SAX document events.
+   * 
+   * <p>
+   * SAX parsers are strongly encouraged (though not absolutely required) to
+   * supply a locator: if it does so, it must supply the locator to the
+   * application by invoking this method before invoking any of the other
+   * methods in the ContentHandler interface.
+   * </p>
+   * 
+   * <p>
+   * The locator allows the application to determine the end position of any
+   * document-related event, even if the parser is not reporting an error.
+   * Typically, the application will use this information for reporting its own
+   * errors (such as character content that does not match an application's
+   * business rules). The information returned by the locator is probably not
+   * sufficient for use with a search engine.
+   * </p>
+   * 
+   * <p>
+   * Note that the locator will return correct information only during the
+   * invocation of the events in this interface. The application should not
+   * attempt to use it at any other time.
+   * </p>
+   * 
+   * @param locator
+   *          An object that can return the location of any SAX document event.
+   * @see org.xml.sax.Locator
+   */
+  public void setDocumentLocator(Locator locator) {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, before any other methods
+   * in this interface or in DTDHandler (except for setDocumentLocator).
+   * </p>
+   */
+  public void startDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the end of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, and it will be the last
+   * method invoked during the parse. The parser shall not invoke this method
+   * until it has either abandoned parsing (because of an unrecoverable error)
+   * or reached the end of input.
+   * </p>
+   */
+  public void endDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   * 
+   * <p>
+   * The Parser will invoke this method at the beginning of every element in the
+   * XML document; there will be a corresponding endElement() event for every
+   * startElement() event (even when the element is empty). All of the element's
+   * content will be reported, in order, before the corresponding endElement()
+   * event.
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached. Note that the attribute list provided will contain only
+   * attributes with explicit values (specified or defaulted): #IMPLIED
+   * attributes will be omitted.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          The namespace of the node
+   * @param localName
+   *          The local part of the qualified name
+   * @param name
+   *          The element name.
+   * @param atts
+   *          The attributes attached to the element, if any.
+   * @see #endElement
+   * @see org.xml.sax.Attributes
+   */
+  public void startElement(String ns, String localName, String name,
+      Attributes atts) throws org.xml.sax.SAXException {
+
+    // Upper-case with a fixed locale: the no-argument toUpperCase() depends on
+    // the JVM default locale and, for instance, does not map "i" to "I" under
+    // a Turkish locale, which would produce unexpected element names.
+    if (upperCaseElementNames)
+      name = name.toUpperCase(java.util.Locale.ROOT);
+
+    // Note that the namespace-aware call must be used to correctly
+    // construct a Level 2 DOM, even for non-namespaced nodes. Elements in the
+    // configured default namespace are created without a namespace.
+    Element elem;
+    if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI))
+      elem = m_doc.createElementNS(null, name);
+    else
+      elem = m_doc.createElementNS(ns, name);
+
+    append(elem);
+
+    try {
+      int nAtts = atts.getLength();
+
+      for (int i = 0; i < nAtts; i++) {
+
+        // First handle a possible ID attribute
+        if (atts.getType(i).equalsIgnoreCase("ID"))
+          setIDAttribute(atts.getValue(i), elem);
+
+        String attrNS = atts.getURI(i);
+
+        if ("".equals(attrNS))
+          attrNS = null; // DOM represents no-namespace as null
+
+        String attrQName = atts.getQName(i);
+
+        // In SAX, xmlns and xmlns:* attributes have an empty namespace, while
+        // in DOM they must be in the xmlns namespace. This includes the plain
+        // "xmlns" default-namespace declaration: setAttributeNS rejects it
+        // with a NAMESPACE_ERR when given any other namespace.
+        if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns"))
+          attrNS = "http://www.w3.org/2000/xmlns/";
+
+        // ALWAYS use the DOM Level 2 call!
+        elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+      }
+
+      // The new element becomes the parent for everything reported until the
+      // matching endElement().
+      m_elemStack.push(elem);
+
+      m_currentNode = elem;
+    } catch (java.lang.Exception de) {
+      // Report any failure (DOM or otherwise) through the SAX channel.
+      throw new org.xml.sax.SAXException(de);
+    }
+
+  }
+
+  /**
+   * 
+   * 
+   * 
+   * Receive notification of the end of an element.
+   * 
+   * <p>
+   * The SAX parser will invoke this method at the end of every element in the
+   * XML document; there will be a corresponding startElement() event for every
+   * endElement() event (even when the element is empty).
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached to the name.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          the namespace of the element
+   * @param localName
+   *          The local part of the qualified name of the element
+   * @param name
+   *          The element name
+   */
+  public void endElement(String ns, String localName, String name)
+      throws org.xml.sax.SAXException {
+    // Discard the element being closed; nothing to pop if the stack is
+    // already empty.
+    if (!m_elemStack.isEmpty()) {
+      m_elemStack.pop();
+    }
+
+    // The enclosing open element, if any, becomes the current node again.
+    if (m_elemStack.isEmpty()) {
+      m_currentNode = null;
+    } else {
+      m_currentNode = m_elemStack.peek();
+    }
+  }
+
+  /**
+   * Set an ID string to node association in the ID table. This implementation
+   * does nothing; startElement() calls it for every attribute whose SAX type
+   * is "ID".
+   * 
+   * @param id
+   *          The ID string.
+   * @param elem
+   *          The element associated with the ID.
+   */
+  public void setIDAttribute(String id, Element elem) {
+
+    // Do nothing. This method is meant to be overridden.
+  }
+
+  /**
+   * Receive notification of character data.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void characters(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    // Whitespace outside the document element is skipped
+    // (avoids DOM006 Hierarchy request error).
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) {
+      return;
+    }
+
+    // While a CDATA section is open the data is handled by cdata() instead.
+    if (m_inCData) {
+      cdata(ch, start, length);
+      return;
+    }
+
+    String chunk = new String(ch, start, length);
+
+    // Look at the node added last under the current node, if there is one.
+    Node previous = null;
+    if (m_currentNode != null) {
+      previous = m_currentNode.getLastChild();
+    }
+
+    if (previous == null || previous.getNodeType() != Node.TEXT_NODE) {
+      // No text node to extend: create a new one.
+      append(m_doc.createTextNode(chunk));
+    } else {
+      // Extend the preceding text node rather than adding a sibling.
+      ((Text) previous).appendData(chunk);
+    }
+  }
+
+  /**
+   * If available, when the disable-output-escaping attribute is used, output
+   * raw text without escaping. A PI will be inserted in front of the node with
+   * the name "xslt-next-is-raw" and a value of "formatter-to-dom".
+   * 
+   * @param ch
+   *          Array containing the characters
+   * @param start
+   *          Index to start of characters in the array
+   * @param length
+   *          Number of characters in the array
+   */
+  public void charactersRaw(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+        "formatter-to-dom"));
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Report the beginning of an entity.
+   * 
+   * The start and end of the document entity are not reported. The start and
+   * end of the external DTD subset are reported using the pseudo-name "[dtd]".
+   * All other events must be properly nested within start/end entity events.
+   * 
+   * @param name
+   *          The name of the entity. If it is a parameter entity, the name will
+   *          begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException {
+
+    // Almost certainly the wrong behavior...
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity.
+   * 
+   * @param name
+   *          The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notification of an entity reference.
+   * 
+   * @param name
+   *          name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException {
+    append(m_doc.createEntityReference(name));
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   * 
+   * <p>
+   * Validating Parsers must use this method to report each chunk of ignorable
+   * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+   * non-validating parsers may also use this method if they are capable of
+   * parsing and using content models.
+   * </p>
+   * 
+   * <p>
+   * SAX parsers may return all contiguous whitespace in a single chunk, or they
+   * may split it into several chunks; however, all of the characters in any
+   * single event must come from the same external entity, so that the Locator
+   * provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #characters
+   */
+  public void ignorableWhitespace(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    // Outside the document element the whitespace is skipped
+    // (avoids DOM006 Hierarchy request error).
+    if (!isOutsideDocElem()) {
+      String whitespace = new String(ch, start, length);
+      Text node = m_doc.createTextNode(whitespace);
+      append(node);
+    }
+  }
+
+  /**
+   * Tell if the current node is outside the document element.
+   * 
+   * @return true if the current node is outside the document element.
+   */
+  private boolean isOutsideDocElem() {
+    // With a document fragment, or with any element still open, we are never
+    // outside the document element.
+    if (m_docFrag != null || !m_elemStack.isEmpty()) {
+      return false;
+    }
+
+    // Otherwise we are outside unless the current node is something other
+    // than the document itself.
+    return m_currentNode == null
+        || m_currentNode.getNodeType() == Node.DOCUMENT_NODE;
+  }
+
+  /**
+   * Receive notification of a processing instruction.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each processing instruction
+   * found: note that processing instructions may occur before or after the main
+   * document element.
+   * </p>
+   * 
+   * <p>
+   * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+   * or a text declaration (XML 1.0, section 4.3.1) using this method.
+   * </p>
+   * 
+   * @param target
+   *          The processing instruction target.
+   * @param data
+   *          The processing instruction data, or null if none was supplied.
+   */
+  public void processingInstruction(String target, String data)
+      throws org.xml.sax.SAXException {
+    append(m_doc.createProcessingInstruction(target, data));
+  }
+
+  /**
+   * Report an XML comment anywhere in the document.
+   * 
+   * This callback will be used for comments inside or outside the document
+   * element, including comments in the external DTD subset (if read).
+   * 
+   * @param ch
+   *          An array holding the characters in the comment.
+   * @param start
+   *          The starting position in the array.
+   * @param length
+   *          The number of characters to use from the array.
+   */
+  public void comment(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    // tagsoup sometimes submits invalid values here; drop anything that does
+    // not describe a range inside the array. A range that ends exactly at the
+    // end of the array (length == ch.length - start) is valid and is kept.
+    if (ch == null || start < 0 || length < 0 || length > (ch.length - start))
+      return;
+    append(m_doc.createComment(new String(ch, start, length)));
+  }
+
+  /** Flag indicating that we are processing a CData section */
+  protected boolean m_inCData = false;
+
+  /**
+   * Report the start of a CDATA section.
+   * 
+   * @see #endCDATA
+   */
+  public void startCDATA() throws org.xml.sax.SAXException {
+    m_inCData = true;
+    append(m_doc.createCDATASection("")); // empty node; cdata() appends to it
+  }
+
+  /**
+   * Report the end of a CDATA section.
+   * 
+   * @see #startCDATA
+   */
+  public void endCDATA() throws org.xml.sax.SAXException {
+    m_inCData = false;
+  }
+
+  /**
+   * Receive notification of cdata.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void cdata(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    // The node created by startCDATA() was placed by append(), so look for it
+    // in the same place: under the current node or, when there is none, under
+    // the document fragment. Dereferencing m_currentNode unconditionally threw
+    // a NullPointerException when no element was open.
+    Node parent = (null != m_currentNode) ? m_currentNode : m_docFrag;
+    if (null == parent)
+      return; // no container that could hold the section; nothing to extend
+
+    String s = new String(ch, start, length);
+
+    // XXX [email protected]: modified from the original, to accommodate TagSoup.
+    // The data is added to the last child when that is a CDATA section or a
+    // comment; otherwise it is dropped.
+    Node n = parent.getLastChild();
+    if (n instanceof CDATASection)
+      ((CDATASection) n).appendData(s);
+    else if (n instanceof Comment)
+      ((Comment) n).appendData(s);
+  }
+
+  /**
+   * Report the start of DTD declarations, if any.
+   * 
+   * Any declarations are assumed to be in the internal subset unless otherwise
+   * indicated.
+   * 
+   * @param name
+   *          The document type name.
+   * @param publicId
+   *          The declared public identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @param systemId
+   *          The declared system identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+      throws org.xml.sax.SAXException {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Report the end of DTD declarations.
+   * 
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   * 
+   * <p>
+   * The information from this event is not necessary for normal Namespace
+   * processing: the SAX XML reader will automatically replace prefixes for
+   * element and attribute names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).
+   * </p>
+   * 
+   * <p>
+   * There are cases, however, when applications need to use prefixes in
+   * character data or in attribute values, where they cannot safely be expanded
+   * automatically; the start/endPrefixMapping event supplies the information to
+   * the application to expand prefixes in those contexts itself, if necessary.
+   * </p>
+   * 
+   * <p>
+   * Note that start/endPrefixMapping events are not guaranteed to be properly
+   * nested relative to each-other: all startPrefixMapping events will occur
+   * before the corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event, but their order
+   * is not guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The Namespace prefix being declared.
+   * @param uri
+   *          The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+      throws org.xml.sax.SAXException {
+
+    /*
+     * // Not sure if this is needed or wanted // Also, it fails in the stree.
+     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
+     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
+     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
+     * = "xmlns:"+prefix;
+     * 
+     * Element elem = (Element)m_currentNode; String val =
+     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
+     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
+     * uri); } }
+     */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   * 
+   * <p>
+   * See startPrefixMapping for details. This event will always occur after the
+   * corresponding endElement event, but the order of endPrefixMapping events is
+   * not otherwise guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notification of a skipped entity.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each entity skipped.
+   * Non-validating processors may skip entities if they have not seen the
+   * declarations (because, for example, the entity was declared in an external
+   * DTD subset). All processors may skip external entities, depending on the
+   * values of the http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities properties.
+   * </p>
+   * 
+   * @param name
+   *          The name of the skipped entity. If it is a parameter entity, the
+   *          name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException {
+  }
+
+  public boolean isUpperCaseElementNames() {
+    return upperCaseElementNames;
+  }
+
+  public void setUpperCaseElementNames(boolean upperCaseElementNames) {
+    this.upperCaseElementNames = upperCaseElementNames;
+  }
+ 
+  public String getDefaultNamespaceURI() {
+    return defaultNamespaceURI;
+  }
+
+  public void setDefaultNamespaceURI(String defaultNamespaceURI) {
+    this.defaultNamespaceURI = defaultNamespaceURI;
+  }
+}

[12/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Reply via email to