http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/build.xml b/nutch-plugins/parse-swf/build.xml new file mode 100644 index 0000000..f4fb20f --- /dev/null +++ b/nutch-plugins/parse-swf/build.xml @@ -0,0 +1,38 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-swf" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <ant target="deploy" inheritall="false" dir="../protocol-file"/> + </target> + + + <!-- for junit test: stage the sample SWF files and their expected text under ${build.test}/data. NOTE(review): this change set adds test1..test3 (.swf and .txt) under src/test/resources/, not sample/; confirm that the sample/ paths below still resolve. --> + <mkdir dir="${build.test}/data"/> + <copy file="sample/test1.swf" todir="${build.test}/data"/> + <copy file="sample/test2.swf" todir="${build.test}/data"/> + <copy file="sample/test3.swf" todir="${build.test}/data"/> + <copy file="sample/test1.txt" todir="${build.test}/data"/> + <copy file="sample/test2.txt" todir="${build.test}/data"/> + <copy file="sample/test3.txt" todir="${build.test}/data"/> + +</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/ivy.xml b/nutch-plugins/parse-swf/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/parse-swf/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt new file mode 100644 index 0000000..4138a66 --- /dev/null +++ b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt @@ -0,0 +1,33 @@ + + Copyright (c) 2001-2005, David N. Main, All rights reserved. + + Redistribution and use in source and binary forms, with or + without modification, are permitted provided that the + following conditions are met: + + 1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + 2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. The name of the author may not be used to endorse or + promote products derived from this software without specific + prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf.jar ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/lib/javaswf.jar b/nutch-plugins/parse-swf/lib/javaswf.jar new file mode 100644 index 0000000..78f9b0b Binary files /dev/null and b/nutch-plugins/parse-swf/lib/javaswf.jar differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/plugin.xml b/nutch-plugins/parse-swf/plugin.xml new file mode 100644 index 0000000..8cc72c0 --- /dev/null +++ b/nutch-plugins/parse-swf/plugin.xml @@ -0,0 +1,44 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parse-swf" + name="SWF Parse Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="parse-swf.jar"> + <export name="*"/> + </library> + <library name="javaswf.jar"/> + </runtime> + + <extension id="org.apache.nutch.parse.swf" + name="SWFParse" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.swf.SWFParser" + class="org.apache.nutch.parse.swf.SWFParser"> + <parameter name="contentType" value="application/x-shockwave-flash"/> + <parameter name="pathSuffix" value="swf"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/pom.xml b/nutch-plugins/parse-swf/pom.xml new file mode 100644 index 0000000..743511e --- /dev/null +++ b/nutch-plugins/parse-swf/pom.xml @@ -0,0 +1,46 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-swf</artifactId> + <packaging>jar</packaging> + + <name>parse-swf</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>com.google.gwt</groupId> + <artifactId>gwt-incubator</artifactId> + <version>2.0.1</version> + </dependency> + + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java new file mode 100644 index 0000000..9251366 --- /dev/null +++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java @@ -0,0 +1,685 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.swf; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.*; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.*; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; + +import org.apache.hadoop.conf.Configuration; + +import com.anotherbigidea.flash.interfaces.*; +import com.anotherbigidea.flash.readers.*; +import com.anotherbigidea.flash.structs.*; +import com.anotherbigidea.flash.writers.SWFActionBlockImpl; +import com.anotherbigidea.flash.writers.SWFTagTypesImpl; +import com.anotherbigidea.io.InStream; + +/** + * Parser for Flash SWF files. Loosely based on the sample in JavaSWF + * distribution. 
+ */ +public class SWFParser implements Parser { + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.swf"); + + private Configuration conf = null; + + public SWFParser() { + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return conf; + } + + public ParseResult getParse(Content content) { + + String text = null; + Vector<Outlink> outlinks = new Vector<Outlink>(); + + try { + + byte[] raw = content.getContent(); + + String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + + " bytes. Parser can't handle incomplete files.") + .getEmptyParseResult(content.getUrl(), getConf()); + } + ExtractText extractor = new ExtractText(); + + // TagParser implements SWFTags and drives a SWFTagTypes interface + TagParser parser = new TagParser(extractor); + // use this instead to debug the file + // TagParser parser = new TagParser( new SWFTagDumper(true, true) ); + + // SWFReader reads an input file and drives a SWFTags interface + SWFReader reader = new SWFReader(parser, new InStream(raw)); + + // read the input SWF file and pass it through the interface pipeline + reader.readFile(); + text = extractor.getText(); + String atext = extractor.getActionText(); + if (atext != null && atext.length() > 0) + text += "\n--------\n" + atext; + // harvest potential outlinks + String[] links = extractor.getUrls(); + for (int i = 0; i < links.length; i++) { + Outlink out = new Outlink(links[i], ""); + outlinks.add(out); + } + Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf); + if (olinks != null) + for (int i = 0; i < olinks.length; i++) { + outlinks.add(olinks[i]); + } + } catch (Exception e) { // run time exception + LOG.error("Error, runtime exception: ", e); + return new 
ParseStatus(ParseStatus.FAILED, + "Can't be handled as SWF document. " + e).getEmptyParseResult( + content.getUrl(), getConf()); + } + if (text == null) + text = ""; + + Outlink[] links = (Outlink[]) outlinks + .toArray(new Outlink[outlinks.size()]); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, + content.getMetadata()); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, + parseData)); + } + + /** + * Command-line helper: parses one SWF file and prints the resulting parse + * text and parse data to System.out. + * + * Arguments are: 0. Name of input SWF file. + */ + public static void main(String[] args) throws IOException { + FileInputStream in = new FileInputStream(args[0]); + + byte[] buf = new byte[in.available()]; + in.read(buf); + in.close(); + SWFParser parser = new SWFParser(); + ParseResult parseResult = parser.getParse(new Content("file:" + args[0], + "file:" + args[0], buf, "application/x-shockwave-flash", + new Metadata(), NutchConfiguration.create())); + Parse p = parseResult.get("file:" + args[0]); + System.out.println("Parse Text:"); + System.out.println(p.getText()); + System.out.println("Parse Data:"); + System.out.println(p.getData()); + } +} + +/** + * Parses a Flash movie and extracts all the text in Text symbols and the + * initial text in Edit Fields. The extracted text is collected in memory and + * read back through getText(), getActionText() and getUrls(). + * + * A "pipeline" is set up in SWFParser.getParse: + * + * SWFReader-->TagParser-->ExtractText + * + * SWFReader reads the input SWF file and separates out the header and the tags. + * The separated contents are passed to TagParser which parses out the + * individual tag types and passes them to ExtractText. + * + * ExtractText extends SWFTagTypesImpl and overrides some methods. + */ +class ExtractText extends SWFTagTypesImpl { + /** + * Store font info keyed by the font symbol id. Each entry is an int[] of + * character codes for the corresponding font glyphs (An empty array denotes a + * System Font). 
+ */ + protected HashMap<Integer, int[]> fontCodes = new HashMap<Integer, int[]>(); + + public ArrayList<String> strings = new ArrayList<String>(); + + public HashSet<String> actionStrings = new HashSet<String>(); + + public ArrayList<String> urls = new ArrayList<String>(); + + public ExtractText() { + super(null); + } + + public String getText() { + StringBuffer res = new StringBuffer(); + Iterator<String> it = strings.iterator(); + while (it.hasNext()) { + if (res.length() > 0) + res.append(' '); + res.append(it.next()); + } + return res.toString(); + } + + public String getActionText() { + StringBuffer res = new StringBuffer(); + String[] strings = (String[]) actionStrings + .toArray(new String[actionStrings.size()]); + Arrays.sort(strings); + for (int i = 0; i < strings.length; i++) { + if (i > 0) + res.append('\n'); + res.append(strings[i]); + } + return res.toString(); + } + + public String[] getUrls() { + String[] res = new String[urls.size()]; + int i = 0; + Iterator<String> it = urls.iterator(); + while (it.hasNext()) { + res[i] = (String) it.next(); + i++; + } + return res; + } + + public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3, + int arg4) throws IOException { + tagDefineFontInfo(arg0, arg1, arg2, arg3); + } + + /** + * SWFTagTypes interface Save the Text Font character code info + */ + public void tagDefineFontInfo(int fontId, String fontName, int flags, + int[] codes) throws IOException { + // System.out.println("-defineFontInfo id=" + fontId + ", name=" + + // fontName); + fontCodes.put(new Integer(fontId), codes); + } + + // XXX too much hassle for too little return ... we cannot guess character + // XXX codes anyway, so we just give up. + /* + * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException { + * return null; } + */ + + /** + * SWFTagTypes interface. Save the character code info. 
+ */ + public SWFVectors tagDefineFont2(int id, int flags, String name, + int numGlyphs, int ascent, int descent, int leading, int[] codes, + int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2, + int[] kernAdjustments) throws IOException { + // System.out.println("-defineFontInfo id=" + id + ", name=" + name); + fontCodes.put(new Integer(id), (codes != null) ? codes : new int[0]); + + return null; + } + + /** + * SWFTagTypes interface. Dump any initial text in the field. + */ + public void tagDefineTextField(int fieldId, String fieldName, + String initialText, Rect boundary, int flags, AlphaColor textColor, + int alignment, int fontId, int fontSize, int charLimit, int leftMargin, + int rightMargin, int indentation, int lineSpacing) throws IOException { + if (initialText != null) { + strings.add(initialText); + } + } + + /** + * SWFTagTypes interface + */ + public SWFText tagDefineText(int id, Rect bounds, Matrix matrix) + throws IOException { + lastBounds = curBounds; + curBounds = bounds; + return new TextDumper(); + } + + Rect lastBounds = null; + Rect curBounds = null; + + /** + * SWFTagTypes interface + */ + public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix) + throws IOException { + lastBounds = curBounds; + curBounds = bounds; + return new TextDumper(); + } + + public class TextDumper implements SWFText { + protected Integer fontId; + + protected boolean firstY = true; + + public void font(int fontId, int textHeight) { + this.fontId = new Integer(fontId); + } + + public void setY(int y) { + if (firstY) + firstY = false; + else + strings.add("\n"); // Change in Y - dump a new line + } + + /* + * There are some issues with this method: sometimes SWF files define their + * own font, so short of OCR we cannot guess what is the glyph code -> + * character mapping. Additionally, some files don't use literal space + * character, instead they adjust glyphAdvances. 
We don't handle it at all - + * in such cases the text will be all glued together. + */ + public void text(int[] glyphIndices, int[] glyphAdvances) { + // System.out.println("-text id=" + fontId); + int[] codes = (int[]) fontCodes.get(fontId); + if (codes == null) { + // unknown font, better not guess + strings.add("\n**** ?????????????? ****\n"); + return; + } + + // --Translate the glyph indices to character codes + char[] chars = new char[glyphIndices.length]; + + for (int i = 0; i < chars.length; i++) { + int index = glyphIndices[i]; + + if (index >= codes.length) // System Font ? + { + chars[i] = (char) index; + } else { + chars[i] = (char) (codes[index]); + } + // System.out.println("-ch[" + i + "]='" + chars[i] + "'(" + + // (int)chars[i] + ") +" + glyphAdvances[i]); + } + strings.add(new String(chars)); + } + + public void color(Color color) { + } + + public void setX(int x) { + } + + public void done() { + strings.add("\n"); + } + } + + public SWFActions tagDoAction() throws IOException { + // ActionTextWriter actions = new ActionTextWriter(new + // PrintWriter(System.out)); + NutchSWFActions actions = new NutchSWFActions(actionStrings, urls); + return actions; + } + + public SWFActions tagDoInitAction(int arg0) throws IOException { + // ActionTextWriter actions = new ActionTextWriter(new + // PrintWriter(System.out)); + NutchSWFActions actions = new NutchSWFActions(actionStrings, urls); + return actions; + } + + public void tagGeneratorFont(byte[] arg0) throws IOException { + // TODO Auto-generated method stub + super.tagGeneratorFont(arg0); + } + + public void tagGeneratorText(byte[] arg0) throws IOException { + // TODO Auto-generated method stub + super.tagGeneratorText(arg0); + } + +} + +/** + * ActionScript parser. This parser tries to extract free text embedded inside + * the script, but without polluting it too much with names of variables, + * methods, etc. Not ideal, but it works. 
+ */ +class NutchSWFActions extends SWFActionBlockImpl implements SWFActions { + private HashSet<String> strings = null; + + private ArrayList<String> urls = null; + + String[] dict = null; + + Stack<Object> stack = null; + + public NutchSWFActions(HashSet<String> strings, ArrayList<String> urls) { + this.strings = strings; + this.urls = urls; + stack = new SmallStack(100, strings); + } + + public void lookupTable(String[] values) throws IOException { + for (int i = 0; i < values.length; i++) { + if (!strings.contains(values[i])) + strings.add(values[i]); + } + super.lookupTable(values); + dict = values; + } + + public void defineLocal() throws IOException { + stack.pop(); + super.defineLocal(); + } + + public void getURL(int vars, int mode) { + // System.out.println("-getURL: vars=" + vars + ", mode=" + mode); + } + + public void getURL(String url, String target) throws IOException { + // System.out.println("-getURL: url=" + url + ", target=" + target); + stack.push(url); + stack.push(target); + strings.remove(url); + strings.remove(target); + urls.add(url); + super.getURL(url, target); + } + + public SWFActionBlock.TryCatchFinally _try(String var) throws IOException { + // stack.push(var); + strings.remove(var); + return super._try(var); + } + + public void comment(String var) throws IOException { + // stack.push(var); + strings.remove(var); + super.comment(var); + } + + public void goToFrame(String var) throws IOException { + stack.push(var); + strings.remove(var); + super.gotoFrame(var); + } + + public void ifJump(String var) throws IOException { + strings.remove(var); + super.ifJump(var); + } + + public void jump(String var) throws IOException { + strings.remove(var); + super.jump(var); + } + + public void jumpLabel(String var) throws IOException { + strings.remove(var); + super.jumpLabel(var); + } + + public void lookup(int var) throws IOException { + if (dict != null && var >= 0 && var < dict.length) { + stack.push(dict[var]); + } + super.lookup(var); + } + 
+ public void push(String var) throws IOException { + stack.push(var); + strings.remove(var); + super.push(var); + } + + public void setTarget(String var) throws IOException { + stack.push(var); + strings.remove(var); + super.setTarget(var); + } + + public SWFActionBlock startFunction(String var, String[] params) + throws IOException { + stack.push(var); + strings.remove(var); + if (params != null) { + for (int i = 0; i < params.length; i++) { + strings.remove(params[i]); + } + } + return this; + } + + public SWFActionBlock startFunction2(String var, int arg1, int arg2, + String[] params, int[] arg3) throws IOException { + stack.push(var); + strings.remove(var); + if (params != null) { + for (int i = 0; i < params.length; i++) { + strings.remove(params[i]); + } + } + return this; + } + + public void waitForFrame(int num, String var) throws IOException { + stack.push(var); + strings.remove(var); + super.waitForFrame(num, var); + } + + public void waitForFrame(String var) throws IOException { + stack.push(var); + strings.remove(var); + super.waitForFrame(var); + } + + public void done() throws IOException { + while (stack.size() > 0) { + strings.remove(stack.pop()); + } + } + + public SWFActionBlock start(int arg0, int arg1) throws IOException { + return this; + } + + public SWFActionBlock start(int arg0) throws IOException { + return this; + } + + public void add() throws IOException { + super.add(); + } + + public void asciiToChar() throws IOException { + super.asciiToChar(); + } + + public void asciiToCharMB() throws IOException { + super.asciiToCharMB(); + } + + public void push(int var) throws IOException { + if (dict != null && var >= 0 && var < dict.length) { + stack.push(dict[var]); + } + super.push(var); + } + + public void callFunction() throws IOException { + strings.remove(stack.pop()); + super.callFunction(); + } + + public void callMethod() throws IOException { + strings.remove(stack.pop()); + super.callMethod(); + } + + public void getMember() throws 
IOException { + // 0: name + String val = (String) stack.pop(); + strings.remove(val); + super.getMember(); + } + + public void setMember() throws IOException { + // 0: value -1: name + stack.pop(); // value + String name = (String) stack.pop(); + strings.remove(name); + super.setMember(); + } + + public void setProperty() throws IOException { + super.setProperty(); + } + + public void setVariable() throws IOException { + super.setVariable(); + } + + public void call() throws IOException { + strings.remove(stack.pop()); + super.call(); + } + + public void setTarget() throws IOException { + strings.remove(stack.pop()); + super.setTarget(); + } + + public void pop() throws IOException { + strings.remove(stack.pop()); + super.pop(); + } + + public void push(boolean arg0) throws IOException { + stack.push("" + arg0); + super.push(arg0); + } + + public void push(double arg0) throws IOException { + stack.push("" + arg0); + super.push(arg0); + } + + public void push(float arg0) throws IOException { + stack.push("" + arg0); + super.push(arg0); + } + + public void pushNull() throws IOException { + stack.push(""); + super.pushNull(); + } + + public void pushRegister(int arg0) throws IOException { + stack.push("" + arg0); + super.pushRegister(arg0); + } + + public void pushUndefined() throws IOException { + stack.push("???"); + super.pushUndefined(); + } + + public void getProperty() throws IOException { + stack.pop(); + super.getProperty(); + } + + public void getVariable() throws IOException { + strings.remove(stack.pop()); + super.getVariable(); + } + + public void gotoFrame(boolean arg0) throws IOException { + stack.push("" + arg0); + super.gotoFrame(arg0); + } + + public void gotoFrame(int arg0) throws IOException { + stack.push("" + arg0); + super.gotoFrame(arg0); + } + + public void gotoFrame(String arg0) throws IOException { + stack.push("" + arg0); + strings.remove(arg0); + super.gotoFrame(arg0); + } + + public void newObject() throws IOException { + stack.pop(); + 
super.newObject(); + } + + public SWFActionBlock startWith() throws IOException { + return this; + } + +} + +/* + * Small bottom-less stack. When push() is called while the stack already + * holds more than maxSize entries, the bottom-most entry is discarded first and + * is also removed from the shared strings set. pop() returns null instead of + * throwing when the stack is empty. + */ +class SmallStack extends Stack<Object> { + + private static final long serialVersionUID = 1L; + + private int maxSize; + + private HashSet<String> strings = null; + + public SmallStack(int maxSize, HashSet<String> strings) { + this.maxSize = maxSize; + this.strings = strings; + } + + public Object push(Object o) { + // limit max size + if (this.size() > maxSize) { + String val = (String) remove(0); + strings.remove(val); + } + return super.push(o); + } + + public Object pop() { + // tolerate underruns + if (this.size() == 0) + return null; + else + return super.pop(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java new file mode 100644 index 0000000..5942e64 --- /dev/null +++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse Flash SWF files. + */ +package org.apache.nutch.parse.swf; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java new file mode 100644 index 0000000..129b85f --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.swf; + +import java.io.FileInputStream; +import java.io.InputStreamReader; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.hadoop.io.Text; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for SWFParser. + */ +public class TestSWFParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + private String[] sampleFiles = new String[] { "test1.swf", "test2.swf", + "test3.swf" }; + private String[] sampleTexts = new String[] { "test1.txt", "test2.txt", + "test3.txt" }; + + /** + * Fetches each sample SWF file through the file: protocol, parses it with + * ParseUtil and checks that the whitespace-normalised parse text equals the + * matching entry of sampleTexts (filled in by the constructor). + */ + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + Configuration conf = NutchConfiguration.create(); + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + + parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + + String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); + // assertEquals reports both the expected and the actual text on failure + Assert.assertEquals("Unexpected text extracted from " + sampleFiles[i], + sampleTexts[i], text); + } + } + + public TestSWFParser() { + for (int i = 0; i < sampleFiles.length; i++) { + try { + // read the test string + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + + 
sampleTexts[i]); + StringBuffer sb = new StringBuffer(); + int len = 0; + InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); + char[] buf = new char[1024]; + while ((len = isr.read(buf)) > 0) { + sb.append(buf, 0, len); + } + isr.close(); + sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim(); + } catch (Exception e) { + e.printStackTrace(); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.swf b/nutch-plugins/parse-swf/src/test/resources/test1.swf new file mode 100644 index 0000000..cd2019b Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test1.swf differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.txt b/nutch-plugins/parse-swf/src/test/resources/test1.txt new file mode 100644 index 0000000..68505d5 --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/resources/test1.txt @@ -0,0 +1,60 @@ + +-------- +/go/gnav_cart +/go/gnav_company +/go/gnav_devnet +/go/gnav_downloads +/go/gnav_fl_minmessage +/go/gnav_help +/go/gnav_mm_home +/go/gnav_products +/go/gnav_search?loc=en_us +/go/gnav_showcase +/go/gnav_solutions +/go/gnav_store +/go/gnav_support +/go/gnav_your_account +Acquisition Info +Adobe Home +AppleGothic +Array +Company +Developers +Downloads +Help +Home +International +LocaleManager +Macromedia Flash Player +Macromedia Home +MovieClip +Products +Showcase +Solutions +Store +String +Support +TextFormat +To ensure the best possible Internet Experience, please download the latest version of the free +Verdana +_sans +active +bluePill +button +color +company +devnet +downloads +en_us +home 
+javascript:openCrosslinkWindow('/go/adobeacquisition') +javascript:openCrosslinkWindow('/go/gnav_adobe_home') +products +rollOut +rollOver +selected +showcase +solutions +support +tabHolder +textColor http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.swf b/nutch-plugins/parse-swf/src/test/resources/test2.swf new file mode 100644 index 0000000..eb9b03d Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test2.swf differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.txt b/nutch-plugins/parse-swf/src/test/resources/test2.txt new file mode 100644 index 0000000..f77b78a --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/resources/test2.txt @@ -0,0 +1,5 @@ +Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini +-------- +TextFormat +color +font http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.swf b/nutch-plugins/parse-swf/src/test/resources/test3.swf new file mode 100644 index 0000000..4df9f1e Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test3.swf differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.txt b/nutch-plugins/parse-swf/src/test/resources/test3.txt 
new file mode 100644 index 0000000..66ae3d8 --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/resources/test3.txt @@ -0,0 +1,11 @@ +Mix. + Edit. + Master. + Compose. + Animate. + With a single suite of powerful tools + that work together as one. + World-class video and audio tools that bring + new power and efficiency to your film, video, + DVD, and web workflows. + Learn more. http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build-ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/build-ivy.xml b/nutch-plugins/parse-tika/build-ivy.xml new file mode 100644 index 0000000..e4984d8 --- /dev/null +++ b/nutch-plugins/parse-tika/build-ivy.xml @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="ivy.install.version" value="2.1.0" /> + <!-- expose environment variables as env.* so the IVY_HOME check below can take effect --> + <property environment="env" /> + <condition property="ivy.home" value="${env.IVY_HOME}"> + <isset property="env.IVY_HOME" /> + </condition> + <property name="ivy.home" value="${user.home}/.ant" /> + <property name="ivy.checksums" value="" /> + <property name="ivy.jar.dir" value="${ivy.home}/lib" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + + <target name="download-ivy" unless="offline"> + + <mkdir dir="${ivy.jar.dir}"/> + <!-- download Ivy from Maven Central (served over HTTPS only) so that it can be used even without any special installation --> + <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" + dest="${ivy.jar.file}" usetimestamp="true"/> + </target> + + <target name="init-ivy" depends="download-ivy"> + <!-- try to load ivy here from ivy home, in case the user has not already dropped + it into ant's lib dir (note that the latter copy will always take precedence). + We will not fail as long as local lib dir exists (it may be empty) and + ivy is in at least one of ant's lib dir or the local lib dir. 
--> + <path id="ivy.lib.path"> + <fileset dir="${ivy.jar.dir}" includes="*.jar"/> + + </path> + <taskdef resource="org/apache/ivy/ant/antlib.xml" + uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> + </target> + + <target name="deps-jar" depends="init-ivy"> + <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/build.xml b/nutch-plugins/parse-tika/build.xml new file mode 100644 index 0000000..4ecb3f8 --- /dev/null +++ b/nutch-plugins/parse-tika/build.xml @@ -0,0 +1,55 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-tika" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-nekohtml" /> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-nekohtml/*.jar" /> + </fileset> + </path> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <ant target="deploy" inheritall="false" dir="../protocol-file"/> + <ant target="deploy" inheritall="false" dir="../lib-nekohtml" /> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.rss"/> + <include name="*.rtf"/> + <include name="*.pdf"/> + <include name="ootest.*"/> + <include name="*.doc"/> + <include name="*.gif"/> + </fileset> + </copy> + + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/howto_upgrade_tika.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/howto_upgrade_tika.txt b/nutch-plugins/parse-tika/howto_upgrade_tika.txt new file mode 100644 index 0000000..63a05a4 --- /dev/null +++ b/nutch-plugins/parse-tika/howto_upgrade_tika.txt @@ -0,0 +1,8 @@ +1. Upgrade Tika dependency in trunk/ivy/ivy.xml + +2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml + +3. 
Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml + To get the list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/"\/>/g' http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/ivy.xml b/nutch-plugins/parse-tika/ivy.xml new file mode 100644 index 0000000..7a9e959 --- /dev/null +++ b/nutch-plugins/parse-tika/ivy.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="org.apache.tika" name="tika-parsers" rev="1.12" conf="*->default"> + <exclude org="org.apache.tika" name="tika-core" /> + <exclude org="org.apache.httpcomponents" name="httpclient" /> + <exclude org="org.apache.httpcomponents" name="httpcore" /> + </dependency> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/plugin.xml b/nutch-plugins/parse-tika/plugin.xml new file mode 100644 index 0000000..04fcd2e --- /dev/null +++ b/nutch-plugins/parse-tika/plugin.xml @@ -0,0 +1,136 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="parse-tika" + name="Tika Parser Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-tika.jar"> + <export name="*"/> + </library> + <library name="apache-mime4j-core-0.7.2.jar"/> + <library name="apache-mime4j-dom-0.7.2.jar"/> + <library name="asm-5.0.4.jar"/> + <library name="aspectjrt-1.8.0.jar"/> + <library name="bcmail-jdk15on-1.52.jar"/> + <library name="bcpkix-jdk15on-1.52.jar"/> + <library name="bcprov-jdk15on-1.52.jar"/> + <library name="boilerpipe-1.1.0.jar"/> + <library name="bzip2-0.9.1.jar"/> + <library name="c3p0-0.9.1.1.jar"/> + <library name="cdm-4.5.5.jar"/> + <library name="commons-codec-1.6.jar"/> + <library name="commons-compress-1.10.jar"/> + <library name="commons-csv-1.0.jar"/> + <library name="commons-exec-1.3.jar"/> + <library name="commons-io-2.4.jar"/> + <library name="commons-lang-2.6.jar"/> + <library name="commons-logging-1.1.3.jar"/> + <library name="commons-logging-api-1.1.jar"/> + <library name="commons-vfs2-2.0.jar"/> + <library name="cxf-core-3.0.3.jar"/> + <library name="cxf-rt-frontend-jaxrs-3.0.3.jar"/> + <library name="cxf-rt-rs-client-3.0.3.jar"/> + <library name="cxf-rt-transports-http-3.0.3.jar"/> + <library name="ehcache-core-2.6.2.jar"/> + <library name="fontbox-1.8.10.jar"/> + <library name="geoapi-3.0.0.jar"/> + <library name="grib-4.5.5.jar"/> + <library name="gson-2.2.4.jar"/> + <library name="guava-17.0.jar"/> + <library name="httpmime-4.2.6.jar"/> + <library name="httpservices-4.5.5.jar"/> + <library name="isoparser-1.0.2.jar"/> + <library name="jackcess-2.1.2.jar"/> + <library name="jackcess-encrypt-2.1.1.jar"/> + <library name="java-libpst-0.8.1.jar"/> + <library name="javax.annotation-api-1.2.jar"/> + <library name="javax.ws.rs-api-2.0.1.jar"/> + <library name="jcip-annotations-1.0.jar"/> + <library name="jcommander-1.35.jar"/> + <library name="jdom-2.0.2.jar"/> + <library name="jdom2-2.0.4.jar"/> + <library name="jempbox-1.8.10.jar"/> + <library 
name="jhighlight-1.0.2.jar"/> + <library name="jj2000-5.2.jar"/> + <library name="jmatio-1.0.jar"/> + <library name="jna-4.1.0.jar"/> + <library name="joda-time-2.2.jar"/> + <library name="json-20140107.jar"/> + <library name="json-simple-1.1.1.jar"/> + <library name="jsoup-1.7.2.jar"/> + <library name="jsr-275-0.9.3.jar"/> + <library name="juniversalchardet-1.0.3.jar"/> + <library name="junrar-0.7.jar"/> + <library name="jwnl-1.3.3.jar"/> + <library name="maven-scm-api-1.4.jar"/> + <library name="maven-scm-provider-svn-commons-1.4.jar"/> + <library name="maven-scm-provider-svnexe-1.4.jar"/> + <library name="metadata-extractor-2.8.0.jar"/> + <library name="netcdf4-4.5.5.jar"/> + <library name="opennlp-maxent-3.0.3.jar"/> + <library name="opennlp-tools-1.5.3.jar"/> + <library name="pdfbox-1.8.10.jar"/> + <library name="plexus-utils-1.5.6.jar"/> + <library name="poi-3.13.jar"/> + <library name="poi-ooxml-3.13.jar"/> + <library name="poi-ooxml-schemas-3.13.jar"/> + <library name="poi-scratchpad-3.13.jar"/> + <library name="protobuf-java-2.5.0.jar"/> + <library name="quartz-2.2.0.jar"/> + <library name="regexp-1.3.jar"/> + <library name="rome-1.5.1.jar"/> + <library name="rome-utils-1.5.1.jar"/> + <library name="sis-metadata-0.5.jar"/> + <library name="sis-netcdf-0.5.jar"/> + <library name="sis-referencing-0.5.jar"/> + <library name="sis-storage-0.5.jar"/> + <library name="sis-utility-0.5.jar"/> + <library name="slf4j-api-1.7.12.jar"/> + <library name="stax2-api-3.1.4.jar"/> + <library name="tagsoup-1.2.1.jar"/> + <library name="tika-parsers-1.12.jar"/> + <library name="udunits-4.5.5.jar"/> + <library name="vorbis-java-core-0.6.jar"/> + <library name="vorbis-java-tika-0.6.jar"/> + <library name="woodstox-core-asl-4.4.1.jar"/> + <library name="xmlbeans-2.6.0.jar"/> + <library name="xmlschema-core-2.1.0.jar"/> + <library name="xmpcore-5.1.2.jar"/> + <library name="xz-1.5.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import 
plugin="lib-nekohtml"/> + </requires> + + <extension point="org.apache.nutch.parse.Parser" + id="org.apache.nutch.parse.tika" + name="TikaParser"> + + <implementation id="org.apache.nutch.parse.tika.TikaParser" + class="org.apache.nutch.parse.tika.TikaParser"> + <parameter name="contentType" value="*"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/pom.xml b/nutch-plugins/parse-tika/pom.xml new file mode 100644 index 0000000..0cf2340 --- /dev/null +++ b/nutch-plugins/parse-tika/pom.xml @@ -0,0 +1,54 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-tika</artifactId> + <packaging>jar</packaging> + + <name>parse-tika</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>1.13</version> + <exclusions> + <!-- TODO --> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-nekohtml</artifactId> + <version>${project.parent.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java new file mode 100644 index 0000000..7c0d71b --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.tika; + +import java.lang.ClassLoader; +import java.lang.InstantiationException; +import java.util.HashMap; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.parser.html.BoilerpipeContentHandler; +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.extractors.*; + +class BoilerpipeExtractorRepository { + + public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class); + public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<String, BoilerpipeExtractor>(); + + /** + * Returns an instance of the specified extractor + */ + public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) { + // Check if there's no instance of this extractor + if (!extractorRepository.containsKey(boilerpipeExtractorName)) { + // FQCN + boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." 
+ boilerpipeExtractorName; + + // Attempt to load the class + try { + ClassLoader loader = BoilerpipeExtractor.class.getClassLoader(); + Class extractorClass = loader.loadClass(boilerpipeExtractorName); + + // Add an instance to the repository + extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance()); + + } catch (ClassNotFoundException e) { + LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!"); + } catch (InstantiationException e) { + LOG.error("Could not instantiate " + boilerpipeExtractorName); + } catch (Exception e) { + LOG.error(e); + } + } + + return extractorRepository.get(boilerpipeExtractorName); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java new file mode 100644 index 0000000..77a1044 --- /dev/null +++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java @@ -0,0 +1,794 @@ +/* + * XXX [email protected]: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to + * avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $ + */ +package org.apache.nutch.parse.tika; + +import java.util.Stack; + +import org.w3c.dom.Comment; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.Text; +import org.w3c.dom.CDATASection; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.ext.LexicalHandler; + +/** + * This class takes SAX events (in addition to some extra events that SAX + * doesn't handle yet) and adds the result to a document or document fragment. + */ +class DOMBuilder implements ContentHandler, LexicalHandler { + private boolean upperCaseElementNames = true; + + /** Root document */ + public Document m_doc; + + /** Current node */ + protected Node m_currentNode = null; + + /** First node of document fragment or null if not a DocumentFragment */ + public DocumentFragment m_docFrag = null; + + /** Vector of element nodes */ + protected Stack<Element> m_elemStack = new Stack<Element>(); + + /** + * Element recorded with this namespace will be converted to Node without a + * namespace + */ + private String defaultNamespaceURI = null; + + /** + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document fragment. + * + * @param doc + * Root document + * @param node + * Current node + */ + DOMBuilder(Document doc, Node node) { + m_doc = doc; + m_currentNode = node; + } + + /** + * DOMBuilder instance constructor... 
it will add the DOM nodes to the + * document fragment. + * + * @param doc + * Root document + * @param docFrag + * Document fragment + */ + DOMBuilder(Document doc, DocumentFragment docFrag) { + m_doc = doc; + m_docFrag = docFrag; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document. + * + * @param doc + * Root document + */ + DOMBuilder(Document doc) { + m_doc = doc; + } + + /** + * Get the root node of the DOM being created. This is either a Document or a + * DocumentFragment. + * + * @return The root document or document fragment if not null + */ + Node getRootNode() { + return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; + } + + /** + * Get the node currently being processed. + * + * @return the current node being processed + */ + Node getCurrentNode() { + return m_currentNode; + } + + /** + * Return null since there is no Writer for this class. + * + * @return null + */ + java.io.Writer getWriter() { + return null; + } + + /** + * Append a node to the current container. + * + * @param newNode + * New node to append + */ + protected void append(Node newNode) throws org.xml.sax.SAXException { + + Node currentNode = m_currentNode; + + if (null != currentNode) { + currentNode.appendChild(newNode); + + // System.out.println(newNode.getNodeName()); + } else if (null != m_docFrag) { + m_docFrag.appendChild(newNode); + } else { + boolean ok = true; + short type = newNode.getNodeType(); + + if (type == Node.TEXT_NODE) { + String data = newNode.getNodeValue(); + + if ((null != data) && (data.trim().length() > 0)) { + throw new org.xml.sax.SAXException( + "Warning: can't output text before document element! 
Ignoring..."); + } + + ok = false; + } else if (type == Node.ELEMENT_NODE) { + if (m_doc.getDocumentElement() != null) { + throw new org.xml.sax.SAXException( + "Can't have more than one root on a DOM!"); + } + } + + if (ok) + m_doc.appendChild(newNode); + } + } + + /** + * Receive an object for locating the origin of SAX document events. + * + * <p> + * SAX parsers are strongly encouraged (though not absolutely required) to + * supply a locator: if it does so, it must supply the locator to the + * application by invoking this method before invoking any of the other + * methods in the ContentHandler interface. + * </p> + * + * <p> + * The locator allows the application to determine the end position of any + * document-related event, even if the parser is not reporting an error. + * Typically, the application will use this information for reporting its own + * errors (such as character content that does not match an application's + * business rules). The information returned by the locator is probably not + * sufficient for use with a search engine. + * </p> + * + * <p> + * Note that the locator will return correct information only during the + * invocation of the events in this interface. The application should not + * attempt to use it at any other time. + * </p> + * + * @param locator + * An object that can return the location of any SAX document event. + * @see org.xml.sax.Locator + */ + public void setDocumentLocator(Locator locator) { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of a document. + * + * <p> + * The SAX parser will invoke this method only once, before any other methods + * in this interface or in DTDHandler (except for setDocumentLocator). + * </p> + */ + public void startDocument() throws org.xml.sax.SAXException { + + // No action for the moment. + } + + /** + * Receive notification of the end of a document. 
+ * + * <p> + * The SAX parser will invoke this method only once, and it will be the last + * method invoked during the parse. The parser shall not invoke this method + * until it has either abandoned parsing (because of an unrecoverable error) + * or reached the end of input. + * </p> + */ + public void endDocument() throws org.xml.sax.SAXException { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of an element. + * + * <p> + * The Parser will invoke this method at the beginning of every element in the + * XML document; there will be a corresponding endElement() event for every + * startElement() event (even when the element is empty). All of the element's + * content will be reported, in order, before the corresponding endElement() + * event. + * </p> + * + * <p> + * If the element name has a namespace prefix, the prefix will still be + * attached. Note that the attribute list provided will contain only + * attributes with explicit values (specified or defaulted): #IMPLIED + * attributes will be omitted. + * </p> + * + * + * @param ns + * The namespace of the node + * @param localName + * The local part of the qualified name + * @param name + * The element name. + * @param atts + * The attributes attached to the element, if any. + * @see #endElement + * @see org.xml.sax.Attributes + */ + public void startElement(String ns, String localName, String name, + Attributes atts) throws org.xml.sax.SAXException { + + Element elem; + + if (upperCaseElementNames) + name = name.toUpperCase(); + + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. 
+ if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI)) + elem = m_doc.createElementNS(null, name); + else + elem = m_doc.createElementNS(ns, name); + + append(elem); + + try { + int nAtts = atts.getLength(); + + if (0 != nAtts) { + for (int i = 0; i < nAtts; i++) { + + // System.out.println("type " + atts.getType(i) + " name " + + // atts.getLocalName(i) ); + // First handle a possible ID attribute + if (atts.getType(i).equalsIgnoreCase("ID")) + setIDAttribute(atts.getValue(i), elem); + + String attrNS = atts.getURI(i); + + if ("".equals(attrNS)) + attrNS = null; // DOM represents no-namespace as null + + // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // Crimson won't let us set an xmlns: attribute on the DOM. + String attrQName = atts.getQName(i); + + // In SAX, xmlns: attributes have an empty namespace, while in DOM + // they should have the xmlns namespace + if (attrQName.startsWith("xmlns:")) + attrNS = "http://www.w3.org/2000/xmlns/"; + + // ALWAYS use the DOM Level 2 call! + elem.setAttributeNS(attrNS, attrQName, atts.getValue(i)); + } + } + + // append(elem); + + m_elemStack.push(elem); + + m_currentNode = elem; + + // append(elem); + } catch (java.lang.Exception de) { + // de.printStackTrace(); + throw new org.xml.sax.SAXException(de); + } + + } + + /** + * + * + * + * Receive notification of the end of an element. + * + * <p> + * The SAX parser will invoke this method at the end of every element in the + * XML document; there will be a corresponding startElement() event for every + * endElement() event (even when the element is empty). + * </p> + * + * <p> + * If the element name has a namespace prefix, the prefix will still be + * attached to the name. 
+ * </p> + * + * + * @param ns + * the namespace of the element + * @param localName + * The local part of the qualified name of the element + * @param name + * The element name + */ + public void endElement(String ns, String localName, String name) + throws org.xml.sax.SAXException { + if (!m_elemStack.isEmpty()) { + m_elemStack.pop(); + } + m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek(); + } + + /** + * Set an ID string to node association in the ID table. + * + * @param id + * The ID string. + * @param elem + * The associated ID. + */ + public void setIDAttribute(String id, Element elem) { + + // Do nothing. This method is meant to be overiden. + } + + /** + * Receive notification of character data. + * + * <p> + * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * <p> + * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. 
+ * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void characters(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + if (m_inCData) { + cdata(ch, start, length); + + return; + } + + String s = new String(ch, start, length); + Node childNode; + childNode = m_currentNode != null ? m_currentNode.getLastChild() : null; + if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) { + ((Text) childNode).appendData(s); + } else { + Text text = m_doc.createTextNode(s); + append(text); + } + } + + /** + * If available, when the disable-output-escaping attribute is used, output + * raw text without escaping. A PI will be inserted in front of the node with + * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom". + * + * @param ch + * Array containing the characters + * @param start + * Index to start of characters in the array + * @param length + * Number of characters in the array + */ + public void charactersRaw(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createProcessingInstruction("xslt-next-is-raw", + "formatter-to-dom")); + append(m_doc.createTextNode(s)); + } + + /** + * Report the beginning of an entity. + * + * The start and end of the document entity are not reported. The start and + * end of the external DTD subset are reported using the pseudo-name "[dtd]". + * All other events must be properly nested within start/end entity events. + * + * @param name + * The name of the entity. If it is a parameter entity, the name will + * begin with '%'. 
+ * @see #endEntity + * @see org.xml.sax.ext.DeclHandler#internalEntityDecl + * @see org.xml.sax.ext.DeclHandler#externalEntityDecl + */ + public void startEntity(String name) throws org.xml.sax.SAXException { + + // Almost certainly the wrong behavior... + // entityReference(name); + } + + /** + * Report the end of an entity. + * + * @param name + * The name of the entity that is ending. + * @see #startEntity + */ + public void endEntity(String name) throws org.xml.sax.SAXException { + } + + /** + * Receive notivication of a entityReference. + * + * @param name + * name of the entity reference + */ + public void entityReference(String name) throws org.xml.sax.SAXException { + append(m_doc.createEntityReference(name)); + } + + /** + * Receive notification of ignorable whitespace in element content. + * + * <p> + * Validating Parsers must use this method to report each chunk of ignorable + * whitespace (see the W3C XML 1.0 recommendation, section 2.10): + * non-validating parsers may also use this method if they are capable of + * parsing and using content models. + * </p> + * + * <p> + * SAX parsers may return all contiguous whitespace in a single chunk, or they + * may split it into several chunks; however, all of the characters in any + * single event must come from the same external entity, so that the Locator + * provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. 
+ * @see #characters + */ + public void ignorableWhitespace(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createTextNode(s)); + } + + /** + * Tell if the current node is outside the document element. + * + * @return true if the current node is outside the document element. + */ + private boolean isOutsideDocElem() { + return (null == m_docFrag) + && m_elemStack.size() == 0 + && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } + + /** + * Receive notification of a processing instruction. + * + * <p> + * The Parser will invoke this method once for each processing instruction + * found: note that processing instructions may occur before or after the main + * document element. + * </p> + * + * <p> + * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) + * or a text declaration (XML 1.0, section 4.3.1) using this method. + * </p> + * + * @param target + * The processing instruction target. + * @param data + * The processing instruction data, or null if none was supplied. + */ + public void processingInstruction(String target, String data) + throws org.xml.sax.SAXException { + append(m_doc.createProcessingInstruction(target, data)); + } + + /** + * Report an XML comment anywhere in the document. + * + * This callback will be used for comments inside or outside the document + * element, including comments in the external DTD subset (if read). + * + * @param ch + * An array holding the characters in the comment. + * @param start + * The starting position in the array. + * @param length + * The number of characters to use from the array. 
+ */ + public void comment(char ch[], int start, int length) + throws org.xml.sax.SAXException { + // tagsoup sometimes submits invalid values here + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) + return; + append(m_doc.createComment(new String(ch, start, length))); + } + + /** Flag indicating that we are processing a CData section */ + protected boolean m_inCData = false; + + /** + * Report the start of a CDATA section. + * + * @see #endCDATA + */ + public void startCDATA() throws org.xml.sax.SAXException { + m_inCData = true; + append(m_doc.createCDATASection("")); + } + + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + public void endCDATA() throws org.xml.sax.SAXException { + m_inCData = false; + } + + /** + * Receive notification of cdata. + * + * <p> + * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * <p> + * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. 
+ * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void cdata(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + // XXX [email protected]: modified from the original, to accomodate TagSoup. + Node n = m_currentNode.getLastChild(); + if (n instanceof CDATASection) + ((CDATASection) n).appendData(s); + else if (n instanceof Comment) + ((Comment) n).appendData(s); + } + + /** + * Report the start of DTD declarations, if any. + * + * Any declarations are assumed to be in the internal subset unless otherwise + * indicated. + * + * @param name + * The document type name. + * @param publicId + * The declared public identifier for the external DTD subset, or + * null if none was declared. + * @param systemId + * The declared system identifier for the external DTD subset, or + * null if none was declared. + * @see #endDTD + * @see #startEntity + */ + public void startDTD(String name, String publicId, String systemId) + throws org.xml.sax.SAXException { + + // Do nothing for now. + } + + /** + * Report the end of DTD declarations. + * + * @see #startDTD + */ + public void endDTD() throws org.xml.sax.SAXException { + + // Do nothing for now. + } + + /** + * Begin the scope of a prefix-URI Namespace mapping. + * + * <p> + * The information from this event is not necessary for normal Namespace + * processing: the SAX XML reader will automatically replace prefixes for + * element and attribute names when the http://xml.org/sax/features/namespaces + * feature is true (the default). 
+ * </p> + * + * <p> + * There are cases, however, when applications need to use prefixes in + * character data or in attribute values, where they cannot safely be expanded + * automatically; the start/endPrefixMapping event supplies the information to + * the application to expand prefixes in those contexts itself, if necessary. + * </p> + * + * <p> + * Note that start/endPrefixMapping events are not guaranteed to be properly + * nested relative to each-other: all startPrefixMapping events will occur + * before the corresponding startElement event, and all endPrefixMapping + * events will occur after the corresponding endElement event, but their order + * is not guaranteed. + * </p> + * + * @param prefix + * The Namespace prefix being declared. + * @param uri + * The Namespace URI the prefix is mapped to. + * @see #endPrefixMapping + * @see #startElement + */ + public void startPrefixMapping(String prefix, String uri) + throws org.xml.sax.SAXException { + + /* + * // Not sure if this is needed or wanted // Also, it fails in the stree. + * if((null != m_currentNode) && (m_currentNode.getNodeType() == + * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) && + * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname + * = "xmlns:"+prefix; + * + * Element elem = (Element)m_currentNode; String val = + * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null) + * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname, + * uri); } } + */ + } + + /** + * End the scope of a prefix-URI mapping. + * + * <p> + * See startPrefixMapping for details. This event will always occur after the + * corresponding endElement event, but the order of endPrefixMapping events is + * not otherwise guaranteed. + * </p> + * + * @param prefix + * The prefix that was being mapping. 
+ * @see #startPrefixMapping + * @see #endElement + */ + public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { + } + + /** + * Receive notification of a skipped entity. + * + * <p> + * The Parser will invoke this method once for each entity skipped. + * Non-validating processors may skip entities if they have not seen the + * declarations (because, for example, the entity was declared in an external + * DTD subset). All processors may skip external entities, depending on the + * values of the http://xml.org/sax/features/external-general-entities and the + * http://xml.org/sax/features/external-parameter-entities properties. + * </p> + * + * @param name + * The name of the skipped entity. If it is a parameter entity, the + * name will begin with '%'. + */ + public void skippedEntity(String name) throws org.xml.sax.SAXException { + } + + public boolean isUpperCaseElementNames() { + return upperCaseElementNames; + } + + public void setUpperCaseElementNames(boolean upperCaseElementNames) { + this.upperCaseElementNames = upperCaseElementNames; + } + + public String getDefaultNamespaceURI() { + return defaultNamespaceURI; + } + + public void setDefaultNamespaceURI(String defaultNamespaceURI) { + this.defaultNamespaceURI = defaultNamespaceURI; + } +}
