Author: ab Date: Fri Feb 3 10:49:07 2006 New Revision: 374724 URL: http://svn.apache.org/viewcvs?rev=374724&view=rev Log: Add a parse plugin for SWF (Macromedia Flash) files.
Add a mapping in parse-plugins.xml . Add also an entry in mime-types.xml corresponding to an alternative, compressed SWF file format. This work has been sponsored by Zaheed Haque. Thank you! Added: lucene/nutch/trunk/src/plugin/parse-swf/ lucene/nutch/trunk/src/plugin/parse-swf/build.xml (with props) lucene/nutch/trunk/src/plugin/parse-swf/lib/ lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf-LICENSE.txt (with props) lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf.jar (with props) lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml (with props) lucene/nutch/trunk/src/plugin/parse-swf/sample/ lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.swf (with props) lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt (with props) lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.swf (with props) lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt (with props) lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.swf (with props) lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.txt (with props) lucene/nutch/trunk/src/plugin/parse-swf/src/ lucene/nutch/trunk/src/plugin/parse-swf/src/java/ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (with props) lucene/nutch/trunk/src/plugin/parse-swf/src/test/ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/ 
lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (with props) Modified: lucene/nutch/trunk/conf/mime-types.xml lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/conf/mime-types.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/mime-types.xml?rev=374724&r1=374723&r2=374724&view=diff ============================================================================== --- lucene/nutch/trunk/conf/mime-types.xml (original) +++ lucene/nutch/trunk/conf/mime-types.xml Fri Feb 3 10:49:07 2006 @@ -255,6 +255,8 @@ description="Macromedia Flash Format File"> <ext>swf</ext> <magic offset="0" value="FWS"/> + <!-- compressed --> + <magic offset="0" value="CWS"/> </mime-type> <mime-type name="application/x-stuffit" Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=374724&r1=374723&r2=374724&view=diff ============================================================================== --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Fri Feb 3 10:49:07 2006 @@ -114,6 +114,10 @@ <plugin id="parse-text" /> </mimeType> + <mimeType name="application/x-shockwave-flash"> + <plugin id="parse-swf" /> + </mimeType> + <mimeType name="application/x-tcl"> <plugin id="parse-text" /> </mimeType> Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=374724&r1=374723&r2=374724&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Fri Feb 3 10:49:07 2006 @@ -29,6 +29,7 @@ <ant dir="parse-pdf" target="deploy"/> <ant dir="parse-rss" target="deploy"/> <!-- <ant dir="parse-rtf" target="deploy"/> --> + <ant dir="parse-swf" target="deploy"/> <ant 
dir="parse-text" target="deploy"/> <ant dir="parse-zip" target="deploy"/> <ant dir="query-basic" target="deploy"/> @@ -55,6 +56,7 @@ <ant dir="parse-pdf" target="test"/> <ant dir="parse-rss" target="test"/> <!-- <ant dir="parse-rtf" target="test"/> --> + <ant dir="parse-swf" target="test"/> <ant dir="parse-zip" target="test"/> </target> @@ -87,6 +89,7 @@ <ant dir="parse-pdf" target="clean"/> <ant dir="parse-rss" target="clean"/> <ant dir="parse-rtf" target="clean"/> + <ant dir="parse-swf" target="clean"/> <ant dir="parse-text" target="clean"/> <ant dir="parse-zip" target="clean"/> <ant dir="query-basic" target="clean"/> Added: lucene/nutch/trunk/src/plugin/parse-swf/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/build.xml?rev=374724&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/build.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/build.xml Fri Feb 3 10:49:07 2006 @@ -0,0 +1,16 @@ +<?xml version="1.0"?> + +<project name="parse-swf" default="jar"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy file="sample/test1.swf" todir="${build.test}/data"/> + <copy file="sample/test2.swf" todir="${build.test}/data"/> + <copy file="sample/test3.swf" todir="${build.test}/data"/> + <copy file="sample/test1.txt" todir="${build.test}/data"/> + <copy file="sample/test2.txt" todir="${build.test}/data"/> + <copy file="sample/test3.txt" todir="${build.test}/data"/> + +</project> Propchange: lucene/nutch/trunk/src/plugin/parse-swf/build.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf-LICENSE.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf-LICENSE.txt?rev=374724&view=auto 
============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf-LICENSE.txt (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf-LICENSE.txt Fri Feb 3 10:49:07 2006 @@ -0,0 +1,33 @@ + + Copyright (c) 2001-2005, David N. Main, All rights reserved. + + Redistribution and use in source and binary forms, with or + without modification, are permitted provided that the + following conditions are met: + + 1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + 2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. The name of the author may not be used to endorse or + promote products derived from this software without specific + prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ Propchange: lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf-LICENSE.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf.jar?rev=374724&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-swf/lib/javaswf.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml?rev=374724&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml Fri Feb 3 10:49:07 2006 @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="parse-swf" + name="SWF Parse Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="parse-swf.jar"> + <export name="*"/> + </library> + <library name="javaswf.jar"/> + </runtime> + + <extension id="org.apache.nutch.parse.swf" + name="SWFParse" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.swf.SWFParser" + class="org.apache.nutch.parse.swf.SWFParser" + contentType="application/x-shockwave-flash" + pathSuffix="swf"/> + </extension> + +</plugin> Propchange: lucene/nutch/trunk/src/plugin/parse-swf/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.swf URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.swf?rev=374724&view=auto 
============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.swf ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt?rev=374724&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt Fri Feb 3 10:49:07 2006 @@ -0,0 +1,60 @@ + +-------- +Help +javascript:openCrosslinkWindow('/go/adobeacquisition') +Macromedia Home +/go/gnav_search?loc=en_us +MovieClip +solutions +/go/gnav_showcase +_sans +rollOut +To ensure the best possible Internet Experience, please download the latest version of the free +/go/gnav_store +International +Products +devnet +en_us +/go/gnav_products +AppleGothic +Macromedia Flash Player +active +products +String +Store +downloads +rollOver +Adobe Home +/go/gnav_your_account +/go/gnav_downloads +Showcase +bluePill +/go/gnav_company +/go/gnav_support +/go/gnav_help +javascript:openCrosslinkWindow('/go/gnav_adobe_home') +home +Home +Array +/go/gnav_fl_minmessage +textColor +Developers +Support +color +support +showcase +button +/go/gnav_mm_home +tabHolder +selected +Solutions +LocaleManager +Verdana +/go/gnav_devnet +Acquisition Info +/go/gnav_cart +Company +/go/gnav_solutions +company +Downloads +TextFormat Propchange: lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.swf URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.swf?rev=374724&view=auto 
============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.swf ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt?rev=374724&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt Fri Feb 3 10:49:07 2006 @@ -0,0 +1,5 @@ +Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini +-------- +font +color +TextFormat Propchange: lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.swf URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.swf?rev=374724&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.swf ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.txt?rev=374724&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.txt (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.txt Fri Feb 3 10:49:07 2006 @@ -0,0 +1,11 @@ +Mix. + Edit. 
+ Master. + Compose. + Animate. + With a single suite of powerful tools + that work together as one. + World-class video and audio tools that bring + new power and efficiency to your film, video, + DVD, and web workflows. + Learn more. Propchange: lucene/nutch/trunk/src/plugin/parse-swf/sample/test3.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=374724&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Fri Feb 3 10:49:07 2006 @@ -0,0 +1,699 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.swf; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.*; +import java.util.logging.Logger; + +import org.apache.nutch.parse.*; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.util.LogFormatter; +import org.apache.nutch.util.NutchConf; + +import com.anotherbigidea.flash.interfaces.*; +import com.anotherbigidea.flash.readers.*; +import com.anotherbigidea.flash.structs.*; +import com.anotherbigidea.flash.writers.SWFActionBlockImpl; +import com.anotherbigidea.flash.writers.SWFTagTypesImpl; +import com.anotherbigidea.io.InStream; + +/** + * Parser for Flash SWF files. Loosely based on the sample in JavaSWF + * distribution. + * + * @author Andrzej Bialecki + */ +public class SWFParser implements Parser { + public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.parse.swf"); + + private NutchConf nutchConf = null; + + public SWFParser() {} + + public void setConf(NutchConf conf) { + this.nutchConf = conf; + } + + public NutchConf getConf() { + return nutchConf; + } + + public Parse getParse(Content content) { + + String text = null; + // collect meta data + ContentProperties metadata = new ContentProperties(); + metadata.putAll(content.getMetadata()); // copy through + Vector outlinks = new Vector(); + + try { + + byte[] raw = content.getContent(); + + String contentLength = content.get("Content-Length"); + if (contentLength != null && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + + " bytes. 
Parser can't handle incomplete files.").getEmptyParse(nutchConf); + } + ExtractText extractor = new ExtractText(); + + // TagParser implements SWFTags and drives a SWFTagTypes interface + TagParser parser = new TagParser(extractor); + // use this instead to debug the file + // TagParser parser = new TagParser( new SWFTagDumper(true, true) ); + + // SWFReader reads an input file and drives a SWFTags interface + SWFReader reader = new SWFReader(parser, new InStream(raw)); + + // read the input SWF file and pass it through the interface pipeline + reader.readFile(); + text = extractor.getText(); + String atext = extractor.getActionText(); + if (atext != null && atext.length() > 0) text += "\n--------\n" + atext; + // harvest potential outlinks + String[] links = extractor.getUrls(); + for (int i = 0; i < links.length; i++) { + Outlink out = new Outlink(links[i], "", nutchConf); + outlinks.add(out); + } + Outlink[] olinks = OutlinkExtractor.getOutlinks(text, nutchConf); + if (olinks != null) for (int i = 0; i < olinks.length; i++) { + outlinks.add(olinks[i]); + } + } catch (Exception e) { // run time exception + e.printStackTrace(); + return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParse(nutchConf); + } finally {} + if (text == null) text = ""; + + Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, metadata); + return new ParseImpl(text, parseData); + } + + /** + * Arguments are: 0. Name of input SWF file. 
+ */ + public static void main(String[] args) throws IOException { + FileInputStream in = new FileInputStream(args[0]); + + byte[] buf = new byte[in.available()]; + in.read(buf); + SWFParser parser = new SWFParser(); + Parse p = parser.getParse(new Content("file:" + args[0], "file:" + args[0], buf, "application/x-shockwave-flash", + new ContentProperties(), new NutchConf())); + System.out.println("Parse Text:"); + System.out.println(p.getText()); + System.out.println("Parse Data:"); + System.out.println(p.getData()); + } +} + +/** + * Shows how to parse a Flash movie and extract all the text in Text symbols and + * the initial text in Edit Fields. Output is to System.out. + * + * A "pipeline" is set up in the main method: + * + * SWFReader-->TagParser-->ExtractText + * + * SWFReader reads the input SWF file and separates out the header and the tags. + * The separated contents are passed to TagParser which parses out the + * individual tag types and passes them to ExtractText. + * + * ExtractText extends SWFTagTypesImpl and overrides some methods. + */ +class ExtractText extends SWFTagTypesImpl { + /** + * Store font info keyed by the font symbol id. Each entry is an int[] of + * character codes for the corresponding font glyphs (An empty array denotes a + * System Font). 
+ */ + protected HashMap fontCodes = new HashMap(); + + public ArrayList strings = new ArrayList(); + + public HashSet actionStrings = new HashSet(); + + public ArrayList urls = new ArrayList(); + + public ExtractText() { + super(null); + } + + public String getText() { + StringBuffer res = new StringBuffer(); + Iterator it = strings.iterator(); + while (it.hasNext()) { + if (res.length() > 0) res.append(' '); + res.append(it.next()); + } + return res.toString(); + } + + public String getActionText() { + StringBuffer res = new StringBuffer(); + Iterator it = actionStrings.iterator(); + while (it.hasNext()) { + if (res.length() > 0) res.append('\n'); + res.append(it.next()); + } + return res.toString(); + } + + public String[] getUrls() { + String[] res = new String[urls.size()]; + int i = 0; + Iterator it = urls.iterator(); + while (it.hasNext()) { + res[i] = (String) it.next(); + i++; + } + return res; + } + + public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3, int arg4) throws IOException { + tagDefineFontInfo(arg0, arg1, arg2, arg3); + } + + /** + * SWFTagTypes interface Save the Text Font character code info + */ + public void tagDefineFontInfo(int fontId, String fontName, int flags, int[] codes) throws IOException { + // System.out.println("-defineFontInfo id=" + fontId + ", name=" + + // fontName); + fontCodes.put(new Integer(fontId), codes); + } + + // XXX too much hassle for too little return ... we cannot guess character + // XXX codes anyway, so we just give up. + /* + * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException { + * return null; + * } + */ + + /** + * SWFTagTypes interface. Save the character code info. 
+ */ + public SWFVectors tagDefineFont2(int id, int flags, String name, int numGlyphs, int ascent, int descent, int leading, + int[] codes, int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2, int[] kernAdjustments) + throws IOException { + // System.out.println("-defineFontInfo id=" + id + ", name=" + name); + fontCodes.put(new Integer(id), (codes != null) ? codes : new int[0]); + + return null; + } + + /** + * SWFTagTypes interface. Dump any initial text in the field. + */ + public void tagDefineTextField(int fieldId, String fieldName, String initialText, Rect boundary, int flags, + AlphaColor textColor, int alignment, int fontId, int fontSize, int charLimit, int leftMargin, + int rightMargin, int indentation, int lineSpacing) throws IOException { + if (initialText != null) { + strings.add(initialText); + } + } + + /** + * SWFTagTypes interface + */ + public SWFText tagDefineText(int id, Rect bounds, Matrix matrix) throws IOException { + lastBounds = curBounds; + curBounds = bounds; + return new TextDumper(); + } + + Rect lastBounds = null; + Rect curBounds = null; + + /** + * SWFTagTypes interface + */ + public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix) throws IOException { + lastBounds = curBounds; + curBounds = bounds; + return new TextDumper(); + } + + public class TextDumper implements SWFText { + protected Integer fontId; + + protected boolean firstY = true; + + public void font(int fontId, int textHeight) { + this.fontId = new Integer(fontId); + } + + public void setY(int y) { + if (firstY) + firstY = false; + else strings.add("\n"); // Change in Y - dump a new line + } + + /* + * There are some issues with this method: sometimes SWF files define their + * own font, so short of OCR we cannot guess what is the glyph code -> character + * mapping. Additionally, some files don't use literal space character, instead + * they adjust glyphAdvances. We don't handle it at all - in such cases the text + * will be all glued together. 
+ */ + public void text(int[] glyphIndices, int[] glyphAdvances) { + // System.out.println("-text id=" + fontId); + int[] codes = (int[]) fontCodes.get(fontId); + if (codes == null) { + // unknown font, better not guess + strings.add("\n**** ?????????????? ****\n"); + return; + } + + // --Translate the glyph indices to character codes + char[] chars = new char[glyphIndices.length]; + + for (int i = 0; i < chars.length; i++) { + int index = glyphIndices[i]; + + if (index >= codes.length) // System Font ? + { + chars[i] = (char) index; + } else { + chars[i] = (char) (codes[index]); + } + // System.out.println("-ch[" + i + "]='" + chars[i] + "'(" + + // (int)chars[i] + ") +" + glyphAdvances[i]); + } + strings.add(new String(chars)); + } + + public void color(Color color) {} + + public void setX(int x) {} + + public void done() { + strings.add("\n"); + } + } + + public SWFActions tagDoAction() throws IOException { + // ActionTextWriter actions = new ActionTextWriter(new + // PrintWriter(System.out)); + NutchSWFActions actions = new NutchSWFActions(actionStrings, urls); + return actions; + } + + public SWFActions tagDoInitAction(int arg0) throws IOException { + // ActionTextWriter actions = new ActionTextWriter(new + // PrintWriter(System.out)); + NutchSWFActions actions = new NutchSWFActions(actionStrings, urls); + return actions; + } + + public void tagGeneratorFont(byte[] arg0) throws IOException { + // TODO Auto-generated method stub + super.tagGeneratorFont(arg0); + } + + public void tagGeneratorText(byte[] arg0) throws IOException { + // TODO Auto-generated method stub + super.tagGeneratorText(arg0); + } + +} + +/** + * ActionScript parser. This parser tries to extract free text embedded inside + * the script, but without polluting it too much with names of variables, + * methods, etc. Not ideal, but it works. 
+ * + * @author Andrzej Bialecki + */ +class NutchSWFActions extends SWFActionBlockImpl implements SWFActions { + private HashSet strings = null; + + private ArrayList urls = null; + + String[] dict = null; + + Stack stack = null; + + public NutchSWFActions(HashSet strings, ArrayList urls) { + this.strings = strings; + this.urls = urls; + stack = new SmallStack(100, strings); + } + + public void lookupTable(String[] values) throws IOException { + // System.out.println("-lookupTable: " + values.length); + for (int i = 0; i < values.length; i++) { + if (!strings.contains(values[i])) strings.add(values[i]); + } + super.lookupTable(values); + dict = values; + } + + public void defineLocal() throws IOException { + // System.out.println("-defineLocal"); + stack.pop(); + super.defineLocal(); + } + + public void getURL(int vars, int mode) { + // System.out.println("-getURL: vars=" + vars + ", mode=" + mode); + } + + public void getURL(String url, String target) throws IOException { + // System.out.println("-getURL: url=" + url + ", target=" + target); + stack.push(url); + stack.push(target); + strings.remove(url); + strings.remove(target); + urls.add(url); + super.getURL(url, target); + } + + public SWFActionBlock.TryCatchFinally _try(String var) throws IOException { + // System.out.println("_try: var=" + var); + // stack.push(var); + strings.remove(var); + return super._try(var); + } + + public void comment(String var) throws IOException { + // System.out.println("-comment: var=" + var); + // stack.push(var); + strings.remove(var); + super.comment(var); + } + + public void goToFrame(String var) throws IOException { + // System.out.println("-goToFrame: var=" + var); + stack.push(var); + strings.remove(var); + super.gotoFrame(var); + } + + public void ifJump(String var) throws IOException { + // System.out.println("-ifJump: var=" + var); + strings.remove(var); + super.ifJump(var); + } + + public void jump(String var) throws IOException { + // System.out.println("-jump: 
var=" + var); + strings.remove(var); + super.jump(var); + } + + public void jumpLabel(String var) throws IOException { + // System.out.println("-jumpLabel: var=" + var); + strings.remove(var); + super.jumpLabel(var); + } + + public void lookup(int var) throws IOException { + // System.out.println("-lookup: var=" + var); + if (dict != null && var >= 0 && var < dict.length) { + // System.out.println(" push " + dict[var]); + stack.push(dict[var]); + } + super.lookup(var); + } + + public void push(String var) throws IOException { + // System.out.println("-push: var=" + var); + stack.push(var); + strings.remove(var); + super.push(var); + } + + public void setTarget(String var) throws IOException { + // System.out.println("-setTarget: var=" + var); + stack.push(var); + strings.remove(var); + super.setTarget(var); + } + + public SWFActionBlock startFunction(String var, String[] params) throws IOException { + // System.out.println("-startFunction1: var=" + var); + stack.push(var); + strings.remove(var); + if (params != null) { + for (int i = 0; i < params.length; i++) { + strings.remove(params[i]); + } + } + return this; + } + + public SWFActionBlock startFunction2(String var, int arg1, int arg2, String[] params, int[] arg3) throws IOException { + // System.out.println("-startFunction2: var=" + var); + stack.push(var); + strings.remove(var); + if (params != null) { + for (int i = 0; i < params.length; i++) { + strings.remove(params[i]); + } + } + return this; + } + + public void waitForFrame(int num, String var) throws IOException { + // System.out.println("-waitForFrame: var=" + var); + stack.push(var); + strings.remove(var); + super.waitForFrame(num, var); + } + + public void waitForFrame(String var) throws IOException { + // System.out.println("-waitForFrame: var=" + var); + stack.push(var); + strings.remove(var); + super.waitForFrame(var); + } + + public void done() throws IOException { + // System.out.println("-done"); + while (stack.size() > 0) { + 
strings.remove(stack.pop()); + } + } + + public SWFActionBlock start(int arg0, int arg1) throws IOException { + // System.out.println("-start"); + return this; + } + + public SWFActionBlock start(int arg0) throws IOException { + // System.out.println("-start"); + return this; + } + + public void add() throws IOException { + // System.out.println("-add"); + super.add(); + } + + public void asciiToChar() throws IOException { + // System.out.println("-asciitochar"); + super.asciiToChar(); + } + + public void asciiToCharMB() throws IOException { + // System.out.println("-asciitocharMB"); + super.asciiToCharMB(); + } + + public void push(int var) throws IOException { + // System.out.println("-push(int)"); + if (dict != null && var >= 0 && var < dict.length) { + // System.out.println(" push " + dict[var]); + stack.push(dict[var]); + } + super.push(var); + } + + public void callFunction() throws IOException { + // System.out.println("-callFunction"); + strings.remove(stack.pop()); + super.callFunction(); + } + + public void callMethod() throws IOException { + // System.out.println("-callMethod"); + strings.remove(stack.pop()); + super.callMethod(); + } + + public void getMember() throws IOException { + // System.out.println("-getMember"); + // 0: name + String val = (String) stack.pop(); + strings.remove(val); + super.getMember(); + } + + public void setMember() throws IOException { + // 0: value -1: name + String val = (String) stack.pop(); + String name = (String) stack.pop(); + // System.out.println("-setMember: name=" + name + ", val=" + val); + strings.remove(name); + super.setMember(); + } + + public void setProperty() throws IOException { + // System.out.println("-setProperty"); + super.setProperty(); + } + + public void setVariable() throws IOException { + // System.out.println("-setVariable"); + super.setVariable(); + } + + public void call() throws IOException { + // System.out.println("-call"); + strings.remove(stack.pop()); + super.call(); + } + + public void 
setTarget() throws IOException { + // System.out.println("-setTarget"); + strings.remove(stack.pop()); + super.setTarget(); + } + + public void pop() throws IOException { + // System.out.println("-pop"); + strings.remove(stack.pop()); + super.pop(); + } + + public void push(boolean arg0) throws IOException { + // System.out.println("-push(b)"); + stack.push("" + arg0); + super.push(arg0); + } + + public void push(double arg0) throws IOException { + // System.out.println("-push(d)"); + stack.push("" + arg0); + super.push(arg0); + } + + public void push(float arg0) throws IOException { + // System.out.println("-push(f)"); + stack.push("" + arg0); + super.push(arg0); + } + + public void pushNull() throws IOException { + // System.out.println("-push(null)"); + stack.push(""); + super.pushNull(); + } + + public void pushRegister(int arg0) throws IOException { + // System.out.println("-push(reg)"); + stack.push("" + arg0); + super.pushRegister(arg0); + } + + public void pushUndefined() throws IOException { + // System.out.println("-push(undef)"); + stack.push("???"); + super.pushUndefined(); + } + + public void getProperty() throws IOException { + // System.out.println("-getProperty"); + stack.pop(); + super.getProperty(); + } + + public void getVariable() throws IOException { + // System.out.println("-getVariable"); + strings.remove(stack.pop()); + super.getVariable(); + } + + public void gotoFrame(boolean arg0) throws IOException { + // System.out.println("-gotoFrame(b)"); + stack.push("" + arg0); + super.gotoFrame(arg0); + } + + public void gotoFrame(int arg0) throws IOException { + // System.out.println("-gotoFrame(int)"); + stack.push("" + arg0); + super.gotoFrame(arg0); + } + + public void gotoFrame(String arg0) throws IOException { + // System.out.println("-gotoFrame(string)"); + stack.push("" + arg0); + strings.remove(arg0); + super.gotoFrame(arg0); + } + + public void newObject() throws IOException { + // System.out.println("-newObject"); + stack.pop(); + 
super.newObject(); + } + + public SWFActionBlock startWith() throws IOException { + // System.out.println("-startWith"); + return this; + } + +} + +/* + * Small bottom-less stack. + */ +class SmallStack extends Stack { + + private int maxSize; + + private HashSet strings = null; + + public SmallStack(int maxSize, HashSet strings) { + this.maxSize = maxSize; + this.strings = strings; + } + + public Object push(Object o) { + // limit max size + if (this.size() > maxSize) { + String val = (String) remove(0); + strings.remove(val); + } + return super.push(o); + } + + public Object pop() { + // tolerate underruns + if (this.size() == 0) + return null; + else return super.pop(); + } +} \ No newline at end of file Propchange: lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?rev=374724&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java Fri Feb 3 10:49:07 2006 @@ -0,0 +1,99 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.swf; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.io.UTF8; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; + +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.util.NutchConf; + +import junit.framework.TestCase; + +/** + * Unit tests for SWFParser. 
+ * + * @author Andrzej Bialecki + */
public class TestSWFParser extends TestCase {

  private String fileSeparator = System.getProperty("file.separator");
  // This system property is defined in ./src/plugin/build-plugin.xml
  private String sampleDir = System.getProperty("test.data", ".");

  // SWF inputs and, index by index, the files holding their expected text.
  private String[] sampleFiles = new String[]{"test1.swf", "test2.swf", "test3.swf"};
  private String[] sampleTexts = new String[]{"test1.txt", "test2.txt", "test3.txt"};
  // Whitespace-normalized expected text per sample; filled in by setUp().
  private String[] texts = new String[sampleTexts.length];

  public TestSWFParser(String name) {
    super(name);
  }

  /**
   * Loads the expected text of every sample.
   *
   * @throws IOException if a reference file cannot be read; the failure is
   *         propagated so the test errors out instead of silently comparing
   *         against a stale value.
   */
  protected void setUp() throws IOException {
    for (int i = 0; i < sampleTexts.length; i++) {
      texts[i] = readNormalized(sampleDir + fileSeparator + sampleTexts[i]);
    }
  }

  protected void tearDown() {}

  /**
   * Fetches each sample SWF through the protocol layer, parses it and
   * compares the whitespace-normalized text with the expected text.
   */
  public void testIt() throws ProtocolException, ParseException {
    NutchConf conf = new NutchConf();

    for (int i = 0; i < sampleFiles.length; i++) {
      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content =
          protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();

      Parse parse = new ParseUtil(conf).parse(content);

      String text = normalize(parse.getText());
      // assertEquals reports both values on failure, unlike assertTrue(equals)
      assertEquals("Extracted text mismatch for " + sampleFiles[i], texts[i], text);
    }
  }

  /** Collapses every run of spaces, tabs, CRs and LFs to one space and trims. */
  private static String normalize(String s) {
    return s.replaceAll("[ \t\r\n]+", " ").trim();
  }

  /**
   * Reads a whole UTF-8 file and returns its normalized content.
   * The underlying stream is closed even when reading fails.
   */
  private static String readNormalized(String path) throws IOException {
    FileInputStream fis = new FileInputStream(path);
    try {
      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
      StringBuffer sb = new StringBuffer();
      char[] buf = new char[1024];
      int len;
      // read() signals end of stream with -1
      while ((len = isr.read(buf)) != -1) {
        sb.append(buf, 0, len);
      }
      return normalize(sb.toString());
    } finally {
      fis.close();
    }
  }

}
Propchange: lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java ------------------------------------------------------------------------------ 
svn:eol-style = native ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! Download the new AJAX search engine that makes searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs