Author: jerome Date: Mon Feb 13 13:28:13 2006 New Revision: 377494 URL: http://svn.apache.org/viewcvs?rev=377494&view=rev Log: Make use of lib-parsems in word, powerpoint and excel parsers
Removed: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Mon Feb 13 13:28:13 2006 @@ -2,19 +2,23 @@ <project name="parse-msexcel" default="jar"> - <import file="../build-plugin.xml" /> + <import file="../build-plugin.xml" /> <path id="plugin.deps"> <fileset dir="../lib-jakarta-poi/lib"> <include name="*.jar" /> </fileset> + <fileset dir="../../../build/lib-parsems"> + <include name="*.jar" /> + </fileset> </path> - <!-- for junit test --> - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="sample"> - <include name="*.xls" /> - </fileset> - </copy> + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.xls" /> + </fileset> + </copy> + </project> Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Mon Feb 13 13:28:13 2006 @@ -14,6 +14,7 @@ <requires> <import plugin="nutch-extensionpoints"/> <import plugin="lib-jakarta-poi"/> + <import plugin="lib-parsems"/> </requires> <extension id="org.apache.nutch.parse.msexcel" Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=377494&r1=377493&r2=377494&view=diff 
============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Mon Feb 13 13:28:13 2006 @@ -16,17 +16,17 @@ package org.apache.nutch.parse.msexcel; // JDK imports -import java.io.IOException; import java.io.InputStream; -import java.util.Date; -import java.util.Properties; // Jakarta POI imports import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.poifs.eventfilesystem.POIFSReader; + +// Nutch imports +import org.apache.nutch.parse.ms.MSExtractor; + /** * Excel Text and Properties extractor. @@ -34,10 +34,10 @@ * @author Rohit Kulkarni & Ashish Vaidya * @author Jérôme Charron */ -public class ExcelExtractor { +class ExcelExtractor extends MSExtractor { - public String extractText(InputStream input) throws IOException { + protected String extractText(InputStream input) throws Exception { String resultText = ""; HSSFWorkbook wb = new HSSFWorkbook(input); @@ -88,45 +88,5 @@ return resultText; } - - public Properties extractProperties(InputStream input) throws IOException { - - PropertiesBroker propertiesBroker = new PropertiesBroker(); - POIFSReader reader = new POIFSReader(); - reader.registerListener(new PropertiesReaderListener(propertiesBroker), - "\005SummaryInformation"); - reader.read(input); - return propertiesBroker.getProperties(); - } - - - class PropertiesBroker { - - private Properties properties; - private int timeoutMillis = 2 * 1000; - - - public synchronized Properties getProperties() { - - long start = new Date().getTime(); - long now = start; - - while ((properties == null) && (now-start < timeoutMillis)) { - try { - wait(timeoutMillis / 10); - } catch 
(InterruptedException e) {} - now = new Date().getTime(); - } - - notifyAll(); - return properties; - } - - public synchronized void setProperties(Properties properties) { - this.properties = properties; - notifyAll(); - } - } - } Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java Mon Feb 13 13:28:13 2006 @@ -15,111 +15,36 @@ */ package org.apache.nutch.parse.msexcel; -// JDK imports -import java.io.ByteArrayInputStream; -import java.util.Properties; -import java.util.logging.Logger; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.util.LogFormatter; - // Nutch imports -import org.apache.nutch.metadata.DublinCore; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.OutlinkExtractor; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ms.MSBaseParser; import org.apache.nutch.protocol.Content; + /** * An Excel document parser. 
* * @author Rohit Kulkarni & Ashish Vaidya * @author Jérôme Charron */ -public class MSExcelParser implements Parser { - - private Configuration conf; - - private static final Logger LOG = LogFormatter.getLogger(MSExcelParser.class.getName()); - - /** Creates a new instance of MSExcelParser */ - public MSExcelParser() { } - - public Parse getParse(Content content) { - - String text = null; - String title = null; - Properties properties = null; - - try { - byte[] raw = content.getContent(); - String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH); - if ((contentLength != null) && - (raw.length != Integer.parseInt(contentLength))) { - return new ParseStatus(ParseStatus.FAILED, - ParseStatus.FAILED_TRUNCATED, - "Content truncated at " + raw.length +" bytes. " + - "Parser can't handle incomplete msexcelfile.") - .getEmptyParse(this.conf); - } - - ExcelExtractor extractor = new ExcelExtractor(); - // Extract text - text = extractor.extractText(new ByteArrayInputStream(raw)); - // Extract properties - properties = extractor.extractProperties(new ByteArrayInputStream(raw)); - - //currently returning empty outlinks array - //outlinks = this.fetchOutlinks(resultText); - - } catch (Exception e) { - return new ParseStatus(ParseStatus.FAILED, - "Can't be handled as msexcel document. 
" + e) - .getEmptyParse(this.conf); - } finally { - // nothing so far - } +public class MSExcelParser extends MSBaseParser { - // collect meta data - Metadata metadata = new Metadata(); - title = properties.getProperty(DublinCore.TITLE); - properties.remove(DublinCore.TITLE); - metadata.setAll(properties); - - if (text == null) { text = ""; } - if (title == null) { title = ""; } - - // collect outlink - Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf); - - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, - outlinks, content.getMetadata(), - metadata); - parseData.setConf(this.conf); - return new ParseImpl(text, parseData); - } + /** + * Associated Mime type for Excel files + * (<code>application/vnd.ms-excel</code>). + */ + public static final String MIME_TYPE = "application/vnd.ms-excel"; - - /* ---------------------------- * - * <implemenation:Configurable> * - * ---------------------------- */ - public void setConf(Configuration conf) { - this.conf = conf; + public Parse getParse(Content content) { + return getParse(new ExcelExtractor(), content); } - public Configuration getConf() { - return this.conf; + /** + * Main for testing. 
Pass an excel document as argument + */ + public static void main(String args[]) { + main(MIME_TYPE, new MSExcelParser(), args); } - - /* ----------------------------- * - * </implemenation:Configurable> * - * ----------------------------- */ - + } Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html Mon Feb 13 13:28:13 2006 @@ -1,6 +1,6 @@ <html> <body> -<p>An Excel document parsing plugin.</p> +<p>A Microsoft © Excel document parsing plugin.</p> <p>This package relies on Jakarta <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p> </body> </html> Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml Mon Feb 13 13:28:13 2006 @@ -8,6 +8,9 @@ <fileset dir="../lib-jakarta-poi/lib"> <include name="*.jar" /> </fileset> + <fileset dir="../../../build/lib-parsems"> + <include name="*.jar" /> + </fileset> </path> <!-- for junit test --> Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff 
============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml Mon Feb 13 13:28:13 2006 @@ -14,6 +14,7 @@ <requires> <import plugin="lib-jakarta-poi"/> <import plugin="nutch-extensionpoints"/> + <import plugin="lib-parsems"/> </requires> <extension id="net.nutch.parse.mspowerpoint" Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Mon Feb 13 13:28:13 2006 @@ -13,29 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.parse.mspowerpoint; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.util.Properties; -import java.util.logging.Logger; - -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.OutlinkExtractor; +// Nutch imports import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ms.MSBaseParser; import org.apache.nutch.protocol.Content; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; - -import org.apache.hadoop.util.LogFormatter; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; /** @@ -45,133 +28,27 @@ * It is based on org.apache.poi.*. * * @author Stephan Strittmatter - http://www.sybit.de + * @author Jérôme Charron * @see <a href="http://jakarta.apache.org/poi">Jakarta POI</a> - * @version 1.0 */ -public class MSPowerPointParser implements Parser { - - /** associated Mime type for PowerPoint files (application/vnd.ms-powerpoint) */ - public static final String MIME_TYPE = "application/vnd.ms-powerpoint"; - - private static final Logger LOG = LogFormatter - .getLogger(MSPowerPointParser.class.getName()); - - private Configuration conf; +public class MSPowerPointParser extends MSBaseParser { /** - * + * Associated Mime type for PowerPoint files + * (<code>application/vnd.ms-powerpoint</code>). */ - public MSPowerPointParser() { - } + public static final String MIME_TYPE = "application/vnd.ms-powerpoint"; - /** - * Main for testing. 
Pass a ppt-file as argument - * - * @param args - */ - public static void main(String args[]) { - if (args.length < 1) { - System.err.println("Useage:"); - System.err.println("\tMSPowerPointParser <file>"); - System.exit(1); - } - - String file = args[0]; - MSPowerPointParser ppe = new MSPowerPointParser(); - - byte[] raw = getRawBytes(new File(file)); - - Metadata meta = new Metadata(); - meta.set(Response.CONTENT_LENGTH, "" + raw.length); - Content content = new Content(file, file, raw, MIME_TYPE, meta, NutchConfiguration.create()); - System.out.println(ppe.getParse(content).getText()); - } - - /** - * Parses the MS PowerPoint file. - * - * @see org.apache.nutch.parse.Parser#getParse(Content) - */ public Parse getParse(final Content content) { - - String plainText = null; - String title = null; - Outlink[] outlinks = null; - Properties properties = null; - - try { - final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH); - final byte[] raw = content.getContent(); - - if (contentLen != null && raw.length != Integer.parseInt(contentLen)) { - return new ParseStatus( - ParseStatus.FAILED, - ParseStatus.FAILED_TRUNCATED, - "Content truncated at " - + raw.length - + " bytes. Please increase <protocol>.content.limit at nutch-default.xml. 
" - + "Parser can't handle incomplete PowerPoint files.") - .getEmptyParse(getConf()); - } - - final PPTExtractor extractor = new PPTExtractor(new ByteArrayInputStream( - raw)); - - plainText = extractor.getText(); - properties = extractor.getProperties(); - outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl(), getConf()); - - } catch (Exception e) { - LOG.throwing(this.getClass().getName(), "getParse", e); - return new ParseStatus(e).getEmptyParse(getConf()); - } - - Metadata metadata = new Metadata(); - - if (properties != null) { - title = properties.getProperty(Metadata.TITLE); - properties.remove(Metadata.TITLE); - metadata.setAll(properties); - } - - if (plainText == null) { - plainText = ""; - } - - if (title == null) { - title = ""; - } - - final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); - final ParseData parseData = new ParseData(status, title, outlinks, metadata); - parseData.setConf(this.conf); - - LOG.finest("PowerPoint file parsed sucessful."); - return new ParseImpl(plainText, parseData); + return getParse(new PPTExtractor(), content); } - private final static byte[] getRawBytes(File f) { - try { - if (!f.exists()) - return null; - FileInputStream fin = new FileInputStream(f); - byte[] buffer = new byte[(int) f.length()]; - fin.read(buffer); - fin.close(); - return buffer; - } catch (Exception err) { - err.printStackTrace(); - return null; - } - + /** + * Main for testing. 
Pass a powerpoint document as argument + */ + public static void main(String args[]) { + main(MIME_TYPE, new MSPowerPointParser(), args); } - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } } Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java Mon Feb 13 13:28:13 2006 @@ -13,141 +13,44 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.nutch.parse.mspowerpoint; +// JDK imports import java.io.IOException; import java.io.InputStream; -import java.util.Date; -import java.util.Properties; -import java.util.logging.Logger; -import org.apache.hadoop.util.LogFormatter; -import org.apache.poi.hpsf.SummaryInformation; +// Nutch imports +import org.apache.nutch.parse.ms.MSExtractor; + +// Jakarta POI imports import org.apache.poi.poifs.eventfilesystem.POIFSReader; + /** * Converts the Powerpoint document content to plain text. 
* * @author Stephan Strittmatter - http://www.sybit.de - * - * @version 1.0 + * @author Jérôme Charron */ +class PPTExtractor extends MSExtractor { -public class PPTExtractor { - - private static final Logger LOG = LogFormatter.getLogger(PPTExtractor.class - .getName()); - - /** Parsed plain Powerpoint Text */ - private final transient StringBuffer contentBuf; - - private final PropertiesBroker propertiesBroker; - - private final POIFSReader poireader; - - /** - * Constructor that takes a PowerPoint file as <code>InputStream</code> to - * parse it. - * - * @param in - * <code>InputStream</code> containing the PowerPoint file - * @throws PowerPointDocumentException - * thrown if parsing failed - */ - public PPTExtractor(final InputStream in) throws PowerPointDocumentException { - this.poireader = new POIFSReader(); - this.propertiesBroker = new PropertiesBroker(); - this.contentBuf = new StringBuffer(); - - this.init(in); - } - - /** - * Get the PowerPoint content text as plain text - * - * @return String the content text - */ - public String getText() { - return this.contentBuf.toString(); - } - - /** - * Get the <code>Properties</code> of the PowerPoint document. 
- * - * @return the properties of the document - */ - public Properties getProperties() { - return this.propertiesBroker.getProperties(); - } - - /** - * @param input - * @throws PowerPointDocumentException - */ - private void init(final InputStream input) throws PowerPointDocumentException { - // register listener for SummaryInformation - this.poireader.registerListener(new PropertiesReaderListener( - this.propertiesBroker), SummaryInformation.DEFAULT_STREAM_NAME); - - // register listener for PPT-document content - this.poireader.registerListener(new ContentReaderListener(this.contentBuf), - PPTConstants.POWERPOINT_DOCUMENT); - - try { - input.reset(); - if (input.available() > 0) { - this.poireader.read(input); - } else { - LOG.warning("Input <=0 :" + input.available()); - } - } catch (IOException e) { - throw new PowerPointDocumentException(e); + private StringBuffer text = null; + private POIFSReader reader = null; + + + protected String extractText(InputStream input) throws Exception { + this.reader = new POIFSReader(); + this.text = new StringBuffer(); + reader.registerListener( + new ContentReaderListener(this.text), + PPTConstants.POWERPOINT_DOCUMENT); + input.reset(); + if (input.available() > 0) { + this.reader.read(input); + } else { + LOG.warning("Input <=0 :" + input.available()); } + return (this.text != null) ? text.toString() : null; } - /** - * The PropertiesBroker - * - * @author Stephan Strittmatter - * @version 1.0 - */ - static class PropertiesBroker { - - private final static int TIMEOUT = 2 * 1000; - - private Properties properties = null; - - /** - * Get the collected properties. 
- * - * @return properties of the PowerPoint file - */ - public synchronized Properties getProperties() { - - final long start = new Date().getTime(); - long now = start; - - while (this.properties == null && now - start < TIMEOUT) { - try { - wait(TIMEOUT / 10); - } catch (InterruptedException e) { - } - now = new Date().getTime(); - } - - notifyAll(); - - return this.properties; - } - - /** - * - * @param properties - */ - public synchronized void setProperties(Properties properties) { - this.properties = properties; - notifyAll(); - } - } } Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html Mon Feb 13 13:28:13 2006 @@ -21,8 +21,9 @@ </head> <body> <p>A Microsoft © PowerPoint document parsing plugin.</p> - <p>This package relies on <a - href="http://www.apache.org/poi/index.html">POI</a>.</p> + <p>This package relies on Jakarta + <a href="http://jakarta.apache.org/poi/index.html">POI</a>. + </p> <p> Implementation based on sources found at <a href="http://groups.google.com/groups?selm=a4f8800541bc694d5af7dabb35e83b72%40localhost.talkaboutsoftware.com">Google Groups </a>. 
It can also be found at <a Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Mon Feb 13 13:28:13 2006 @@ -8,6 +8,9 @@ <fileset dir="../lib-jakarta-poi/lib"> <include name="*.jar" /> </fileset> + <fileset dir="../../../build/lib-parsems"> + <include name="*.jar" /> + </fileset> </path> <!-- for junit test --> Modified: lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml Mon Feb 13 13:28:13 2006 @@ -14,6 +14,7 @@ <requires> <import plugin="nutch-extensionpoints"/> <import plugin="lib-jakarta-poi"/> + <import plugin="lib-parsems"/> </requires> <extension id="org.apache.nutch.parse.msword" Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java Mon Feb 13 13:28:13 2006 @@ -12,22 +12,12 @@ * See the License for the specific language governing permissions and 
* limitations under the License. */ - package org.apache.nutch.parse.msword; -/** - * <p>Title: </p> - * <p>Description: </p> - * <p>Copyright: Copyright (c) 2003</p> - * <p>Company: </p> - * @author not attributable - * @version 1.0 - */ -public class FastSavedException extends Exception -{ - public FastSavedException(String msg) - { +public class FastSavedException extends Exception { + + public FastSavedException(String msg) { super(msg); } Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Mon Feb 13 13:28:13 2006 @@ -13,118 +13,41 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.nutch.parse.msword; -import org.apache.nutch.metadata.DublinCore; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; +// Nutch imports import org.apache.nutch.protocol.Content; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.OutlinkExtractor; -import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ms.MSBaseParser; -import java.util.Properties; -import java.io.ByteArrayInputStream; /** - * parser for mime type application/msword. 
+ * Parser for mime type application/msword. * It is based on org.apache.poi.*. We have to see how well it performs. * * @author John Xing - * - * Note on 20040614 by Xing: - * Some codes are stacked here for convenience (see inline comments). - * They may be moved to more appropriate places when new codebase - * stabilizes, especially after code for indexing is written. - * * @author Andy Hedges - * code to extract all msword properties. - * + * @author Jérôme Charron */ -public class MSWordParser implements Parser { - private Configuration conf; +public class MSWordParser extends MSBaseParser { -// public static final Logger LOG = -// LogFormatter.getLogger("org.apache.nutch.parse.msword"); - - public MSWordParser () {} + /** + * Associated Mime type for Word files + * (<code>application/msword</code>). + */ + public static final String MIME_TYPE = "application/msword"; + public Parse getParse(Content content) { - - String text = null; - String title = null; - Properties properties = null; - - try { - - byte[] raw = content.getContent(); - - String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); - if (contentLength != null - && raw.length != Integer.parseInt(contentLength)) { - return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, - "Content truncated at " + raw.length - +" bytes. 
Parser can't handle incomplete msword file.").getEmptyParse(this.conf); - } - - WordExtractor extractor = new WordExtractor(); - - // collect text - text = extractor.extractText(new ByteArrayInputStream(raw)); - - // collect meta info - properties = extractor.extractProperties(new ByteArrayInputStream(raw)); - - extractor = null; - - } catch (ParseException e) { - return new ParseStatus(e).getEmptyParse(this.conf); - } catch (FastSavedException e) { - return new ParseStatus(e).getEmptyParse(this.conf); - } catch (PasswordProtectedException e) { - return new ParseStatus(e).getEmptyParse(this.conf); - } catch (Exception e) { // run time exception - return new ParseStatus(ParseStatus.FAILED, - "Can't be handled as msword document. " + e).getEmptyParse(this.conf); - } finally { - // nothing so far - } - - // collect meta data - Metadata metadata = new Metadata(); - title = properties.getProperty(DublinCore.TITLE); - properties.remove(DublinCore.TITLE); - metadata.setAll(properties); - - if (text == null) { text = ""; } - if (title == null) { title = ""; } - - // collect outlink - Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf); - - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, - outlinks, content.getMetadata(), - metadata); - parseData.setConf(this.conf); - return new ParseImpl(text, parseData); - // any filter? - //return HtmlParseFilters.filter(content, parse, root); - } - - public void setConf(Configuration conf) { - this.conf = conf; + return getParse(new WordExtractor(), content); } - public Configuration getConf() { - return this.conf; + /** + * Main for testing. 
Pass a word document as argument + */ + public static void main(String args[]) { + main(MIME_TYPE, new MSWordParser(), args); } } Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java Mon Feb 13 13:28:13 2006 @@ -14,11 +14,10 @@ */ package org.apache.nutch.parse.msword; -public class PasswordProtectedException - extends Exception -{ - public PasswordProtectedException(String msg) - { + +public class PasswordProtectedException extends Exception { + + public PasswordProtectedException(String msg) { super(msg); } Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java Mon Feb 13 13:28:13 2006 @@ -14,42 +14,47 @@ */ package org.apache.nutch.parse.msword; -import org.apache.poi.hpsf.*; -import org.apache.poi.hwpf.model.*; -import org.apache.poi.hwpf.sprm.*; -import org.apache.poi.poifs.eventfilesystem.*; -import org.apache.poi.poifs.filesystem.*; +// JDK imports +import 
java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +// Jakarta POI imports +import org.apache.poi.hwpf.model.CHPBinTable; +import org.apache.poi.hwpf.model.CHPX; +import org.apache.poi.hwpf.model.ComplexFileTable; +import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.model.TextPieceTable; +import org.apache.poi.hwpf.sprm.SprmIterator; +import org.apache.poi.hwpf.sprm.SprmOperation; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.LittleEndian; -import org.apache.nutch.metadata.Metadata; -import java.util.*; -import java.io.*; +// Nutch imports +import org.apache.nutch.parse.ms.MSExtractor; + /** * This class extracts the text from a Word 6.0/95/97/2000/XP word doc * * @author Ryan Ackley - * * @author Andy Hedges - * code to extract all msword properties. + * @author Jérôme Charron * */ -public class WordExtractor { +class WordExtractor extends MSExtractor { - /** - * Constructor - */ - public WordExtractor() - { - } /** * Gets the text from a Word document. * * @param in The InputStream representing the Word file. 
*/ - public String extractText(InputStream in) throws Exception - { + protected String extractText(InputStream in) throws Exception { + ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(in); @@ -221,128 +226,5 @@ return false; } - public Properties extractProperties(InputStream in) - throws IOException { - - PropertiesBroker propertiesBroker = new PropertiesBroker(); - POIFSReader reader = new POIFSReader(); - reader.registerListener(new PropertiesReaderListener(propertiesBroker), - "\005SummaryInformation"); - reader.read(in); - return propertiesBroker.getProperties(); - } - - class PropertiesReaderListener - implements POIFSReaderListener { - - private PropertiesBroker propertiesBroker; - private Properties metaData = new Properties(); - - public PropertiesReaderListener(PropertiesBroker propertiesBroker) { - this.propertiesBroker = propertiesBroker; - } - - public void processPOIFSReaderEvent(POIFSReaderEvent event) { - - SummaryInformation si = null; - Properties properties = new Properties(); - - try { - si = (SummaryInformation)PropertySetFactory.create(event.getStream()); - } catch (Exception ex) { - properties = null; - } - - Date tmp = null; - - String title = si.getTitle(); - String applicationName = si.getApplicationName(); - String author = si.getAuthor(); - int charCount = si.getCharCount(); - String comments = si.getComments(); - Date createDateTime = si.getCreateDateTime(); - long editTime = si.getEditTime(); - String keywords = si.getKeywords(); - String lastAuthor = si.getLastAuthor(); - Date lastPrinted = si.getLastPrinted(); - Date lastSaveDateTime = si.getLastSaveDateTime(); - int pageCount = si.getPageCount(); - String revNumber = si.getRevNumber(); - int security = si.getSecurity(); - String subject = si.getSubject(); - String template = si.getTemplate(); - int wordCount = si.getWordCount(); - - /*Dates are being stored in millis since the epoch to aid - localization*/ - if(title != null) - 
properties.setProperty(Metadata.TITLE, title); - if(applicationName != null) - properties.setProperty(Metadata.APPLICATION_NAME, applicationName); - if(author != null) - properties.setProperty(Metadata.AUTHOR, author); - if(charCount != 0) - properties.setProperty(Metadata.CHARACTER_COUNT, charCount + ""); - if(comments != null) - properties.setProperty(Metadata.COMMENTS, comments); - if(createDateTime != null) - properties.setProperty(Metadata.DATE, - Metadata.DATE_FORMAT.format(createDateTime)); - if(editTime != 0) - properties.setProperty(Metadata.LAST_MODIFIED, editTime + ""); - if(keywords != null) - properties.setProperty(Metadata.KEYWORDS, keywords); - if(lastAuthor != null) - properties.setProperty(Metadata.LAST_AUTHOR, lastAuthor); - if(lastPrinted != null) - properties.setProperty(Metadata.LAST_PRINTED, lastPrinted.getTime() + ""); - if(lastSaveDateTime != null) - properties.setProperty(Metadata.LAST_SAVED, lastSaveDateTime.getTime() + ""); - if(pageCount != 0) - properties.setProperty(Metadata.PAGE_COUNT, pageCount + ""); - if(revNumber != null) - properties.setProperty(Metadata.REVISION_NUMBER, revNumber); - if(security != 0) - properties.setProperty(Metadata.RIGHTS, security + ""); - if(subject != null) - properties.setProperty(Metadata.SUBJECT, subject); - if(template != null) - properties.setProperty(Metadata.TEMPLATE, template); - if(wordCount != 0) - properties.setProperty(Metadata.WORD_COUNT, wordCount + ""); - propertiesBroker.setProperties(properties); - - //si.getThumbnail(); // can't think of a sensible way of turning this into a string. 
- } - } - - class PropertiesBroker { - - private Properties properties; - private int timeoutMillis = 2 * 1000; - - - public synchronized Properties getProperties() { - - long start = new Date().getTime(); - long now = start; - - while (properties == null && now - start < timeoutMillis) { - try { - wait(timeoutMillis / 10); - } catch (InterruptedException e) {} - now = new Date().getTime(); - } - - notifyAll(); - - return properties; - } - - public synchronized void setProperties(Properties properties) { - this.properties = properties; - notifyAll(); - } - } } Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html?rev=377494&r1=377493&r2=377494&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html Mon Feb 13 13:28:13 2006 @@ -1,5 +1,6 @@ <html> <body> -<p>A Word document parsing plugin.</p><p>This package relies on <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p> +<p>A Microsoft © Word document parsing plugin.</p> +<p>This package relies on <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p> </body> </html>