Author: jerome Date: Mon Feb 13 13:26:15 2006 New Revision: 377493 URL: http://svn.apache.org/viewcvs?rev=377493&view=rev Log: Add a mini framework for microsoft documents parsing
Added: lucene/nutch/trunk/src/plugin/lib-parsems/ lucene/nutch/trunk/src/plugin/lib-parsems/build.xml (with props) lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml (with props) lucene/nutch/trunk/src/plugin/lib-parsems/src/ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (with props) lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=377493&r1=377492&r2=377493&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Mon Feb 13 13:26:15 2006 @@ -149,6 +149,14 @@ } + /** + * Checks if a string is empty (ie is null or empty). 
+ */ + public static boolean isEmpty(String str) { + return (str == null) || (str.equals("")); + } + + private static HashMap encodingAliases = new HashMap(); /** Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=377493&r1=377492&r2=377493&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Mon Feb 13 13:26:15 2006 @@ -14,6 +14,7 @@ <ant dir="lib-http" target="deploy"/> <ant dir="lib-jakarta-poi" target="deploy"/> <ant dir="lib-lucene-analyzers" target="deploy"/> + <ant dir="lib-parsems" target="deploy"/> <ant dir="nutch-extensionpoints" target="deploy"/> <ant dir="ontology" target="deploy"/> <ant dir="protocol-file" target="deploy"/> @@ -78,6 +79,7 @@ <ant dir="lib-http" target="clean"/> <ant dir="lib-jakarta-poi" target="clean"/> <ant dir="lib-lucene-analyzers" target="clean"/> + <ant dir="lib-parsems" target="clean"/> <ant dir="nutch-extensionpoints" target="clean"/> <ant dir="ontology" target="clean"/> <ant dir="protocol-file" target="clean"/> Added: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/build.xml?rev=377493&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-parsems/build.xml (added) +++ lucene/nutch/trunk/src/plugin/lib-parsems/build.xml Mon Feb 13 13:26:15 2006 @@ -0,0 +1,13 @@ +<?xml version="1.0"?> + +<project name="lib-parsems" default="jar"> + + <import file="../build-plugin.xml"/> + + <path id="plugin.deps"> + <fileset dir="../lib-jakarta-poi/lib"> + <include name="*.jar" /> + </fileset> + </path> + +</project> Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml ------------------------------------------------------------------------------ svn:eol-style = native 
Added: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml?rev=377493&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml Mon Feb 13 13:26:15 2006 @@ -0,0 +1,21 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + ! A common framework for microsoft documents parsers implementations + !--> +<plugin + id="lib-parsems" + name="Parse MS Documents Framework" + version="1.0" + provider-name="org.apache.nutch"> + + <runtime> + <library name="lib-parsems.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="lib-jakarta-poi"/> + </requires> + +</plugin> Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?rev=377493&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (added) +++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Feb 13 13:26:15 2006 @@ -0,0 +1,161 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.ms; + +// JDK imports +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.util.Properties; +import java.util.logging.Logger; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.LogFormatter; + +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; + + +/** + * A generic Microsoft document parser. + * + * @author Jérôme Charron + */ +public abstract class MSBaseParser implements Parser { + + private Configuration conf; + + protected static final Logger LOG = + LogFormatter.getLogger(MSBaseParser.class.getName()); + + + /** + * Parses a Content with a specific {@link MSExtractor} Microsoft document + * extractor. 
+ */ + protected Parse getParse(MSExtractor extractor, Content content) { + + String text = null; + String title = null; + Outlink[] outlinks = null; + Properties properties = null; + + try { + byte[] raw = content.getContent(); + String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH); + if ((contentLength != null) && + (raw.length != Integer.parseInt(contentLength))) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_TRUNCATED, + "Content truncated at " + raw.length +" bytes. " + + "Parser can't handle incomplete file.") + .getEmptyParse(this.conf); + } + extractor.extract(new ByteArrayInputStream(raw)); + text = extractor.getText(); + properties = extractor.getProperties(); + outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(), getConf()); + + } catch (Exception e) { + return new ParseStatus(ParseStatus.FAILED, + "Can't be handled as micrsosoft document. " + e) + .getEmptyParse(this.conf); + } + + // collect meta data + Metadata metadata = new Metadata(); + title = properties.getProperty(DublinCore.TITLE); + properties.remove(DublinCore.TITLE); + metadata.setAll(properties); + + if (text == null) { text = ""; } + if (title == null) { title = ""; } + + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, content.getMetadata(), + metadata); + parseData.setConf(this.conf); + return new ParseImpl(text, parseData); + } + + + /** + * Main for testing. 
Pass a ms document as argument + */ + public static void main(String mime, MSBaseParser parser, String args[]) { + if (args.length < 1) { + System.err.println("Usage:"); + System.err.println("\t" + parser.getClass().getName() + " <file>"); + System.exit(1); + } + + String file = args[0]; + byte[] raw = getRawBytes(new File(file)); + + Metadata meta = new Metadata(); + meta.set(Response.CONTENT_LENGTH, "" + raw.length); + Content content = new Content(file, file, raw, mime, meta, + NutchConfiguration.create()); + + System.out.println(parser.getParse(content).getText()); + } + + private final static byte[] getRawBytes(File f) { + try { + if (!f.exists()) + return null; + FileInputStream fin = new FileInputStream(f); + byte[] buffer = new byte[(int) f.length()]; + fin.read(buffer); + fin.close(); + return buffer; + } catch (Exception err) { + err.printStackTrace(); + return null; + } + + } + + + /* ---------------------------- * + * <implemenation:Configurable> * + * ---------------------------- */ + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + /* ----------------------------- * + * </implemenation:Configurable> * + * ----------------------------- */ + +} Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java?rev=377493&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java (added) +++ 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java Mon Feb 13 13:26:15 2006 @@ -0,0 +1,199 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.ms; + +// JDK imports +import java.io.InputStream; +import java.util.Date; +import java.util.Properties; +import java.util.logging.Logger; + +// Hadoop imports +import org.apache.hadoop.util.LogFormatter; + +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.HttpHeaders; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Office; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.util.StringUtil; + +// Jakarta POI imports +import org.apache.poi.hpsf.PropertySetFactory; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.poifs.eventfilesystem.POIFSReader; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; + + +/** + * Defines a Microsoft document content extractor. 
+ * + * @author Jérôme Charron + */ +public abstract class MSExtractor { + + protected final static Logger LOG = + LogFormatter.getLogger(MSExtractor.class.getName()); + + private String text = null; + private POIFSReader reader = null; + private PropertiesBroker properties = null; + + + /** Constructs a new Microsoft document extractor. */ + protected MSExtractor() { } + + + /** + * Extracts properties and text from an MS Document input stream + */ + protected void extract(InputStream input) throws Exception { + // First, extract properties + this.reader = new POIFSReader(); + this.properties = new PropertiesBroker(); + this.reader.registerListener( + new PropertiesReaderListener(this.properties), + SummaryInformation.DEFAULT_STREAM_NAME); + input.reset(); + if (input.available() > 0) { + reader.read(input); + } + // Then, extract text + input.reset(); + this.text = extractText(input); + } + + /** + * Extracts the text content from a Microsoft document input stream. + */ + protected abstract String extractText(InputStream input) throws Exception; + + + /** + * Get the content text of the Microsoft document. + * @return the content text of the document + */ + protected String getText() { + return this.text; + } + + + /** + * Get the <code>Properties</code> of the Microsoft document. 
+ * @return the properties of the document + */ + protected Properties getProperties() { + return properties.getProperties(); + } + + + private final static class PropertiesBroker { + + private final static int TIMEOUT = 2 * 1000; + private Properties properties = null; + + public synchronized Properties getProperties() { + + final long start = new Date().getTime(); + long now = start; + + while (this.properties == null && now - start < TIMEOUT) { + try { + wait(TIMEOUT / 10); + } catch (InterruptedException e) { + } + now = new Date().getTime(); + } + notifyAll(); + return this.properties; + } + + public synchronized void setProperties(Properties properties) { + this.properties = properties; + notifyAll(); + } + } + + + private class PropertiesReaderListener implements POIFSReaderListener { + + private PropertiesBroker propertiesBroker; + private Properties metadata = new Properties(); + + PropertiesReaderListener(PropertiesBroker propertiesBroker) { + this.propertiesBroker = propertiesBroker; + } + + public void processPOIFSReaderEvent(POIFSReaderEvent event) { + if (!event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) { + return; + } + + try { + SummaryInformation si = (SummaryInformation) + PropertySetFactory.create(event.getStream()); + setProperty(DublinCore.TITLE, si.getTitle()); + setProperty(Office.APPLICATION_NAME, si.getApplicationName()); + setProperty(Office.AUTHOR, si.getAuthor()); + setProperty(Office.CHARACTER_COUNT, si.getCharCount()); + setProperty(Office.COMMENTS, si.getComments()); + setProperty(DublinCore.DATE, si.getCreateDateTime()); +// setProperty(Office.EDIT_TIME, si.getEditTime()); + setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime()); + setProperty(Office.KEYWORDS, si.getKeywords()); + setProperty(Office.LAST_AUTHOR, si.getLastAuthor()); + setProperty(Office.LAST_PRINTED, si.getLastPrinted()); + setProperty(Office.LAST_SAVED, si.getLastSaveDateTime()); + setProperty(Office.PAGE_COUNT, si.getPageCount()); + 
setProperty(Office.REVISION_NUMBER, si.getRevNumber()); + setProperty(DublinCore.RIGHTS, si.getSecurity()); + setProperty(DublinCore.SUBJECT, si.getSubject()); + setProperty(Office.TEMPLATE, si.getTemplate()); + setProperty(Office.WORD_COUNT, si.getWordCount()); + } catch (Exception ex) { + } + propertiesBroker.setProperties(metadata); + } + + private final void setProperty(String name, String value) { + if (!StringUtil.isEmpty(name) && !StringUtil.isEmpty(value)) { + metadata.setProperty(name, value); + } + } + + private final void setProperty(String name, int value) { + if (value != 0) { + setProperty(name, String.valueOf(value)); + } + } + + private final void setProperty(String name, long value) { + if (value != 0) { + setProperty(name, String.valueOf(value)); + } + } + + private final void setProperty(String name, Date date) { + if (date != null) { + setProperty(name, HttpDateFormat.toString(date)); + } + } + + } + +} Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! Download the new AJAX search engine that makes searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs