Author: jerome Date: Fri Feb 17 15:22:55 2006 New Revision: 378653 URL: http://svn.apache.org/viewcvs?rev=378653&view=rev Log: Adapt parse-rtf to Nutch API changes (metadata, parse, protocol, ...)
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653&r1=378652&r2=378653&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Fri Feb 17 15:22:55 2006 @@ -13,38 +13,42 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.parse.rtf; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; -<<<<<<< .mine -<<<<<<< .mine -import org.apache.nutch.util.MetadataNames; - -======= -import org.apache.nutch.util.NutchConf; -======= -import org.apache.hadoop.conf.Configuration; ->>>>>>> .r374853 ->>>>>>> .r373941 +// JDK imports import java.io.ByteArrayInputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.util.Properties; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; + +// RTF Parser imports +import com.etranslate.tm.processing.rtf.ParseException; import com.etranslate.tm.processing.rtf.RTFParser; + /** * A parser for RTF documents * * @author Andy Hedges */ -public class RTFParseFactory implements Parser, MetadataNames { +public class RTFParseFactory implements Parser { private Configuration conf; - public Parse getParse(Content content) throws ParseException { + public Parse getParse(Content content) { byte[] raw = content.getContent(); Reader reader = new InputStreamReader(new ByteArrayInputStream(raw)); RTFParserDelegateImpl delegate = new RTFParserDelegateImpl(); @@ -55,28 +59,31 @@ try { rtfParser.parse(); - } catch (com.etranslate.tm.processing.rtf.ParseException e) { - throw new ParseException("Exception parsing RTF document", e); + } catch (ParseException e) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + e.toString()).getEmptyParse(conf); } - Properties metadata = new Properties(); - metadata.putAll(content.getMetadata()); - 
metadata.putAll(delegate.getMetaData()); - String title = metadata.getProperty(TITLE); + Metadata metadata = new Metadata(); + metadata.setAll(delegate.getMetaData()); + String title = metadata.get(DublinCore.TITLE); if (title != null) { - //(CM): Why remove the title metadata property here? Even - //though it's stored in the ParseData, it still might be useful - //to have via this properties object? - //metadata.remove(title); + metadata.remove(DublinCore.TITLE); } else { title = ""; } String text = delegate.getText(); - return new ParseImpl(text, new ParseData(title, OutlinkExtractor - .getOutlinks(text, this.conf), metadata)); + return new ParseImpl(text, + new ParseData(ParseStatus.STATUS_SUCCESS, + title, + OutlinkExtractor + . getOutlinks(text, this.conf), + content.getMetadata(), + metadata)); } public void setConf(Configuration conf) { Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653&r1=378652&r2=378653&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java Fri Feb 17 15:22:55 2006 @@ -16,27 +16,48 @@ package org.apache.nutch.parse.rtf; +// RTF Parser imports import com.etranslate.tm.processing.rtf.RTFParserDelegate; +// JDK imports import java.util.Arrays; import java.util.List; import java.util.Properties; -import org.apache.nutch.util.MetadataNames; +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Office; + /** * A parser delegate for handling rtf events. 
* @author Andy Hedges */ -public class RTFParserDelegateImpl implements RTFParserDelegate, MetadataNames { +public class RTFParserDelegateImpl implements RTFParserDelegate { String tabs = ""; Properties metadata = new Properties(); - String[] META_NAMES_TEXT = {TITLE, SUBJECT, AUTHOR, "manager", - "company", "operator", "category", KEYWORDS, - COMMENTS, "doccomm", "hlinkbase"}; - String[] META_NAMES_DATE = {"creatim", "creatim", "printim", "buptim"}; + String[] META_NAMES_TEXT = { + DublinCore.TITLE, + DublinCore.SUBJECT, + Office.AUTHOR, + "manager", + "company", + "operator", + "category", + Office.KEYWORDS, + Office.COMMENTS, + "doccomm", + "hlinkbase" + }; + + String[] META_NAMES_DATE = { + "creatim", + "creatim", + "printim", + "buptim" + }; String metaName = ""; List metaNamesText = Arrays.asList(META_NAMES_TEXT); Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=378653&r1=378652&r2=378653&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Fri Feb 17 15:22:55 2006 @@ -16,33 +16,33 @@ package org.apache.nutch.parse.rtf; +// JUnit imports import junit.framework.TestCase; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParserFactory; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import 
org.apache.nutch.protocol.ProtocolFactory; -<<<<<<< .mine -<<<<<<< .mine -import org.apache.nutch.util.MetadataNames; -======= -import org.apache.nutch.util.NutchConf; -======= +import org.apache.nutch.util.NutchConfiguration; + +// Hadoop imports import org.apache.hadoop.conf.Configuration; ->>>>>>> .r374853 ->>>>>>> .r373941 +import org.apache.hadoop.io.UTF8; -import java.util.Properties; /** * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). * * @author Andy Hedges */ -public class TestRTFParser extends TestCase implements MetadataNames { +public class TestRTFParser extends TestCase { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml @@ -72,16 +72,16 @@ Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + rtfFile; protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getContent(urlString); - + content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) + .getContent(); parse = new ParseUtil(conf).parseByParserId("parse-rtf", content); String text = parse.getText(); assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); String title = parse.getData().getTitle(); - Properties meta = parse.getData().getMetadata(); + Metadata meta = parse.getData().getParseMeta(); assertEquals("test rft document", title); - assertEquals("tests", meta.getProperty(SUBJECT)); + assertEquals("tests", meta.get(DublinCore.SUBJECT));